From 984afe0af6a3f729a2ab6d85cd49186689a45f3b Mon Sep 17 00:00:00 2001 From: eabdullin Date: Thu, 13 Feb 2025 00:22:33 +0000 Subject: [PATCH] import UBI runc-1.1.12-6.module+el8.10.0+22722+0028f543 --- ...Bump-runtime-spec-to-latest-git-HEAD.patch | 508 ++++++++++++++++++ ...1.1-runc-exec-implement-CPU-affinity.patch | 475 ++++++++++++++++ SPECS/runc.spec | 8 +- 3 files changed, 990 insertions(+), 1 deletion(-) create mode 100644 SOURCES/0001-1.1-Bump-runtime-spec-to-latest-git-HEAD.patch create mode 100644 SOURCES/0002-1.1-runc-exec-implement-CPU-affinity.patch diff --git a/SOURCES/0001-1.1-Bump-runtime-spec-to-latest-git-HEAD.patch b/SOURCES/0001-1.1-Bump-runtime-spec-to-latest-git-HEAD.patch new file mode 100644 index 0000000..2a48db6 --- /dev/null +++ b/SOURCES/0001-1.1-Bump-runtime-spec-to-latest-git-HEAD.patch @@ -0,0 +1,508 @@ +From 50f50245235097b0c87b31e97b86fd11685232a3 Mon Sep 17 00:00:00 2001 +From: Kir Kolyshkin +Date: Thu, 16 Jan 2025 15:40:28 -0800 +Subject: [PATCH 1/2] [1.1] Bump runtime-spec to latest git HEAD + +This is to include + - https://github.com/opencontainers/runtime-spec/pull/1261 + - https://github.com/opencontainers/runtime-spec/pull/1253 + +Signed-off-by: Kir Kolyshkin +--- + go.mod | 2 +- + go.sum | 4 +- + .../runtime-spec/specs-go/config.go | 239 ++++++++++++++++-- + .../runtime-spec/specs-go/version.go | 6 +- + vendor/modules.txt | 2 +- + 5 files changed, 225 insertions(+), 28 deletions(-) + +diff --git a/go.mod b/go.mod +index f51b6432..87c8d4b4 100644 +--- a/go.mod ++++ b/go.mod +@@ -12,7 +12,7 @@ require ( + github.com/godbus/dbus/v5 v5.0.6 + github.com/moby/sys/mountinfo v0.5.0 + github.com/mrunalp/fileutils v0.5.1 +- github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 ++ github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95 + github.com/opencontainers/selinux v1.10.0 + github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646 + github.com/sirupsen/logrus v1.8.1 
+diff --git a/go.sum b/go.sum +index ecabd398..9d3bedc0 100644 +--- a/go.sum ++++ b/go.sum +@@ -33,8 +33,8 @@ github.com/moby/sys/mountinfo v0.5.0 h1:2Ks8/r6lopsxWi9m58nlwjaeSzUX9iiL1vj5qB/9 + github.com/moby/sys/mountinfo v0.5.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU= + github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q= + github.com/mrunalp/fileutils v0.5.1/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ= +-github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 h1:3snG66yBm59tKhhSPQrQ/0bCrv1LQbKt40LnUPiUxdc= +-github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= ++github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95 h1:Ghl8Z3l+yPQUDSxAp7Kg7fJLRNNXjOsR6ooDcca7PjU= ++github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= + github.com/opencontainers/selinux v1.10.0 h1:rAiKF8hTcgLI3w0DHm6i0ylVVcOrlgR1kK99DRLDhyU= + github.com/opencontainers/selinux v1.10.0/go.mod h1:2i0OySw99QjzBBQByd1Gr9gSjvuho1lHsJxIJ3gGbJI= + github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go +index 6a7a91e5..671f0d01 100644 +--- a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go ++++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go +@@ -12,10 +12,12 @@ type Spec struct { + Root *Root `json:"root,omitempty"` + // Hostname configures the container's hostname. + Hostname string `json:"hostname,omitempty"` ++ // Domainname configures the container's domainname. ++ Domainname string `json:"domainname,omitempty"` + // Mounts configures additional mounts (on top of Root). 
+ Mounts []Mount `json:"mounts,omitempty"` + // Hooks configures callbacks for container lifecycle events. +- Hooks *Hooks `json:"hooks,omitempty" platform:"linux,solaris"` ++ Hooks *Hooks `json:"hooks,omitempty" platform:"linux,solaris,zos"` + // Annotations contains arbitrary metadata for the container. + Annotations map[string]string `json:"annotations,omitempty"` + +@@ -27,6 +29,36 @@ type Spec struct { + Windows *Windows `json:"windows,omitempty" platform:"windows"` + // VM specifies configuration for virtual-machine-based containers. + VM *VM `json:"vm,omitempty" platform:"vm"` ++ // ZOS is platform-specific configuration for z/OS based containers. ++ ZOS *ZOS `json:"zos,omitempty" platform:"zos"` ++} ++ ++// Scheduler represents the scheduling attributes for a process. It is based on ++// the Linux sched_setattr(2) syscall. ++type Scheduler struct { ++ // Policy represents the scheduling policy (e.g., SCHED_FIFO, SCHED_RR, SCHED_OTHER). ++ Policy LinuxSchedulerPolicy `json:"policy"` ++ ++ // Nice is the nice value for the process, which affects its priority. ++ Nice int32 `json:"nice,omitempty"` ++ ++ // Priority represents the static priority of the process. ++ Priority int32 `json:"priority,omitempty"` ++ ++ // Flags is an array of scheduling flags. ++ Flags []LinuxSchedulerFlag `json:"flags,omitempty"` ++ ++ // The following ones are used by the DEADLINE scheduler. ++ ++ // Runtime is the amount of time in nanoseconds during which the process ++ // is allowed to run in a given period. ++ Runtime uint64 `json:"runtime,omitempty"` ++ ++ // Deadline is the absolute deadline for the process to complete its execution. ++ Deadline uint64 `json:"deadline,omitempty"` ++ ++ // Period is the length of the period in nanoseconds used for determining the process runtime. ++ Period uint64 `json:"period,omitempty"` + } + + // Process contains information to start a specific application inside the container. 
+@@ -49,15 +81,21 @@ type Process struct { + // Capabilities are Linux capabilities that are kept for the process. + Capabilities *LinuxCapabilities `json:"capabilities,omitempty" platform:"linux"` + // Rlimits specifies rlimit options to apply to the process. +- Rlimits []POSIXRlimit `json:"rlimits,omitempty" platform:"linux,solaris"` ++ Rlimits []POSIXRlimit `json:"rlimits,omitempty" platform:"linux,solaris,zos"` + // NoNewPrivileges controls whether additional privileges could be gained by processes in the container. + NoNewPrivileges bool `json:"noNewPrivileges,omitempty" platform:"linux"` + // ApparmorProfile specifies the apparmor profile for the container. + ApparmorProfile string `json:"apparmorProfile,omitempty" platform:"linux"` + // Specify an oom_score_adj for the container. + OOMScoreAdj *int `json:"oomScoreAdj,omitempty" platform:"linux"` ++ // Scheduler specifies the scheduling attributes for a process ++ Scheduler *Scheduler `json:"scheduler,omitempty" platform:"linux"` + // SelinuxLabel specifies the selinux context that the container process is run as. + SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"` ++ // IOPriority contains the I/O priority settings for the cgroup. ++ IOPriority *LinuxIOPriority `json:"ioPriority,omitempty" platform:"linux"` ++ // ExecCPUAffinity specifies CPU affinity for exec processes. ++ ExecCPUAffinity *CPUAffinity `json:"execCPUAffinity,omitempty" platform:"linux"` + } + + // LinuxCapabilities specifies the list of allowed capabilities that are kept for a process. +@@ -75,6 +113,28 @@ type LinuxCapabilities struct { + Ambient []string `json:"ambient,omitempty" platform:"linux"` + } + ++// IOPriority represents I/O priority settings for the container's processes within the process group. ++type LinuxIOPriority struct { ++ Class IOPriorityClass `json:"class"` ++ Priority int `json:"priority"` ++} ++ ++// IOPriorityClass represents an I/O scheduling class. 
++type IOPriorityClass string ++ ++// Possible values for IOPriorityClass. ++const ( ++ IOPRIO_CLASS_RT IOPriorityClass = "IOPRIO_CLASS_RT" ++ IOPRIO_CLASS_BE IOPriorityClass = "IOPRIO_CLASS_BE" ++ IOPRIO_CLASS_IDLE IOPriorityClass = "IOPRIO_CLASS_IDLE" ++) ++ ++// CPUAffinity specifies process' CPU affinity. ++type CPUAffinity struct { ++ Initial string `json:"initial,omitempty"` ++ Final string `json:"final,omitempty"` ++} ++ + // Box specifies dimensions of a rectangle. Used for specifying the size of a console. + type Box struct { + // Height is the vertical dimension of a box. +@@ -86,11 +146,11 @@ type Box struct { + // User specifies specific user (and group) information for the container process. + type User struct { + // UID is the user id. +- UID uint32 `json:"uid" platform:"linux,solaris"` ++ UID uint32 `json:"uid" platform:"linux,solaris,zos"` + // GID is the group id. +- GID uint32 `json:"gid" platform:"linux,solaris"` ++ GID uint32 `json:"gid" platform:"linux,solaris,zos"` + // Umask is the umask for the init process. +- Umask *uint32 `json:"umask,omitempty" platform:"linux,solaris"` ++ Umask *uint32 `json:"umask,omitempty" platform:"linux,solaris,zos"` + // AdditionalGids are additional group ids set for the container's process. + AdditionalGids []uint32 `json:"additionalGids,omitempty" platform:"linux,solaris"` + // Username is the user name. +@@ -110,11 +170,16 @@ type Mount struct { + // Destination is the absolute path where the mount will be placed in the container. + Destination string `json:"destination"` + // Type specifies the mount kind. +- Type string `json:"type,omitempty" platform:"linux,solaris"` ++ Type string `json:"type,omitempty" platform:"linux,solaris,zos"` + // Source specifies the source path of the mount. + Source string `json:"source,omitempty"` + // Options are fstab style mount options. 
+ Options []string `json:"options,omitempty"` ++ ++ // UID/GID mappings used for changing file owners w/o calling chown, fs should support it. ++ // Every mount point could have its own mapping. ++ UIDMappings []LinuxIDMapping `json:"uidMappings,omitempty" platform:"linux"` ++ GIDMappings []LinuxIDMapping `json:"gidMappings,omitempty" platform:"linux"` + } + + // Hook specifies a command that is run at a particular event in the lifecycle of a container +@@ -130,6 +195,10 @@ type Hook struct { + type Hooks struct { + // Prestart is Deprecated. Prestart is a list of hooks to be run before the container process is executed. + // It is called in the Runtime Namespace ++ // ++ // Deprecated: use [Hooks.CreateRuntime], [Hooks.CreateContainer], and ++ // [Hooks.StartContainer] instead, which allow more granular hook control ++ // during the create and start phase. + Prestart []Hook `json:"prestart,omitempty"` + // CreateRuntime is a list of hooks to be run after the container has been created but before pivot_root or any equivalent operation has been called + // It is called in the Runtime Namespace +@@ -178,10 +247,12 @@ type Linux struct { + // MountLabel specifies the selinux context for the mounts in the container. + MountLabel string `json:"mountLabel,omitempty"` + // IntelRdt contains Intel Resource Director Technology (RDT) information for +- // handling resource constraints (e.g., L3 cache, memory bandwidth) for the container ++ // handling resource constraints and monitoring metrics (e.g., L3 cache, memory bandwidth) for the container + IntelRdt *LinuxIntelRdt `json:"intelRdt,omitempty"` + // Personality contains configuration for the Linux personality syscall + Personality *LinuxPersonality `json:"personality,omitempty"` ++ // TimeOffsets specifies the offset for supporting time namespaces. 
++ TimeOffsets map[string]LinuxTimeOffset `json:"timeOffsets,omitempty"` + } + + // LinuxNamespace is the configuration for a Linux namespace +@@ -211,6 +282,8 @@ const ( + UserNamespace LinuxNamespaceType = "user" + // CgroupNamespace for isolating cgroup hierarchies + CgroupNamespace LinuxNamespaceType = "cgroup" ++ // TimeNamespace for isolating the clocks ++ TimeNamespace LinuxNamespaceType = "time" + ) + + // LinuxIDMapping specifies UID/GID mappings +@@ -223,6 +296,14 @@ type LinuxIDMapping struct { + Size uint32 `json:"size"` + } + ++// LinuxTimeOffset specifies the offset for Time Namespace ++type LinuxTimeOffset struct { ++ // Secs is the offset of clock (in secs) in the container ++ Secs int64 `json:"secs,omitempty"` ++ // Nanosecs is the additional offset for Secs (in nanosecs) ++ Nanosecs uint32 `json:"nanosecs,omitempty"` ++} ++ + // POSIXRlimit type and restrictions + type POSIXRlimit struct { + // Type of the rlimit to set +@@ -233,12 +314,13 @@ type POSIXRlimit struct { + Soft uint64 `json:"soft"` + } + +-// LinuxHugepageLimit structure corresponds to limiting kernel hugepages ++// LinuxHugepageLimit structure corresponds to limiting kernel hugepages. ++// Default to reservation limits if supported. Otherwise fallback to page fault limits. + type LinuxHugepageLimit struct { +- // Pagesize is the hugepage size +- // Format: "<size><unit-prefix>B' (e.g. 64KB, 2MB, 1GB, etc.) ++ // Pagesize is the hugepage size. ++ // Format: "<size><unit-prefix>B' (e.g. 64KB, 2MB, 1GB, etc.). + Pagesize string `json:"pageSize"` +- // Limit is the limit of "hugepagesize" hugetlb usage ++ // Limit is the limit of "hugepagesize" hugetlb reservations (if supported) or usage. 
+ Limit uint64 `json:"limit"` + } + +@@ -250,8 +332,8 @@ type LinuxInterfacePriority struct { + Priority uint32 `json:"priority"` + } + +-// linuxBlockIODevice holds major:minor format supported in blkio cgroup +-type linuxBlockIODevice struct { ++// LinuxBlockIODevice holds major:minor format supported in blkio cgroup ++type LinuxBlockIODevice struct { + // Major is the device's major number. + Major int64 `json:"major"` + // Minor is the device's minor number. +@@ -260,7 +342,7 @@ type linuxBlockIODevice struct { + + // LinuxWeightDevice struct holds a `major:minor weight` pair for weightDevice + type LinuxWeightDevice struct { +- linuxBlockIODevice ++ LinuxBlockIODevice + // Weight is the bandwidth rate for the device. + Weight *uint16 `json:"weight,omitempty"` + // LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, CFQ scheduler only +@@ -269,7 +351,7 @@ type LinuxWeightDevice struct { + + // LinuxThrottleDevice struct holds a `major:minor rate_per_second` pair + type LinuxThrottleDevice struct { +- linuxBlockIODevice ++ LinuxBlockIODevice + // Rate is the IO rate limit per cgroup per device + Rate uint64 `json:"rate"` + } +@@ -301,6 +383,12 @@ type LinuxMemory struct { + // Total memory limit (memory + swap). + Swap *int64 `json:"swap,omitempty"` + // Kernel memory limit (in bytes). ++ // ++ // Deprecated: kernel-memory limits are not supported in cgroups v2, and ++ // were obsoleted in [kernel v5.4]. This field should no longer be used, ++ // as it may be ignored by runtimes. 
++ // ++ // [kernel v5.4]: https://github.com/torvalds/linux/commit/0158115f702b0ba208ab0 + Kernel *int64 `json:"kernel,omitempty"` + // Kernel memory limit for tcp (in bytes) + KernelTCP *int64 `json:"kernelTCP,omitempty"` +@@ -310,6 +398,10 @@ type LinuxMemory struct { + DisableOOMKiller *bool `json:"disableOOMKiller,omitempty"` + // Enables hierarchical memory accounting + UseHierarchy *bool `json:"useHierarchy,omitempty"` ++ // CheckBeforeUpdate enables checking if a new memory limit is lower ++ // than the current usage during update, and if so, rejecting the new ++ // limit. ++ CheckBeforeUpdate *bool `json:"checkBeforeUpdate,omitempty"` + } + + // LinuxCPU for Linux cgroup 'cpu' resource management +@@ -318,6 +410,9 @@ type LinuxCPU struct { + Shares *uint64 `json:"shares,omitempty"` + // CPU hardcap limit (in usecs). Allowed cpu time in a given period. + Quota *int64 `json:"quota,omitempty"` ++ // CPU hardcap burst limit (in usecs). Allowed accumulated cpu time additionally for burst in a ++ // given period. ++ Burst *uint64 `json:"burst,omitempty"` + // CPU period to be used for hardcapping (in usecs). + Period *uint64 `json:"period,omitempty"` + // How much time realtime scheduling may use (in usecs). +@@ -328,6 +423,8 @@ type LinuxCPU struct { + Cpus string `json:"cpus,omitempty"` + // List of memory nodes in the cpuset. Default is to use any available memory node. + Mems string `json:"mems,omitempty"` ++ // cgroups are configured with minimum weight, 0: default behavior, 1: SCHED_IDLE. ++ Idle *int64 `json:"idle,omitempty"` + } + + // LinuxPids for Linux cgroup 'pids' resource management (Linux 4.3) +@@ -364,7 +461,7 @@ type LinuxResources struct { + Pids *LinuxPids `json:"pids,omitempty"` + // BlockIO restriction configuration + BlockIO *LinuxBlockIO `json:"blockIO,omitempty"` +- // Hugetlb limit (in bytes) ++ // Hugetlb limits (in bytes). Default to reservation limits if supported. 
+ HugepageLimits []LinuxHugepageLimit `json:"hugepageLimits,omitempty"` + // Network restriction configuration + Network *LinuxNetwork `json:"network,omitempty"` +@@ -522,11 +619,21 @@ type WindowsMemoryResources struct { + + // WindowsCPUResources contains CPU resource management settings. + type WindowsCPUResources struct { +- // Number of CPUs available to the container. ++ // Count is the number of CPUs available to the container. It represents the ++ // fraction of the configured processor `count` in a container in relation ++ // to the processors available in the host. The fraction ultimately ++ // determines the portion of processor cycles that the threads in a ++ // container can use during each scheduling interval, as the number of ++ // cycles per 10,000 cycles. + Count *uint64 `json:"count,omitempty"` +- // CPU shares (relative weight to other containers with cpu shares). ++ // Shares limits the share of processor time given to the container relative ++ // to other workloads on the processor. The processor `shares` (`weight` at ++ // the platform level) is a value between 0 and 10000. + Shares *uint16 `json:"shares,omitempty"` +- // Specifies the portion of processor cycles that this container can use as a percentage times 100. ++ // Maximum determines the portion of processor cycles that the threads in a ++ // container can use during each scheduling interval, as the number of ++ // cycles per 10,000 cycles. Set processor `maximum` to a percentage times ++ // 100. + Maximum *uint16 `json:"maximum,omitempty"` + } + +@@ -613,6 +720,23 @@ type Arch string + // LinuxSeccompFlag is a flag to pass to seccomp(2). + type LinuxSeccompFlag string + ++const ( ++ // LinuxSeccompFlagLog is a seccomp flag to request all returned ++ // actions except SECCOMP_RET_ALLOW to be logged. An administrator may ++ // override this filter flag by preventing specific actions from being ++ // logged via the /proc/sys/kernel/seccomp/actions_logged file. 
(since ++ // Linux 4.14) ++ LinuxSeccompFlagLog LinuxSeccompFlag = "SECCOMP_FILTER_FLAG_LOG" ++ ++ // LinuxSeccompFlagSpecAllow can be used to disable Speculative Store ++ // Bypass mitigation. (since Linux 4.17) ++ LinuxSeccompFlagSpecAllow LinuxSeccompFlag = "SECCOMP_FILTER_FLAG_SPEC_ALLOW" ++ ++ // LinuxSeccompFlagWaitKillableRecv can be used to switch to the wait ++ // killable semantics. (since Linux 5.19) ++ LinuxSeccompFlagWaitKillableRecv LinuxSeccompFlag = "SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV" ++) ++ + // Additional architectures permitted to be used for system calls + // By default only the native architecture of the kernel is permitted + const ( +@@ -683,8 +807,9 @@ type LinuxSyscall struct { + Args []LinuxSeccompArg `json:"args,omitempty"` + } + +-// LinuxIntelRdt has container runtime resource constraints for Intel RDT +-// CAT and MBA features which introduced in Linux 4.10 and 4.12 kernel ++// LinuxIntelRdt has container runtime resource constraints for Intel RDT CAT and MBA ++// features and flags enabling Intel RDT CMT and MBM features. ++// Intel RDT features are available in Linux 4.14 and newer kernel versions. + type LinuxIntelRdt struct { + // The identity for RDT Class of Service + ClosID string `json:"closID,omitempty"` +@@ -697,4 +822,76 @@ type LinuxIntelRdt struct { + // The unit of memory bandwidth is specified in "percentages" by + // default, and in "MBps" if MBA Software Controller is enabled. + MemBwSchema string `json:"memBwSchema,omitempty"` ++ ++ // EnableCMT is the flag to indicate if the Intel RDT CMT is enabled. CMT (Cache Monitoring Technology) supports monitoring of ++ // the last-level cache (LLC) occupancy for the container. ++ EnableCMT bool `json:"enableCMT,omitempty"` ++ ++ // EnableMBM is the flag to indicate if the Intel RDT MBM is enabled. MBM (Memory Bandwidth Monitoring) supports monitoring of ++ // total and local memory bandwidth for the container. 
++ EnableMBM bool `json:"enableMBM,omitempty"` ++} ++ ++// ZOS contains platform-specific configuration for z/OS based containers. ++type ZOS struct { ++ // Devices are a list of device nodes that are created for the container ++ Devices []ZOSDevice `json:"devices,omitempty"` ++} ++ ++// ZOSDevice represents the mknod information for a z/OS special device file ++type ZOSDevice struct { ++ // Path to the device. ++ Path string `json:"path"` ++ // Device type, block, char, etc. ++ Type string `json:"type"` ++ // Major is the device's major number. ++ Major int64 `json:"major"` ++ // Minor is the device's minor number. ++ Minor int64 `json:"minor"` ++ // FileMode permission bits for the device. ++ FileMode *os.FileMode `json:"fileMode,omitempty"` ++ // UID of the device. ++ UID *uint32 `json:"uid,omitempty"` ++ // Gid of the device. ++ GID *uint32 `json:"gid,omitempty"` + } ++ ++// LinuxSchedulerPolicy represents different scheduling policies used with the Linux Scheduler ++type LinuxSchedulerPolicy string ++ ++const ( ++ // SchedOther is the default scheduling policy ++ SchedOther LinuxSchedulerPolicy = "SCHED_OTHER" ++ // SchedFIFO is the First-In-First-Out scheduling policy ++ SchedFIFO LinuxSchedulerPolicy = "SCHED_FIFO" ++ // SchedRR is the Round-Robin scheduling policy ++ SchedRR LinuxSchedulerPolicy = "SCHED_RR" ++ // SchedBatch is the Batch scheduling policy ++ SchedBatch LinuxSchedulerPolicy = "SCHED_BATCH" ++ // SchedISO is the Isolation scheduling policy ++ SchedISO LinuxSchedulerPolicy = "SCHED_ISO" ++ // SchedIdle is the Idle scheduling policy ++ SchedIdle LinuxSchedulerPolicy = "SCHED_IDLE" ++ // SchedDeadline is the Deadline scheduling policy ++ SchedDeadline LinuxSchedulerPolicy = "SCHED_DEADLINE" ++) ++ ++// LinuxSchedulerFlag represents the flags used by the Linux Scheduler. 
++type LinuxSchedulerFlag string ++ ++const ( ++ // SchedFlagResetOnFork represents the reset on fork scheduling flag ++ SchedFlagResetOnFork LinuxSchedulerFlag = "SCHED_FLAG_RESET_ON_FORK" ++ // SchedFlagReclaim represents the reclaim scheduling flag ++ SchedFlagReclaim LinuxSchedulerFlag = "SCHED_FLAG_RECLAIM" ++ // SchedFlagDLOverrun represents the deadline overrun scheduling flag ++ SchedFlagDLOverrun LinuxSchedulerFlag = "SCHED_FLAG_DL_OVERRUN" ++ // SchedFlagKeepPolicy represents the keep policy scheduling flag ++ SchedFlagKeepPolicy LinuxSchedulerFlag = "SCHED_FLAG_KEEP_POLICY" ++ // SchedFlagKeepParams represents the keep parameters scheduling flag ++ SchedFlagKeepParams LinuxSchedulerFlag = "SCHED_FLAG_KEEP_PARAMS" ++ // SchedFlagUtilClampMin represents the utilization clamp minimum scheduling flag ++ SchedFlagUtilClampMin LinuxSchedulerFlag = "SCHED_FLAG_UTIL_CLAMP_MIN" ++ // SchedFlagUtilClampMin represents the utilization clamp maximum scheduling flag ++ SchedFlagUtilClampMax LinuxSchedulerFlag = "SCHED_FLAG_UTIL_CLAMP_MAX" ++) +diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go +index 596af0c2..f6c15f6c 100644 +--- a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go ++++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go +@@ -6,12 +6,12 @@ const ( + // VersionMajor is for an API incompatible changes + VersionMajor = 1 + // VersionMinor is for functionality in a backwards-compatible manner +- VersionMinor = 0 ++ VersionMinor = 2 + // VersionPatch is for backwards-compatible bug fixes +- VersionPatch = 2 ++ VersionPatch = 0 + + // VersionDev indicates development branch. Releases will be empty string. +- VersionDev = "-dev" ++ VersionDev = "+dev" + ) + + // Version is the specification version that the package types support. 
+diff --git a/vendor/modules.txt b/vendor/modules.txt +index a5537dfe..40089cd4 100644 +--- a/vendor/modules.txt ++++ b/vendor/modules.txt +@@ -35,7 +35,7 @@ github.com/moby/sys/mountinfo + # github.com/mrunalp/fileutils v0.5.1 + ## explicit; go 1.13 + github.com/mrunalp/fileutils +-# github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 ++# github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95 + ## explicit + github.com/opencontainers/runtime-spec/specs-go + # github.com/opencontainers/selinux v1.10.0 +-- +2.47.1 + diff --git a/SOURCES/0002-1.1-runc-exec-implement-CPU-affinity.patch b/SOURCES/0002-1.1-runc-exec-implement-CPU-affinity.patch new file mode 100644 index 0000000..d04d619 --- /dev/null +++ b/SOURCES/0002-1.1-runc-exec-implement-CPU-affinity.patch @@ -0,0 +1,475 @@ +From 1af672a2635628ca24ce3b5ed3344d316548f1ca Mon Sep 17 00:00:00 2001 +From: Kir Kolyshkin +Date: Mon, 21 Oct 2024 15:50:38 -0700 +Subject: [PATCH 2/2] [1.1] runc exec: implement CPU affinity + +As per +- https://github.com/opencontainers/runtime-spec/pull/1253 +- https://github.com/opencontainers/runtime-spec/pull/1261 + +CPU affinity can be set in two ways: +1. When creating/starting a container, in config.json's + Process.ExecCPUAffinity, which is when applied to all execs. +2. When running an exec, in process.json's CPUAffinity, which + applied to a given exec and overrides the value from (1). + +Add some basic tests. + +Note that older kernels (RHEL8, Ubuntu 20.04) change CPU affinity of a +process to that of a container's cgroup, as soon as it is moved to that +cgroup, while newer kernels (Ubuntu 24.04, Fedora 41) don't do that. 
+ +Because of the above, + - it's impossible to really test initial CPU affinity without adding + debug logging to libcontainer/nsenter; + - for older kernels, there can be a brief moment when exec's affinity + is different than either initial or final affinity being set; + - exec's final CPU affinity, if not specified, can be different + depending on the kernel, therefore we don't test it. + +Signed-off-by: Kir Kolyshkin +--- + libcontainer/configs/config.go | 73 ++++++++++++++++++++ + libcontainer/container_linux.go | 4 ++ + libcontainer/init_linux.go | 1 + + libcontainer/nsenter/nsexec.c | 36 +++++++++- + libcontainer/process.go | 2 + + libcontainer/process_linux.go | 51 +++++++++++++- + libcontainer/specconv/spec_linux.go | 5 ++ + tests/integration/cpu_affinity.bats | 101 ++++++++++++++++++++++++++++ + utils_linux.go | 6 ++ + 9 files changed, 275 insertions(+), 4 deletions(-) + create mode 100644 tests/integration/cpu_affinity.bats + +diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go +index 6ebf5ec7..997f2724 100644 +--- a/libcontainer/configs/config.go ++++ b/libcontainer/configs/config.go +@@ -3,11 +3,15 @@ package configs + import ( + "bytes" + "encoding/json" ++ "errors" + "fmt" + "os/exec" ++ "strconv" ++ "strings" + "time" + + "github.com/sirupsen/logrus" ++ "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runtime-spec/specs-go" +@@ -211,6 +215,75 @@ type Config struct { + // RootlessCgroups is set when unlikely to have the full access to cgroups. + // When RootlessCgroups is set, cgroups errors are ignored. + RootlessCgroups bool `json:"rootless_cgroups,omitempty"` ++ ++ // ExecCPUAffinity is CPU affinity for a non-init process to be run in the container. 
++ ExecCPUAffinity *CPUAffinity `json:"exec_cpu_affinity,omitempty"` ++} ++ ++type CPUAffinity struct { ++ Initial, Final *unix.CPUSet ++} ++ ++func toCPUSet(str string) (*unix.CPUSet, error) { ++ if str == "" { ++ return nil, nil ++ } ++ s := new(unix.CPUSet) ++ for _, r := range strings.Split(str, ",") { ++ // Allow extra spaces around. ++ r = strings.TrimSpace(r) ++ // Allow empty elements (extra commas). ++ if r == "" { ++ continue ++ } ++ if r0, r1, found := strings.Cut(r, "-"); found { ++ start, err := strconv.ParseUint(r0, 10, 32) ++ if err != nil { ++ return nil, err ++ } ++ end, err := strconv.ParseUint(r1, 10, 32) ++ if err != nil { ++ return nil, err ++ } ++ if start > end { ++ return nil, errors.New("invalid range: " + r) ++ } ++ for i := int(start); i <= int(end); i++ { ++ s.Set(i) ++ } ++ } else { ++ val, err := strconv.ParseUint(r, 10, 32) ++ if err != nil { ++ return nil, err ++ } ++ s.Set(int(val)) ++ } ++ } ++ ++ return s, nil ++} ++ ++// ConvertCPUAffinity converts [specs.CPUAffinity] to [CPUAffinity]. 
++func ConvertCPUAffinity(sa *specs.CPUAffinity) (*CPUAffinity, error) { ++ if sa == nil { ++ return nil, nil ++ } ++ initial, err := toCPUSet(sa.Initial) ++ if err != nil { ++ return nil, fmt.Errorf("bad CPUAffinity.Initial: %w", err) ++ } ++ final, err := toCPUSet(sa.Final) ++ if err != nil { ++ return nil, fmt.Errorf("bad CPUAffinity.Final: %w", err) ++ } ++ if initial == nil && final == nil { ++ return nil, nil ++ } ++ ++ return &CPUAffinity{ ++ Initial: initial, ++ Final: final, ++ }, nil + } + + type ( +diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go +index 40b332f9..68b6a74f 100644 +--- a/libcontainer/container_linux.go ++++ b/libcontainer/container_linux.go +@@ -692,6 +692,7 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { + AppArmorProfile: c.config.AppArmorProfile, + ProcessLabel: c.config.ProcessLabel, + Rlimits: c.config.Rlimits, ++ CPUAffinity: c.config.ExecCPUAffinity, + CreateConsole: process.ConsoleSocket != nil, + ConsoleWidth: process.ConsoleWidth, + ConsoleHeight: process.ConsoleHeight, +@@ -708,6 +709,9 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { + if len(process.Rlimits) > 0 { + cfg.Rlimits = process.Rlimits + } ++ if process.CPUAffinity != nil { ++ cfg.CPUAffinity = process.CPUAffinity ++ } + if cgroups.IsCgroup2UnifiedMode() { + cfg.Cgroup2Path = c.cgroupManager.Path("") + } +diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go +index d9f18139..1f8562ec 100644 +--- a/libcontainer/init_linux.go ++++ b/libcontainer/init_linux.go +@@ -70,6 +70,7 @@ type initConfig struct { + RootlessCgroups bool `json:"rootless_cgroups,omitempty"` + SpecState *specs.State `json:"spec_state,omitempty"` + Cgroup2Path string `json:"cgroup2_path,omitempty"` ++ CPUAffinity *configs.CPUAffinity `json:"cpu_affinity,omitempty"` + } + + type initer interface { +diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c +index 2d224bab..6f70aa87 100644 
+--- a/libcontainer/nsenter/nsexec.c ++++ b/libcontainer/nsenter/nsexec.c +@@ -149,13 +149,18 @@ int setns(int fd, int nstype) + } + #endif + ++bool log_enabled_for(int level) ++{ ++ return (logfd >= 0 && level <= loglevel); ++} ++ + static void write_log(int level, const char *format, ...) + { + char *message = NULL, *stage = NULL, *json = NULL; + va_list args; + int ret; + +- if (logfd < 0 || level > loglevel) ++ if (!log_enabled_for(level)) + goto out; + + va_start(args, format); +@@ -851,6 +856,25 @@ void try_unshare(int flags, const char *msg) + bail("failed to unshare %s", msg); + } + ++void print_cpu_affinity() ++{ ++ cpu_set_t cpus = { }; ++ size_t i, mask = 0; ++ ++ if (sched_getaffinity(0, sizeof(cpus), &cpus) < 0) { ++ write_log(WARNING, "sched_getaffinity: %m"); ++ return; ++ } ++ ++ /* Do not print the complete mask, we only need a few first CPUs. */ ++ for (i = 0; i < sizeof(mask) * 8; i++) { ++ if (CPU_ISSET(i, &cpus)) ++ mask |= 1 << i; ++ } ++ ++ write_log(DEBUG, "affinity: 0x%zx", mask); ++} ++ + void nsexec(void) + { + int pipenum; +@@ -892,6 +916,16 @@ void nsexec(void) + + write_log(DEBUG, "=> nsexec container setup"); + ++ /* This is for ../../tests/integration/cpu_affinity.bats test only. ++ * ++ * Printing this from Go code might be too late as some kernels ++ * change the process' CPU affinity to that of container's cpuset ++ * as soon as the process is moved into container's cgroup. ++ */ ++ if (log_enabled_for(DEBUG)) { ++ print_cpu_affinity(); ++ } ++ + /* Parse all of the netlink configuration. */ + nl_parse(pipenum, &config); + +diff --git a/libcontainer/process.go b/libcontainer/process.go +index 8a5d340d..99167274 100644 +--- a/libcontainer/process.go ++++ b/libcontainer/process.go +@@ -89,6 +89,8 @@ type Process struct { + // + // For cgroup v2, the only key allowed is "". + SubCgroupPaths map[string]string ++ ++ CPUAffinity *configs.CPUAffinity + } + + // Wait waits for the process to exit. 
+diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go +index 0d9ceb9c..3b48ae76 100644 +--- a/libcontainer/process_linux.go ++++ b/libcontainer/process_linux.go +@@ -9,6 +9,7 @@ import ( + "os" + "os/exec" + "path/filepath" ++ "runtime" + "strconv" + "time" + +@@ -78,12 +79,52 @@ func (p *setnsProcess) signal(sig os.Signal) error { + return unix.Kill(p.pid(), s) + } + ++// Starts setns process with specified initial CPU affinity. ++func (p *setnsProcess) startWithCPUAffinity() error { ++ aff := p.config.CPUAffinity ++ if aff == nil || aff.Initial == nil { ++ return p.cmd.Start() ++ } ++ errCh := make(chan error) ++ defer close(errCh) ++ ++ // Use a goroutine to dedicate an OS thread. ++ go func() { ++ runtime.LockOSThread() ++ // Command inherits the CPU affinity. ++ if err := unix.SchedSetaffinity(unix.Gettid(), aff.Initial); err != nil { ++ runtime.UnlockOSThread() ++ errCh <- fmt.Errorf("error setting initial CPU affinity: %w", err) ++ return ++ } ++ ++ errCh <- p.cmd.Start() ++ // Deliberately omit runtime.UnlockOSThread here. ++ // https://pkg.go.dev/runtime#LockOSThread says: ++ // "If the calling goroutine exits without unlocking the ++ // thread, the thread will be terminated". ++ }() ++ ++ return <-errCh ++} ++ ++func (p *setnsProcess) setFinalCPUAffinity() error { ++ aff := p.config.CPUAffinity ++ if aff == nil || aff.Final == nil { ++ return nil ++ } ++ if err := unix.SchedSetaffinity(p.pid(), aff.Final); err != nil { ++ return fmt.Errorf("error setting final CPU affinity: %w", err) ++ } ++ return nil ++} ++ + func (p *setnsProcess) start() (retErr error) { + defer p.messageSockPair.parent.Close() +- // get the "before" value of oom kill count ++ // Get the "before" value of oom kill count. + oom, _ := p.manager.OOMKillCount() +- err := p.cmd.Start() +- // close the write-side of the pipes (controlled by child) ++ err := p.startWithCPUAffinity() ++ // Close the child-side of the pipes (controlled by child). 
+ p.messageSockPair.child.Close() + p.logFilePair.child.Close() + if err != nil { +@@ -143,6 +184,10 @@ func (p *setnsProcess) start() (retErr error) { + } + } + } ++ // Set final CPU affinity right after the process is moved into container's cgroup. ++ if err := p.setFinalCPUAffinity(); err != nil { ++ return err ++ } + if p.intelRdtPath != "" { + // if Intel RDT "resource control" filesystem path exists + _, err := os.Stat(p.intelRdtPath) +diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go +index 7dbfb869..b59e0d59 100644 +--- a/libcontainer/specconv/spec_linux.go ++++ b/libcontainer/specconv/spec_linux.go +@@ -493,6 +493,11 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { + Ambient: spec.Process.Capabilities.Ambient, + } + } ++ config.ExecCPUAffinity, err = configs.ConvertCPUAffinity(spec.Process.ExecCPUAffinity) ++ if err != nil { ++ return nil, err ++ } ++ + } + createHooks(spec, config) + config.Version = specs.Version +diff --git a/tests/integration/cpu_affinity.bats b/tests/integration/cpu_affinity.bats +new file mode 100644 +index 00000000..f6adfa2a +--- /dev/null ++++ b/tests/integration/cpu_affinity.bats +@@ -0,0 +1,101 @@ ++#!/usr/bin/env bats ++# Exec CPU affinity tests. For more details, see: ++# - https://github.com/opencontainers/runtime-spec/pull/1253 ++ ++load helpers ++ ++function setup() { ++ requires smp cgroups_cpuset ++ setup_busybox ++} ++ ++function teardown() { ++ teardown_bundle ++} ++ ++function first_cpu() { ++ sed 's/[-,].*//g' </sys/devices/system/cpu/online ++} ++ ++# Convert list of cpus ("0,1" or "0-1") to mask as printed by nsexec. ++# NOTE the range conversion is not proper, merely sufficient for tests here. ++function cpus_to_mask() { ++ local cpus=$* mask=0 ++ ++ cpus=${cpus//,/-} # 1. "," --> "-". ++ cpus=${cpus//-/ } # 2. "-" --> " ". ++ ++ for c in $cpus; do ++ mask=$((mask | 1 << c)) ++ done ++ ++ printf "0x%x" $mask ++} ++ ++@test "runc exec [CPU affinity, only initial set from process.json]" { ++ first="$(first_cpu)" ++ second=$((first + 1)) # Hacky; might not work in all environments.
++ ++ runc run -d --console-socket "$CONSOLE_SOCKET" ct1 ++ [ "$status" -eq 0 ] ++ ++ for cpus in "$second" "$first-$second" "$first,$second" "$first"; do ++ proc=' ++{ ++ "terminal": false, ++ "execCPUAffinity": { ++ "initial": "'$cpus'" ++ }, ++ "args": [ "/bin/true" ], ++ "cwd": "/" ++}' ++ mask=$(cpus_to_mask "$cpus") ++ echo "CPUS: $cpus, mask: $mask" ++ runc --debug exec --process <(echo "$proc") ct1 ++ [[ "$output" == *"nsexec"*": affinity: $mask"* ]] ++ done ++} ++ ++@test "runc exec [CPU affinity, initial and final set from process.json]" { ++ first="$(first_cpu)" ++ second=$((first + 1)) # Hacky; might not work in all environments. ++ ++ runc run -d --console-socket "$CONSOLE_SOCKET" ct1 ++ [ "$status" -eq 0 ] ++ ++ for cpus in "$second" "$first-$second" "$first,$second" "$first"; do ++ proc=' ++{ ++ "terminal": false, ++ "execCPUAffinity": { ++ "initial": "'$cpus'", ++ "final": "'$cpus'" ++ }, ++ "args": [ "/bin/grep", "-F", "Cpus_allowed_list:", "/proc/self/status" ], ++ "cwd": "/" ++}' ++ mask=$(cpus_to_mask "$cpus") ++ exp=${cpus//,/-} # "," --> "-". ++ echo "CPUS: $cpus, mask: $mask, final: $exp" ++ runc --debug exec --process <(echo "$proc") ct1 ++ [[ "$output" == *"nsexec"*": affinity: $mask"* ]] ++ [[ "$output" == *"Cpus_allowed_list: $exp"* ]] # Mind the literal tab. ++ done ++} ++ ++@test "runc exec [CPU affinity, initial and final set from config.json]" { ++ initial="$(first_cpu)" ++ final=$((initial + 1)) # Hacky; might not work in all environments. ++ ++ update_config " .process.execCPUAffinity.initial = \"$initial\" ++ | .process.execCPUAffinity.final = \"$final\"" ++ ++ runc run -d --console-socket "$CONSOLE_SOCKET" ct1 ++ [ "$status" -eq 0 ] ++ ++ runc --debug exec ct1 grep "Cpus_allowed_list:" /proc/self/status ++ [ "$status" -eq 0 ] ++ mask=$(cpus_to_mask "$initial") ++ [[ "$output" == *"nsexec"*": affinity: $mask"* ]] ++ [[ "$output" == *"Cpus_allowed_list: $final"* ]] # Mind the literal tab. 
++} +diff --git a/utils_linux.go b/utils_linux.go +index 60d534e8..30204133 100644 +--- a/utils_linux.go ++++ b/utils_linux.go +@@ -109,6 +109,12 @@ func newProcess(p specs.Process) (*libcontainer.Process, error) { + } + lp.Rlimits = append(lp.Rlimits, rl) + } ++ aff, err := configs.ConvertCPUAffinity(p.ExecCPUAffinity) ++ if err != nil { ++ return nil, err ++ } ++ lp.CPUAffinity = aff ++ + return lp, nil + } + +-- +2.47.1 + diff --git a/SPECS/runc.spec b/SPECS/runc.spec index 12c8778..e90fd43 100644 --- a/SPECS/runc.spec +++ b/SPECS/runc.spec @@ -23,7 +23,7 @@ go build -buildmode pie -compiler gc -tags="rpm_crashtraceback libtrust_openssl Epoch: 1 Name: %{repo} Version: 1.1.12 -Release: 5%{?dist} +Release: 6%{?dist} Summary: CLI for running Open Containers # https://fedoraproject.org/wiki/PackagingDrafts/Go#Go_Language_Architectures #ExclusiveArch: %%{go_arches} @@ -33,6 +33,8 @@ ExcludeArch: %{ix86} License: ASL 2.0 URL: %{git0} Source0: %{git0}/archive/v%{version}.tar.gz +Patch0: 0001-1.1-Bump-runtime-spec-to-latest-git-HEAD.patch +Patch1: 0002-1.1-runc-exec-implement-CPU-affinity.patch Provides: oci-runtime BuildRequires: golang >= 1.21.4 BuildRequires: git @@ -85,6 +87,10 @@ make install install-man install-bash DESTDIR=$RPM_BUILD_ROOT PREFIX=%{_prefix} %{_datadir}/bash-completion/completions/%{name} %changelog +* Mon Jan 20 2025 Jindrich Novy - 1:1.1.12-6 +- Add CPU affinity feature from Kir Kolyshkin +- Resolves: RHEL-74865 + * Tue Oct 01 2024 Kir Kolyshkin - 1:1.1.12-5 - bump golang buildrequires - add no_openssl build tag