Compare commits
	
		
			No commits in common. "c8-stream-1.0" and "c8-stream-rhel8" have entirely different histories.
		
	
	
		
			c8-stream-
			...
			c8-stream-
		
	
		
							
								
								
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -1 +1 @@ | ||||
| SOURCES/runc-2abd837.tar.gz | ||||
| SOURCES/v1.1.12.tar.gz | ||||
|  | ||||
| @ -1 +1 @@ | ||||
| cf7119a838db2963e7af6ecdba90a2cc95ec0d56 SOURCES/runc-2abd837.tar.gz | ||||
| 3fac650358578b8694012a44b1d5b156523c3402 SOURCES/v1.1.12.tar.gz | ||||
|  | ||||
							
								
								
									
										508
									
								
								SOURCES/0001-1.1-Bump-runtime-spec-to-latest-git-HEAD.patch
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										508
									
								
								SOURCES/0001-1.1-Bump-runtime-spec-to-latest-git-HEAD.patch
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,508 @@ | ||||
| From 50f50245235097b0c87b31e97b86fd11685232a3 Mon Sep 17 00:00:00 2001 | ||||
| From: Kir Kolyshkin <kolyshkin@gmail.com> | ||||
| Date: Thu, 16 Jan 2025 15:40:28 -0800 | ||||
| Subject: [PATCH 1/2] [1.1] Bump runtime-spec to latest git HEAD | ||||
| 
 | ||||
| This is to include | ||||
|  - https://github.com/opencontainers/runtime-spec/pull/1261 | ||||
|  - https://github.com/opencontainers/runtime-spec/pull/1253 | ||||
| 
 | ||||
| Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com> | ||||
| ---
 | ||||
|  go.mod                                        |   2 +- | ||||
|  go.sum                                        |   4 +- | ||||
|  .../runtime-spec/specs-go/config.go           | 239 ++++++++++++++++-- | ||||
|  .../runtime-spec/specs-go/version.go          |   6 +- | ||||
|  vendor/modules.txt                            |   2 +- | ||||
|  5 files changed, 225 insertions(+), 28 deletions(-) | ||||
| 
 | ||||
| diff --git a/go.mod b/go.mod
 | ||||
| index f51b6432..87c8d4b4 100644
 | ||||
| --- a/go.mod
 | ||||
| +++ b/go.mod
 | ||||
| @@ -12,7 +12,7 @@ require (
 | ||||
|  	github.com/godbus/dbus/v5 v5.0.6 | ||||
|  	github.com/moby/sys/mountinfo v0.5.0 | ||||
|  	github.com/mrunalp/fileutils v0.5.1 | ||||
| -	github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417
 | ||||
| +	github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95
 | ||||
|  	github.com/opencontainers/selinux v1.10.0 | ||||
|  	github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646 | ||||
|  	github.com/sirupsen/logrus v1.8.1 | ||||
| diff --git a/go.sum b/go.sum
 | ||||
| index ecabd398..9d3bedc0 100644
 | ||||
| --- a/go.sum
 | ||||
| +++ b/go.sum
 | ||||
| @@ -33,8 +33,8 @@ github.com/moby/sys/mountinfo v0.5.0 h1:2Ks8/r6lopsxWi9m58nlwjaeSzUX9iiL1vj5qB/9
 | ||||
|  github.com/moby/sys/mountinfo v0.5.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU= | ||||
|  github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q= | ||||
|  github.com/mrunalp/fileutils v0.5.1/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ= | ||||
| -github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 h1:3snG66yBm59tKhhSPQrQ/0bCrv1LQbKt40LnUPiUxdc=
 | ||||
| -github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
 | ||||
| +github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95 h1:Ghl8Z3l+yPQUDSxAp7Kg7fJLRNNXjOsR6ooDcca7PjU=
 | ||||
| +github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
 | ||||
|  github.com/opencontainers/selinux v1.10.0 h1:rAiKF8hTcgLI3w0DHm6i0ylVVcOrlgR1kK99DRLDhyU= | ||||
|  github.com/opencontainers/selinux v1.10.0/go.mod h1:2i0OySw99QjzBBQByd1Gr9gSjvuho1lHsJxIJ3gGbJI= | ||||
|  github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= | ||||
| diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
 | ||||
| index 6a7a91e5..671f0d01 100644
 | ||||
| --- a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
 | ||||
| +++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
 | ||||
| @@ -12,10 +12,12 @@ type Spec struct {
 | ||||
|  	Root *Root `json:"root,omitempty"` | ||||
|  	// Hostname configures the container's hostname. | ||||
|  	Hostname string `json:"hostname,omitempty"` | ||||
| +	// Domainname configures the container's domainname.
 | ||||
| +	Domainname string `json:"domainname,omitempty"`
 | ||||
|  	// Mounts configures additional mounts (on top of Root). | ||||
|  	Mounts []Mount `json:"mounts,omitempty"` | ||||
|  	// Hooks configures callbacks for container lifecycle events. | ||||
| -	Hooks *Hooks `json:"hooks,omitempty" platform:"linux,solaris"`
 | ||||
| +	Hooks *Hooks `json:"hooks,omitempty" platform:"linux,solaris,zos"`
 | ||||
|  	// Annotations contains arbitrary metadata for the container. | ||||
|  	Annotations map[string]string `json:"annotations,omitempty"` | ||||
|   | ||||
| @@ -27,6 +29,36 @@ type Spec struct {
 | ||||
|  	Windows *Windows `json:"windows,omitempty" platform:"windows"` | ||||
|  	// VM specifies configuration for virtual-machine-based containers. | ||||
|  	VM *VM `json:"vm,omitempty" platform:"vm"` | ||||
| +	// ZOS is platform-specific configuration for z/OS based containers.
 | ||||
| +	ZOS *ZOS `json:"zos,omitempty" platform:"zos"`
 | ||||
| +}
 | ||||
| +
 | ||||
| +// Scheduler represents the scheduling attributes for a process. It is based on
 | ||||
| +// the Linux sched_setattr(2) syscall.
 | ||||
| +type Scheduler struct {
 | ||||
| +	// Policy represents the scheduling policy (e.g., SCHED_FIFO, SCHED_RR, SCHED_OTHER).
 | ||||
| +	Policy LinuxSchedulerPolicy `json:"policy"`
 | ||||
| +
 | ||||
| +	// Nice is the nice value for the process, which affects its priority.
 | ||||
| +	Nice int32 `json:"nice,omitempty"`
 | ||||
| +
 | ||||
| +	// Priority represents the static priority of the process.
 | ||||
| +	Priority int32 `json:"priority,omitempty"`
 | ||||
| +
 | ||||
| +	// Flags is an array of scheduling flags.
 | ||||
| +	Flags []LinuxSchedulerFlag `json:"flags,omitempty"`
 | ||||
| +
 | ||||
| +	// The following ones are used by the DEADLINE scheduler.
 | ||||
| +
 | ||||
| +	// Runtime is the amount of time in nanoseconds during which the process
 | ||||
| +	// is allowed to run in a given period.
 | ||||
| +	Runtime uint64 `json:"runtime,omitempty"`
 | ||||
| +
 | ||||
| +	// Deadline is the absolute deadline for the process to complete its execution.
 | ||||
| +	Deadline uint64 `json:"deadline,omitempty"`
 | ||||
| +
 | ||||
| +	// Period is the length of the period in nanoseconds used for determining the process runtime.
 | ||||
| +	Period uint64 `json:"period,omitempty"`
 | ||||
|  } | ||||
|   | ||||
|  // Process contains information to start a specific application inside the container. | ||||
| @@ -49,15 +81,21 @@ type Process struct {
 | ||||
|  	// Capabilities are Linux capabilities that are kept for the process. | ||||
|  	Capabilities *LinuxCapabilities `json:"capabilities,omitempty" platform:"linux"` | ||||
|  	// Rlimits specifies rlimit options to apply to the process. | ||||
| -	Rlimits []POSIXRlimit `json:"rlimits,omitempty" platform:"linux,solaris"`
 | ||||
| +	Rlimits []POSIXRlimit `json:"rlimits,omitempty" platform:"linux,solaris,zos"`
 | ||||
|  	// NoNewPrivileges controls whether additional privileges could be gained by processes in the container. | ||||
|  	NoNewPrivileges bool `json:"noNewPrivileges,omitempty" platform:"linux"` | ||||
|  	// ApparmorProfile specifies the apparmor profile for the container. | ||||
|  	ApparmorProfile string `json:"apparmorProfile,omitempty" platform:"linux"` | ||||
|  	// Specify an oom_score_adj for the container. | ||||
|  	OOMScoreAdj *int `json:"oomScoreAdj,omitempty" platform:"linux"` | ||||
| +	// Scheduler specifies the scheduling attributes for a process
 | ||||
| +	Scheduler *Scheduler `json:"scheduler,omitempty" platform:"linux"`
 | ||||
|  	// SelinuxLabel specifies the selinux context that the container process is run as. | ||||
|  	SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"` | ||||
| +	// IOPriority contains the I/O priority settings for the cgroup.
 | ||||
| +	IOPriority *LinuxIOPriority `json:"ioPriority,omitempty" platform:"linux"`
 | ||||
| +	// ExecCPUAffinity specifies CPU affinity for exec processes.
 | ||||
| +	ExecCPUAffinity *CPUAffinity `json:"execCPUAffinity,omitempty" platform:"linux"`
 | ||||
|  } | ||||
|   | ||||
|  // LinuxCapabilities specifies the list of allowed capabilities that are kept for a process. | ||||
| @@ -75,6 +113,28 @@ type LinuxCapabilities struct {
 | ||||
|  	Ambient []string `json:"ambient,omitempty" platform:"linux"` | ||||
|  } | ||||
|   | ||||
| +// IOPriority represents I/O priority settings for the container's processes within the process group.
 | ||||
| +type LinuxIOPriority struct {
 | ||||
| +	Class    IOPriorityClass `json:"class"`
 | ||||
| +	Priority int             `json:"priority"`
 | ||||
| +}
 | ||||
| +
 | ||||
| +// IOPriorityClass represents an I/O scheduling class.
 | ||||
| +type IOPriorityClass string
 | ||||
| +
 | ||||
| +// Possible values for IOPriorityClass.
 | ||||
| +const (
 | ||||
| +	IOPRIO_CLASS_RT   IOPriorityClass = "IOPRIO_CLASS_RT"
 | ||||
| +	IOPRIO_CLASS_BE   IOPriorityClass = "IOPRIO_CLASS_BE"
 | ||||
| +	IOPRIO_CLASS_IDLE IOPriorityClass = "IOPRIO_CLASS_IDLE"
 | ||||
| +)
 | ||||
| +
 | ||||
| +// CPUAffinity specifies process' CPU affinity.
 | ||||
| +type CPUAffinity struct {
 | ||||
| +	Initial string `json:"initial,omitempty"`
 | ||||
| +	Final   string `json:"final,omitempty"`
 | ||||
| +}
 | ||||
| +
 | ||||
|  // Box specifies dimensions of a rectangle. Used for specifying the size of a console. | ||||
|  type Box struct { | ||||
|  	// Height is the vertical dimension of a box. | ||||
| @@ -86,11 +146,11 @@ type Box struct {
 | ||||
|  // User specifies specific user (and group) information for the container process. | ||||
|  type User struct { | ||||
|  	// UID is the user id. | ||||
| -	UID uint32 `json:"uid" platform:"linux,solaris"`
 | ||||
| +	UID uint32 `json:"uid" platform:"linux,solaris,zos"`
 | ||||
|  	// GID is the group id. | ||||
| -	GID uint32 `json:"gid" platform:"linux,solaris"`
 | ||||
| +	GID uint32 `json:"gid" platform:"linux,solaris,zos"`
 | ||||
|  	// Umask is the umask for the init process. | ||||
| -	Umask *uint32 `json:"umask,omitempty" platform:"linux,solaris"`
 | ||||
| +	Umask *uint32 `json:"umask,omitempty" platform:"linux,solaris,zos"`
 | ||||
|  	// AdditionalGids are additional group ids set for the container's process. | ||||
|  	AdditionalGids []uint32 `json:"additionalGids,omitempty" platform:"linux,solaris"` | ||||
|  	// Username is the user name. | ||||
| @@ -110,11 +170,16 @@ type Mount struct {
 | ||||
|  	// Destination is the absolute path where the mount will be placed in the container. | ||||
|  	Destination string `json:"destination"` | ||||
|  	// Type specifies the mount kind. | ||||
| -	Type string `json:"type,omitempty" platform:"linux,solaris"`
 | ||||
| +	Type string `json:"type,omitempty" platform:"linux,solaris,zos"`
 | ||||
|  	// Source specifies the source path of the mount. | ||||
|  	Source string `json:"source,omitempty"` | ||||
|  	// Options are fstab style mount options. | ||||
|  	Options []string `json:"options,omitempty"` | ||||
| +
 | ||||
| +	// UID/GID mappings used for changing file owners w/o calling chown, fs should support it.
 | ||||
| +	// Every mount point could have its own mapping.
 | ||||
| +	UIDMappings []LinuxIDMapping `json:"uidMappings,omitempty" platform:"linux"`
 | ||||
| +	GIDMappings []LinuxIDMapping `json:"gidMappings,omitempty" platform:"linux"`
 | ||||
|  } | ||||
|   | ||||
|  // Hook specifies a command that is run at a particular event in the lifecycle of a container | ||||
| @@ -130,6 +195,10 @@ type Hook struct {
 | ||||
|  type Hooks struct { | ||||
|  	// Prestart is Deprecated. Prestart is a list of hooks to be run before the container process is executed. | ||||
|  	// It is called in the Runtime Namespace | ||||
| +	//
 | ||||
| +	// Deprecated: use [Hooks.CreateRuntime], [Hooks.CreateContainer], and
 | ||||
| +	// [Hooks.StartContainer] instead, which allow more granular hook control
 | ||||
| +	// during the create and start phase.
 | ||||
|  	Prestart []Hook `json:"prestart,omitempty"` | ||||
|  	// CreateRuntime is a list of hooks to be run after the container has been created but before pivot_root or any equivalent operation has been called | ||||
|  	// It is called in the Runtime Namespace | ||||
| @@ -178,10 +247,12 @@ type Linux struct {
 | ||||
|  	// MountLabel specifies the selinux context for the mounts in the container. | ||||
|  	MountLabel string `json:"mountLabel,omitempty"` | ||||
|  	// IntelRdt contains Intel Resource Director Technology (RDT) information for | ||||
| -	// handling resource constraints (e.g., L3 cache, memory bandwidth) for the container
 | ||||
| +	// handling resource constraints and monitoring metrics (e.g., L3 cache, memory bandwidth) for the container
 | ||||
|  	IntelRdt *LinuxIntelRdt `json:"intelRdt,omitempty"` | ||||
|  	// Personality contains configuration for the Linux personality syscall | ||||
|  	Personality *LinuxPersonality `json:"personality,omitempty"` | ||||
| +	// TimeOffsets specifies the offset for supporting time namespaces.
 | ||||
| +	TimeOffsets map[string]LinuxTimeOffset `json:"timeOffsets,omitempty"`
 | ||||
|  } | ||||
|   | ||||
|  // LinuxNamespace is the configuration for a Linux namespace | ||||
| @@ -211,6 +282,8 @@ const (
 | ||||
|  	UserNamespace LinuxNamespaceType = "user" | ||||
|  	// CgroupNamespace for isolating cgroup hierarchies | ||||
|  	CgroupNamespace LinuxNamespaceType = "cgroup" | ||||
| +	// TimeNamespace for isolating the clocks
 | ||||
| +	TimeNamespace LinuxNamespaceType = "time"
 | ||||
|  ) | ||||
|   | ||||
|  // LinuxIDMapping specifies UID/GID mappings | ||||
| @@ -223,6 +296,14 @@ type LinuxIDMapping struct {
 | ||||
|  	Size uint32 `json:"size"` | ||||
|  } | ||||
|   | ||||
| +// LinuxTimeOffset specifies the offset for Time Namespace
 | ||||
| +type LinuxTimeOffset struct {
 | ||||
| +	// Secs is the offset of clock (in secs) in the container
 | ||||
| +	Secs int64 `json:"secs,omitempty"`
 | ||||
| +	// Nanosecs is the additional offset for Secs (in nanosecs)
 | ||||
| +	Nanosecs uint32 `json:"nanosecs,omitempty"`
 | ||||
| +}
 | ||||
| +
 | ||||
|  // POSIXRlimit type and restrictions | ||||
|  type POSIXRlimit struct { | ||||
|  	// Type of the rlimit to set | ||||
| @@ -233,12 +314,13 @@ type POSIXRlimit struct {
 | ||||
|  	Soft uint64 `json:"soft"` | ||||
|  } | ||||
|   | ||||
| -// LinuxHugepageLimit structure corresponds to limiting kernel hugepages
 | ||||
| +// LinuxHugepageLimit structure corresponds to limiting kernel hugepages.
 | ||||
| +// Default to reservation limits if supported. Otherwise fallback to page fault limits.
 | ||||
|  type LinuxHugepageLimit struct { | ||||
| -	// Pagesize is the hugepage size
 | ||||
| -	// Format: "<size><unit-prefix>B' (e.g. 64KB, 2MB, 1GB, etc.)
 | ||||
| +	// Pagesize is the hugepage size.
 | ||||
| +	// Format: "<size><unit-prefix>B' (e.g. 64KB, 2MB, 1GB, etc.).
 | ||||
|  	Pagesize string `json:"pageSize"` | ||||
| -	// Limit is the limit of "hugepagesize" hugetlb usage
 | ||||
| +	// Limit is the limit of "hugepagesize" hugetlb reservations (if supported) or usage.
 | ||||
|  	Limit uint64 `json:"limit"` | ||||
|  } | ||||
|   | ||||
| @@ -250,8 +332,8 @@ type LinuxInterfacePriority struct {
 | ||||
|  	Priority uint32 `json:"priority"` | ||||
|  } | ||||
|   | ||||
| -// linuxBlockIODevice holds major:minor format supported in blkio cgroup
 | ||||
| -type linuxBlockIODevice struct {
 | ||||
| +// LinuxBlockIODevice holds major:minor format supported in blkio cgroup
 | ||||
| +type LinuxBlockIODevice struct {
 | ||||
|  	// Major is the device's major number. | ||||
|  	Major int64 `json:"major"` | ||||
|  	// Minor is the device's minor number. | ||||
| @@ -260,7 +342,7 @@ type linuxBlockIODevice struct {
 | ||||
|   | ||||
|  // LinuxWeightDevice struct holds a `major:minor weight` pair for weightDevice | ||||
|  type LinuxWeightDevice struct { | ||||
| -	linuxBlockIODevice
 | ||||
| +	LinuxBlockIODevice
 | ||||
|  	// Weight is the bandwidth rate for the device. | ||||
|  	Weight *uint16 `json:"weight,omitempty"` | ||||
|  	// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, CFQ scheduler only | ||||
| @@ -269,7 +351,7 @@ type LinuxWeightDevice struct {
 | ||||
|   | ||||
|  // LinuxThrottleDevice struct holds a `major:minor rate_per_second` pair | ||||
|  type LinuxThrottleDevice struct { | ||||
| -	linuxBlockIODevice
 | ||||
| +	LinuxBlockIODevice
 | ||||
|  	// Rate is the IO rate limit per cgroup per device | ||||
|  	Rate uint64 `json:"rate"` | ||||
|  } | ||||
| @@ -301,6 +383,12 @@ type LinuxMemory struct {
 | ||||
|  	// Total memory limit (memory + swap). | ||||
|  	Swap *int64 `json:"swap,omitempty"` | ||||
|  	// Kernel memory limit (in bytes). | ||||
| +	//
 | ||||
| +	// Deprecated: kernel-memory limits are not supported in cgroups v2, and
 | ||||
| +	// were obsoleted in [kernel v5.4]. This field should no longer be used,
 | ||||
| +	// as it may be ignored by runtimes.
 | ||||
| +	//
 | ||||
| +	// [kernel v5.4]: https://github.com/torvalds/linux/commit/0158115f702b0ba208ab0
 | ||||
|  	Kernel *int64 `json:"kernel,omitempty"` | ||||
|  	// Kernel memory limit for tcp (in bytes) | ||||
|  	KernelTCP *int64 `json:"kernelTCP,omitempty"` | ||||
| @@ -310,6 +398,10 @@ type LinuxMemory struct {
 | ||||
|  	DisableOOMKiller *bool `json:"disableOOMKiller,omitempty"` | ||||
|  	// Enables hierarchical memory accounting | ||||
|  	UseHierarchy *bool `json:"useHierarchy,omitempty"` | ||||
| +	// CheckBeforeUpdate enables checking if a new memory limit is lower
 | ||||
| +	// than the current usage during update, and if so, rejecting the new
 | ||||
| +	// limit.
 | ||||
| +	CheckBeforeUpdate *bool `json:"checkBeforeUpdate,omitempty"`
 | ||||
|  } | ||||
|   | ||||
|  // LinuxCPU for Linux cgroup 'cpu' resource management | ||||
| @@ -318,6 +410,9 @@ type LinuxCPU struct {
 | ||||
|  	Shares *uint64 `json:"shares,omitempty"` | ||||
|  	// CPU hardcap limit (in usecs). Allowed cpu time in a given period. | ||||
|  	Quota *int64 `json:"quota,omitempty"` | ||||
| +	// CPU hardcap burst limit (in usecs). Allowed accumulated cpu time additionally for burst in a
 | ||||
| +	// given period.
 | ||||
| +	Burst *uint64 `json:"burst,omitempty"`
 | ||||
|  	// CPU period to be used for hardcapping (in usecs). | ||||
|  	Period *uint64 `json:"period,omitempty"` | ||||
|  	// How much time realtime scheduling may use (in usecs). | ||||
| @@ -328,6 +423,8 @@ type LinuxCPU struct {
 | ||||
|  	Cpus string `json:"cpus,omitempty"` | ||||
|  	// List of memory nodes in the cpuset. Default is to use any available memory node. | ||||
|  	Mems string `json:"mems,omitempty"` | ||||
| +	// cgroups are configured with minimum weight, 0: default behavior, 1: SCHED_IDLE.
 | ||||
| +	Idle *int64 `json:"idle,omitempty"`
 | ||||
|  } | ||||
|   | ||||
|  // LinuxPids for Linux cgroup 'pids' resource management (Linux 4.3) | ||||
| @@ -364,7 +461,7 @@ type LinuxResources struct {
 | ||||
|  	Pids *LinuxPids `json:"pids,omitempty"` | ||||
|  	// BlockIO restriction configuration | ||||
|  	BlockIO *LinuxBlockIO `json:"blockIO,omitempty"` | ||||
| -	// Hugetlb limit (in bytes)
 | ||||
| +	// Hugetlb limits (in bytes). Default to reservation limits if supported.
 | ||||
|  	HugepageLimits []LinuxHugepageLimit `json:"hugepageLimits,omitempty"` | ||||
|  	// Network restriction configuration | ||||
|  	Network *LinuxNetwork `json:"network,omitempty"` | ||||
| @@ -522,11 +619,21 @@ type WindowsMemoryResources struct {
 | ||||
|   | ||||
|  // WindowsCPUResources contains CPU resource management settings. | ||||
|  type WindowsCPUResources struct { | ||||
| -	// Number of CPUs available to the container.
 | ||||
| +	// Count is the number of CPUs available to the container. It represents the
 | ||||
| +	// fraction of the configured processor `count` in a container in relation
 | ||||
| +	// to the processors available in the host. The fraction ultimately
 | ||||
| +	// determines the portion of processor cycles that the threads in a
 | ||||
| +	// container can use during each scheduling interval, as the number of
 | ||||
| +	// cycles per 10,000 cycles.
 | ||||
|  	Count *uint64 `json:"count,omitempty"` | ||||
| -	// CPU shares (relative weight to other containers with cpu shares).
 | ||||
| +	// Shares limits the share of processor time given to the container relative
 | ||||
| +	// to other workloads on the processor. The processor `shares` (`weight` at
 | ||||
| +	// the platform level) is a value between 0 and 10000.
 | ||||
|  	Shares *uint16 `json:"shares,omitempty"` | ||||
| -	// Specifies the portion of processor cycles that this container can use as a percentage times 100.
 | ||||
| +	// Maximum determines the portion of processor cycles that the threads in a
 | ||||
| +	// container can use during each scheduling interval, as the number of
 | ||||
| +	// cycles per 10,000 cycles. Set processor `maximum` to a percentage times
 | ||||
| +	// 100.
 | ||||
|  	Maximum *uint16 `json:"maximum,omitempty"` | ||||
|  } | ||||
|   | ||||
| @@ -613,6 +720,23 @@ type Arch string
 | ||||
|  // LinuxSeccompFlag is a flag to pass to seccomp(2). | ||||
|  type LinuxSeccompFlag string | ||||
|   | ||||
| +const (
 | ||||
| +	// LinuxSeccompFlagLog is a seccomp flag to request all returned
 | ||||
| +	// actions except SECCOMP_RET_ALLOW to be logged. An administrator may
 | ||||
| +	// override this filter flag by preventing specific actions from being
 | ||||
| +	// logged via the /proc/sys/kernel/seccomp/actions_logged file. (since
 | ||||
| +	// Linux 4.14)
 | ||||
| +	LinuxSeccompFlagLog LinuxSeccompFlag = "SECCOMP_FILTER_FLAG_LOG"
 | ||||
| +
 | ||||
| +	// LinuxSeccompFlagSpecAllow can be used to disable Speculative Store
 | ||||
| +	// Bypass mitigation. (since Linux 4.17)
 | ||||
| +	LinuxSeccompFlagSpecAllow LinuxSeccompFlag = "SECCOMP_FILTER_FLAG_SPEC_ALLOW"
 | ||||
| +
 | ||||
| +	// LinuxSeccompFlagWaitKillableRecv can be used to switch to the wait
 | ||||
| +	// killable semantics. (since Linux 5.19)
 | ||||
| +	LinuxSeccompFlagWaitKillableRecv LinuxSeccompFlag = "SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV"
 | ||||
| +)
 | ||||
| +
 | ||||
|  // Additional architectures permitted to be used for system calls | ||||
|  // By default only the native architecture of the kernel is permitted | ||||
|  const ( | ||||
| @@ -683,8 +807,9 @@ type LinuxSyscall struct {
 | ||||
|  	Args     []LinuxSeccompArg  `json:"args,omitempty"` | ||||
|  } | ||||
|   | ||||
| -// LinuxIntelRdt has container runtime resource constraints for Intel RDT
 | ||||
| -// CAT and MBA features which introduced in Linux 4.10 and 4.12 kernel
 | ||||
| +// LinuxIntelRdt has container runtime resource constraints for Intel RDT CAT and MBA
 | ||||
| +// features and flags enabling Intel RDT CMT and MBM features.
 | ||||
| +// Intel RDT features are available in Linux 4.14 and newer kernel versions.
 | ||||
|  type LinuxIntelRdt struct { | ||||
|  	// The identity for RDT Class of Service | ||||
|  	ClosID string `json:"closID,omitempty"` | ||||
| @@ -697,4 +822,76 @@ type LinuxIntelRdt struct {
 | ||||
|  	// The unit of memory bandwidth is specified in "percentages" by | ||||
|  	// default, and in "MBps" if MBA Software Controller is enabled. | ||||
|  	MemBwSchema string `json:"memBwSchema,omitempty"` | ||||
| +
 | ||||
| +	// EnableCMT is the flag to indicate if the Intel RDT CMT is enabled. CMT (Cache Monitoring Technology) supports monitoring of
 | ||||
| +	// the last-level cache (LLC) occupancy for the container.
 | ||||
| +	EnableCMT bool `json:"enableCMT,omitempty"`
 | ||||
| +
 | ||||
| +	// EnableMBM is the flag to indicate if the Intel RDT MBM is enabled. MBM (Memory Bandwidth Monitoring) supports monitoring of
 | ||||
| +	// total and local memory bandwidth for the container.
 | ||||
| +	EnableMBM bool `json:"enableMBM,omitempty"`
 | ||||
| +}
 | ||||
| +
 | ||||
| +// ZOS contains platform-specific configuration for z/OS based containers.
 | ||||
| +type ZOS struct {
 | ||||
| +	// Devices are a list of device nodes that are created for the container
 | ||||
| +	Devices []ZOSDevice `json:"devices,omitempty"`
 | ||||
| +}
 | ||||
| +
 | ||||
| +// ZOSDevice represents the mknod information for a z/OS special device file
 | ||||
| +type ZOSDevice struct {
 | ||||
| +	// Path to the device.
 | ||||
| +	Path string `json:"path"`
 | ||||
| +	// Device type, block, char, etc.
 | ||||
| +	Type string `json:"type"`
 | ||||
| +	// Major is the device's major number.
 | ||||
| +	Major int64 `json:"major"`
 | ||||
| +	// Minor is the device's minor number.
 | ||||
| +	Minor int64 `json:"minor"`
 | ||||
| +	// FileMode permission bits for the device.
 | ||||
| +	FileMode *os.FileMode `json:"fileMode,omitempty"`
 | ||||
| +	// UID of the device.
 | ||||
| +	UID *uint32 `json:"uid,omitempty"`
 | ||||
| +	// Gid of the device.
 | ||||
| +	GID *uint32 `json:"gid,omitempty"`
 | ||||
|  } | ||||
| +
 | ||||
| +// LinuxSchedulerPolicy represents different scheduling policies used with the Linux Scheduler
 | ||||
| +type LinuxSchedulerPolicy string
 | ||||
| +
 | ||||
| +const (
 | ||||
| +	// SchedOther is the default scheduling policy
 | ||||
| +	SchedOther LinuxSchedulerPolicy = "SCHED_OTHER"
 | ||||
| +	// SchedFIFO is the First-In-First-Out scheduling policy
 | ||||
| +	SchedFIFO LinuxSchedulerPolicy = "SCHED_FIFO"
 | ||||
| +	// SchedRR is the Round-Robin scheduling policy
 | ||||
| +	SchedRR LinuxSchedulerPolicy = "SCHED_RR"
 | ||||
| +	// SchedBatch is the Batch scheduling policy
 | ||||
| +	SchedBatch LinuxSchedulerPolicy = "SCHED_BATCH"
 | ||||
| +	// SchedISO is the Isolation scheduling policy
 | ||||
| +	SchedISO LinuxSchedulerPolicy = "SCHED_ISO"
 | ||||
| +	// SchedIdle is the Idle scheduling policy
 | ||||
| +	SchedIdle LinuxSchedulerPolicy = "SCHED_IDLE"
 | ||||
| +	// SchedDeadline is the Deadline scheduling policy
 | ||||
| +	SchedDeadline LinuxSchedulerPolicy = "SCHED_DEADLINE"
 | ||||
| +)
 | ||||
| +
 | ||||
| +// LinuxSchedulerFlag represents the flags used by the Linux Scheduler.
 | ||||
| +type LinuxSchedulerFlag string
 | ||||
| +
 | ||||
| +const (
 | ||||
| +	// SchedFlagResetOnFork represents the reset on fork scheduling flag
 | ||||
| +	SchedFlagResetOnFork LinuxSchedulerFlag = "SCHED_FLAG_RESET_ON_FORK"
 | ||||
| +	// SchedFlagReclaim represents the reclaim scheduling flag
 | ||||
| +	SchedFlagReclaim LinuxSchedulerFlag = "SCHED_FLAG_RECLAIM"
 | ||||
| +	// SchedFlagDLOverrun represents the deadline overrun scheduling flag
 | ||||
| +	SchedFlagDLOverrun LinuxSchedulerFlag = "SCHED_FLAG_DL_OVERRUN"
 | ||||
| +	// SchedFlagKeepPolicy represents the keep policy scheduling flag
 | ||||
| +	SchedFlagKeepPolicy LinuxSchedulerFlag = "SCHED_FLAG_KEEP_POLICY"
 | ||||
| +	// SchedFlagKeepParams represents the keep parameters scheduling flag
 | ||||
| +	SchedFlagKeepParams LinuxSchedulerFlag = "SCHED_FLAG_KEEP_PARAMS"
 | ||||
| +	// SchedFlagUtilClampMin represents the utilization clamp minimum scheduling flag
 | ||||
| +	SchedFlagUtilClampMin LinuxSchedulerFlag = "SCHED_FLAG_UTIL_CLAMP_MIN"
 | ||||
| +	// SchedFlagUtilClampMin represents the utilization clamp maximum scheduling flag
 | ||||
| +	SchedFlagUtilClampMax LinuxSchedulerFlag = "SCHED_FLAG_UTIL_CLAMP_MAX"
 | ||||
| +)
 | ||||
| diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
 | ||||
| index 596af0c2..f6c15f6c 100644
 | ||||
| --- a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
 | ||||
| +++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
 | ||||
| @@ -6,12 +6,12 @@ const (
 | ||||
|  	// VersionMajor is for an API incompatible changes | ||||
|  	VersionMajor = 1 | ||||
|  	// VersionMinor is for functionality in a backwards-compatible manner | ||||
| -	VersionMinor = 0
 | ||||
| +	VersionMinor = 2
 | ||||
|  	// VersionPatch is for backwards-compatible bug fixes | ||||
| -	VersionPatch = 2
 | ||||
| +	VersionPatch = 0
 | ||||
|   | ||||
|  	// VersionDev indicates development branch. Releases will be empty string. | ||||
| -	VersionDev = "-dev"
 | ||||
| +	VersionDev = "+dev"
 | ||||
|  ) | ||||
|   | ||||
|  // Version is the specification version that the package types support. | ||||
| diff --git a/vendor/modules.txt b/vendor/modules.txt
 | ||||
| index a5537dfe..40089cd4 100644
 | ||||
| --- a/vendor/modules.txt
 | ||||
| +++ b/vendor/modules.txt
 | ||||
| @@ -35,7 +35,7 @@ github.com/moby/sys/mountinfo
 | ||||
|  # github.com/mrunalp/fileutils v0.5.1 | ||||
|  ## explicit; go 1.13 | ||||
|  github.com/mrunalp/fileutils | ||||
| -# github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417
 | ||||
| +# github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95
 | ||||
|  ## explicit | ||||
|  github.com/opencontainers/runtime-spec/specs-go | ||||
|  # github.com/opencontainers/selinux v1.10.0 | ||||
| -- 
 | ||||
| 2.47.1 | ||||
| 
 | ||||
| @ -1,62 +0,0 @@ | ||||
| From dfb3496c174377b860b62872ce6af951364cc3ac Mon Sep 17 00:00:00 2001 | ||||
| From: Lokesh Mandvekar <lsm5@fedoraproject.org> | ||||
| Date: Tue, 12 Dec 2017 13:22:42 +0530 | ||||
| Subject: [PATCH] Revert "Apply cgroups earlier" | ||||
| 
 | ||||
| This reverts commit 7062c7556b71188abc18d7516441ff4b03fbc1fc. | ||||
| ---
 | ||||
|  libcontainer/process_linux.go | 31 ++++++++++++++----------------- | ||||
|  1 file changed, 14 insertions(+), 17 deletions(-) | ||||
| 
 | ||||
| diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
 | ||||
| index 149b1126..b8a395af 100644
 | ||||
| --- a/libcontainer/process_linux.go
 | ||||
| +++ b/libcontainer/process_linux.go
 | ||||
| @@ -272,6 +272,20 @@ func (p *initProcess) start() error {
 | ||||
|  		p.process.ops = nil | ||||
|  		return newSystemErrorWithCause(err, "starting init process command") | ||||
|  	} | ||||
| +	if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
 | ||||
| +		return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
 | ||||
| +	}
 | ||||
| +	if err := p.execSetns(); err != nil {
 | ||||
| +		return newSystemErrorWithCause(err, "running exec setns process for init")
 | ||||
| +	}
 | ||||
| +	// Save the standard descriptor names before the container process
 | ||||
| +	// can potentially move them (e.g., via dup2()).  If we don't do this now,
 | ||||
| +	// we won't know at checkpoint time which file descriptor to look up.
 | ||||
| +	fds, err := getPipeFds(p.pid())
 | ||||
| +	if err != nil {
 | ||||
| +		return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
 | ||||
| +	}
 | ||||
| +	p.setExternalDescriptors(fds)
 | ||||
|  	// Do this before syncing with child so that no children can escape the | ||||
|  	// cgroup. We don't need to worry about not doing this and not being root | ||||
|  	// because we'd be using the rootless cgroup manager in that case. | ||||
| @@ -292,23 +306,6 @@ func (p *initProcess) start() error {
 | ||||
|  			} | ||||
|  		} | ||||
|  	}() | ||||
| -
 | ||||
| -	if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
 | ||||
| -		return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
 | ||||
| -	}
 | ||||
| -
 | ||||
| -	if err := p.execSetns(); err != nil {
 | ||||
| -		return newSystemErrorWithCause(err, "running exec setns process for init")
 | ||||
| -	}
 | ||||
| -
 | ||||
| -	// Save the standard descriptor names before the container process
 | ||||
| -	// can potentially move them (e.g., via dup2()).  If we don't do this now,
 | ||||
| -	// we won't know at checkpoint time which file descriptor to look up.
 | ||||
| -	fds, err := getPipeFds(p.pid())
 | ||||
| -	if err != nil {
 | ||||
| -		return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
 | ||||
| -	}
 | ||||
| -	p.setExternalDescriptors(fds)
 | ||||
|  	if err := p.createNetworkInterfaces(); err != nil { | ||||
|  		return newSystemErrorWithCause(err, "creating network interfaces") | ||||
|  	} | ||||
| -- 
 | ||||
| 2.14.3 | ||||
| 
 | ||||
| @ -1,290 +0,0 @@ | ||||
| From bf6405284aa3870a39b402309003633a1c230ed9 Mon Sep 17 00:00:00 2001 | ||||
| From: Aleksa Sarai <asarai@suse.de> | ||||
| Date: Wed, 9 Jan 2019 13:40:01 +1100 | ||||
| Subject: [PATCH 1/1] nsenter: clone /proc/self/exe to avoid exposing host | ||||
|  binary to container | ||||
| 
 | ||||
| There are quite a few circumstances where /proc/self/exe pointing to a | ||||
| pretty important container binary is a _bad_ thing, so to avoid this we | ||||
| have to make a copy (preferably doing self-clean-up and not being | ||||
| writeable). | ||||
| 
 | ||||
| As a hotfix we require memfd_create(2), but we can always extend this to | ||||
| use a scratch MNT_DETACH overlayfs or tmpfs. The main downside to this | ||||
| approach is no page-cache sharing for the runc binary (which overlayfs | ||||
| would give us) but this is far less complicated. | ||||
| 
 | ||||
| This is only done during nsenter so that it happens transparently to the | ||||
| Go code, and any libcontainer users benefit from it. This also makes | ||||
| ExtraFiles and --preserve-fds handling trivial (because we don't need to | ||||
| worry about it). | ||||
| 
 | ||||
| Fixes: CVE-2019-5736 | ||||
| Co-developed-by: Christian Brauner <christian.brauner@ubuntu.com> | ||||
| Signed-off-by: Aleksa Sarai <asarai@suse.de> | ||||
| Signed-off-by: Mrunal Patel <mrunalp@gmail.com> | ||||
| ---
 | ||||
|  libcontainer/nsenter/cloned_binary.c | 221 +++++++++++++++++++++++++++ | ||||
|  libcontainer/nsenter/nsexec.c        |  11 ++ | ||||
|  2 files changed, 232 insertions(+) | ||||
|  create mode 100644 libcontainer/nsenter/cloned_binary.c | ||||
| 
 | ||||
| diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c
 | ||||
| new file mode 100644 | ||||
| index 00000000..d9f6093a
 | ||||
| --- /dev/null
 | ||||
| +++ b/libcontainer/nsenter/cloned_binary.c
 | ||||
| @@ -0,0 +1,221 @@
 | ||||
| +#define _GNU_SOURCE
 | ||||
| +#include <unistd.h>
 | ||||
| +#include <stdio.h>
 | ||||
| +#include <stdlib.h>
 | ||||
| +#include <stdbool.h>
 | ||||
| +#include <string.h>
 | ||||
| +#include <limits.h>
 | ||||
| +#include <fcntl.h>
 | ||||
| +#include <errno.h>
 | ||||
| +
 | ||||
| +#include <sys/types.h>
 | ||||
| +#include <sys/stat.h>
 | ||||
| +#include <sys/vfs.h>
 | ||||
| +#include <sys/mman.h>
 | ||||
| +#include <sys/sendfile.h>
 | ||||
| +#include <sys/syscall.h>
 | ||||
| +
 | ||||
| +#include <linux/magic.h>
 | ||||
| +#include <linux/memfd.h>
 | ||||
| +
 | ||||
| +/* Use our own wrapper for memfd_create. */
 | ||||
| +#if !defined(SYS_memfd_create) && defined(__NR_memfd_create)
 | ||||
| +#  define SYS_memfd_create __NR_memfd_create
 | ||||
| +#endif
 | ||||
| +#ifndef SYS_memfd_create
 | ||||
| +#  error "memfd_create(2) syscall not supported by this glibc version"
 | ||||
| +#endif
 | ||||
| +int memfd_create(const char *name, unsigned int flags)
 | ||||
| +{
 | ||||
| +	return syscall(SYS_memfd_create, name, flags);
 | ||||
| +}
 | ||||
| +
 | ||||
| +/* This comes directly from <linux/fcntl.h>. */
 | ||||
| +#ifndef F_LINUX_SPECIFIC_BASE
 | ||||
| +#  define F_LINUX_SPECIFIC_BASE 1024
 | ||||
| +#endif
 | ||||
| +#ifndef F_ADD_SEALS
 | ||||
| +#  define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
 | ||||
| +#  define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
 | ||||
| +#endif
 | ||||
| +#ifndef F_SEAL_SEAL
 | ||||
| +#  define F_SEAL_SEAL   0x0001	/* prevent further seals from being set */
 | ||||
| +#  define F_SEAL_SHRINK 0x0002	/* prevent file from shrinking */
 | ||||
| +#  define F_SEAL_GROW   0x0004	/* prevent file from growing */
 | ||||
| +#  define F_SEAL_WRITE  0x0008	/* prevent writes */
 | ||||
| +#endif
 | ||||
| +
 | ||||
| +
 | ||||
| +#define OUR_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
 | ||||
| +#define OUR_MEMFD_SEALS \
 | ||||
| +	(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
 | ||||
| +
 | ||||
| +static void *must_realloc(void *ptr, size_t size)
 | ||||
| +{
 | ||||
| +	void *old = ptr;
 | ||||
| +	do {
 | ||||
| +		ptr = realloc(old, size);
 | ||||
| +	} while(!ptr);
 | ||||
| +	return ptr;
 | ||||
| +}
 | ||||
| +
 | ||||
| +/*
 | ||||
| + * Verify whether we are currently in a self-cloned program (namely, is
 | ||||
| + * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather
 | ||||
| + * for shmem files), and we want to be sure it's actually sealed.
 | ||||
| + */
 | ||||
| +static int is_self_cloned(void)
 | ||||
| +{
 | ||||
| +	int fd, seals;
 | ||||
| +
 | ||||
| +	fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
 | ||||
| +	if (fd < 0)
 | ||||
| +		return -ENOTRECOVERABLE;
 | ||||
| +
 | ||||
| +	seals = fcntl(fd, F_GET_SEALS);
 | ||||
| +	close(fd);
 | ||||
| +	return seals == OUR_MEMFD_SEALS;
 | ||||
| +}
 | ||||
| +
 | ||||
| +/*
 | ||||
| + * Reads the whole file with read(2) into a heap buffer and gives you the
 | ||||
| + * file length so you can safely treat it as an ordinary buffer. NULL on error.
 | ||||
| + */
 | ||||
| +static char *read_file(char *path, size_t *length)
 | ||||
| +{
 | ||||
| +	int fd;
 | ||||
| +	char buf[4096], *copy = NULL;
 | ||||
| +
 | ||||
| +	if (!length)
 | ||||
| +		return NULL;
 | ||||
| +
 | ||||
| +	fd = open(path, O_RDONLY | O_CLOEXEC);
 | ||||
| +	if (fd < 0)
 | ||||
| +		return NULL;
 | ||||
| +
 | ||||
| +	*length = 0;
 | ||||
| +	for (;;) {
 | ||||
| +		int n;
 | ||||
| +
 | ||||
| +		n = read(fd, buf, sizeof(buf));
 | ||||
| +		if (n < 0)
 | ||||
| +			goto error;
 | ||||
| +		if (!n)
 | ||||
| +			break;
 | ||||
| +
 | ||||
| +		copy = must_realloc(copy, (*length + n) * sizeof(*copy));
 | ||||
| +		memcpy(copy + *length, buf, n);
 | ||||
| +		*length += n;
 | ||||
| +	}
 | ||||
| +	close(fd);
 | ||||
| +	return copy;
 | ||||
| +
 | ||||
| +error:
 | ||||
| +	close(fd);
 | ||||
| +	free(copy);
 | ||||
| +	return NULL;
 | ||||
| +}
 | ||||
| +
 | ||||
| +/*
 | ||||
| + * A poor-man's version of "xargs -0". Basically parses a given block of
 | ||||
| + * NUL-delimited data, within the given length and adds a pointer to each entry
 | ||||
| + * to the array of pointers.
 | ||||
| + */
 | ||||
| +static int parse_xargs(char *data, int data_length, char ***output)
 | ||||
| +{
 | ||||
| +	int num = 0;
 | ||||
| +	char *cur = data;
 | ||||
| +
 | ||||
| +	if (!data || *output != NULL)
 | ||||
| +		return -1;
 | ||||
| +
 | ||||
| +	while (cur < data + data_length) {
 | ||||
| +		num++;
 | ||||
| +		*output = must_realloc(*output, (num + 1) * sizeof(**output));
 | ||||
| +		(*output)[num - 1] = cur;
 | ||||
| +		cur += strlen(cur) + 1;
 | ||||
| +	}
 | ||||
| +	(*output)[num] = NULL;
 | ||||
| +	return num;
 | ||||
| +}
 | ||||
| +
 | ||||
| +/*
 | ||||
| + * "Parse" out argv and envp from /proc/self/cmdline and /proc/self/environ.
 | ||||
| + * This is necessary because we are running in a context where we don't have a
 | ||||
| + * main() that we can just get the arguments from.
 | ||||
| + */
 | ||||
| +static int fetchve(char ***argv, char ***envp)
 | ||||
| +{
 | ||||
| +	char *cmdline = NULL, *environ = NULL;
 | ||||
| +	size_t cmdline_size, environ_size;
 | ||||
| +
 | ||||
| +	cmdline = read_file("/proc/self/cmdline", &cmdline_size);
 | ||||
| +	if (!cmdline)
 | ||||
| +		goto error;
 | ||||
| +	environ = read_file("/proc/self/environ", &environ_size);
 | ||||
| +	if (!environ)
 | ||||
| +		goto error;
 | ||||
| +
 | ||||
| +	if (parse_xargs(cmdline, cmdline_size, argv) <= 0)
 | ||||
| +		goto error;
 | ||||
| +	if (parse_xargs(environ, environ_size, envp) <= 0)
 | ||||
| +		goto error;
 | ||||
| +
 | ||||
| +	return 0;
 | ||||
| +
 | ||||
| +error:
 | ||||
| +	free(environ);
 | ||||
| +	free(cmdline);
 | ||||
| +	return -EINVAL;
 | ||||
| +}
 | ||||
| +
 | ||||
| +#define SENDFILE_MAX 0x7FFFF000 /* sendfile(2) is limited to 2GB. */
 | ||||
| +static int clone_binary(void)
 | ||||
| +{
 | ||||
| +	int binfd, memfd, err;
 | ||||
| +	ssize_t sent = 0;
 | ||||
| +
 | ||||
| +	memfd = memfd_create(OUR_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
 | ||||
| +	if (memfd < 0)
 | ||||
| +		return -ENOTRECOVERABLE;
 | ||||
| +
 | ||||
| +	binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
 | ||||
| +	if (binfd < 0)
 | ||||
| +		goto error;
 | ||||
| +
 | ||||
| +	sent = sendfile(memfd, binfd, NULL, SENDFILE_MAX);
 | ||||
| +	close(binfd);
 | ||||
| +	if (sent < 0)
 | ||||
| +		goto error;
 | ||||
| +
 | ||||
| +	err = fcntl(memfd, F_ADD_SEALS, OUR_MEMFD_SEALS);
 | ||||
| +	if (err < 0)
 | ||||
| +		goto error;
 | ||||
| +
 | ||||
| +	return memfd;
 | ||||
| +
 | ||||
| +error:
 | ||||
| +	close(memfd);
 | ||||
| +	return -EIO;
 | ||||
| +}
 | ||||
| +
 | ||||
| +int ensure_cloned_binary(void)
 | ||||
| +{
 | ||||
| +	int execfd;
 | ||||
| +	char **argv = NULL, **envp = NULL;
 | ||||
| +
 | ||||
| +	/* Check that we're not self-cloned, and if we are then bail. */
 | ||||
| +	int cloned = is_self_cloned();
 | ||||
| +	if (cloned > 0 || cloned == -ENOTRECOVERABLE)
 | ||||
| +		return cloned;
 | ||||
| +
 | ||||
| +	if (fetchve(&argv, &envp) < 0)
 | ||||
| +		return -EINVAL;
 | ||||
| +
 | ||||
| +	execfd = clone_binary();
 | ||||
| +	if (execfd < 0)
 | ||||
| +		return -EIO;
 | ||||
| +
 | ||||
| +	fexecve(execfd, argv, envp);
 | ||||
| +	return -ENOEXEC;
 | ||||
| +}
 | ||||
| diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
 | ||||
| index cb224314..784fd9b0 100644
 | ||||
| --- a/libcontainer/nsenter/nsexec.c
 | ||||
| +++ b/libcontainer/nsenter/nsexec.c
 | ||||
| @@ -528,6 +528,9 @@ void join_namespaces(char *nslist)
 | ||||
|  	free(namespaces); | ||||
|  } | ||||
|   | ||||
| +/* Defined in cloned_binary.c. */
 | ||||
| +int ensure_cloned_binary(void);
 | ||||
| +
 | ||||
|  void nsexec(void) | ||||
|  { | ||||
|  	int pipenum; | ||||
| @@ -543,6 +546,14 @@ void nsexec(void)
 | ||||
|  	if (pipenum == -1) | ||||
|  		return; | ||||
|   | ||||
| +	/*
 | ||||
| +	 * We need to re-exec if we are not in a cloned binary. This is necessary
 | ||||
| +	 * to ensure that containers won't be able to access the host binary
 | ||||
| +	 * through /proc/self/exe. See CVE-2019-5736.
 | ||||
| +	 */
 | ||||
| +	if (ensure_cloned_binary() < 0)
 | ||||
| +		bail("could not ensure we are a cloned binary");
 | ||||
| +
 | ||||
|  	/* Parse all of the netlink configuration. */ | ||||
|  	nl_parse(pipenum, &config); | ||||
|   | ||||
| -- 
 | ||||
| 2.20.1 | ||||
| 
 | ||||
							
								
								
									
										475
									
								
								SOURCES/0002-1.1-runc-exec-implement-CPU-affinity.patch
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										475
									
								
								SOURCES/0002-1.1-runc-exec-implement-CPU-affinity.patch
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,475 @@ | ||||
| From 1af672a2635628ca24ce3b5ed3344d316548f1ca Mon Sep 17 00:00:00 2001 | ||||
| From: Kir Kolyshkin <kolyshkin@gmail.com> | ||||
| Date: Mon, 21 Oct 2024 15:50:38 -0700 | ||||
| Subject: [PATCH 2/2] [1.1] runc exec: implement CPU affinity | ||||
| 
 | ||||
| As per | ||||
| - https://github.com/opencontainers/runtime-spec/pull/1253
 | ||||
| - https://github.com/opencontainers/runtime-spec/pull/1261
 | ||||
| 
 | ||||
| CPU affinity can be set in two ways: | ||||
| 1. When creating/starting a container, in config.json's | ||||
|    Process.ExecCPUAffinity, which is then applied to all execs. | ||||
| 2. When running an exec, in process.json's CPUAffinity, which | ||||
|    is applied to a given exec and overrides the value from (1). | ||||
| 
 | ||||
| Add some basic tests. | ||||
| 
 | ||||
| Note that older kernels (RHEL8, Ubuntu 20.04) change CPU affinity of a | ||||
| process to that of a container's cgroup, as soon as it is moved to that | ||||
| cgroup, while newer kernels (Ubuntu 24.04, Fedora 41) don't do that. | ||||
| 
 | ||||
| Because of the above, | ||||
|  - it's impossible to really test initial CPU affinity without adding | ||||
|    debug logging to libcontainer/nsenter; | ||||
|  - for older kernels, there can be a brief moment when exec's affinity | ||||
|    is different than either initial or final affinity being set; | ||||
|  - exec's final CPU affinity, if not specified, can be different | ||||
|    depending on the kernel, therefore we don't test it. | ||||
| 
 | ||||
| Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com> | ||||
| ---
 | ||||
|  libcontainer/configs/config.go      |  73 ++++++++++++++++++++ | ||||
|  libcontainer/container_linux.go     |   4 ++ | ||||
|  libcontainer/init_linux.go          |   1 + | ||||
|  libcontainer/nsenter/nsexec.c       |  36 +++++++++- | ||||
|  libcontainer/process.go             |   2 + | ||||
|  libcontainer/process_linux.go       |  51 +++++++++++++- | ||||
|  libcontainer/specconv/spec_linux.go |   5 ++ | ||||
|  tests/integration/cpu_affinity.bats | 101 ++++++++++++++++++++++++++++ | ||||
|  utils_linux.go                      |   6 ++ | ||||
|  9 files changed, 275 insertions(+), 4 deletions(-) | ||||
|  create mode 100644 tests/integration/cpu_affinity.bats | ||||
| 
 | ||||
| diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go
 | ||||
| index 6ebf5ec7..997f2724 100644
 | ||||
| --- a/libcontainer/configs/config.go
 | ||||
| +++ b/libcontainer/configs/config.go
 | ||||
| @@ -3,11 +3,15 @@ package configs
 | ||||
|  import ( | ||||
|  	"bytes" | ||||
|  	"encoding/json" | ||||
| +	"errors"
 | ||||
|  	"fmt" | ||||
|  	"os/exec" | ||||
| +	"strconv"
 | ||||
| +	"strings"
 | ||||
|  	"time" | ||||
|   | ||||
|  	"github.com/sirupsen/logrus" | ||||
| +	"golang.org/x/sys/unix"
 | ||||
|   | ||||
|  	"github.com/opencontainers/runc/libcontainer/devices" | ||||
|  	"github.com/opencontainers/runtime-spec/specs-go" | ||||
| @@ -211,6 +215,75 @@ type Config struct {
 | ||||
|  	// RootlessCgroups is set when unlikely to have the full access to cgroups. | ||||
|  	// When RootlessCgroups is set, cgroups errors are ignored. | ||||
|  	RootlessCgroups bool `json:"rootless_cgroups,omitempty"` | ||||
| +
 | ||||
| +	// ExecCPUAffinity is CPU affinity for a non-init process to be run in the container.
 | ||||
| +	ExecCPUAffinity *CPUAffinity `json:"exec_cpu_affinity,omitempty"`
 | ||||
| +}
 | ||||
| +
 | ||||
| +type CPUAffinity struct {
 | ||||
| +	Initial, Final *unix.CPUSet
 | ||||
| +}
 | ||||
| +
 | ||||
| +func toCPUSet(str string) (*unix.CPUSet, error) {
 | ||||
| +	if str == "" {
 | ||||
| +		return nil, nil
 | ||||
| +	}
 | ||||
| +	s := new(unix.CPUSet)
 | ||||
| +	for _, r := range strings.Split(str, ",") {
 | ||||
| +		// Allow extra spaces around.
 | ||||
| +		r = strings.TrimSpace(r)
 | ||||
| +		// Allow empty elements (extra commas).
 | ||||
| +		if r == "" {
 | ||||
| +			continue
 | ||||
| +		}
 | ||||
| +		if r0, r1, found := strings.Cut(r, "-"); found {
 | ||||
| +			start, err := strconv.ParseUint(r0, 10, 32)
 | ||||
| +			if err != nil {
 | ||||
| +				return nil, err
 | ||||
| +			}
 | ||||
| +			end, err := strconv.ParseUint(r1, 10, 32)
 | ||||
| +			if err != nil {
 | ||||
| +				return nil, err
 | ||||
| +			}
 | ||||
| +			if start > end {
 | ||||
| +				return nil, errors.New("invalid range: " + r)
 | ||||
| +			}
 | ||||
| +			for i := int(start); i <= int(end); i++ {
 | ||||
| +				s.Set(i)
 | ||||
| +			}
 | ||||
| +		} else {
 | ||||
| +			val, err := strconv.ParseUint(r, 10, 32)
 | ||||
| +			if err != nil {
 | ||||
| +				return nil, err
 | ||||
| +			}
 | ||||
| +			s.Set(int(val))
 | ||||
| +		}
 | ||||
| +	}
 | ||||
| +
 | ||||
| +	return s, nil
 | ||||
| +}
 | ||||
| +
 | ||||
| +// ConvertCPUAffinity converts [specs.CPUAffinity] to [CPUAffinity].
 | ||||
| +func ConvertCPUAffinity(sa *specs.CPUAffinity) (*CPUAffinity, error) {
 | ||||
| +	if sa == nil {
 | ||||
| +		return nil, nil
 | ||||
| +	}
 | ||||
| +	initial, err := toCPUSet(sa.Initial)
 | ||||
| +	if err != nil {
 | ||||
| +		return nil, fmt.Errorf("bad CPUAffinity.Initial: %w", err)
 | ||||
| +	}
 | ||||
| +	final, err := toCPUSet(sa.Final)
 | ||||
| +	if err != nil {
 | ||||
| +		return nil, fmt.Errorf("bad CPUAffinity.Final: %w", err)
 | ||||
| +	}
 | ||||
| +	if initial == nil && final == nil {
 | ||||
| +		return nil, nil
 | ||||
| +	}
 | ||||
| +
 | ||||
| +	return &CPUAffinity{
 | ||||
| +		Initial: initial,
 | ||||
| +		Final:   final,
 | ||||
| +	}, nil
 | ||||
|  } | ||||
|   | ||||
|  type ( | ||||
| diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
 | ||||
| index 40b332f9..68b6a74f 100644
 | ||||
| --- a/libcontainer/container_linux.go
 | ||||
| +++ b/libcontainer/container_linux.go
 | ||||
| @@ -692,6 +692,7 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
 | ||||
|  		AppArmorProfile:  c.config.AppArmorProfile, | ||||
|  		ProcessLabel:     c.config.ProcessLabel, | ||||
|  		Rlimits:          c.config.Rlimits, | ||||
| +		CPUAffinity:      c.config.ExecCPUAffinity,
 | ||||
|  		CreateConsole:    process.ConsoleSocket != nil, | ||||
|  		ConsoleWidth:     process.ConsoleWidth, | ||||
|  		ConsoleHeight:    process.ConsoleHeight, | ||||
| @@ -708,6 +709,9 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
 | ||||
|  	if len(process.Rlimits) > 0 { | ||||
|  		cfg.Rlimits = process.Rlimits | ||||
|  	} | ||||
| +	if process.CPUAffinity != nil {
 | ||||
| +		cfg.CPUAffinity = process.CPUAffinity
 | ||||
| +	}
 | ||||
|  	if cgroups.IsCgroup2UnifiedMode() { | ||||
|  		cfg.Cgroup2Path = c.cgroupManager.Path("") | ||||
|  	} | ||||
| diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
 | ||||
| index d9f18139..1f8562ec 100644
 | ||||
| --- a/libcontainer/init_linux.go
 | ||||
| +++ b/libcontainer/init_linux.go
 | ||||
| @@ -70,6 +70,7 @@ type initConfig struct {
 | ||||
|  	RootlessCgroups  bool                  `json:"rootless_cgroups,omitempty"` | ||||
|  	SpecState        *specs.State          `json:"spec_state,omitempty"` | ||||
|  	Cgroup2Path      string                `json:"cgroup2_path,omitempty"` | ||||
| +	CPUAffinity      *configs.CPUAffinity  `json:"cpu_affinity,omitempty"`
 | ||||
|  } | ||||
|   | ||||
|  type initer interface { | ||||
| diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
 | ||||
| index 2d224bab..6f70aa87 100644
 | ||||
| --- a/libcontainer/nsenter/nsexec.c
 | ||||
| +++ b/libcontainer/nsenter/nsexec.c
 | ||||
| @@ -149,13 +149,18 @@ int setns(int fd, int nstype)
 | ||||
|  } | ||||
|  #endif | ||||
|   | ||||
| +bool log_enabled_for(int level)
 | ||||
| +{
 | ||||
| +	return (logfd >= 0 && level <= loglevel);
 | ||||
| +}
 | ||||
| +
 | ||||
|  static void write_log(int level, const char *format, ...) | ||||
|  { | ||||
|  	char *message = NULL, *stage = NULL, *json = NULL; | ||||
|  	va_list args; | ||||
|  	int ret; | ||||
|   | ||||
| -	if (logfd < 0 || level > loglevel)
 | ||||
| +	if (!log_enabled_for(level))
 | ||||
|  		goto out; | ||||
|   | ||||
|  	va_start(args, format); | ||||
| @@ -851,6 +856,25 @@ void try_unshare(int flags, const char *msg)
 | ||||
|  	bail("failed to unshare %s", msg); | ||||
|  } | ||||
|   | ||||
| +void print_cpu_affinity()
 | ||||
| +{
 | ||||
| +	cpu_set_t cpus = { };
 | ||||
| +	size_t i, mask = 0;
 | ||||
| +
 | ||||
| +	if (sched_getaffinity(0, sizeof(cpus), &cpus) < 0) {
 | ||||
| +		write_log(WARNING, "sched_getaffinity: %m");
 | ||||
| +		return;
 | ||||
| +	}
 | ||||
| +
 | ||||
| +	/* Do not print the complete mask, we only need the first few CPUs. */
 | ||||
| +	for (i = 0; i < sizeof(mask) * 8; i++) {
 | ||||
| +		if (CPU_ISSET(i, &cpus))
 | ||||
| +			mask |= 1 << i;
 | ||||
| +	}
 | ||||
| +
 | ||||
| +	write_log(DEBUG, "affinity: 0x%zx", mask);
 | ||||
| +}
 | ||||
| +
 | ||||
|  void nsexec(void) | ||||
|  { | ||||
|  	int pipenum; | ||||
| @@ -892,6 +916,16 @@ void nsexec(void)
 | ||||
|   | ||||
|  	write_log(DEBUG, "=> nsexec container setup"); | ||||
|   | ||||
| +	/* This is for ../../tests/integration/cpu_affinity.bats test only.
 | ||||
| +	 *
 | ||||
| +	 * Printing this from Go code might be too late as some kernels
 | ||||
| +	 * change the process' CPU affinity to that of container's cpuset
 | ||||
| +	 * as soon as the process is moved into container's cgroup.
 | ||||
| +	 */
 | ||||
| +	if (log_enabled_for(DEBUG)) {
 | ||||
| +		print_cpu_affinity();
 | ||||
| +	}
 | ||||
| +
 | ||||
|  	/* Parse all of the netlink configuration. */ | ||||
|  	nl_parse(pipenum, &config); | ||||
|   | ||||
| diff --git a/libcontainer/process.go b/libcontainer/process.go
 | ||||
| index 8a5d340d..99167274 100644
 | ||||
| --- a/libcontainer/process.go
 | ||||
| +++ b/libcontainer/process.go
 | ||||
| @@ -89,6 +89,8 @@ type Process struct {
 | ||||
|  	// | ||||
|  	// For cgroup v2, the only key allowed is "". | ||||
|  	SubCgroupPaths map[string]string | ||||
| +
 | ||||
| +	CPUAffinity *configs.CPUAffinity
 | ||||
|  } | ||||
|   | ||||
|  // Wait waits for the process to exit. | ||||
| diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
 | ||||
| index 0d9ceb9c..3b48ae76 100644
 | ||||
| --- a/libcontainer/process_linux.go
 | ||||
| +++ b/libcontainer/process_linux.go
 | ||||
| @@ -9,6 +9,7 @@ import (
 | ||||
|  	"os" | ||||
|  	"os/exec" | ||||
|  	"path/filepath" | ||||
| +	"runtime"
 | ||||
|  	"strconv" | ||||
|  	"time" | ||||
|   | ||||
| @@ -78,12 +79,52 @@ func (p *setnsProcess) signal(sig os.Signal) error {
 | ||||
|  	return unix.Kill(p.pid(), s) | ||||
|  } | ||||
|   | ||||
| +// Starts setns process with specified initial CPU affinity.
 | ||||
| +func (p *setnsProcess) startWithCPUAffinity() error {
 | ||||
| +	aff := p.config.CPUAffinity
 | ||||
| +	if aff == nil || aff.Initial == nil {
 | ||||
| +		return p.cmd.Start()
 | ||||
| +	}
 | ||||
| +	errCh := make(chan error)
 | ||||
| +	defer close(errCh)
 | ||||
| +
 | ||||
| +	// Use a goroutine to dedicate an OS thread.
 | ||||
| +	go func() {
 | ||||
| +		runtime.LockOSThread()
 | ||||
| +		// Command inherits the CPU affinity.
 | ||||
| +		if err := unix.SchedSetaffinity(unix.Gettid(), aff.Initial); err != nil {
 | ||||
| +			runtime.UnlockOSThread()
 | ||||
| +			errCh <- fmt.Errorf("error setting initial CPU affinity: %w", err)
 | ||||
| +			return
 | ||||
| +		}
 | ||||
| +
 | ||||
| +		errCh <- p.cmd.Start()
 | ||||
| +		// Deliberately omit runtime.UnlockOSThread here.
 | ||||
| +		// https://pkg.go.dev/runtime#LockOSThread says:
 | ||||
| +		// "If the calling goroutine exits without unlocking the
 | ||||
| +		// thread, the thread will be terminated".
 | ||||
| +	}()
 | ||||
| +
 | ||||
| +	return <-errCh
 | ||||
| +}
 | ||||
| +
 | ||||
| +func (p *setnsProcess) setFinalCPUAffinity() error {
 | ||||
| +	aff := p.config.CPUAffinity
 | ||||
| +	if aff == nil || aff.Final == nil {
 | ||||
| +		return nil
 | ||||
| +	}
 | ||||
| +	if err := unix.SchedSetaffinity(p.pid(), aff.Final); err != nil {
 | ||||
| +		return fmt.Errorf("error setting final CPU affinity: %w", err)
 | ||||
| +	}
 | ||||
| +	return nil
 | ||||
| +}
 | ||||
| +
 | ||||
|  func (p *setnsProcess) start() (retErr error) { | ||||
|  	defer p.messageSockPair.parent.Close() | ||||
| -	// get the "before" value of oom kill count
 | ||||
| +	// Get the "before" value of oom kill count.
 | ||||
|  	oom, _ := p.manager.OOMKillCount() | ||||
| -	err := p.cmd.Start()
 | ||||
| -	// close the write-side of the pipes (controlled by child)
 | ||||
| +	err := p.startWithCPUAffinity()
 | ||||
| +	// Close the child-side of the pipes (controlled by child).
 | ||||
|  	p.messageSockPair.child.Close() | ||||
|  	p.logFilePair.child.Close() | ||||
|  	if err != nil { | ||||
| @@ -143,6 +184,10 @@ func (p *setnsProcess) start() (retErr error) {
 | ||||
|  			} | ||||
|  		} | ||||
|  	} | ||||
| +	// Set final CPU affinity right after the process is moved into container's cgroup.
 | ||||
| +	if err := p.setFinalCPUAffinity(); err != nil {
 | ||||
| +		return err
 | ||||
| +	}
 | ||||
|  	if p.intelRdtPath != "" { | ||||
|  		// if Intel RDT "resource control" filesystem path exists | ||||
|  		_, err := os.Stat(p.intelRdtPath) | ||||
| diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go
 | ||||
| index 7dbfb869..b59e0d59 100644
 | ||||
| --- a/libcontainer/specconv/spec_linux.go
 | ||||
| +++ b/libcontainer/specconv/spec_linux.go
 | ||||
| @@ -493,6 +493,11 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
 | ||||
|  				Ambient:     spec.Process.Capabilities.Ambient, | ||||
|  			} | ||||
|  		} | ||||
| +		config.ExecCPUAffinity, err = configs.ConvertCPUAffinity(spec.Process.ExecCPUAffinity)
 | ||||
| +		if err != nil {
 | ||||
| +			return nil, err
 | ||||
| +		}
 | ||||
| +
 | ||||
|  	} | ||||
|  	createHooks(spec, config) | ||||
|  	config.Version = specs.Version | ||||
| diff --git a/tests/integration/cpu_affinity.bats b/tests/integration/cpu_affinity.bats
 | ||||
| new file mode 100644 | ||||
| index 00000000..f6adfa2a
 | ||||
| --- /dev/null
 | ||||
| +++ b/tests/integration/cpu_affinity.bats
 | ||||
| @@ -0,0 +1,101 @@
 | ||||
| +#!/usr/bin/env bats
 | ||||
| +# Exec CPU affinity tests. For more details, see:
 | ||||
| +#  - https://github.com/opencontainers/runtime-spec/pull/1253
 | ||||
| +
 | ||||
| +load helpers
 | ||||
| +
 | ||||
| +function setup() {
 | ||||
| +	requires smp cgroups_cpuset
 | ||||
| +	setup_busybox
 | ||||
| +}
 | ||||
| +
 | ||||
| +function teardown() {
 | ||||
| +	teardown_bundle
 | ||||
| +}
 | ||||
| +
 | ||||
| +function first_cpu() {
 | ||||
| +	sed 's/[-,].*//g' </sys/devices/system/cpu/online
 | ||||
| +}
 | ||||
| +
 | ||||
| +# Convert list of cpus ("0,1" or "0-1") to mask as printed by nsexec.
 | ||||
| +# NOTE the range conversion is not proper, merely sufficient for tests here.
 | ||||
| +function cpus_to_mask() {
 | ||||
| +	local cpus=$* mask=0
 | ||||
| +
 | ||||
| +	cpus=${cpus//,/-} # 1. "," --> "-".
 | ||||
| +	cpus=${cpus//-/ } # 2. "-" --> " ".
 | ||||
| +
 | ||||
| +	for c in $cpus; do
 | ||||
| +		mask=$((mask | 1 << c))
 | ||||
| +	done
 | ||||
| +
 | ||||
| +	printf "0x%x" $mask
 | ||||
| +}
 | ||||
| +
 | ||||
| +@test "runc exec [CPU affinity, only initial set from process.json]" {
 | ||||
| +	first="$(first_cpu)"
 | ||||
| +	second=$((first + 1)) # Hacky; might not work in all environments.
 | ||||
| +
 | ||||
| +	runc run -d --console-socket "$CONSOLE_SOCKET" ct1
 | ||||
| +	[ "$status" -eq 0 ]
 | ||||
| +
 | ||||
| +	for cpus in "$second" "$first-$second" "$first,$second" "$first"; do
 | ||||
| +		proc='
 | ||||
| +{
 | ||||
| +    "terminal": false,
 | ||||
| +    "execCPUAffinity": {
 | ||||
| +	    "initial": "'$cpus'"
 | ||||
| +    },
 | ||||
| +    "args": [ "/bin/true" ],
 | ||||
| +    "cwd": "/"
 | ||||
| +}'
 | ||||
| +		mask=$(cpus_to_mask "$cpus")
 | ||||
| +		echo "CPUS: $cpus, mask: $mask"
 | ||||
| +		runc --debug exec --process <(echo "$proc") ct1
 | ||||
| +		[[ "$output" == *"nsexec"*": affinity: $mask"* ]]
 | ||||
| +	done
 | ||||
| +}
 | ||||
| +
 | ||||
| +@test "runc exec [CPU affinity, initial and final set from process.json]" {
 | ||||
| +	first="$(first_cpu)"
 | ||||
| +	second=$((first + 1)) # Hacky; might not work in all environments.
 | ||||
| +
 | ||||
| +	runc run -d --console-socket "$CONSOLE_SOCKET" ct1
 | ||||
| +	[ "$status" -eq 0 ]
 | ||||
| +
 | ||||
| +	for cpus in "$second" "$first-$second" "$first,$second" "$first"; do
 | ||||
| +		proc='
 | ||||
| +{
 | ||||
| +    "terminal": false,
 | ||||
| +    "execCPUAffinity": {
 | ||||
| +	    "initial": "'$cpus'",
 | ||||
| +	    "final": "'$cpus'"
 | ||||
| +    },
 | ||||
| +    "args": [ "/bin/grep", "-F", "Cpus_allowed_list:", "/proc/self/status" ],
 | ||||
| +    "cwd": "/"
 | ||||
| +}'
 | ||||
| +		mask=$(cpus_to_mask "$cpus")
 | ||||
| +		exp=${cpus//,/-} # "," --> "-".
 | ||||
| +		echo "CPUS: $cpus, mask: $mask, final: $exp"
 | ||||
| +		runc --debug exec --process <(echo "$proc") ct1
 | ||||
| +		[[ "$output" == *"nsexec"*": affinity: $mask"* ]]
 | ||||
| +		[[ "$output" == *"Cpus_allowed_list:	$exp"* ]] # Mind the literal tab.
 | ||||
| +	done
 | ||||
| +}
 | ||||
| +
 | ||||
| +@test "runc exec [CPU affinity, initial and final set from config.json]" {
 | ||||
| +	initial="$(first_cpu)"
 | ||||
| +	final=$((initial + 1)) # Hacky; might not work in all environments.
 | ||||
| +
 | ||||
| +	update_config "	  .process.execCPUAffinity.initial = \"$initial\"
 | ||||
| +			| .process.execCPUAffinity.final = \"$final\""
 | ||||
| +
 | ||||
| +	runc run -d --console-socket "$CONSOLE_SOCKET" ct1
 | ||||
| +	[ "$status" -eq 0 ]
 | ||||
| +
 | ||||
| +	runc --debug exec ct1 grep "Cpus_allowed_list:" /proc/self/status
 | ||||
| +	[ "$status" -eq 0 ]
 | ||||
| +	mask=$(cpus_to_mask "$initial")
 | ||||
| +	[[ "$output" == *"nsexec"*": affinity: $mask"* ]]
 | ||||
| +	[[ "$output" == *"Cpus_allowed_list:	$final"* ]] # Mind the literal tab.
 | ||||
| +}
 | ||||
| diff --git a/utils_linux.go b/utils_linux.go
 | ||||
| index 60d534e8..30204133 100644
 | ||||
| --- a/utils_linux.go
 | ||||
| +++ b/utils_linux.go
 | ||||
| @@ -109,6 +109,12 @@ func newProcess(p specs.Process) (*libcontainer.Process, error) {
 | ||||
|  		} | ||||
|  		lp.Rlimits = append(lp.Rlimits, rl) | ||||
|  	} | ||||
| +	aff, err := configs.ConvertCPUAffinity(p.ExecCPUAffinity)
 | ||||
| +	if err != nil {
 | ||||
| +		return nil, err
 | ||||
| +	}
 | ||||
| +	lp.CPUAffinity = aff
 | ||||
| +
 | ||||
|  	return lp, nil | ||||
|  } | ||||
|   | ||||
| -- 
 | ||||
| 2.47.1 | ||||
| 
 | ||||
| @ -1,200 +0,0 @@ | ||||
| From ecf53c23545092019602578583031c28fde4d2a1 Mon Sep 17 00:00:00 2001 | ||||
| From: Giuseppe Scrivano <gscrivan@redhat.com> | ||||
| Date: Fri, 25 May 2018 18:04:06 +0200 | ||||
| Subject: [PATCH] sd-notify: do not hang when NOTIFY_SOCKET is used with create | ||||
| 
 | ||||
| if NOTIFY_SOCKET is used, do not block the main runc process waiting | ||||
| for events on the notify socket.  Change the logic to create a new | ||||
| process that monitors exclusively the notify socket until an event is | ||||
| received. | ||||
| 
 | ||||
| Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com> | ||||
| ---
 | ||||
|  init.go          |  12 +++++++ | ||||
|  notify_socket.go | 101 ++++++++++++++++++++++++++++++++++++++++++++++--------- | ||||
|  signals.go       |   5 +-- | ||||
|  3 files changed, 99 insertions(+), 19 deletions(-) | ||||
| 
 | ||||
| diff --git a/init.go b/init.go
 | ||||
| index c8f453192..6a3d9e91c 100644
 | ||||
| --- a/init.go
 | ||||
| +++ b/init.go
 | ||||
| @@ -20,6 +20,18 @@ var initCommand = cli.Command{
 | ||||
|  	Name:  "init", | ||||
|  	Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`, | ||||
|  	Action: func(context *cli.Context) error { | ||||
| +		// If NOTIFY_SOCKET is used create a new process that stays around
 | ||||
| +		// so to not block "runc start".  It will automatically exit when the
 | ||||
| +		// container notifies that it is ready, or when the container is deleted
 | ||||
| +		if os.Getenv("_NOTIFY_SOCKET_FD") != "" {
 | ||||
| +			fd := os.Getenv("_NOTIFY_SOCKET_FD")
 | ||||
| +			pid := os.Getenv("_NOTIFY_SOCKET_PID")
 | ||||
| +			hostNotifySocket := os.Getenv("_NOTIFY_SOCKET_HOST")
 | ||||
| +			notifySocketPath := os.Getenv("_NOTIFY_SOCKET_PATH")
 | ||||
| +			notifySocketInit(fd, pid, hostNotifySocket, notifySocketPath)
 | ||||
| +			os.Exit(0)
 | ||||
| +		}
 | ||||
| +
 | ||||
|  		factory, _ := libcontainer.New("") | ||||
|  		if err := factory.StartInitialization(); err != nil { | ||||
|  			// as the error is sent back to the parent there is no need to log | ||||
| diff --git a/notify_socket.go b/notify_socket.go
 | ||||
| index cd6c0a989..e04e9d660 100644
 | ||||
| --- a/notify_socket.go
 | ||||
| +++ b/notify_socket.go
 | ||||
| @@ -6,10 +6,13 @@ import (
 | ||||
|  	"bytes" | ||||
|  	"fmt" | ||||
|  	"net" | ||||
| +	"os"
 | ||||
| +	"os/exec"
 | ||||
|  	"path/filepath" | ||||
| +	"strconv"
 | ||||
| +	"time"
 | ||||
|   | ||||
|  	"github.com/opencontainers/runtime-spec/specs-go" | ||||
| -
 | ||||
|  	"github.com/sirupsen/logrus" | ||||
|  	"github.com/urfave/cli" | ||||
|  ) | ||||
| @@ -64,24 +67,94 @@ func (s *notifySocket) setupSocket() error {
 | ||||
|  	return nil | ||||
|  } | ||||
|   | ||||
| +func (notifySocket *notifySocket) notifyNewPid(pid int) {
 | ||||
| +	notifySocketHostAddr := net.UnixAddr{Name: notifySocket.host, Net: "unixgram"}
 | ||||
| +	client, err := net.DialUnix("unixgram", nil, &notifySocketHostAddr)
 | ||||
| +	if err != nil {
 | ||||
| +		return
 | ||||
| +	}
 | ||||
| +	newPid := fmt.Sprintf("MAINPID=%d\n", pid)
 | ||||
| +	client.Write([]byte(newPid))
 | ||||
| +}
 | ||||
| +
 | ||||
|  // pid1 must be set only with -d, as it is used to set the new process as the main process | ||||
|  // for the service in systemd | ||||
|  func (notifySocket *notifySocket) run(pid1 int) { | ||||
| -	buf := make([]byte, 512)
 | ||||
| -	notifySocketHostAddr := net.UnixAddr{Name: notifySocket.host, Net: "unixgram"}
 | ||||
| -	client, err := net.DialUnix("unixgram", nil, &notifySocketHostAddr)
 | ||||
| +	file, err := notifySocket.socket.File()
 | ||||
|  	if err != nil { | ||||
|  		logrus.Error(err) | ||||
|  		return | ||||
|  	} | ||||
| -	for {
 | ||||
| -		r, err := notifySocket.socket.Read(buf)
 | ||||
| -		if err != nil {
 | ||||
| -			break
 | ||||
| +	defer file.Close()
 | ||||
| +	defer notifySocket.socket.Close()
 | ||||
| +
 | ||||
| +	cmd := exec.Command("/proc/self/exe", "init")
 | ||||
| +	cmd.ExtraFiles = []*os.File{file}
 | ||||
| +	cmd.Env = append(cmd.Env, "_NOTIFY_SOCKET_FD=3",
 | ||||
| +		fmt.Sprintf("_NOTIFY_SOCKET_PID=%d", pid1),
 | ||||
| +		fmt.Sprintf("_NOTIFY_SOCKET_HOST=%s", notifySocket.host),
 | ||||
| +		fmt.Sprintf("_NOTIFY_SOCKET_PATH=%s", notifySocket.socketPath))
 | ||||
| +
 | ||||
| +	if err := cmd.Start(); err != nil {
 | ||||
| +		logrus.Fatal(err)
 | ||||
| +	}
 | ||||
| +	notifySocket.notifyNewPid(cmd.Process.Pid)
 | ||||
| +	cmd.Process.Release()
 | ||||
| +}
 | ||||
| +
 | ||||
| +func notifySocketInit(envFd string, envPid string, notifySocketHost string, notifySocketPath string) {
 | ||||
| +	intFd, err := strconv.Atoi(envFd)
 | ||||
| +	if err != nil {
 | ||||
| +		return
 | ||||
| +	}
 | ||||
| +	pid1, err := strconv.Atoi(envPid)
 | ||||
| +	if err != nil {
 | ||||
| +		return
 | ||||
| +	}
 | ||||
| +
 | ||||
| +	file := os.NewFile(uintptr(intFd), "unixgram")
 | ||||
| +	defer file.Close()
 | ||||
| +
 | ||||
| +	fileChan := make(chan []byte)
 | ||||
| +	exitChan := make(chan bool)
 | ||||
| +
 | ||||
| +	go func() {
 | ||||
| +		for {
 | ||||
| +			buf := make([]byte, 512)
 | ||||
| +			r, err := file.Read(buf)
 | ||||
| +			if err != nil {
 | ||||
| +				return
 | ||||
| +			}
 | ||||
| +			fileChan <- buf[0:r]
 | ||||
|  		} | ||||
| -		var out bytes.Buffer
 | ||||
| -		for _, line := range bytes.Split(buf[0:r], []byte{'\n'}) {
 | ||||
| -			if bytes.HasPrefix(line, []byte("READY=")) {
 | ||||
| +	}()
 | ||||
| +	go func() {
 | ||||
| +		for {
 | ||||
| +			if _, err := os.Stat(notifySocketPath); os.IsNotExist(err) {
 | ||||
| +				exitChan <- true
 | ||||
| +				return
 | ||||
| +			}
 | ||||
| +			time.Sleep(time.Second)
 | ||||
| +		}
 | ||||
| +	}()
 | ||||
| +
 | ||||
| +	notifySocketHostAddr := net.UnixAddr{Name: notifySocketHost, Net: "unixgram"}
 | ||||
| +	client, err := net.DialUnix("unixgram", nil, &notifySocketHostAddr)
 | ||||
| +	if err != nil {
 | ||||
| +		return
 | ||||
| +	}
 | ||||
| +
 | ||||
| +	for {
 | ||||
| +		select {
 | ||||
| +		case <-exitChan:
 | ||||
| +			return
 | ||||
| +		case b := <-fileChan:
 | ||||
| +			for _, line := range bytes.Split(b, []byte{'\n'}) {
 | ||||
| +				if !bytes.HasPrefix(line, []byte("READY=")) {
 | ||||
| +					continue
 | ||||
| +				}
 | ||||
| +
 | ||||
| +				var out bytes.Buffer
 | ||||
|  				_, err = out.Write(line) | ||||
|  				if err != nil { | ||||
|  					return | ||||
| @@ -98,10 +171,8 @@ func (notifySocket *notifySocket) run(pid1 int) {
 | ||||
|  				} | ||||
|   | ||||
|  				// now we can inform systemd to use pid1 as the pid to monitor | ||||
| -				if pid1 > 0 {
 | ||||
| -					newPid := fmt.Sprintf("MAINPID=%d\n", pid1)
 | ||||
| -					client.Write([]byte(newPid))
 | ||||
| -				}
 | ||||
| +				newPid := fmt.Sprintf("MAINPID=%d\n", pid1)
 | ||||
| +				client.Write([]byte(newPid))
 | ||||
|  				return | ||||
|  			} | ||||
|  		} | ||||
| diff --git a/signals.go b/signals.go
 | ||||
| index 1811de837..d0988cb39 100644
 | ||||
| --- a/signals.go
 | ||||
| +++ b/signals.go
 | ||||
| @@ -70,7 +70,7 @@ func (h *signalHandler) forward(process *libcontainer.Process, tty *tty, detach
 | ||||
|  			h.notifySocket.run(pid1) | ||||
|  			return 0, nil | ||||
|  		} else { | ||||
| -			go h.notifySocket.run(0)
 | ||||
| +			h.notifySocket.run(os.Getpid())
 | ||||
|  		} | ||||
|  	} | ||||
|   | ||||
| @@ -98,9 +98,6 @@ func (h *signalHandler) forward(process *libcontainer.Process, tty *tty, detach
 | ||||
|  					// status because we must ensure that any of the go specific process | ||||
|  					// fun such as flushing pipes are complete before we return. | ||||
|  					process.Wait() | ||||
| -					if h.notifySocket != nil {
 | ||||
| -						h.notifySocket.Close()
 | ||||
| -					}
 | ||||
|  					return e.status, nil | ||||
|  				} | ||||
|  			} | ||||
| @ -1 +0,0 @@ | ||||
| fs.may_detach_mounts=1 | ||||
| @ -1,61 +0,0 @@ | ||||
| diff --git a/list.go b/list.go
 | ||||
| index 0313d8c..328798b 100644
 | ||||
| --- a/list.go
 | ||||
| +++ b/list.go
 | ||||
| @@ -50,7 +50,7 @@ var listCommand = cli.Command{
 | ||||
|  	ArgsUsage: ` | ||||
|   | ||||
|  Where the given root is specified via the global option "--root" | ||||
| -(default: "/run/runc").
 | ||||
| +(default: "/run/runc-ctrs").
 | ||||
|   | ||||
|  EXAMPLE 1: | ||||
|  To list containers created via the default "--root": | ||||
| diff --git a/main.go b/main.go
 | ||||
| index 278399a..0f49fce 100644
 | ||||
| --- a/main.go
 | ||||
| +++ b/main.go
 | ||||
| @@ -62,7 +62,7 @@ func main() {
 | ||||
|  	v = append(v, fmt.Sprintf("spec: %s", specs.Version)) | ||||
|  	app.Version = strings.Join(v, "\n") | ||||
|   | ||||
| -	root := "/run/runc"
 | ||||
| +	root := "/run/runc-ctrs"
 | ||||
|  	rootless, err := isRootless(nil) | ||||
|  	if err != nil { | ||||
|  		fatal(err) | ||||
| @@ -70,7 +70,7 @@ func main() {
 | ||||
|  	if rootless { | ||||
|  		runtimeDir := os.Getenv("XDG_RUNTIME_DIR") | ||||
|  		if runtimeDir != "" { | ||||
| -			root = runtimeDir + "/runc"
 | ||||
| +			root = runtimeDir + "/runc-ctrs"
 | ||||
|  			// According to the XDG specification, we need to set anything in | ||||
|  			// XDG_RUNTIME_DIR to have a sticky bit if we don't want it to get | ||||
|  			// auto-pruned. | ||||
| diff --git a/man/runc-list.8.md b/man/runc-list.8.md
 | ||||
| index f737424..107220e 100644
 | ||||
| --- a/man/runc-list.8.md
 | ||||
| +++ b/man/runc-list.8.md
 | ||||
| @@ -6,7 +6,7 @@
 | ||||
|   | ||||
|  # EXAMPLE | ||||
|  Where the given root is specified via the global option "--root" | ||||
| -(default: "/run/runc").
 | ||||
| +(default: "/run/runc-ctrs").
 | ||||
|   | ||||
|  To list containers created via the default "--root": | ||||
|         # runc list | ||||
| diff --git a/man/runc.8.md b/man/runc.8.md
 | ||||
| index 6d0ddff..337bc73 100644
 | ||||
| --- a/man/runc.8.md
 | ||||
| +++ b/man/runc.8.md
 | ||||
| @@ -51,7 +51,7 @@ value for "bundle" is the current directory.
 | ||||
|     --debug              enable debug output for logging | ||||
|     --log value          set the log file path where internal debug information is written (default: "/dev/null") | ||||
|     --log-format value   set the format used by logs ('text' (default), or 'json') (default: "text") | ||||
| -   --root value         root directory for storage of container state (this should be located in tmpfs) (default: "/run/runc" or $XDG_RUNTIME_DIR/runc for rootless containers)
 | ||||
| +   --root value         root directory for storage of container state (this should be located in tmpfs) (default: "/run/runc-ctrs" or $XDG_RUNTIME_DIR/runc-ctrs for rootless containers)
 | ||||
|     --criu value         path to the criu binary used for checkpoint and restore (default: "criu") | ||||
|     --systemd-cgroup     enable systemd cgroup support, expects cgroupsPath to be of form "slice:prefix:name" for e.g. "system.slice:runc:434234" | ||||
|     --rootless value    enable rootless mode ('true', 'false', or 'auto') (default: "auto") | ||||
| @ -1,72 +0,0 @@ | ||||
| From 28a697cce3e4f905dca700eda81d681a30eef9cd Mon Sep 17 00:00:00 2001 | ||||
| From: Giuseppe Scrivano <gscrivan@redhat.com> | ||||
| Date: Fri, 11 Jan 2019 21:53:45 +0100 | ||||
| Subject: [PATCH] rootfs: umount all procfs and sysfs with --no-pivot | ||||
| 
 | ||||
| When creating a new user namespace, the kernel doesn't allow to mount | ||||
| a new procfs or sysfs file system if there is not already one instance | ||||
| fully visible in the current mount namespace. | ||||
| 
 | ||||
| When using --no-pivot we were effectively inhibiting this protection | ||||
| from the kernel, as /proc and /sys from the host are still present in | ||||
| the container mount namespace. | ||||
| 
 | ||||
| A container without full access to /proc could then create a new user | ||||
| namespace, and from there able to mount a fully visible /proc, bypassing | ||||
| the limitations in the container. | ||||
| 
 | ||||
| A simple reproducer for this issue is: | ||||
| 
 | ||||
| unshare -mrfp sh -c "mount -t proc none /proc && echo c > /proc/sysrq-trigger" | ||||
| 
 | ||||
| Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com> | ||||
| ---
 | ||||
|  libcontainer/rootfs_linux.go | 35 +++++++++++++++++++++++++++++++++++ | ||||
|  1 file changed, 35 insertions(+) | ||||
| 
 | ||||
| diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
 | ||||
| index e7c2f8ada..6bd6da74a 100644
 | ||||
| --- a/libcontainer/rootfs_linux.go
 | ||||
| +++ b/libcontainer/rootfs_linux.go
 | ||||
| @@ -748,6 +748,41 @@ func pivotRoot(rootfs string) error {
 | ||||
|  } | ||||
|   | ||||
|  func msMoveRoot(rootfs string) error { | ||||
| +	mountinfos, err := mount.GetMounts()
 | ||||
| +	if err != nil {
 | ||||
| +		return err
 | ||||
| +	}
 | ||||
| +
 | ||||
| +	absRootfs, err := filepath.Abs(rootfs)
 | ||||
| +	if err != nil {
 | ||||
| +		return err
 | ||||
| +	}
 | ||||
| +
 | ||||
| +	for _, info := range mountinfos {
 | ||||
| +		p, err := filepath.Abs(info.Mountpoint)
 | ||||
| +		if err != nil {
 | ||||
| +			return err
 | ||||
| +		}
 | ||||
| +		// Umount every sysfs and proc file systems, except those under the container rootfs
 | ||||
| +		if (info.Fstype != "proc" && info.Fstype != "sysfs") || filepath.HasPrefix(p, absRootfs) {
 | ||||
| +			continue
 | ||||
| +		}
 | ||||
| +		// Be sure umount events are not propagated to the host.
 | ||||
| +		if err := unix.Mount("", p, "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
 | ||||
| +			return err
 | ||||
| +		}
 | ||||
| +		if err := unix.Unmount(p, unix.MNT_DETACH); err != nil {
 | ||||
| +			if err != unix.EINVAL && err != unix.EPERM {
 | ||||
| +				return err
 | ||||
| +			} else {
 | ||||
| +				// If we have not privileges for umounting (e.g. rootless), then
 | ||||
| +				// cover the path.
 | ||||
| +				if err := unix.Mount("tmpfs", p, "tmpfs", 0, ""); err != nil {
 | ||||
| +					return err
 | ||||
| +				}
 | ||||
| +			}
 | ||||
| +		}
 | ||||
| +	}
 | ||||
|  	if err := unix.Mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil { | ||||
|  		return err | ||||
|  	} | ||||
							
								
								
									
										290
									
								
								SPECS/runc.spec
									
									
									
									
									
								
							
							
						
						
									
										290
									
								
								SPECS/runc.spec
									
									
									
									
									
								
							| @ -1,52 +1,47 @@ | ||||
| %global with_debug 1 | ||||
| %global with_bundled 1 | ||||
| %global with_check 0 | ||||
| 
 | ||||
| %if 0%{?with_debug} | ||||
| %global _find_debuginfo_dwz_opts %{nil} | ||||
| %global _dwz_low_mem_die_limit 0 | ||||
| %else | ||||
| %global debug_package   %{nil} | ||||
| %endif | ||||
| 
 | ||||
| %if 0%{?rhel} > 7 && ! 0%{?fedora} | ||||
| %define gobuild(o:) \ | ||||
| go build -buildmode pie -compiler gc -tags="rpm_crashtraceback no_openssl ${BUILDTAGS:-}" -ldflags "${LDFLAGS:-} -compressdwarf=false -B 0x$(head -c20 /dev/urandom|od -An -tx1|tr -d ' \\n') -extldflags '%__global_ldflags'" -a -v -x %{?**}; | ||||
| %endif # distro | ||||
| go build -buildmode pie -compiler gc -tags="rpm_crashtraceback libtrust_openssl ${BUILDTAGS:-}" -ldflags "${LDFLAGS:-} -linkmode=external -compressdwarf=false -B 0x$(head -c20 /dev/urandom|od -An -tx1|tr -d ' \\n') -extldflags '%__global_ldflags'" -a -v %{?**}; | ||||
| %else | ||||
| %if ! 0%{?gobuild:1} | ||||
| %define gobuild(o:) GO111MODULE=off go build -buildmode pie -compiler gc -tags="rpm_crashtraceback ${BUILDTAGS:-}" -ldflags "${LDFLAGS:-} -linkmode=external -B 0x$(head -c20 /dev/urandom|od -An -tx1|tr -d ' \\n') -extldflags '-Wl,-z,relro -Wl,-z,now -specs=/usr/lib/rpm/redhat/redhat-hardened-ld '" -a -v %{?**}; | ||||
| %endif | ||||
| %endif | ||||
| 
 | ||||
| %global provider github | ||||
| %global provider_tld com | ||||
| %global project opencontainers | ||||
| %global repo runc | ||||
| # https://github.com/opencontainers/runc | ||||
| %global provider_prefix %{provider}.%{provider_tld}/%{project}/%{repo} | ||||
| %global import_path %{provider_prefix} | ||||
| %global git0 https://github.com/opencontainers/runc | ||||
| %global commit0 2abd837c8c25b0102ac4ce14f17bc0bc7ddffba7 | ||||
| %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) | ||||
| %global import_path %{provider}.%{provider_tld}/%{project}/%{repo} | ||||
| %global git0 https://%{import_path} | ||||
| 
 | ||||
| Epoch: 1 | ||||
| Name: %{repo} | ||||
| Version: 1.0.0 | ||||
| Release: 56.rc5.dev.git%{shortcommit0}%{?dist} | ||||
| Version: 1.1.12 | ||||
| Release: 6%{?dist} | ||||
| Summary: CLI for running Open Containers | ||||
| # https://fedoraproject.org/wiki/PackagingDrafts/Go#Go_Language_Architectures | ||||
| #ExclusiveArch: %%{go_arches} | ||||
| # still use arch exclude as the macro above still refers %%{ix86} in RHEL8.4: | ||||
| # https://bugzilla.redhat.com/show_bug.cgi?id=1905383 | ||||
| ExcludeArch: %{ix86} | ||||
| License: ASL 2.0 | ||||
| URL: http//%{provider_prefix} | ||||
| Source0: %{git0}/archive/%{commit0}/%{repo}-%{shortcommit0}.tar.gz | ||||
| Source1: 99-containers.conf | ||||
| Patch0: change-default-root.patch | ||||
| Patch1: 0001-Revert-Apply-cgroups-earlier.patch | ||||
| Patch2: 1807.patch | ||||
| Patch3: 0001-nsenter-clone-proc-self-exe-to-avoid-exposing-host-b-runc.patch | ||||
| Patch4: pivot-root.patch | ||||
| Requires: criu | ||||
| Requires(pre): container-selinux >= 2:2.2-2 | ||||
| 
 | ||||
| # If go_compiler is not set to 1, there is no virtual provide. Use golang instead. | ||||
| BuildRequires: %{?go_compiler:compiler(go-compiler)}%{!?go_compiler:golang} >= 1.6.2 | ||||
| URL: %{git0} | ||||
| Source0: %{git0}/archive/v%{version}.tar.gz | ||||
| Patch0: 0001-1.1-Bump-runtime-spec-to-latest-git-HEAD.patch | ||||
| Patch1: 0002-1.1-runc-exec-implement-CPU-affinity.patch | ||||
| Provides: oci-runtime | ||||
| BuildRequires: golang >= 1.21.4 | ||||
| BuildRequires: git | ||||
| BuildRequires: go-md2man | ||||
| BuildRequires: libseccomp-devel | ||||
| BuildRequires: /usr/bin/go-md2man | ||||
| BuildRequires: libseccomp-devel >= 2.5 | ||||
| Requires: libseccomp >= 2.5 | ||||
| Requires: criu | ||||
| 
 | ||||
| %description | ||||
| The runc command can be used to start containers which are packaged | ||||
| @ -54,7 +49,7 @@ in accordance with the Open Container Initiative's specifications, | ||||
| and to manage containers running under runc. | ||||
| 
 | ||||
| %prep | ||||
| %autosetup -Sgit -n %{repo}-%{commit0} | ||||
| %autosetup -Sgit | ||||
| sed -i '/\#\!\/bin\/bash/d' contrib/completions/bash/%{name} | ||||
| 
 | ||||
| %build | ||||
| @ -65,24 +60,19 @@ pushd GOPATH | ||||
| popd | ||||
| 
 | ||||
| pushd GOPATH/src/%{import_path} | ||||
| export GO111MODULE=off | ||||
| export GOPATH=%{gopath}:$(pwd)/GOPATH | ||||
| export BUILDTAGS="selinux seccomp" | ||||
| %gobuild -o %{name} %{import_path}  | ||||
| export CGO_CFLAGS="%{optflags} -D_GNU_SOURCE -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64" | ||||
| export BUILDTAGS="selinux seccomp no_openssl" | ||||
| export LDFLAGS="-X main.gitCommit= -X main.version=%{version}" | ||||
| %gobuild -o %{name} %{import_path} | ||||
| 
 | ||||
| pushd man | ||||
| ./md2man-all.sh | ||||
| popd | ||||
| 
 | ||||
| %install | ||||
| install -d -p %{buildroot}%{_bindir} | ||||
| install -p -m 755 %{name} %{buildroot}%{_bindir} | ||||
| 
 | ||||
| # install man pages | ||||
| install -d -p %{buildroot}%{_mandir}/man8 | ||||
| install -p -m 644 man/man8/* %{buildroot}%{_mandir}/man8 | ||||
| # install bash completion | ||||
| install -d -p %{buildroot}%{_datadir}/bash-completion/completions | ||||
| install -p -m 0644 contrib/completions/bash/%{name} %{buildroot}%{_datadir}/bash-completion/completions | ||||
| make install install-man install-bash DESTDIR=$RPM_BUILD_ROOT PREFIX=%{_prefix} LIBDIR=%{_libdir} BINDIR=%{_bindir} | ||||
| 
 | ||||
| %check | ||||
| 
 | ||||
| @ -97,12 +87,222 @@ install -p -m 0644 contrib/completions/bash/%{name} %{buildroot}%{_datadir}/bash | ||||
| %{_datadir}/bash-completion/completions/%{name} | ||||
| 
 | ||||
| %changelog | ||||
| * Thu Nov 28 2019 Jindrich Novy <jnovy@redhat.com> - 1.0.0-56.rc5.dev.git2abd837 | ||||
| * Mon Jan 20 2025 Jindrich Novy <jnovy@redhat.com> - 1:1.1.12-6 | ||||
| - Add CPU affinity feature from Kir Kolyshkin | ||||
| - Resolves: RHEL-74865 | ||||
| 
 | ||||
| * Tue Oct 01 2024 Kir Kolyshkin <kir@redhat.com> - 1:1.1.12-5 | ||||
| - bump golang buildrequires | ||||
| - add no_openssl build tag | ||||
| - Resolves: RHEL-55757 | ||||
| 
 | ||||
| * Mon Aug 05 2024 Jindrich Novy <jnovy@redhat.com> - 1:1.1.12-4 | ||||
| - rebuild for  golang fixes | ||||
| - Related: RHEL-28452 | ||||
| 
 | ||||
| * Thu Aug 01 2024 Jindrich Novy <jnovy@redhat.com> - 1:1.1.12-3 | ||||
| - rebuild for  golang fixes | ||||
| - Related: RHEL-28452 | ||||
| 
 | ||||
| * Fri Jun 21 2024 Jindrich Novy <jnovy@redhat.com> - 1:1.1.12-2 | ||||
| - rebuild for CVE-2024-1394 | ||||
| - Resolves: RHEL-24297 | ||||
| 
 | ||||
| * Thu Feb 01 2024 Jindrich Novy <jnovy@redhat.com> - 1:1.1.12-1 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.1.12 | ||||
| - Related: Jira:RHEL-2110 | ||||
| 
 | ||||
| * Tue Jan 02 2024 Jindrich Novy <jnovy@redhat.com> - 1:1.1.11-1 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.1.11 | ||||
| - Related: Jira:RHEL-2110 | ||||
| 
 | ||||
| * Wed Nov 08 2023 Jindrich Novy <jnovy@redhat.com> - 1:1.1.10-1 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.1.10 | ||||
| - require container-selinux >= 2.224.0 for dmz feature | ||||
| - Related: Jira:RHEL-2110 | ||||
| 
 | ||||
| * Fri Aug 11 2023 Jindrich Novy <jnovy@redhat.com> - 1:1.1.9-1 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.1.9 | ||||
| - Related: #2176055 | ||||
| 
 | ||||
| * Fri Jul 21 2023 Jindrich Novy <jnovy@redhat.com> - 1:1.1.8-1 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.1.8 | ||||
| - Related: #2176055 | ||||
| 
 | ||||
| * Fri Jun 16 2023 Jindrich Novy <jnovy@redhat.com> - 1:1.1.7-2 | ||||
| - rebuild for following CVEs: | ||||
| CVE-2022-41724 | ||||
| - Resolves: #2179972 | ||||
| 
 | ||||
| * Wed May 03 2023 Jindrich Novy <jnovy@redhat.com> - 1:1.1.7-1 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.1.7 | ||||
| - Related: #2176055 | ||||
| 
 | ||||
| * Wed Apr 12 2023 Jindrich Novy <jnovy@redhat.com> - 1:1.1.6-1 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.1.6 | ||||
| - Related: #2176055 | ||||
| 
 | ||||
| * Fri Mar 31 2023 Jindrich Novy <jnovy@redhat.com> - 1:1.1.5-1 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.1.5 | ||||
| - Related: #2176055 | ||||
| 
 | ||||
| * Thu Mar 09 2023 Jindrich Novy <jnovy@redhat.com> - 1:1.1.4-2 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.1.4 | ||||
| - Related: #2176055 | ||||
| 
 | ||||
| * Fri Aug 26 2022 Jindrich Novy <jnovy@redhat.com> - 1:1.1.4-1 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.1.4 | ||||
| - Related: #2061390 | ||||
| 
 | ||||
| * Thu Aug 25 2022 Jindrich Novy <jnovy@redhat.com> - 1:1.1.3-3 | ||||
| - fix "Error: runc: exec failed: unable to start container process: | ||||
|   open /dev/pts/0: operation not permitted: OCI permission denied" | ||||
| - Related: #2061390 | ||||
| 
 | ||||
| * Wed Jun 15 2022 Jindrich Novy <jnovy@redhat.com> - 1:1.1.3-2 | ||||
| - add patch in attempt to fix gating tests - thanks to Kir Kolyshkin | ||||
| - Related: #2061390 | ||||
| 
 | ||||
| * Thu Jun 09 2022 Jindrich Novy <jnovy@redhat.com> - 1:1.1.3-1 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.1.3 | ||||
| - Related: #2061390 | ||||
| 
 | ||||
| * Fri Jun 03 2022 Jindrich Novy <jnovy@redhat.com> - 1:1.1.2-1 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.1.2 | ||||
| - Related: #2061390 | ||||
| 
 | ||||
| * Thu May 12 2022 Jindrich Novy <jnovy@redhat.com> - 1:1.0.3-6 | ||||
| - Fix every podman run invocation generates two "Couldn't stat device | ||||
|   /dev/char/10:200: No such file or directory" lines in the journal | ||||
| - Related: #2061390 | ||||
| 
 | ||||
| * Wed May 11 2022 Jindrich Novy <jnovy@redhat.com> - 1:1.0.3-5 | ||||
| - BuildRequires: /usr/bin/go-md2man | ||||
| - Related: #2061390 | ||||
| 
 | ||||
| * Fri Apr 08 2022 Jindrich Novy <jnovy@redhat.com> - 1:1.0.3-4 | ||||
| - Related: #2061390 | ||||
| 
 | ||||
| * Tue Mar 08 2022 Jindrich Novy <jnovy@redhat.com> - 1:1.0.3-3 | ||||
| - require at least libseccomp >= 2.5 | ||||
| - Resolves: #2053990 | ||||
| 
 | ||||
| * Wed Feb 16 2022 Jindrich Novy <jnovy@redhat.com> - 1.0.3-2 | ||||
| - rollback to 1.0.3 due to gating test issues | ||||
| - Related: #2001445 | ||||
| 
 | ||||
| * Tue Jan 18 2022 Jindrich Novy <jnovy@redhat.com> - 1.1.0-1 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.1.0 | ||||
| - Related: #2001445 | ||||
| 
 | ||||
| * Mon Dec 06 2021 Jindrich Novy <jnovy@redhat.com> - 1.0.3-1 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.0.3 | ||||
| - Related: #2001445 | ||||
| 
 | ||||
| * Wed Aug 25 2021 Jindrich Novy <jnovy@redhat.com> - 1.0.2-1 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.0.2 | ||||
| - Related: #1934415 | ||||
| 
 | ||||
| * Fri Aug 06 2021 Jindrich Novy <jnovy@redhat.com> - 1.0.1-5 | ||||
| - do not use versioned provide | ||||
| - Related: #1934415 | ||||
| 
 | ||||
| * Thu Jul 29 2021 Jindrich Novy <jnovy@redhat.com> - 1.0.1-4 | ||||
| - fix "unknown version" displayed by runc -v | ||||
| - Related: #1934415 | ||||
| 
 | ||||
| * Mon Jul 26 2021 Jindrich Novy <jnovy@redhat.com> - 1.0.1-3 | ||||
| - be sure to compile runc binaries the right way | ||||
| - Related: #1934415 | ||||
| 
 | ||||
| * Mon Jul 26 2021 Jindrich Novy <jnovy@redhat.com> - 1.0.1-2 | ||||
| - use Makefile | ||||
| - Related: #1934415 | ||||
| 
 | ||||
| * Wed Jul 21 2021 Jindrich Novy <jnovy@redhat.com> - 1.0.1-1 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.0.1 | ||||
| - Related: #1934415 | ||||
| 
 | ||||
| * Thu May 20 2021 Jindrich Novy <jnovy@redhat.com> - 1.0.0-76.rc95 | ||||
| - updated to rc95 to fix CVE-2021-30465 | ||||
| - Related: #1934415 | ||||
| 
 | ||||
| * Tue May 18 2021 Jindrich Novy <jnovy@redhat.com> - 1.0.0-75.rc94 | ||||
| - set GO111MODULE=off to fix build | ||||
| - Related: #1934415 | ||||
| 
 | ||||
| * Fri May 14 2021 Jindrich Novy <jnovy@redhat.com> - 1.0.0-74.rc94 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.0.0-rc94 | ||||
| - Related: #1934415 | ||||
| 
 | ||||
| * Tue May 11 2021 Jindrich Novy <jnovy@redhat.com> - 1.0.0-73.rc93 | ||||
| - fix CVE-2021-30465 | ||||
| - Related: #1934415 | ||||
| 
 | ||||
| * Tue Mar 30 2021 Jindrich Novy <jnovy@redhat.com> - 1.0.0-72.rc93 | ||||
| - upload rc93 tarball | ||||
| - Related: #1934415 | ||||
| 
 | ||||
| * Tue Mar 30 2021 Jindrich Novy <jnovy@redhat.com> - 1.0.0-71.rc93 | ||||
| - update to rc93 | ||||
| - Related: #1934415 | ||||
| 
 | ||||
| * Fri Jan 29 2021 Jindrich Novy <jnovy@redhat.com> - 1.0.0-70.rc92 | ||||
| - add missing Provides: oci-runtime = 1 | ||||
| - Related: #1883490 | ||||
| 
 | ||||
| * Tue Dec 08 2020 Jindrich Novy <jnovy@redhat.com> - 1.0.0-69.rc92 | ||||
| - still use ExcludeArch as go_arches macro is broken for 8.4 | ||||
| - Related: #1883490 | ||||
| 
 | ||||
| * Tue Aug 11 2020 Jindrich Novy <jnovy@redhat.com> - 1.0.0-68.rc92 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.0.0-rc92 | ||||
| - propagate proper CFLAGS to CGO_CFLAGS to assure code hardening and optimization | ||||
| - Related: #1821193 | ||||
| 
 | ||||
| * Thu Jul 02 2020 Jindrich Novy <jnovy@redhat.com> - 1.0.0-67.rc91 | ||||
| - update to https://github.com/opencontainers/runc/releases/tag/v1.0.0-rc91 | ||||
| - Related: #1821193 | ||||
| 
 | ||||
| * Tue May 12 2020 Jindrich Novy <jnovy@redhat.com> - 1.0.0-66.rc10 | ||||
| - synchronize container-tools 8.3.0 with 8.2.1 | ||||
| - Related: #1821193 | ||||
| 
 | ||||
| * Wed Feb 12 2020 Jindrich Novy <jnovy@redhat.com> - 1.0.0-65.rc10 | ||||
| - address CVE-2019-19921 by updating to rc10 | ||||
| - Resolves: #1801887 | ||||
| 
 | ||||
| * Wed Dec 11 2019 Jindrich Novy <jnovy@redhat.com> - 1.0.0-64.rc9 | ||||
| - use no_openssl in BUILDTAGS (no vendored crypto in runc) | ||||
| - Related: RHELPLAN-25139 | ||||
| 
 | ||||
| * Mon Dec 09 2019 Jindrich Novy <jnovy@redhat.com> - 1.0.0-63.rc9 | ||||
| - be sure to use golang >= 1.12.12-4 | ||||
| - Related: RHELPLAN-25139 | ||||
| 
 | ||||
| * Thu Nov 28 2019 Jindrich Novy <jnovy@redhat.com> - 1.0.0-62.rc9 | ||||
| - rebuild because of CVE-2019-9512 and CVE-2019-9514 | ||||
| - Resolves: #1766328, #1766300 | ||||
| - Resolves: #1766331, #1766303 | ||||
| 
 | ||||
| * Thu Nov 21 2019 Jindrich Novy <jnovy@redhat.com> - 1.0.0-61.rc9 | ||||
| - update to runc 1.0.0-rc9 release | ||||
| - amend golang deps | ||||
| - fixes CVE-2019-16884 | ||||
| - Resolves: #1759651 | ||||
| 
 | ||||
| * Mon Jun 17 2019 Lokesh Mandvekar <lsm5@redhat.com> - 1.0.0-60.rc8 | ||||
| - Resolves: #1721247 - enable fips mode | ||||
| 
 | ||||
| * Mon Jun 17 2019 Lokesh Mandvekar <lsm5@redhat.com> - 1.0.0-59.rc8 | ||||
| - Resolves: #1720654 - rebase to v1.0.0-rc8 | ||||
| 
 | ||||
| * Thu Apr 11 2019 Eduardo Santiago <santiago@redhat.com> - 1.0.0-57.rc5.dev.git2abd837 | ||||
| - Resolves: #1693424 - podman rootless: cannot specify gid= mount options | ||||
| 
 | ||||
| * Wed Feb 27 2019 Lokesh Mandvekar <lsm5@redhat.com> - 1.0.0-56.rc5.dev.git2abd837 | ||||
| - change-default-root patch not needed as there's no docker on rhel8 | ||||
| 
 | ||||
| * Tue Feb 12 2019 Lokesh Mandvekar <lsm5@redhat.com> - 1.0.0-55.rc5.dev.git2abd837 | ||||
| - Resolves: #1665770 - rootfs: umount all procfs and sysfs with --no-pivot | ||||
| - Resolves: CVE-2019-5736 | ||||
| 
 | ||||
| * Tue Dec 18 2018 Frantisek Kluknavsky <fkluknav@redhat.com> - 1.0.0-54.rc5.dev.git2abd837 | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user