From 2a38ee711e449bde0befa0c0ea2d3d89cf864d61 Mon Sep 17 00:00:00 2001 From: DistroBaker Date: Tue, 16 Feb 2021 18:25:28 +0000 Subject: [PATCH] Merged update from upstream sources This is an automated DistroBaker update from upstream sources. If you do not know what this is about or would like to opt out, contact the OSCI team. Source: https://src.fedoraproject.org/rpms/systemd.git#0257583091a9b13d4ca7012ee3632f67af009b85 --- .gitignore | 1 + .zuul.yaml | 5 + 10-oomd-defaults.conf | 2 + 10-oomd-root-slice-defaults.conf | 2 + 10-oomd-user-service-defaults.conf | 3 + 17829.patch | 60 + 18361.patch | 403 ++++++ 18401.patch | 1201 +++++++++++++++++ 18444.patch | 987 ++++++++++++++ ...39f04efa278ac93881e6e364a6ae520b03e7.patch | 40 + owner-check.sh | 36 + owner-check.template | 20 + split-files.py | 3 + systemd.spec | 86 +- 14 files changed, 2829 insertions(+), 20 deletions(-) create mode 100644 .zuul.yaml create mode 100644 10-oomd-defaults.conf create mode 100644 10-oomd-root-slice-defaults.conf create mode 100644 10-oomd-user-service-defaults.conf create mode 100644 17829.patch create mode 100644 18361.patch create mode 100644 18401.patch create mode 100644 18444.patch create mode 100644 95ca39f04efa278ac93881e6e364a6ae520b03e7.patch create mode 100755 owner-check.sh create mode 100644 owner-check.template diff --git a/.gitignore b/.gitignore index 911034e..6cf7897 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *~ +/.mail.list /systemd-*/ /.build-*.log /x86_64/ diff --git a/.zuul.yaml b/.zuul.yaml new file mode 100644 index 0000000..591bb8a --- /dev/null +++ b/.zuul.yaml @@ -0,0 +1,5 @@ +- project: + vars: + install_repo_exclude: + - systemd-standalone-tmpfiles + - systemd-standalone-sysuser diff --git a/10-oomd-defaults.conf b/10-oomd-defaults.conf new file mode 100644 index 0000000..3660cd2 --- /dev/null +++ b/10-oomd-defaults.conf @@ -0,0 +1,2 @@ +[OOM] +DefaultMemoryPressureDurationSec=10s diff --git a/10-oomd-root-slice-defaults.conf 
b/10-oomd-root-slice-defaults.conf new file mode 100644 index 0000000..49958e8 --- /dev/null +++ b/10-oomd-root-slice-defaults.conf @@ -0,0 +1,2 @@ +[Slice] +ManagedOOMSwap=kill diff --git a/10-oomd-user-service-defaults.conf b/10-oomd-user-service-defaults.conf new file mode 100644 index 0000000..d78f327 --- /dev/null +++ b/10-oomd-user-service-defaults.conf @@ -0,0 +1,3 @@ +[Service] +ManagedOOMMemoryPressure=kill +ManagedOOMMemoryPressureLimit=4% diff --git a/17829.patch b/17829.patch new file mode 100644 index 0000000..176b969 --- /dev/null +++ b/17829.patch @@ -0,0 +1,60 @@ +From 14d044da23d6f2fa03066aedcc2600a479c1f731 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Wed, 2 Dec 2020 14:41:38 -0800 +Subject: [PATCH] test: fix TEST-56-OOMD thresholds for linux 5.9 changes + +Fixes #17533 + +The memory pressure values of the units in TEST-56-OOMD seemed to be a +lot lower after updating to linux 5.9. This is likely due to a fix from +https://github.com/torvalds/linux/commit/e22c6ed90aa91abc08f107344428ebb8c2629e98. + +To account for this, I lowered memory.high on testbloat.service to +throttle it even more. This was enough to generate the 50%+ value to trigger +oomd for the test, but as an extra precaution I also lowered the oomd +threshold to 1% so it's certain to try and kill testbloat.service. 
+--- + test/units/testsuite-56-testbloat.service | 6 +++--- + test/units/testsuite-56-workload.slice | 2 +- + test/units/testsuite-56.sh | 2 +- + 3 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/test/units/testsuite-56-testbloat.service b/test/units/testsuite-56-testbloat.service +index 40cf5a9f36f..6163aae1dba 100644 +--- a/test/units/testsuite-56-testbloat.service ++++ b/test/units/testsuite-56-testbloat.service +@@ -2,8 +2,8 @@ + Description=Create a lot of memory pressure + + [Service] +-# A very small memory.high will cause the script (trying to use a lot of memory) +-# to throttle and be put under heavy pressure +-MemoryHigh=2M ++# A VERY small memory.high will cause the script (trying to use a lot of memory) ++# to throttle and be put under heavy pressure. ++MemoryHigh=1M + Slice=testsuite-56-workload.slice + ExecStart=/usr/lib/systemd/tests/testdata/units/testsuite-56-slowgrowth.sh +diff --git a/test/units/testsuite-56-workload.slice b/test/units/testsuite-56-workload.slice +index 3d542ec2bae..45b04914c63 100644 +--- a/test/units/testsuite-56-workload.slice ++++ b/test/units/testsuite-56-workload.slice +@@ -7,4 +7,4 @@ MemoryAccounting=true + IOAccounting=true + TasksAccounting=true + ManagedOOMMemoryPressure=kill +-ManagedOOMMemoryPressureLimitPercent=50% ++ManagedOOMMemoryPressureLimitPercent=1% +diff --git a/test/units/testsuite-56.sh b/test/units/testsuite-56.sh +index 37d62d943c0..1846248855b 100755 +--- a/test/units/testsuite-56.sh ++++ b/test/units/testsuite-56.sh +@@ -19,7 +19,7 @@ systemctl start testsuite-56-testchill.service + + # Verify systemd-oomd is monitoring the expected units + oomctl | grep "/testsuite-56-workload.slice" +-oomctl | grep "50%" ++oomctl | grep "1%" + + # systemd-oomd watches for elevated pressure for 30 seconds before acting. + # It can take time to build up pressure so either wait 5 minutes or for the service to fail. 
diff --git a/18361.patch b/18361.patch new file mode 100644 index 0000000..282b7f3 --- /dev/null +++ b/18361.patch @@ -0,0 +1,403 @@ +From c20aa7b17166b9f331da33ad9288f9ede75c72db Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Sun, 24 Jan 2021 00:16:19 -0800 +Subject: [PATCH 1/4] oom: make memory pressure duration configurable through + oomd.conf + +--- + man/oomd.conf.xml | 12 +++++++++++- + src/oom/oomd-manager.c | 13 +++++++++---- + src/oom/oomd-manager.h | 5 +++-- + src/oom/oomd-util.h | 1 + + src/oom/oomd.c | 4 +++- + src/oom/oomd.conf | 1 + + test/units/testsuite-56.sh | 3 +++ + 7 files changed, 31 insertions(+), 8 deletions(-) + +diff --git a/man/oomd.conf.xml b/man/oomd.conf.xml +index 35a0686bc50..bb5da87c548 100644 +--- a/man/oomd.conf.xml ++++ b/man/oomd.conf.xml +@@ -65,13 +65,23 @@ + will take action. A unit can override this value with ManagedOOMMemoryPressureLimitPercent=. + The memory pressure for this property represents the fraction of time in a 10 second window in which all tasks + in the cgroup were delayed. For each monitored cgroup, if the memory pressure on that cgroup exceeds the +- limit set for more than 30 seconds, systemd-oomd will act on eligible descendant cgroups, ++ limit set for longer than the duration set by DefaultMemoryPressureDurationSec=, ++ systemd-oomd will act on eligible descendant cgroups, + starting from the ones with the most reclaim activity to the least reclaim activity. Which cgroups are + monitored and what action gets taken depends on what the unit has configured for + ManagedOOMMemoryPressure=. Takes a percentage value between 0% and 100%, inclusive. + Defaults to 60%. + + ++ ++ DefaultMemoryPressureDurationSec= ++ ++ Sets the amount of time a unit's cgroup needs to have exceeded memory pressure limits before ++ systemd-oomd will take action. Memory pressure limits are defined by ++ DefaultMemoryPressureLimitPercent= and ManagedOOMMemoryPressureLimitPercent=. 
++ Defaults to 30 seconds when this property is unset or set to 0. ++ ++ + + + +diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c +index fec96519e01..e8ed6a52739 100644 +--- a/src/oom/oomd-manager.c ++++ b/src/oom/oomd-manager.c +@@ -306,7 +306,7 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo + m->post_action_delay_start = 0; + } + +- r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, PRESSURE_DURATION_USEC, &targets); ++ r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets); + if (r == -ENOMEM) + return log_error_errno(r, "Failed to check if memory pressure exceeded limits"); + else if (r == 1) { +@@ -325,7 +325,7 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo + + SET_FOREACH(t, targets) { + log_notice("Memory pressure for %s is greater than %lu for more than %"PRIu64" seconds and there was reclaim activity", +- t->path, LOAD_INT(t->mem_pressure_limit), PRESSURE_DURATION_USEC / USEC_PER_SEC); ++ t->path, LOAD_INT(t->mem_pressure_limit), m->default_mem_pressure_duration_usec / USEC_PER_SEC); + + r = oomd_kill_by_pgscan(candidates, t->path, m->dry_run); + if (r == -ENOMEM) +@@ -471,7 +471,7 @@ static int manager_connect_bus(Manager *m) { + return 0; + } + +-int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit) { ++int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit, usec_t mem_pressure_usec) { + unsigned long l; + int r; + +@@ -487,6 +487,8 @@ int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressur + if (r < 0) + return r; + ++ m->default_mem_pressure_duration_usec = mem_pressure_usec ?: DEFAULT_MEM_PRESSURE_DURATION_USEC; ++ + r = manager_connect_bus(m); + if (r < 0) + return r; +@@ -505,6 +507,7 @@ int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressur + int 
manager_get_dump_string(Manager *m, char **ret) { + _cleanup_free_ char *dump = NULL; + _cleanup_fclose_ FILE *f = NULL; ++ char buf[FORMAT_TIMESPAN_MAX]; + OomdCGroupContext *c; + size_t size; + char *key; +@@ -521,10 +524,12 @@ int manager_get_dump_string(Manager *m, char **ret) { + "Dry Run: %s\n" + "Swap Used Limit: %u%%\n" + "Default Memory Pressure Limit: %lu%%\n" ++ "Default Memory Pressure Duration: %s\n" + "System Context:\n", + yes_no(m->dry_run), + m->swap_used_limit, +- LOAD_INT(m->default_mem_pressure_limit)); ++ LOAD_INT(m->default_mem_pressure_limit), ++ format_timespan(buf, sizeof(buf), m->default_mem_pressure_duration_usec, USEC_PER_SEC)); + oomd_dump_system_context(&m->system_context, f, "\t"); + + fprintf(f, "Swap Monitored CGroups:\n"); +diff --git a/src/oom/oomd-manager.h b/src/oom/oomd-manager.h +index 3f3eb5aa4b6..ede9903e5a6 100644 +--- a/src/oom/oomd-manager.h ++++ b/src/oom/oomd-manager.h +@@ -16,7 +16,7 @@ + * percentage of time all tasks were delayed (i.e. unproductive). + * Generally 60 or higher might be acceptable for something like system.slice with no memory.high set; processes in + * system.slice are assumed to be less latency sensitive. */ +-#define PRESSURE_DURATION_USEC (30 * USEC_PER_SEC) ++#define DEFAULT_MEM_PRESSURE_DURATION_USEC (30 * USEC_PER_SEC) + #define DEFAULT_MEM_PRESSURE_LIMIT 60 + #define DEFAULT_SWAP_USED_LIMIT 90 + +@@ -33,6 +33,7 @@ struct Manager { + bool dry_run; + unsigned swap_used_limit; + loadavg_t default_mem_pressure_limit; ++ usec_t default_mem_pressure_duration_usec; + + /* k: cgroup paths -> v: OomdCGroupContext + * Used to detect when to take action. 
*/ +@@ -53,7 +54,7 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); + + int manager_new(Manager **ret); + +-int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit); ++int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit, usec_t mem_pressure_usec); + + int manager_get_dump_string(Manager *m, char **ret); + +diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h +index 0834cbf09d7..d7a9890e7a2 100644 +--- a/src/oom/oomd-util.h ++++ b/src/oom/oomd-util.h +@@ -31,6 +31,7 @@ struct OomdCGroupContext { + + /* These are only used by oomd_pressure_above for acting on high memory pressure. */ + loadavg_t mem_pressure_limit; ++ usec_t mem_pressure_duration_usec; + usec_t last_hit_mem_pressure_limit; + }; + +diff --git a/src/oom/oomd.c b/src/oom/oomd.c +index 8cf776ec0f5..1b0f8ff6c40 100644 +--- a/src/oom/oomd.c ++++ b/src/oom/oomd.c +@@ -19,11 +19,13 @@ + static bool arg_dry_run = false; + static int arg_swap_used_limit = -1; + static int arg_mem_pressure_limit = -1; ++static usec_t arg_mem_pressure_usec = 0; + + static int parse_config(void) { + static const ConfigTableItem items[] = { + { "OOM", "SwapUsedLimitPercent", config_parse_percent, 0, &arg_swap_used_limit }, + { "OOM", "DefaultMemoryPressureLimitPercent", config_parse_percent, 0, &arg_mem_pressure_limit }, ++ { "OOM", "DefaultMemoryPressureDurationSec", config_parse_sec, 0, &arg_mem_pressure_usec }, + {} + }; + +@@ -160,7 +162,7 @@ static int run(int argc, char *argv[]) { + if (r < 0) + return log_error_errno(r, "Failed to create manager: %m"); + +- r = manager_start(m, arg_dry_run, arg_swap_used_limit, arg_mem_pressure_limit); ++ r = manager_start(m, arg_dry_run, arg_swap_used_limit, arg_mem_pressure_limit, arg_mem_pressure_usec); + if (r < 0) + return log_error_errno(r, "Failed to start up daemon: %m"); + +diff --git a/src/oom/oomd.conf b/src/oom/oomd.conf +index 8ac97169610..766cb1717f7 100644 +--- a/src/oom/oomd.conf ++++ 
b/src/oom/oomd.conf +@@ -14,3 +14,4 @@ + [OOM] + #SwapUsedLimitPercent=90% + #DefaultMemoryPressureLimitPercent=60% ++#DefaultMemoryPressureDurationSec=30s +diff --git a/test/units/testsuite-56.sh b/test/units/testsuite-56.sh +index 1846248855b..6e7941a57fc 100755 +--- a/test/units/testsuite-56.sh ++++ b/test/units/testsuite-56.sh +@@ -14,12 +14,15 @@ if [[ "$cgroup_type" != *"cgroup2"* ]] && [[ "$cgroup_type" != *"0x63677270"* ]] + fi + [[ -e /skipped ]] && exit 0 || true + ++echo "DefaultMemoryPressureDurationSec=5s" >> /etc/systemd/oomd.conf ++ + systemctl start testsuite-56-testbloat.service + systemctl start testsuite-56-testchill.service + + # Verify systemd-oomd is monitoring the expected units + oomctl | grep "/testsuite-56-workload.slice" + oomctl | grep "1%" ++oomctl | grep "Default Memory Pressure Duration: 5s" + + # systemd-oomd watches for elevated pressure for 30 seconds before acting. + # It can take time to build up pressure so either wait 5 minutes or for the service to fail. + +From 408a3bbd76326793ea5d1cf4e0a9444a4c252d86 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Sat, 23 Jan 2021 22:10:42 -0800 +Subject: [PATCH 2/4] oom: make swap a soft requirement + +--- + man/systemd-oomd.service.xml | 4 ++-- + src/oom/oomd-manager.c | 8 ++++++-- + src/oom/oomd.c | 6 ++---- + src/oom/test-oomd-util.c | 11 +++++++++++ + 4 files changed, 21 insertions(+), 8 deletions(-) + +diff --git a/man/systemd-oomd.service.xml b/man/systemd-oomd.service.xml +index 9cb9c6076a9..ebd2467ee23 100644 +--- a/man/systemd-oomd.service.xml ++++ b/man/systemd-oomd.service.xml +@@ -56,8 +56,8 @@ + + You will need a kernel compiled with PSI support. This is available in Linux 4.20 and above. + +- The system must also have swap enabled for systemd-oomd to function correctly. With swap +- enabled, the system spends enough time swapping pages to let systemd-oomd react. ++ It is highly recommended for the system to have swap enabled for systemd-oomd to function ++ optimally. 
With swap enabled, the system spends enough time swapping pages to let systemd-oomd react. + Without swap, the system enters a livelocked state much more quickly and may prevent systemd-oomd + from responding in a reasonable amount of time. See + "In defence of swap: common misconceptions" +diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c +index e8ed6a52739..814fda51f31 100644 +--- a/src/oom/oomd-manager.c ++++ b/src/oom/oomd-manager.c +@@ -6,6 +6,7 @@ + #include "cgroup-util.h" + #include "fd-util.h" + #include "fileio.h" ++#include "memory-util.h" + #include "oomd-manager-bus.h" + #include "oomd-manager.h" + #include "path-util.h" +@@ -294,9 +295,12 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo + return log_error_errno(r, "Failed to update monitored memory pressure cgroup contexts"); + + r = oomd_system_context_acquire("/proc/swaps", &m->system_context); +- /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM */ +- if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts))) ++ /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM. ++ * Allow ENOENT in the event that swap is disabled on the system. 
*/ ++ if (r == -ENOMEM || (r < 0 && r != -ENOENT && !hashmap_isempty(m->monitored_swap_cgroup_contexts))) + return log_error_errno(r, "Failed to acquire system context"); ++ else if (r == -ENOENT) ++ zero(m->system_context); + + /* If we're still recovering from a kill, don't try to kill again yet */ + if (m->post_action_delay_start > 0) { +diff --git a/src/oom/oomd.c b/src/oom/oomd.c +index 1b0f8ff6c40..1fbcf41492d 100644 +--- a/src/oom/oomd.c ++++ b/src/oom/oomd.c +@@ -142,10 +142,8 @@ static int run(int argc, char *argv[]) { + return log_error_errno(r, "Failed to get SwapTotal from /proc/meminfo: %m"); + + r = safe_atollu(swap, &s); +- if (r < 0) +- return log_error_errno(r, "Failed to parse SwapTotal from /proc/meminfo: %s: %m", swap); +- if (s == 0) +- return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Requires swap to operate"); ++ if (r < 0 || s == 0) ++ log_warning("Swap is currently not detected; memory pressure usage will be degraded"); + + if (!is_pressure_supported()) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Pressure Stall Information (PSI) is not supported"); +diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c +index 8143408902b..54fe2a03d14 100644 +--- a/src/oom/test-oomd-util.c ++++ b/src/oom/test-oomd-util.c +@@ -159,6 +159,11 @@ static void test_oomd_system_context_acquire(void) { + assert_se(ctx.swap_total == 0); + assert_se(ctx.swap_used == 0); + ++ assert_se(write_string_file(path, "Filename Type Size Used Priority", WRITE_STRING_FILE_CREATE) == 0); ++ assert_se(oomd_system_context_acquire(path, &ctx) == 0); ++ assert_se(ctx.swap_total == 0); ++ assert_se(ctx.swap_used == 0); ++ + assert_se(write_string_file(path, "Filename Type Size Used Priority\n" + "/swapvol/swapfile file 18971644 0 -3\n" + "/dev/vda2 partition 1999868 993780 -2", WRITE_STRING_FILE_CREATE) == 0); +@@ -268,6 +273,12 @@ static void test_oomd_swap_free_below(void) { + .swap_used = 3310136 * 1024U, + }; + assert_se(oomd_swap_free_below(&ctx, 20) == 
false); ++ ++ ctx = (OomdSystemContext) { ++ .swap_total = 0, ++ .swap_used = 0, ++ }; ++ assert_se(oomd_swap_free_below(&ctx, 20) == false); + } + + static void test_oomd_sort_cgroups(void) { + +From 924c89e9fe95d47b6ad94544bfdd5f087646daea Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Sun, 24 Jan 2021 01:22:51 -0800 +Subject: [PATCH 3/4] oom: fix reclaim activity detection + +This should have been checking for any reclaim activity within a larger interval +of time rather than within the past second. On systems with swap this +doesn't seem to have mattered too much as reclaim would always increase when +memory pressure was elevated. But testing in the no swap case having +this larger interval made a difference between oomd killing or not. +--- + src/oom/oomd-manager.c | 7 +++++-- + src/oom/oomd-manager.h | 2 ++ + 2 files changed, 7 insertions(+), 2 deletions(-) + +diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c +index 814fda51f31..3efa629002e 100644 +--- a/src/oom/oomd-manager.c ++++ b/src/oom/oomd-manager.c +@@ -302,6 +302,9 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo + else if (r == -ENOENT) + zero(m->system_context); + ++ if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts)) ++ m->last_reclaim_at = usec_now; ++ + /* If we're still recovering from a kill, don't try to kill again yet */ + if (m->post_action_delay_start > 0) { + if (m->post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now) +@@ -314,12 +317,12 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo + if (r == -ENOMEM) + return log_error_errno(r, "Failed to check if memory pressure exceeded limits"); + else if (r == 1) { +- /* Check if there was reclaim activity in the last interval. The concern is the following case: ++ /* Check if there was reclaim activity in the given interval. 
The concern is the following case: + * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending + * cgroup. Even after this, well-behaved processes will fault in recently resident pages and + * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need + * to kill something (it won't help anyways). */ +- if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts)) { ++ if ((usec_now - m->last_reclaim_at) <= RECLAIM_DURATION_USEC) { + _cleanup_hashmap_free_ Hashmap *candidates = NULL; + OomdCGroupContext *t; + +diff --git a/src/oom/oomd-manager.h b/src/oom/oomd-manager.h +index ede9903e5a6..ee17abced26 100644 +--- a/src/oom/oomd-manager.h ++++ b/src/oom/oomd-manager.h +@@ -20,6 +20,7 @@ + #define DEFAULT_MEM_PRESSURE_LIMIT 60 + #define DEFAULT_SWAP_USED_LIMIT 90 + ++#define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC) + #define POST_ACTION_DELAY_USEC (15 * USEC_PER_SEC) + + typedef struct Manager Manager; +@@ -42,6 +43,7 @@ struct Manager { + + OomdSystemContext system_context; + ++ usec_t last_reclaim_at; + usec_t post_action_delay_start; + + sd_event_source *cgroup_context_event_source; + +From 2e744a2cd89fc0ea67cf78cfba617b5105a26215 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Sun, 24 Jan 2021 01:34:23 -0800 +Subject: [PATCH 4/4] oom: update extended test to remove swap gating + +--- + test/units/testsuite-56.sh | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/test/units/testsuite-56.sh b/test/units/testsuite-56.sh +index 6e7941a57fc..4dc9d8c7a86 100755 +--- a/test/units/testsuite-56.sh ++++ b/test/units/testsuite-56.sh +@@ -6,7 +6,6 @@ systemd-analyze log-level debug + systemd-analyze log-target console + + # Loose checks to ensure the environment has the necessary features for systemd-oomd +-[[ "$( awk '/SwapTotal/ { print $2 }' /proc/meminfo )" != "0" ]] || echo "no swap" >> /skipped + [[ -e /proc/pressure ]] || echo "no PSI" >> /skipped + cgroup_type=$(stat 
-fc %T /sys/fs/cgroup/) + if [[ "$cgroup_type" != *"cgroup2"* ]] && [[ "$cgroup_type" != *"0x63677270"* ]]; then +@@ -16,8 +15,8 @@ fi + + echo "DefaultMemoryPressureDurationSec=5s" >> /etc/systemd/oomd.conf + +-systemctl start testsuite-56-testbloat.service + systemctl start testsuite-56-testchill.service ++systemctl start testsuite-56-testbloat.service + + # Verify systemd-oomd is monitoring the expected units + oomctl | grep "/testsuite-56-workload.slice" diff --git a/18401.patch b/18401.patch new file mode 100644 index 0000000..c42ae7e --- /dev/null +++ b/18401.patch @@ -0,0 +1,1201 @@ +From 2ccd5198faa8ca65001f90c551924e86bf737a85 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Mon, 25 Jan 2021 23:56:23 -0800 +Subject: [PATCH 1/7] oom: shorten xattr name + +--- + src/core/cgroup.c | 2 +- + src/oom/oomd-util.c | 4 ++-- + src/oom/test-oomd-util.c | 2 +- + 3 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/src/core/cgroup.c b/src/core/cgroup.c +index c9cf7fb16c6..70282a7abda 100644 +--- a/src/core/cgroup.c ++++ b/src/core/cgroup.c +@@ -2746,7 +2746,7 @@ int unit_check_oomd_kill(Unit *u) { + else if (r == 0) + return 0; + +- r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.systemd_oomd_kill", &value); ++ r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.oomd_kill", &value); + if (r < 0 && r != -ENODATA) + return r; + +diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c +index fcccddb92ea..80b9583440c 100644 +--- a/src/oom/oomd-util.c ++++ b/src/oom/oomd-util.c +@@ -201,9 +201,9 @@ int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) { + if (r < 0) + return r; + +- r = increment_oomd_xattr(path, "user.systemd_oomd_kill", set_size(pids_killed)); ++ r = increment_oomd_xattr(path, "user.oomd_kill", set_size(pids_killed)); + if (r < 0) +- log_debug_errno(r, "Failed to set user.systemd_oomd_kill on kill: %m"); ++ log_debug_errno(r, "Failed to set user.oomd_kill on kill: %m"); + + return 
set_size(pids_killed) != 0; + } +diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c +index 54fe2a03d14..3dec4f0ff06 100644 +--- a/src/oom/test-oomd-util.c ++++ b/src/oom/test-oomd-util.c +@@ -79,7 +79,7 @@ static void test_oomd_cgroup_kill(void) { + sleep(2); + assert_se(cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, cgroup) == true); + +- assert_se(cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.systemd_oomd_kill", &v) >= 0); ++ assert_se(cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_kill", &v) >= 0); + assert_se(memcmp(v, i == 0 ? "2" : "4", 2) == 0); + } + } + +From d38916b398127e005d0cf131092a99317661ec3c Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Fri, 5 Feb 2021 03:00:11 -0800 +Subject: [PATCH 2/7] oom: wrap reply.path with empty_to_root + +--- + src/oom/oomd-manager.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c +index 338935b3ec6..825fe38e189 100644 +--- a/src/oom/oomd-manager.c ++++ b/src/oom/oomd-manager.c +@@ -93,7 +93,7 @@ static int process_managed_oom_reply( + m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts; + + if (reply.mode == MANAGED_OOM_AUTO) { +- (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, reply.path)); ++ (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(reply.path))); + continue; + } + +@@ -109,7 +109,7 @@ static int process_managed_oom_reply( + } + } + +- ret = oomd_insert_cgroup_context(NULL, monitor_hm, reply.path); ++ ret = oomd_insert_cgroup_context(NULL, monitor_hm, empty_to_root(reply.path)); + if (ret == -ENOMEM) { + r = ret; + goto finish; +@@ -117,7 +117,7 @@ static int process_managed_oom_reply( + + /* Always update the limit in case it was changed. For non-memory pressure detection the value is + * ignored so always updating it here is not a problem. 
*/ +- ctx = hashmap_get(monitor_hm, reply.path); ++ ctx = hashmap_get(monitor_hm, empty_to_root(reply.path)); + if (ctx) + ctx->mem_pressure_limit = limit; + } + +From a695da238e7a6bd6eb440facc784aa6fca6c3d90 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Wed, 27 Jan 2021 23:43:13 -0800 +Subject: [PATCH 3/7] oom: sort by pgscan and memory usage + +If 2 candidates have the same pgscan, prioritize the one with the larger +memory usage. +--- + src/oom/oomd-util.c | 2 +- + src/oom/oomd-util.h | 5 ++++- + src/oom/test-oomd-util.c | 24 ++++++++++++++---------- + 3 files changed, 19 insertions(+), 12 deletions(-) + +diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c +index 80b9583440c..8f138d64c6c 100644 +--- a/src/oom/oomd-util.c ++++ b/src/oom/oomd-util.c +@@ -214,7 +214,7 @@ int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) { + + assert(h); + +- r = oomd_sort_cgroup_contexts(h, compare_pgscan, prefix, &sorted); ++ r = oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, prefix, &sorted); + if (r < 0) + return r; + +diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h +index d7a9890e7a2..f0648c5dcdd 100644 +--- a/src/oom/oomd-util.h ++++ b/src/oom/oomd-util.h +@@ -61,10 +61,13 @@ bool oomd_memory_reclaim(Hashmap *h); + /* Returns true if the amount of swap free is below the percentage of swap specified by `threshold_percent`. 
*/ + bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent); + +-static inline int compare_pgscan(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) { ++static inline int compare_pgscan_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) { + assert(c1); + assert(c2); + ++ if ((*c2)->pgscan == (*c1)->pgscan) ++ return CMP((*c2)->current_memory_usage, (*c1)->current_memory_usage); ++ + return CMP((*c2)->pgscan, (*c1)->pgscan); + } + +diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c +index 3dec4f0ff06..a1fe78806a1 100644 +--- a/src/oom/test-oomd-util.c ++++ b/src/oom/test-oomd-util.c +@@ -292,16 +292,20 @@ static void test_oomd_sort_cgroups(void) { + OomdCGroupContext ctx[4] = { + { .path = paths[0], + .swap_usage = 20, +- .pgscan = 60 }, ++ .pgscan = 60, ++ .current_memory_usage = 10 }, + { .path = paths[1], + .swap_usage = 60, +- .pgscan = 40 }, ++ .pgscan = 40, ++ .current_memory_usage = 20 }, + { .path = paths[2], + .swap_usage = 40, +- .pgscan = 20 }, ++ .pgscan = 40, ++ .current_memory_usage = 40 }, + { .path = paths[3], + .swap_usage = 10, +- .pgscan = 80 }, ++ .pgscan = 80, ++ .current_memory_usage = 10 }, + }; + + assert_se(h = hashmap_new(&string_hash_ops)); +@@ -318,16 +322,16 @@ static void test_oomd_sort_cgroups(void) { + assert_se(sorted_cgroups[3] == &ctx[3]); + sorted_cgroups = mfree(sorted_cgroups); + +- assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan, NULL, &sorted_cgroups) == 4); ++ assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, NULL, &sorted_cgroups) == 4); + assert_se(sorted_cgroups[0] == &ctx[3]); + assert_se(sorted_cgroups[1] == &ctx[0]); +- assert_se(sorted_cgroups[2] == &ctx[1]); +- assert_se(sorted_cgroups[3] == &ctx[2]); ++ assert_se(sorted_cgroups[2] == &ctx[2]); ++ assert_se(sorted_cgroups[3] == &ctx[1]); + sorted_cgroups = mfree(sorted_cgroups); + +- assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan, 
"/herp.slice/derp.scope", &sorted_cgroups) == 2); +- assert_se(sorted_cgroups[0] == &ctx[1]); +- assert_se(sorted_cgroups[1] == &ctx[2]); ++ assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, "/herp.slice/derp.scope", &sorted_cgroups) == 2); ++ assert_se(sorted_cgroups[0] == &ctx[2]); ++ assert_se(sorted_cgroups[1] == &ctx[1]); + assert_se(sorted_cgroups[2] == 0); + assert_se(sorted_cgroups[3] == 0); + sorted_cgroups = mfree(sorted_cgroups); + +From c73a2c3a6788a2a28899f29579fdd68816f60d59 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Thu, 28 Jan 2021 15:47:26 -0800 +Subject: [PATCH 4/7] oom: skip over cgroups with no memory usage + +--- + src/oom/oomd-util.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c +index 8f138d64c6c..fa8b8b70b19 100644 +--- a/src/oom/oomd-util.c ++++ b/src/oom/oomd-util.c +@@ -219,7 +219,8 @@ int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) { + return r; + + for (int i = 0; i < r; i++) { +- if (sorted[i]->pgscan == 0) ++ /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure */ ++ if (sorted[i]->pgscan == 0 && sorted[i]->current_memory_usage == 0) + break; + + r = oomd_cgroup_kill(sorted[i]->path, true, dry_run); + +From 63d6d9160523a2c1a71e96ff4125a1440d827b32 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Tue, 26 Jan 2021 00:57:36 -0800 +Subject: [PATCH 5/7] oom: implement avoid/omit xattr support + +There may be situations where a cgroup should be protected from killing +or deprioritized as a candidate. In FB oomd xattrs are used to bias oomd +away from supervisor cgroups and towards worker cgroups in container +tasks. On desktops this can be used to protect important units with +unpredictable resource consumption. + +The patch allows systemd-oomd to understand 2 xattrs: +"user.oomd_avoid" and "user.oomd_omit". If systemd-oomd sees these +xattrs set to 1 on a candidate cgroup (i.e. 
while attempting to kill something) +AND the cgroup is owned by root:root, it will either deprioritize the cgroup as +a candidate (avoid) or remove it completely as a candidate (omit). + +Usage is restricted to root:root cgroups to prevent situations where an +unprivileged user can set their own cgroups lower in the kill priority than +another user's (and prevent them from omitting their units from +systemd-oomd killing). +--- + src/basic/cgroup-util.c | 22 +++++++++ + src/basic/cgroup-util.h | 1 + + src/oom/oomd-util.c | 35 ++++++++++++--- + src/oom/oomd-util.h | 11 +++++ + src/oom/test-oomd-util.c | 54 +++++++++++++++++++++-- + test/test-functions | 1 + + test/units/testsuite-56-testmunch.service | 7 +++ + test/units/testsuite-56.sh | 31 +++++++++++-- + 8 files changed, 149 insertions(+), 13 deletions(-) + create mode 100644 test/units/testsuite-56-testmunch.service + +diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c +index b567822b7ef..45dc1142048 100644 +--- a/src/basic/cgroup-util.c ++++ b/src/basic/cgroup-util.c +@@ -1703,6 +1703,28 @@ int cg_get_attribute_as_bool(const char *controller, const char *path, const cha + return 0; + } + ++ ++int cg_get_owner(const char *controller, const char *path, uid_t *ret_uid, gid_t *ret_gid) { ++ _cleanup_free_ char *f = NULL; ++ struct stat stats; ++ int r; ++ ++ assert(ret_uid); ++ assert(ret_gid); ++ ++ r = cg_get_path(controller, path, NULL, &f); ++ if (r < 0) ++ return r; ++ ++ r = stat(f, &stats); ++ if (r < 0) ++ return -errno; ++ ++ *ret_uid = stats.st_uid; ++ *ret_gid = stats.st_gid; ++ return 0; ++} ++ + int cg_get_keyed_attribute_full( + const char *controller, + const char *path, +diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h +index bdc0d0d086c..63bd25f703e 100644 +--- a/src/basic/cgroup-util.h ++++ b/src/basic/cgroup-util.h +@@ -212,6 +212,7 @@ int cg_get_attribute_as_uint64(const char *controller, const char *path, const c + int cg_get_attribute_as_bool(const char *controller, 
const char *path, const char *attribute, bool *ret); + + int cg_set_access(const char *controller, const char *path, uid_t uid, gid_t gid); ++int cg_get_owner(const char *controller, const char *path, uid_t *ret_uid, gid_t *ret_gid); + + int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags); + int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size); +diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c +index fa8b8b70b19..db6383bf436 100644 +--- a/src/oom/oomd-util.c ++++ b/src/oom/oomd-util.c +@@ -159,7 +159,8 @@ int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const cha + return -ENOMEM; + + HASHMAP_FOREACH(item, h) { +- if (item->path && prefix && !path_startswith(item->path, prefix)) ++ /* Skip over cgroups that are not valid candidates or are explicitly marked for omission */ ++ if ((item->path && prefix && !path_startswith(item->path, prefix)) || item->omit) + continue; + + sorted[k++] = item; +@@ -219,9 +220,10 @@ int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) { + return r; + + for (int i = 0; i < r; i++) { +- /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure */ ++ /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure. */ ++ /* Don't break since there might be "avoid" cgroups at the end. */ + if (sorted[i]->pgscan == 0 && sorted[i]->current_memory_usage == 0) +- break; ++ continue; + + r = oomd_cgroup_kill(sorted[i]->path, true, dry_run); + if (r > 0 || r == -ENOMEM) +@@ -244,8 +246,10 @@ int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run) { + /* Try to kill cgroups with non-zero swap usage until we either succeed in + * killing or we get to a cgroup with no swap usage. */ + for (int i = 0; i < r; i++) { ++ /* Skip over cgroups with no resource usage. Don't break since there might be "avoid" ++ * cgroups at the end. 
*/ + if (sorted[i]->swap_usage == 0) +- break; ++ continue; + + r = oomd_cgroup_kill(sorted[i]->path, true, dry_run); + if (r > 0 || r == -ENOMEM) +@@ -257,8 +261,10 @@ int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run) { + + int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) { + _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL; +- _cleanup_free_ char *p = NULL, *val = NULL; ++ _cleanup_free_ char *p = NULL, *val = NULL, *avoid_val = NULL, *omit_val = NULL; + bool is_root; ++ uid_t uid; ++ gid_t gid; + int r; + + assert(path); +@@ -278,6 +284,25 @@ int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) { + if (r < 0) + return log_debug_errno(r, "Error parsing memory pressure from %s: %m", p); + ++ r = cg_get_owner(SYSTEMD_CGROUP_CONTROLLER, path, &uid, &gid); ++ if (r < 0) ++ log_debug_errno(r, "Failed to get owner/group from %s: %m", path); ++ else if (uid == 0 && gid == 0) { ++ /* Ignore most errors when reading the xattr since it is usually unset and cgroup xattrs are only used ++ * as an optional feature of systemd-oomd (and the system might not even support them). */ ++ r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_avoid", &avoid_val); ++ if (r >= 0 && streq(avoid_val, "1")) ++ ctx->avoid = true; ++ else if (r == -ENOMEM) ++ return r; ++ ++ r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_omit", &omit_val); ++ if (r >= 0 && streq(omit_val, "1")) ++ ctx->omit = true; ++ else if (r == -ENOMEM) ++ return r; ++ } ++ + if (is_root) { + r = procfs_memory_get_used(&ctx->current_memory_usage); + if (r < 0) +diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h +index f0648c5dcdd..ab6a8da1ef6 100644 +--- a/src/oom/oomd-util.h ++++ b/src/oom/oomd-util.h +@@ -29,6 +29,9 @@ struct OomdCGroupContext { + uint64_t last_pgscan; + uint64_t pgscan; + ++ bool avoid; ++ bool omit; ++ + /* These are only used by oomd_pressure_above for acting on high memory pressure. 
*/ + loadavg_t mem_pressure_limit; + usec_t mem_pressure_duration_usec; +@@ -61,10 +64,15 @@ bool oomd_memory_reclaim(Hashmap *h); + /* Returns true if the amount of swap free is below the percentage of swap specified by `threshold_percent`. */ + bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent); + ++/* The compare functions will sort from largest to smallest, putting all the contexts with "avoid" at the end ++ * (after the smallest values). */ + static inline int compare_pgscan_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) { + assert(c1); + assert(c2); + ++ if ((*c1)->avoid != (*c2)->avoid) ++ return CMP((*c1)->avoid, (*c2)->avoid); ++ + if ((*c2)->pgscan == (*c1)->pgscan) + return CMP((*c2)->current_memory_usage, (*c1)->current_memory_usage); + +@@ -75,6 +83,9 @@ static inline int compare_swap_usage(OomdCGroupContext * const *c1, OomdCGroupCo + assert(c1); + assert(c2); + ++ if ((*c1)->avoid != (*c2)->avoid) ++ return CMP((*c1)->avoid, (*c2)->avoid); ++ + return CMP((*c2)->swap_usage, (*c1)->swap_usage); + } + +diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c +index a1fe78806a1..193edee0eba 100644 +--- a/src/oom/test-oomd-util.c ++++ b/src/oom/test-oomd-util.c +@@ -89,6 +89,8 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) { + _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL; + _cleanup_free_ char *cgroup = NULL; + OomdCGroupContext *c1, *c2; ++ bool test_xattrs; ++ int r; + + if (geteuid() != 0) + return (void) log_tests_skipped("not root"); +@@ -101,6 +103,16 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) { + + assert_se(cg_pid_get_path(NULL, 0, &cgroup) >= 0); + ++ /* If we don't have permissions to set xattrs we're likely in a userns or missing capabilities ++ * so skip the xattr portions of the test. 
*/ ++ r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_test", "1", 1, 0); ++ test_xattrs = !ERRNO_IS_PRIVILEGE(r) && !ERRNO_IS_NOT_SUPPORTED(r); ++ ++ if (test_xattrs) { ++ assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_omit", "1", 1, 0) >= 0); ++ assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_avoid", "1", 1, 0) >= 0); ++ } ++ + assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0); + + assert_se(streq(ctx->path, cgroup)); +@@ -110,12 +122,21 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) { + assert_se(ctx->swap_usage == 0); + assert_se(ctx->last_pgscan == 0); + assert_se(ctx->pgscan == 0); ++ if (test_xattrs) { ++ assert_se(ctx->omit == true); ++ assert_se(ctx->avoid == true); ++ } else { ++ assert_se(ctx->omit == false); ++ assert_se(ctx->avoid == false); ++ } + ctx = oomd_cgroup_context_free(ctx); + + /* Test the root cgroup */ + assert_se(oomd_cgroup_context_acquire("", &ctx) == 0); + assert_se(streq(ctx->path, "/")); + assert_se(ctx->current_memory_usage > 0); ++ assert_se(ctx->omit == false); ++ assert_se(ctx->avoid == false); + + /* Test hashmap inserts */ + assert_se(h1 = hashmap_new(&oomd_cgroup_ctx_hash_ops)); +@@ -137,6 +158,15 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) { + assert_se(c2->last_pgscan == 5555); + assert_se(c2->mem_pressure_limit == 6789); + assert_se(c2->last_hit_mem_pressure_limit == 42); ++ ++ /* Assert that avoid/omit are not set if the cgroup is not owned by root */ ++ if (test_xattrs) { ++ ctx = oomd_cgroup_context_free(ctx); ++ assert_se(cg_set_access(SYSTEMD_CGROUP_CONTROLLER, cgroup, 65534, 65534) >= 0); ++ assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0); ++ assert_se(ctx->omit == false); ++ assert_se(ctx->avoid == false); ++ } + } + + static void test_oomd_system_context_acquire(void) { +@@ -287,9 +317,11 @@ static void test_oomd_sort_cgroups(void) { + char **paths = STRV_MAKE("/herp.slice", + "/herp.slice/derp.scope", + 
"/herp.slice/derp.scope/sheep.service", +- "/zupa.slice"); ++ "/zupa.slice", ++ "/omitted.slice", ++ "/avoid.slice"); + +- OomdCGroupContext ctx[4] = { ++ OomdCGroupContext ctx[6] = { + { .path = paths[0], + .swap_usage = 20, + .pgscan = 60, +@@ -306,6 +338,14 @@ static void test_oomd_sort_cgroups(void) { + .swap_usage = 10, + .pgscan = 80, + .current_memory_usage = 10 }, ++ { .path = paths[4], ++ .swap_usage = 90, ++ .pgscan = 100, ++ .omit = true }, ++ { .path = paths[5], ++ .swap_usage = 99, ++ .pgscan = 200, ++ .avoid = true }, + }; + + assert_se(h = hashmap_new(&string_hash_ops)); +@@ -314,19 +354,23 @@ static void test_oomd_sort_cgroups(void) { + assert_se(hashmap_put(h, "/herp.slice/derp.scope", &ctx[1]) >= 0); + assert_se(hashmap_put(h, "/herp.slice/derp.scope/sheep.service", &ctx[2]) >= 0); + assert_se(hashmap_put(h, "/zupa.slice", &ctx[3]) >= 0); ++ assert_se(hashmap_put(h, "/omitted.slice", &ctx[4]) >= 0); ++ assert_se(hashmap_put(h, "/avoid.slice", &ctx[5]) >= 0); + +- assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 4); ++ assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 5); + assert_se(sorted_cgroups[0] == &ctx[1]); + assert_se(sorted_cgroups[1] == &ctx[2]); + assert_se(sorted_cgroups[2] == &ctx[0]); + assert_se(sorted_cgroups[3] == &ctx[3]); ++ assert_se(sorted_cgroups[4] == &ctx[5]); + sorted_cgroups = mfree(sorted_cgroups); + +- assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, NULL, &sorted_cgroups) == 4); ++ assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, NULL, &sorted_cgroups) == 5); + assert_se(sorted_cgroups[0] == &ctx[3]); + assert_se(sorted_cgroups[1] == &ctx[0]); + assert_se(sorted_cgroups[2] == &ctx[2]); + assert_se(sorted_cgroups[3] == &ctx[1]); ++ assert_se(sorted_cgroups[4] == &ctx[5]); + sorted_cgroups = mfree(sorted_cgroups); + + assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, 
"/herp.slice/derp.scope", &sorted_cgroups) == 2); +@@ -334,6 +378,8 @@ static void test_oomd_sort_cgroups(void) { + assert_se(sorted_cgroups[1] == &ctx[1]); + assert_se(sorted_cgroups[2] == 0); + assert_se(sorted_cgroups[3] == 0); ++ assert_se(sorted_cgroups[4] == 0); ++ assert_se(sorted_cgroups[5] == 0); + sorted_cgroups = mfree(sorted_cgroups); + } + +diff --git a/test/test-functions b/test/test-functions +index df6022982c2..6996cd74752 100644 +--- a/test/test-functions ++++ b/test/test-functions +@@ -124,6 +124,7 @@ BASICTOOLS=( + rmdir + sed + seq ++ setfattr + setfont + setsid + sfdisk +diff --git a/test/units/testsuite-56-testmunch.service b/test/units/testsuite-56-testmunch.service +new file mode 100644 +index 00000000000..b4b925a7af0 +--- /dev/null ++++ b/test/units/testsuite-56-testmunch.service +@@ -0,0 +1,7 @@ ++[Unit] ++Description=Create some memory pressure ++ ++[Service] ++MemoryHigh=2M ++Slice=testsuite-56-workload.slice ++ExecStart=/usr/lib/systemd/tests/testdata/units/testsuite-56-slowgrowth.sh +diff --git a/test/units/testsuite-56.sh b/test/units/testsuite-56.sh +index 8b01fe37ed4..88c185b8869 100755 +--- a/test/units/testsuite-56.sh ++++ b/test/units/testsuite-56.sh +@@ -23,20 +23,43 @@ oomctl | grep "/testsuite-56-workload.slice" + oomctl | grep "1.00%" + oomctl | grep "Default Memory Pressure Duration: 5s" + +-# systemd-oomd watches for elevated pressure for 30 seconds before acting. +-# It can take time to build up pressure so either wait 5 minutes or for the service to fail. +-timeout=$(date -ud "5 minutes" +%s) ++# systemd-oomd watches for elevated pressure for 5 seconds before acting. ++# It can take time to build up pressure so either wait 2 minutes or for the service to fail. ++timeout=$(date -ud "2 minutes" +%s) + while [[ $(date -u +%s) -le $timeout ]]; do + if ! 
systemctl status testsuite-56-testbloat.service; then + break + fi +- sleep 15 ++ sleep 5 + done + + # testbloat should be killed and testchill should be fine + if systemctl status testsuite-56-testbloat.service; then exit 42; fi + if ! systemctl status testsuite-56-testchill.service; then exit 24; fi + ++# only run this portion of the test if we can set xattrs ++if setfattr -n user.xattr_test -v 1 /sys/fs/cgroup/; then ++ sleep 120 # wait for systemd-oomd kill cool down and elevated memory pressure to come down ++ ++ systemctl start testsuite-56-testchill.service ++ systemctl start testsuite-56-testmunch.service ++ systemctl start testsuite-56-testbloat.service ++ setfattr -n user.oomd_avoid -v 1 /sys/fs/cgroup/testsuite.slice/testsuite-56.slice/testsuite-56-workload.slice/testsuite-56-testbloat.service ++ ++ timeout=$(date -ud "2 minutes" +%s) ++ while [[ $(date -u +%s) -le $timeout ]]; do ++ if ! systemctl status testsuite-56-testmunch.service; then ++ break ++ fi ++ sleep 5 ++ done ++ ++ # testmunch should be killed since testbloat had the avoid xattr on it ++ if ! systemctl status testsuite-56-testbloat.service; then exit 25; fi ++ if systemctl status testsuite-56-testmunch.service; then exit 43; fi ++ if ! 
systemctl status testsuite-56-testchill.service; then exit 24; fi ++fi ++ + systemd-analyze log-level info + + echo OK > /testok + +From d87ecfecdb6fb77097f843888e2a05945b6b396b Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Thu, 28 Jan 2021 02:31:44 -0800 +Subject: [PATCH 6/7] oom: add unit file settings for oomd avoid/omit xattrs + +--- + docs/TRANSIENT-SETTINGS.md | 1 + + src/core/cgroup.c | 58 ++++++++++++++++++--- + src/core/cgroup.h | 15 ++++++ + src/core/dbus-cgroup.c | 22 ++++++++ + src/core/execute.c | 4 ++ + src/core/load-fragment-gperf.gperf.m4 | 1 + + src/core/load-fragment.c | 1 + + src/core/load-fragment.h | 1 + + src/shared/bus-unit-util.c | 3 +- + src/test/test-tables.c | 1 + + test/fuzz/fuzz-unit-file/directives.service | 4 ++ + test/units/testsuite-56.sh | 8 ++- + 12 files changed, 109 insertions(+), 10 deletions(-) + +diff --git a/docs/TRANSIENT-SETTINGS.md b/docs/TRANSIENT-SETTINGS.md +index 50370602543..9f69a3162a0 100644 +--- a/docs/TRANSIENT-SETTINGS.md ++++ b/docs/TRANSIENT-SETTINGS.md +@@ -273,6 +273,7 @@ All cgroup/resource control settings are available for transient units + ✓ ManagedOOMSwap= + ✓ ManagedOOMMemoryPressure= + ✓ ManagedOOMMemoryPressureLimit= ++✓ ManagedOOMPreference= + ``` + + ## Process Killing Settings +diff --git a/src/core/cgroup.c b/src/core/cgroup.c +index 70282a7abda..833b434b555 100644 +--- a/src/core/cgroup.c ++++ b/src/core/cgroup.c +@@ -131,6 +131,7 @@ void cgroup_context_init(CGroupContext *c) { + + .moom_swap = MANAGED_OOM_AUTO, + .moom_mem_pressure = MANAGED_OOM_AUTO, ++ .moom_preference = MANAGED_OOM_PREFERENCE_NONE, + }; + } + +@@ -417,7 +418,8 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { + "%sDelegate: %s\n" + "%sManagedOOMSwap: %s\n" + "%sManagedOOMMemoryPressure: %s\n" +- "%sManagedOOMMemoryPressureLimit: %" PRIu32 ".%02" PRIu32 "%%\n", ++ "%sManagedOOMMemoryPressureLimit: %" PRIu32 ".%02" PRIu32 "%%\n" ++ "%sManagedOOMPreference: %s%%\n", + prefix, yes_no(c->cpu_accounting), + 
prefix, yes_no(c->io_accounting), + prefix, yes_no(c->blockio_accounting), +@@ -450,7 +452,8 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { + prefix, yes_no(c->delegate), + prefix, managed_oom_mode_to_string(c->moom_swap), + prefix, managed_oom_mode_to_string(c->moom_mem_pressure), +- prefix, c->moom_mem_pressure_limit_permyriad / 100, c->moom_mem_pressure_limit_permyriad % 100); ++ prefix, c->moom_mem_pressure_limit_permyriad / 100, c->moom_mem_pressure_limit_permyriad % 100, ++ prefix, managed_oom_preference_to_string(c->moom_preference)); + + if (c->delegate) { + _cleanup_free_ char *t = NULL; +@@ -600,6 +603,35 @@ int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode) + UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low); + UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min); + ++void cgroup_oomd_xattr_apply(Unit *u, const char *cgroup_path) { ++ CGroupContext *c; ++ int r; ++ ++ assert(u); ++ ++ c = unit_get_cgroup_context(u); ++ if (!c) ++ return; ++ ++ r = cg_remove_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_avoid"); ++ if (r != -ENODATA) ++ log_unit_debug_errno(u, r, "Failed to remove oomd_avoid flag on control group %s, ignoring: %m", cgroup_path); ++ ++ r = cg_remove_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_omit"); ++ if (r != -ENODATA) ++ log_unit_debug_errno(u, r, "Failed to remove oomd_omit flag on control group %s, ignoring: %m", cgroup_path); ++ ++ if (c->moom_preference == MANAGED_OOM_PREFERENCE_AVOID) { ++ r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_avoid", "1", 1, 0); ++ if (r < 0) ++ log_unit_debug_errno(u, r, "Failed to set oomd_avoid flag on control group %s, ignoring: %m", cgroup_path); ++ } else if (c->moom_preference == MANAGED_OOM_PREFERENCE_OMIT) { ++ r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_omit", "1", 1, 0); ++ if (r < 0) ++ log_unit_debug_errno(u, r, "Failed to set oomd_omit flag on control group %s, ignoring: %m", 
cgroup_path); ++ } ++} ++ + static void cgroup_xattr_apply(Unit *u) { + char ids[SD_ID128_STRING_MAX]; + int r; +@@ -630,6 +662,8 @@ static void cgroup_xattr_apply(Unit *u) { + if (r != -ENODATA) + log_unit_debug_errno(u, r, "Failed to remove delegate flag on control group %s, ignoring: %m", u->cgroup_path); + } ++ ++ cgroup_oomd_xattr_apply(u, u->cgroup_path); + } + + static int lookup_block_device(const char *p, dev_t *ret) { +@@ -3737,12 +3771,6 @@ int unit_cgroup_freezer_action(Unit *u, FreezerAction action) { + return 1; + } + +-static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = { +- [CGROUP_DEVICE_POLICY_AUTO] = "auto", +- [CGROUP_DEVICE_POLICY_CLOSED] = "closed", +- [CGROUP_DEVICE_POLICY_STRICT] = "strict", +-}; +- + int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) { + _cleanup_free_ char *v = NULL; + int r; +@@ -3771,6 +3799,12 @@ int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) { + return parse_cpu_set_full(v, cpus, false, NULL, NULL, 0, NULL); + } + ++static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = { ++ [CGROUP_DEVICE_POLICY_AUTO] = "auto", ++ [CGROUP_DEVICE_POLICY_CLOSED] = "closed", ++ [CGROUP_DEVICE_POLICY_STRICT] = "strict", ++}; ++ + DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy); + + static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = { +@@ -3779,3 +3813,11 @@ static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = { + }; + + DEFINE_STRING_TABLE_LOOKUP(freezer_action, FreezerAction); ++ ++static const char* const managed_oom_preference_table[_MANAGED_OOM_PREFERENCE_MAX] = { ++ [MANAGED_OOM_PREFERENCE_NONE] = "none", ++ [MANAGED_OOM_PREFERENCE_AVOID] = "avoid", ++ [MANAGED_OOM_PREFERENCE_OMIT] = "omit", ++}; ++ ++DEFINE_STRING_TABLE_LOOKUP(managed_oom_preference, ManagedOOMPreference); +diff --git a/src/core/cgroup.h b/src/core/cgroup.h +index 9fbfabbb7e3..7d9ab4ae6b8 100644 +--- a/src/core/cgroup.h ++++ 
b/src/core/cgroup.h +@@ -94,6 +94,15 @@ struct CGroupBlockIODeviceBandwidth { + uint64_t wbps; + }; + ++typedef enum ManagedOOMPreference { ++ MANAGED_OOM_PREFERENCE_NONE, ++ MANAGED_OOM_PREFERENCE_AVOID, ++ MANAGED_OOM_PREFERENCE_OMIT, ++ ++ _MANAGED_OOM_PREFERENCE_MAX, ++ _MANAGED_OOM_PREFERENCE_INVALID = -1 ++} ManagedOOMPreference; ++ + struct CGroupContext { + bool cpu_accounting; + bool io_accounting; +@@ -164,6 +173,7 @@ struct CGroupContext { + ManagedOOMMode moom_swap; + ManagedOOMMode moom_mem_pressure; + uint32_t moom_mem_pressure_limit_permyriad; ++ ManagedOOMPreference moom_preference; + }; + + /* Used when querying IP accounting data */ +@@ -204,6 +214,8 @@ void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockI + + int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode); + ++void cgroup_oomd_xattr_apply(Unit *u, const char *cgroup_path); ++ + CGroupMask unit_get_own_mask(Unit *u); + CGroupMask unit_get_delegate_mask(Unit *u); + CGroupMask unit_get_members_mask(Unit *u); +@@ -294,3 +306,6 @@ int unit_cgroup_freezer_action(Unit *u, FreezerAction action); + + const char* freezer_action_to_string(FreezerAction a) _const_; + FreezerAction freezer_action_from_string(const char *s) _pure_; ++ ++const char* managed_oom_preference_to_string(ManagedOOMPreference a) _const_; ++ManagedOOMPreference managed_oom_preference_from_string(const char *s) _pure_; +diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c +index 6f309feb236..0b2d945283e 100644 +--- a/src/core/dbus-cgroup.c ++++ b/src/core/dbus-cgroup.c +@@ -21,6 +21,7 @@ BUS_DEFINE_PROPERTY_GET(bus_property_get_tasks_max, "t", TasksMax, tasks_max_res + + static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy); + static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_mode, managed_oom_mode, ManagedOOMMode); ++static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_preference, 
managed_oom_preference, ManagedOOMPreference); + + static int property_get_cgroup_mask( + sd_bus *bus, +@@ -395,6 +396,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = { + SD_BUS_PROPERTY("ManagedOOMSwap", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_swap), 0), + SD_BUS_PROPERTY("ManagedOOMMemoryPressure", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_mem_pressure), 0), + SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimitPermyriad", "u", NULL, offsetof(CGroupContext, moom_mem_pressure_limit_permyriad), 0), ++ SD_BUS_PROPERTY("ManagedOOMPreference", "s", property_get_managed_oom_preference, offsetof(CGroupContext, moom_preference), 0), + SD_BUS_VTABLE_END + }; + +@@ -1720,6 +1722,26 @@ int bus_cgroup_set_property( + return 1; + } + ++ if (streq(name, "ManagedOOMPreference")) { ++ ManagedOOMPreference p; ++ const char *pref; ++ ++ r = sd_bus_message_read(message, "s", &pref); ++ if (r < 0) ++ return r; ++ ++ p = managed_oom_preference_from_string(pref); ++ if (p < 0) ++ return -EINVAL; ++ ++ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { ++ c->moom_preference = p; ++ unit_write_settingf(u, flags, name, "ManagedOOMPreference=%s", pref); ++ } ++ ++ return 1; ++ } ++ + if (streq(name, "DisableControllers") || (u->transient && u->load_state == UNIT_STUB)) + return bus_cgroup_set_transient_property(u, c, name, message, flags, error); + +diff --git a/src/core/execute.c b/src/core/execute.c +index b7d78f2197e..0368582884c 100644 +--- a/src/core/execute.c ++++ b/src/core/execute.c +@@ -4701,6 +4701,10 @@ int exec_spawn(Unit *unit, + r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path); ++ ++ /* Normally we would not propagate the oomd xattrs to children but since we created this ++ * sub-cgroup interally we should do it. 
*/ ++ cgroup_oomd_xattr_apply(unit, subcgroup_path); + } + } + +diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 +index 81f4561a572..dbcbe645934 100644 +--- a/src/core/load-fragment-gperf.gperf.m4 ++++ b/src/core/load-fragment-gperf.gperf.m4 +@@ -230,6 +230,7 @@ $1.IPEgressFilterPath, config_parse_ip_filter_bpf_progs, + $1.ManagedOOMSwap, config_parse_managed_oom_mode, 0, offsetof($1, cgroup_context.moom_swap) + $1.ManagedOOMMemoryPressure, config_parse_managed_oom_mode, 0, offsetof($1, cgroup_context.moom_mem_pressure) + $1.ManagedOOMMemoryPressureLimit, config_parse_managed_oom_mem_pressure_limit, 0, offsetof($1, cgroup_context.moom_mem_pressure_limit_permyriad) ++$1.ManagedOOMPreference, config_parse_managed_oom_preference, 0, offsetof($1, cgroup_context.moom_preference) + $1.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0' + )m4_dnl + Unit.Description, config_parse_unit_string_printf, 0, offsetof(Unit, description) +diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c +index 06b71aaf157..c6b017556f9 100644 +--- a/src/core/load-fragment.c ++++ b/src/core/load-fragment.c +@@ -133,6 +133,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_service_restart, service_restart, ServiceR + DEFINE_CONFIG_PARSE_ENUM(config_parse_service_timeout_failure_mode, service_timeout_failure_mode, ServiceTimeoutFailureMode, "Failed to parse timeout failure mode"); + DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_bind, socket_address_bind_ipv6_only_or_bool, SocketAddressBindIPv6Only, "Failed to parse bind IPv6 only value"); + DEFINE_CONFIG_PARSE_ENUM(config_parse_oom_policy, oom_policy, OOMPolicy, "Failed to parse OOM policy"); ++DEFINE_CONFIG_PARSE_ENUM(config_parse_managed_oom_preference, managed_oom_preference, ManagedOOMPreference, "Failed to parse ManagedOOMPreference="); + DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_ip_tos, ip_tos, int, -1, "Failed to parse IP TOS value"); + 
DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint64_t, "Invalid block IO weight"); + DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight"); +diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h +index 6b2175cd2af..e4a5cb79869 100644 +--- a/src/core/load-fragment.h ++++ b/src/core/load-fragment.h +@@ -78,6 +78,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max); + CONFIG_PARSER_PROTOTYPE(config_parse_delegate); + CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode); + CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit); ++CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_preference); + CONFIG_PARSER_PROTOTYPE(config_parse_device_policy); + CONFIG_PARSER_PROTOTYPE(config_parse_device_allow); + CONFIG_PARSER_PROTOTYPE(config_parse_io_device_latency); +diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c +index 84f57d94d23..5bbaa07dd1c 100644 +--- a/src/shared/bus-unit-util.c ++++ b/src/shared/bus-unit-util.c +@@ -435,7 +435,8 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons + if (STR_IN_SET(field, "DevicePolicy", + "Slice", + "ManagedOOMSwap", +- "ManagedOOMMemoryPressure")) ++ "ManagedOOMMemoryPressure", ++ "ManagedOOMPreference")) + return bus_append_string(m, field, eq); + + if (STR_IN_SET(field, "ManagedOOMMemoryPressureLimit")) { +diff --git a/src/test/test-tables.c b/src/test/test-tables.c +index 641cadec858..cc93bbbc749 100644 +--- a/src/test/test-tables.c ++++ b/src/test/test-tables.c +@@ -73,6 +73,7 @@ int main(int argc, char **argv) { + test_table(log_target, LOG_TARGET); + test_table(mac_address_policy, MAC_ADDRESS_POLICY); + test_table(managed_oom_mode, MANAGED_OOM_MODE); ++ test_table(managed_oom_preference, MANAGED_OOM_PREFERENCE); + test_table(manager_state, MANAGER_STATE); + test_table(manager_timestamp, MANAGER_TIMESTAMP); + test_table(mount_exec_command, MOUNT_EXEC_COMMAND); +diff --git 
a/test/fuzz/fuzz-unit-file/directives.service b/test/fuzz/fuzz-unit-file/directives.service +index 15fa556dd64..0c7ded6786a 100644 +--- a/test/fuzz/fuzz-unit-file/directives.service ++++ b/test/fuzz/fuzz-unit-file/directives.service +@@ -138,6 +138,10 @@ MakeDirectory= + Mark= + MaxConnections= + MaxConnectionsPerSource= ++ManagedOOMSwap= ++ManagedOOMMemoryPressure= ++ManagedOOMMemoryPressureLimitPercent= ++ManagedOOMPreference= + MemoryAccounting= + MemoryHigh= + MemoryLimit= +diff --git a/test/units/testsuite-56.sh b/test/units/testsuite-56.sh +index 88c185b8869..1884f814689 100755 +--- a/test/units/testsuite-56.sh ++++ b/test/units/testsuite-56.sh +@@ -13,6 +13,8 @@ if [[ "$cgroup_type" != *"cgroup2"* ]] && [[ "$cgroup_type" != *"0x63677270"* ]] + fi + [[ -e /skipped ]] && exit 0 || true + ++rm -rf /etc/systemd/system/testsuite-56-testbloat.service.d ++ + echo "DefaultMemoryPressureDurationSec=5s" >> /etc/systemd/oomd.conf + + systemctl start testsuite-56-testchill.service +@@ -41,10 +43,14 @@ if ! 
systemctl status testsuite-56-testchill.service; then exit 24; fi + if setfattr -n user.xattr_test -v 1 /sys/fs/cgroup/; then + sleep 120 # wait for systemd-oomd kill cool down and elevated memory pressure to come down + ++ mkdir -p /etc/systemd/system/testsuite-56-testbloat.service.d/ ++ echo "[Service]" > /etc/systemd/system/testsuite-56-testbloat.service.d/override.conf ++ echo "ManagedOOMPreference=avoid" >> /etc/systemd/system/testsuite-56-testbloat.service.d/override.conf ++ ++ systemctl daemon-reload + systemctl start testsuite-56-testchill.service + systemctl start testsuite-56-testmunch.service + systemctl start testsuite-56-testbloat.service +- setfattr -n user.oomd_avoid -v 1 /sys/fs/cgroup/testsuite.slice/testsuite-56.slice/testsuite-56-workload.slice/testsuite-56-testbloat.service + + timeout=$(date -ud "2 minutes" +%s) + while [[ $(date -u +%s) -le $timeout ]]; do + +From 32d695eccfeef00023992cdf20bf39f9d0288c67 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Thu, 28 Jan 2021 17:35:17 -0800 +Subject: [PATCH 7/7] man: document ManagedOOMPreference= + +--- + man/org.freedesktop.systemd1.xml | 36 ++++++++++++++++++++++++++++++++ + man/systemd.resource-control.xml | 32 ++++++++++++++++++++++++++++ + 2 files changed, 68 insertions(+) + +diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml +index 7543a617b78..1d419ac495e 100644 +--- a/man/org.freedesktop.systemd1.xml ++++ b/man/org.freedesktop.systemd1.xml +@@ -2450,6 +2450,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; ++ @org.freedesktop.DBus.Property.EmitsChangedSignal("false") ++ readonly s ManagedOOMPreference = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly as Environment = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ 
-2974,6 +2976,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + ++ ++ + + + +@@ -3538,6 +3542,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + ++ ++ + + + +@@ -4204,6 +4210,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; ++ @org.freedesktop.DBus.Property.EmitsChangedSignal("false") ++ readonly s ManagedOOMPreference = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly as Environment = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -4756,6 +4764,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + ++ ++ + + + +@@ -5318,6 +5328,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + ++ ++ + + + +@@ -5897,6 +5909,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; ++ @org.freedesktop.DBus.Property.EmitsChangedSignal("false") ++ readonly s ManagedOOMPreference = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly as Environment = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -6377,6 +6391,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + ++ ++ + + + +@@ -6857,6 +6873,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + ++ ++ + + + +@@ -7557,6 +7575,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; ++ @org.freedesktop.DBus.Property.EmitsChangedSignal("false") ++ readonly s ManagedOOMPreference = '...'; + 
@org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly as Environment = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -8023,6 +8043,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + ++ ++ + + + +@@ -8489,6 +8511,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + ++ ++ + + + +@@ -9042,6 +9066,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; ++ @org.freedesktop.DBus.Property.EmitsChangedSignal("false") ++ readonly s ManagedOOMPreference = '...'; + }; + interface org.freedesktop.DBus.Peer { ... }; + interface org.freedesktop.DBus.Introspectable { ... }; +@@ -9178,6 +9204,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + ++ ++ + + + +@@ -9318,6 +9346,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + ++ ++ + + + +@@ -9477,6 +9507,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; ++ @org.freedesktop.DBus.Property.EmitsChangedSignal("false") ++ readonly s ManagedOOMPreference = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s KillMode = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -9629,6 +9661,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + ++ ++ + + + +@@ -9795,6 +9829,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + ++ ++ + + + +diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml +index be9c35057db..13ff7e9a740 100644 +--- a/man/systemd.resource-control.xml ++++ b/man/systemd.resource-control.xml +@@ -913,6 +913,38 @@ DeviceAllow=/dev/loop-control + + + ++ ++ ++ 
ManagedOOMPreference=none|avoid|omit ++ ++ ++ Allows deprioritizing or omitting this unit's cgroup as a candidate when systemd-oomd ++ needs to act. Requires support for extended attributes (see ++ xattr(7)) ++ in order to use avoid or omit. Additionally, systemd-oomd ++ will ignore these extended attributes if the unit's cgroup is not owned by the root user and group. ++ ++ If this property is set to avoid, the service manager will set the ++ "user.oomd_avoid" extended attribute on the unit's cgroup to "1". If systemd-oomd sees ++ this extended attribute on a cgroup set to "1" when choosing between candidates, it will only select the ++ cgroup with "user.oomd_avoid" if there are no other viable candidates. ++ ++ If this property is set to omit, the service manager will set the "user.oomd_omit" ++ extended attribute on the unit's cgroup to "1". If systemd-oomd sees this extended ++ attribute on the cgroup set to "1", it will ignore the cgroup as a candidate and will not perform any actions ++ on the cgroup. ++ ++ It is recommended to use avoid and omit sparingly as it can adversely ++ affect systemd-oomd's kill behavior. Also note that these extended attributes are not ++ applied recursively to cgroups under this unit's cgroup. ++ ++ Defaults to none which means no extended attributes will be set and systemd-oomd will ++ sort this unit's cgroup as defined in ++ systemd-oomd.service(8) ++ and oomd.conf(5) (if this ++ unit's cgroup becomes a candidate). 
++ ++ + + + diff --git a/18444.patch b/18444.patch new file mode 100644 index 0000000..7b1b066 --- /dev/null +++ b/18444.patch @@ -0,0 +1,987 @@ +From a9b1927c15fce3c9945ac249d8e8ddc42028a057 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Tue, 2 Feb 2021 01:47:08 -0800 +Subject: [PATCH 1/2] parse-util: add permyriad parsing + +--- + src/basic/parse-util.c | 137 ++++++++++++++++++++++++++----------- + src/basic/parse-util.h | 3 + + src/test/test-parse-util.c | 68 ++++++++++++++++++ + 3 files changed, 169 insertions(+), 39 deletions(-) + +diff --git a/src/basic/parse-util.c b/src/basic/parse-util.c +index 5d4dafe3a5..a0fb2c9d17 100644 +--- a/src/basic/parse-util.c ++++ b/src/basic/parse-util.c +@@ -671,11 +671,11 @@ int parse_fractional_part_u(const char **p, size_t digits, unsigned *res) { + return 0; + } + +-int parse_percent_unbounded(const char *p) { ++static int parse_parts_value_whole(const char *p, const char *symbol) { + const char *pc, *n; + int r, v; + +- pc = endswith(p, "%"); ++ pc = endswith(p, symbol); + if (!pc) + return -EINVAL; + +@@ -689,6 +689,74 @@ int parse_percent_unbounded(const char *p) { + return v; + } + ++static int parse_parts_value_with_tenths_place(const char *p, const char *symbol) { ++ const char *pc, *dot, *n; ++ int r, q, v; ++ ++ pc = endswith(p, symbol); ++ if (!pc) ++ return -EINVAL; ++ ++ dot = memchr(p, '.', pc - p); ++ if (dot) { ++ if (dot + 2 != pc) ++ return -EINVAL; ++ if (dot[1] < '0' || dot[1] > '9') ++ return -EINVAL; ++ q = dot[1] - '0'; ++ n = strndupa(p, dot - p); ++ } else { ++ q = 0; ++ n = strndupa(p, pc - p); ++ } ++ r = safe_atoi(n, &v); ++ if (r < 0) ++ return r; ++ if (v < 0) ++ return -ERANGE; ++ if (v > (INT_MAX - q) / 10) ++ return -ERANGE; ++ ++ v = v * 10 + q; ++ return v; ++} ++ ++static int parse_parts_value_with_hundredths_place(const char *p, const char *symbol) { ++ const char *pc, *dot, *n; ++ int r, q, v; ++ ++ pc = endswith(p, symbol); ++ if (!pc) ++ return -EINVAL; ++ ++ dot = memchr(p, '.', 
pc - p); ++ if (dot) { ++ if (dot + 3 != pc) ++ return -EINVAL; ++ if (dot[1] < '0' || dot[1] > '9' || dot[2] < '0' || dot[2] > '9') ++ return -EINVAL; ++ q = (dot[1] - '0') * 10 + (dot[2] - '0'); ++ n = strndupa(p, dot - p); ++ } else { ++ q = 0; ++ n = strndupa(p, pc - p); ++ } ++ r = safe_atoi(n, &v); ++ if (r < 0) ++ return r; ++ if (v < 0) ++ return -ERANGE; ++ if (v > (INT_MAX - q) / 100) ++ return -ERANGE; ++ ++ v = v * 100 + q; ++ return v; ++} ++ ++int parse_percent_unbounded(const char *p) { ++ return parse_parts_value_whole(p, "%"); ++} ++ + int parse_percent(const char *p) { + int v; + +@@ -700,46 +768,13 @@ int parse_percent(const char *p) { + } + + int parse_permille_unbounded(const char *p) { +- const char *pc, *pm, *dot, *n; +- int r, q, v; ++ const char *pm; + + pm = endswith(p, "‰"); +- if (pm) { +- n = strndupa(p, pm - p); +- r = safe_atoi(n, &v); +- if (r < 0) +- return r; +- if (v < 0) +- return -ERANGE; +- } else { +- pc = endswith(p, "%"); +- if (!pc) +- return -EINVAL; +- +- dot = memchr(p, '.', pc - p); +- if (dot) { +- if (dot + 2 != pc) +- return -EINVAL; +- if (dot[1] < '0' || dot[1] > '9') +- return -EINVAL; +- q = dot[1] - '0'; +- n = strndupa(p, dot - p); +- } else { +- q = 0; +- n = strndupa(p, pc - p); +- } +- r = safe_atoi(n, &v); +- if (r < 0) +- return r; +- if (v < 0) +- return -ERANGE; +- if (v > (INT_MAX - q) / 10) +- return -ERANGE; ++ if (pm) ++ return parse_parts_value_whole(p, "‰"); + +- v = v * 10 + q; +- } +- +- return v; ++ return parse_parts_value_with_tenths_place(p, "%"); + } + + int parse_permille(const char *p) { +@@ -752,6 +787,30 @@ int parse_permille(const char *p) { + return v; + } + ++int parse_permyriad_unbounded(const char *p) { ++ const char *pm; ++ ++ pm = endswith(p, "‱"); ++ if (pm) ++ return parse_parts_value_whole(p, "‱"); ++ ++ pm = endswith(p, "‰"); ++ if (pm) ++ return parse_parts_value_with_tenths_place(p, "‰"); ++ ++ return parse_parts_value_with_hundredths_place(p, "%"); ++} ++ ++int 
parse_permyriad(const char *p) { ++ int v; ++ ++ v = parse_permyriad_unbounded(p); ++ if (v > 10000) ++ return -ERANGE; ++ ++ return v; ++} ++ + int parse_nice(const char *p, int *ret) { + int n, r; + +diff --git a/src/basic/parse-util.h b/src/basic/parse-util.h +index 81478ed059..3e29291f26 100644 +--- a/src/basic/parse-util.h ++++ b/src/basic/parse-util.h +@@ -136,6 +136,9 @@ int parse_percent(const char *p); + int parse_permille_unbounded(const char *p); + int parse_permille(const char *p); + ++int parse_permyriad_unbounded(const char *p); ++int parse_permyriad(const char *p); ++ + int parse_nice(const char *p, int *ret); + + int parse_ip_port(const char *s, uint16_t *ret); +diff --git a/src/test/test-parse-util.c b/src/test/test-parse-util.c +index 1c969091ef..6e23efe134 100644 +--- a/src/test/test-parse-util.c ++++ b/src/test/test-parse-util.c +@@ -790,6 +790,72 @@ static void test_parse_permille_unbounded(void) { + assert_se(parse_permille_unbounded("429496729.6%") == -ERANGE); + } + ++static void test_parse_permyriad(void) { ++ assert_se(parse_permyriad("") == -EINVAL); ++ assert_se(parse_permyriad("foo") == -EINVAL); ++ assert_se(parse_permyriad("0") == -EINVAL); ++ assert_se(parse_permyriad("50") == -EINVAL); ++ assert_se(parse_permyriad("100") == -EINVAL); ++ assert_se(parse_permyriad("-1") == -EINVAL); ++ ++ assert_se(parse_permyriad("0‱") == 0); ++ assert_se(parse_permyriad("555‱") == 555); ++ assert_se(parse_permyriad("1000‱") == 1000); ++ assert_se(parse_permyriad("-7‱") == -ERANGE); ++ assert_se(parse_permyriad("10007‱") == -ERANGE); ++ assert_se(parse_permyriad("‱") == -EINVAL); ++ assert_se(parse_permyriad("‱‱") == -EINVAL); ++ assert_se(parse_permyriad("‱1") == -EINVAL); ++ assert_se(parse_permyriad("1‱‱") == -EINVAL); ++ assert_se(parse_permyriad("3.2‱") == -EINVAL); ++ ++ assert_se(parse_permyriad("0‰") == 0); ++ assert_se(parse_permyriad("555.5‰") == 5555); ++ assert_se(parse_permyriad("1000.0‰") == 10000); ++ assert_se(parse_permyriad("-7‰") 
== -ERANGE); ++ assert_se(parse_permyriad("1007‰") == -ERANGE); ++ assert_se(parse_permyriad("‰") == -EINVAL); ++ assert_se(parse_permyriad("‰‰") == -EINVAL); ++ assert_se(parse_permyriad("‰1") == -EINVAL); ++ assert_se(parse_permyriad("1‰‰") == -EINVAL); ++ assert_se(parse_permyriad("3.22‰") == -EINVAL); ++ ++ assert_se(parse_permyriad("0%") == 0); ++ assert_se(parse_permyriad("55%") == 5500); ++ assert_se(parse_permyriad("55.53%") == 5553); ++ assert_se(parse_permyriad("100%") == 10000); ++ assert_se(parse_permyriad("-7%") == -ERANGE); ++ assert_se(parse_permyriad("107%") == -ERANGE); ++ assert_se(parse_permyriad("%") == -EINVAL); ++ assert_se(parse_permyriad("%%") == -EINVAL); ++ assert_se(parse_permyriad("%1") == -EINVAL); ++ assert_se(parse_permyriad("1%%") == -EINVAL); ++ assert_se(parse_permyriad("3.212%") == -EINVAL); ++} ++ ++static void test_parse_permyriad_unbounded(void) { ++ assert_se(parse_permyriad_unbounded("1001‱") == 1001); ++ assert_se(parse_permyriad_unbounded("4000‱") == 4000); ++ assert_se(parse_permyriad_unbounded("2147483647‱") == 2147483647); ++ assert_se(parse_permyriad_unbounded("2147483648‱") == -ERANGE); ++ assert_se(parse_permyriad_unbounded("4294967295‱") == -ERANGE); ++ assert_se(parse_permyriad_unbounded("4294967296‱") == -ERANGE); ++ ++ assert_se(parse_permyriad_unbounded("101‰") == 1010); ++ assert_se(parse_permyriad_unbounded("400‰") == 4000); ++ assert_se(parse_permyriad_unbounded("214748364.7‰") == 2147483647); ++ assert_se(parse_permyriad_unbounded("214748364.8‰") == -ERANGE); ++ assert_se(parse_permyriad_unbounded("429496729.5‰") == -ERANGE); ++ assert_se(parse_permyriad_unbounded("429496729.6‰") == -ERANGE); ++ ++ assert_se(parse_permyriad_unbounded("99%") == 9900); ++ assert_se(parse_permyriad_unbounded("40%") == 4000); ++ assert_se(parse_permyriad_unbounded("21474836.47%") == 2147483647); ++ assert_se(parse_permyriad_unbounded("21474836.48%") == -ERANGE); ++ assert_se(parse_permyriad_unbounded("42949672.95%") == -ERANGE); 
++ assert_se(parse_permyriad_unbounded("42949672.96%") == -ERANGE); ++} ++ + static void test_parse_nice(void) { + int n; + +@@ -987,6 +1053,8 @@ int main(int argc, char *argv[]) { + test_parse_percent_unbounded(); + test_parse_permille(); + test_parse_permille_unbounded(); ++ test_parse_permyriad(); ++ test_parse_permyriad_unbounded(); + test_parse_nice(); + test_parse_dev(); + test_parse_errno(); +-- +2.29.2 + + +From 5fdc5d3384f81888704a0a19db3cb33bce2d8bdb Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Tue, 2 Feb 2021 14:16:03 -0800 +Subject: [PATCH 2/2] oom: rework *MemoryPressureLimit= properties to have + 1/10000 precision + +Requested in +https://github.com/systemd/systemd/pull/15206#discussion_r505506657, +preserve the full granularity for memory pressure limits (permyriad) +instead of capping out at percent. +--- + docs/TRANSIENT-SETTINGS.md | 2 +- + man/oomd.conf.xml | 6 ++--- + man/org.freedesktop.systemd1.xml | 36 +++++++++++++------------- + man/systemd.resource-control.xml | 2 +- + src/core/cgroup.c | 4 +-- + src/core/cgroup.h | 2 +- + src/core/core-varlink.c | 2 +- + src/core/dbus-cgroup.c | 16 +++++++++--- + src/core/dbus-util.c | 29 --------------------- + src/core/dbus-util.h | 1 - + src/core/load-fragment-gperf.gperf.m4 | 2 +- + src/core/load-fragment.c | 6 ++--- + src/oom/oomd-manager.c | 24 +++++++++++------ + src/oom/oomd-manager.h | 4 +-- + src/oom/oomd-util.c | 4 +-- + src/oom/oomd.c | 10 +++---- + src/oom/oomd.conf | 2 +- + src/shared/bus-get-properties.c | 17 ------------ + src/shared/bus-get-properties.h | 1 - + src/shared/bus-unit-util.c | 19 ++++++++++++-- + src/shared/conf-parser.c | 1 + + src/shared/conf-parser.h | 1 + + test/units/testsuite-56-workload.slice | 2 +- + test/units/testsuite-56.sh | 2 +- + 24 files changed, 91 insertions(+), 104 deletions(-) + +diff --git a/docs/TRANSIENT-SETTINGS.md b/docs/TRANSIENT-SETTINGS.md +index 50b9a42fa1..5037060254 100644 +--- a/docs/TRANSIENT-SETTINGS.md ++++ b/docs/TRANSIENT-SETTINGS.md 
+@@ -272,7 +272,7 @@ All cgroup/resource control settings are available for transient units + ✓ IPAddressDeny= + ✓ ManagedOOMSwap= + ✓ ManagedOOMMemoryPressure= +-✓ ManagedOOMMemoryPressureLimitPercent= ++✓ ManagedOOMMemoryPressureLimit= + ``` + + ## Process Killing Settings +diff --git a/man/oomd.conf.xml b/man/oomd.conf.xml +index bb5da87c54..2a12be8cad 100644 +--- a/man/oomd.conf.xml ++++ b/man/oomd.conf.xml +@@ -59,10 +59,10 @@ + + + +- DefaultMemoryPressureLimitPercent= ++ DefaultMemoryPressureLimit= + + Sets the limit for memory pressure on the unit's cgroup before systemd-oomd +- will take action. A unit can override this value with ManagedOOMMemoryPressureLimitPercent=. ++ will take action. A unit can override this value with ManagedOOMMemoryPressureLimit=. + The memory pressure for this property represents the fraction of time in a 10 second window in which all tasks + in the cgroup were delayed. For each monitored cgroup, if the memory pressure on that cgroup exceeds the + limit set for longer than the duration set by DefaultMemoryPressureDurationSec=, +@@ -78,7 +78,7 @@ + + Sets the amount of time a unit's cgroup needs to have exceeded memory pressure limits before + systemd-oomd will take action. Memory pressure limits are defined by +- DefaultMemoryPressureLimitPercent= and ManagedOOMMemoryPressureLimitPercent=. ++ DefaultMemoryPressureLimit= and ManagedOOMMemoryPressureLimit=. + Defaults to 30 seconds when this property is unset or set to 0. 
+ + +diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml +index 78fd0b3378..7809b65062 100644 +--- a/man/org.freedesktop.systemd1.xml ++++ b/man/org.freedesktop.systemd1.xml +@@ -2419,7 +2419,7 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") +- readonly s ManagedOOMMemoryPressureLimitPercent = '...'; ++ readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly as Environment = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -2938,7 +2938,7 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + +- ++ + + + +@@ -3494,7 +3494,7 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + +- ++ + + + +@@ -4146,7 +4146,7 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") +- readonly s ManagedOOMMemoryPressureLimitPercent = '...'; ++ readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly as Environment = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -4693,7 +4693,7 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + +- ++ + + + +@@ -5251,7 +5251,7 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + +- ++ + + + +@@ -5827,7 +5827,7 @@ node /org/freedesktop/systemd1/unit/home_2emount { + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") +- readonly s ManagedOOMMemoryPressureLimitPercent = '...'; ++ 
readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly as Environment = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -6302,7 +6302,7 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + +- ++ + + + +@@ -6778,7 +6778,7 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + +- ++ + + + +@@ -7475,7 +7475,7 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") +- readonly s ManagedOOMMemoryPressureLimitPercent = '...'; ++ readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly as Environment = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -7936,7 +7936,7 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + +- ++ + + + +@@ -8398,7 +8398,7 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + +- ++ + + + +@@ -8948,7 +8948,7 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") +- readonly s ManagedOOMMemoryPressureLimitPercent = '...'; ++ readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; + }; + interface org.freedesktop.DBus.Peer { ... }; + interface org.freedesktop.DBus.Introspectable { ... 
}; +@@ -9083,7 +9083,7 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + +- ++ + + + +@@ -9223,7 +9223,7 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + +- ++ + + + +@@ -9383,7 +9383,7 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") +- readonly s ManagedOOMMemoryPressureLimitPercent = '...'; ++ readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s KillMode = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -9534,7 +9534,7 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + +- ++ + + + +@@ -9700,7 +9700,7 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + +- ++ + + + +diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml +index 26dedda3fd..4381c4e1b7 100644 +--- a/man/systemd.resource-control.xml ++++ b/man/systemd.resource-control.xml +@@ -901,7 +901,7 @@ DeviceAllow=/dev/loop-control + + + +- ManagedOOMMemoryPressureLimitPercent= ++ ManagedOOMMemoryPressureLimit= + + + Overrides the default memory pressure limit set by +diff --git a/src/core/cgroup.c b/src/core/cgroup.c +index 7dc6c20bb7..e2ed0e546e 100644 +--- a/src/core/cgroup.c ++++ b/src/core/cgroup.c +@@ -417,7 +417,7 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { + "%sDelegate: %s\n" + "%sManagedOOMSwap: %s\n" + "%sManagedOOMMemoryPressure: %s\n" +- "%sManagedOOMMemoryPressureLimitPercent: %d%%\n", ++ "%sManagedOOMMemoryPressureLimit: %" PRIu32 ".%02" PRIu32 "%%\n", + prefix, yes_no(c->cpu_accounting), + prefix, yes_no(c->io_accounting), + prefix, yes_no(c->blockio_accounting), +@@ -450,7 +450,7 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { + prefix, yes_no(c->delegate), + prefix, 
managed_oom_mode_to_string(c->moom_swap), + prefix, managed_oom_mode_to_string(c->moom_mem_pressure), +- prefix, c->moom_mem_pressure_limit); ++ prefix, c->moom_mem_pressure_limit_permyriad / 100, c->moom_mem_pressure_limit_permyriad % 100); + + if (c->delegate) { + _cleanup_free_ char *t = NULL; +diff --git a/src/core/cgroup.h b/src/core/cgroup.h +index 66f3a63b82..9fbfabbb7e 100644 +--- a/src/core/cgroup.h ++++ b/src/core/cgroup.h +@@ -163,7 +163,7 @@ struct CGroupContext { + /* Settings for systemd-oomd */ + ManagedOOMMode moom_swap; + ManagedOOMMode moom_mem_pressure; +- int moom_mem_pressure_limit; ++ uint32_t moom_mem_pressure_limit_permyriad; + }; + + /* Used when querying IP accounting data */ +diff --git a/src/core/core-varlink.c b/src/core/core-varlink.c +index dd6c11ab4d..17fb9bc83f 100644 +--- a/src/core/core-varlink.c ++++ b/src/core/core-varlink.c +@@ -83,7 +83,7 @@ static int build_managed_oom_json_array_element(Unit *u, const char *property, J + JSON_BUILD_PAIR("mode", JSON_BUILD_STRING(mode)), + JSON_BUILD_PAIR("path", JSON_BUILD_STRING(u->cgroup_path)), + JSON_BUILD_PAIR("property", JSON_BUILD_STRING(property)), +- JSON_BUILD_PAIR_CONDITION(use_limit, "limit", JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit)))); ++ JSON_BUILD_PAIR_CONDITION(use_limit, "limit", JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit_permyriad)))); + } + + int manager_varlink_send_managed_oom_update(Unit *u) { +diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c +index 37c581fb22..df35ec114d 100644 +--- a/src/core/dbus-cgroup.c ++++ b/src/core/dbus-cgroup.c +@@ -395,7 +395,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = { + SD_BUS_PROPERTY("DisableControllers", "as", property_get_cgroup_mask, offsetof(CGroupContext, disable_controllers), 0), + SD_BUS_PROPERTY("ManagedOOMSwap", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_swap), 0), + SD_BUS_PROPERTY("ManagedOOMMemoryPressure", "s", property_get_managed_oom_mode, offsetof(CGroupContext, 
moom_mem_pressure), 0), +- SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimitPercent", "s", bus_property_get_percent, offsetof(CGroupContext, moom_mem_pressure_limit), 0), ++ SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimitPermyriad", "u", NULL, offsetof(CGroupContext, moom_mem_pressure_limit_permyriad), 0), + SD_BUS_VTABLE_END + }; + +@@ -1697,14 +1697,24 @@ int bus_cgroup_set_property( + return 1; + } + +- if (streq(name, "ManagedOOMMemoryPressureLimitPercent")) { ++ if (streq(name, "ManagedOOMMemoryPressureLimitPermyriad")) { ++ uint32_t v; ++ + if (!UNIT_VTABLE(u)->can_set_managed_oom) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name); + +- r = bus_set_transient_percent(u, name, &c->moom_mem_pressure_limit, message, flags, error); ++ r = sd_bus_message_read(message, "u", &v); + if (r < 0) + return r; + ++ if (v > 10000) ++ return -ERANGE; ++ ++ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { ++ c->moom_mem_pressure_limit_permyriad = v; ++ unit_write_settingf(u, flags, name, "ManagedOOMMemoryPressureLimit=%" PRIu32 ".%02" PRIu32 "%%", v / 100, v % 100); ++ } ++ + if (c->moom_mem_pressure == MANAGED_OOM_KILL) + (void) manager_varlink_send_managed_oom_update(u); + +diff --git a/src/core/dbus-util.c b/src/core/dbus-util.c +index d6223db305..eb03d30cf7 100644 +--- a/src/core/dbus-util.c ++++ b/src/core/dbus-util.c +@@ -91,35 +91,6 @@ int bus_set_transient_bool( + return 1; + } + +-int bus_set_transient_percent( +- Unit *u, +- const char *name, +- int *p, +- sd_bus_message *message, +- UnitWriteFlags flags, +- sd_bus_error *error) { +- +- const char *v; +- int r; +- +- assert(p); +- +- r = sd_bus_message_read(message, "s", &v); +- if (r < 0) +- return r; +- +- r = parse_percent(v); +- if (r < 0) +- return r; +- +- if (!UNIT_WRITE_FLAGS_NOOP(flags)) { +- *p = r; +- unit_write_settingf(u, flags, name, "%s=%d%%", name, r); +- } +- +- return 1; +-} +- + int bus_set_transient_usec_internal( + Unit *u, + const char *name, +diff --git 
a/src/core/dbus-util.h b/src/core/dbus-util.h +index 4e7c68e843..b68ec38ada 100644 +--- a/src/core/dbus-util.h ++++ b/src/core/dbus-util.h +@@ -240,7 +240,6 @@ int bus_set_transient_user_relaxed(Unit *u, const char *name, char **p, sd_bus_m + int bus_set_transient_path(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); + int bus_set_transient_string(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); + int bus_set_transient_bool(Unit *u, const char *name, bool *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +-int bus_set_transient_percent(Unit *u, const char *name, int *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); + int bus_set_transient_usec_internal(Unit *u, const char *name, usec_t *p, bool fix_0, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); + static inline int bus_set_transient_usec(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) { + return bus_set_transient_usec_internal(u, name, p, false, message, flags, error); +diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 +index 946862c398..db2a4e28a8 100644 +--- a/src/core/load-fragment-gperf.gperf.m4 ++++ b/src/core/load-fragment-gperf.gperf.m4 +@@ -226,7 +226,7 @@ $1.IPIngressFilterPath, config_parse_ip_filter_bpf_progs, + $1.IPEgressFilterPath, config_parse_ip_filter_bpf_progs, 0, offsetof($1, cgroup_context.ip_filters_egress) + $1.ManagedOOMSwap, config_parse_managed_oom_mode, 0, offsetof($1, cgroup_context.moom_swap) + $1.ManagedOOMMemoryPressure, config_parse_managed_oom_mode, 0, offsetof($1, cgroup_context.moom_mem_pressure) +-$1.ManagedOOMMemoryPressureLimitPercent, config_parse_managed_oom_mem_pressure_limit, 0, offsetof($1, cgroup_context.moom_mem_pressure_limit) ++$1.ManagedOOMMemoryPressureLimit, 
config_parse_managed_oom_mem_pressure_limit, 0, offsetof($1, cgroup_context.moom_mem_pressure_limit_permyriad) + $1.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0' + )m4_dnl + Unit.Description, config_parse_unit_string_printf, 0, offsetof(Unit, description) +diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c +index 4964249bf2..e0e9920e06 100644 +--- a/src/core/load-fragment.c ++++ b/src/core/load-fragment.c +@@ -3859,7 +3859,7 @@ int config_parse_managed_oom_mem_pressure_limit( + const char *rvalue, + void *data, + void *userdata) { +- int *limit = data; ++ uint32_t *limit = data; + UnitType t; + int r; + +@@ -3874,9 +3874,9 @@ int config_parse_managed_oom_mem_pressure_limit( + return 0; + } + +- r = parse_percent(rvalue); ++ r = parse_permyriad(rvalue); + if (r < 0) { +- log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse limit percent value, ignoring: %s", rvalue); ++ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse memory pressure limit value, ignoring: %s", rvalue); + return 0; + } + +diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c +index 3efa629002..338935b3ec 100644 +--- a/src/oom/oomd-manager.c ++++ b/src/oom/oomd-manager.c +@@ -100,10 +100,10 @@ static int process_managed_oom_reply( + limit = m->default_mem_pressure_limit; + + if (streq(reply.property, "ManagedOOMMemoryPressure")) { +- if (reply.limit > 100) ++ if (reply.limit > 10000) + continue; + else if (reply.limit != 0) { +- ret = store_loadavg_fixed_point((unsigned long) reply.limit, 0, &limit); ++ ret = store_loadavg_fixed_point((unsigned long) reply.limit / 100, (unsigned long) reply.limit % 100, &limit); + if (ret < 0) + continue; + } +@@ -478,8 +478,8 @@ static int manager_connect_bus(Manager *m) { + return 0; + } + +-int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit, usec_t mem_pressure_usec) { +- unsigned long l; ++int manager_start(Manager *m, bool dry_run, int swap_used_limit, int 
mem_pressure_limit_permyriad, usec_t mem_pressure_usec) { ++ unsigned long l, f; + int r; + + assert(m); +@@ -489,8 +489,16 @@ int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressur + m->swap_used_limit = swap_used_limit != -1 ? swap_used_limit : DEFAULT_SWAP_USED_LIMIT; + assert(m->swap_used_limit <= 100); + +- l = mem_pressure_limit != -1 ? mem_pressure_limit : DEFAULT_MEM_PRESSURE_LIMIT; +- r = store_loadavg_fixed_point(l, 0, &m->default_mem_pressure_limit); ++ if (mem_pressure_limit_permyriad != -1) { ++ assert(mem_pressure_limit_permyriad <= 10000); ++ ++ l = mem_pressure_limit_permyriad / 100; ++ f = mem_pressure_limit_permyriad % 100; ++ } else { ++ l = DEFAULT_MEM_PRESSURE_LIMIT_PERCENT; ++ f = 0; ++ } ++ r = store_loadavg_fixed_point(l, f, &m->default_mem_pressure_limit); + if (r < 0) + return r; + +@@ -530,12 +538,12 @@ int manager_get_dump_string(Manager *m, char **ret) { + fprintf(f, + "Dry Run: %s\n" + "Swap Used Limit: %u%%\n" +- "Default Memory Pressure Limit: %lu%%\n" ++ "Default Memory Pressure Limit: %lu.%02lu%%\n" + "Default Memory Pressure Duration: %s\n" + "System Context:\n", + yes_no(m->dry_run), + m->swap_used_limit, +- LOAD_INT(m->default_mem_pressure_limit), ++ LOAD_INT(m->default_mem_pressure_limit), LOAD_FRAC(m->default_mem_pressure_limit), + format_timespan(buf, sizeof(buf), m->default_mem_pressure_duration_usec, USEC_PER_SEC)); + oomd_dump_system_context(&m->system_context, f, "\t"); + +diff --git a/src/oom/oomd-manager.h b/src/oom/oomd-manager.h +index ee17abced2..521665e0a8 100644 +--- a/src/oom/oomd-manager.h ++++ b/src/oom/oomd-manager.h +@@ -17,7 +17,7 @@ + * Generally 60 or higher might be acceptable for something like system.slice with no memory.high set; processes in + * system.slice are assumed to be less latency sensitive. 
*/ + #define DEFAULT_MEM_PRESSURE_DURATION_USEC (30 * USEC_PER_SEC) +-#define DEFAULT_MEM_PRESSURE_LIMIT 60 ++#define DEFAULT_MEM_PRESSURE_LIMIT_PERCENT 60 + #define DEFAULT_SWAP_USED_LIMIT 90 + + #define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC) +@@ -56,7 +56,7 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); + + int manager_new(Manager **ret); + +-int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit, usec_t mem_pressure_usec); ++int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit_permyriad, usec_t mem_pressure_usec); + + int manager_get_dump_string(Manager *m, char **ret); + +diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c +index cec656f6fa..fcccddb92e 100644 +--- a/src/oom/oomd-util.c ++++ b/src/oom/oomd-util.c +@@ -415,11 +415,11 @@ void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE + + fprintf(f, + "%sPath: %s\n" +- "%s\tMemory Pressure Limit: %lu%%\n" ++ "%s\tMemory Pressure Limit: %lu.%02lu%%\n" + "%s\tPressure: Avg10: %lu.%02lu Avg60: %lu.%02lu Avg300: %lu.%02lu Total: %s\n" + "%s\tCurrent Memory Usage: %s\n", + strempty(prefix), ctx->path, +- strempty(prefix), LOAD_INT(ctx->mem_pressure_limit), ++ strempty(prefix), LOAD_INT(ctx->mem_pressure_limit), LOAD_FRAC(ctx->mem_pressure_limit), + strempty(prefix), + LOAD_INT(ctx->memory_pressure.avg10), LOAD_FRAC(ctx->memory_pressure.avg10), + LOAD_INT(ctx->memory_pressure.avg60), LOAD_FRAC(ctx->memory_pressure.avg60), +diff --git a/src/oom/oomd.c b/src/oom/oomd.c +index 1fbcf41492..811d211b58 100644 +--- a/src/oom/oomd.c ++++ b/src/oom/oomd.c +@@ -18,14 +18,14 @@ + + static bool arg_dry_run = false; + static int arg_swap_used_limit = -1; +-static int arg_mem_pressure_limit = -1; ++static int arg_mem_pressure_limit_permyriad = -1; + static usec_t arg_mem_pressure_usec = 0; + + static int parse_config(void) { + static const ConfigTableItem items[] = { +- { "OOM", "SwapUsedLimitPercent", 
config_parse_percent, 0, &arg_swap_used_limit }, +- { "OOM", "DefaultMemoryPressureLimitPercent", config_parse_percent, 0, &arg_mem_pressure_limit }, +- { "OOM", "DefaultMemoryPressureDurationSec", config_parse_sec, 0, &arg_mem_pressure_usec }, ++ { "OOM", "SwapUsedLimitPercent", config_parse_percent, 0, &arg_swap_used_limit }, ++ { "OOM", "DefaultMemoryPressureLimit", config_parse_permyriad, 0, &arg_mem_pressure_limit_permyriad }, ++ { "OOM", "DefaultMemoryPressureDurationSec", config_parse_sec, 0, &arg_mem_pressure_usec }, + {} + }; + +@@ -160,7 +160,7 @@ static int run(int argc, char *argv[]) { + if (r < 0) + return log_error_errno(r, "Failed to create manager: %m"); + +- r = manager_start(m, arg_dry_run, arg_swap_used_limit, arg_mem_pressure_limit, arg_mem_pressure_usec); ++ r = manager_start(m, arg_dry_run, arg_swap_used_limit, arg_mem_pressure_limit_permyriad, arg_mem_pressure_usec); + if (r < 0) + return log_error_errno(r, "Failed to start up daemon: %m"); + +diff --git a/src/oom/oomd.conf b/src/oom/oomd.conf +index 766cb1717f..bd6a9391c6 100644 +--- a/src/oom/oomd.conf ++++ b/src/oom/oomd.conf +@@ -13,5 +13,5 @@ + + [OOM] + #SwapUsedLimitPercent=90% +-#DefaultMemoryPressureLimitPercent=60% ++#DefaultMemoryPressureLimit=60% + #DefaultMemoryPressureDurationSec=30s +diff --git a/src/shared/bus-get-properties.c b/src/shared/bus-get-properties.c +index 32f68d5e6a..a5ce7ef17f 100644 +--- a/src/shared/bus-get-properties.c ++++ b/src/shared/bus-get-properties.c +@@ -55,23 +55,6 @@ int bus_property_get_id128( + return sd_bus_message_append_array(reply, 'y', id->bytes, 16); + } + +-int bus_property_get_percent( +- sd_bus *bus, +- const char *path, +- const char *interface, +- const char *property, +- sd_bus_message *reply, +- void *userdata, +- sd_bus_error *error) { +- +- char pstr[DECIMAL_STR_MAX(int) + 2]; +- int p = *(int*) userdata; +- +- xsprintf(pstr, "%d%%", p); +- +- return sd_bus_message_append_basic(reply, 's', pstr); +-} +- + #if __SIZEOF_SIZE_T__ != 8 + 
int bus_property_get_size( + sd_bus *bus, +diff --git a/src/shared/bus-get-properties.h b/src/shared/bus-get-properties.h +index 9832c0d067..26f3e8588c 100644 +--- a/src/shared/bus-get-properties.h ++++ b/src/shared/bus-get-properties.h +@@ -8,7 +8,6 @@ + int bus_property_get_bool(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); + int bus_property_set_bool(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *value, void *userdata, sd_bus_error *error); + int bus_property_get_id128(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); +-int bus_property_get_percent(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); + + #define bus_property_get_usec ((sd_bus_property_get_t) NULL) + #define bus_property_set_usec ((sd_bus_property_set_t) NULL) +diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c +index 2bab2299fb..f96059c699 100644 +--- a/src/shared/bus-unit-util.c ++++ b/src/shared/bus-unit-util.c +@@ -435,10 +435,25 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons + if (STR_IN_SET(field, "DevicePolicy", + "Slice", + "ManagedOOMSwap", +- "ManagedOOMMemoryPressure", +- "ManagedOOMMemoryPressureLimitPercent")) ++ "ManagedOOMMemoryPressure")) + return bus_append_string(m, field, eq); + ++ if (STR_IN_SET(field, "ManagedOOMMemoryPressureLimit")) { ++ char *n; ++ ++ r = parse_permyriad(eq); ++ if (r < 0) ++ return log_error_errno(r, "Failed to parse %s value: %s", field, eq); ++ ++ n = strjoina(field, "Permyriad"); ++ ++ r = sd_bus_message_append(m, "(sv)", n, "u", (uint32_t) r); ++ if (r < 0) ++ return bus_log_create_error(r); ++ ++ return 1; ++ } ++ + if (STR_IN_SET(field, "CPUAccounting", + "MemoryAccounting", + "IOAccounting", 
+diff --git a/src/shared/conf-parser.c b/src/shared/conf-parser.c +index 35d301d9db..c8c253d603 100644 +--- a/src/shared/conf-parser.c ++++ b/src/shared/conf-parser.c +@@ -1245,3 +1245,4 @@ int config_parse_vlanprotocol(const char* unit, + } + + DEFINE_CONFIG_PARSE(config_parse_percent, parse_percent, "Failed to parse percent value"); ++DEFINE_CONFIG_PARSE(config_parse_permyriad, parse_permyriad, "Failed to parse permyriad value"); +diff --git a/src/shared/conf-parser.h b/src/shared/conf-parser.h +index f115cb23af..988d81e43a 100644 +--- a/src/shared/conf-parser.h ++++ b/src/shared/conf-parser.h +@@ -148,6 +148,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_mtu); + CONFIG_PARSER_PROTOTYPE(config_parse_rlimit); + CONFIG_PARSER_PROTOTYPE(config_parse_vlanprotocol); + CONFIG_PARSER_PROTOTYPE(config_parse_percent); ++CONFIG_PARSER_PROTOTYPE(config_parse_permyriad); + + typedef enum Disabled { + DISABLED_CONFIGURATION, +diff --git a/test/units/testsuite-56-workload.slice b/test/units/testsuite-56-workload.slice +index 45b04914c6..8c32b28094 100644 +--- a/test/units/testsuite-56-workload.slice ++++ b/test/units/testsuite-56-workload.slice +@@ -7,4 +7,4 @@ MemoryAccounting=true + IOAccounting=true + TasksAccounting=true + ManagedOOMMemoryPressure=kill +-ManagedOOMMemoryPressureLimitPercent=1% ++ManagedOOMMemoryPressureLimit=1% +diff --git a/test/units/testsuite-56.sh b/test/units/testsuite-56.sh +index 4dc9d8c7a8..8b01fe37ed 100755 +--- a/test/units/testsuite-56.sh ++++ b/test/units/testsuite-56.sh +@@ -20,7 +20,7 @@ systemctl start testsuite-56-testbloat.service + + # Verify systemd-oomd is monitoring the expected units + oomctl | grep "/testsuite-56-workload.slice" +-oomctl | grep "1%" ++oomctl | grep "1.00%" + oomctl | grep "Default Memory Pressure Duration: 5s" + + # systemd-oomd watches for elevated pressure for 30 seconds before acting. 
+-- +2.29.2 + diff --git a/95ca39f04efa278ac93881e6e364a6ae520b03e7.patch b/95ca39f04efa278ac93881e6e364a6ae520b03e7.patch new file mode 100644 index 0000000..478902a --- /dev/null +++ b/95ca39f04efa278ac93881e6e364a6ae520b03e7.patch @@ -0,0 +1,40 @@ +From 95ca39f04efa278ac93881e6e364a6ae520b03e7 Mon Sep 17 00:00:00 2001 +From: Yu Watanabe +Date: Fri, 27 Nov 2020 08:29:20 +0900 +Subject: [PATCH] oom: use CMP() macro + +--- + src/oom/oomd-util.h | 14 ++------------ + 1 file changed, 2 insertions(+), 12 deletions(-) + +diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h +index 87ecda80fbc..0834cbf09d7 100644 +--- a/src/oom/oomd-util.h ++++ b/src/oom/oomd-util.h +@@ -64,24 +64,14 @@ static inline int compare_pgscan(OomdCGroupContext * const *c1, OomdCGroupContex + assert(c1); + assert(c2); + +- if ((*c1)->pgscan > (*c2)->pgscan) +- return -1; +- else if ((*c1)->pgscan < (*c2)->pgscan) +- return 1; +- else +- return 0; ++ return CMP((*c2)->pgscan, (*c1)->pgscan); + } + + static inline int compare_swap_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) { + assert(c1); + assert(c2); + +- if ((*c1)->swap_usage > (*c2)->swap_usage) +- return -1; +- else if ((*c1)->swap_usage < (*c2)->swap_usage) +- return 1; +- else +- return 0; ++ return CMP((*c2)->swap_usage, (*c1)->swap_usage); + } + + /* Get an array of OomdCGroupContexts from `h`, qsorted from largest to smallest values according to `compare_func`. 
diff --git a/owner-check.sh b/owner-check.sh new file mode 100755 index 0000000..7086238 --- /dev/null +++ b/owner-check.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -e + +[ -z "$server" -o -z "$login" ] && { echo '$server and $login need to be set'; exit 1; } + +header= +from=systemd-maint@fedoraproject.org +time='2 years ago' +# time='1 day ago' +port=587 + +for user in "$@"; do + echo "checking $user…" + t=$(git shortlog --all --author $user --since "@{$time}" | wc -l) + if [ $t != 0 ]; then + echo "$t commits in the last two years, OK" + continue + fi + + if [ -z "$header" ]; then + echo '$USER$;$EMAIL$' >.mail.list + header=done + fi + + echo "$user;$user@fedoraproject.org" >>.mail.list +done + +[ -z "$header" ] && exit 0 + +echo "Sending mails…" +set -x +massmail -F $from \ + -C $from \ + -S 'write access to the fedora systemd package' \ + -z $server -u $login -P $port \ + .mail.list /dev/null || useradd -r -l -g systemd-coredump getent group systemd-resolve &>/dev/null || groupadd -r -g 193 systemd-resolve 2>&1 || : getent passwd systemd-resolve &>/dev/null || useradd -r -u 193 -l -g systemd-resolve -d / -s /sbin/nologin -c "systemd Resolver" systemd-resolve &>/dev/null || : +getent group systemd-oom &>/dev/null || groupadd -r systemd-oom 2>&1 || : +getent passwd systemd-oom &>/dev/null || useradd -r -l -g systemd-oom -d / -s /sbin/nologin -c "systemd Userspace OOM Killer" systemd-oom &>/dev/null || : + %post systemd-machine-id-setup &>/dev/null || : +# FIXME: move to %postun. We want to restart systemd *after* removing +# files from the old rpm. Right now we may still have bits the old +# setup if the files are not present in the new version. But before +# implement restarting of *other* services after the transaction, moving +# this would make things worse, increasing the number of warnings we get +# about needed daemon-reload. 
systemctl daemon-reexec &>/dev/null || { # systemd v239 had bug #9553 in D-Bus authentication of the private socket, # which was later fixed in v240 by #9625. @@ -647,13 +680,13 @@ systemctl daemon-reexec &>/dev/null || { fi } -journalctl --update-catalog &>/dev/null || : -systemd-tmpfiles --create &>/dev/null || : +if [ $1 -eq 1 ]; then + # create /var/log/journal only on initial installation, + # and only if it's writable (it won't be in rpm-ostree). + [ -w %{_localstatedir} ] && mkdir -p %{_localstatedir}/log/journal -# create /var/log/journal only on initial installation, -# and only if it's writable (it won't be in rpm-ostree). -if [ $1 -eq 1 ] && [ -w %{_localstatedir} ]; then - mkdir -p %{_localstatedir}/log/journal + [ -w %{_localstatedir} ] && journalctl --update-catalog || : + systemd-tmpfiles --create &>/dev/null || : fi # Make sure new journal files will be owned by the "systemd-journal" group @@ -695,19 +728,17 @@ if test -d /run/systemd/system/ && ln -fsv ../run/systemd/resolve/stub-resolv.conf /etc/resolv.conf fi -%preun -if [ $1 -eq 0 ] ; then - systemctl disable --quiet \ - remote-fs.target \ - getty@.service \ - serial-getty@.service \ - console-getty.service \ - debug-shell.service \ - systemd-resolved.service \ - systemd-homed.service \ - >/dev/null || : +%postun +if [ $1 -eq 1 ]; then + [ -w %{_localstatedir} ] && journalctl --update-catalog || : + systemd-tmpfiles --create &>/dev/null || : fi +%systemd_postun_with_restart systemd-timedated.service systemd-portabled.service systemd-homed.service systemd-hostnamed.service systemd-journald.service systemd-localed.service systemd-userdbd.service systemd-oomd.service + +# FIXME: systemd-logind.service is excluded (https://github.com/systemd/systemd/pull/17558) +# FIXME: user@*.service needs to be restarted, but using systemctl --user daemon-reexec + %triggerun -- systemd < 246.1-1 # This is for upgrades from previous versions before systemd-resolved became the default. 
systemctl --no-reload preset systemd-resolved.service &>/dev/null || : @@ -724,6 +755,12 @@ if systemctl -q is-enabled systemd-resolved.service &>/dev/null; then systemctl start systemd-resolved.service &>/dev/null || : fi +%triggerpostun -- systemd < 247.3-2 +# This is for upgrades from previous versions before oomd-defaults is available. +# We use %%triggerpostun here because rpm doesn't allow a second %%triggerun with +# a different package version. +systemctl --no-reload preset systemd-oomd.service &>/dev/null || : + %post libs %{?ldconfig} @@ -808,9 +845,9 @@ grep -q -E '^KEYMAP="?fi-latin[19]"?' /etc/vconsole.conf 2>/dev/null && %systemd_preun %udev_services %postun udev -# Only restart systemd-udev, to run the upgraded dameon. +# Restart some services. # Others are either oneshot services, or sockets, and restarting them causes issues (#1378974) -%systemd_postun_with_restart systemd-udevd.service +%systemd_postun_with_restart systemd-udevd.service systemd-timesyncd.service %pre journal-remote getent group systemd-journal-remote &>/dev/null || groupadd -r systemd-journal-remote 2>&1 || : @@ -884,6 +921,8 @@ getent passwd systemd-network &>/dev/null || useradd -r -u 192 -l -g systemd-net %files networkd -f .file-list-networkd +%files oomd-defaults -f .file-list-oomd-defaults + %files tests -f .file-list-tests %files standalone-tmpfiles -f .file-list-standalone-tmpfiles @@ -891,6 +930,13 @@ getent passwd systemd-network &>/dev/null || useradd -r -u 192 -l -g systemd-net %files standalone-sysusers -f .file-list-standalone-sysusers %changelog +* Fri Feb 5 2021 Anita Zhang - 247.3-2 +- Changes for https://fedoraproject.org/wiki/Changes/EnableSystemdOomd. +- Backports consist primarily of PR #18361, #18444, and #18401 (plus some + additional ones to handle merge conflicts). +- Create systemd-oomd-defaults subpackage to install unit drop-ins that will + configure systemd-oomd to monitor and act. 
+ * Tue Feb 2 2021 Zbigniew Jędrzejewski-Szmek - 247.3-1 - Minor stable release - Fixes #1895937, #1813219, #1903106.