From 026d94bfc7d1a85bd2ca3ac99d08c010ab10540e Mon Sep 17 00:00:00 2001 From: Jan Friesse Date: Fri, 23 Jul 2021 15:41:59 +0200 Subject: [PATCH] - Related: rhbz#1948974 --- ...-support-for-cgroup-v2-and-auto-mode.patch | 317 ++++++++++++++++++ ...974-1-main-Add-support-for-cgroup-v2.patch | 122 ------- ...an-Add-info-about-cgroup-v2-behavior.patch | 44 --- corosync.spec | 11 +- 4 files changed, 324 insertions(+), 170 deletions(-) create mode 100644 bz1948974-1-main-Add-support-for-cgroup-v2-and-auto-mode.patch delete mode 100644 bz1948974-1-main-Add-support-for-cgroup-v2.patch delete mode 100644 bz1948974-2-man-Add-info-about-cgroup-v2-behavior.patch diff --git a/bz1948974-1-main-Add-support-for-cgroup-v2-and-auto-mode.patch b/bz1948974-1-main-Add-support-for-cgroup-v2-and-auto-mode.patch new file mode 100644 index 0000000..bdfd505 --- /dev/null +++ b/bz1948974-1-main-Add-support-for-cgroup-v2-and-auto-mode.patch @@ -0,0 +1,317 @@ +From c9996fdd0f4fa1fbf113b740eea01bcc70b235aa Mon Sep 17 00:00:00 2001 +From: Jan Friesse +Date: Mon, 3 May 2021 15:29:04 +0200 +Subject: [PATCH] main: Add support for cgroup v2 and auto mode + +Support for cgroup v2 is very similar to cgroup v1 just checking (and +writing) different file. + +Because of all the problems described later with cgroup v2 new "auto" +mode (new default) is added. This mode first tries to set rr scheduling +and moves Corosync to root cgroup only if it fails. + +Testing this feature is a bit harder than with cgroup v1 so it's +probably worth noting in this commit message. + +1. Copy some service file (I've used httpd service) and set + CPUQuota=30% in the [service] section. +2. Check /sys/fs/cgroup/cgroup.subtree_control - there should be no + "cpu" +3. Start modified service +4. Check /sys/fs/cgroup/cgroup.subtree_control - there should be "cpu" +5. 
Start corosync - It should be able to get rt priority + +When move_to_root_cgroup is disabled (applies only for kernels +with CONFIG_RT_GROUP_SCHED enabled), behavior differs: +- If corosync is started before modified service, so + there is no "cpu" in /sys/fs/cgroup/cgroup.subtree_control + corosync starts without problem and gets rt priority. + Starting modified service later will never add "cpu" into + /sys/fs/cgroup/cgroup.subtree_control (because corosync is holding + rt priority and it is placed in the non-root cgroup by systemd). + +- When corosync is started after modified service, so "cpu" + is in /sys/fs/cgroup/cgroup.subtree_control, corosync is not + able to get RT priority. + +It's worth noting problems when cgroup v2 is used together with systemd +logging described in corosync.conf(5) man page. + +Signed-off-by: Jan Friesse +Reviewed-by: Christine Caulfield +--- + exec/coroparse.c | 3 +- + exec/main.c | 98 ++++++++++++++++++++++++++++++++++----------- + man/corosync.conf.5 | 38 +++++++++++++++--- + 3 files changed, 109 insertions(+), 30 deletions(-) + +diff --git a/exec/coroparse.c b/exec/coroparse.c +index 741f3741..56b8034e 100644 +--- a/exec/coroparse.c ++++ b/exec/coroparse.c +@@ -828,7 +828,8 @@ static int main_config_parser_cb(const char *path, + } + if (strcmp(path, "system.move_to_root_cgroup") == 0) { + if ((strcmp(value, "yes") != 0) && +- (strcmp(value, "no") != 0)) { ++ (strcmp(value, "no") != 0) && ++ (strcmp(value, "auto") != 0)) { + *error_string = "Invalid system.move_to_root_cgroup"; + + return (0); +diff --git a/exec/main.c b/exec/main.c +index aa6d9fbf..5fb4d47c 100644 +--- a/exec/main.c ++++ b/exec/main.c +@@ -169,6 +169,12 @@ static char corosync_config_file[PATH_MAX + 1] = COROSYSCONFDIR "/corosync.conf" + + static int lockfile_fd = -1; + ++enum move_to_root_cgroup_mode { ++ MOVE_TO_ROOT_CGROUP_MODE_OFF = 0, ++ MOVE_TO_ROOT_CGROUP_MODE_ON = 1, ++ MOVE_TO_ROOT_CGROUP_MODE_AUTO = 2, ++}; ++ + qb_loop_t *cs_poll_handle_get (void) + 
{ + return (corosync_poll_handle); +@@ -859,7 +865,12 @@ static void timer_function_scheduler_timeout (void *data) + } + + +-static int corosync_set_rr_scheduler (void) ++/* ++ * Set main pid RR scheduler. ++ * silent: don't log sched_get_priority_max and sched_setscheduler errors ++ * Returns: 0 - success, -1 failure, -2 platform doesn't support SCHED_RR ++ */ ++static int corosync_set_rr_scheduler (int silent) + { + int ret_val = 0; + +@@ -871,9 +882,11 @@ static int corosync_set_rr_scheduler (void) + global_sched_param.sched_priority = sched_priority; + res = sched_setscheduler (0, SCHED_RR, &global_sched_param); + if (res == -1) { +- LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, +- "Could not set SCHED_RR at priority %d", +- global_sched_param.sched_priority); ++ if (!silent) { ++ LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING, ++ "Could not set SCHED_RR at priority %d", ++ global_sched_param.sched_priority); ++ } + + global_sched_param.sched_priority = 0; + #ifdef HAVE_QB_LOG_THREAD_PRIORITY_SET +@@ -898,15 +911,17 @@ static int corosync_set_rr_scheduler (void) + } + } + } else { +- LOGSYS_PERROR (errno, LOGSYS_LEVEL_WARNING, +- "Could not get maximum scheduler priority"); ++ if (!silent) { ++ LOGSYS_PERROR (errno, LOGSYS_LEVEL_WARNING, ++ "Could not get maximum scheduler priority"); ++ } + sched_priority = 0; + ret_val = -1; + } + #else + log_printf(LOGSYS_LEVEL_WARNING, + "The Platform is missing process priority setting features. 
Leaving at default."); +- ret_val = -1; ++ ret_val = -2; + #endif + + return (ret_val); +@@ -1173,6 +1188,7 @@ error_close: + static int corosync_move_to_root_cgroup(void) { + FILE *f; + int res = -1; ++ const char *cgroup_task_fname = NULL; + + /* + * /sys/fs/cgroup is hardcoded, because most of Linux distributions are now +@@ -1183,15 +1199,29 @@ static int corosync_move_to_root_cgroup(void) { + */ + f = fopen("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "rt"); + if (f == NULL) { +- log_printf(LOGSYS_LEVEL_DEBUG, "cpu.rt_runtime_us doesn't exists -> " +- "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED"); ++ /* ++ * Try cgroup v2 ++ */ ++ f = fopen("/sys/fs/cgroup/cgroup.procs", "rt"); ++ if (f == NULL) { ++ log_printf(LOG_DEBUG, "cpu.rt_runtime_us or cgroup.procs doesn't exist -> " ++ "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED"); + +- res = 0; +- goto exit_res; ++ res = 0; ++ goto exit_res; ++ } else { ++ log_printf(LOGSYS_LEVEL_DEBUG, "Moving main pid to cgroup v2 root cgroup"); ++ ++ cgroup_task_fname = "/sys/fs/cgroup/cgroup.procs"; ++ } ++ } else { ++ log_printf(LOGSYS_LEVEL_DEBUG, "Moving main pid to cgroup v1 root cgroup"); ++ ++ cgroup_task_fname = "/sys/fs/cgroup/cpu/tasks"; + } + (void)fclose(f); + +- f = fopen("/sys/fs/cgroup/cpu/tasks", "w"); ++ f = fopen(cgroup_task_fname, "w"); + if (f == NULL) { + log_printf(LOGSYS_LEVEL_WARNING, "Can't open cgroups tasks file for writing"); + +@@ -1256,7 +1286,8 @@ int main (int argc, char **argv, char **envp) + const char *error_string; + struct totem_config totem_config; + int res, ch; +- int background, sched_rr, prio, testonly, move_to_root_cgroup; ++ int background, sched_rr, prio, testonly; ++ enum move_to_root_cgroup_mode move_to_root_cgroup; + enum e_corosync_done flock_err; + uint64_t totem_config_warnings; + struct scheduler_pause_timeout_data scheduler_pause_timeout_data; +@@ -1264,6 +1295,7 @@ int main (int argc, char **argv, char **envp) + char *ep; + char *tmp_str; + int 
log_subsys_id_totem; ++ int silent; + + /* default configuration + */ +@@ -1417,21 +1449,19 @@ int main (int argc, char **argv, char **envp) + } + + +- move_to_root_cgroup = 1; ++ move_to_root_cgroup = MOVE_TO_ROOT_CGROUP_MODE_AUTO; + if (icmap_get_string("system.move_to_root_cgroup", &tmp_str) == CS_OK) { +- if (strcmp(tmp_str, "yes") != 0) { +- move_to_root_cgroup = 0; ++ /* ++ * Validity of move_to_root_cgroup values checked in coroparse.c ++ */ ++ if (strcmp(tmp_str, "yes") == 0) { ++ move_to_root_cgroup = MOVE_TO_ROOT_CGROUP_MODE_ON; ++ } else if (strcmp(tmp_str, "no") == 0) { ++ move_to_root_cgroup = MOVE_TO_ROOT_CGROUP_MODE_OFF; + } + free(tmp_str); + } + +- /* +- * Try to move corosync into root cpu cgroup. Failure is not fatal and +- * error is deliberately ignored. +- */ +- if (move_to_root_cgroup) { +- (void)corosync_move_to_root_cgroup(); +- } + + sched_rr = 1; + if (icmap_get_string("system.sched_rr", &tmp_str) == CS_OK) { +@@ -1462,11 +1492,31 @@ int main (int argc, char **argv, char **envp) + free(tmp_str); + } + ++ if (move_to_root_cgroup == MOVE_TO_ROOT_CGROUP_MODE_ON) { ++ /* ++ * Try to move corosync into root cpu cgroup. Failure is not fatal and ++ * error is deliberately ignored. 
++ */ ++ (void)corosync_move_to_root_cgroup(); ++ } ++ + /* + * Set round robin realtime scheduling with priority 99 + */ + if (sched_rr) { +- if (corosync_set_rr_scheduler () != 0) { ++ silent = (move_to_root_cgroup == MOVE_TO_ROOT_CGROUP_MODE_AUTO); ++ res = corosync_set_rr_scheduler (silent); ++ ++ if (res == -1 && move_to_root_cgroup == MOVE_TO_ROOT_CGROUP_MODE_AUTO) { ++ /* ++ * Try to move process to root cgroup and try set priority again ++ */ ++ (void)corosync_move_to_root_cgroup(); ++ ++ res = corosync_set_rr_scheduler (0); ++ } ++ ++ if (res != 0) { + prio = INT_MIN; + } else { + prio = 0; +diff --git a/man/corosync.conf.5 b/man/corosync.conf.5 +index 25289ba4..0588ad1e 100644 +--- a/man/corosync.conf.5 ++++ b/man/corosync.conf.5 +@@ -1,6 +1,6 @@ + .\"/* + .\" * Copyright (c) 2005 MontaVista Software, Inc. +-.\" * Copyright (c) 2006-2020 Red Hat, Inc. ++.\" * Copyright (c) 2006-2021 Red Hat, Inc. + .\" * + .\" * All rights reserved. + .\" * +@@ -32,7 +32,7 @@ + .\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + .\" * THE POSSIBILITY OF SUCH DAMAGE. + .\" */ +-.TH COROSYNC_CONF 5 2021-04-09 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual" ++.TH COROSYNC_CONF 5 2021-07-23 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual" + .SH NAME + corosync.conf - corosync executive configuration file + +@@ -799,9 +799,37 @@ meaning maximal / minimal priority (so minimal / maximal nice value). + + .TP + move_to_root_cgroup +-Should be set to yes (default) if corosync should try to move itself to root +-cgroup. This feature is available only for systems with cgroups with RT +-sched enabled (Linux with CONFIG_RT_GROUP_SCHED kernel option). 
++Can be one of ++.B yes ++(Corosync always moves itself to root cgroup), ++.B no ++(Corosync never tries to move itself to root cgroup) or ++.B auto ++(Corosync first checks if sched_rr is enabled, and if ++so, it tries to set round robin realtime scheduling with maximal priority to itself. ++If setting of priority fails, corosync tries to move itself to root ++cgroup and retries setting of priority). ++ ++This feature is available only for systems with cgroups v1 with RT ++sched enabled (Linux with CONFIG_RT_GROUP_SCHED kernel option) and cgroups v2. ++ ++It's worth noting that currently (May 3 2021) cgroup2 doesn't yet ++support control of realtime processes and the cpu controller can only be ++enabled when all RT processes are in the root cgroup (applies only for kernel ++with CONFIG_RT_GROUP_SCHED enabled). So when move_to_root_cgroup ++is disabled, kernel is compiled with CONFIG_RT_GROUP_SCHED and systemd is used, ++it may be impossible to make systemd options ++like CPUQuota work correctly until corosync is stopped. ++ ++Also when moving to root cgroup is enforced and used together with cgroup2 and systemd ++it makes it impossible (most of the time) for journald to add systemd specific ++metadata (most importantly _SYSTEMD_UNIT) properly, because corosync is ++moved out of cgroup created by systemd. This means ++it is not possible to filter corosync logged messages based on these metadata ++(for example using -u or _SYSTEMD_UNIT=UNIT pattern) and also running ++systemctl status doesn't display (all) corosync log messages. ++The problem is even worse because journald caches pid for some time ++(approx. 5 sec) so initial corosync messages have correct metadata. 
+ + .TP + allow_knet_handle_fallback +-- +2.27.0 + diff --git a/bz1948974-1-main-Add-support-for-cgroup-v2.patch b/bz1948974-1-main-Add-support-for-cgroup-v2.patch deleted file mode 100644 index 995af00..0000000 --- a/bz1948974-1-main-Add-support-for-cgroup-v2.patch +++ /dev/null @@ -1,122 +0,0 @@ -From 57e6b86b53010dd2612b0a6a4e04917673062ecf Mon Sep 17 00:00:00 2001 -From: Jan Friesse -Date: Mon, 3 May 2021 15:29:04 +0200 -Subject: [PATCH 3/7] main: Add support for cgroup v2 - -Support for cgroup v2 is very similar to cgroup v1 just checking (and -writing) different file. - -Testing this feature is a bit harder than with cgroup v1 so it's -probably worh noting in this commit message. - -1. Copy some service file (I've used httpd service) and set - CPUQuota=30% in the [service] section. -2. Check /sys/fs/cgroup/cgroup.subtree_control - there should be no - "cpu" -3. Start modified service -4. Check /sys/fs/cgroup/cgroup.subtree_control - there should be "cpu" -5. Start corosync - It should be able to get rt priority - -When move_to_root_cgroup is disabled, behavior differs: -- If corosync is started before modified service, so - there is no "cpu" in /sys/fs/cgroup/cgroup.subtree_control - corosync starts without problem and gets rt priority. - Starting modified service later will never add "cpu" into - /sys/fs/cgroup/cgroup.subtree_control (because corosync is holding - rt priority and it is placed in the non-root cgroup by systemd). - -- When corosync is started after modified service, so "cpu" - is in /sys/fs/cgroup/cgroup.subtree_control, corosync is not - able to get RT priority. 
- -Signed-off-by: Jan Friesse -Reviewed-by: Christine Caulfield ---- - exec/main.c | 21 ++++++++++++++++----- - man/corosync.conf.5 | 14 ++++++++++---- - 2 files changed, 26 insertions(+), 9 deletions(-) - -diff --git a/exec/main.c b/exec/main.c -index aa6d9fbf..65ae5e4f 100644 ---- a/exec/main.c -+++ b/exec/main.c -@@ -1173,6 +1173,7 @@ error_close: - static int corosync_move_to_root_cgroup(void) { - FILE *f; - int res = -1; -+ const char *cgroup_task_fname = NULL; - - /* - * /sys/fs/cgroup is hardcoded, because most of Linux distributions are now -@@ -1183,15 +1184,25 @@ static int corosync_move_to_root_cgroup(void) { - */ - f = fopen("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "rt"); - if (f == NULL) { -- log_printf(LOGSYS_LEVEL_DEBUG, "cpu.rt_runtime_us doesn't exists -> " -- "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED"); -+ /* -+ * Try cgroup v2 -+ */ -+ f = fopen("/sys/fs/cgroup/cgroup.procs", "rt"); -+ if (f == NULL) { -+ log_printf(LOG_DEBUG, "cpu.rt_runtime_us or cgroup.procs doesn't exist -> " -+ "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED"); - -- res = 0; -- goto exit_res; -+ res = 0; -+ goto exit_res; -+ } else { -+ cgroup_task_fname = "/sys/fs/cgroup/cgroup.procs"; -+ } -+ } else { -+ cgroup_task_fname = "/sys/fs/cgroup/cpu/tasks"; - } - (void)fclose(f); - -- f = fopen("/sys/fs/cgroup/cpu/tasks", "w"); -+ f = fopen(cgroup_task_fname, "w"); - if (f == NULL) { - log_printf(LOGSYS_LEVEL_WARNING, "Can't open cgroups tasks file for writing"); - -diff --git a/man/corosync.conf.5 b/man/corosync.conf.5 -index 25289ba4..1c9d2ad7 100644 ---- a/man/corosync.conf.5 -+++ b/man/corosync.conf.5 -@@ -1,6 +1,6 @@ - .\"/* - .\" * Copyright (c) 2005 MontaVista Software, Inc. --.\" * Copyright (c) 2006-2020 Red Hat, Inc. -+.\" * Copyright (c) 2006-2021 Red Hat, Inc. - .\" * - .\" * All rights reserved. 
- .\" * -@@ -32,7 +32,7 @@ - .\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - .\" * THE POSSIBILITY OF SUCH DAMAGE. - .\" */ --.TH COROSYNC_CONF 5 2021-04-09 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual" -+.TH COROSYNC_CONF 5 2021-05-03 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual" - .SH NAME - corosync.conf - corosync executive configuration file - -@@ -800,8 +800,14 @@ meaning maximal / minimal priority (so minimal / maximal nice value). - .TP - move_to_root_cgroup - Should be set to yes (default) if corosync should try to move itself to root --cgroup. This feature is available only for systems with cgroups with RT --sched enabled (Linux with CONFIG_RT_GROUP_SCHED kernel option). -+cgroup. This feature is available only for systems with cgroups v1 with RT -+sched enabled (Linux with CONFIG_RT_GROUP_SCHED kernel option) and cgroups v2. -+ -+It's worth noting that currently (May 3 2021) cgroup2 doesn’t yet -+support control of realtime processes and the cpu controller can only be -+enabled when all RT processes are in the root cgroup. So when move_to_root_cgroup -+is disabled and systemd is used, it may be impossible to make systemd options -+like CPUQuota working correctly until corosync is stopped. 
- - .TP - allow_knet_handle_fallback --- -2.27.0 - diff --git a/bz1948974-2-man-Add-info-about-cgroup-v2-behavior.patch b/bz1948974-2-man-Add-info-about-cgroup-v2-behavior.patch deleted file mode 100644 index 3a12637..0000000 --- a/bz1948974-2-man-Add-info-about-cgroup-v2-behavior.patch +++ /dev/null @@ -1,44 +0,0 @@ -From 9d3df5696ed6b04b379a2fe643eec1fcd5a4b10d Mon Sep 17 00:00:00 2001 -From: Jan Friesse -Date: Tue, 18 May 2021 10:43:37 +0200 -Subject: [PATCH 5/7] man: Add info about cgroup v2 behavior - -Signed-off-by: Jan Friesse -Reviewed-by: Christine Caulfield ---- - man/corosync.conf.5 | 12 +++++++++++- - 1 file changed, 11 insertions(+), 1 deletion(-) - -diff --git a/man/corosync.conf.5 b/man/corosync.conf.5 -index 1c9d2ad7..90a2babc 100644 ---- a/man/corosync.conf.5 -+++ b/man/corosync.conf.5 -@@ -32,7 +32,7 @@ - .\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - .\" * THE POSSIBILITY OF SUCH DAMAGE. - .\" */ --.TH COROSYNC_CONF 5 2021-05-03 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual" -+.TH COROSYNC_CONF 5 2021-05-18 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual" - .SH NAME - corosync.conf - corosync executive configuration file - -@@ -809,6 +809,16 @@ enabled when all RT processes are in the root cgroup. So when move_to_root_cgrou - is disabled and systemd is used, it may be impossible to make systemd options - like CPUQuota working correctly until corosync is stopped. - -+Also when this option is used together with cgroup2 and systemd -+it makes impossible (most of the time) for journald to add systemd specific -+metadata (most importantly _SYSTEMD_UNIT) properly, because corosync is -+moved out of cgroup created by systemd. This means -+it is not possible to filter corosync logged messages based on these metadata -+(for example using -u or _SYSTEMD_UNIT=UNIT pattern) and also running -+systemctl status doesn't display (all) corosync log messages. 
-+The problem is even worse because journald caches pid for some time -+(approx. 5 sec) so initial corosync messages have correct metadata. -+ - .TP - allow_knet_handle_fallback - If knet handle creation fails using privileged operations, allow fallback to --- -2.27.0 - diff --git a/corosync.spec b/corosync.spec index d1a56c6..733d313 100644 --- a/corosync.spec +++ b/corosync.spec @@ -18,13 +18,12 @@ Name: corosync Summary: The Corosync Cluster Engine and Application Programming Interfaces Version: 3.1.4 -Release: 2%{?gitver}%{?dist} +Release: 3%{?gitver}%{?dist} License: BSD URL: http://corosync.github.io/corosync/ Source0: http://build.clusterlabs.org/corosync/releases/%{name}-%{version}%{?gittarver}.tar.gz -Patch0: bz1948974-1-main-Add-support-for-cgroup-v2.patch -Patch1: bz1948974-2-man-Add-info-about-cgroup-v2-behavior.patch +Patch0: bz1948974-1-main-Add-support-for-cgroup-v2-and-auto-mode.patch # Runtime bits # The automatic dependency overridden in favor of explicit version lock @@ -75,7 +74,6 @@ BuildRequires: make %prep %setup -q -n %{name}-%{version}%{?gittarver} %patch0 -p1 -b .bz1948974-1 -%patch1 -p1 -b .bz1948974-2 %build %if %{with runautogen} @@ -294,6 +292,11 @@ network splits) %endif %changelog +* Fri Jul 23 2021 Jan Friesse - 3.1.4-3 +- Related: rhbz#1948974 + +- Add support for cgroup v2 and auto mode (rhbz#1948974) + * Tue Jun 22 2021 Mohan Boddu - 3.1.4-2 - Rebuilt for RHEL 9 BETA for openssl 3.0 Related: rhbz#1971065