corosync/bz1948974-1-main-Add-support-for-cgroup-v2-and-auto-mode.patch

318 lines
11 KiB
Diff
Raw Normal View History

2021-07-23 13:41:59 +00:00
From c9996fdd0f4fa1fbf113b740eea01bcc70b235aa Mon Sep 17 00:00:00 2001
From: Jan Friesse <jfriesse@redhat.com>
Date: Mon, 3 May 2021 15:29:04 +0200
Subject: [PATCH] main: Add support for cgroup v2 and auto mode
Support for cgroup v2 is very similar to cgroup v1, just checking (and
writing) a different file.
Because of all the problems with cgroup v2 described later, a new "auto"
mode (new default) is added. This mode first tries to set rr scheduling
and moves Corosync to root cgroup only if that fails.
Testing this feature is a bit harder than with cgroup v1 so it's
probably worth noting in this commit message.
1. Copy some service file (I've used httpd service) and set
CPUQuota=30% in the [service] section.
2. Check /sys/fs/cgroup/cgroup.subtree_control - there should be no
"cpu"
3. Start modified service
4. Check /sys/fs/cgroup/cgroup.subtree_control - there should be "cpu"
5. Start corosync - It should be able to get rt priority
When move_to_root_cgroup is disabled (applies only for kernels
with CONFIG_RT_GROUP_SCHED enabled), behavior differs:
- If corosync is started before modified service, so
there is no "cpu" in /sys/fs/cgroup/cgroup.subtree_control
corosync starts without problem and gets rt priority.
Starting modified service later will never add "cpu" into
/sys/fs/cgroup/cgroup.subtree_control (because corosync is holding
rt priority and it is placed in the non-root cgroup by systemd).
- When corosync is started after modified service, so "cpu"
is in /sys/fs/cgroup/cgroup.subtree_control, corosync is not
able to get RT priority.
It's worth noting problems when cgroup v2 is used together with systemd
logging described in corosync.conf(5) man page.
Signed-off-by: Jan Friesse <jfriesse@redhat.com>
Reviewed-by: Christine Caulfield <ccaulfie@redhat.com>
---
exec/coroparse.c | 3 +-
exec/main.c | 98 ++++++++++++++++++++++++++++++++++-----------
man/corosync.conf.5 | 38 +++++++++++++++---
3 files changed, 109 insertions(+), 30 deletions(-)
diff --git a/exec/coroparse.c b/exec/coroparse.c
index 741f3741..56b8034e 100644
--- a/exec/coroparse.c
+++ b/exec/coroparse.c
@@ -828,7 +828,8 @@ static int main_config_parser_cb(const char *path,
}
if (strcmp(path, "system.move_to_root_cgroup") == 0) {
if ((strcmp(value, "yes") != 0) &&
- (strcmp(value, "no") != 0)) {
+ (strcmp(value, "no") != 0) &&
+ (strcmp(value, "auto") != 0)) {
*error_string = "Invalid system.move_to_root_cgroup";
return (0);
diff --git a/exec/main.c b/exec/main.c
index aa6d9fbf..5fb4d47c 100644
--- a/exec/main.c
+++ b/exec/main.c
@@ -169,6 +169,12 @@ static char corosync_config_file[PATH_MAX + 1] = COROSYSCONFDIR "/corosync.conf"
static int lockfile_fd = -1;
+enum move_to_root_cgroup_mode { /* how main() acts on system.move_to_root_cgroup */
+ MOVE_TO_ROOT_CGROUP_MODE_OFF = 0, /* "no": main() never calls corosync_move_to_root_cgroup() */
+ MOVE_TO_ROOT_CGROUP_MODE_ON = 1, /* "yes": move is attempted before the RR scheduler is set */
+ MOVE_TO_ROOT_CGROUP_MODE_AUTO = 2, /* default: move only after setting the RR scheduler failed (-1), then retry */
+};
+
qb_loop_t *cs_poll_handle_get (void)
{
return (corosync_poll_handle);
@@ -859,7 +865,12 @@ static void timer_function_scheduler_timeout (void *data)
}
-static int corosync_set_rr_scheduler (void)
+/*
+ * Set main pid RR scheduler.
+ * silent: don't log sched_get_priority_max and sched_setscheduler errors
+ * Returns: 0 - success, -1 failure, -2 platform doesn't support SCHED_RR
+ */
+static int corosync_set_rr_scheduler (int silent)
{
int ret_val = 0;
@@ -871,9 +882,11 @@ static int corosync_set_rr_scheduler (void)
global_sched_param.sched_priority = sched_priority;
res = sched_setscheduler (0, SCHED_RR, &global_sched_param);
if (res == -1) {
- LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING,
- "Could not set SCHED_RR at priority %d",
- global_sched_param.sched_priority);
+ if (!silent) {
+ LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING,
+ "Could not set SCHED_RR at priority %d",
+ global_sched_param.sched_priority);
+ }
global_sched_param.sched_priority = 0;
#ifdef HAVE_QB_LOG_THREAD_PRIORITY_SET
@@ -898,15 +911,17 @@ static int corosync_set_rr_scheduler (void)
}
}
} else {
- LOGSYS_PERROR (errno, LOGSYS_LEVEL_WARNING,
- "Could not get maximum scheduler priority");
+ if (!silent) {
+ LOGSYS_PERROR (errno, LOGSYS_LEVEL_WARNING,
+ "Could not get maximum scheduler priority");
+ }
sched_priority = 0;
ret_val = -1;
}
#else
log_printf(LOGSYS_LEVEL_WARNING,
"The Platform is missing process priority setting features. Leaving at default.");
- ret_val = -1;
+ ret_val = -2;
#endif
return (ret_val);
@@ -1173,6 +1188,7 @@ error_close:
static int corosync_move_to_root_cgroup(void) {
FILE *f;
int res = -1;
+ const char *cgroup_task_fname = NULL;
/*
* /sys/fs/cgroup is hardcoded, because most of Linux distributions are now
@@ -1183,15 +1199,29 @@ static int corosync_move_to_root_cgroup(void) {
*/
f = fopen("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "rt");
if (f == NULL) {
- log_printf(LOGSYS_LEVEL_DEBUG, "cpu.rt_runtime_us doesn't exists -> "
- "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED");
+ /*
+ * Try cgroup v2
+ */
+ f = fopen("/sys/fs/cgroup/cgroup.procs", "rt");
+ if (f == NULL) {
+ log_printf(LOG_DEBUG, "cpu.rt_runtime_us or cgroup.procs doesn't exist -> "
+ "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED");
- res = 0;
- goto exit_res;
+ res = 0;
+ goto exit_res;
+ } else {
+ log_printf(LOGSYS_LEVEL_DEBUG, "Moving main pid to cgroup v2 root cgroup");
+
+ cgroup_task_fname = "/sys/fs/cgroup/cgroup.procs";
+ }
+ } else {
+ log_printf(LOGSYS_LEVEL_DEBUG, "Moving main pid to cgroup v1 root cgroup");
+
+ cgroup_task_fname = "/sys/fs/cgroup/cpu/tasks";
}
(void)fclose(f);
- f = fopen("/sys/fs/cgroup/cpu/tasks", "w");
+ f = fopen(cgroup_task_fname, "w");
if (f == NULL) {
log_printf(LOGSYS_LEVEL_WARNING, "Can't open cgroups tasks file for writing");
@@ -1256,7 +1286,8 @@ int main (int argc, char **argv, char **envp)
const char *error_string;
struct totem_config totem_config;
int res, ch;
- int background, sched_rr, prio, testonly, move_to_root_cgroup;
+ int background, sched_rr, prio, testonly;
+ enum move_to_root_cgroup_mode move_to_root_cgroup;
enum e_corosync_done flock_err;
uint64_t totem_config_warnings;
struct scheduler_pause_timeout_data scheduler_pause_timeout_data;
@@ -1264,6 +1295,7 @@ int main (int argc, char **argv, char **envp)
char *ep;
char *tmp_str;
int log_subsys_id_totem;
+ int silent;
/* default configuration
*/
@@ -1417,21 +1449,19 @@ int main (int argc, char **argv, char **envp)
}
- move_to_root_cgroup = 1;
+ move_to_root_cgroup = MOVE_TO_ROOT_CGROUP_MODE_AUTO;
if (icmap_get_string("system.move_to_root_cgroup", &tmp_str) == CS_OK) {
- if (strcmp(tmp_str, "yes") != 0) {
- move_to_root_cgroup = 0;
+ /*
+ * Validity of move_to_root_cgroup values checked in coroparse.c
+ */
+ if (strcmp(tmp_str, "yes") == 0) {
+ move_to_root_cgroup = MOVE_TO_ROOT_CGROUP_MODE_ON;
+ } else if (strcmp(tmp_str, "no") == 0) {
+ move_to_root_cgroup = MOVE_TO_ROOT_CGROUP_MODE_OFF;
}
free(tmp_str);
}
- /*
- * Try to move corosync into root cpu cgroup. Failure is not fatal and
- * error is deliberately ignored.
- */
- if (move_to_root_cgroup) {
- (void)corosync_move_to_root_cgroup();
- }
sched_rr = 1;
if (icmap_get_string("system.sched_rr", &tmp_str) == CS_OK) {
@@ -1462,11 +1492,31 @@ int main (int argc, char **argv, char **envp)
free(tmp_str);
}
+ if (move_to_root_cgroup == MOVE_TO_ROOT_CGROUP_MODE_ON) {
+ /*
+ * Try to move corosync into root cpu cgroup. Failure is not fatal and
+ * error is deliberately ignored.
+ */
+ (void)corosync_move_to_root_cgroup();
+ }
+
/*
* Set round robin realtime scheduling with priority 99
*/
if (sched_rr) {
- if (corosync_set_rr_scheduler () != 0) {
+ silent = (move_to_root_cgroup == MOVE_TO_ROOT_CGROUP_MODE_AUTO);
+ res = corosync_set_rr_scheduler (silent);
+
+ if (res == -1 && move_to_root_cgroup == MOVE_TO_ROOT_CGROUP_MODE_AUTO) {
+ /*
+ * Try to move process to root cgroup and try set priority again
+ */
+ (void)corosync_move_to_root_cgroup();
+
+ res = corosync_set_rr_scheduler (0);
+ }
+
+ if (res != 0) {
prio = INT_MIN;
} else {
prio = 0;
diff --git a/man/corosync.conf.5 b/man/corosync.conf.5
index 25289ba4..0588ad1e 100644
--- a/man/corosync.conf.5
+++ b/man/corosync.conf.5
@@ -1,6 +1,6 @@
.\"/*
.\" * Copyright (c) 2005 MontaVista Software, Inc.
-.\" * Copyright (c) 2006-2020 Red Hat, Inc.
+.\" * Copyright (c) 2006-2021 Red Hat, Inc.
.\" *
.\" * All rights reserved.
.\" *
@@ -32,7 +32,7 @@
.\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
.\" * THE POSSIBILITY OF SUCH DAMAGE.
.\" */
-.TH COROSYNC_CONF 5 2021-04-09 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual"
+.TH COROSYNC_CONF 5 2021-07-23 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual"
.SH NAME
corosync.conf - corosync executive configuration file
@@ -799,9 +799,37 @@ meaning maximal / minimal priority (so minimal / maximal nice value).
.TP
move_to_root_cgroup
-Should be set to yes (default) if corosync should try to move itself to root
-cgroup. This feature is available only for systems with cgroups with RT
-sched enabled (Linux with CONFIG_RT_GROUP_SCHED kernel option).
+Can be one of
+.B yes
+(Corosync always moves itself to root cgroup),
+.B no
+(Corosync never tries to move itself to root cgroup) or
+.B auto
+(Corosync first checks if sched_rr is enabled, and if
+so, it tries to set round robin realtime scheduling with maximal priority to itself.
+If setting of priority fails, corosync tries to move itself to root
+cgroup and retries setting of priority).
+
+This feature is available only for systems with cgroups v1 with RT
+sched enabled (Linux with CONFIG_RT_GROUP_SCHED kernel option) and cgroups v2.
+
+It's worth noting that currently (May 3 2021) cgroup2 doesn't yet
+support control of realtime processes and the cpu controller can only be
+enabled when all RT processes are in the root cgroup (applies only for kernel
+with CONFIG_RT_GROUP_SCHED enabled). So when move_to_root_cgroup
+is disabled, kernel is compiled with CONFIG_RT_GROUP_SCHED and systemd is used,
+it may be impossible to make systemd options
+like CPUQuota working correctly until corosync is stopped.
+
+Also when moving to root cgroup is enforced and used together with cgroup2 and systemd
+it makes it impossible (most of the time) for journald to add systemd specific
+metadata (most importantly _SYSTEMD_UNIT) properly, because corosync is
+moved out of cgroup created by systemd. This means
+it is not possible to filter corosync logged messages based on these metadata
+(for example using -u or _SYSTEMD_UNIT=UNIT pattern) and also running
+systemctl status doesn't display (all) corosync log messages.
+The problem is even worse because journald caches pid for some time
+(approx. 5 sec) so initial corosync messages have correct metadata.
.TP
allow_knet_handle_fallback
--
2.27.0