318 lines
11 KiB
Diff
318 lines
11 KiB
Diff
From c9996fdd0f4fa1fbf113b740eea01bcc70b235aa Mon Sep 17 00:00:00 2001
|
||
From: Jan Friesse <jfriesse@redhat.com>
|
||
Date: Mon, 3 May 2021 15:29:04 +0200
|
||
Subject: [PATCH] main: Add support for cgroup v2 and auto mode
|
||
|
||
Support for cgroup v2 is very similar to cgroup v1 just checking (and
|
||
writing) a different file.
|
||
|
||
Because of all the problems described later with cgroup v2 new "auto"
|
||
mode (new default) is added. This mode first tries to set rr scheduling
|
||
and moves Corosync to root cgroup only if it fails.
|
||
|
||
Testing this feature is a bit harder than with cgroup v1 so it's
|
||
probably worth noting in this commit message.
|
||
|
||
1. Copy some service file (I've used httpd service) and set
|
||
CPUQuota=30% in the [Service] section.
|
||
2. Check /sys/fs/cgroup/cgroup.subtree_control - there should be no
|
||
"cpu"
|
||
3. Start modified service
|
||
4. Check /sys/fs/cgroup/cgroup.subtree_control - there should be "cpu"
|
||
5. Start corosync - It should be able to get rt priority
|
||
|
||
When move_to_root_cgroup is disabled (applies only for kernels
|
||
with CONFIG_RT_GROUP_SCHED enabled), behavior differs:
|
||
- If corosync is started before modified service, so
|
||
there is no "cpu" in /sys/fs/cgroup/cgroup.subtree_control
|
||
corosync starts without problem and gets rt priority.
|
||
Starting modified service later will never add "cpu" into
|
||
/sys/fs/cgroup/cgroup.subtree_control (because corosync is holding
|
||
rt priority and it is placed in the non-root cgroup by systemd).
|
||
|
||
- When corosync is started after modified service, so "cpu"
|
||
is in /sys/fs/cgroup/cgroup.subtree_control, corosync is not
|
||
able to get RT priority.
|
||
|
||
It's worth noting problems when cgroup v2 is used together with systemd
|
||
logging described in corosync.conf(5) man page.
|
||
|
||
Signed-off-by: Jan Friesse <jfriesse@redhat.com>
|
||
Reviewed-by: Christine Caulfield <ccaulfie@redhat.com>
|
||
---
|
||
exec/coroparse.c | 3 +-
|
||
exec/main.c | 98 ++++++++++++++++++++++++++++++++++-----------
|
||
man/corosync.conf.5 | 38 +++++++++++++++---
|
||
3 files changed, 109 insertions(+), 30 deletions(-)
|
||
|
||
diff --git a/exec/coroparse.c b/exec/coroparse.c
|
||
index 741f3741..56b8034e 100644
|
||
--- a/exec/coroparse.c
|
||
+++ b/exec/coroparse.c
|
||
@@ -828,7 +828,8 @@ static int main_config_parser_cb(const char *path,
|
||
}
|
||
if (strcmp(path, "system.move_to_root_cgroup") == 0) {
|
||
if ((strcmp(value, "yes") != 0) &&
|
||
- (strcmp(value, "no") != 0)) {
|
||
+ (strcmp(value, "no") != 0) &&
|
||
+ (strcmp(value, "auto") != 0)) {
|
||
*error_string = "Invalid system.move_to_root_cgroup";
|
||
|
||
return (0);
|
||
diff --git a/exec/main.c b/exec/main.c
|
||
index aa6d9fbf..5fb4d47c 100644
|
||
--- a/exec/main.c
|
||
+++ b/exec/main.c
|
||
@@ -169,6 +169,12 @@ static char corosync_config_file[PATH_MAX + 1] = COROSYSCONFDIR "/corosync.conf"
|
||
|
||
static int lockfile_fd = -1;
|
||
|
||
+enum move_to_root_cgroup_mode {
|
||
+ MOVE_TO_ROOT_CGROUP_MODE_OFF = 0,
|
||
+ MOVE_TO_ROOT_CGROUP_MODE_ON = 1,
|
||
+ MOVE_TO_ROOT_CGROUP_MODE_AUTO = 2,
|
||
+};
|
||
+
|
||
qb_loop_t *cs_poll_handle_get (void)
|
||
{
|
||
return (corosync_poll_handle);
|
||
@@ -859,7 +865,12 @@ static void timer_function_scheduler_timeout (void *data)
|
||
}
|
||
|
||
|
||
-static int corosync_set_rr_scheduler (void)
|
||
+/*
|
||
+ * Set main pid RR scheduler.
|
||
+ * silent: don't log sched_get_priority_max and sched_setscheduler errors
|
||
+ * Returns: 0 - success, -1 failure, -2 platform doesn't support SCHED_RR
|
||
+ */
|
||
+static int corosync_set_rr_scheduler (int silent)
|
||
{
|
||
int ret_val = 0;
|
||
|
||
@@ -871,9 +882,11 @@ static int corosync_set_rr_scheduler (void)
|
||
global_sched_param.sched_priority = sched_priority;
|
||
res = sched_setscheduler (0, SCHED_RR, &global_sched_param);
|
||
if (res == -1) {
|
||
- LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING,
|
||
- "Could not set SCHED_RR at priority %d",
|
||
- global_sched_param.sched_priority);
|
||
+ if (!silent) {
|
||
+ LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING,
|
||
+ "Could not set SCHED_RR at priority %d",
|
||
+ global_sched_param.sched_priority);
|
||
+ }
|
||
|
||
global_sched_param.sched_priority = 0;
|
||
#ifdef HAVE_QB_LOG_THREAD_PRIORITY_SET
|
||
@@ -898,15 +911,17 @@ static int corosync_set_rr_scheduler (void)
|
||
}
|
||
}
|
||
} else {
|
||
- LOGSYS_PERROR (errno, LOGSYS_LEVEL_WARNING,
|
||
- "Could not get maximum scheduler priority");
|
||
+ if (!silent) {
|
||
+ LOGSYS_PERROR (errno, LOGSYS_LEVEL_WARNING,
|
||
+ "Could not get maximum scheduler priority");
|
||
+ }
|
||
sched_priority = 0;
|
||
ret_val = -1;
|
||
}
|
||
#else
|
||
log_printf(LOGSYS_LEVEL_WARNING,
|
||
"The Platform is missing process priority setting features. Leaving at default.");
|
||
- ret_val = -1;
|
||
+ ret_val = -2;
|
||
#endif
|
||
|
||
return (ret_val);
|
||
@@ -1173,6 +1188,7 @@ error_close:
|
||
static int corosync_move_to_root_cgroup(void) {
|
||
FILE *f;
|
||
int res = -1;
|
||
+ const char *cgroup_task_fname = NULL;
|
||
|
||
/*
|
||
* /sys/fs/cgroup is hardcoded, because most of Linux distributions are now
|
||
@@ -1183,15 +1199,29 @@ static int corosync_move_to_root_cgroup(void) {
|
||
*/
|
||
f = fopen("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "rt");
|
||
if (f == NULL) {
|
||
- log_printf(LOGSYS_LEVEL_DEBUG, "cpu.rt_runtime_us doesn't exists -> "
|
||
- "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED");
|
||
+ /*
|
||
+ * Try cgroup v2
|
||
+ */
|
||
+ f = fopen("/sys/fs/cgroup/cgroup.procs", "rt");
|
||
+ if (f == NULL) {
|
||
+ log_printf(LOG_DEBUG, "cpu.rt_runtime_us or cgroup.procs doesn't exist -> "
|
||
+ "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED");
|
||
|
||
- res = 0;
|
||
- goto exit_res;
|
||
+ res = 0;
|
||
+ goto exit_res;
|
||
+ } else {
|
||
+ log_printf(LOGSYS_LEVEL_DEBUG, "Moving main pid to cgroup v2 root cgroup");
|
||
+
|
||
+ cgroup_task_fname = "/sys/fs/cgroup/cgroup.procs";
|
||
+ }
|
||
+ } else {
|
||
+ log_printf(LOGSYS_LEVEL_DEBUG, "Moving main pid to cgroup v1 root cgroup");
|
||
+
|
||
+ cgroup_task_fname = "/sys/fs/cgroup/cpu/tasks";
|
||
}
|
||
(void)fclose(f);
|
||
|
||
- f = fopen("/sys/fs/cgroup/cpu/tasks", "w");
|
||
+ f = fopen(cgroup_task_fname, "w");
|
||
if (f == NULL) {
|
||
log_printf(LOGSYS_LEVEL_WARNING, "Can't open cgroups tasks file for writing");
|
||
|
||
@@ -1256,7 +1286,8 @@ int main (int argc, char **argv, char **envp)
|
||
const char *error_string;
|
||
struct totem_config totem_config;
|
||
int res, ch;
|
||
- int background, sched_rr, prio, testonly, move_to_root_cgroup;
|
||
+ int background, sched_rr, prio, testonly;
|
||
+ enum move_to_root_cgroup_mode move_to_root_cgroup;
|
||
enum e_corosync_done flock_err;
|
||
uint64_t totem_config_warnings;
|
||
struct scheduler_pause_timeout_data scheduler_pause_timeout_data;
|
||
@@ -1264,6 +1295,7 @@ int main (int argc, char **argv, char **envp)
|
||
char *ep;
|
||
char *tmp_str;
|
||
int log_subsys_id_totem;
|
||
+ int silent;
|
||
|
||
/* default configuration
|
||
*/
|
||
@@ -1417,21 +1449,19 @@ int main (int argc, char **argv, char **envp)
|
||
}
|
||
|
||
|
||
- move_to_root_cgroup = 1;
|
||
+ move_to_root_cgroup = MOVE_TO_ROOT_CGROUP_MODE_AUTO;
|
||
if (icmap_get_string("system.move_to_root_cgroup", &tmp_str) == CS_OK) {
|
||
- if (strcmp(tmp_str, "yes") != 0) {
|
||
- move_to_root_cgroup = 0;
|
||
+ /*
|
||
+ * Validity of move_to_root_cgroup values checked in coroparse.c
|
||
+ */
|
||
+ if (strcmp(tmp_str, "yes") == 0) {
|
||
+ move_to_root_cgroup = MOVE_TO_ROOT_CGROUP_MODE_ON;
|
||
+ } else if (strcmp(tmp_str, "no") == 0) {
|
||
+ move_to_root_cgroup = MOVE_TO_ROOT_CGROUP_MODE_OFF;
|
||
}
|
||
free(tmp_str);
|
||
}
|
||
|
||
- /*
|
||
- * Try to move corosync into root cpu cgroup. Failure is not fatal and
|
||
- * error is deliberately ignored.
|
||
- */
|
||
- if (move_to_root_cgroup) {
|
||
- (void)corosync_move_to_root_cgroup();
|
||
- }
|
||
|
||
sched_rr = 1;
|
||
if (icmap_get_string("system.sched_rr", &tmp_str) == CS_OK) {
|
||
@@ -1462,11 +1492,31 @@ int main (int argc, char **argv, char **envp)
|
||
free(tmp_str);
|
||
}
|
||
|
||
+ if (move_to_root_cgroup == MOVE_TO_ROOT_CGROUP_MODE_ON) {
|
||
+ /*
|
||
+ * Try to move corosync into root cpu cgroup. Failure is not fatal and
|
||
+ * error is deliberately ignored.
|
||
+ */
|
||
+ (void)corosync_move_to_root_cgroup();
|
||
+ }
|
||
+
|
||
/*
|
||
* Set round robin realtime scheduling with priority 99
|
||
*/
|
||
if (sched_rr) {
|
||
- if (corosync_set_rr_scheduler () != 0) {
|
||
+ silent = (move_to_root_cgroup == MOVE_TO_ROOT_CGROUP_MODE_AUTO);
|
||
+ res = corosync_set_rr_scheduler (silent);
|
||
+
|
||
+ if (res == -1 && move_to_root_cgroup == MOVE_TO_ROOT_CGROUP_MODE_AUTO) {
|
||
+ /*
|
||
+ * Try to move process to root cgroup and try set priority again
|
||
+ */
|
||
+ (void)corosync_move_to_root_cgroup();
|
||
+
|
||
+ res = corosync_set_rr_scheduler (0);
|
||
+ }
|
||
+
|
||
+ if (res != 0) {
|
||
prio = INT_MIN;
|
||
} else {
|
||
prio = 0;
|
||
diff --git a/man/corosync.conf.5 b/man/corosync.conf.5
|
||
index 25289ba4..0588ad1e 100644
|
||
--- a/man/corosync.conf.5
|
||
+++ b/man/corosync.conf.5
|
||
@@ -1,6 +1,6 @@
|
||
.\"/*
|
||
.\" * Copyright (c) 2005 MontaVista Software, Inc.
|
||
-.\" * Copyright (c) 2006-2020 Red Hat, Inc.
|
||
+.\" * Copyright (c) 2006-2021 Red Hat, Inc.
|
||
.\" *
|
||
.\" * All rights reserved.
|
||
.\" *
|
||
@@ -32,7 +32,7 @@
|
||
.\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
||
.\" * THE POSSIBILITY OF SUCH DAMAGE.
|
||
.\" */
|
||
-.TH COROSYNC_CONF 5 2021-04-09 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual"
|
||
+.TH COROSYNC_CONF 5 2021-07-23 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual"
|
||
.SH NAME
|
||
corosync.conf - corosync executive configuration file
|
||
|
||
@@ -799,9 +799,37 @@ meaning maximal / minimal priority (so minimal / maximal nice value).
|
||
|
||
.TP
|
||
move_to_root_cgroup
|
||
-Should be set to yes (default) if corosync should try to move itself to root
|
||
-cgroup. This feature is available only for systems with cgroups with RT
|
||
-sched enabled (Linux with CONFIG_RT_GROUP_SCHED kernel option).
|
||
+Can be one of
|
||
+.B yes
|
||
+(Corosync always moves itself to root cgroup),
|
||
+.B no
|
||
+(Corosync never tries to move itself to root cgroup) or
|
||
+.B auto
|
||
+(Corosync first checks if sched_rr is enabled, and if
|
||
+so, it tries to set round robin realtime scheduling with maximal priority to itself.
|
||
+If setting of priority fails, corosync tries to move itself to root
|
||
+cgroup and retries setting of priority).
|
||
+
|
||
+This feature is available only for systems with cgroups v1 with RT
|
||
+sched enabled (Linux with CONFIG_RT_GROUP_SCHED kernel option) and cgroups v2.
|
||
+
|
||
+It's worth noting that currently (May 3 2021) cgroup2 doesn't yet
|
||
+support control of realtime processes and the cpu controller can only be
|
||
+enabled when all RT processes are in the root cgroup (applies only for kernel
|
||
+with CONFIG_RT_GROUP_SCHED enabled). So when move_to_root_cgroup
|
||
+is disabled, kernel is compiled with CONFIG_RT_GROUP_SCHED and systemd is used,
|
||
+it may be impossible to make systemd options
|
||
+like CPUQuota work correctly until corosync is stopped.
|
||
+
|
||
+Also when moving to root cgroup is enforced and used together with cgroup2 and systemd
|
||
+it makes it impossible (most of the time) for journald to add systemd specific
|
||
+metadata (most importantly _SYSTEMD_UNIT) properly, because corosync is
|
||
+moved out of cgroup created by systemd. This means
|
||
+it is not possible to filter corosync logged messages based on these metadata
|
||
+(for example using -u or _SYSTEMD_UNIT=UNIT pattern) and also running
|
||
+systemctl status doesn't display (all) corosync log messages.
|
||
+The problem is even worse because journald caches pid for some time
|
||
+(approx. 5 sec) so initial corosync messages have correct metadata.
|
||
|
||
.TP
|
||
allow_knet_handle_fallback
|
||
--
|
||
2.27.0
|
||
|