318 lines
11 KiB
Diff
318 lines
11 KiB
Diff
|
From c9996fdd0f4fa1fbf113b740eea01bcc70b235aa Mon Sep 17 00:00:00 2001
|
|||
|
From: Jan Friesse <jfriesse@redhat.com>
|
|||
|
Date: Mon, 3 May 2021 15:29:04 +0200
|
|||
|
Subject: [PATCH] main: Add support for cgroup v2 and auto mode
|
|||
|
|
|||
|
Support for cgroup v2 is very similar to cgroup v1 just checking (and
|
|||
|
writing) a different file.
|
|||
|
|
|||
|
Because of all the problems described later with cgroup v2 new "auto"
|
|||
|
mode (new default) is added. This mode first tries to set rr scheduling
|
|||
|
and moves Corosync to root cgroup only if it fails.
|
|||
|
|
|||
|
Testing this feature is a bit harder than with cgroup v1 so it's
|
|||
|
probably worth noting in this commit message.
|
|||
|
|
|||
|
1. Copy some service file (I've used httpd service) and set
|
|||
|
CPUQuota=30% in the [Service] section.
|
|||
|
2. Check /sys/fs/cgroup/cgroup.subtree_control - there should be no
|
|||
|
"cpu"
|
|||
|
3. Start modified service
|
|||
|
4. Check /sys/fs/cgroup/cgroup.subtree_control - there should be "cpu"
|
|||
|
5. Start corosync - It should be able to get rt priority
|
|||
|
|
|||
|
When move_to_root_cgroup is disabled (applies only for kernels
|
|||
|
with CONFIG_RT_GROUP_SCHED enabled), behavior differs:
|
|||
|
- If corosync is started before modified service, so
|
|||
|
there is no "cpu" in /sys/fs/cgroup/cgroup.subtree_control
|
|||
|
corosync starts without problem and gets rt priority.
|
|||
|
Starting modified service later will never add "cpu" into
|
|||
|
/sys/fs/cgroup/cgroup.subtree_control (because corosync is holding
|
|||
|
rt priority and it is placed in the non-root cgroup by systemd).
|
|||
|
|
|||
|
- When corosync is started after modified service, so "cpu"
|
|||
|
is in /sys/fs/cgroup/cgroup.subtree_control, corosync is not
|
|||
|
able to get RT priority.
|
|||
|
|
|||
|
It's worth noting problems when cgroup v2 is used together with systemd
|
|||
|
logging described in corosync.conf(5) man page.
|
|||
|
|
|||
|
Signed-off-by: Jan Friesse <jfriesse@redhat.com>
|
|||
|
Reviewed-by: Christine Caulfield <ccaulfie@redhat.com>
|
|||
|
---
|
|||
|
exec/coroparse.c | 3 +-
|
|||
|
exec/main.c | 98 ++++++++++++++++++++++++++++++++++-----------
|
|||
|
man/corosync.conf.5 | 38 +++++++++++++++---
|
|||
|
3 files changed, 109 insertions(+), 30 deletions(-)
|
|||
|
|
|||
|
diff --git a/exec/coroparse.c b/exec/coroparse.c
|
|||
|
index 741f3741..56b8034e 100644
|
|||
|
--- a/exec/coroparse.c
|
|||
|
+++ b/exec/coroparse.c
|
|||
|
@@ -828,7 +828,8 @@ static int main_config_parser_cb(const char *path,
|
|||
|
}
|
|||
|
if (strcmp(path, "system.move_to_root_cgroup") == 0) {
|
|||
|
if ((strcmp(value, "yes") != 0) &&
|
|||
|
- (strcmp(value, "no") != 0)) {
|
|||
|
+ (strcmp(value, "no") != 0) &&
|
|||
|
+ (strcmp(value, "auto") != 0)) {
|
|||
|
*error_string = "Invalid system.move_to_root_cgroup";
|
|||
|
|
|||
|
return (0);
|
|||
|
diff --git a/exec/main.c b/exec/main.c
|
|||
|
index aa6d9fbf..5fb4d47c 100644
|
|||
|
--- a/exec/main.c
|
|||
|
+++ b/exec/main.c
|
|||
|
@@ -169,6 +169,12 @@ static char corosync_config_file[PATH_MAX + 1] = COROSYSCONFDIR "/corosync.conf"
|
|||
|
|
|||
|
static int lockfile_fd = -1;
|
|||
|
|
|||
|
+enum move_to_root_cgroup_mode {
|
|||
|
+ MOVE_TO_ROOT_CGROUP_MODE_OFF = 0,
|
|||
|
+ MOVE_TO_ROOT_CGROUP_MODE_ON = 1,
|
|||
|
+ MOVE_TO_ROOT_CGROUP_MODE_AUTO = 2,
|
|||
|
+};
|
|||
|
+
|
|||
|
qb_loop_t *cs_poll_handle_get (void)
|
|||
|
{
|
|||
|
return (corosync_poll_handle);
|
|||
|
@@ -859,7 +865,12 @@ static void timer_function_scheduler_timeout (void *data)
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
-static int corosync_set_rr_scheduler (void)
|
|||
|
+/*
|
|||
|
+ * Set main pid RR scheduler.
|
|||
|
+ * silent: don't log sched_get_priority_max and sched_setscheduler errors
|
|||
|
+ * Returns: 0 - success, -1 failure, -2 platform doesn't support SCHED_RR
|
|||
|
+ */
|
|||
|
+static int corosync_set_rr_scheduler (int silent)
|
|||
|
{
|
|||
|
int ret_val = 0;
|
|||
|
|
|||
|
@@ -871,9 +882,11 @@ static int corosync_set_rr_scheduler (void)
|
|||
|
global_sched_param.sched_priority = sched_priority;
|
|||
|
res = sched_setscheduler (0, SCHED_RR, &global_sched_param);
|
|||
|
if (res == -1) {
|
|||
|
- LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING,
|
|||
|
- "Could not set SCHED_RR at priority %d",
|
|||
|
- global_sched_param.sched_priority);
|
|||
|
+ if (!silent) {
|
|||
|
+ LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING,
|
|||
|
+ "Could not set SCHED_RR at priority %d",
|
|||
|
+ global_sched_param.sched_priority);
|
|||
|
+ }
|
|||
|
|
|||
|
global_sched_param.sched_priority = 0;
|
|||
|
#ifdef HAVE_QB_LOG_THREAD_PRIORITY_SET
|
|||
|
@@ -898,15 +911,17 @@ static int corosync_set_rr_scheduler (void)
|
|||
|
}
|
|||
|
}
|
|||
|
} else {
|
|||
|
- LOGSYS_PERROR (errno, LOGSYS_LEVEL_WARNING,
|
|||
|
- "Could not get maximum scheduler priority");
|
|||
|
+ if (!silent) {
|
|||
|
+ LOGSYS_PERROR (errno, LOGSYS_LEVEL_WARNING,
|
|||
|
+ "Could not get maximum scheduler priority");
|
|||
|
+ }
|
|||
|
sched_priority = 0;
|
|||
|
ret_val = -1;
|
|||
|
}
|
|||
|
#else
|
|||
|
log_printf(LOGSYS_LEVEL_WARNING,
|
|||
|
"The Platform is missing process priority setting features. Leaving at default.");
|
|||
|
- ret_val = -1;
|
|||
|
+ ret_val = -2;
|
|||
|
#endif
|
|||
|
|
|||
|
return (ret_val);
|
|||
|
@@ -1173,6 +1188,7 @@ error_close:
|
|||
|
static int corosync_move_to_root_cgroup(void) {
|
|||
|
FILE *f;
|
|||
|
int res = -1;
|
|||
|
+ const char *cgroup_task_fname = NULL;
|
|||
|
|
|||
|
/*
|
|||
|
* /sys/fs/cgroup is hardcoded, because most of Linux distributions are now
|
|||
|
@@ -1183,15 +1199,29 @@ static int corosync_move_to_root_cgroup(void) {
|
|||
|
*/
|
|||
|
f = fopen("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "rt");
|
|||
|
if (f == NULL) {
|
|||
|
- log_printf(LOGSYS_LEVEL_DEBUG, "cpu.rt_runtime_us doesn't exists -> "
|
|||
|
- "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED");
|
|||
|
+ /*
|
|||
|
+ * Try cgroup v2
|
|||
|
+ */
|
|||
|
+ f = fopen("/sys/fs/cgroup/cgroup.procs", "rt");
|
|||
|
+ if (f == NULL) {
|
|||
|
+ log_printf(LOG_DEBUG, "cpu.rt_runtime_us or cgroup.procs doesn't exist -> "
|
|||
|
+ "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED");
|
|||
|
|
|||
|
- res = 0;
|
|||
|
- goto exit_res;
|
|||
|
+ res = 0;
|
|||
|
+ goto exit_res;
|
|||
|
+ } else {
|
|||
|
+ log_printf(LOGSYS_LEVEL_DEBUG, "Moving main pid to cgroup v2 root cgroup");
|
|||
|
+
|
|||
|
+ cgroup_task_fname = "/sys/fs/cgroup/cgroup.procs";
|
|||
|
+ }
|
|||
|
+ } else {
|
|||
|
+ log_printf(LOGSYS_LEVEL_DEBUG, "Moving main pid to cgroup v1 root cgroup");
|
|||
|
+
|
|||
|
+ cgroup_task_fname = "/sys/fs/cgroup/cpu/tasks";
|
|||
|
}
|
|||
|
(void)fclose(f);
|
|||
|
|
|||
|
- f = fopen("/sys/fs/cgroup/cpu/tasks", "w");
|
|||
|
+ f = fopen(cgroup_task_fname, "w");
|
|||
|
if (f == NULL) {
|
|||
|
log_printf(LOGSYS_LEVEL_WARNING, "Can't open cgroups tasks file for writing");
|
|||
|
|
|||
|
@@ -1256,7 +1286,8 @@ int main (int argc, char **argv, char **envp)
|
|||
|
const char *error_string;
|
|||
|
struct totem_config totem_config;
|
|||
|
int res, ch;
|
|||
|
- int background, sched_rr, prio, testonly, move_to_root_cgroup;
|
|||
|
+ int background, sched_rr, prio, testonly;
|
|||
|
+ enum move_to_root_cgroup_mode move_to_root_cgroup;
|
|||
|
enum e_corosync_done flock_err;
|
|||
|
uint64_t totem_config_warnings;
|
|||
|
struct scheduler_pause_timeout_data scheduler_pause_timeout_data;
|
|||
|
@@ -1264,6 +1295,7 @@ int main (int argc, char **argv, char **envp)
|
|||
|
char *ep;
|
|||
|
char *tmp_str;
|
|||
|
int log_subsys_id_totem;
|
|||
|
+ int silent;
|
|||
|
|
|||
|
/* default configuration
|
|||
|
*/
|
|||
|
@@ -1417,21 +1449,19 @@ int main (int argc, char **argv, char **envp)
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
- move_to_root_cgroup = 1;
|
|||
|
+ move_to_root_cgroup = MOVE_TO_ROOT_CGROUP_MODE_AUTO;
|
|||
|
if (icmap_get_string("system.move_to_root_cgroup", &tmp_str) == CS_OK) {
|
|||
|
- if (strcmp(tmp_str, "yes") != 0) {
|
|||
|
- move_to_root_cgroup = 0;
|
|||
|
+ /*
|
|||
|
+ * Validity of move_to_root_cgroup values checked in coroparse.c
|
|||
|
+ */
|
|||
|
+ if (strcmp(tmp_str, "yes") == 0) {
|
|||
|
+ move_to_root_cgroup = MOVE_TO_ROOT_CGROUP_MODE_ON;
|
|||
|
+ } else if (strcmp(tmp_str, "no") == 0) {
|
|||
|
+ move_to_root_cgroup = MOVE_TO_ROOT_CGROUP_MODE_OFF;
|
|||
|
}
|
|||
|
free(tmp_str);
|
|||
|
}
|
|||
|
|
|||
|
- /*
|
|||
|
- * Try to move corosync into root cpu cgroup. Failure is not fatal and
|
|||
|
- * error is deliberately ignored.
|
|||
|
- */
|
|||
|
- if (move_to_root_cgroup) {
|
|||
|
- (void)corosync_move_to_root_cgroup();
|
|||
|
- }
|
|||
|
|
|||
|
sched_rr = 1;
|
|||
|
if (icmap_get_string("system.sched_rr", &tmp_str) == CS_OK) {
|
|||
|
@@ -1462,11 +1492,31 @@ int main (int argc, char **argv, char **envp)
|
|||
|
free(tmp_str);
|
|||
|
}
|
|||
|
|
|||
|
+ if (move_to_root_cgroup == MOVE_TO_ROOT_CGROUP_MODE_ON) {
|
|||
|
+ /*
|
|||
|
+ * Try to move corosync into root cpu cgroup. Failure is not fatal and
|
|||
|
+ * error is deliberately ignored.
|
|||
|
+ */
|
|||
|
+ (void)corosync_move_to_root_cgroup();
|
|||
|
+ }
|
|||
|
+
|
|||
|
/*
|
|||
|
* Set round robin realtime scheduling with priority 99
|
|||
|
*/
|
|||
|
if (sched_rr) {
|
|||
|
- if (corosync_set_rr_scheduler () != 0) {
|
|||
|
+ silent = (move_to_root_cgroup == MOVE_TO_ROOT_CGROUP_MODE_AUTO);
|
|||
|
+ res = corosync_set_rr_scheduler (silent);
|
|||
|
+
|
|||
|
+ if (res == -1 && move_to_root_cgroup == MOVE_TO_ROOT_CGROUP_MODE_AUTO) {
|
|||
|
+ /*
|
|||
|
+ * Try to move process to root cgroup and try set priority again
|
|||
|
+ */
|
|||
|
+ (void)corosync_move_to_root_cgroup();
|
|||
|
+
|
|||
|
+ res = corosync_set_rr_scheduler (0);
|
|||
|
+ }
|
|||
|
+
|
|||
|
+ if (res != 0) {
|
|||
|
prio = INT_MIN;
|
|||
|
} else {
|
|||
|
prio = 0;
|
|||
|
diff --git a/man/corosync.conf.5 b/man/corosync.conf.5
|
|||
|
index 25289ba4..0588ad1e 100644
|
|||
|
--- a/man/corosync.conf.5
|
|||
|
+++ b/man/corosync.conf.5
|
|||
|
@@ -1,6 +1,6 @@
|
|||
|
.\"/*
|
|||
|
.\" * Copyright (c) 2005 MontaVista Software, Inc.
|
|||
|
-.\" * Copyright (c) 2006-2020 Red Hat, Inc.
|
|||
|
+.\" * Copyright (c) 2006-2021 Red Hat, Inc.
|
|||
|
.\" *
|
|||
|
.\" * All rights reserved.
|
|||
|
.\" *
|
|||
|
@@ -32,7 +32,7 @@
|
|||
|
.\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
|||
|
.\" * THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
.\" */
|
|||
|
-.TH COROSYNC_CONF 5 2021-04-09 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual"
|
|||
|
+.TH COROSYNC_CONF 5 2021-07-23 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual"
|
|||
|
.SH NAME
|
|||
|
corosync.conf - corosync executive configuration file
|
|||
|
|
|||
|
@@ -799,9 +799,37 @@ meaning maximal / minimal priority (so minimal / maximal nice value).
|
|||
|
|
|||
|
.TP
|
|||
|
move_to_root_cgroup
|
|||
|
-Should be set to yes (default) if corosync should try to move itself to root
|
|||
|
-cgroup. This feature is available only for systems with cgroups with RT
|
|||
|
-sched enabled (Linux with CONFIG_RT_GROUP_SCHED kernel option).
|
|||
|
+Can be one of
|
|||
|
+.B yes
|
|||
|
+(Corosync always moves itself to root cgroup),
|
|||
|
+.B no
|
|||
|
+(Corosync never tries to move itself to root cgroup) or
|
|||
|
+.B auto
|
|||
|
+(Corosync first checks if sched_rr is enabled, and if
|
|||
|
+so, it tries to set round robin realtime scheduling with maximal priority to itself.
|
|||
|
+If setting of priority fails, corosync tries to move itself to root
|
|||
|
+cgroup and retries setting of priority).
|
|||
|
+
|
|||
|
+This feature is available only for systems with cgroups v1 with RT
|
|||
|
+sched enabled (Linux with CONFIG_RT_GROUP_SCHED kernel option) and cgroups v2.
|
|||
|
+
|
|||
|
+It's worth noting that currently (May 3 2021) cgroup2 doesn't yet
|
|||
|
+support control of realtime processes and the cpu controller can only be
|
|||
|
+enabled when all RT processes are in the root cgroup (applies only to kernels
|
|||
|
+with CONFIG_RT_GROUP_SCHED enabled). So when move_to_root_cgroup
|
|||
|
+is disabled, kernel is compiled with CONFIG_RT_GROUP_SCHED and systemd is used,
|
|||
|
+it may be impossible to make systemd options
|
|||
|
+like CPUQuota work correctly until corosync is stopped.
|
|||
|
+
|
|||
|
+Also when moving to root cgroup is enforced and used together with cgroup2 and systemd
|
|||
|
+it makes it impossible (most of the time) for journald to add systemd specific
|
|||
|
+metadata (most importantly _SYSTEMD_UNIT) properly, because corosync is
|
|||
|
+moved out of cgroup created by systemd. This means
|
|||
|
+it is not possible to filter corosync logged messages based on these metadata
|
|||
|
+(for example using -u or _SYSTEMD_UNIT=UNIT pattern) and also running
|
|||
|
+systemctl status doesn't display (all) corosync log messages.
|
|||
|
+The problem is even worse because journald caches pid for some time
|
|||
|
+(approx. 5 sec) so initial corosync messages have correct metadata.
|
|||
|
|
|||
|
.TP
|
|||
|
allow_knet_handle_fallback
|
|||
|
--
|
|||
|
2.27.0
|
|||
|
|