pacemaker/SOURCES/013-pacemakerd.patch
CentOS Sources f50b06c409 import pacemaker-2.0.4-6.el8
# Conflicts:
#	.gitignore
#	.pacemaker.metadata
#	SPECS/pacemaker.spec
2021-08-28 18:45:35 +00:00

1073 lines
36 KiB
Diff

From b49880467c18ade43cc283036949b686d1413118 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 14 Apr 2020 14:12:22 -0500
Subject: [PATCH 1/5] Low: pacemakerd: remove unneeded IPC and CPG commands
Now that the controller handles crm_node's -l/-p needs, pacemakerd no longer
needs to. Backward compatibility isn't an issue because pacemakerd IPC isn't
proxied for Pacemaker Remote, so only local clients are relevant, and the
pacemakerd IPC never had a C API, which means it was internal only.
Without those commands, pacemakerd no longer needs to track a process mask,
connect to CPG, or maintain the peer cache.
The only remaining need for the cluster layer is to use corosync CFG to tell
corosync to initiate or block shutdown.
---
daemons/pacemakerd/pacemakerd.c | 283 ++----------------------------------
daemons/pacemakerd/pacemakerd.h | 2 +-
daemons/pacemakerd/pcmkd_corosync.c | 7 +-
3 files changed, 16 insertions(+), 276 deletions(-)
diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c
index 64c30e2..0f7b2db 100644
--- a/daemons/pacemakerd/pacemakerd.c
+++ b/daemons/pacemakerd/pacemakerd.c
@@ -36,15 +36,12 @@
#include <dirent.h>
#include <ctype.h>
-static gboolean pcmk_quorate = FALSE;
static gboolean fatal_error = FALSE;
static GMainLoop *mainloop = NULL;
static bool global_keep_tracking = false;
#define PCMK_PROCESS_CHECK_INTERVAL 5
-static const char *local_name = NULL;
-static uint32_t local_nodeid = 0;
static crm_trigger_t *shutdown_trigger = NULL;
static const char *pid_file = PCMK_RUN_DIR "/pacemaker.pid";
@@ -105,23 +102,6 @@ static pcmk_child_t pcmk_children[] = {
static gboolean check_active_before_startup_processes(gpointer user_data);
static int child_liveness(pcmk_child_t *child);
static gboolean start_child(pcmk_child_t * child);
-static gboolean update_node_processes(uint32_t id, const char *uname,
- uint32_t procs);
-void update_process_clients(pcmk__client_t *client);
-
-static uint32_t
-get_process_list(void)
-{
- int lpc = 0;
- uint32_t procs = crm_get_cluster_proc();
-
- for (lpc = 0; lpc < SIZEOF(pcmk_children); lpc++) {
- if (pcmk_children[lpc].pid != 0) {
- procs |= pcmk_children[lpc].flag;
- }
- }
- return procs;
-}
static void
pcmk_process_exit(pcmk_child_t * child)
@@ -129,16 +109,6 @@ pcmk_process_exit(pcmk_child_t * child)
child->pid = 0;
child->active_before_startup = FALSE;
- /* Broadcast the fact that one of our processes died ASAP
- *
- * Try to get some logging of the cause out first though
- * because we're probably about to get fenced
- *
- * Potentially do this only if respawn_count > N
- * to allow for local recovery
- */
- update_node_processes(local_nodeid, NULL, get_process_list());
-
child->respawn_count += 1;
if (child->respawn_count > MAX_RESPAWN) {
crm_err("Child respawn count exceeded by %s", child->name);
@@ -148,8 +118,6 @@ pcmk_process_exit(pcmk_child_t * child)
if (shutdown_trigger) {
/* resume step-wise shutdown (returned TRUE yields no parallelizing) */
mainloop_set_trigger(shutdown_trigger);
- /* intended to speed up propagating expected lay-off of the daemons? */
- update_node_processes(local_nodeid, NULL, get_process_list());
} else if (!child->respawn) {
/* nothing to do */
@@ -341,7 +309,6 @@ start_child(pcmk_child_t * child)
crm_info("Forked child %lld for process %s%s",
(long long) child->pid, child->name,
use_valgrind ? " (valgrind enabled: " VALGRIND_BIN ")" : "");
- update_node_processes(local_nodeid, NULL, get_process_list());
return TRUE;
} else {
@@ -492,7 +459,6 @@ pcmk_shutdown_worker(gpointer user_data)
}
}
- /* send_cluster_id(); */
crm_notice("Shutdown complete");
{
@@ -567,22 +533,12 @@ pcmk_ipc_dispatch(qb_ipcs_connection_t * qbc, void *data, size_t size)
pcmk_shutdown(15);
} else if (crm_str_eq(task, CRM_OP_RM_NODE_CACHE, TRUE)) {
- /* Send to everyone */
- struct iovec *iov;
- int id = 0;
- const char *name = NULL;
-
- crm_element_value_int(msg, XML_ATTR_ID, &id);
- name = crm_element_value(msg, XML_ATTR_UNAME);
- crm_notice("Instructing peers to remove references to node %s/%u", name, id);
-
- iov = calloc(1, sizeof(struct iovec));
- iov->iov_base = dump_xml_unformatted(msg);
- iov->iov_len = 1 + strlen(iov->iov_base);
- send_cpg_iov(iov);
+ crm_trace("Ignoring IPC request to purge node "
+ "because peer cache is not used");
} else {
- update_process_clients(c);
+ crm_debug("Unrecognized IPC command '%s' sent to pacemakerd",
+ crm_str(task));
}
free_xml(msg);
@@ -618,113 +574,6 @@ struct qb_ipcs_service_handlers mcp_ipc_callbacks = {
.connection_destroyed = pcmk_ipc_destroy
};
-static void
-send_xml_to_client(gpointer key, gpointer value, gpointer user_data)
-{
- pcmk__ipc_send_xml((pcmk__client_t *) value, 0, (xmlNode *) user_data,
- crm_ipc_server_event);
-}
-
-/*!
- * \internal
- * \brief Send an XML message with process list of all known peers to client(s)
- *
- * \param[in] client Send message to this client, or all clients if NULL
- */
-void
-update_process_clients(pcmk__client_t *client)
-{
- GHashTableIter iter;
- crm_node_t *node = NULL;
- xmlNode *update = create_xml_node(NULL, "nodes");
-
- if (is_corosync_cluster()) {
- crm_xml_add_int(update, "quorate", pcmk_quorate);
- }
-
- g_hash_table_iter_init(&iter, crm_peer_cache);
- while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & node)) {
- xmlNode *xml = create_xml_node(update, "node");
-
- crm_xml_add_int(xml, "id", node->id);
- crm_xml_add(xml, "uname", node->uname);
- crm_xml_add(xml, "state", node->state);
- crm_xml_add_int(xml, "processes", node->processes);
- }
-
- if(client) {
- crm_trace("Sending process list to client %s", client->id);
- send_xml_to_client(NULL, client, update);
-
- } else {
- crm_trace("Sending process list to %d clients",
- pcmk__ipc_client_count());
- pcmk__foreach_ipc_client(send_xml_to_client, update);
- }
- free_xml(update);
-}
-
-/*!
- * \internal
- * \brief Send a CPG message with local node's process list to all peers
- */
-static void
-update_process_peers(void)
-{
- /* Do nothing for corosync-2 based clusters */
-
- struct iovec *iov = calloc(1, sizeof(struct iovec));
-
- CRM_ASSERT(iov);
- if (local_name) {
- iov->iov_base = crm_strdup_printf("<node uname=\"%s\" proclist=\"%u\"/>",
- local_name, get_process_list());
- } else {
- iov->iov_base = crm_strdup_printf("<node proclist=\"%u\"/>",
- get_process_list());
- }
- iov->iov_len = strlen(iov->iov_base) + 1;
- crm_trace("Sending %s", (char*) iov->iov_base);
- send_cpg_iov(iov);
-}
-
-/*!
- * \internal
- * \brief Update a node's process list, notifying clients and peers if needed
- *
- * \param[in] id Node ID of affected node
- * \param[in] uname Uname of affected node
- * \param[in] procs Affected node's process list mask
- *
- * \return TRUE if the process list changed, FALSE otherwise
- */
-static gboolean
-update_node_processes(uint32_t id, const char *uname, uint32_t procs)
-{
- gboolean changed = FALSE;
- crm_node_t *node = crm_get_peer(id, uname);
-
- if (procs != 0) {
- if (procs != node->processes) {
- crm_debug("Node %s now has process list: %.32x (was %.32x)",
- node->uname, procs, node->processes);
- node->processes = procs;
- changed = TRUE;
-
- /* If local node's processes have changed, notify clients/peers */
- if (id == local_nodeid) {
- update_process_clients(NULL);
- update_process_peers();
- }
-
- } else {
- crm_trace("Node %s still has process list: %.32x", node->uname, procs);
- }
- }
- return changed;
-}
-
-
static pcmk__cli_option_t long_options[] = {
// long option, argument type, storage, short option, description, flags
{
@@ -1126,91 +975,6 @@ init_children_processes(void)
setenv("PCMK_respawned", "true", 1);
}
-static void
-mcp_cpg_destroy(gpointer user_data)
-{
- crm_crit("Lost connection to cluster layer, shutting down");
- crm_exit(CRM_EX_DISCONNECT);
-}
-
-/*!
- * \internal
- * \brief Process a CPG message (process list or manual peer cache removal)
- *
- * \param[in] handle CPG connection (ignored)
- * \param[in] groupName CPG group name (ignored)
- * \param[in] nodeid ID of affected node
- * \param[in] pid Process ID (ignored)
- * \param[in] msg CPG XML message
- * \param[in] msg_len Length of msg in bytes (ignored)
- */
-static void
-mcp_cpg_deliver(cpg_handle_t handle,
- const struct cpg_name *groupName,
- uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len)
-{
- xmlNode *xml = string2xml(msg);
- const char *task = crm_element_value(xml, F_CRM_TASK);
-
- crm_trace("Received CPG message (%s): %.200s",
- (task? task : "process list"), (char*)msg);
-
- if (task == NULL) {
- if (nodeid == local_nodeid) {
- crm_debug("Ignoring message with local node's process list");
- } else {
- uint32_t procs = 0;
- const char *uname = crm_element_value(xml, "uname");
-
- crm_element_value_int(xml, "proclist", (int *)&procs);
- if (update_node_processes(nodeid, uname, procs)) {
- update_process_clients(NULL);
- }
- }
-
- } else if (crm_str_eq(task, CRM_OP_RM_NODE_CACHE, TRUE)) {
- int id = 0;
- const char *name = NULL;
-
- crm_element_value_int(xml, XML_ATTR_ID, &id);
- name = crm_element_value(xml, XML_ATTR_UNAME);
- reap_crm_member(id, name);
- }
-
- if (xml != NULL) {
- free_xml(xml);
- }
-}
-
-static void
-mcp_cpg_membership(cpg_handle_t handle,
- const struct cpg_name *groupName,
- const struct cpg_address *member_list, size_t member_list_entries,
- const struct cpg_address *left_list, size_t left_list_entries,
- const struct cpg_address *joined_list, size_t joined_list_entries)
-{
- /* Update peer cache if needed */
- pcmk_cpg_membership(handle, groupName, member_list, member_list_entries,
- left_list, left_list_entries,
- joined_list, joined_list_entries);
-
- /* Always broadcast our own presence after any membership change */
- update_process_peers();
-}
-
-static gboolean
-mcp_quorum_callback(unsigned long long seq, gboolean quorate)
-{
- pcmk_quorate = quorate;
- return TRUE;
-}
-
-static void
-mcp_quorum_destroy(gpointer user_data)
-{
- crm_info("connection lost");
-}
-
int
main(int argc, char **argv)
{
@@ -1226,7 +990,6 @@ main(int argc, char **argv)
struct rlimit cores;
crm_ipc_t *old_instance = NULL;
qb_ipcs_service_t *ipcs = NULL;
- static crm_cluster_t cluster;
crm_log_preinit(NULL, argc, argv);
pcmk__set_cli_options(NULL, "[options]", long_options,
@@ -1397,7 +1160,7 @@ main(int argc, char **argv)
}
/* Allows us to block shutdown */
- if (cluster_connect_cfg(&local_nodeid) == FALSE) {
+ if (!cluster_connect_cfg()) {
crm_err("Couldn't connect to Corosync's CFG service");
crm_exit(CRM_EX_PROTOCOL);
}
@@ -1417,34 +1180,13 @@ main(int argc, char **argv)
crm_exit(CRM_EX_FATAL);
};
- cluster.destroy = mcp_cpg_destroy;
- cluster.cpg.cpg_deliver_fn = mcp_cpg_deliver;
- cluster.cpg.cpg_confchg_fn = mcp_cpg_membership;
-
- crm_set_autoreap(FALSE);
+ mainloop_add_signal(SIGTERM, pcmk_shutdown);
+ mainloop_add_signal(SIGINT, pcmk_shutdown);
- rc = pcmk_ok;
+ init_children_processes();
- if (cluster_connect_cpg(&cluster) == FALSE) {
- crm_err("Couldn't connect to Corosync's CPG service");
- rc = -ENOPROTOOPT;
-
- } else if (cluster_connect_quorum(mcp_quorum_callback, mcp_quorum_destroy)
- == FALSE) {
- rc = -ENOTCONN;
-
- } else {
- local_name = get_local_node_name();
- update_node_processes(local_nodeid, local_name, get_process_list());
-
- mainloop_add_signal(SIGTERM, pcmk_shutdown);
- mainloop_add_signal(SIGINT, pcmk_shutdown);
-
- init_children_processes();
-
- crm_notice("Pacemaker daemon successfully started and accepting connections");
- g_main_loop_run(mainloop);
- }
+ crm_notice("Pacemaker daemon successfully started and accepting connections");
+ g_main_loop_run(mainloop);
if (ipcs) {
crm_trace("Closing IPC server");
@@ -1453,9 +1195,6 @@ main(int argc, char **argv)
}
g_main_loop_unref(mainloop);
-
- cluster_disconnect_cpg(&cluster);
cluster_disconnect_cfg();
-
- crm_exit(crm_errno2exit(rc));
+ crm_exit(CRM_EX_OK);
}
diff --git a/daemons/pacemakerd/pacemakerd.h b/daemons/pacemakerd/pacemakerd.h
index d66ab10..ac2d842 100644
--- a/daemons/pacemakerd/pacemakerd.h
+++ b/daemons/pacemakerd/pacemakerd.h
@@ -22,7 +22,7 @@
gboolean mcp_read_config(void);
-gboolean cluster_connect_cfg(uint32_t * nodeid);
+gboolean cluster_connect_cfg(void);
gboolean cluster_disconnect_cfg(void);
void pcmk_shutdown(int nsig);
diff --git a/daemons/pacemakerd/pcmkd_corosync.c b/daemons/pacemakerd/pcmkd_corosync.c
index ec74908..156f965 100644
--- a/daemons/pacemakerd/pcmkd_corosync.c
+++ b/daemons/pacemakerd/pcmkd_corosync.c
@@ -93,13 +93,14 @@ cluster_disconnect_cfg(void)
} while(counter < max)
gboolean
-cluster_connect_cfg(uint32_t * nodeid)
+cluster_connect_cfg(void)
{
cs_error_t rc;
int fd = -1, retries = 0, rv;
uid_t found_uid = 0;
gid_t found_gid = 0;
pid_t found_pid = 0;
+ uint32_t nodeid;
static struct mainloop_fd_callbacks cfg_fd_callbacks = {
.dispatch = pcmk_cfg_dispatch,
@@ -134,14 +135,14 @@ cluster_connect_cfg(uint32_t * nodeid)
}
retries = 0;
- cs_repeat(retries, 30, rc = corosync_cfg_local_get(cfg_handle, nodeid));
+ cs_repeat(retries, 30, rc = corosync_cfg_local_get(cfg_handle, &nodeid));
if (rc != CS_OK) {
crm_err("corosync cfg local_get error %d", rc);
goto bail;
}
- crm_debug("Our nodeid: %d", *nodeid);
+ crm_debug("Our nodeid: %lu", (unsigned long) nodeid);
mainloop_add_fd("corosync-cfg", G_PRIORITY_DEFAULT, fd, &cfg_handle, &cfg_fd_callbacks);
return TRUE;
--
1.8.3.1
From 23ad3803a12189a369d188f3d3e606142cf16c52 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 2 Jul 2020 11:25:28 -0500
Subject: [PATCH 2/5] Refactor: pacemakerd: functionize removing core file
limit
... for readability and code isolation
---
daemons/pacemakerd/pacemakerd.c | 50 ++++++++++++++++++++++++-----------------
1 file changed, 29 insertions(+), 21 deletions(-)
diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c
index 0f7b2db..ac308e7 100644
--- a/daemons/pacemakerd/pacemakerd.c
+++ b/daemons/pacemakerd/pacemakerd.c
@@ -975,10 +975,37 @@ init_children_processes(void)
setenv("PCMK_respawned", "true", 1);
}
+static void
+remove_core_file_limit(void)
+{
+ struct rlimit cores;
+ int rc = getrlimit(RLIMIT_CORE, &cores);
+
+ if (rc < 0) {
+ crm_perror(LOG_ERR, "Cannot determine current maximum core size.");
+ return;
+ }
+
+ if ((cores.rlim_max == 0) && (geteuid() == 0)) {
+ cores.rlim_max = RLIM_INFINITY;
+ } else {
+ crm_info("Maximum core file size is %llu bytes",
+ (unsigned long long) cores.rlim_max);
+ }
+ cores.rlim_cur = cores.rlim_max;
+
+ rc = setrlimit(RLIMIT_CORE, &cores);
+ if (rc < 0) {
+ crm_perror(LOG_ERR,
+ "Core file generation will remain disabled."
+ " Core files are an important diagnostic tool, so"
+ " please consider enabling them by default.");
+ }
+}
+
int
main(int argc, char **argv)
{
- int rc;
int flag;
int argerr = 0;
@@ -987,7 +1014,6 @@ main(int argc, char **argv)
uid_t pcmk_uid = 0;
gid_t pcmk_gid = 0;
- struct rlimit cores;
crm_ipc_t *old_instance = NULL;
qb_ipcs_service_t *ipcs = NULL;
@@ -1099,25 +1125,7 @@ main(int argc, char **argv)
PACEMAKER_VERSION, BUILD_VERSION, CRM_FEATURES);
mainloop = g_main_loop_new(NULL, FALSE);
- rc = getrlimit(RLIMIT_CORE, &cores);
- if (rc < 0) {
- crm_perror(LOG_ERR, "Cannot determine current maximum core size.");
- } else {
- if (cores.rlim_max == 0 && geteuid() == 0) {
- cores.rlim_max = RLIM_INFINITY;
- } else {
- crm_info("Maximum core file size is: %lu", (unsigned long)cores.rlim_max);
- }
- cores.rlim_cur = cores.rlim_max;
-
- rc = setrlimit(RLIMIT_CORE, &cores);
- if (rc < 0) {
- crm_perror(LOG_ERR,
- "Core file generation will remain disabled."
- " Core files are an important diagnostic tool, so"
- " please consider enabling them by default.");
- }
- }
+ remove_core_file_limit();
if (pcmk_daemon_user(&pcmk_uid, &pcmk_gid) < 0) {
crm_err("Cluster user %s does not exist, aborting Pacemaker startup", CRM_DAEMON_USER);
--
1.8.3.1
From 40b0891dc92767aad8495121afcbd6e68fd3830a Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 2 Jul 2020 11:26:09 -0500
Subject: [PATCH 3/5] Log: pacemakerd: improve messages
---
daemons/pacemakerd/pacemakerd.c | 22 +++++-----
daemons/pacemakerd/pcmkd_corosync.c | 85 +++++++++++++++++--------------------
2 files changed, 50 insertions(+), 57 deletions(-)
diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c
index ac308e7..0f459c0 100644
--- a/daemons/pacemakerd/pacemakerd.c
+++ b/daemons/pacemakerd/pacemakerd.c
@@ -13,6 +13,7 @@
#include <pwd.h>
#include <grp.h>
#include <poll.h>
+#include <errno.h>
#include <stdio.h>
#include <stdbool.h>
#include <sys/stat.h>
@@ -344,7 +345,7 @@ start_child(pcmk_child_t * child)
// Drop root group access if not needed
if (!need_root_group && (setgid(gid) < 0)) {
- crm_perror(LOG_ERR, "Could not set group to %d", gid);
+ crm_warn("Could not set group to %d: %s", gid, strerror(errno));
}
/* Initialize supplementary groups to only those always granted to
@@ -356,7 +357,8 @@ start_child(pcmk_child_t * child)
}
if (uid && setuid(uid) < 0) {
- crm_perror(LOG_ERR, "Could not set user to %d (%s)", uid, child->uid);
+ crm_warn("Could not set user to %s (id %d): %s",
+ child->uid, uid, strerror(errno));
}
pcmk__close_fds_in_child(true);
@@ -370,7 +372,7 @@ start_child(pcmk_child_t * child)
} else {
(void)execvp(child->command, opts_default);
}
- crm_perror(LOG_ERR, "FATAL: Cannot exec %s", child->command);
+ crm_crit("Could not execute %s: %s", child->command, strerror(errno));
crm_exit(CRM_EX_FATAL);
}
return TRUE; /* never reached */
@@ -527,8 +529,7 @@ pcmk_ipc_dispatch(qb_ipcs_connection_t * qbc, void *data, size_t size)
task = crm_element_value(msg, F_CRM_TASK);
if (crm_str_eq(task, CRM_OP_QUIT, TRUE)) {
- /* Time to quit */
- crm_notice("Shutting down in response to ticket %s (%s)",
+ crm_notice("Shutting down in response to IPC request %s from %s",
crm_element_value(msg, F_CRM_REFERENCE), crm_element_value(msg, F_CRM_ORIGIN));
pcmk_shutdown(15);
@@ -982,7 +983,8 @@ remove_core_file_limit(void)
int rc = getrlimit(RLIMIT_CORE, &cores);
if (rc < 0) {
- crm_perror(LOG_ERR, "Cannot determine current maximum core size.");
+ crm_warn("Cannot determine current maximum core file size: %s",
+ strerror(errno));
return;
}
@@ -996,10 +998,8 @@ remove_core_file_limit(void)
rc = setrlimit(RLIMIT_CORE, &cores);
if (rc < 0) {
- crm_perror(LOG_ERR,
- "Core file generation will remain disabled."
- " Core files are an important diagnostic tool, so"
- " please consider enabling them by default.");
+ crm_warn("Cannot raise system limit on core file size "
+ "(consider doing so manually)");
}
}
@@ -1108,7 +1108,6 @@ main(int argc, char **argv)
crm_ipc_destroy(old_instance);
if (mcp_read_config() == FALSE) {
- crm_notice("Could not obtain corosync config data, exiting");
crm_exit(CRM_EX_UNAVAILABLE);
}
@@ -1169,7 +1168,6 @@ main(int argc, char **argv)
/* Allows us to block shutdown */
if (!cluster_connect_cfg()) {
- crm_err("Couldn't connect to Corosync's CFG service");
crm_exit(CRM_EX_PROTOCOL);
}
diff --git a/daemons/pacemakerd/pcmkd_corosync.c b/daemons/pacemakerd/pcmkd_corosync.c
index 156f965..6f19803 100644
--- a/daemons/pacemakerd/pcmkd_corosync.c
+++ b/daemons/pacemakerd/pcmkd_corosync.c
@@ -28,7 +28,6 @@
#include <crm/common/ipc_internal.h> /* PCMK__SPECIAL_PID* */
-enum cluster_type_e stack = pcmk_cluster_unknown;
static corosync_cfg_handle_t cfg_handle;
/* =::=::=::= CFG - Shutdown stuff =::=::=::= */
@@ -63,9 +62,8 @@ pcmk_cfg_dispatch(gpointer user_data)
static void
cfg_connection_destroy(gpointer user_data)
{
- crm_err("Connection destroyed");
+ crm_err("Lost connection to Corosync");
cfg_handle = 0;
-
pcmk_shutdown(SIGTERM);
}
@@ -85,7 +83,7 @@ cluster_disconnect_cfg(void)
code; \
if(rc == CS_ERR_TRY_AGAIN || rc == CS_ERR_QUEUE_FULL) { \
counter++; \
- crm_debug("Retrying operation after %ds", counter); \
+ crm_debug("Retrying Corosync operation after %ds", counter); \
sleep(counter); \
} else { \
break; \
@@ -110,41 +108,42 @@ cluster_connect_cfg(void)
cs_repeat(retries, 30, rc = corosync_cfg_initialize(&cfg_handle, &cfg_callbacks));
if (rc != CS_OK) {
- crm_err("corosync cfg init: %s (%d)", cs_strerror(rc), rc);
+ crm_crit("Could not connect to Corosync CFG: %s " CRM_XS " rc=%d",
+ cs_strerror(rc), rc);
return FALSE;
}
rc = corosync_cfg_fd_get(cfg_handle, &fd);
if (rc != CS_OK) {
- crm_err("corosync cfg fd_get: %s (%d)", cs_strerror(rc), rc);
+ crm_crit("Could not get Corosync CFG descriptor: %s " CRM_XS " rc=%d",
+ cs_strerror(rc), rc);
goto bail;
}
/* CFG provider run as root (in given user namespace, anyway)? */
if (!(rv = crm_ipc_is_authentic_process(fd, (uid_t) 0,(gid_t) 0, &found_pid,
&found_uid, &found_gid))) {
- crm_err("CFG provider is not authentic:"
- " process %lld (uid: %lld, gid: %lld)",
- (long long) PCMK__SPECIAL_PID_AS_0(found_pid),
- (long long) found_uid, (long long) found_gid);
+ crm_crit("Rejecting Corosync CFG provider because process %lld "
+ "is running as uid %lld gid %lld, not root",
+ (long long) PCMK__SPECIAL_PID_AS_0(found_pid),
+ (long long) found_uid, (long long) found_gid);
goto bail;
} else if (rv < 0) {
- crm_err("Could not verify authenticity of CFG provider: %s (%d)",
- strerror(-rv), -rv);
+ crm_crit("Could not authenticate Corosync CFG provider: %s "
+ CRM_XS " rc=%d", strerror(-rv), -rv);
goto bail;
}
retries = 0;
cs_repeat(retries, 30, rc = corosync_cfg_local_get(cfg_handle, &nodeid));
-
if (rc != CS_OK) {
- crm_err("corosync cfg local_get error %d", rc);
+ crm_crit("Could not get local node ID from Corosync: %s "
+ CRM_XS " rc=%d", cs_strerror(rc), rc);
goto bail;
}
+ crm_debug("Corosync reports local node ID is %lu", (unsigned long) nodeid);
- crm_debug("Our nodeid: %lu", (unsigned long) nodeid);
mainloop_add_fd("corosync-cfg", G_PRIORITY_DEFAULT, fd, &cfg_handle, &cfg_fd_callbacks);
-
return TRUE;
bail:
@@ -184,14 +183,15 @@ mcp_read_config(void)
gid_t found_gid = 0;
pid_t found_pid = 0;
int rv;
+ enum cluster_type_e stack;
// There can be only one possibility
do {
rc = cmap_initialize(&local_handle);
if (rc != CS_OK) {
retries++;
- printf("cmap connection setup failed: %s. Retrying in %ds\n", cs_strerror(rc), retries);
- crm_info("cmap connection setup failed: %s. Retrying in %ds", cs_strerror(rc), retries);
+ crm_info("Could not connect to Corosync CMAP: %s (retrying in %ds) "
+ CRM_XS " rc=%d", cs_strerror(rc), retries, rc);
sleep(retries);
} else {
@@ -201,15 +201,15 @@ mcp_read_config(void)
} while (retries < 5);
if (rc != CS_OK) {
- printf("Could not connect to Cluster Configuration Database API, error %d\n", rc);
- crm_warn("Could not connect to Cluster Configuration Database API, error %d", rc);
+ crm_crit("Could not connect to Corosync CMAP: %s "
+ CRM_XS " rc=%d", cs_strerror(rc), rc);
return FALSE;
}
rc = cmap_fd_get(local_handle, &fd);
if (rc != CS_OK) {
- crm_err("Could not obtain the CMAP API connection: %s (%d)",
- cs_strerror(rc), rc);
+ crm_crit("Could not get Corosync CMAP descriptor: %s " CRM_XS " rc=%d",
+ cs_strerror(rc), rc);
cmap_finalize(local_handle);
return FALSE;
}
@@ -217,38 +217,33 @@ mcp_read_config(void)
/* CMAP provider run as root (in given user namespace, anyway)? */
if (!(rv = crm_ipc_is_authentic_process(fd, (uid_t) 0,(gid_t) 0, &found_pid,
&found_uid, &found_gid))) {
- crm_err("CMAP provider is not authentic:"
- " process %lld (uid: %lld, gid: %lld)",
- (long long) PCMK__SPECIAL_PID_AS_0(found_pid),
- (long long) found_uid, (long long) found_gid);
+ crm_crit("Rejecting Corosync CMAP provider because process %lld "
+ "is running as uid %lld gid %lld, not root",
+ (long long) PCMK__SPECIAL_PID_AS_0(found_pid),
+ (long long) found_uid, (long long) found_gid);
cmap_finalize(local_handle);
return FALSE;
} else if (rv < 0) {
- crm_err("Could not verify authenticity of CMAP provider: %s (%d)",
- strerror(-rv), -rv);
+ crm_crit("Could not authenticate Corosync CMAP provider: %s "
+ CRM_XS " rc=%d", strerror(-rv), -rv);
cmap_finalize(local_handle);
return FALSE;
}
stack = get_cluster_type();
- crm_info("Reading configure for stack: %s", name_for_cluster_type(stack));
-
- /* =::=::= Should we be here =::=::= */
- if (stack == pcmk_cluster_corosync) {
- pcmk__set_env_option("cluster_type", "corosync");
- pcmk__set_env_option("quorum_type", "corosync");
-
- } else {
- crm_err("Unsupported stack type: %s", name_for_cluster_type(stack));
+ if (stack != pcmk_cluster_corosync) {
+ crm_crit("Expected corosync stack but detected %s " CRM_XS " stack=%d",
+ name_for_cluster_type(stack), stack);
return FALSE;
}
- /* =::=::= Logging =::=::= */
- if (pcmk__env_option("debug")) {
- /* Syslog logging is already setup by crm_log_init() */
+ crm_info("Reading configuration for %s stack",
+ name_for_cluster_type(stack));
+ pcmk__set_env_option("cluster_type", "corosync");
+ pcmk__set_env_option("quorum_type", "corosync");
- } else {
- /* Check corosync */
+ // If debug logging is not configured, check whether corosync has it
+ if (pcmk__env_option("debug") == NULL) {
char *debug_enabled = NULL;
get_config_opt(config, local_handle, "logging.debug", &debug_enabled, "off");
@@ -269,7 +264,7 @@ mcp_read_config(void)
if(local_handle){
gid_t gid = 0;
if (pcmk_daemon_user(NULL, &gid) < 0) {
- crm_warn("Could not authorize group with corosync " CRM_XS
+ crm_warn("Could not authorize group with Corosync " CRM_XS
" No group found for user %s", CRM_DAEMON_USER);
} else {
@@ -277,8 +272,8 @@ mcp_read_config(void)
snprintf(key, PATH_MAX, "uidgid.gid.%u", gid);
rc = cmap_set_uint8(local_handle, key, 1);
if (rc != CS_OK) {
- crm_warn("Could not authorize group with corosync "CRM_XS
- " group=%u rc=%d (%s)", gid, rc, ais_error2text(rc));
+ crm_warn("Could not authorize group with Corosync: %s " CRM_XS
+ " group=%u rc=%d", ais_error2text(rc), gid, rc);
}
}
}
--
1.8.3.1
From c1e12aeb58366f508730defcb8eb6f3ebedac469 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 23 Apr 2020 13:30:58 -0500
Subject: [PATCH 4/5] Refactor: pacemakerd: use existing handle for corosync
shutdown
---
daemons/pacemakerd/pacemakerd.c | 31 ++++---------------------------
daemons/pacemakerd/pacemakerd.h | 1 +
daemons/pacemakerd/pcmkd_corosync.c | 24 +++++++++++++++++++++++-
3 files changed, 28 insertions(+), 28 deletions(-)
diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c
index 0f459c0..fb35dfc 100644
--- a/daemons/pacemakerd/pacemakerd.c
+++ b/daemons/pacemakerd/pacemakerd.c
@@ -30,10 +30,6 @@
#include <crm/cluster/internal.h>
#include <crm/cluster.h>
-#ifdef SUPPORT_COROSYNC
-#include <corosync/cfg.h>
-#endif
-
#include <dirent.h>
#include <ctype.h>
@@ -145,28 +141,6 @@ pcmk_process_exit(pcmk_child_t * child)
}
}
-static void pcmk_exit_with_cluster(int exitcode)
-{
-#ifdef SUPPORT_COROSYNC
- corosync_cfg_handle_t cfg_handle;
- cs_error_t err;
-
- if (exitcode == CRM_EX_FATAL) {
- crm_info("Asking Corosync to shut down");
- err = corosync_cfg_initialize(&cfg_handle, NULL);
- if (err != CS_OK) {
- crm_warn("Unable to open handle to corosync to close it down. err=%d", err);
- }
- err = corosync_cfg_try_shutdown(cfg_handle, COROSYNC_CFG_SHUTDOWN_FLAG_IMMEDIATE);
- if (err != CS_OK) {
- crm_warn("Corosync shutdown failed. err=%d", err);
- }
- corosync_cfg_finalize(cfg_handle);
- }
-#endif
- crm_exit(exitcode);
-}
-
static void
pcmk_child_exit(mainloop_child_t * p, pid_t pid, int core, int signo, int exitcode)
{
@@ -475,7 +449,10 @@ pcmk_shutdown_worker(gpointer user_data)
if (fatal_error) {
crm_notice("Shutting down and staying down after fatal error");
- pcmk_exit_with_cluster(CRM_EX_FATAL);
+#ifdef SUPPORT_COROSYNC
+ pcmkd_shutdown_corosync();
+#endif
+ crm_exit(CRM_EX_FATAL);
}
return TRUE;
diff --git a/daemons/pacemakerd/pacemakerd.h b/daemons/pacemakerd/pacemakerd.h
index ac2d842..5f475fd 100644
--- a/daemons/pacemakerd/pacemakerd.h
+++ b/daemons/pacemakerd/pacemakerd.h
@@ -24,5 +24,6 @@ gboolean mcp_read_config(void);
gboolean cluster_connect_cfg(void);
gboolean cluster_disconnect_cfg(void);
+void pcmkd_shutdown_corosync(void);
void pcmk_shutdown(int nsig);
diff --git a/daemons/pacemakerd/pcmkd_corosync.c b/daemons/pacemakerd/pcmkd_corosync.c
index 6f19803..82bd257 100644
--- a/daemons/pacemakerd/pcmkd_corosync.c
+++ b/daemons/pacemakerd/pcmkd_corosync.c
@@ -28,7 +28,7 @@
#include <crm/common/ipc_internal.h> /* PCMK__SPECIAL_PID* */
-static corosync_cfg_handle_t cfg_handle;
+static corosync_cfg_handle_t cfg_handle = 0;
/* =::=::=::= CFG - Shutdown stuff =::=::=::= */
@@ -151,6 +151,28 @@ cluster_connect_cfg(void)
return FALSE;
}
+void
+pcmkd_shutdown_corosync(void)
+{
+ cs_error_t rc;
+
+ if (cfg_handle == 0) {
+ crm_warn("Unable to shut down Corosync: No connection");
+ return;
+ }
+ crm_info("Asking Corosync to shut down");
+ rc = corosync_cfg_try_shutdown(cfg_handle,
+ COROSYNC_CFG_SHUTDOWN_FLAG_IMMEDIATE);
+ if (rc == CS_OK) {
+ corosync_cfg_finalize(cfg_handle);
+ cfg_handle = 0;
+ } else {
+ crm_warn("Corosync shutdown failed: %s " CRM_XS " rc=%d",
+ cs_strerror(rc), rc);
+ }
+}
+
+
/* =::=::=::= Configuration =::=::=::= */
static int
get_config_opt(uint64_t unused, cmap_handle_t object_handle, const char *key, char **value,
--
1.8.3.1
From e93857b25a22045d5db7bd45c3f026fb82e6da8d Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 23 Apr 2020 13:36:08 -0500
Subject: [PATCH 5/5] Build: pacemakerd: properly conditionalize corosync calls
Previously, pacemakerd in its entirety was not built unless corosync support
was enabled, a throwback to when CMAN and Corosync 1 were supported. Now,
pacemakerd is built unconditionally, and just its corosync-related calls are
guarded by corosync support.
This has no effect currently since corosync 2+ is the only supported cluster
layer, but it offers some future-proofing.
---
daemons/pacemakerd/Makefile.am | 9 ++++-----
daemons/pacemakerd/pacemakerd.c | 6 ++++++
2 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/daemons/pacemakerd/Makefile.am b/daemons/pacemakerd/Makefile.am
index b01d8ef..4cc8a7c 100644
--- a/daemons/pacemakerd/Makefile.am
+++ b/daemons/pacemakerd/Makefile.am
@@ -1,5 +1,5 @@
#
-# Copyright 2004-2019 the Pacemaker project contributors
+# Copyright 2004-2020 the Pacemaker project contributors
#
# The version control history for this file may have further details.
#
@@ -9,8 +9,6 @@
include $(top_srcdir)/mk/common.mk
-if BUILD_CS_SUPPORT
-
initdir = $(INITDIR)
init_SCRIPTS = pacemaker
sbin_PROGRAMS = pacemakerd
@@ -30,8 +28,9 @@ pacemakerd_LDFLAGS = $(LDFLAGS_HARDENED_EXE)
pacemakerd_LDADD = $(top_builddir)/lib/cluster/libcrmcluster.la $(top_builddir)/lib/common/libcrmcommon.la
pacemakerd_LDADD += $(CLUSTERLIBS)
-pacemakerd_SOURCES = pacemakerd.c pcmkd_corosync.c
-
+pacemakerd_SOURCES = pacemakerd.c
+if BUILD_CS_SUPPORT
+pacemakerd_SOURCES += pcmkd_corosync.c
endif
CLEANFILES = $(man8_MANS)
diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c
index fb35dfc..652d6ca 100644
--- a/daemons/pacemakerd/pacemakerd.c
+++ b/daemons/pacemakerd/pacemakerd.c
@@ -1084,9 +1084,11 @@ main(int argc, char **argv)
crm_ipc_close(old_instance);
crm_ipc_destroy(old_instance);
+#ifdef SUPPORT_COROSYNC
if (mcp_read_config() == FALSE) {
crm_exit(CRM_EX_UNAVAILABLE);
}
+#endif
// OCF shell functions and cluster-glue need facility under different name
{
@@ -1143,10 +1145,12 @@ main(int argc, char **argv)
crm_exit(CRM_EX_OSERR);
}
+#ifdef SUPPORT_COROSYNC
/* Allows us to block shutdown */
if (!cluster_connect_cfg()) {
crm_exit(CRM_EX_PROTOCOL);
}
+#endif
if(pcmk_locate_sbd() > 0) {
setenv("PCMK_watchdog", "true", 1);
@@ -1178,6 +1182,8 @@ main(int argc, char **argv)
}
g_main_loop_unref(mainloop);
+#ifdef SUPPORT_COROSYNC
cluster_disconnect_cfg();
+#endif
crm_exit(CRM_EX_OK);
}
--
1.8.3.1