From 696a55366267ebf8412e938db92edfc3671469c7 Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: Tue, 28 Jul 2020 05:28:03 -0400 Subject: [PATCH] import corosync-3.0.3-4.el8 --- ...m-Ignore-the-icmap_get_-return-value.patch | 73 ++ ...flect-runtime-change-of-2Node-to-WFA.patch | 80 ++ ...ap_keys-man-page-from-section-8-to-7.patch | 901 ++++++++++++++++++ ...stats-Add-stats-for-scheduler-misses.patch | 320 +++++++ ...nanoseconds-from-epoch-for-schedmiss.patch | 31 + ...Add-schedmiss-timestamp-into-message.patch | 47 + ...uorum-Change-check-of-expected_votes.patch | 51 + ...mtool-exit-on-invalid-expected-votes.patch | 33 + ...uorum-set-wfa-status-only-on-startup.patch | 67 ++ SPECS/corosync.spec | 54 +- 10 files changed, 1655 insertions(+), 2 deletions(-) create mode 100644 SOURCES/bz1780137-1-votequorum-Ignore-the-icmap_get_-return-value.patch create mode 100644 SOURCES/bz1780137-2-votequorum-Reflect-runtime-change-of-2Node-to-WFA.patch create mode 100644 SOURCES/bz1791792-1-man-move-cmap_keys-man-page-from-section-8-to-7.patch create mode 100644 SOURCES/bz1791792-2-stats-Add-stats-for-scheduler-misses.patch create mode 100644 SOURCES/bz1791792-3-stats-Use-nanoseconds-from-epoch-for-schedmiss.patch create mode 100644 SOURCES/bz1791792-4-main-Add-schedmiss-timestamp-into-message.patch create mode 100644 SOURCES/bz1809864-1-votequorum-Change-check-of-expected_votes.patch create mode 100644 SOURCES/bz1809864-2-quorumtool-exit-on-invalid-expected-votes.patch create mode 100644 SOURCES/bz1816653-1-votequorum-set-wfa-status-only-on-startup.patch diff --git a/SOURCES/bz1780137-1-votequorum-Ignore-the-icmap_get_-return-value.patch b/SOURCES/bz1780137-1-votequorum-Ignore-the-icmap_get_-return-value.patch new file mode 100644 index 0000000..d337c86 --- /dev/null +++ b/SOURCES/bz1780137-1-votequorum-Ignore-the-icmap_get_-return-value.patch @@ -0,0 +1,73 @@ +From cddd62f972bca276c934e58f08da84071cec1ddb Mon Sep 17 00:00:00 2001 +From: Jan Friesse +Date: Mon, 25 Nov 2019 
18:21:52 +0100 +Subject: [PATCH] votequorum: Ignore the icmap_get_* return value + +Express intention to ignore icmap_get_* return +value and rely on default behavior of not changing the output +parameter on error. + +Signed-off-by: Jan Friesse +--- + exec/votequorum.c | 22 +++++++++++----------- + 1 file changed, 11 insertions(+), 11 deletions(-) + +diff --git a/exec/votequorum.c b/exec/votequorum.c +index f78b3f9..e1d7e73 100644 +--- a/exec/votequorum.c ++++ b/exec/votequorum.c +@@ -1271,10 +1271,10 @@ static char *votequorum_readconfig(int runtime) + /* + * gather basic data here + */ +- icmap_get_uint32("quorum.expected_votes", &expected_votes); ++ (void)icmap_get_uint32("quorum.expected_votes", &expected_votes); + have_nodelist = votequorum_read_nodelist_configuration(&node_votes, &node_count, &node_expected_votes); + have_qdevice = votequorum_qdevice_is_configured(&qdevice_votes); +- icmap_get_uint8("quorum.two_node", &two_node); ++ (void)icmap_get_uint8("quorum.two_node", &two_node); + + /* + * do config verification and enablement +@@ -1319,13 +1319,13 @@ static char *votequorum_readconfig(int runtime) + wait_for_all = 1; + } + +- icmap_get_uint8("quorum.allow_downscale", &allow_downscale); +- icmap_get_uint8("quorum.wait_for_all", &wait_for_all); +- icmap_get_uint8("quorum.last_man_standing", &last_man_standing); +- icmap_get_uint32("quorum.last_man_standing_window", &last_man_standing_window); +- icmap_get_uint8("quorum.expected_votes_tracking", &ev_tracking); +- icmap_get_uint8("quorum.auto_tie_breaker", &atb); +- icmap_get_string("quorum.auto_tie_breaker_node", &atb_string); ++ (void)icmap_get_uint8("quorum.allow_downscale", &allow_downscale); ++ (void)icmap_get_uint8("quorum.wait_for_all", &wait_for_all); ++ (void)icmap_get_uint8("quorum.last_man_standing", &last_man_standing); ++ (void)icmap_get_uint32("quorum.last_man_standing_window", &last_man_standing_window); ++ (void)icmap_get_uint8("quorum.expected_votes_tracking", &ev_tracking); ++ 
(void)icmap_get_uint8("quorum.auto_tie_breaker", &atb); ++ (void)icmap_get_string("quorum.auto_tie_breaker_node", &atb_string); + + /* auto_tie_breaker defaults to LOWEST */ + if (atb) { +@@ -1517,7 +1517,7 @@ static char *votequorum_readconfig(int runtime) + us->expected_votes = node_expected_votes; + } else { + us->votes = 1; +- icmap_get_uint32("quorum.votes", &us->votes); ++ (void)icmap_get_uint32("quorum.votes", &us->votes); + } + + if (expected_votes) { +@@ -1568,7 +1568,7 @@ static void votequorum_refresh_config( + return ; + } + +- icmap_get_uint8("quorum.cancel_wait_for_all", &cancel_wfa); ++ (void)icmap_get_uint8("quorum.cancel_wait_for_all", &cancel_wfa); + if (strcmp(key_name, "quorum.cancel_wait_for_all") == 0 && + cancel_wfa >= 1) { + icmap_set_uint8("quorum.cancel_wait_for_all", 0); +-- +1.8.3.1 + diff --git a/SOURCES/bz1780137-2-votequorum-Reflect-runtime-change-of-2Node-to-WFA.patch b/SOURCES/bz1780137-2-votequorum-Reflect-runtime-change-of-2Node-to-WFA.patch new file mode 100644 index 0000000..0410773 --- /dev/null +++ b/SOURCES/bz1780137-2-votequorum-Reflect-runtime-change-of-2Node-to-WFA.patch @@ -0,0 +1,80 @@ +From 8ce65bf951bc1e5b2d64b60ea027fbdc551d4fc8 Mon Sep 17 00:00:00 2001 +From: Jan Friesse +Date: Thu, 16 Jan 2020 15:43:59 +0100 +Subject: [PATCH] votequorum: Reflect runtime change of 2Node to WFA + +When 2Node mode is set, WFA is also set unless WFA is configured +explicitly. This behavior was not reflected on runtime change, so +restarted corosync behavior was different (WFA not set). Also when +cluster is reduced from 3 nodes to 2 nodes during runtime, WFA was not +set, what may result in two quorate partitions. + +Solution is to set WFA depending on 2Node when WFA +is not explicitly configured. 
+ +Signed-off-by: Jan Friesse +Reviewed-by: Christine Caulfield +--- + exec/votequorum.c | 24 +++++++++++++++++++----- + 1 file changed, 19 insertions(+), 5 deletions(-) + +diff --git a/exec/votequorum.c b/exec/votequorum.c +index 0cde8f8..52424fa 100644 +--- a/exec/votequorum.c ++++ b/exec/votequorum.c +@@ -80,6 +80,7 @@ static uint8_t two_node = 0; + + static uint8_t wait_for_all = 0; + static uint8_t wait_for_all_status = 0; ++static uint8_t wait_for_all_autoset = 0; /* Wait for all is not set explicitly and follows two_node */ + + static enum {ATB_NONE, ATB_LOWEST, ATB_HIGHEST, ATB_LIST} auto_tie_breaker = ATB_NONE, initial_auto_tie_breaker = ATB_NONE; + static int lowest_node_id = -1; +@@ -1315,12 +1316,10 @@ static char *votequorum_readconfig(int runtime) + * Enable special features + */ + if (!runtime) { +- if (two_node) { +- wait_for_all = 1; +- } +- + (void)icmap_get_uint8("quorum.allow_downscale", &allow_downscale); +- (void)icmap_get_uint8("quorum.wait_for_all", &wait_for_all); ++ if (icmap_get_uint8("quorum.wait_for_all", &wait_for_all) != CS_OK) { ++ wait_for_all_autoset = 1; ++ } + (void)icmap_get_uint8("quorum.last_man_standing", &last_man_standing); + (void)icmap_get_uint32("quorum.last_man_standing_window", &last_man_standing_window); + (void)icmap_get_uint8("quorum.expected_votes_tracking", &ev_tracking); +@@ -1361,6 +1360,15 @@ static char *votequorum_readconfig(int runtime) + + } + ++ /* ++ * Changing of wait_for_all during runtime is not supported, but changing of two_node is ++ * and two_node may set wfa if not configured explicitly. It is safe to unset it ++ * (or set it back) when two_node changes. 
++ */ ++ if (wait_for_all_autoset) { ++ wait_for_all = two_node; ++ } ++ + /* two_node and auto_tie_breaker are not compatible as two_node uses + * a fence race to decide quorum whereas ATB decides based on node id + */ +@@ -1540,6 +1548,12 @@ static char *votequorum_readconfig(int runtime) + update_two_node(); + if (wait_for_all) { + update_wait_for_all_status(1); ++ } else if (wait_for_all_autoset && wait_for_all_status) { ++ /* ++ * Reset wait for all status for consistency when wfa is auto-unset by 2node. ++ * wait_for_all_status would be ignored by are_we_quorate anyway. ++ */ ++ update_wait_for_all_status(0); + } + + out: +-- +1.8.3.1 + diff --git a/SOURCES/bz1791792-1-man-move-cmap_keys-man-page-from-section-8-to-7.patch b/SOURCES/bz1791792-1-man-move-cmap_keys-man-page-from-section-8-to-7.patch new file mode 100644 index 0000000..4141cbf --- /dev/null +++ b/SOURCES/bz1791792-1-man-move-cmap_keys-man-page-from-section-8-to-7.patch @@ -0,0 +1,901 @@ +From f1d36307e524f9440733f0b01a9fc627a0e1cac7 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Ferenc=20W=C3=A1gner?= +Date: Sat, 4 Jan 2020 13:38:08 +0100 +Subject: [PATCH] man: move cmap_keys man page from section 8 to 7 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Section 8 is for "System administration commands", 7 is "Miscellaneous". 
+ +Signed-off-by: Ferenc Wágner +Reviewed-by: Jan Friesse +--- + corosync.spec.in | 2 +- + man/Makefile.am | 2 +- + man/cmap_keys.7 | 397 +++++++++++++++++++++++++++++++++++++++++++++++++ + man/cmap_keys.8 | 397 ------------------------------------------------- + man/cmap_overview.3 | 4 +- + man/corosync-cmapctl.8 | 2 +- + man/index.html | 2 +- + 7 files changed, 403 insertions(+), 403 deletions(-) + create mode 100644 man/cmap_keys.7 + delete mode 100644 man/cmap_keys.8 + +diff --git a/corosync.spec.in b/corosync.spec.in +index c06675d..8ac3757 100644 +--- a/corosync.spec.in ++++ b/corosync.spec.in +@@ -217,7 +217,7 @@ fi + %{_mandir}/man8/corosync-quorumtool.8* + %{_mandir}/man5/corosync.conf.5* + %{_mandir}/man5/votequorum.5* +-%{_mandir}/man8/cmap_keys.8* ++%{_mandir}/man7/cmap_keys.7* + + # library + # +diff --git a/man/Makefile.am b/man/Makefile.am +index 2ef5dcd..92a76ed 100644 +--- a/man/Makefile.am ++++ b/man/Makefile.am +@@ -140,7 +140,7 @@ dist_man_MANS = corosync.conf.5 \ + votequorum_overview.3 \ + sam_overview.3 \ + cmap_overview.3 \ +- cmap_keys.8 ++ cmap_keys.7 + + if BUILD_VQSIM + dist_man_MANS += $(corosync_vqsim_man) +diff --git a/man/cmap_keys.7 b/man/cmap_keys.7 +new file mode 100644 +index 0000000..6bc04fe +--- /dev/null ++++ b/man/cmap_keys.7 +@@ -0,0 +1,397 @@ ++.\"/* ++.\" * Copyright (c) 2012-2018 Red Hat, Inc. ++.\" * ++.\" * All rights reserved. ++.\" * ++.\" * Author: Jan Friesse (jfriesse@redhat.com) ++.\" * ++.\" * This software licensed under BSD license, the text of which follows: ++.\" * ++.\" * Redistribution and use in source and binary forms, with or without ++.\" * modification, are permitted provided that the following conditions are met: ++.\" * ++.\" * - Redistributions of source code must retain the above copyright notice, ++.\" * this list of conditions and the following disclaimer. 
++.\" * - Redistributions in binary form must reproduce the above copyright notice, ++.\" * this list of conditions and the following disclaimer in the documentation ++.\" * and/or other materials provided with the distribution. ++.\" * - Neither the name of the Red Hat, Inc. nor the names of its ++.\" * contributors may be used to endorse or promote products derived from this ++.\" * software without specific prior written permission. ++.\" * ++.\" * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++.\" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++.\" * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++.\" * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE ++.\" * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++.\" * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++.\" * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ++.\" * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN ++.\" * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ++.\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF ++.\" * THE POSSIBILITY OF SUCH DAMAGE. ++.\" */ ++.TH "CMAP_KEYS" 7 "2018-10-08" "corosync Man Page" "Corosync Cluster Engine Programmer's Manual" ++ ++.SH NAME ++.P ++cmap_keys \- Overview of keys stored in the Configuration Map ++ ++.SH OVERVIEW ++.P ++There are 3 main types of keys stored in CMAP: ++.PP ++* Mapping of values stored in the config file. ++.PP ++* Runtime statistics. ++.PP ++* Other user created values. ++ ++In this man page, wild-cards have the usual meaning. ++ ++.SH ICMAP KEYS ++These keys are in the icmap (default) map ++.TP ++internal_configuration.* ++Internal configuration data. All keys in this prefix are read only. ++It's only useful for getting a list of loaded services. 
++ ++.TP ++logging.* ++Values read from the configuration file. It's possible to change them at runtime. ++If subsystem specific configuration is needed, the key must be in the form ++logging.logger_subsys.SERVICE.key, where SERVICE is upper case name of the service and ++key is same as in the configuration file. All values are of string type. ++ ++.TP ++nodelist.* ++Values are read from the configuration file only (dynamic updates are not allowed). ++Each node element in the configuration file gets ++assigned its position starting from zero. So the first node from the config file has ++nodelist.node.0. prefix. To be a valid entry, each node must have ++.B ring0_addr ++key. ++To change the ++.B nodeid ++key, use a u32 data type. ++ ++Local node position is stored in ++.B local_node_pos ++key (RO), so it's easy to find ++out nodeid/ring addresses of the local node directly from cmap. ++ ++.TP ++runtime.blackbox.* ++Trigger keys for storing fplay data. It's recommended that you use the corosync-blackbox command ++to change keys in this prefix. ++ ++.TP ++runtime.force_gather ++Set to 'yes' to force the processor to move into the GATHER state. This operation ++is dangerous and is not recommended. ++ ++.TP ++runtime.config.* ++Contains the values actually in use by the totem membership protocol. ++Values here are either taken from the Corosync configuration file, ++defaults or computed from entries in the config file. For information ++on individual keys please refer to the man page ++.BR corosync.conf (5). ++ ++.TP ++runtime.services.* ++Prefix with statistics for service engines. Each service has its own ++.B service_id ++key in the prefix with the name runtime.services.SERVICE., where SERVICE is the lower case ++name of the service. 
Inside the service prefix is the number of messages received and sent ++by the corosync engine in the format runtime.services.SERVICE.EXEC_CALL.rx and ++runtime.services.SERVICE.EXEC_CALL.tx, where EXEC_CALL is the internal id of the service ++call (so for example 3 in cpg service is receive of multicast message from other ++nodes). ++ ++.TP ++runtime.totem.members.* ++Prefix containing members of the totem single ring protocol. Each member ++key has the format runtime.totem.members.NODEID.KEY, where key is ++one of: ++ ++.B config_version ++Config version of the member node. ++ ++.TP ++resources.process.PID.* ++Prefix created by applications using SAM with CMAP integration. ++It contains the following keys: ++ ++.B recovery ++Recovery policy of the process. Can be one of quit or restart. ++ ++.B poll_period ++Value passed in sam_initialize as a time_interval. ++ ++.B last_updated ++Last time SAM received a heartbeat from the client. ++ ++.B state ++State of the client. Can be one of failed, stopped, running and waiting for quorum. ++ ++.TP ++uidgid.* ++Information about users/groups which are allowed to make IPC connections to ++corosync. Entries loaded from configuration file are stored with ++uidgid.config.* prefix and are pruned on configuration file reload. Dynamic ++entries have the uidgid.* prefix and a configuration file reload doesn't affect them. ++ ++.TP ++quorum.cancel_wait_for_all ++Tells votequorum to cancel waiting for all nodes at cluster startup. Can be used ++to unblock quorum if nodes are known to be down. For pcs use only. ++ ++.TP ++config.reload_in_progress ++This value will be set to 1 (or created) when a corosync.conf reload is started, ++and set to 0 when the reload is completed. This allows interested subsystems ++to do atomic reconfiguration rather than changing each key. Note that ++individual add/change/delete notifications will still be sent during a reload. 
++ ++.TP ++config.totemconfig_reload_in_progress ++This key is similar to ++.B config.reload_in_progress ++but changed after the totem config trigger is processed. It is useful (mainly) ++for situations when ++.B nodelist.local_node_pos ++must be correctly reinstated before anything else. ++ ++.SH STATS KEYS ++These keys are in the stats map. All keys in this map are read-only. ++Modification tracking of individual keys is supported in the stats map, but not ++prefixes. Add/Delete operations are supported on prefixes though so you can track ++for new ipc connections or knet interfaces. ++.TP ++stats.srp.* ++Prefix containing statistics about totem. ++Typical key prefixes: ++ ++.B commit_entered ++Number of times the processor entered COMMIT state. ++ ++.B commit_token_lost ++Number of times the processor lost token in COMMIT state. ++ ++.B consensus_timeouts ++How many times the processor timed out forming a consensus about membership. ++ ++.B continuous_gather ++How many times the processor was not able to reach consensus. ++ ++.B firewall_enabled_or_nic_failure ++Set to 1 when processor was not able to reach consensus for a long time. The usual ++reason is a badly configured firewall or connection failure. ++ ++.B gather_entered ++Number of times the processor entered GATHER state. ++ ++.B gather_token_lost ++Number of times the processor lost token in GATHER state. ++ ++.B mcast_retx ++Number of retransmitted messages. ++ ++.B mcast_rx ++Number of received multicast messages. ++ ++.B mcast_tx ++Number of transmitted multicast messages. ++ ++.B memb_commit_token_rx ++Number of received commit tokens. ++ ++.B memb_commit_token_tx ++Number of transmitted commit tokens. ++ ++.B memb_join_rx ++Number of received join messages. ++ ++.B memb_join_tx ++Number of transmitted join messages. ++ ++.B memb_merge_detect_rx ++Number of received member merge messages. ++ ++.B memb_merge_detect_tx ++Number of transmitted member merge messages. 
++ ++.B orf_token_rx ++Number of received orf tokens. ++ ++.B orf_token_tx ++Number of transmitted orf tokens. ++ ++.B recovery_entered ++Number of times the processor entered recovery. ++ ++.B recovery_token_lost ++Number of times the token was lost in recovery state. ++ ++.B rx_msg_dropped ++Number of received messages which were dropped because they were not expected ++(for example, a multicast message in commit state). ++ ++.B token_hold_cancel_rx ++Number of received token hold cancel messages. ++ ++.B token_hold_cancel_tx ++Number of transmitted token hold cancel messages. ++ ++.B mtt_rx_token ++Mean transit time of token in milliseconds. In other words, time between ++two consecutive token receives. ++ ++.B avg_token_workload ++Average time in milliseconds of holding time of token on the current processor. ++ ++.B avg_backlog_calc ++Average number of not yet sent messages on the current processor. ++ ++.TP ++stats.knet.nodeX.linkY.* ++Statistics about the network traffic to and from each node and link when using ++the kronosnet transport ++ ++.B connected ++Whether the link is connected or not ++ ++.B up_count ++Number of times this link has changed state to UP ++ ++.B down_count ++Number of times this link has changed state to DOWN ++ ++.B latency_ave / latency_min / latency_max ++Calculated latencies of this link. Note that if there has been no traffic ++on the link then latency_min will show a very large number. ++ ++.B latency_samples ++The number of samples used to calculate the latency figures, so you have ++some idea of their precision. 
++ ++.B rx_data_packets / tx_data_packets ++The number of packets sent/received on this link ++ ++.B rx_data_bytes / tx_data_bytes ++The number of bytes sent/received on this link ++ ++.B rx_pmtu_packets / tx_pmtu_packets ++The number of packets sent/received by the PMTUd subsystem ++ ++.B rx_pmtu_bytes / tx_pmtu_bytes ++The number of bytes sent/received by the PMTUd subsystem ++ ++.B rx_ping_packets / tx_ping_packets ++The number of packets sent/received as pings ++ ++.B rx_ping_bytes / tx_ping_bytes ++The number of bytes sent/received as pings ++ ++.B rx_pong_packets / tx_pong_packets ++The number of packets sent/received as pongs ++ ++.B rx_pong_bytes / tx_pong_bytes ++The number of bytes sent/received as pongs ++ ++.B rx_total_packets / tx_total_packets ++The total number of packets sent/received. The aggregate of all of the above packet stats ++ ++.B rx_total_bytes / tx_total_bytes ++The total number of bytes sent/received. The aggregate of all of the above bytes stats ++ ++.B tx_data_retries / tx_pmtu_retries / tx_ping_retries / tx_pong_retries / tx_total_retries ++Number of times a transmit operation had to be retried due to the socket returning EAGAIN ++ ++.TP ++stats.ipcs.* ++There is information about total number of active connections from client programs ++at the time the request was made. ++.B active ++number of closed connections during whole runtime of corosync ++.B closed ++Total number of connections that have been made since corosync was started ++ ++.TP ++stats.ipcs.ID.* ++Each IPC connection has a unique ID. This is in the form [[serviceX:][PID:]internal_id. ++ ++Typical keys in this prefix are: ++ ++.B proc_name ++process name of connected process (unavailable on some platforms) ++ ++.B dispatched ++number of dispatched messages. ++ ++.B invalid_request ++number of requests made by IPC which are invalid (calling non-existing call, ...). ++ ++.B name ++contains short name of the IPC connection (unavailable on some platforms). 
++ ++.B overload ++is number of requests which were not processed because of overload. ++ ++.B queue_size ++contains the number of messages in the queue waiting for send. ++ ++.B recv_retries ++is the total number of interrupted receives. ++ ++.B requests ++contains the number of requests made by IPC. ++ ++.B responses ++is the number of responses sent to the IPC client. ++ ++.B send_retries ++contains the total number of interrupted sends. ++ ++.B service_id ++contains the ID of service which the IPC is connected to. ++ ++.TP ++stats.clear.* ++These are write-only keys used to clear the stats for various subsystems ++ ++.B totem ++Clears the pg & srp totem stats. ++ ++.B knet ++Clears the knet stats ++ ++.B ipc ++Clears the ipc stats ++ ++.B all ++Clears all of the above stats ++ ++ ++.SH DYNAMIC CHANGE USER/GROUP PERMISSION TO USE COROSYNC IPC ++Is the same as in the configuration file. eg: to add UID 500 use ++ ++.br ++# corosync-cmapctl -s uidgid.uid.500 u8 1 ++ ++GID is similar, so to add a GID use ++ ++.br ++# corosync-cmapctl -s uidgid.gid.500 u8 1 ++ ++For removal of permissions, simply delete the key ++ ++.br ++# corosync-cmapctl -d uidgid.gid.500 ++ ++ ++.SH "SEE ALSO" ++.BR corosync_overview (7), ++.BR corosync.conf (5), ++.BR corosync-cmapctl (8) +diff --git a/man/cmap_keys.8 b/man/cmap_keys.8 +deleted file mode 100644 +index e2ea1fb..0000000 +--- a/man/cmap_keys.8 ++++ /dev/null +@@ -1,397 +0,0 @@ +-.\"/* +-.\" * Copyright (c) 2012-2018 Red Hat, Inc. +-.\" * +-.\" * All rights reserved. +-.\" * +-.\" * Author: Jan Friesse (jfriesse@redhat.com) +-.\" * +-.\" * This software licensed under BSD license, the text of which follows: +-.\" * +-.\" * Redistribution and use in source and binary forms, with or without +-.\" * modification, are permitted provided that the following conditions are met: +-.\" * +-.\" * - Redistributions of source code must retain the above copyright notice, +-.\" * this list of conditions and the following disclaimer. 
+-.\" * - Redistributions in binary form must reproduce the above copyright notice, +-.\" * this list of conditions and the following disclaimer in the documentation +-.\" * and/or other materials provided with the distribution. +-.\" * - Neither the name of the Red Hat, Inc. nor the names of its +-.\" * contributors may be used to endorse or promote products derived from this +-.\" * software without specific prior written permission. +-.\" * +-.\" * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +-.\" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +-.\" * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +-.\" * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +-.\" * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +-.\" * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +-.\" * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +-.\" * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +-.\" * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +-.\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +-.\" * THE POSSIBILITY OF SUCH DAMAGE. +-.\" */ +-.TH "CMAP_KEYS" 8 "2018-10-08" "corosync Man Page" "Corosync Cluster Engine Programmer's Manual" +- +-.SH NAME +-.P +-cmap_keys \- Overview of keys stored in the Configuration Map +- +-.SH OVERVIEW +-.P +-There are 3 main types of keys stored in CMAP: +-.PP +-* Mapping of values stored in the config file. +-.PP +-* Runtime statistics. +-.PP +-* Other user created values. +- +-In this man page, wild-cards have the usual meaning. +- +-.SH ICMAP KEYS +-These keys are in the icmap (default) map +-.TP +-internal_configuration.* +-Internal configuration data. All keys in this prefix are read only. +-It's only useful for getting a list of loaded services. 
+- +-.TP +-logging.* +-Values read from the configuration file. It's possible to change them at runtime. +-If subsystem specific configuration is needed, the key must be in the form +-logging.logger_subsys.SERVICE.key, where SERVICE is upper case name of the service and +-key is same as in the configuration file. All values are of string type. +- +-.TP +-nodelist.* +-Values are read from the configuration file only (dynamic updates are not allowed). +-Each node element in the configuration file gets +-assigned its position starting from zero. So the first node from the config file has +-nodelist.node.0. prefix. To be a valid entry, each node must have +-.B ring0_addr +-key. +-To change the +-.B nodeid +-key, use a u32 data type. +- +-Local node position is stored in +-.B local_node_pos +-key (RO), so it's easy to find +-out nodeid/ring addresses of the local node directly from cmap. +- +-.TP +-runtime.blackbox.* +-Trigger keys for storing fplay data. It's recommended that you use the corosync-blackbox command +-to change keys in this prefix. +- +-.TP +-runtime.force_gather +-Set to 'yes' to force the processor to move into the GATHER state. This operation +-is dangerous and is not recommended. +- +-.TP +-runtime.config.* +-Contains the values actually in use by the totem membership protocol. +-Values here are either taken from the Corosync configuration file, +-defaults or computed from entries in the config file. For information +-on individual keys please refer to the man page +-.BR corosync.conf (5). +- +-.TP +-runtime.services.* +-Prefix with statistics for service engines. Each service has its own +-.B service_id +-key in the prefix with the name runtime.services.SERVICE., where SERVICE is the lower case +-name of the service. 
Inside the service prefix is the number of messages received and sent +-by the corosync engine in the format runtime.services.SERVICE.EXEC_CALL.rx and +-runtime.services.SERVICE.EXEC_CALL.tx, where EXEC_CALL is the internal id of the service +-call (so for example 3 in cpg service is receive of multicast message from other +-nodes). +- +-.TP +-runtime.totem.members.* +-Prefix containing members of the totem single ring protocol. Each member +-keys has format runtime.totem.members.NODEID.KEY, where key is +-one of: +- +-.B config_version +-Config version of the member node. +- +-.TP +-resources.process.PID.* +-Prefix created by applications using SAM with CMAP integration. +-It contains the following keys: +- +-.B recovery +-Recovery policy of the process. Can be one of quit or restart. +- +-.B poll_period +-Value passed in sam_initialize as a time_interval. +- +-.B last_updated +-Last time SAM received a heartbeat from the client. +- +-.B state +-State of the client. Can be one of failed, stopped, running and waiting for quorum. +- +-.TP +-uidgid.* +-Information about users/groups which are allowed to make IPC connections to +-corosync. Entries loaded from configuration file are stored with +-uidgid.config.* prefix and are pruned on configuration file reload. Dynamic +-entries has uidgid.* prefix and a configuration file reload doesn't affect them. +- +-.TP +-quorum.cancel_wait_for_all +-Tells votequorum to cancel waiting for all nodes at cluster startup. Can be used +-to unblock quorum if notes are known to be down. For pcs use only. +- +-.TP +-config.reload_in_progress +-This value will be set to 1 (or created) when a corosync.conf reload is started, +-and set to 0 when the reload is completed. This allows interested subsystems +-to do atomic reconfiguration rather than changing each key. Note that +-individual add/change/delete notifications will still be sent during a reload. 
+- +-.TP +-config.totemconfig_reload_in_progress +-This key is similar to +-.B config.totemconfig_reload_in_progress +-but changed after the totem config trigger is processed. It is useful (mainly) +-for situations when +-.B nodelist.local_node_pos +-must be correctly reinstated before anything else. +- +-.SH STATS KEYS +-These keys are in the stats map. All keys in this map are read-only. +-Modification tracking of individual keys is supported in the stats map, but not +-prefixes. Add/Delete operations are supported on prefixes though so you can track +-for new ipc connections or knet interfaces. +-.TP +-stats.srp.* +-Prefix containing statistics about totem. +-Typical key prefixes: +- +-.B commit_entered +-Number of times the processor entered COMMIT state. +- +-.B commit_token_lost +-Number of times the processor lost token in COMMIT state. +- +-.B consensus_timeouts +-How many times the processor timed out forming a consensus about membership. +- +-.B continuous_gather +-How many times the processor was not able to reach consensus. +- +-.B firewall_enabled_or_nic_failure +-Set to 1 when processor was not able to reach consensus for long time. The usual +-reason is a badly configured firewall or connection failure. +- +-.B gather_entered +-Number of times the processor entered GATHER state. +- +-.B gather_token_lost +-Number of times the processor lost token in GATHER state. +- +-.B mcast_retx +-Number of retransmitted messages. +- +-.B mcast_rx +-Number of received multicast messages. +- +-.B mcast_tx +-Number of transmitted multicast messages. +- +-.B memb_commit_token_rx +-Number of received commit tokens. +- +-.B memb_commit_token_tx +-Number of transmitted commit tokens. +- +-.B memb_join_rx +-Number of received join messages. +- +-.B memb_join_tx +-Number of transmitted join messages. +- +-.B memb_merge_detect_rx +-Number of received member merge messages. +- +-.B memb_merge_detect_tx +-Number of transmitted member merge messages. 
+- +-.B orf_token_rx +-Number of received orf tokens. +- +-.B orf_token_tx +-Number of transmitted orf tokens. +- +-.B recovery_entered +-Number of times the processor entered recovery. +- +-.B recovery_token_lost +-Number of times the token was lost in recovery state. +- +-.B rx_msg_dropped +-Number of received messages which were dropped because they were not expected +-(as example multicast message in commit state). +- +-.B token_hold_cancel_rx +-Number of received token hold cancel messages. +- +-.B token_hold_cancel_tx +-Number of transmitted token hold cancel messages. +- +-.B mtt_rx_token +-Mean transit time of token in milliseconds. In other words, time between +-two consecutive token receives. +- +-.B avg_token_workload +-Average time in milliseconds of holding time of token on the current processor. +- +-.B avg_backlog_calc +-Average number of not yet sent messages on the current processor. +- +-.TP +-stats.knet.nodeX.linkY.* +-Statistics about the network traffic to and from each node and link when using +-tke kronosnet transport +- +-.B connected +-Whether the link is connected or not +- +-.B up_count +-Number of times this link has changed state to UP +- +-.B down_count +-Number of times this link has changed state to DOWN +- +-.B latency_ave / latency_max / latency_max +-Calculated latencies of this link. Note that if there has been no traffic +-on the link then latency_min will show a very large number. +- +-.B latency_samples +-The number of samples used to calculate the latency figures, so you have +-some idea of their precision. 
+- +-.B rx_data_packets / tx_data_packets +-The number of packets sent/received on this link +- +-.B rx_data_bytes / tx_data_bytes +-The number of bytes sent/received on this link +- +-.B rx_pmtu_packets / tx_pmtu_packets +-The number of packets sent/received by the PMTUd subsystem +- +-.B rx_pmtu_bytes / tx_pmtu_bytes +-The number of bytes sent/received by the PMTUd subsystem +- +-.B rx_ping_packets / tx_ping_packets +-The number of packets sent/received as pings +- +-.B rx_ping_bytes / tx_ping_bytes +-The number of bytes sent/received as pings +- +-.B rx_pong_packets / tx_pong_packets +-The number of packets sent/received as pongs +- +-.B rx_pong_bytes / tx_pong_bytes +-The number of bytes sent/received as pongs +- +-.B rx_total_packets / tx_total_packets +-The total number of packets sent/received. The aggregate of all of the above packet stats +- +-.B rx_total_bytes / tx_total_bytes +-The total number of bytes sent/received. The aggregate of all of the above bytes stats +- +-.B tx_data_retries / tx_pmtu_retries / tx_ping_retries / tx_pong_retries / tx_total_retries +-Number of times a transmit operation had to be retried due to the socket returning EAGAIN +- +-.TP +-stats.ipcs.* +-There is information about total number of active connections from client programs +-at the time the request was made. +-.B active +-number of closed connections during whole runtime of corosync +-.B closed +-Total number of connections that have been made since corosync was started +- +-.TP +-stats.ipcs.ID.* +-Each IPC connection has a unique ID. This is in the form [[serviceX:][PID:]internal_id. +- +-Typical keys in this prefix are: +- +-.B proc_name +-process name of connected process (unavailable on some platforms) +- +-.B dispatched +-number of dispatched messages. +- +-.B invalid_request +-number of requests made by IPC which are invalid (calling non-existing call, ...). +- +-.B name +-contains short name of the IPC connection (unavailable on some platforms). 
+- +-.B overload +-is number of requests which were not processed because of overload. +- +-.B queue_size +-contains the number of messages in the queue waiting for send. +- +-.B recv_retries +-is the total number of interrupted receives. +- +-.B requests +-contains the number of requests made by IPC. +- +-.B responses +-is the number of responses sent to the IPC client. +- +-.B send_retries +-contains the total number of interrupted sends. +- +-.B service_id +-contains the ID of service which the IPC is connected to. +- +-.TP +-stats.clear.* +-These are write-only keys used to clear the stats for various subsystems +- +-.B totem +-Clears the pg & srp totem stats. +- +-.B knet +-Clears the knet stats +- +-.B ipc +-Clears the ipc stats +- +-.B all +-Clears all of the above stats +- +- +-.SH DYNAMIC CHANGE USER/GROUP PERMISSION TO USE COROSYNC IPC +-Is the same as in the configuration file. eg: to add UID 500 use +- +-.br +-# corosync-cmapctl -s uidgid.uid.500 u8 1 +- +-GID is similar, so to add a GID use +- +-.br +-# corosync-cmapctl -s uidgid.gid.500 u8 1 +- +-For removal of permissions, simply delete the key +- +-.br +-# corosync-cmapctl -d uidgid.gid.500 +- +- +-.SH "SEE ALSO" +-.BR corosync_overview (7), +-.BR corosync.conf (5), +-.BR corosync-cmapctl (8) +diff --git a/man/cmap_overview.3 b/man/cmap_overview.3 +index cf4cabb..0aa3c14 100644 +--- a/man/cmap_overview.3 ++++ b/man/cmap_overview.3 +@@ -54,7 +54,7 @@ The library provides a mechanism to: + .PP + * Track changes on keys + +-Description of most keys created by corosync itself can be found in cmap_keys (8). ++Description of most keys created by corosync itself can be found in cmap_keys (7). 
+ + .SH BUGS + .SH "SEE ALSO" +@@ -75,4 +75,4 @@ Description of most keys created by corosync itself can be found in cmap_keys (8 + .BR cmap_iter_finalize (3), + .BR cmap_track_add (3), + .BR cmap_track_delete (3), +-.BR cmap_keys (8) ++.BR cmap_keys (7) +diff --git a/man/corosync-cmapctl.8 b/man/corosync-cmapctl.8 +index 637e597..8826503 100644 +--- a/man/corosync-cmapctl.8 ++++ b/man/corosync-cmapctl.8 +@@ -96,4 +96,4 @@ corosync\-cmapctl \fB\-C\fR [ipc|totem|knet|all] + + .SH "SEE ALSO" + .BR cmap_overview (3), +-.BR cmap_keys (8) ++.BR cmap_keys (7) +diff --git a/man/index.html b/man/index.html +index f4819e5..21326dc 100644 +--- a/man/index.html ++++ b/man/index.html +@@ -63,7 +63,7 @@ + Description of corosync-cmapctl tool. +
+ +- cmap_keys(8): ++ cmap_keys(7): + Overview of keys stored in the Configuration Map. +
+ +-- +1.8.3.1 + diff --git a/SOURCES/bz1791792-2-stats-Add-stats-for-scheduler-misses.patch b/SOURCES/bz1791792-2-stats-Add-stats-for-scheduler-misses.patch new file mode 100644 index 0000000..898d9b5 --- /dev/null +++ b/SOURCES/bz1791792-2-stats-Add-stats-for-scheduler-misses.patch @@ -0,0 +1,320 @@ +From 48b6894ef41e9a06ccbb696d062d86ef60dc2c4b Mon Sep 17 00:00:00 2001 +From: Christine Caulfield +Date: Fri, 17 Jan 2020 14:22:16 +0000 +Subject: [PATCH] stats: Add stats for scheduler misses + +This patch add a stats.schedmiss.* set of entries that +are a record of the last 10 times corosync was not scheduled +in time. + +These entries are keypt in reverse order (so stats.schedmiss.0.* is +always the latest one kept) and the values, including the timestamp, +are in milliseconds. + +It's also possible to use a cmap tracker to follow these events, which +might be useful. + +Signed-off-by: Christine Caulfield +Reviewed-by: Jan Friesse +--- + exec/main.c | 2 + + exec/stats.c | 113 +++++++++++++++++++++++++++++++++++++++++++---- + exec/stats.h | 2 + + man/cmap_keys.7 | 26 ++++++++++- + tools/corosync-cmapctl.c | 5 ++- + 5 files changed, 136 insertions(+), 12 deletions(-) + +diff --git a/exec/main.c b/exec/main.c +index 7a471a1..fb0486e 100644 +--- a/exec/main.c ++++ b/exec/main.c +@@ -835,6 +835,8 @@ static void timer_function_scheduler_timeout (void *data) + log_printf (LOGSYS_LEVEL_WARNING, "Corosync main process was not scheduled for %0.4f ms " + "(threshold is %0.4f ms). Consider token timeout increase.", + (float)tv_diff / QB_TIME_NS_IN_MSEC, (float)timeout_data->max_tv_diff / QB_TIME_NS_IN_MSEC); ++ ++ stats_add_schedmiss_event(tv_current / 1000, (float)tv_diff / QB_TIME_NS_IN_MSEC); + } + + /* +diff --git a/exec/stats.c b/exec/stats.c +index e89504e..d5c1cbc 100644 +--- a/exec/stats.c ++++ b/exec/stats.c +@@ -1,5 +1,5 @@ + /* +- * Copyright (c) 2017 Red Hat, Inc. ++ * Copyright (c) 2017-2020 Red Hat, Inc. + * + * All rights reserved. 
+ * +@@ -60,9 +60,20 @@ LOGSYS_DECLARE_SUBSYS ("STATS"); + + static qb_map_t *stats_map; + ++/* Structure of an element in the schedmiss array */ ++struct schedmiss_entry { ++ uint64_t timestamp; ++ float delay; ++}; ++#define MAX_SCHEDMISS_EVENTS 10 ++static struct schedmiss_entry schedmiss_event[MAX_SCHEDMISS_EVENTS]; ++static unsigned int highest_schedmiss_event; ++ ++#define SCHEDMISS_PREFIX "stats.schedmiss" ++ + /* Convert iterator number to text and a stats pointer */ + struct cs_stats_conv { +- enum {STAT_PG, STAT_SRP, STAT_KNET, STAT_KNET_HANDLE, STAT_IPCSC, STAT_IPCSG} type; ++ enum {STAT_PG, STAT_SRP, STAT_KNET, STAT_KNET_HANDLE, STAT_IPCSC, STAT_IPCSG, STAT_SCHEDMISS} type; + const char *name; + const size_t offset; + const icmap_value_types_t value_type; +@@ -190,6 +201,10 @@ struct cs_stats_conv cs_ipcs_global_stats[] = { + { STAT_IPCSG, "global.active", offsetof(struct ipcs_global_stats, active), ICMAP_VALUETYPE_UINT64}, + { STAT_IPCSG, "global.closed", offsetof(struct ipcs_global_stats, closed), ICMAP_VALUETYPE_UINT64}, + }; ++struct cs_stats_conv cs_schedmiss_stats[] = { ++ { STAT_SCHEDMISS, "timestamp", offsetof(struct schedmiss_entry, timestamp), ICMAP_VALUETYPE_UINT64}, ++ { STAT_SCHEDMISS, "delay", offsetof(struct schedmiss_entry, delay), ICMAP_VALUETYPE_FLOAT}, ++}; + + #define NUM_PG_STATS (sizeof(cs_pg_stats) / sizeof(struct cs_stats_conv)) + #define NUM_SRP_STATS (sizeof(cs_srp_stats) / sizeof(struct cs_stats_conv)) +@@ -286,7 +301,7 @@ cs_error_t stats_map_init(const struct corosync_api_v1 *corosync_api) + stats_add_entry(param, &cs_ipcs_global_stats[i]); + } + +- /* KNET and IPCS stats are added when appropriate */ ++ /* KNET, IPCS & SCHEDMISS stats are added when appropriate */ + return CS_OK; + } + +@@ -307,6 +322,8 @@ cs_error_t stats_map_get(const char *key_name, + int link_no; + int service_id; + uint32_t pid; ++ unsigned int sm_event; ++ char *sm_type; + void *conn_ptr; + + item = qb_map_get(stats_map, key_name); +@@ -363,17 +380,85 
@@ cs_error_t stats_map_get(const char *key_name, + cs_ipcs_get_global_stats(&ipcs_global_stats); + stats_map_set_value(statinfo, &ipcs_global_stats, value, value_len, type); + break; ++ case STAT_SCHEDMISS: ++ if (sscanf(key_name, SCHEDMISS_PREFIX ".%d", &sm_event) != 1) { ++ return CS_ERR_NOT_EXIST; ++ } ++ ++ sm_type = strrchr(key_name, '.'); ++ if (sm_type == NULL) { ++ return CS_ERR_NOT_EXIST; ++ } ++ sm_type++; ++ ++ if (strcmp(sm_type, "timestamp") == 0) { ++ memcpy(value, &schedmiss_event[sm_event].timestamp, sizeof(uint64_t)); ++ *value_len = sizeof(uint64_t); ++ *type = ICMAP_VALUETYPE_UINT64; ++ } ++ if (strcmp(sm_type, "delay") == 0) { ++ memcpy(value, &schedmiss_event[sm_event].delay, sizeof(float)); ++ *value_len = sizeof(float); ++ *type = ICMAP_VALUETYPE_FLOAT; ++ } ++ break; + default: + return CS_ERR_LIBRARY; + } + return CS_OK; + } + +-#define STATS_CLEAR "stats.clear." +-#define STATS_CLEAR_KNET "stats.clear.knet" +-#define STATS_CLEAR_IPC "stats.clear.ipc" +-#define STATS_CLEAR_TOTEM "stats.clear.totem" +-#define STATS_CLEAR_ALL "stats.clear.all" ++static void schedmiss_clear_stats(void) ++{ ++ int i; ++ char param[ICMAP_KEYNAME_MAXLEN]; ++ ++ for (i=0; i=0; i--) { ++ schedmiss_event[i+1].timestamp = schedmiss_event[i].timestamp; ++ schedmiss_event[i+1].delay = schedmiss_event[i].delay; ++ } ++ ++ /* New entries are always at the front */ ++ schedmiss_event[0].timestamp = timestamp; ++ schedmiss_event[0].delay = delay; ++ ++ /* If we've not run off the end then add an entry in the trie for the new 'end' one */ ++ if (highest_schedmiss_event < MAX_SCHEDMISS_EVENTS) { ++ sprintf(param, SCHEDMISS_PREFIX ".%i.timestamp", highest_schedmiss_event); ++ stats_add_entry(param, &cs_schedmiss_stats[0]); ++ sprintf(param, SCHEDMISS_PREFIX ".%i.delay", highest_schedmiss_event); ++ stats_add_entry(param, &cs_schedmiss_stats[1]); ++ highest_schedmiss_event++; ++ } ++ /* Notifications get sent by the stats_updater */ ++} ++ ++#define STATS_CLEAR "stats.clear." 
++#define STATS_CLEAR_KNET "stats.clear.knet" ++#define STATS_CLEAR_IPC "stats.clear.ipc" ++#define STATS_CLEAR_TOTEM "stats.clear.totem" ++#define STATS_CLEAR_ALL "stats.clear.all" ++#define STATS_CLEAR_SCHEDMISS "stats.clear.schedmiss" + + cs_error_t stats_map_set(const char *key_name, + const void *value, +@@ -394,9 +479,14 @@ cs_error_t stats_map_set(const char *key_name, + totempg_stats_clear(TOTEMPG_STATS_CLEAR_TOTEM); + cleared = 1; + } ++ if (strncmp(key_name, STATS_CLEAR_SCHEDMISS, strlen(STATS_CLEAR_SCHEDMISS)) == 0) { ++ schedmiss_clear_stats(); ++ cleared = 1; ++ } + if (strncmp(key_name, STATS_CLEAR_ALL, strlen(STATS_CLEAR_ALL)) == 0) { + totempg_stats_clear(TOTEMPG_STATS_CLEAR_TRANSPORT | TOTEMPG_STATS_CLEAR_TOTEM); + cs_ipcs_clear_stats(); ++ schedmiss_clear_stats(); + cleared = 1; + } + if (!cleared) { +@@ -500,6 +590,11 @@ static void stats_map_notify_fn(uint32_t event, char *key, void *old_value, void + return ; + } + ++ /* Ignore schedmiss trackers as the values are read from the circular buffer */ ++ if (strncmp(key, SCHEDMISS_PREFIX, strlen(SCHEDMISS_PREFIX)) == 0 ) { ++ return ; ++ } ++ + new_val.data = new_value; + if (stats_map_get(key, + &new_value, +@@ -556,7 +651,7 @@ cs_error_t stats_map_track_add(const char *key_name, + } + /* Get initial value */ + if (stats_map_get(tracker->key_name, +- &tracker->old_value, &value_len, &type) == CS_OK) { ++ &tracker->old_value, &value_len, &type) != CS_OK) { + tracker->old_value = 0ULL; + } + } else { +diff --git a/exec/stats.h b/exec/stats.h +index 45891ae..eac9e7c 100644 +--- a/exec/stats.h ++++ b/exec/stats.h +@@ -69,3 +69,5 @@ void stats_trigger_trackers(void); + void stats_ipcs_add_connection(int service_id, uint32_t pid, void *ptr); + void stats_ipcs_del_connection(int service_id, uint32_t pid, void *ptr); + cs_error_t cs_ipcs_get_conn_stats(int service_id, uint32_t pid, void *conn_ptr, struct ipcs_conn_stats *ipcs_stats); ++ ++void stats_add_schedmiss_event(uint64_t, float delay); +diff --git 
a/man/cmap_keys.7 b/man/cmap_keys.7 +index 6bc04fe..da95c51 100644 +--- a/man/cmap_keys.7 ++++ b/man/cmap_keys.7 +@@ -1,5 +1,5 @@ + .\"/* +-.\" * Copyright (c) 2012-2018 Red Hat, Inc. ++.\" * Copyright (c) 2012-2020 Red Hat, Inc. + .\" * + .\" * All rights reserved. + .\" * +@@ -357,6 +357,27 @@ contains the total number of interrupted sends. + .B service_id + contains the ID of service which the IPC is connected to. + ++ ++.TP ++stats.schedmiss.<n>.* ++If corosync is not scheduled after the required period of time it will ++log this event and also write an entry to the stats cmap under this key. ++There can be up to 10 entries (0..9) in here, when an 11th event happens ++the earliest will be removed. ++ ++These events will always be in reverse order, so stats.schedmiss.0.* will ++always be the latest event kept and 9 the oldest. If you want to listen ++for notifications then you are recommended to listen for changes ++to stats.schedmiss.0.timestamp or stats.schedmiss.0.delay. ++ ++.B timestamp ++The time of the event in ms since the Epoch (ie time_t * 1000 but with ++valid milliseconds). ++ ++.B delay ++The time that corosync was paused (in ms, float value).
++ ++ + .TP + stats.clear.* + These are write-only keys used to clear the stats for various subsystems +@@ -370,6 +391,9 @@ Clears the knet stats + .B ipc + Clears the ipc stats + ++.B schedmiss ++Clears the schedmiss stats ++ + .B all + Clears all of the above stats + +diff --git a/tools/corosync-cmapctl.c b/tools/corosync-cmapctl.c +index a4b61bd..ffca7e1 100644 +--- a/tools/corosync-cmapctl.c ++++ b/tools/corosync-cmapctl.c +@@ -115,7 +115,7 @@ static int print_help(void) + printf(" about the networking and IPC traffic in some detail.\n"); + printf("\n"); + printf("Clear stats:\n"); +- printf(" corosync-cmapctl -C [knet|ipc|totem|all]\n"); ++ printf(" corosync-cmapctl -C [knet|ipc|totem|schedmiss|all]\n"); + printf(" The 'stats' map is implied\n"); + printf("\n"); + printf("Load settings from a file:\n"); +@@ -849,6 +849,7 @@ int main(int argc, char *argv[]) + if (strcmp(optarg, "knet") == 0 || + strcmp(optarg, "totem") == 0 || + strcmp(optarg, "ipc") == 0 || ++ strcmp(optarg, "schedmiss") == 0 || + strcmp(optarg, "all") == 0) { + action = ACTION_CLEARSTATS; + clear_opt = optarg; +@@ -857,7 +858,7 @@ int main(int argc, char *argv[]) + map = CMAP_MAP_STATS; + } + else { +- fprintf(stderr, "argument to -C should be 'knet', 'totem', 'ipc' or 'all'\n"); ++ fprintf(stderr, "argument to -C should be 'knet', 'totem', 'ipc', 'schedmiss' or 'all'\n"); + return (EXIT_FAILURE); + } + break; +-- +1.8.3.1 + diff --git a/SOURCES/bz1791792-3-stats-Use-nanoseconds-from-epoch-for-schedmiss.patch b/SOURCES/bz1791792-3-stats-Use-nanoseconds-from-epoch-for-schedmiss.patch new file mode 100644 index 0000000..8f6f5f1 --- /dev/null +++ b/SOURCES/bz1791792-3-stats-Use-nanoseconds-from-epoch-for-schedmiss.patch @@ -0,0 +1,31 @@ +From ebd05fa00826c366922e619b012a0684c6856539 Mon Sep 17 00:00:00 2001 +From: Jan Friesse +Date: Thu, 23 Jan 2020 17:11:54 +0100 +Subject: [PATCH] stats: Use nanoseconds from epoch for schedmiss + +Using monotonic time is not working because it doesn't have to 
match +time from epoch. + +Signed-off-by: Jan Friesse +Reviewed-by: Christine Caulfield +--- + exec/main.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/exec/main.c b/exec/main.c +index fb0486e..821d97e 100644 +--- a/exec/main.c ++++ b/exec/main.c +@@ -836,7 +836,8 @@ static void timer_function_scheduler_timeout (void *data) + "(threshold is %0.4f ms). Consider token timeout increase.", + (float)tv_diff / QB_TIME_NS_IN_MSEC, (float)timeout_data->max_tv_diff / QB_TIME_NS_IN_MSEC); + +- stats_add_schedmiss_event(tv_current / 1000, (float)tv_diff / QB_TIME_NS_IN_MSEC); ++ stats_add_schedmiss_event(qb_util_nano_from_epoch_get() / QB_TIME_NS_IN_MSEC, ++ (float)tv_diff / QB_TIME_NS_IN_MSEC); + } + + /* +-- +1.8.3.1 + diff --git a/SOURCES/bz1791792-4-main-Add-schedmiss-timestamp-into-message.patch b/SOURCES/bz1791792-4-main-Add-schedmiss-timestamp-into-message.patch new file mode 100644 index 0000000..44803e2 --- /dev/null +++ b/SOURCES/bz1791792-4-main-Add-schedmiss-timestamp-into-message.patch @@ -0,0 +1,47 @@ +From 35662dd0ec53f456445c30c0ef92892f47b25aa2 Mon Sep 17 00:00:00 2001 +From: Jan Friesse +Date: Mon, 24 Feb 2020 14:58:45 +0100 +Subject: [PATCH] main: Add schedmiss timestamp into message + +This is useful for matching schedmiss event in stats map with logged +event. 
+ +Signed-off-by: Jan Friesse +Reviewed-by: Christine Caulfield +--- + exec/main.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/exec/main.c b/exec/main.c +index 821d97e..8c3df79 100644 +--- a/exec/main.c ++++ b/exec/main.c +@@ -817,6 +817,7 @@ static void timer_function_scheduler_timeout (void *data) + struct scheduler_pause_timeout_data *timeout_data = (struct scheduler_pause_timeout_data *)data; + unsigned long long tv_current; + unsigned long long tv_diff; ++ uint64_t schedmiss_event_tstamp; + + tv_current = qb_util_nano_current_get (); + +@@ -832,12 +833,14 @@ static void timer_function_scheduler_timeout (void *data) + timeout_data->tv_prev = tv_current; + + if (tv_diff > timeout_data->max_tv_diff) { +- log_printf (LOGSYS_LEVEL_WARNING, "Corosync main process was not scheduled for %0.4f ms " ++ schedmiss_event_tstamp = qb_util_nano_from_epoch_get() / QB_TIME_NS_IN_MSEC; ++ ++ log_printf (LOGSYS_LEVEL_WARNING, "Corosync main process was not scheduled (@%" PRIu64 ") for %0.4f ms " + "(threshold is %0.4f ms). 
Consider token timeout increase.", ++ schedmiss_event_tstamp, + (float)tv_diff / QB_TIME_NS_IN_MSEC, (float)timeout_data->max_tv_diff / QB_TIME_NS_IN_MSEC); + +- stats_add_schedmiss_event(qb_util_nano_from_epoch_get() / QB_TIME_NS_IN_MSEC, +- (float)tv_diff / QB_TIME_NS_IN_MSEC); ++ stats_add_schedmiss_event(schedmiss_event_tstamp, (float)tv_diff / QB_TIME_NS_IN_MSEC); + } + + /* +-- +1.8.3.1 + diff --git a/SOURCES/bz1809864-1-votequorum-Change-check-of-expected_votes.patch b/SOURCES/bz1809864-1-votequorum-Change-check-of-expected_votes.patch new file mode 100644 index 0000000..06d444e --- /dev/null +++ b/SOURCES/bz1809864-1-votequorum-Change-check-of-expected_votes.patch @@ -0,0 +1,51 @@ +From 0c16442f2d93f32a229b87d2672e2dc8025ec704 Mon Sep 17 00:00:00 2001 +From: Jan Friesse +Date: Wed, 4 Mar 2020 11:42:15 +0100 +Subject: [PATCH] votequorum: Change check of expected_votes + +Previously value of new expected_votes was checked so newly computed +quorum value was in the interval <total_votes / 2, total_votes>. The +upper range prevented the cluster to become unquorate, but bottom check +was almost useless because it allowed to change expected_votes so it is +smaller than total_votes. + +Solution is to check if expected_votes is bigger or equal to total_votes +and for quorate cluster only check if cluster doesn't become unquorate +(for unquorate cluster one can set upper range freely - as it is +perfectly possible when using config file) + +Signed-off-by: Jan Friesse +Reviewed-by: Christine Caulfield +--- + exec/votequorum.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +diff --git a/exec/votequorum.c b/exec/votequorum.c +index 52424fa..b152425 100644 +--- a/exec/votequorum.c ++++ b/exec/votequorum.c +@@ -1,5 +1,5 @@ + /* +- * Copyright (c) 2009-2015 Red Hat, Inc. ++ * Copyright (c) 2009-2020 Red Hat, Inc. + * + * All rights reserved.
+ * +@@ -2688,8 +2688,12 @@ static void message_handler_req_lib_votequorum_setexpected (void *conn, const vo + */ + newquorum = calculate_quorum(1, req_lib_votequorum_setexpected->expected_votes, &total_votes); + allow_downscale = allow_downscale_status; +- if (newquorum < total_votes / 2 || +- newquorum > total_votes) { ++ /* ++ * Setting expected_votes < total_votes doesn't make sense. ++ * For quorate cluster prevent cluster to become unquorate. ++ */ ++ if (req_lib_votequorum_setexpected->expected_votes < total_votes || ++ (cluster_is_quorate && (newquorum > total_votes))) { + error = CS_ERR_INVALID_PARAM; + goto error_exit; + } +-- +1.8.3.1 + diff --git a/SOURCES/bz1809864-2-quorumtool-exit-on-invalid-expected-votes.patch b/SOURCES/bz1809864-2-quorumtool-exit-on-invalid-expected-votes.patch new file mode 100644 index 0000000..1f4c760 --- /dev/null +++ b/SOURCES/bz1809864-2-quorumtool-exit-on-invalid-expected-votes.patch @@ -0,0 +1,33 @@ +From 5f543465bb3506b7f4929a426f1c22a9c854cecd Mon Sep 17 00:00:00 2001 +From: Jan Friesse +Date: Wed, 4 Mar 2020 08:53:41 +0100 +Subject: [PATCH] quorumtool: exit on invalid expected votes + +Signed-off-by: Jan Friesse +Reviewed-by: Christine Caulfield +--- + tools/corosync-quorumtool.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/tools/corosync-quorumtool.c b/tools/corosync-quorumtool.c +index 9bef844..44bf181 100644 +--- a/tools/corosync-quorumtool.c ++++ b/tools/corosync-quorumtool.c +@@ -1,5 +1,5 @@ + /* +- * Copyright (c) 2009-2019 Red Hat, Inc. ++ * Copyright (c) 2009-2020 Red Hat, Inc. + * + * All rights reserved. 
+ * +@@ -937,6 +937,7 @@ int main (int argc, char *argv[]) { + votes = strtol(optarg, &endptr, 0); + if ((votes == 0 && endptr == optarg) || votes <= 0) { + fprintf(stderr, "New expected votes value was not valid, try a positive number\n"); ++ exit(EXIT_FAILURE); + } else { + command_opt = CMD_SETEXPECTED; + } +-- +1.8.3.1 + diff --git a/SOURCES/bz1816653-1-votequorum-set-wfa-status-only-on-startup.patch b/SOURCES/bz1816653-1-votequorum-set-wfa-status-only-on-startup.patch new file mode 100644 index 0000000..367f44c --- /dev/null +++ b/SOURCES/bz1816653-1-votequorum-set-wfa-status-only-on-startup.patch @@ -0,0 +1,67 @@ +From ca320beac25f82c0c555799e647a47975a333c28 Mon Sep 17 00:00:00 2001 +From: Jan Friesse +Date: Tue, 10 Mar 2020 17:49:27 +0100 +Subject: [PATCH] votequorum: set wfa status only on startup + +Previously reload of configuration with enabled wait_for_all result in +set of wait_for_all_status which set cluster_is_quorate to 0 but didn't +inform the quorum service so votequorum and quorum information may get +out of sync. + +Example is 1 node cluster, which is extended to 3 nodes. Quorum service +reports cluster as a quorate (incorrect) and votequorum as not-quorate +(correct). Similar behavior happens when extending cluster in general, +but some configurations are less incorrect (3->4). + +Discussed solution was to inform quorum service but that would mean +every reload would cause loss of quorum until all nodes would be seen +again. + +Such behaviour is consistent but seems to be a bit too strict. + +Proposed solution sets wait_for_all_status only on startup and +doesn't touch it during reload. + +This solution fulfills requirement of "cluster will be quorate for +the first time only after all nodes have been visible at least +once at the same time." because node clears wait_for_all_status only +after it sees all other nodes or joins cluster which is quorate. 
It also +solves problem with extending cluster, because when cluster becomes +unquorate (1->3) wait_for_all_status is set. + +Added assert is only for ensure that I haven't missed any case when +quorate cluster may become unquorate. + +Signed-off-by: Jan Friesse +Reviewed-by: Christine Caulfield +--- + exec/votequorum.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/exec/votequorum.c b/exec/votequorum.c +index b152425..fb9f1cd 100644 +--- a/exec/votequorum.c ++++ b/exec/votequorum.c +@@ -1009,7 +1009,7 @@ static void are_we_quorate(unsigned int total_votes) + "Waiting for all cluster members. " + "Current votes: %d expected_votes: %d", + total_votes, us->expected_votes); +- cluster_is_quorate = 0; ++ assert(!cluster_is_quorate); + return; + } + update_wait_for_all_status(0); +@@ -1547,7 +1547,9 @@ static char *votequorum_readconfig(int runtime) + update_ev_barrier(us->expected_votes); + update_two_node(); + if (wait_for_all) { +- update_wait_for_all_status(1); ++ if (!runtime) { ++ update_wait_for_all_status(1); ++ } + } else if (wait_for_all_autoset && wait_for_all_status) { + /* + * Reset wait for all status for consistency when wfa is auto-unset by 2node. 
+-- +1.8.3.1 + diff --git a/SPECS/corosync.spec b/SPECS/corosync.spec index 52a04ec..17800d5 100644 --- a/SPECS/corosync.spec +++ b/SPECS/corosync.spec @@ -23,11 +23,21 @@ Name: corosync Summary: The Corosync Cluster Engine and Application Programming Interfaces Version: 3.0.3 -Release: 2%{?gitver}%{?dist} +Release: 4%{?gitver}%{?dist} License: BSD URL: http://corosync.github.io/corosync/ Source0: http://build.clusterlabs.org/corosync/releases/%{name}-%{version}%{?gittarver}.tar.gz +Patch0: bz1780137-1-votequorum-Ignore-the-icmap_get_-return-value.patch +Patch1: bz1791792-1-man-move-cmap_keys-man-page-from-section-8-to-7.patch +Patch2: bz1780137-2-votequorum-Reflect-runtime-change-of-2Node-to-WFA.patch +Patch3: bz1791792-2-stats-Add-stats-for-scheduler-misses.patch +Patch4: bz1791792-3-stats-Use-nanoseconds-from-epoch-for-schedmiss.patch +Patch5: bz1791792-4-main-Add-schedmiss-timestamp-into-message.patch +Patch6: bz1809864-1-votequorum-Change-check-of-expected_votes.patch +Patch7: bz1809864-2-quorumtool-exit-on-invalid-expected-votes.patch +Patch8: bz1816653-1-votequorum-set-wfa-status-only-on-startup.patch + %if %{with spausedd} Source1: https://github.com/jfriesse/spausedd/releases/download/%{spausedd_version}/spausedd-%{spausedd_version}.tar.gz # VMGuestLib exists only for x86_64 architecture @@ -89,6 +99,16 @@ BuildRequires: pkgconfig(vmguestlib) %setup -q -n %{name}-%{version}%{?gittarver} %endif +%patch0 -p1 -b .bz1780137-1 +%patch1 -p1 -b .bz1791792-1 +%patch2 -p1 -b .bz1780137-2 +%patch3 -p1 -b .bz1791792-2 +%patch4 -p1 -b .bz1791792-3 +%patch5 -p1 -b .bz1791792-4 +%patch6 -p1 -b .bz1809864-1 +%patch7 -p1 -b .bz1809864-2 +%patch8 -p1 -b .bz1816653-1 + %build %if %{with runautogen} ./autogen.sh @@ -254,7 +274,7 @@ fi %{_mandir}/man8/corosync-quorumtool.8* %{_mandir}/man5/corosync.conf.5* %{_mandir}/man5/votequorum.5* -%{_mandir}/man8/cmap_keys.8* +%{_mandir}/man7/cmap_keys.7* # library # @@ -387,6 +407,36 @@ fi %endif %changelog +* Tue May 26 2020 Jan 
Friesse 3.0.3-4 +- Resolves: rhbz#1780137 +- Resolves: rhbz#1791792 +- Resolves: rhbz#1809864 +- Resolves: rhbz#1816653 + +- votequorum: Ignore the icmap_get_* return value (rhbz#1780137) +- merge upstream commit cddd62f972bca276c934e58f08da84071cec1ddb (rhbz#1780137) +- man: move cmap_keys man page from section 8 to 7 (rhbz#1791792) +- merge upstream commit f1d36307e524f9440733f0b01a9fc627a0e1cac7 (rhbz#1791792) +- votequorum: Reflect runtime change of 2Node to WFA (rhbz#1780137) +- merge upstream commit 8ce65bf951bc1e5b2d64b60ea027fbdc551d4fc8 (rhbz#1780137) +- stats: Add stats for scheduler misses (rhbz#1791792) +- merge upstream commit 48b6894ef41e9a06ccbb696d062d86ef60dc2c4b (rhbz#1791792) +- stats: Use nanoseconds from epoch for schedmiss (rhbz#1791792) +- merge upstream commit ebd05fa00826c366922e619b012a0684c6856539 (rhbz#1791792) +- main: Add schedmiss timestamp into message (rhbz#1791792) +- merge upstream commit 35662dd0ec53f456445c30c0ef92892f47b25aa2 (rhbz#1791792) +- votequorum: Change check of expected_votes (rhbz#1809864) +- merge upstream commit 0c16442f2d93f32a229b87d2672e2dc8025ec704 (rhbz#1809864) +- quorumtool: exit on invalid expected votes (rhbz#1809864) +- merge upstream commit 5f543465bb3506b7f4929a426f1c22a9c854cecd (rhbz#1809864) +- votequorum: set wfa status only on startup (rhbz#1816653) +- merge upstream commit ca320beac25f82c0c555799e647a47975a333c28 (rhbz#1816653) + +* Tue Apr 28 2020 Jan Friesse - 3.0.3-3 +- Resolves: rhbz#1828295 + +- Add explicit spausedd dependency for revdeps CI test + * Mon Nov 25 2019 Jan Friesse - 3.0.3-2 - Related: rhbz#1745623