- Resolves: rhbz#2135861
This commit is contained in:
parent
6cd1c62eb5
commit
9e8ddfdb4a
1
.gitignore
vendored
1
.gitignore
vendored
@ -47,3 +47,4 @@ corosync-1.2.7.tar.gz
|
|||||||
/corosync-3.1.3.tar.gz
|
/corosync-3.1.3.tar.gz
|
||||||
/corosync-3.1.4.tar.gz
|
/corosync-3.1.4.tar.gz
|
||||||
/corosync-3.1.5.tar.gz
|
/corosync-3.1.5.tar.gz
|
||||||
|
/corosync-3.1.7.tar.gz
|
||||||
|
@ -1,128 +0,0 @@
|
|||||||
From cdf72925db5a81e546ca8e8d7d8291ee1fc77be4 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Jan Friesse <jfriesse@redhat.com>
|
|
||||||
Date: Wed, 11 Aug 2021 17:34:05 +0200
|
|
||||||
Subject: [PATCH] totem: Add cancel_hold_on_retransmit config option
|
|
||||||
|
|
||||||
Previously, existence of retransmit messages canceled holding
|
|
||||||
of token (and never allowed representative to enter token hold
|
|
||||||
state).
|
|
||||||
|
|
||||||
This makes token rotating maximum speed and keeps processor
|
|
||||||
resending messages over and over again - overloading network
|
|
||||||
and reducing chance to successfully deliver the messages.
|
|
||||||
|
|
||||||
Also there were reports of various Antivirus / IPS / IDS which slow
|
|
||||||
down delivery of packets with certain sizes (packets bigger than token)
|
|
||||||
which makes Corosync retransmit messages over and over again.
|
|
||||||
|
|
||||||
Proposed solution is to allow representative to enter token hold
|
|
||||||
state when there are only retransmit messages. This allows network to
|
|
||||||
handle overload and/or gives Antivirus/IPS/IDS enough time to scan and
|
|
||||||
deliver packets without corosync entering "FAILED TO RECEIVE" state and
|
|
||||||
adding more load to network.
|
|
||||||
|
|
||||||
Signed-off-by: Jan Friesse <jfriesse@redhat.com>
|
|
||||||
Reviewed-by: Christine Caulfield <ccaulfie@redhat.com>
|
|
||||||
---
|
|
||||||
exec/totemconfig.c | 6 ++++++
|
|
||||||
exec/totemsrp.c | 5 +++--
|
|
||||||
include/corosync/totem/totem.h | 2 ++
|
|
||||||
man/corosync.conf.5 | 15 ++++++++++++++-
|
|
||||||
4 files changed, 25 insertions(+), 3 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/exec/totemconfig.c b/exec/totemconfig.c
|
|
||||||
index 57a1587a..46e09952 100644
|
|
||||||
--- a/exec/totemconfig.c
|
|
||||||
+++ b/exec/totemconfig.c
|
|
||||||
@@ -81,6 +81,7 @@
|
|
||||||
#define MAX_MESSAGES 17
|
|
||||||
#define MISS_COUNT_CONST 5
|
|
||||||
#define BLOCK_UNLISTED_IPS 1
|
|
||||||
+#define CANCEL_TOKEN_HOLD_ON_RETRANSMIT 0
|
|
||||||
/* This constant is not used for knet */
|
|
||||||
#define UDP_NETMTU 1500
|
|
||||||
|
|
||||||
@@ -144,6 +145,8 @@ static void *totem_get_param_by_name(struct totem_config *totem_config, const ch
|
|
||||||
return totem_config->knet_compression_model;
|
|
||||||
if (strcmp(param_name, "totem.block_unlisted_ips") == 0)
|
|
||||||
return &totem_config->block_unlisted_ips;
|
|
||||||
+ if (strcmp(param_name, "totem.cancel_token_hold_on_retransmit") == 0)
|
|
||||||
+ return &totem_config->cancel_token_hold_on_retransmit;
|
|
||||||
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
@@ -365,6 +368,9 @@ void totem_volatile_config_read (struct totem_config *totem_config, icmap_map_t
|
|
||||||
|
|
||||||
totem_volatile_config_set_boolean_value(totem_config, temp_map, "totem.block_unlisted_ips", deleted_key,
|
|
||||||
BLOCK_UNLISTED_IPS);
|
|
||||||
+
|
|
||||||
+ totem_volatile_config_set_boolean_value(totem_config, temp_map, "totem.cancel_token_hold_on_retransmit",
|
|
||||||
+ deleted_key, CANCEL_TOKEN_HOLD_ON_RETRANSMIT);
|
|
||||||
}
|
|
||||||
|
|
||||||
int totem_volatile_config_validate (
|
|
||||||
diff --git a/exec/totemsrp.c b/exec/totemsrp.c
|
|
||||||
index 949d367b..d24b11fa 100644
|
|
||||||
--- a/exec/totemsrp.c
|
|
||||||
+++ b/exec/totemsrp.c
|
|
||||||
@@ -3981,8 +3981,9 @@ static int message_handler_orf_token (
|
|
||||||
transmits_allowed = fcc_calculate (instance, token);
|
|
||||||
mcasted_retransmit = orf_token_rtr (instance, token, &transmits_allowed);
|
|
||||||
|
|
||||||
- if (instance->my_token_held == 1 &&
|
|
||||||
- (token->rtr_list_entries > 0 || mcasted_retransmit > 0)) {
|
|
||||||
+ if (instance->totem_config->cancel_token_hold_on_retransmit &&
|
|
||||||
+ instance->my_token_held == 1 &&
|
|
||||||
+ (token->rtr_list_entries > 0 || mcasted_retransmit > 0)) {
|
|
||||||
instance->my_token_held = 0;
|
|
||||||
forward_token = 1;
|
|
||||||
}
|
|
||||||
diff --git a/include/corosync/totem/totem.h b/include/corosync/totem/totem.h
|
|
||||||
index 8b166566..bdb6a15f 100644
|
|
||||||
--- a/include/corosync/totem/totem.h
|
|
||||||
+++ b/include/corosync/totem/totem.h
|
|
||||||
@@ -244,6 +244,8 @@ struct totem_config {
|
|
||||||
|
|
||||||
unsigned int block_unlisted_ips;
|
|
||||||
|
|
||||||
+ unsigned int cancel_token_hold_on_retransmit;
|
|
||||||
+
|
|
||||||
void (*totem_memb_ring_id_create_or_load) (
|
|
||||||
struct memb_ring_id *memb_ring_id,
|
|
||||||
unsigned int nodeid);
|
|
||||||
diff --git a/man/corosync.conf.5 b/man/corosync.conf.5
|
|
||||||
index 0588ad1e..a3771ea7 100644
|
|
||||||
--- a/man/corosync.conf.5
|
|
||||||
+++ b/man/corosync.conf.5
|
|
||||||
@@ -32,7 +32,7 @@
|
|
||||||
.\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
|
||||||
.\" * THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
.\" */
|
|
||||||
-.TH COROSYNC_CONF 5 2021-07-23 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual"
|
|
||||||
+.TH COROSYNC_CONF 5 2021-08-11 "corosync Man Page" "Corosync Cluster Engine Programmer's Manual"
|
|
||||||
.SH NAME
|
|
||||||
corosync.conf - corosync executive configuration file
|
|
||||||
|
|
||||||
@@ -584,6 +584,19 @@ with an old configuration.
|
|
||||||
|
|
||||||
The default value is yes.
|
|
||||||
|
|
||||||
+.TP
|
|
||||||
+cancel_token_hold_on_retransmit
|
|
||||||
+Allows Corosync to hold token by representative when there are too many
|
|
||||||
+retransmit messages. This allows network to process increased load without
|
|
||||||
+overloading it. Used mechanism is same as described for
|
|
||||||
+.B hold
|
|
||||||
+directive.
|
|
||||||
+
|
|
||||||
+Some deployments may prefer to never hold token when there are
|
|
||||||
+retransmit messages. If so, option should be set to yes.
|
|
||||||
+
|
|
||||||
+The default value is no.
|
|
||||||
+
|
|
||||||
.PP
|
|
||||||
Within the
|
|
||||||
.B logging
|
|
||||||
--
|
|
||||||
2.27.0
|
|
||||||
|
|
@ -1,109 +0,0 @@
|
|||||||
From e7a82370a7b5d3ca342d5e42e25763fa2c938739 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Jan Friesse <jfriesse@redhat.com>
|
|
||||||
Date: Tue, 26 Oct 2021 18:17:59 +0200
|
|
||||||
Subject: [PATCH] totemsrp: Switch totempg buffers at the right time
|
|
||||||
|
|
||||||
Commit 92e0f9c7bb9b4b6a0da8d64bdf3b2e47ae55b1cc added switching of
|
|
||||||
totempg buffers in sync phase. But because buffers got switched too early
|
|
||||||
there was a problem when delivering recovered messages (messages got
|
|
||||||
corrupted and/or lost). Solution is to switch buffers after recovered
|
|
||||||
messages got delivered.
|
|
||||||
|
|
||||||
I think it is worth to describe complete history with reproducers so it
|
|
||||||
doesn't get lost.
|
|
||||||
|
|
||||||
It all started with 402638929e5045ef520a7339696c687fbed0b31b (more info
|
|
||||||
about original problem is described in
|
|
||||||
https://bugzilla.redhat.com/show_bug.cgi?id=820821). This patch
|
|
||||||
solves a problem which can be reproduced with the following reproducer:
|
|
||||||
- 2 nodes
|
|
||||||
- Both nodes running corosync and testcpg
|
|
||||||
- Pause node 1 (SIGSTOP of corosync)
|
|
||||||
- On node 1, send some messages by testcpg
|
|
||||||
(it's not answering but this doesn't matter). Simply hit ENTER key
|
|
||||||
few times is enough)
|
|
||||||
- Wait till node 2 detects that node 1 left
|
|
||||||
- Unpause node 1 (SIGCONT of corosync)
|
|
||||||
|
|
||||||
and on node 1 newly mcasted cpg messages got sent before sync barrier,
|
|
||||||
so node 2 logs "Unknown node -> we will not deliver message".
|
|
||||||
|
|
||||||
Solution was to add switch of totemsrp new messages buffer.
|
|
||||||
|
|
||||||
This patch was not enough so new one
|
|
||||||
(92e0f9c7bb9b4b6a0da8d64bdf3b2e47ae55b1cc) was created. Reproducer of
|
|
||||||
problem was similar, just cpgverify was used instead of testcpg.
|
|
||||||
Occasionally when node 1 was unpaused it hang in sync phase because
|
|
||||||
there was a partial message in totempg buffers. New sync message had
|
|
||||||
different frag cont so it was thrown away and never delivered.
|
|
||||||
|
|
||||||
After many years problem was found which is solved by this patch
|
|
||||||
(original issue described in
|
|
||||||
https://github.com/corosync/corosync/issues/660).
|
|
||||||
Reproducer is more complex:
|
|
||||||
- 2 nodes
|
|
||||||
- Node 1 is rate-limited (used script on the hypervisor side):
|
|
||||||
```
|
|
||||||
iface=tapXXXX
|
|
||||||
# ~0.1MB/s in bit/s
|
|
||||||
rate=838856
|
|
||||||
# 1mb/s
|
|
||||||
burst=1048576
|
|
||||||
tc qdisc add dev $iface root handle 1: htb default 1
|
|
||||||
tc class add dev $iface parent 1: classid 1:1 htb rate ${rate}bps \
|
|
||||||
burst ${burst}b
|
|
||||||
tc qdisc add dev $iface handle ffff: ingress
|
|
||||||
tc filter add dev $iface parent ffff: prio 50 basic police rate \
|
|
||||||
${rate}bps burst ${burst}b mtu 64kb "drop"
|
|
||||||
```
|
|
||||||
- Node 2 is running corosync and cpgverify
|
|
||||||
- Node 1 keeps restarting of corosync and running cpgverify in cycle
|
|
||||||
- Console 1: while true; do corosync; sleep 20; \
|
|
||||||
kill $(pidof corosync); sleep 20; done
|
|
||||||
- Console 2: while true; do ./cpgverify;done
|
|
||||||
|
|
||||||
And from time to time (reproduced usually in less than 5 minutes)
|
|
||||||
cpgverify reports corrupted message.
|
|
||||||
|
|
||||||
Signed-off-by: Jan Friesse <jfriesse@redhat.com>
|
|
||||||
Reviewed-by: Fabio M. Di Nitto <fdinitto@redhat.com>
|
|
||||||
---
|
|
||||||
exec/totemsrp.c | 16 +++++++++++++++-
|
|
||||||
1 file changed, 15 insertions(+), 1 deletion(-)
|
|
||||||
|
|
||||||
diff --git a/exec/totemsrp.c b/exec/totemsrp.c
|
|
||||||
index d24b11fa..fd71771b 100644
|
|
||||||
--- a/exec/totemsrp.c
|
|
||||||
+++ b/exec/totemsrp.c
|
|
||||||
@@ -1989,13 +1989,27 @@ static void memb_state_operational_enter (struct totemsrp_instance *instance)
|
|
||||||
trans_memb_list_totemip, instance->my_trans_memb_entries,
|
|
||||||
left_list, instance->my_left_memb_entries,
|
|
||||||
0, 0, &instance->my_ring_id);
|
|
||||||
+ /*
|
|
||||||
+ * Switch new totemsrp messages queue. Messages sent from now on are stored
|
|
||||||
+ * in different queue so synchronization messages are delivered first. Totempg
|
|
||||||
+ * buffers will be switched later.
|
|
||||||
+ */
|
|
||||||
instance->waiting_trans_ack = 1;
|
|
||||||
- instance->totemsrp_waiting_trans_ack_cb_fn (1);
|
|
||||||
|
|
||||||
// TODO we need to filter to ensure we only deliver those
|
|
||||||
// messages which are part of instance->my_deliver_memb
|
|
||||||
messages_deliver_to_app (instance, 1, instance->old_ring_state_high_seq_received);
|
|
||||||
|
|
||||||
+ /*
|
|
||||||
+ * Switch totempg buffers. This used to be right after
|
|
||||||
+ * instance->waiting_trans_ack = 1;
|
|
||||||
+ * line. This was causing problem, because there may be not yet
|
|
||||||
+ * processed parts of messages in totempg buffers.
|
|
||||||
+ * So when buffers were switched and recovered messages
|
|
||||||
+ * got delivered it was not possible to assemble them.
|
|
||||||
+ */
|
|
||||||
+ instance->totemsrp_waiting_trans_ack_cb_fn (1);
|
|
||||||
+
|
|
||||||
instance->my_aru = aru_save;
|
|
||||||
|
|
||||||
/*
|
|
||||||
--
|
|
||||||
2.27.0
|
|
||||||
|
|
@ -1,60 +0,0 @@
|
|||||||
From 04362046c4a9d7307feb5b68341d567b7d0b94d6 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Jan Friesse <jfriesse@redhat.com>
|
|
||||||
Date: Tue, 29 Mar 2022 17:09:22 +0200
|
|
||||||
Subject: [PATCH] logrotate: Use copytruncate method by default
|
|
||||||
|
|
||||||
The reopen logrotate method has two main problems:
|
|
||||||
1. It does fail when corosync is not running (solvable by
|
|
||||||
adding "|| true")
|
|
||||||
2. If (for some reason, like SELinux) cfgtool -L fails, logrotate
|
|
||||||
fails and corosync keeps logging into old file. Added "|| true"
|
|
||||||
makes situation even worse because logrotate removes file but
|
|
||||||
corosync keeps logging into it.
|
|
||||||
|
|
||||||
Solution is to install copytruncate logrotate snip by default (and
|
|
||||||
keep reopen config file only for reference).
|
|
||||||
|
|
||||||
Signed-off-by: Jan Friesse <jfriesse@redhat.com>
|
|
||||||
Reviewed-by: Christine Caulfield <ccaulfie@redhat.com>
|
|
||||||
---
|
|
||||||
conf/logrotate/Makefile.am | 7 -------
|
|
||||||
conf/logrotate/corosync-reopen.in | 5 +++++
|
|
||||||
2 files changed, 5 insertions(+), 7 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/conf/logrotate/Makefile.am b/conf/logrotate/Makefile.am
|
|
||||||
index 4f7b7536..35efa2de 100644
|
|
||||||
--- a/conf/logrotate/Makefile.am
|
|
||||||
+++ b/conf/logrotate/Makefile.am
|
|
||||||
@@ -34,16 +34,9 @@ MAINTAINERCLEANFILES = Makefile.in
|
|
||||||
|
|
||||||
EXTRA_DIST = corosync-reopen.in corosync-copytruncate.in
|
|
||||||
|
|
||||||
-if HAVE_QB_LOG_FILE_REOPEN
|
|
||||||
-corosync: corosync-reopen.in
|
|
||||||
- $(SED) -e 's#@''LOGDIR@#${LOGDIR}#g' \
|
|
||||||
- -e 's#@''SBINDIR@#$(sbindir)#g' \
|
|
||||||
- $< > $@
|
|
||||||
-else
|
|
||||||
corosync: corosync-copytruncate.in
|
|
||||||
$(SED) -e 's#@''LOGDIR@#${LOGDIR}#g' \
|
|
||||||
$< > $@
|
|
||||||
-endif
|
|
||||||
|
|
||||||
logrotatecorosyncdir = ${LOGROTATEDIR}
|
|
||||||
logrotatecorosync_DATA = corosync
|
|
||||||
diff --git a/conf/logrotate/corosync-reopen.in b/conf/logrotate/corosync-reopen.in
|
|
||||||
index 839c5dae..730fb741 100644
|
|
||||||
--- a/conf/logrotate/corosync-reopen.in
|
|
||||||
+++ b/conf/logrotate/corosync-reopen.in
|
|
||||||
@@ -1,3 +1,8 @@
|
|
||||||
+# This logrotate method has two main problems and it's kept only for reference:
|
|
||||||
+# 1. It does fail when corosync is not running (solvable by adding "|| true")
|
|
||||||
+# 2. If (for some reason) cfgtool -L fails, logrotate fails and corosync keeps
|
|
||||||
+# logging into old file. Added "|| true" makes situation even worse
|
|
||||||
+# because logrotate removes file but corosync keeps logging into it.
|
|
||||||
@LOGDIR@/corosync.log {
|
|
||||||
missingok
|
|
||||||
compress
|
|
||||||
--
|
|
||||||
2.27.0
|
|
||||||
|
|
@ -17,16 +17,12 @@
|
|||||||
|
|
||||||
Name: corosync
|
Name: corosync
|
||||||
Summary: The Corosync Cluster Engine and Application Programming Interfaces
|
Summary: The Corosync Cluster Engine and Application Programming Interfaces
|
||||||
Version: 3.1.5
|
Version: 3.1.7
|
||||||
Release: 4%{?gitver}%{?dist}
|
Release: 1%{?gitver}%{?dist}
|
||||||
License: BSD
|
License: BSD
|
||||||
URL: http://corosync.github.io/corosync/
|
URL: http://corosync.github.io/corosync/
|
||||||
Source0: http://build.clusterlabs.org/corosync/releases/%{name}-%{version}%{?gittarver}.tar.gz
|
Source0: http://build.clusterlabs.org/corosync/releases/%{name}-%{version}%{?gittarver}.tar.gz
|
||||||
|
|
||||||
Patch0: bz2024652-1-totem-Add-cancel_hold_on_retransmit-config-option.patch
|
|
||||||
Patch1: bz2024657-1-totemsrp-Switch-totempg-buffers-at-the-right-time.patch
|
|
||||||
Patch2: bz2070623-1-logrotate-Use-copytruncate-method-by-default.patch
|
|
||||||
|
|
||||||
# Runtime bits
|
# Runtime bits
|
||||||
# The automatic dependency overridden in favor of explicit version lock
|
# The automatic dependency overridden in favor of explicit version lock
|
||||||
Requires: corosynclib%{?_isa} = %{version}-%{release}
|
Requires: corosynclib%{?_isa} = %{version}-%{release}
|
||||||
@ -76,10 +72,6 @@ BuildRequires: make
|
|||||||
%prep
|
%prep
|
||||||
%setup -q -n %{name}-%{version}%{?gittarver}
|
%setup -q -n %{name}-%{version}%{?gittarver}
|
||||||
|
|
||||||
%patch0 -p1 -b .bz2024652-1
|
|
||||||
%patch1 -p1 -b .bz2024657-1
|
|
||||||
%patch2 -p1 -b .bz2070623-1
|
|
||||||
|
|
||||||
%build
|
%build
|
||||||
%if %{with runautogen}
|
%if %{with runautogen}
|
||||||
./autogen.sh
|
./autogen.sh
|
||||||
@ -297,6 +289,11 @@ network splits)
|
|||||||
%endif
|
%endif
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Tue Nov 15 2022 Jan Friesse <jfriesse@redhat.com> - 3.1.7-1
|
||||||
|
- Resolves: rhbz#2135861
|
||||||
|
|
||||||
|
- New upstream release (rhbz#2135861)
|
||||||
|
|
||||||
* Thu Mar 31 2022 Jan Friesse <jfriesse@redhat.com> - 3.1.5-4
|
* Thu Mar 31 2022 Jan Friesse <jfriesse@redhat.com> - 3.1.5-4
|
||||||
- Resolves: rhbz#2070623
|
- Resolves: rhbz#2070623
|
||||||
|
|
||||||
|
2
sources
2
sources
@ -1 +1 @@
|
|||||||
SHA512 (corosync-3.1.5.tar.gz) = eb974a32f60c52564057ed41c1ebf31fe4332a5a082ebbd5fa2540af8fa9e8c0c42d4ef9066abcb9d7dd04c12b97cd13642289c65b5b6b65cfd30c12641ada1d
|
SHA512 (corosync-3.1.7.tar.gz) = a4d00f18a6dda07f36e77fc48f5bddff77e12a3e6ee40d9450734e281d20479b90cd0c653e255cfc46e0e42e4a0177291a3daba671e751d027e4317e601f0cd2
|
||||||
|
Loading…
Reference in New Issue
Block a user