From 4e32c3112a2f13a302709d72b0ae989287a48563 Mon Sep 17 00:00:00 2001
From: Jan Friesse <jfriesse@redhat.com>
Date: Mon, 29 Aug 2011 15:09:52 +0200
Subject: [PATCH] rrp: Higher threshold in passive mode for mcast

There were too many false positives with passive mode rrp when a high
number of messages was received.

This patch adds a new configurable variable,
rrp_problem_count_mcast_threshold, which defaults to 10 times
rrp_problem_count_threshold and is used as the threshold for multicast
packets in passive mode. The variable is unused in active mode.

Signed-off-by: Jan Friesse <jfriesse@redhat.com>
Reviewed by: Steven Dake <sdake@redhat.com>
(cherry picked from commit 752239eaa1edd68695a6e40bcde60471f34a02fd)
---
 exec/totemconfig.c             |   11 +++++++++++
 exec/totemrrp.c                |    6 ++++--
 exec/totemsrp.c                |    3 +++
 include/corosync/totem/totem.h |    2 ++
 man/corosync.conf.5            |    8 ++++++++
 5 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/exec/totemconfig.c b/exec/totemconfig.c
index 80ca182..f767f69 100644
--- a/exec/totemconfig.c
+++ b/exec/totemconfig.c
@@ -213,6 +213,8 @@ static void totem_volatile_config_read (
 
 	objdb_get_int (objdb,object_totem_handle, "rrp_problem_count_threshold", &totem_config->rrp_problem_count_threshold);
 
+	objdb_get_int (objdb,object_totem_handle, "rrp_problem_count_mcast_threshold", &totem_config->rrp_problem_count_mcast_threshold);
+
 	objdb_get_int (objdb,object_totem_handle, "rrp_autorecovery_check_timeout", &totem_config->rrp_autorecovery_check_timeout);
 
 	objdb_get_int (objdb,object_totem_handle, "heartbeat_failures_allowed", &totem_config->heartbeat_failures_allowed);
@@ -667,12 +669,21 @@ int totem_config_validate (
 	if (totem_config->rrp_problem_count_threshold == 0) {
 		totem_config->rrp_problem_count_threshold = RRP_PROBLEM_COUNT_THRESHOLD_DEFAULT;
 	}
+	if (totem_config->rrp_problem_count_mcast_threshold == 0) {
+		totem_config->rrp_problem_count_mcast_threshold = totem_config->rrp_problem_count_threshold * 10;
+	}
 	if (totem_config->rrp_problem_count_threshold < RRP_PROBLEM_COUNT_THRESHOLD_MIN) {
 		snprintf (local_error_reason, sizeof(local_error_reason),
 			"The RRP problem count threshold (%d problem count) may not be less then (%d problem count).",
 			totem_config->rrp_problem_count_threshold, RRP_PROBLEM_COUNT_THRESHOLD_MIN);
 		goto parse_error;
 	}
+	if (totem_config->rrp_problem_count_mcast_threshold < RRP_PROBLEM_COUNT_THRESHOLD_MIN) {
+		snprintf (local_error_reason, sizeof(local_error_reason),
+			"The RRP multicast problem count threshold (%d problem count) may not be less then (%d problem count).",
+			totem_config->rrp_problem_count_mcast_threshold, RRP_PROBLEM_COUNT_THRESHOLD_MIN);
+		goto parse_error;
+	}
 	if (totem_config->rrp_token_expired_timeout == 0) {
 		totem_config->rrp_token_expired_timeout =
 			totem_config->token_retransmit_timeout;
diff --git a/exec/totemrrp.c b/exec/totemrrp.c
index a5abb1b..616d0d5 100644
--- a/exec/totemrrp.c
+++ b/exec/totemrrp.c
@@ -890,14 +890,17 @@ static void passive_monitor (
 	unsigned int max;
 	unsigned int i;
 	unsigned int min_all, min_active;
+	unsigned int threshold;
 
 	/*
 	 * Monitor for failures
 	 */
 	if (is_token_recv_count) {
 		recv_count = passive_instance->token_recv_count;
+		threshold = rrp_instance->totem_config->rrp_problem_count_threshold;
 	} else {
 		recv_count = passive_instance->mcast_recv_count;
+		threshold = rrp_instance->totem_config->rrp_problem_count_mcast_threshold;
 	}
 
 	recv_count[iface_no] += 1;
@@ -959,8 +962,7 @@ static void passive_monitor (
 
 	for (i = 0; i < rrp_instance->interface_count; i++) {
 		if ((passive_instance->faulty[i] == 0) &&
-			(max - recv_count[i] >
-			rrp_instance->totem_config->rrp_problem_count_threshold)) {
+		    (max - recv_count[i] > threshold)) {
 			passive_instance->faulty[i] = 1;
 			poll_timer_add (rrp_instance->poll_handle,
 				rrp_instance->totem_config->rrp_autorecovery_check_timeout,
diff --git a/exec/totemsrp.c b/exec/totemsrp.c
index 40460e0..6981ac1 100644
--- a/exec/totemsrp.c
+++ b/exec/totemsrp.c
@@ -858,6 +858,9 @@ int totemsrp_initialize (
 		"RRP threshold (%d problem count)\n",
 		totem_config->rrp_problem_count_threshold);
 	log_printf (instance->totemsrp_log_level_debug,
+		"RRP multicast threshold (%d problem count)\n",
+		totem_config->rrp_problem_count_mcast_threshold);
+	log_printf (instance->totemsrp_log_level_debug,
 		"RRP automatic recovery check timeout (%d ms)\n",
 		totem_config->rrp_autorecovery_check_timeout);
 	log_printf (instance->totemsrp_log_level_debug,
diff --git a/include/corosync/totem/totem.h b/include/corosync/totem/totem.h
index f3ac9cc..4dce3b3 100644
--- a/include/corosync/totem/totem.h
+++ b/include/corosync/totem/totem.h
@@ -143,6 +143,8 @@ struct totem_config {
 
 	unsigned int rrp_problem_count_threshold;
 
+	unsigned int rrp_problem_count_mcast_threshold;
+
 	unsigned int rrp_autorecovery_check_timeout;
 
 	char rrp_mode[TOTEM_RRP_MODE_BYTES];
diff --git a/man/corosync.conf.5 b/man/corosync.conf.5
index b6f769e..78eb2bb 100644
--- a/man/corosync.conf.5
+++ b/man/corosync.conf.5
@@ -472,6 +472,14 @@ may occur.
 The default is 10 problem counts.
 
 .TP
+rrp_problem_count_mcast_threshold
+This specifies the number of times a problem is detected with multicast before
+setting the link faulty for passive rrp mode. This variable is unused in active
+rrp mode.
+
+The default is 10 times rrp_problem_count_threshold.
+
+.TP
 rrp_token_expired_timeout
 This specifies the time in milliseconds to increment the problem counter for
 the redundant ring protocol after not having received a token from all rings
-- 
1.7.1