From 4e32c3112a2f13a302709d72b0ae989287a48563 Mon Sep 17 00:00:00 2001 From: Jan Friesse Date: Mon, 29 Aug 2011 15:09:52 +0200 Subject: [PATCH] rrp: Higher threshold in passive mode for mcast There were too many false positives with passive mode rrp when a high number of messages was received. This patch adds a new configurable variable rrp_problem_count_mcast_threshold, which is by default 10 times rrp_problem_count_threshold and is used as the threshold for multicast packets in passive mode. The variable is unused in active mode. Signed-off-by: Jan Friesse Reviewed by: Steven Dake (cherry picked from commit 752239eaa1edd68695a6e40bcde60471f34a02fd) --- exec/totemconfig.c | 11 +++++++++++ exec/totemrrp.c | 6 ++++-- exec/totemsrp.c | 3 +++ include/corosync/totem/totem.h | 2 ++ man/corosync.conf.5 | 8 ++++++++ 5 files changed, 28 insertions(+), 2 deletions(-) diff --git a/exec/totemconfig.c b/exec/totemconfig.c index 80ca182..f767f69 100644 --- a/exec/totemconfig.c +++ b/exec/totemconfig.c @@ -213,6 +213,8 @@ static void totem_volatile_config_read ( objdb_get_int (objdb,object_totem_handle, "rrp_problem_count_threshold", &totem_config->rrp_problem_count_threshold); + objdb_get_int (objdb,object_totem_handle, "rrp_problem_count_mcast_threshold", &totem_config->rrp_problem_count_mcast_threshold); + objdb_get_int (objdb,object_totem_handle, "rrp_autorecovery_check_timeout", &totem_config->rrp_autorecovery_check_timeout); objdb_get_int (objdb,object_totem_handle, "heartbeat_failures_allowed", &totem_config->heartbeat_failures_allowed); @@ -667,12 +669,21 @@ int totem_config_validate ( if (totem_config->rrp_problem_count_threshold == 0) { totem_config->rrp_problem_count_threshold = RRP_PROBLEM_COUNT_THRESHOLD_DEFAULT; } + if (totem_config->rrp_problem_count_mcast_threshold == 0) { + totem_config->rrp_problem_count_mcast_threshold = totem_config->rrp_problem_count_threshold * 10; + } if (totem_config->rrp_problem_count_threshold < RRP_PROBLEM_COUNT_THRESHOLD_MIN) { snprintf 
(local_error_reason, sizeof(local_error_reason), "The RRP problem count threshold (%d problem count) may not be less then (%d problem count).", totem_config->rrp_problem_count_threshold, RRP_PROBLEM_COUNT_THRESHOLD_MIN); goto parse_error; } + if (totem_config->rrp_problem_count_mcast_threshold < RRP_PROBLEM_COUNT_THRESHOLD_MIN) { + snprintf (local_error_reason, sizeof(local_error_reason), + "The RRP multicast problem count threshold (%d problem count) may not be less then (%d problem count).", + totem_config->rrp_problem_count_mcast_threshold, RRP_PROBLEM_COUNT_THRESHOLD_MIN); + goto parse_error; + } if (totem_config->rrp_token_expired_timeout == 0) { totem_config->rrp_token_expired_timeout = totem_config->token_retransmit_timeout; diff --git a/exec/totemrrp.c b/exec/totemrrp.c index a5abb1b..616d0d5 100644 --- a/exec/totemrrp.c +++ b/exec/totemrrp.c @@ -890,14 +890,17 @@ static void passive_monitor ( unsigned int max; unsigned int i; unsigned int min_all, min_active; + unsigned int threshold; /* * Monitor for failures */ if (is_token_recv_count) { recv_count = passive_instance->token_recv_count; + threshold = rrp_instance->totem_config->rrp_problem_count_threshold; } else { recv_count = passive_instance->mcast_recv_count; + threshold = rrp_instance->totem_config->rrp_problem_count_mcast_threshold; } recv_count[iface_no] += 1; @@ -959,8 +962,7 @@ static void passive_monitor ( for (i = 0; i < rrp_instance->interface_count; i++) { if ((passive_instance->faulty[i] == 0) && - (max - recv_count[i] > - rrp_instance->totem_config->rrp_problem_count_threshold)) { + (max - recv_count[i] > threshold)) { passive_instance->faulty[i] = 1; poll_timer_add (rrp_instance->poll_handle, rrp_instance->totem_config->rrp_autorecovery_check_timeout, diff --git a/exec/totemsrp.c b/exec/totemsrp.c index 40460e0..6981ac1 100644 --- a/exec/totemsrp.c +++ b/exec/totemsrp.c @@ -858,6 +858,9 @@ int totemsrp_initialize ( "RRP threshold (%d problem count)\n", 
totem_config->rrp_problem_count_threshold); log_printf (instance->totemsrp_log_level_debug, + "RRP multicast threshold (%d problem count)\n", + totem_config->rrp_problem_count_mcast_threshold); + log_printf (instance->totemsrp_log_level_debug, "RRP automatic recovery check timeout (%d ms)\n", totem_config->rrp_autorecovery_check_timeout); log_printf (instance->totemsrp_log_level_debug, diff --git a/include/corosync/totem/totem.h b/include/corosync/totem/totem.h index f3ac9cc..4dce3b3 100644 --- a/include/corosync/totem/totem.h +++ b/include/corosync/totem/totem.h @@ -143,6 +143,8 @@ struct totem_config { unsigned int rrp_problem_count_threshold; + unsigned int rrp_problem_count_mcast_threshold; + unsigned int rrp_autorecovery_check_timeout; char rrp_mode[TOTEM_RRP_MODE_BYTES]; diff --git a/man/corosync.conf.5 b/man/corosync.conf.5 index b6f769e..78eb2bb 100644 --- a/man/corosync.conf.5 +++ b/man/corosync.conf.5 @@ -472,6 +472,14 @@ may occur. The default is 10 problem counts. .TP +rrp_problem_count_mcast_threshold +This specifies the number of times a problem is detected with multicast before +setting the link faulty for passive rrp mode. This variable is unused in active +rrp mode. + +The default is 10 times rrp_problem_count_threshold. + +.TP rrp_token_expired_timeout This specifies the time in milliseconds to increment the problem counter for the redundant ring protocol after not having received a token from all rings -- 1.7.1