improve logging of selection failures (RHEL-91787 RHEL-91789 RHEL-91791)

Resolves: RHEL-91787 RHEL-91789 RHEL-91791
This commit is contained in:
Miroslav Lichvar 2025-05-15 16:08:19 +02:00
parent 67f1c53a62
commit 31316144db
2 changed files with 615 additions and 0 deletions

612
chrony-logselect.patch Normal file
View File

@ -0,0 +1,612 @@
commit 620210218c4d2e9634035b3c3d01b0a329708111
Author: Miroslav Lichvar <mlichvar@redhat.com>
Date: Mon Mar 3 15:48:37 2025 +0100
sources: refactor logging of source-specific selection status
Move logging of falsetickers and system peers to the mark_source()
function. Use an array to flag already logged messages in preparation
for logging of other status. Support variable number of arguments in the
logging functions.
diff --git a/sources.c b/sources.c
index e292e4d6..bfb1c45b 100644
--- a/sources.c
+++ b/sources.c
@@ -66,7 +66,7 @@ struct SelectInfo {
/* This enum contains the flag values that are used to label
each source */
typedef enum {
- SRC_OK, /* OK so far, not a final status! */
+ SRC_OK = 0, /* OK so far, not a final status! */
SRC_UNSELECTABLE, /* Has noselect option set */
SRC_BAD_STATS, /* Doesn't have valid stats data */
SRC_UNSYNCHRONISED, /* Provides samples, but not synchronised */
@@ -144,9 +144,9 @@ struct SRC_Instance_Record {
/* Flag indicating the source has a leap second vote */
int leap_vote;
- /* Flag indicating the source was already reported as
- a falseticker since the last selection change */
- int reported_falseticker;
+ /* Flags indicating which status was already reported for
+ the source since the last change of the system peer */
+ char reported_status[SRC_SELECTED + 1];
};
/* ================================================== */
@@ -360,8 +360,8 @@ SRC_ResetInstance(SRC_Instance instance)
instance->stratum = 0;
instance->leap = LEAP_Unsynchronised;
instance->leap_vote = 0;
- instance->reported_falseticker = 0;
+ memset(instance->reported_status, 0, sizeof (instance->reported_status));
memset(&instance->sel_info, 0, sizeof (instance->sel_info));
SST_ResetInstance(instance->stats);
@@ -620,20 +620,45 @@ update_sel_options(void)
/* ================================================== */
+FORMAT_ATTRIBUTE_PRINTF(2, 3)
static void
-log_selection_message(LOG_Severity severity, const char *format, const char *arg)
+log_selection_message(LOG_Severity severity, const char *format, ...)
{
+ char buf[256];
+ va_list ap;
+
if (REF_GetMode() != REF_ModeNormal)
return;
- LOG(severity, format, arg);
+
+ va_start(ap, format);
+ vsnprintf(buf, sizeof (buf), format, ap);
+ va_end(ap);
+
+ LOG(severity, "%s", buf);
}
/* ================================================== */
+FORMAT_ATTRIBUTE_PRINTF(3, 4)
static void
-log_selection_source(LOG_Severity severity, const char *format, SRC_Instance inst)
+log_selection_source(LOG_Severity severity, SRC_Instance inst, const char *format, ...)
{
- char buf[320], *name, *ntp_name;
+ char buf[320], buf2[256], *name, *ntp_name, *s;
+ va_list ap;
+
+ if (REF_GetMode() != REF_ModeNormal)
+ return;
+
+ va_start(ap, format);
+ vsnprintf(buf2, sizeof (buf2), format, ap);
+ va_end(ap);
+
+ /* Replace ## with %s in the formatted string to be the source name */
+ s = strstr(buf2, "##");
+ if (!s || strchr(buf2, '%'))
+ return;
+ s[0] = '%';
+ s[1] = 's';
name = source_to_string(inst);
ntp_name = inst->type == SRC_NTP ? NSR_GetName(inst->ip_addr) : NULL;
@@ -643,7 +668,9 @@ log_selection_source(LOG_Severity severity, const char *format, SRC_Instance ins
else
snprintf(buf, sizeof (buf), "%s", name);
- log_selection_message(severity, format, buf);
+ LOG(severity, buf2, buf);
+
+ inst->reported_status[inst->status] = 1;
}
/* ================================================== */
@@ -686,7 +713,7 @@ source_to_string(SRC_Instance inst)
/* ================================================== */
static void
-mark_source(SRC_Instance inst, SRC_Status status)
+set_source_status(SRC_Instance inst, SRC_Status status)
{
struct timespec now;
@@ -731,6 +758,30 @@ mark_source(SRC_Instance inst, SRC_Status status)
/* ================================================== */
+static void
+mark_source(SRC_Instance inst, SRC_Status status)
+{
+ set_source_status(inst, status);
+
+ if (status < SRC_OK || status >= sizeof (inst->reported_status))
+ assert(0);
+
+ if (!inst->reported_status[status]) {
+ switch (status) {
+ case SRC_FALSETICKER:
+ log_selection_source(LOGS_WARN, inst, "Detected falseticker ##");
+ break;
+ case SRC_SELECTED:
+ log_selection_source(LOGS_INFO, inst, "Selected source ##");
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+/* ================================================== */
+
static void
mark_ok_sources(SRC_Status status)
{
@@ -739,7 +790,8 @@ mark_ok_sources(SRC_Status status)
for (i = 0; i < n_sources; i++) {
if (sources[i]->status != SRC_OK)
continue;
- mark_source(sources[i], status);
+ /* Don't log the status in this case (multiple sources at once) */
+ set_source_status(sources[i], status);
}
}
@@ -1161,7 +1213,7 @@ SRC_SelectSource(SRC_Instance updated_inst)
/* Could not even get half the reachable (trusted) sources to agree */
if (!reported_no_majority) {
- log_selection_message(LOGS_WARN, "Can't synchronise: no majority", NULL);
+ log_selection_message(LOGS_WARN, "Can't synchronise: no majority");
reported_no_majority = 1;
report_selection_loss = 0;
}
@@ -1212,10 +1264,6 @@ SRC_SelectSource(SRC_Instance updated_inst)
sel_req_source = 0;
} else {
mark_source(sources[i], SRC_FALSETICKER);
- if (!sources[i]->reported_falseticker) {
- log_selection_source(LOGS_WARN, "Detected falseticker %s", sources[i]);
- sources[i]->reported_falseticker = 1;
- }
}
}
@@ -1335,13 +1383,12 @@ SRC_SelectSource(SRC_Instance updated_inst)
}
selected_source_index = max_score_index;
- log_selection_source(LOGS_INFO, "Selected source %s", sources[selected_source_index]);
/* New source has been selected, reset all scores */
for (i = 0; i < n_sources; i++) {
sources[i]->sel_score = 1.0;
sources[i]->distant = 0;
- sources[i]->reported_falseticker = 0;
+ memset(sources[i]->reported_status, 0, sizeof (sources[i]->reported_status));
}
reported_no_majority = 0;
commit 52237b7d0da75fcd750c5d5ee80e87b97474f7fd
Author: Miroslav Lichvar <mlichvar@redhat.com>
Date: Wed Mar 5 10:45:20 2025 +0100
sources: warn about sources exceeding maxdistance or maxjitter
Log a warning message if a source is rejected in the source selecting
due to exceeding the maxdistance or maxjitter limit to make it more
obvious when synchronization is failing for this reason. Delay the
message until the reachability register is full (8 updates), or a
replacement of the source is attempted.
diff --git a/sources.c b/sources.c
index bfb1c45b..2ba595b5 100644
--- a/sources.c
+++ b/sources.c
@@ -768,6 +768,20 @@ mark_source(SRC_Instance inst, SRC_Status status)
if (!inst->reported_status[status]) {
switch (status) {
+ case SRC_BAD_DISTANCE:
+ if (inst->reachability_size < SOURCE_REACH_BITS && inst->bad < BAD_HANDLE_THRESHOLD)
+ break;
+ log_selection_source(LOGS_WARN, inst,
+ "Root distance of ## exceeds maxdistance of %.3f seconds",
+ max_distance);
+ break;
+ case SRC_JITTERY:
+ if (inst->reachability_size < SOURCE_REACH_BITS && inst->bad < BAD_HANDLE_THRESHOLD)
+ break;
+ log_selection_source(LOGS_WARN, inst,
+ "Jitter of ## exceeds maxjitter of %.3f seconds",
+ max_jitter);
+ break;
case SRC_FALSETICKER:
log_selection_source(LOGS_WARN, inst, "Detected falseticker ##");
break;
commit 1c06925d439d8699cc52df71a7a4e809b5b7348f
Author: Miroslav Lichvar <mlichvar@redhat.com>
Date: Thu Mar 6 12:46:09 2025 +0100
sources: improve no majority log message
Add the number of sources that form an agreement (overlapping
intervals), if at least two agree with each other, and number of
reachable sources to the "Can't synchronize: no majority" log message to
better explain why synchronization is failing and hint that adding more
sources might help.
diff --git a/sources.c b/sources.c
index 2ba595b5..3b1766be 100644
--- a/sources.c
+++ b/sources.c
@@ -1222,12 +1222,23 @@ SRC_SelectSource(SRC_Instance updated_inst)
assert(depth == 0 && trust_depth == 0);
assert(2 * n_sel_sources == n_endpoints);
- if ((best_trust_depth == 0 && best_depth <= n_sel_sources / 2) ||
- (best_trust_depth > 0 && best_trust_depth <= n_sel_trust_sources / 2)) {
+ if (best_trust_depth > 0) {
+ best_depth = best_trust_depth;
+ n_sel_sources = n_sel_trust_sources;
+ }
+
+ if (best_depth <= n_sel_sources / 2) {
/* Could not even get half the reachable (trusted) sources to agree */
if (!reported_no_majority) {
- log_selection_message(LOGS_WARN, "Can't synchronise: no majority");
+ if (best_depth < 2)
+ log_selection_message(LOGS_WARN, "%s (no agreement among %d %ssources)",
+ "Can't synchronise: no majority", n_sel_sources,
+ best_trust_depth > 0 ? "trusted " : "");
+ else
+ log_selection_message(LOGS_WARN, "%s (only %d of %d %ssources agree)",
+ "Can't synchronise: no majority", best_depth,
+ n_sel_sources, best_trust_depth > 0 ? "trusted " : "");
reported_no_majority = 1;
report_selection_loss = 0;
}
diff --git a/test/simulation/009-sourceselection b/test/simulation/009-sourceselection
index 547c376c..139b6aa2 100755
--- a/test/simulation/009-sourceselection
+++ b/test/simulation/009-sourceselection
@@ -24,6 +24,12 @@ for falsetickers in 3 4; do
# These check are expected to fail
check_source_selection && test_fail
check_sync && test_fail
+
+ if [ $falsetickers = 3 ]; then
+ check_log_messages "Can't synchronise: no majority (only 2 of 5 sources agree)" 1 1 || test_fail
+ else
+ check_log_messages "Can't synchronise: no majority (no agreement among 5 sources)" 1 1 || test_fail
+ fi
done
# Sources with large asymmetric delay should be excluded
diff --git a/test/simulation/148-replacement b/test/simulation/148-replacement
index d09fba6e..ee8b2e23 100755
--- a/test/simulation/148-replacement
+++ b/test/simulation/148-replacement
@@ -68,7 +68,7 @@ check_source_selection && test_fail
check_packet_interval || test_fail
check_sync || test_fail
-check_log_messages "Can't synchronise: no majority" 1 1 || test_fail
+check_log_messages "Can't synchronise: no majority (no agreement among 2 sources)" 1 1 || test_fail
check_log_messages "Detected falseticker" 0 2 || test_fail
check_log_messages "Source 192.168.123.. replaced with" 3 60 || test_fail
check_log_messages "Source 192.168.123.1 replaced with" 1 25 || test_fail
commit 2fc7fc2e227f85dcde0953d47e1815432a4a2bf7
Author: Miroslav Lichvar <mlichvar@redhat.com>
Date: Wed Mar 12 15:21:18 2025 +0100
sources: delay maxjitter/maxdistance warning messages
Avoid logging the new warning messages about exceeded maxjitter or
maxdistance when only a small number of samples is collected after the
source becomes reachable and the values are unstable. Log the messages
only when a replacement attempt is made.
diff --git a/sources.c b/sources.c
index 3b1766be..c8d96af4 100644
--- a/sources.c
+++ b/sources.c
@@ -769,14 +769,14 @@ mark_source(SRC_Instance inst, SRC_Status status)
if (!inst->reported_status[status]) {
switch (status) {
case SRC_BAD_DISTANCE:
- if (inst->reachability_size < SOURCE_REACH_BITS && inst->bad < BAD_HANDLE_THRESHOLD)
+ if (inst->bad < BAD_HANDLE_THRESHOLD)
break;
log_selection_source(LOGS_WARN, inst,
"Root distance of ## exceeds maxdistance of %.3f seconds",
max_distance);
break;
case SRC_JITTERY:
- if (inst->reachability_size < SOURCE_REACH_BITS && inst->bad < BAD_HANDLE_THRESHOLD)
+ if (inst->bad < BAD_HANDLE_THRESHOLD)
break;
log_selection_source(LOGS_WARN, inst,
"Jitter of ## exceeds maxjitter of %.3f seconds",
commit 0c7e72304aac05020cbaeb4ccab73ce0c441e1ca
Author: Miroslav Lichvar <mlichvar@redhat.com>
Date: Thu Mar 13 15:40:47 2025 +0100
sources: switch unselect_selected_source() to variable arguments
Switch the function to the full printf style, which will be needed to
log an integer.
diff --git a/sources.c b/sources.c
index c8d96af4..b43ef8dc 100644
--- a/sources.c
+++ b/sources.c
@@ -206,8 +206,7 @@ static LOG_FileID logfileid;
/* Forward prototype */
static void update_sel_options(void);
-static void unselect_selected_source(LOG_Severity severity, const char *format,
- const char *arg);
+static void unselect_selected_source(LOG_Severity severity, const char *format, ...);
static void slew_sources(struct timespec *raw, struct timespec *cooked, double dfreq,
double doffset, LCL_ChangeType change_type, void *anything);
static void add_dispersion(double dispersion, void *anything);
@@ -340,7 +339,7 @@ void SRC_DestroyInstance(SRC_Instance instance)
if (selected_source_index > dead_index)
--selected_source_index;
else if (selected_source_index == dead_index)
- unselect_selected_source(LOGS_INFO, NULL, NULL);
+ unselect_selected_source(LOGS_INFO, NULL);
SRC_SelectSource(NULL);
}
@@ -815,16 +814,24 @@ mark_ok_sources(SRC_Status status)
call providing a message or selection of another source, which resets the
report_selection_loss flag. */
+FORMAT_ATTRIBUTE_PRINTF(2, 3)
static void
-unselect_selected_source(LOG_Severity severity, const char *format, const char *arg)
+unselect_selected_source(LOG_Severity severity, const char *format, ...)
{
+ char buf[256];
+ va_list ap;
+
if (selected_source_index != INVALID_SOURCE) {
selected_source_index = INVALID_SOURCE;
report_selection_loss = 1;
}
if (report_selection_loss && format) {
- log_selection_message(severity, format, arg);
+ va_start(ap, format);
+ vsnprintf(buf, sizeof (buf), format, ap);
+ va_end(ap);
+
+ log_selection_message(severity, "%s", buf);
report_selection_loss = 0;
}
}
@@ -947,7 +954,7 @@ SRC_SelectSource(SRC_Instance updated_inst)
}
if (n_sources == 0) {
- unselect_selected_source(LOGS_INFO, "Can't synchronise: no sources", NULL);
+ unselect_selected_source(LOGS_INFO, "Can't synchronise: no sources");
return;
}
@@ -1134,7 +1141,7 @@ SRC_SelectSource(SRC_Instance updated_inst)
if (n_badstats_sources && n_sel_sources && selected_source_index == INVALID_SOURCE &&
max_sel_reach_size < SOURCE_REACH_BITS && max_sel_reach >> 1 == max_badstat_reach) {
mark_ok_sources(SRC_WAITS_STATS);
- unselect_selected_source(LOGS_INFO, NULL, NULL);
+ unselect_selected_source(LOGS_INFO, NULL);
return;
}
@@ -1148,7 +1155,7 @@ SRC_SelectSource(SRC_Instance updated_inst)
if (n_endpoints == 0) {
/* No sources provided valid endpoints */
- unselect_selected_source(LOGS_INFO, "Can't synchronise: no selectable sources", NULL);
+ unselect_selected_source(LOGS_INFO, "Can't synchronise: no selectable sources");
return;
}
@@ -1402,7 +1409,7 @@ SRC_SelectSource(SRC_Instance updated_inst)
/* Before selecting the new synchronisation source wait until the reference
can be updated */
if (sources[max_score_index]->updates == 0) {
- unselect_selected_source(LOGS_INFO, NULL, NULL);
+ unselect_selected_source(LOGS_INFO, NULL);
mark_ok_sources(SRC_WAITS_UPDATE);
return;
}
commit a8094716b0c21a037e3f21d2f3c2e45f14060b07
Author: Miroslav Lichvar <mlichvar@redhat.com>
Date: Thu Nov 14 13:41:41 2024 +0100
test: add 014-intermittent test
diff --git a/test/simulation/014-intermittent b/test/simulation/014-intermittent
new file mode 100755
index 00000000..2f018ed0
--- /dev/null
+++ b/test/simulation/014-intermittent
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+
+. ./test.common
+
+test_start "intermittent connection"
+
+# Pass packets only for 1200 seconds every 10000 seconds
+base_delay=$(cat <<-EOF | tr -d '\n'
+ (+ 1e-4
+ (* -1
+ (equal 0.1 (min (% time 10000) 1200) 1200)))
+EOF
+)
+
+time_max_limit=1e-1
+freq_max_limit=1e-2
+time_rms_limit=2e-3
+freq_rms_limit=2e-5
+limit=100000
+
+run_test || test_fail
+check_chronyd_exit || test_fail
+check_sync || test_fail
+
+check_log_messages "Can't.*no selectable sources" 9 10 || test_fail
+check_log_messages "Selected source 192.168.123.1" 9 10 || test_fail
+
+# Pass every 20th request
+base_delay=$(cat <<-EOF | tr -d '\n'
+ (+ 1e-4
+ (* -1
+ (equal 0.1 from 2)
+ (equal 0.1 (min (% (sum 1) 20) 1) 1)))
+EOF
+)
+
+time_max_limit=1e-2
+freq_max_limit=1e-4
+time_rms_limit=5e-3
+max_sync_time=22000
+
+run_test || test_fail
+check_chronyd_exit || test_fail
+check_sync || test_fail
+
+check_log_messages "Can't.*no selectable sources" 5 15 || test_fail
+check_log_messages "Selected source 192.168.123.1" 5 15 || test_fail
+
+test_pass
commit aa3660256a83768aa3fb640f6032780000c7a4fb
Author: Miroslav Lichvar <mlichvar@redhat.com>
Date: Thu Mar 13 15:55:32 2025 +0100
sources: improve no-selectable-sources log message
Include the number of unreachable sources in the "Can't synchronise: no
selectable sources" log message to provide a hint whether it might be a
networking issue.
diff --git a/sources.c b/sources.c
index b43ef8dc..406c2f86 100644
--- a/sources.c
+++ b/sources.c
@@ -937,7 +937,7 @@ SRC_SelectSource(SRC_Instance updated_inst)
struct timespec now, ref_time;
int i, j, j1, j2, index, sel_prefer, n_endpoints, n_sel_sources, sel_req_source;
int max_badstat_reach, max_badstat_reach_size, n_badstats_sources;
- int max_sel_reach, max_sel_reach_size;
+ int max_sel_reach, max_sel_reach_size, n_unreach_sources;
int depth, best_depth, trust_depth, best_trust_depth, n_sel_trust_sources;
int combined, stratum, min_stratum, max_score_index;
int orphan_stratum, orphan_source;
@@ -966,6 +966,7 @@ SRC_SelectSource(SRC_Instance updated_inst)
n_endpoints = 0;
n_sel_sources = n_sel_trust_sources = 0;
n_badstats_sources = 0;
+ n_unreach_sources = 0;
sel_req_source = 0;
max_sel_reach = max_badstat_reach = 0;
max_sel_reach_size = max_badstat_reach_size = 0;
@@ -988,6 +989,10 @@ SRC_SelectSource(SRC_Instance updated_inst)
continue;
}
+ /* Count unreachable sources for a warning message */
+ if (sources[i]->reachability == 0)
+ n_unreach_sources++;
+
si = &sources[i]->sel_info;
SST_GetSelectionData(sources[i]->stats, &now,
&si->lo_limit, &si->hi_limit, &si->root_distance,
@@ -1155,7 +1160,8 @@ SRC_SelectSource(SRC_Instance updated_inst)
if (n_endpoints == 0) {
/* No sources provided valid endpoints */
- unselect_selected_source(LOGS_INFO, "Can't synchronise: no selectable sources");
+ unselect_selected_source(LOGS_INFO, "Can't synchronise: no selectable sources"
+ " (%d unreachable sources)", n_unreach_sources);
return;
}
diff --git a/test/simulation/014-intermittent b/test/simulation/014-intermittent
index 2f018ed0..4af574b7 100755
--- a/test/simulation/014-intermittent
+++ b/test/simulation/014-intermittent
@@ -22,7 +22,7 @@ run_test || test_fail
check_chronyd_exit || test_fail
check_sync || test_fail
-check_log_messages "Can't.*no selectable sources" 9 10 || test_fail
+check_log_messages "Can't.*no selectable sources (1 unreachable" 9 10 || test_fail
check_log_messages "Selected source 192.168.123.1" 9 10 || test_fail
# Pass every 20th request
@@ -43,7 +43,7 @@ run_test || test_fail
check_chronyd_exit || test_fail
check_sync || test_fail
-check_log_messages "Can't.*no selectable sources" 5 15 || test_fail
+check_log_messages "Can't.*no selectable sources (1 unreachable" 5 15 || test_fail
check_log_messages "Selected source 192.168.123.1" 5 15 || test_fail
test_pass
commit 402d5d9eb5767797e3af150119b63a08adf43813
Author: Miroslav Lichvar <mlichvar@redhat.com>
Date: Thu Mar 13 16:03:59 2025 +0100
sources: increase severity of can't-synchronise log messages
Switch the info-level "Can't synchronise" selection messages to
warnings.
diff --git a/sources.c b/sources.c
index 406c2f86..cfe7f260 100644
--- a/sources.c
+++ b/sources.c
@@ -954,7 +954,7 @@ SRC_SelectSource(SRC_Instance updated_inst)
}
if (n_sources == 0) {
- unselect_selected_source(LOGS_INFO, "Can't synchronise: no sources");
+ unselect_selected_source(LOGS_WARN, "Can't synchronise: no sources");
return;
}
@@ -1160,7 +1160,7 @@ SRC_SelectSource(SRC_Instance updated_inst)
if (n_endpoints == 0) {
/* No sources provided valid endpoints */
- unselect_selected_source(LOGS_INFO, "Can't synchronise: no selectable sources"
+ unselect_selected_source(LOGS_WARN, "Can't synchronise: no selectable sources"
" (%d unreachable sources)", n_unreach_sources);
return;
}
@@ -1306,7 +1306,7 @@ SRC_SelectSource(SRC_Instance updated_inst)
}
if (!n_sel_sources || sel_req_source || n_sel_sources < CNF_GetMinSources()) {
- unselect_selected_source(LOGS_INFO, "Can't synchronise: %s selectable sources",
+ unselect_selected_source(LOGS_WARN, "Can't synchronise: %s selectable sources",
!n_sel_sources ? "no" :
sel_req_source ? "no required source in" : "not enough");
mark_ok_sources(SRC_WAITS_SOURCES);

View File

@ -29,6 +29,8 @@ Patch1: chrony-nm-dispatcher-dhcp.patch
Patch2: chrony-refclkreach.patch
# improve description of refresh directive
Patch3: chrony-docrefresh.patch
# improve logging of selection failures
Patch4: chrony-logselect.patch
BuildRequires: gnutls-devel libcap-devel libedit-devel pps-tools-devel
BuildRequires: gcc gcc-c++ make bison systemd gnupg2
@ -65,6 +67,7 @@ service to other computers in the network.
%patch -P 1 -p1 -b .nm-dispatcher-dhcp
%patch -P 2 -p1
%patch -P 3 -p1 -b .docrefresh
%patch -P 4 -p1
%{?gitpatch: echo %{version}-%{gitpatch} > version.txt}