pcp/pmda-openmetrics_removal.patch
lmchilton f5f1fcd82a Resolves: RHEL-101745
Resolves: RHEL-106772
Resolves: RHEL-79767
2025-11-19 14:00:02 -05:00

415 lines
17 KiB
Diff

commit 137de0c80681723845b880cea42d4ce7d9f0007e
Author: lmchilton <lauren.chilton26@gmail.com>
Date: Fri Sep 19 15:40:45 2025 -0400
pmdaopenmetrics: update metric removal mechanism
Update the metric removal mechanism so that metrics deleted from
existing sources are removed as well. The new mechanism first
flags all of a source's metrics as True (to be removed); then, as
metrics are seen, they are flagged as False (do not remove). At
the end of the refresh, any existing metrics still flagged True
are removed. Added code for control initialization and removal.
All QA in the pmda.openmetrics group is passing. Edited the
function old_enough_for_refresh() to allow refreshes. Previously,
len(self.metrics_by_name) was never 0 after the first pass, so
the function would always return False.
diff --git a/qa/1976 b/qa/1976
index 9700d9934..88405387a 100755
--- a/qa/1976
+++ b/qa/1976
@@ -2,7 +2,7 @@
# PCP QA Test No. 1976
# Test pmdaopenmetrics metric removal
#
-# Copyright (c) 2017, 2025 Red Hat. All Rights Reserved.
+# Copyright (c) 2025 Red Hat. All Rights Reserved.
#
seq=`basename $0`
echo "QA output created by $seq"
@@ -45,6 +45,15 @@ find $PCP_PMDAS_DIR/openmetrics/config.d -name GNU\* -exec rm -f {} ";"
_pmdaopenmetrics_install
+iam=openmetrics
+# append -R option to pmcd config
+sed < $PCP_PMCDCONF_PATH \
+ -e "/^$iam.*/s/$/ -R 1/" \
+ > $tmp.conf
+$sudo cp $tmp.conf $PCP_PMCDCONF_PATH
+$sudo systemctl restart pmcd
+sleep 2
+
if ! _pmdaopenmetrics_wait_for_metric openmetrics.thermostat
then
status=1
@@ -56,15 +65,23 @@ $sudo rm $PCP_PMDAS_DIR/openmetrics/config.d/simple_metric.url
pminfo openmetrics.simple_metric
echo
+echo "-- check control metrics disappeared --"
+pminfo -dfmt openmetrics.control.status_code
+echo
+
echo "-- source re-addition --"
# same access controls logic as above, user $PCP_USER needs to be
# able to read the file at the end of the URL
#
cp $here/openmetrics/samples/simple_metric.txt $tmp.simple_metric.txt
-echo 'file:///'$tmp.simple_metric.txt >$PCP_PMDAS_DIR/openmetrics/config.d/simple_metric.url
+echo 'file:///'$tmp.simple_metric.txt > $PCP_PMDAS_DIR/openmetrics/config.d/simple_metric.url
pminfo openmetrics.simple_metric
echo
+echo "-- check control metrics reappeared --"
+pminfo -dfmt openmetrics.control.status_code
+echo
+
echo "-- metric removal of recognized source/metric --"
$sudo rm $PCP_PMDAS_DIR/openmetrics/config.d/simple_metric.url
pminfo openmetrics.simple_metric
@@ -77,6 +94,16 @@ $sudo touch -t 197001010000 $PCP_PMDAS_DIR/openmetrics/config.d/simple_metric.ur
pminfo openmetrics.simple_metric
echo
+echo "-- metric removal by modifying source file, source persists --"
+$sudo sed -i -e "/metric2/d" $tmp'.simple_metric.txt'
+
+echo "-- sleep to allow for old_enough_for_refresh() --"
+sleep 2
+echo
+
+echo "-- metric2 removed --"
+pminfo openmetrics.simple_metric
+
_pmdaopenmetrics_remove >/dev/null 2>&1
# success, all done
diff --git a/qa/1976.out b/qa/1976.out
index e72b1984c..a0dd60721 100644
--- a/qa/1976.out
+++ b/qa/1976.out
@@ -4,10 +4,85 @@ QA output created by 1976
-- metric removal of new source/metric --
Error: openmetrics.simple_metric: Unknown metric name
+-- check control metrics disappeared --
+
+openmetrics.control.status_code PMID: 144.0.6 [per-end-point source URL response status code after the most recent fetch]
+ Data Type: 32-bit int InDom: 144.0 0x24000000
+ Semantics: discrete Units: none
+ inst [0 or "control"] value 0
+ inst [1 or "awk_scripted"] value 0
+ inst [2 or "bad_summary_nometa"] value 0
+ inst [3 or "collectd_sample"] value 0
+ inst [4 or "curl.script"] value 0
+ inst [5 or "curl_filtered"] value 0
+ inst [6 or "curl_hostname_label"] value 0
+ inst [7 or "curl_scripted"] value 0
+ inst [8 or "duplicate_hostname_label"] value 0
+ inst [9 or "good_summary_nometa"] value 0
+ inst [10 or "jenkins_monitoring_javamelody"] value 0
+ inst [11 or "jenkins_prometheus_plugin"] value 0
+ inst [12 or "labelfiltering"] value 0
+ inst [13 or "multiple.namespace.levels"] value 0
+ inst [14 or "multiple.namespace.more"] value 0
+ inst [15 or "pmwebd_3_12_2"] value 0
+ inst [16 or "problematic_strings"] value 0
+ inst [17 or "prom_exposition_formats_example1"] value 0
+ inst [18 or "python_sample_client_server"] value 0
+ inst [19 or "python_scripted"] value 0
+ inst [20 or "reordered_labels"] value 0
+ inst [21 or "sample_pmda_3_12_2"] value 0
+ inst [22 or "sample_pmda_instname_5_0_0"] value 0
+ inst [23 or "sample_pmda_pcp5_metadata"] value 0
+ inst [24 or "sample_prometheus_metrics"] value 0
+ inst [25 or "script_failed"] value 0
+ inst [26 or "sh_script_no_suffix"] value 0
+ inst [27 or "sh_scripted"] value 0
+ inst [29 or "stderr_check"] value 0
+ inst [30 or "thermostat"] value 0
+ inst [31 or "vmware_exporter"] value 0
+
-- source re-addition --
openmetrics.simple_metric.metric2
openmetrics.simple_metric.metric1
+-- check control metrics reappeared --
+
+openmetrics.control.status_code PMID: 144.0.6 [per-end-point source URL response status code after the most recent fetch]
+ Data Type: 32-bit int InDom: 144.0 0x24000000
+ Semantics: discrete Units: none
+ inst [0 or "control"] value 0
+ inst [1 or "awk_scripted"] value 0
+ inst [2 or "bad_summary_nometa"] value 0
+ inst [3 or "collectd_sample"] value 0
+ inst [4 or "curl.script"] value 0
+ inst [5 or "curl_filtered"] value 0
+ inst [6 or "curl_hostname_label"] value 0
+ inst [7 or "curl_scripted"] value 0
+ inst [8 or "duplicate_hostname_label"] value 0
+ inst [9 or "good_summary_nometa"] value 0
+ inst [10 or "jenkins_monitoring_javamelody"] value 0
+ inst [11 or "jenkins_prometheus_plugin"] value 0
+ inst [12 or "labelfiltering"] value 0
+ inst [13 or "multiple.namespace.levels"] value 0
+ inst [14 or "multiple.namespace.more"] value 0
+ inst [15 or "pmwebd_3_12_2"] value 0
+ inst [16 or "problematic_strings"] value 0
+ inst [17 or "prom_exposition_formats_example1"] value 0
+ inst [18 or "python_sample_client_server"] value 0
+ inst [19 or "python_scripted"] value 0
+ inst [20 or "reordered_labels"] value 0
+ inst [21 or "sample_pmda_3_12_2"] value 0
+ inst [22 or "sample_pmda_instname_5_0_0"] value 0
+ inst [23 or "sample_pmda_pcp5_metadata"] value 0
+ inst [24 or "sample_prometheus_metrics"] value 0
+ inst [25 or "script_failed"] value 0
+ inst [26 or "sh_script_no_suffix"] value 0
+ inst [27 or "sh_scripted"] value 0
+ inst [28 or "simple_metric"] value 0
+ inst [29 or "stderr_check"] value 0
+ inst [30 or "thermostat"] value 0
+ inst [31 or "vmware_exporter"] value 0
+
-- metric removal of recognized source/metric --
Error: openmetrics.simple_metric: Unknown metric name
@@ -15,3 +90,8 @@ Error: openmetrics.simple_metric: Unknown metric name
openmetrics.simple_metric.metric2
openmetrics.simple_metric.metric1
+-- metric removal by modifying source file, source persists --
+-- sleep to allow for old_enough_for_refresh() --
+
+-- metric2 removed --
+openmetrics.simple_metric.metric1
diff --git a/src/pmdas/openmetrics/pmdaopenmetrics.1 b/src/pmdas/openmetrics/pmdaopenmetrics.1
index 29370d1cb..e696b2f70 100644
--- a/src/pmdas/openmetrics/pmdaopenmetrics.1
+++ b/src/pmdas/openmetrics/pmdaopenmetrics.1
@@ -2,6 +2,7 @@
.\"
.\" Copyright (c) 2017-2019 Red Hat.
.\" Copyright (c) 2017 Ronak Jain.
+.\" Copyright (c) 2025 Lauren Chilton.
.\"
.\" This program is free software; you can redistribute it and/or modify it
.\" under the terms of the GNU General Public License as published by the
@@ -25,6 +26,7 @@
[\f3\-c\f1 \f2config\f1]
[\f3\-d\f1 \f2domain\f1]
[\f3\-l\f1 \f2logfile\f1]
+[\f3\-R\f1 \f2refresh_timeout\f1]
[\f3\-r\f1 \f2root\f1]
[\f3\-t\f1 \f2timeout\f1]
[\f3\-u\f1 \f2user\f1]
@@ -138,6 +140,13 @@ Use of the
.B \-r
option may also change the defaults for some other command line options,
e.g. the default log file name and the default configuration directory.
+.PP
+The
+.B \-R
+option allows the user to configure the \fIrefresh_timeout\fR,
+in seconds, between cluster refreshes. The default value is
+.B 10
+seconds.
.SH "CONFIGURATION SOURCES"
As it runs,
.B pmdaopenmetrics
diff --git a/src/pmdas/openmetrics/pmdaopenmetrics.python b/src/pmdas/openmetrics/pmdaopenmetrics.python
index 383c7c899..352f6565f 100755
--- a/src/pmdas/openmetrics/pmdaopenmetrics.python
+++ b/src/pmdas/openmetrics/pmdaopenmetrics.python
@@ -51,10 +51,6 @@ else:
# and/or scripts. See the --nosort option to turn it off.
sort_conf_list = True
-# Number of seconds to wait between poll attempts on a source that
-# we've never been able to connect to & collect a list of metrics from.
-empty_source_pmns_poll = 10.0
-
MAX_CLUSTER = 0xfff # ~ max. number of openmetrics sources
MAX_METRIC = 0x3ff # ~ max. number of metrics per source
MAX_INDOM = 0x7fffffff # coincidentally, ~ product of above
@@ -581,6 +577,8 @@ class Source(object):
self.metrics_by_name = {} # name -> Metric
self.metrics_by_num = {} # number (last component of pmid) -> Metric
+ self.metric_removal_flags = {}
+ self.metric_fullnames = {}
def helptext(self, helpline):
if helpline: # it could be None!
@@ -603,7 +601,7 @@ class Source(object):
'''
now = time.time()
last_try_age = now - self.refresh_time
- return len(self.metrics_by_name) == 0 and last_try_age > empty_source_pmns_poll
+ return len(self.metrics_by_name) == 0 or last_try_age > self.pmda.refresh_timeout
def check_filter(self, name, entrytype):
'''
@@ -689,13 +687,15 @@ class Source(object):
self.pmda.debug("included_labels '%s'" % (included_labels)) if self.pmda.dbg else None
self.pmda.debug("optional_labels '%s'" % (optional_labels)) if self.pmda.dbg else None
if sp.name in self.metrics_by_name:
- if ("openmetrics.%s.%s" % (self.name, sp.name)) not in self.pmda.all_metrics and self.name in self.pmda.re_add_list:
+ self.metric_removal_flags[sp.name] = False
+ if pcpline:
+ split = pcpline.split(" ")
+ fullname = "openmetrics.%s.%s" % (self.name, split[1])
+ else:
+ fullname = "openmetrics.%s.%s" % (self.name, sp.name.replace(":", "."))
+ self.metric_fullnames[sp.name] = fullname
+ if ("openmetrics.%s.%s" % (self.name, sp.name)) not in self.pmda.all_metrics:
# re-add metric to namespace
- if pcpline:
- split = pcpline.split(" ")
- fullname = "openmetrics.%s.%s" % (self.name, split[1])
- else:
- fullname = "openmetrics.%s.%s" % (self.name, sp.name.replace(":", "."))
help_oneline, help_text = self.helptext(helpline)
try:
obj = self.pmda.removed_metrics[fullname]
@@ -967,6 +967,9 @@ class Source(object):
if self.document is None: # error during fetch?
return
+ for metric in self.metrics_by_name:
+ self.metric_removal_flags[metric] = True
+
# parse and handle the openmetrics formatted metric data
parse_time = time.time()
s = self.parse_lines(self.document)
@@ -976,6 +979,19 @@ class Source(object):
self.pmda.stats_parse_time[self.cluster] += incr
self.pmda.stats_parse_time[0] += incr # total
+ for metric, value in self.metric_removal_flags.items():
+ remove_name = self.metric_fullnames[metric]
+ if value is True and remove_name in self.pmda.all_metrics:
+ self.pmda.debug("removing metric from existing source: %s" % metric) if self.pmda.dbg else None
+ try:
+ remove_object = self.pmda.all_metrics[remove_name]
+ self.pmda.remove_metric(remove_name, remove_object)
+ self.pmda.set_need_refresh()
+ del self.pmda.all_metrics[remove_name]
+ self.pmda.removed_metrics[remove_name] = remove_object
+ except Exception as e:
+ self.pmda.debug("cannot remove metric from existing source, see error: %s" % e) if self.pmda.dbg else None
+
# save metric & indom lookup tables changes, if any
for _, m in self.metrics_by_name.items():
try: # NB: must process whole list even if exceptions escape
@@ -1000,7 +1016,7 @@ class Source(object):
return [c_api.PM_ERR_AGAIN, 0]
class OpenMetricsPMDA(PMDA):
- def __init__(self, pmda_name, domain, config, timeout, user, debugflag, logfile):
+ def __init__(self, pmda_name, domain, config, timeout, refresh_timeout, user, debugflag, logfile):
'''
Initialize the PMDA. This can take a while for large configurations.
The openmetrics entry in pmcd.conf specifies to start up in "notready"
@@ -1020,6 +1036,9 @@ class OpenMetricsPMDA(PMDA):
# and the storable metric $(pmda_name).control.debug
self.dbg = debugflag
+ # Number of seconds to wait between poll attempts on a source
+ self.refresh_timeout = refresh_timeout
+
# now everything else may take time
self.pmda_name = pmda_name
self.config_dir = os.path.normpath(config)
@@ -1040,6 +1059,7 @@ class OpenMetricsPMDA(PMDA):
self.all_metrics = {}
# keep track of removed metrics, in case of re-addition
self.removed_metrics = {}
+ self.controls = {0:0}
# compiled regex cache
self.regex_cache = {}
@@ -1155,6 +1175,24 @@ class OpenMetricsPMDA(PMDA):
mtime = m
return mtime, ret
+ def initialize_controls(self, cluster):
+ self.stats_fetch_calls[cluster] = 0
+ self.stats_fetch_time[cluster] = 0
+ self.stats_parse_time[cluster] = 0
+ self.stats_status[cluster] = "unknown"
+ self.stats_status_code[cluster] = 0
+
+ self.controls[cluster] = 1
+
+ def delete_controls(self, cluster):
+ del self.stats_fetch_calls[cluster]
+ del self.stats_fetch_time[cluster]
+ del self.stats_parse_time[cluster]
+ del self.stats_status[cluster]
+ del self.stats_status_code[cluster]
+
+ self.controls[cluster] = 0
+
def rescan_confdir(self):
'''Scan the configuration directories for any new .url files
or scripts. Ensure there is a Source registered in the
@@ -1199,6 +1237,9 @@ class OpenMetricsPMDA(PMDA):
try:
remove_name = key
remove_obj = value
+ cluster = self.cluster_table.intern_lookup_value(split_name[1])
+ if self.controls[cluster] == 1:
+ self.delete_controls(cluster)
self.remove_metric(remove_name, remove_obj)
self.removed_metrics[remove_name] = remove_obj
self.debug("removed metric name: %s" % remove_name) if self.dbg else None
@@ -1245,6 +1286,8 @@ class OpenMetricsPMDA(PMDA):
if value == s:
cluster_for_refresh.append(key)
cluster_for_refresh_names.append(name)
+ if self.controls[key] == 0:
+ self.initialize_controls(key)
self.debug("refreshing cluster list: %s" % cluster_for_refresh_names) if self.dbg else None
self.refresh_some_clusters_for_fetch(cluster_for_refresh)
else:
@@ -1256,11 +1299,7 @@ class OpenMetricsPMDA(PMDA):
self.source_by_cluster[source.cluster] = source
# initialize statistics
- self.stats_fetch_calls[cluster] = 0
- self.stats_fetch_time[cluster] = 0
- self.stats_parse_time[cluster] = 0
- self.stats_status[cluster] = "unknown"
- self.stats_status_code[cluster] = 0
+ self.initialize_controls(cluster)
save_cluster_table = True
self.log("Found source %s cluster %d" % (name, cluster))
@@ -1583,6 +1622,11 @@ if __name__ == '__main__':
type=int,
default=2,
help='HTTP GET timeout for each end-point URL (default 2 seconds)')
+ parser.add_argument(
+ '-R', '--refresh',
+ type=int,
+ default=10,
+ help='timeout between cluster refreshes (default 10 seconds)')
parser.add_argument(
'-u', '--user',
type=str,
@@ -1602,7 +1646,7 @@ if __name__ == '__main__':
# the IPC protocol is ipc_prot="binary notready". See also pmcd(1) man page.
# The "binary notready" setting can also be manually configured in pmcd.conf.
# Default domain number is PMDA(144), see -d option.
- pmda = OpenMetricsPMDA(args.root, args.domain, args.config, args.timeout, args.user, args.debug, args.log)
+ pmda = OpenMetricsPMDA(args.root, args.domain, args.config, args.timeout, args.refresh, args.user, args.debug, args.log)
# Uncomment to force -D or use: pmstore openmetrics.control.debug 1
# pmda.dbg = True