BZ 1846711 - pcp-pmda-openmetrics produces warnings querying grafana in its default configuration 0b2ef2d79 pmdaopenmetrics: add control.status metrics, de-verbosify the log, QA updates 63605e3db qa/1102: tweak openmetrics QA to be more deterministic 649a0c3a2 qa: improve _filter_pmda_remove() in common.filter commit 0b2ef2d79686d1e44901263093edeb9e1b9b5f77 Author: Mark Goodwin Date: Fri Jun 19 12:18:47 2020 +1000 pmdaopenmetrics: add control.status metrics, de-verbosify the log, QA updates Resolves: RHBZ#1846711 Add openmetrics.control.status (a string describing the status of the last fetch, per configured URL) and openmetrics.control.status_code, which is the integer response code (e.g. 200 is success) with discrete semantics. In addition, URL fetch failures are now only written to the PMDA log and systemd journal if openmetrics.control.debug is non-zero. Users can instead rely on the new status metrics, which can also be used for service availability monitoring. These metrics complement the openmetrics.control.parse_time, fetch_time and calls counters. Includes QA updates and pmdaopenmetrics(1) doc updates. 
diff --git a/qa/1321.out b/qa/1321.out index cee072cd2..4533bccd8 100644 --- a/qa/1321.out +++ b/qa/1321.out @@ -13,6 +13,8 @@ openmetrics.control.calls openmetrics.control.debug openmetrics.control.fetch_time openmetrics.control.parse_time +openmetrics.control.status +openmetrics.control.status_code openmetrics.source1.metric1 == Created URL file /var/lib/pcp/pmdas/openmetrics/config.d/source2.url @@ -22,6 +24,8 @@ openmetrics.control.calls openmetrics.control.debug openmetrics.control.fetch_time openmetrics.control.parse_time +openmetrics.control.status +openmetrics.control.status_code openmetrics.source1.metric1 openmetrics.source2.metric1 openmetrics.source2.metric2 @@ -33,6 +37,8 @@ openmetrics.control.calls openmetrics.control.debug openmetrics.control.fetch_time openmetrics.control.parse_time +openmetrics.control.status +openmetrics.control.status_code openmetrics.source1.metric1 openmetrics.source2.metric1 openmetrics.source2.metric2 @@ -47,6 +53,8 @@ openmetrics.control.calls openmetrics.control.debug openmetrics.control.fetch_time openmetrics.control.parse_time +openmetrics.control.status +openmetrics.control.status_code openmetrics.source1.metric1 openmetrics.source2.metric1 openmetrics.source2.metric2 @@ -63,6 +71,8 @@ openmetrics.control.calls openmetrics.control.debug openmetrics.control.fetch_time openmetrics.control.parse_time +openmetrics.control.status +openmetrics.control.status_code openmetrics.source1.metric1 openmetrics.source2.metric1 openmetrics.source2.metric2 diff --git a/src/pmdas/openmetrics/pmdaopenmetrics.1 b/src/pmdas/openmetrics/pmdaopenmetrics.1 index d3c7aa85f..0c92e2a11 100644 --- a/src/pmdas/openmetrics/pmdaopenmetrics.1 +++ b/src/pmdas/openmetrics/pmdaopenmetrics.1 @@ -413,10 +413,37 @@ log mandatory on 2 second { The PMDA maintains special control metrics, as described below. Apart from .BR openmetrics.control.debug , -each of these metrics is a counter and has one instance for each configured metric source. 
-The instance domain is adjusted dynamically as new sources are discovered. +each of these metrics has one instance for each configured metric source. +All of these metrics have integer values with counter semantics, except +.BR openmetrics.control.status , +which has a string value, and openmetrics.control.status_code, which is an integer with discrete semantics. +It is important to note that fetching any of the +.B openmetrics.control +metrics will only update the counters and status values if the corresponding URL is actually fetched. +If the source URL is not fetched, the control metric values do not trigger a refresh and the control +values reported represent the most recent fetch of each corresponding source. +.PP +The instance domain for the +.B openmetrics.control +metrics is adjusted dynamically as new sources are discovered. If there are no sources configured, the metric names are still defined but the instance domain will be empty and a fetch will return no values. +.IP \fBopenmetrics.control.status\fP +A string representing the status of the last fetch of the corresponding source. +This will generally be +.B success +for an HTTP response code of 200. +This metric can be used for service availability monitoring - provided, as stated above, +the corresponding source URL is fetched too. +.IP \fBopenmetrics.control.status_code\fP +This metric is similar to +.B openmetrics.control.status +except that it is the integer response code of the last fetch. +A value of +.B 200 +usually signifies success and any other value failure. +This metric can also be used for service availability monitoring, with the same caveats as +.BR openmetrics.control.status . .IP \fBopenmetrics.control.calls\fP total number of times each configured metric source has been fetched (if it's a URL) or executed (if it's a script), since the PMDA started. 
diff --git a/src/pmdas/openmetrics/pmdaopenmetrics.python b/src/pmdas/openmetrics/pmdaopenmetrics.python index a5ed22f13..1486ed676 100755 --- a/src/pmdas/openmetrics/pmdaopenmetrics.python +++ b/src/pmdas/openmetrics/pmdaopenmetrics.python @@ -1,6 +1,6 @@ #!/usr/bin/env pmpython # -# Copyright (c) 2017-2019 Red Hat. +# Copyright (c) 2017-2020 Red Hat. # Copyright (c) 2017 Ronak Jain. # # This program is free software; you can redistribute it and/or modify it @@ -704,6 +704,7 @@ class Source(object): return # fetch the document + status_code = 0 try: if self.is_scripted: # Execute file, expecting openmetrics metric data on stdout. @@ -715,6 +716,7 @@ class Source(object): self.document = open(self.url[7:], 'r').read() else: r = self.requests.get(self.url, headers=self.headers, timeout=timeout) + status_code = r.status_code r.raise_for_status() # non-200? ERROR # NB: the requests package automatically enables http keep-alive and compression self.document = r.text @@ -723,9 +725,13 @@ class Source(object): incr = int(1000 * (time.time() - fetch_time)) self.pmda.stats_fetch_time[self.cluster] += incr self.pmda.stats_fetch_time[0] += incr # total for all sources + self.pmda.stats_status[self.cluster] = "success" + self.pmda.stats_status_code[self.cluster] = status_code except Exception as e: - self.pmda.err('Warning: cannot fetch URL or execute script %s: %s' % (self.path, e)) + self.pmda.stats_status[self.cluster] = 'failed to fetch URL or execute script %s: %s' % (self.path, e) + self.pmda.stats_status_code[self.cluster] = status_code + self.pmda.debug('Warning: cannot fetch URL or execute script %s: %s' % (self.path, e)) if self.pmda.dbg else None return def refresh2(self, timeout): @@ -844,6 +850,20 @@ class OpenMetricsPMDA(PMDA): pmUnits(0, 0, 0, 0, 0, 0)), 'debug flag to enable verbose log messages, to enable: pmstore %s.control.debug 1' % self.pmda_name) + # response status string, per-source end-point + self.stats_status = {0:"none"} # status string, keyed by 
cluster number + self.add_metric('%s.control.status' % self.pmda_name, pmdaMetric(self.pmid(0, 5), + c_api.PM_TYPE_STRING, self.sources_indom, c_api.PM_SEM_INSTANT, + pmUnits(0, 0, 0, 0, 0, 0)), # no units + 'per-end-point source URL response status after the most recent fetch') + + # response status code, per-source end-point + self.stats_status_code = {0:0} # status code, keyed by cluster number + self.add_metric('%s.control.status_code' % self.pmda_name, pmdaMetric(self.pmid(0, 6), + c_api.PM_TYPE_32, self.sources_indom, c_api.PM_SEM_DISCRETE, + pmUnits(0, 0, 0, 0, 0, 0)), # no units + 'per-end-point source URL response status code after the most recent fetch') + # schedule a refresh self.set_need_refresh() @@ -961,6 +981,8 @@ class OpenMetricsPMDA(PMDA): self.stats_fetch_calls[cluster] = 0 self.stats_fetch_time[cluster] = 0 self.stats_parse_time[cluster] = 0 + self.stats_status[cluster] = "unknown" + self.stats_status_code[cluster] = 0 save_cluster_table = True self.log("Found source %s cluster %d" % (name, cluster)) @@ -996,6 +1018,10 @@ class OpenMetricsPMDA(PMDA): return [self.stats_parse_time[inst], 1] if inst in self.stats_parse_time else [c_api.PM_ERR_INST, 0] elif item == 4: # $(pmda_name).control.debug return [self.dbg, 1] + elif item == 5: # per-source status string + return [self.stats_status[inst], 1] if inst in self.stats_status else [c_api.PM_ERR_INST, 0] + elif item == 6: # per-source status code + return [self.stats_status_code[inst], 1] if inst in self.stats_status_code else [c_api.PM_ERR_INST, 0] return [c_api.PM_ERR_PMID, 0] self.assert_source_invariants(cluster=cluster) commit 63605e3db4b2821df2a6ffb21507af91d97f3a8b Author: Mark Goodwin Date: Fri Jun 19 10:02:04 2020 +1000 qa/1102: tweak openmetrics QA to be more deterministic Now that pmdaopenmetrics is Installed by default with the localhost grafana metrics URL configured, after _pmdaopenmetrics_save_config we need to _pmdaopenmetrics_remove before _pmdaopenmetrics_install to make qa/1102 
deterministic. diff --git a/qa/1102 b/qa/1102 index f573d14f4..98ff61f5e 100755 --- a/qa/1102 +++ b/qa/1102 @@ -46,6 +46,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 _stop_auto_restart pmcd _pmdaopenmetrics_save_config +_pmdaopenmetrics_remove _pmdaopenmetrics_install port=`_find_free_port 10000` diff --git a/qa/1102.out b/qa/1102.out index 5094e4a82..aa74abe44 100644 --- a/qa/1102.out +++ b/qa/1102.out @@ -1,5 +1,12 @@ QA output created by 1102 +=== remove openmetrics agent === +Culling the Performance Metrics Name Space ... +openmetrics ... done +Updating the PMCD control file, and notifying PMCD ... +[...removing files...] +Check openmetrics metrics have gone away ... OK + === openmetrics agent installation === Fetch and desc openmetrics metrics: success commit 649a0c3a2745f549b139ce1250e38a1e90308426 Author: Mark Goodwin Date: Fri Jun 19 09:55:58 2020 +1000 qa: improve _filter_pmda_remove() in common.filter Filter "Job for pmcd.service canceled" in _filter_pmda_remove. Systemd sometimes (uncommonly) prints this if a PMDA is still starting when a QA test ./Removes it. diff --git a/qa/common.filter b/qa/common.filter index a53d4a49d..b327abedc 100644 --- a/qa/common.filter +++ b/qa/common.filter @@ -760,6 +760,7 @@ _filter_pmda_remove() _filter_pmda_install | sed \ -e '/Removing files/d' \ + -e '/Job for pmcd.service canceled/d' \ -e '/Updating the PMCD control file/c\ Updating the PMCD control file, and notifying PMCD ...\ [...removing files...]'