From 3b84b4ccfa9e4924a5a3829d3810568dfb69bf63 Mon Sep 17 00:00:00 2001 From: Jake Hunsaker Date: Fri, 18 Mar 2022 16:25:35 -0400 Subject: [PATCH 1/2] [pacemaker] Redesign node enumeration logic It has been found that `pcs status` output is liable to change, which ends up breaking our parsing of node lists when using it on newer versions. Instead, first try to parse through `crm_mon` output, which is what `pcs status` uses under the hood, but as a stable and reliable xml format. Failing that, for example if the `--primary` node is not functioning as part of the cluster, source `/etc/corosync/corosync.conf` instead. Related: RHBZ2065805 Related: RHBZ2065811 Signed-off-by: Jake Hunsaker --- sos/collector/clusters/pacemaker.py | 110 +++++++++++++++++++--------- 1 file changed, 76 insertions(+), 34 deletions(-) diff --git a/sos/collector/clusters/pacemaker.py b/sos/collector/clusters/pacemaker.py index 55024314..49d0ce51 100644 --- a/sos/collector/clusters/pacemaker.py +++ b/sos/collector/clusters/pacemaker.py @@ -8,7 +8,11 @@ # # See the LICENSE file in the source distribution for further information. +import re + from sos.collector.clusters import Cluster +from setuptools._vendor.packaging import version +from xml.etree import ElementTree class pacemaker(Cluster): @@ -18,42 +22,80 @@ class pacemaker(Cluster): packages = ('pacemaker',) option_list = [ ('online', True, 'Collect nodes listed as online'), - ('offline', True, 'Collect nodes listed as offline') + ('offline', True, 'Collect nodes listed as offline'), + ('only-corosync', False, 'Only use corosync.conf to enumerate nodes') ] def get_nodes(self): - self.res = self.exec_primary_cmd('pcs status') - if self.res['status'] != 0: - self.log_error('Cluster status could not be determined. Is the ' - 'cluster running on this node?') - return [] - if 'node names do not match' in self.res['output']: - self.log_warn('Warning: node name mismatch reported. 
Attempts to ' - 'connect to some nodes may fail.\n') - return self.parse_pcs_output() - - def parse_pcs_output(self): - nodes = [] - if self.get_option('online'): - nodes += self.get_online_nodes() - if self.get_option('offline'): - nodes += self.get_offline_nodes() - return nodes - - def get_online_nodes(self): - for line in self.res['output'].splitlines(): - if line.startswith('Online:'): - nodes = line.split('[')[1].split(']')[0] - return [n for n in nodes.split(' ') if n] - - def get_offline_nodes(self): - offline = [] - for line in self.res['output'].splitlines(): - if line.startswith('Node') and line.endswith('(offline)'): - offline.append(line.split()[1].replace(':', '')) - if line.startswith('OFFLINE:'): - nodes = line.split('[')[1].split(']')[0] - offline.extend([n for n in nodes.split(' ') if n]) - return offline + self.nodes = [] + # try crm_mon first + try: + if not self.get_option('only-corosync'): + try: + self.get_nodes_from_crm() + except Exception as err: + self.log_warn("Falling back to sourcing corosync.conf. " + "Could not parse crm_mon output: %s" % err) + if not self.nodes: + # fallback to corosync.conf, in case the node we're inspecting + # is offline from the cluster + self.get_nodes_from_corosync() + except Exception as err: + self.log_error("Could not determine nodes from cluster: %s" % err) + + _shorts = [n for n in self.nodes if '.' not in n] + if _shorts: + self.log_warn( + "WARNING: Node addresses '%s' may not resolve locally if you " + "are not running on a node in the cluster. Try using option " + "'-c pacemaker.only-corosync' if these connections fail." + % ','.join(_shorts) + ) + return self.nodes + + def get_nodes_from_crm(self): + """ + Try to parse crm_mon output for node list and status. 
+ """ + xmlopt = '--output-as=xml' + # older pacemaker had a different option for xml output + _ver = self.exec_primary_cmd('crm_mon --version') + if _ver['status'] == 0: + cver = _ver['output'].split()[1].split('-')[0] + if not version.parse(cver) > version.parse('2.0.3'): + xmlopt = '--as-xml' + else: + return + _out = self.exec_primary_cmd( + "crm_mon --one-shot --inactive %s" % xmlopt, + need_root=True + ) + if _out['status'] == 0: + self.parse_crm_xml(_out['output']) + + def parse_crm_xml(self, xmlstring): + """ + Parse the xml output string provided by crm_mon + """ + _xml = ElementTree.fromstring(xmlstring) + nodes = _xml.find('nodes') + for node in nodes: + _node = node.attrib + if self.get_option('online') and _node['online'] == 'true': + self.nodes.append(_node['name']) + elif self.get_option('offline') and _node['online'] == 'false': + self.nodes.append(_node['name']) + + def get_nodes_from_corosync(self): + """ + As a fallback measure, read corosync.conf to get the node list. Note + that this prevents us from separating online nodes from offline nodes. + """ + self.log_warn("WARNING: unable to distinguish online nodes from " + "offline nodes when sourcing from corosync.conf") + cc = self.primary.read_file('/etc/corosync/corosync.conf') + nodes = re.findall(r'((\sring0_addr:)(.*))', cc) + for node in nodes: + self.nodes.append(node[-1].strip()) # vim: set et ts=4 sw=4 : -- 2.34.3 From 6701a7d77ecc998b018b54ecc00f9fd102ae9518 Mon Sep 17 00:00:00 2001 From: Jake Hunsaker Date: Mon, 21 Mar 2022 12:05:59 -0400 Subject: [PATCH 2/2] [clusters] Allow clusters to not add localhost to node list For most of our supported clusters, we end up needing to add the local host executing `sos collect` to the node list (unless `--no-local` is used) as that accounts for the primary node that may otherwise be left off. However, this is not helpful for clusters that may report node names as something other than resolvable names. 
In those cases, such as with pacemaker, adding the local hostname may result in duplicate collections. Add a toggle to cluster profiles via a new `strict_node_list` class attr that, if True, will skip this addition. This toggle is default `False` to preserve existing behavior, and is now enabled for `pacemaker` specifically. Related: RHBZ#2065821 Signed-off-by: Jake Hunsaker --- sos/collector/__init__.py | 3 ++- sos/collector/clusters/__init__.py | 4 ++++ sos/collector/clusters/pacemaker.py | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/sos/collector/__init__.py b/sos/collector/__init__.py index a8bb0064..d898ca34 100644 --- a/sos/collector/__init__.py +++ b/sos/collector/__init__.py @@ -1073,7 +1073,8 @@ class SoSCollector(SoSComponent): for node in self.node_list: if host == node.split('.')[0]: self.node_list.remove(node) - self.node_list.append(self.hostname) + if not self.cluster.strict_node_list: + self.node_list.append(self.hostname) self.reduce_node_list() try: _node_max = len(max(self.node_list, key=len)) diff --git a/sos/collector/clusters/__init__.py b/sos/collector/clusters/__init__.py index f3f550ad..f00677b8 100644 --- a/sos/collector/clusters/__init__.py +++ b/sos/collector/clusters/__init__.py @@ -57,6 +57,10 @@ class Cluster(): sos_plugin_options = {} sos_preset = '' cluster_name = None + # set this to True if the local host running collect should *not* be + # forcibly added to the node list. 
This can be helpful in situations where + # the host's fqdn and the name the cluster uses are different + strict_node_list = False def __init__(self, commons): self.primary = None diff --git a/sos/collector/clusters/pacemaker.py b/sos/collector/clusters/pacemaker.py index 49d0ce51..bebcb265 100644 --- a/sos/collector/clusters/pacemaker.py +++ b/sos/collector/clusters/pacemaker.py @@ -20,6 +20,7 @@ class pacemaker(Cluster): cluster_name = 'Pacemaker High Availability Cluster Manager' sos_plugins = ['pacemaker'] packages = ('pacemaker',) + strict_node_list = True option_list = [ ('online', True, 'Collect nodes listed as online'), ('offline', True, 'Collect nodes listed as offline'), -- 2.34.3