From cc60fa5ee25bffed9203a4f786256185b7fe0115 Mon Sep 17 00:00:00 2001 From: Nadia Pinaeva Date: Tue, 15 Mar 2022 11:49:57 +0100 Subject: [PATCH] Add ovs datapath and groups collection commands Add ct-zone-list command for openshift-ovn Signed-off-by: Nadia Pinaeva --- sos/report/plugins/openshift_ovn.py | 4 ++++ sos/report/plugins/openvswitch.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/sos/report/plugins/openshift_ovn.py b/sos/report/plugins/openshift_ovn.py index 168f1dd3..b4787b8e 100644 --- a/sos/report/plugins/openshift_ovn.py +++ b/sos/report/plugins/openshift_ovn.py @@ -34,6 +34,10 @@ class OpenshiftOVN(Plugin, RedHatPlugin): 'ovn-appctl -t /var/run/ovn/ovnsb_db.ctl ' + 'cluster/status OVN_Southbound'], container='ovnkube-master') + self.add_cmd_output([ + 'ovs-appctl -t /var/run/ovn/ovn-controller.*.ctl ' + + 'ct-zone-list'], + container='ovnkube-node') self.add_cmd_output([ 'ovs-appctl -t ovs-monitor-ipsec tunnels/show', 'ipsec status', diff --git a/sos/report/plugins/openvswitch.py b/sos/report/plugins/openvswitch.py index 179d1532..159b0bd2 100644 --- a/sos/report/plugins/openvswitch.py +++ b/sos/report/plugins/openvswitch.py @@ -124,6 +124,8 @@ class OpenVSwitch(Plugin): "ovs-vsctl -t 5 list interface", # Capture OVS detailed information from all the bridges "ovs-vsctl -t 5 list bridge", + # Capture OVS datapath list + "ovs-vsctl -t 5 list datapath", # Capture DPDK queue to pmd mapping "ovs-appctl dpif-netdev/pmd-rxq-show", # Capture DPDK pmd stats @@ -229,6 +231,7 @@ class OpenVSwitch(Plugin): "ovs-ofctl queue-get-config %s" % br, "ovs-ofctl queue-stats %s" % br, "ovs-ofctl show %s" % br, + "ovs-ofctl dump-groups %s" % br, ]) # Flow protocols currently supported -- 2.27.0 From af40be92f502b35fa9d39ce4d4fea7d80c367830 Mon Sep 17 00:00:00 2001 From: Nadia Pinaeva Date: Tue, 15 Mar 2022 13:09:55 +0100 Subject: [PATCH] Improve sos collect for OCP: 1. wait for sos tmp project to be deleted (just calling delete changes project state to Terminating, and running a new sos collect is not possible before this project is fully deleted) 2. use --retries flag to copy sos reports from the nodes more reliably. 
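As a minimal sketch of the wait added for the first point (method and attribute names as used in the ocp cluster profile changed below):

    # 'oc delete project' returns as soon as the namespace enters the
    # Terminating state; block until it is fully removed so that an
    # immediately following 'sos collect' run can recreate the project
    ret = self.exec_primary_cmd(
        "oc wait namespace/%s --for=delete --timeout=30s" % self.project
    )

The second point relies on the `--retries` flag of `oc cp`.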
The flag has been recently added to kubectl, and the most reliable way to check if it's available or not is to check command error output for "unknown flag" substring Signed-off-by: Nadia Pinaeva --- sos/collector/clusters/ocp.py | 5 +++++ sos/collector/transports/oc.py | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/sos/collector/clusters/ocp.py b/sos/collector/clusters/ocp.py index f1714239..9beb2f9b 100644 --- a/sos/collector/clusters/ocp.py +++ b/sos/collector/clusters/ocp.py @@ -128,6 +128,11 @@ if not ret['status'] == 0: self.log_error("Error deleting temporary project: %s" % ret['output']) + ret = self.exec_primary_cmd("oc wait namespace/%s --for=delete " + "--timeout=30s" % self.project) + if not ret['status'] == 0: + self.log_error("Error waiting for temporary project to be " + "deleted: %s" % ret['output']) # don't leave the config on a non-existing project self.exec_master_cmd("oc project default") self.project = None diff --git a/sos/collector/transports/oc.py b/sos/collector/transports/oc.py index 0fc9eee8..90a802b2 100644 --- a/sos/collector/transports/oc.py +++ b/sos/collector/transports/oc.py @@ -231,5 +231,9 @@ class OCTransport(RemoteTransport): % (self.project, self.pod_name)) def _retrieve_file(self, fname, dest): - cmd = self.run_oc("cp %s:%s %s" % (self.pod_name, fname, dest)) + # check if --retries flag is available for given version of oc + result = self.run_oc("cp --retries", stderr=True) + flags = '' if "unknown flag" in result["output"] else '--retries=5' + cmd = self.run_oc("cp %s %s:%s %s" + % (flags, self.pod_name, fname, dest)) return cmd['status'] == 0 -- 2.27.0 From 3b0676b90ff65f20eaba3062775ff72b89386ffc Mon Sep 17 00:00:00 2001 From: Jake Hunsaker Date: Tue, 22 Mar 2022 14:25:24 -0400 Subject: [PATCH] [Plugin] Allow plugins to define default command environment vars Adds the ability for plugins to define a default set of environment vars to pass to all commands executed by the plugin. This may be done either via the new `set_default_cmd_environment()` or `add_default_cmd_environment()` methods. The former will override any previously set values, whereas the latter will add/update/modify any existing values. Signed-off-by: Jake Hunsaker --- sos/report/plugins/__init__.py | 55 ++++++++++++++++++- .../plugin_tests/plugin_environment.py | 44 +++++++++++++++ .../fake_plugins/default_env_test.py | 28 ++++++++++ tests/unittests/plugin_tests.py | 15 +++++ 4 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 tests/report_tests/plugin_tests/plugin_environment.py create mode 100644 tests/test_data/fake_plugins/default_env_test.py diff --git a/sos/report/plugins/__init__.py b/sos/report/plugins/__init__.py index 336b4d22..74b4f4be 100644 --- a/sos/report/plugins/__init__.py +++ b/sos/report/plugins/__init__.py @@ -571,6 +571,7 @@ class Plugin(): self.manifest = None self.skip_files = commons['cmdlineopts'].skip_files self.skip_commands = commons['cmdlineopts'].skip_commands + self.default_environment = {} self.soslog = self.commons['soslog'] if 'soslog' in self.commons \ else logging.getLogger('sos') @@ -542,6 +542,52 @@ self.manifest.add_list('files', []) self.manifest.add_field('containers', {}) + def set_default_cmd_environment(self, env_vars): + """ + Specify a collection of environment variables that should always be + passed to commands being executed by this plugin. 
+ + :param env_vars: The environment variables and their values to set + :type env_vars: ``dict{ENV_VAR_NAME: ENV_VAR_VALUE}`` + """ + if not isinstance(env_vars, dict): + raise TypeError( + "Environment variables for Plugin must be specified by dict" + ) + self.default_environment = env_vars + self._log_debug("Default environment for all commands now set to %s" + % self.default_environment) + + def add_default_cmd_environment(self, env_vars): + """ + Add or modify a specific environment variable in the set of default + environment variables used by this Plugin. + + :param env_vars: The environment variables to add to the current + set of env vars in use + :type env_vars: ``dict`` + """ + if not isinstance(env_vars, dict): + raise TypeError("Environment variables must be added via dict") + self._log_debug("Adding %s to default environment" % env_vars) + self.default_environment.update(env_vars) + + def _get_cmd_environment(self, env=None): + """ + Get the merged set of environment variables for a command about to be + executed by this plugin. + + :returns: The set of env vars to use for a command + :rtype: ``dict`` + """ + if env is None: + return self.default_environment + if not isinstance(env, dict): + raise TypeError("Command env vars must be passed as dict") + _env = self.default_environment.copy() + _env.update(env) + return _env + def timeout_from_options(self, optname, plugoptname, default_timeout): """Returns either the default [plugin|cmd] timeout value, the value as provided on the commandline via -k plugin.[|cmd-]timeout=value, or the @@ -2258,6 +2305,8 @@ class Plugin(): _tags = list(set(_tags)) + _env = self._get_cmd_environment(env) + if chroot or self.commons['cmdlineopts'].chroot == 'always': root = self.sysroot else: @@ -2068,7 +2068,7 @@ result = sos_get_command_output( cmd, timeout=timeout, stderr=stderr, chroot=root, - chdir=runat, env=env, binary=binary, sizelimit=sizelimit, + chdir=runat, env=_env, binary=binary, sizelimit=sizelimit, poller=self.check_timeout, foreground=foreground ) @@ -2510,6 +2559,8 @@ class Plugin(): else: root = None + _env = self._get_cmd_environment(env) + if container: if self._get_container_runtime() is None: self._log_info("Cannot run cmd '%s' in container %s: no " @@ -2522,7 +2573,7 @@ class Plugin(): "container is running." % (cmd, container)) return sos_get_command_output(cmd, timeout=timeout, chroot=root, - chdir=runat, binary=binary, env=env, + chdir=runat, binary=binary, env=_env, foreground=foreground, stderr=stderr) def _add_container_file_to_manifest(self, container, path, arcpath, tags): diff --git a/tests/report_tests/plugin_tests/plugin_environment.py b/tests/report_tests/plugin_tests/plugin_environment.py new file mode 100644 index 00000000..3158437a --- /dev/null +++ b/tests/report_tests/plugin_tests/plugin_environment.py @@ -0,0 +1,44 @@ +# This file is part of the sos project: https://github.com/sosreport/sos +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions of +# version 2 of the GNU General Public License. +# +# See the LICENSE file in the source distribution for further information. 
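The test plugin added below exercises exactly this behavior. As a quick illustration of the merge semantics of the three methods added above (hypothetical values, with `p` being any Plugin instance):

    p.set_default_cmd_environment({'LANGUAGE': 'en_US', 'TZ': 'UTC'})
    p.add_default_cmd_environment({'TZ': 'EST'})  # updates the default set
    merged = p._get_cmd_environment({'LANGUAGE': 'fr_FR'})
    # per-command env vars win over the plugin-wide defaults:
    # merged == {'LANGUAGE': 'fr_FR', 'TZ': 'EST'}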
+ +import os + +from sos_tests import StageTwoReportTest + + +class PluginDefaultEnvironmentTest(StageTwoReportTest): + """ + Ensure that being able to set a default set of environment variables is + working correctly and does not leave a lingering env var on the system + + :avocado: tags=stageone + """ + + install_plugins = ['default_env_test'] + sos_cmd = '-o default_env_test' + + def test_environment_used_in_cmd(self): + self.assertFileHasContent( + 'sos_commands/default_env_test/env_var_test', + 'Does Linus play hockey?' + ) + + def test_environment_setting_logged(self): + self.assertSosLogContains( + 'Default environment for all commands now set to' + ) + + def test_environment_not_set_on_host(self): + self.assertTrue('TORVALDS' not in os.environ) + self.assertTrue('GREATESTSPORT' not in os.environ) + + def test_environment_not_captured(self): + # we should still have an empty environment file + self.assertFileCollected('environment') + self.assertFileNotHasContent('environment', 'TORVALDS') + self.assertFileNotHasContent('environment', 'GREATESTSPORT') diff --git a/tests/test_data/fake_plugins/default_env_test.py b/tests/test_data/fake_plugins/default_env_test.py new file mode 100644 index 00000000..d1d1fb78 --- /dev/null +++ b/tests/test_data/fake_plugins/default_env_test.py @@ -0,0 +1,28 @@ +# This file is part of the sos project: https://github.com/sosreport/sos +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions of +# version 2 of the GNU General Public License. +# +# See the LICENSE file in the source distribution for further information. + +from sos.report.plugins import Plugin, IndependentPlugin + + +class DefaultEnv(Plugin, IndependentPlugin): + + plugin_name = 'default_env_test' + short_desc = 'Fake plugin to test default env var handling' + + def setup(self): + self.set_default_cmd_environment({ + 'TORVALDS': 'Linus', + 'GREATESTSPORT': 'hockey' + }) + + self.add_cmd_output( + "sh -c 'echo Does '$TORVALDS' play '$GREATESTSPORT'?'", + suggest_filename='env_var_test' + ) + + self.add_env_var(['TORVALDS', 'GREATESTSPORT']) diff --git a/tests/unittests/plugin_tests.py b/tests/unittests/plugin_tests.py index 0dfa243d..e469b78e 100644 --- a/tests/unittests/plugin_tests.py +++ b/tests/unittests/plugin_tests.py @@ -305,6 +305,21 @@ class PluginTests(unittest.TestCase): p.postproc() self.assertTrue(p.did_postproc) + def test_set_default_cmd_env(self): + p = MockPlugin({ + 'sysroot': self.sysroot, + 'policy': LinuxPolicy(init=InitSystem(), probe_runtime=False), + 'cmdlineopts': MockOptions(), + 'devices': {} + }) + e = {'TORVALDS': 'Linus'} + p.set_default_cmd_environment(e) + self.assertEquals(p.default_environment, e) + add_e = {'GREATESTSPORT': 'hockey'} + p.add_default_cmd_environment(add_e) + self.assertEquals(p.default_environment['GREATESTSPORT'], 'hockey') + self.assertEquals(p.default_environment['TORVALDS'], 'Linus') + class AddCopySpecTests(unittest.TestCase): -- 2.27.0 From 1e12325efaa500d304dcbfbeeb50e72ed0f938f5 Mon Sep 17 00:00:00 2001 From: Vladislav Walek <22072258+vwalek@users.noreply.github.com> Date: Thu, 17 Mar 2022 14:10:26 -0700 Subject: [PATCH] [openshift] Adding ability to use the localhost.kubeconfig and KUBECONFIG env to use system:admin Signed-off-by: Vladislav Walek <22072258+vwalek@users.noreply.github.com> --- sos/report/plugins/openshift.py | 45 +++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git 
a/sos/report/plugins/openshift.py b/sos/report/plugins/openshift.py index 5ae38178..d643f04c 100644 --- a/sos/report/plugins/openshift.py +++ b/sos/report/plugins/openshift.py @@ -60,11 +60,18 @@ profiles = ('openshift',) packages = ('openshift-hyperkube',) + master_localhost_kubeconfig = ( + '/etc/kubernetes/static-pod-resources/' + 'kube-apiserver-certs/secrets/node-kubeconfigs/localhost.kubeconfig' + ) + option_list = [ ('token', 'admin token to allow API queries', 'fast', None), + ('kubeconfig', 'Path to a locally available kubeconfig file', + 'fast', None), ('host', 'host address to use for oc login, including port', 'fast', 'https://localhost:6443'), - ('no-oc', 'do not collect `oc` command output', 'fast', False), + ('no-oc', 'do not collect `oc` command output', 'fast', True), ('podlogs', 'collect logs from each pod', 'fast', True), ('podlogs-filter', ('limit podlogs collection to pods matching this ' 'regex'), 'fast', ''), @@ -73,6 +80,10 @@ class Openshift(Plugin, RedHatPlugin): """Check to see if we can run `oc` commands""" return self.exec_cmd('oc whoami')['status'] == 0 + def _check_localhost_kubeconfig(self): + """Check if the localhost.kubeconfig exists with system:admin user""" + return self.path_exists(self.get_option('kubeconfig')) + def _check_oc_logged_in(self): """See if we're logged in to the API service, and if not attempt to do so using provided plugin options @@ -80,8 +91,38 @@ class Openshift(Plugin, RedHatPlugin): if self._check_oc_function(): return True - # Not logged in currently, attempt to do so + if self.get_option('kubeconfig') is None: + # If the admin does not provide a kubeconfig, + # use the default localhost.kubeconfig + self.set_option( + 'kubeconfig', + self.master_localhost_kubeconfig + ) + + # Check first if we can use the localhost.kubeconfig before + # using a token. We don't want to use the 'host' option because + # we use the cluster url from the kubeconfig. Default is localhost. + if self._check_localhost_kubeconfig(): + self.set_default_cmd_environment({ + 'KUBECONFIG': self.get_option('kubeconfig') + }) + + oc_res = self.exec_cmd( + "oc login -u system:admin " + "--insecure-skip-tls-verify=True" + ) + if oc_res['status'] == 0 and self._check_oc_function(): + return True + + self._log_warn( + "The login command failed with status: %s and error: %s" + % (oc_res['status'], oc_res['output']) + ) + return False + + # If kubeconfig is not defined, check if a token is provided. token = self.get_option('token') or os.getenv('SOSOCPTOKEN', None) + if token: oc_res = self.exec_cmd("oc login %s --token=%s " "--insecure-skip-tls-verify=True" -- 2.27.0 From 3b84b4ccfa9e4924a5a3829d3810568dfb69bf63 Mon Sep 17 00:00:00 2001 From: Jake Hunsaker Date: Fri, 18 Mar 2022 16:25:35 -0400 Subject: [PATCH 1/2] [pacemaker] Redesign node enumeration logic It has been found that `pcs status` output is liable to change, which ends up breaking our parsing of node lists when using it on newer versions. Instead, first try to parse `crm_mon` output, which is what `pcs status` uses under the hood, requesting its stable and reliable xml format. Failing that, for example if the `--primary` node is not functioning as part of the cluster, source `/etc/corosync/corosync.conf` instead.
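For reference, a heavily trimmed sample of the XML this approach consumes, and how a node list falls out of it (element and attribute names vary across pacemaker releases; this sketch is illustrative only):

    from xml.etree import ElementTree

    sample = '''
    <pacemaker-result api-version="2.0">
      <nodes>
        <node name="node1.example.com" id="1" online="true"/>
        <node name="node2.example.com" id="2" online="false"/>
      </nodes>
    </pacemaker-result>
    '''
    nodes = ElementTree.fromstring(sample).find('nodes')
    online = [n.attrib['name'] for n in nodes if n.attrib['online'] == 'true']
    # online == ['node1.example.com']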
Related: RHBZ2065805 Related: RHBZ2065811 Signed-off-by: Jake Hunsaker --- sos/collector/clusters/pacemaker.py | 110 +++++++++++++++++++--------- 1 file changed, 76 insertions(+), 34 deletions(-) diff --git a/sos/collector/clusters/pacemaker.py b/sos/collector/clusters/pacemaker.py index 55024314..49d0ce51 100644 --- a/sos/collector/clusters/pacemaker.py +++ b/sos/collector/clusters/pacemaker.py @@ -8,7 +8,11 @@ # # See the LICENSE file in the source distribution for further information. +import re + from sos.collector.clusters import Cluster +from setuptools._vendor.packaging import version +from xml.etree import ElementTree class pacemaker(Cluster): @@ -18,42 +22,80 @@ class pacemaker(Cluster): packages = ('pacemaker',) option_list = [ ('online', True, 'Collect nodes listed as online'), - ('offline', True, 'Collect nodes listed as offline') + ('offline', True, 'Collect nodes listed as offline'), + ('only-corosync', False, 'Only use corosync.conf to enumerate nodes') ] def get_nodes(self): - self.res = self.exec_primary_cmd('pcs status') - if self.res['status'] != 0: - self.log_error('Cluster status could not be determined. Is the ' - 'cluster running on this node?') - return [] - if 'node names do not match' in self.res['output']: - self.log_warn('Warning: node name mismatch reported. Attempts to ' - 'connect to some nodes may fail.\n') - return self.parse_pcs_output() - - def parse_pcs_output(self): - nodes = [] - if self.get_option('online'): - nodes += self.get_online_nodes() - if self.get_option('offline'): - nodes += self.get_offline_nodes() - return nodes - - def get_online_nodes(self): - for line in self.res['output'].splitlines(): - if line.startswith('Online:'): - nodes = line.split('[')[1].split(']')[0] - return [n for n in nodes.split(' ') if n] - - def get_offline_nodes(self): - offline = [] - for line in self.res['output'].splitlines(): - if line.startswith('Node') and line.endswith('(offline)'): - offline.append(line.split()[1].replace(':', '')) - if line.startswith('OFFLINE:'): - nodes = line.split('[')[1].split(']')[0] - offline.extend([n for n in nodes.split(' ') if n]) - return offline + self.nodes = [] + # try crm_mon first + try: + if not self.get_option('only-corosync'): + try: + self.get_nodes_from_crm() + except Exception as err: + self.log_warn("Falling back to sourcing corosync.conf. " + "Could not parse crm_mon output: %s" % err) + if not self.nodes: + # fallback to corosync.conf, in case the node we're inspecting + # is offline from the cluster + self.get_nodes_from_corosync() + except Exception as err: + self.log_error("Could not determine nodes from cluster: %s" % err) + + _shorts = [n for n in self.nodes if '.' not in n] + if _shorts: + self.log_warn( + "WARNING: Node addresses '%s' may not resolve locally if you " + "are not running on a node in the cluster. Try using option " + "'-c pacemaker.only-corosync' if these connections fail." + % ','.join(_shorts) + ) + return self.nodes + + def get_nodes_from_crm(self): + """ + Try to parse crm_mon output for node list and status. 
+ """ + xmlopt = '--output-as=xml' + # older pacemaker had a different option for xml output + _ver = self.exec_primary_cmd('crm_mon --version') + if _ver['status'] == 0: + cver = _ver['output'].split()[1].split('-')[0] + if not version.parse(cver) > version.parse('2.0.3'): + xmlopt = '--as-xml' + else: + return + _out = self.exec_primary_cmd( + "crm_mon --one-shot --inactive %s" % xmlopt, + need_root=True + ) + if _out['status'] == 0: + self.parse_crm_xml(_out['output']) + + def parse_crm_xml(self, xmlstring): + """ + Parse the xml output string provided by crm_mon + """ + _xml = ElementTree.fromstring(xmlstring) + nodes = _xml.find('nodes') + for node in nodes: + _node = node.attrib + if self.get_option('online') and _node['online'] == 'true': + self.nodes.append(_node['name']) + elif self.get_option('offline') and _node['online'] == 'false': + self.nodes.append(_node['name']) + + def get_nodes_from_corosync(self): + """ + As a fallback measure, read corosync.conf to get the node list. Note + that this prevents us from separating online nodes from offline nodes. + """ + self.log_warn("WARNING: unable to distinguish online nodes from " + "offline nodes when sourcing from corosync.conf") + cc = self.primary.read_file('/etc/corosync/corosync.conf') + nodes = re.findall(r'((\sring0_addr:)(.*))', cc) + for node in nodes: + self.nodes.append(node[-1].strip()) # vim: set et ts=4 sw=4 : -- 2.27.0 From 6701a7d77ecc998b018b54ecc00f9fd102ae9518 Mon Sep 17 00:00:00 2001 From: Jake Hunsaker Date: Mon, 21 Mar 2022 12:05:59 -0400 Subject: [PATCH 2/2] [clusters] Allow clusters to not add localhost to node list For most of our supported clusters, we end up needing to add the local host executing `sos collect` to the node list (unless `--no-local` is used) as that accounts for the primary node that may otherwise be left off. However, this is not helpful for clusters that may reports node names as something other than resolveable names. In those cases, such as with pacemaker, adding the local hostname may result in duplicate collections. Add a toggle to cluster profiles via a new `strict_node_list` class attr that, if True, will skip this addition. This toggle is default `False` to preserve existing behavior, and is now enabled for `pacemaker` specifically. Related: RHBZ#2065821 Signed-off-by: Jake Hunsaker --- sos/collector/__init__.py | 3 ++- sos/collector/clusters/__init__.py | 4 ++++ sos/collector/clusters/pacemaker.py | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/sos/collector/__init__.py b/sos/collector/__init__.py index a8bb0064..d898ca34 100644 --- a/sos/collector/__init__.py +++ b/sos/collector/__init__.py @@ -1073,7 +1073,8 @@ class SoSCollector(SoSComponent): for node in self.node_list: if host == node.split('.')[0]: self.node_list.remove(node) - self.node_list.append(self.hostname) + if not self.cluster.strict_node_list: + self.node_list.append(self.hostname) self.reduce_node_list() try: _node_max = len(max(self.node_list, key=len)) diff --git a/sos/collector/clusters/__init__.py b/sos/collector/clusters/__init__.py index f3f550ad..f00677b8 100644 --- a/sos/collector/clusters/__init__.py +++ b/sos/collector/clusters/__init__.py @@ -56,6 +56,10 @@ sos_plugin_options = {} sos_preset = '' cluster_name = None + # set this to True if the local host running collect should *not* be + # forcibly added to the node list. 
This can be helpful in situations where + # the host's fqdn and the name the cluster uses are different + strict_node_list = False def __init__(self, commons): self.master = None diff --git a/sos/collector/clusters/pacemaker.py b/sos/collector/clusters/pacemaker.py index 49d0ce51..bebcb265 100644 --- a/sos/collector/clusters/pacemaker.py +++ b/sos/collector/clusters/pacemaker.py @@ -20,6 +20,7 @@ class pacemaker(Cluster): cluster_name = 'Pacemaker High Availability Cluster Manager' sos_plugins = ['pacemaker'] packages = ('pacemaker',) + strict_node_list = True option_list = [ ('online', True, 'Collect nodes listed as online'), ('offline', True, 'Collect nodes listed as offline'), -- 2.27.0 From 61765992812afb785e9552e01e3b5579118a6963 Mon Sep 17 00:00:00 2001 From: Nadia Pinaeva Date: Fri, 1 Apr 2022 12:05:36 +0200 Subject: [PATCH] Add one more container for plugin enablement Signed-off-by: Nadia Pinaeva --- sos/report/plugins/openshift_ovn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sos/report/plugins/openshift_ovn.py b/sos/report/plugins/openshift_ovn.py index b4787b8e..98522b1e 100644 --- a/sos/report/plugins/openshift_ovn.py +++ b/sos/report/plugins/openshift_ovn.py @@ -16,7 +16,7 @@ class OpenshiftOVN(Plugin, RedHatPlugin): """ short_desc = 'Openshift OVN' plugin_name = "openshift_ovn" - containers = ('ovnkube-master', 'ovn-ipsec') + containers = ('ovnkube-master', 'ovnkube-node', 'ovn-ipsec') profiles = ('openshift',) def setup(self): -- 2.27.0 From d3aa071efc85507341cf65dd61414a734654f50a Mon Sep 17 00:00:00 2001 From: Jake Hunsaker Date: Mon, 28 Mar 2022 14:47:09 -0400 Subject: [PATCH] [presets] Adjust OCP preset options Adjust the options used by the 'ocp' preset to better reflect the current collection needs and approach. This includes disabling the `cgroups` plugin due to the large amount of mostly irrelevant data captured due to the high number of containers present on OCP nodes, ensuring the `--container-runtime` option is set to `crio` to align container-based collections, disabling HTML report generation and increasing the base log size rather than blindly enabling all-logs. 
Signed-off-by: Jake Hunsaker --- sos/presets/redhat/__init__.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sos/presets/redhat/__init__.py b/sos/presets/redhat/__init__.py index 865c9b6b..0b9f6f11 100644 --- a/sos/presets/redhat/__init__.py +++ b/sos/presets/redhat/__init__.py @@ -36,10 +36,15 @@ RHOSP_OPTS = SoSOptions(plugopts=[ 'networking.timeout=600', 'networking.ethtool_namespaces=False', 'networking.namespaces=200']) RHOCP = "ocp" RHOCP_DESC = "OpenShift Container Platform by Red Hat" -RHOCP_OPTS = SoSOptions(all_logs=True, verify=True, plugopts=[ - 'networking.timeout=600', - 'networking.ethtool_namespaces=False', - 'networking.namespaces=200']) +RHOCP_OPTS = SoSOptions( + verify=True, skip_plugins=['cgroups'], container_runtime='crio', + no_report=True, log_size=100, + plugopts=[ + 'crio.timeout=600', + 'networking.timeout=600', + 'networking.ethtool_namespaces=False', + 'networking.namespaces=200' + ]) RH_CFME = "cfme" RH_CFME_DESC = "Red Hat CloudForms" -- 2.27.0 From f2b67ab820070063995689fed03492cdaa012d01 Mon Sep 17 00:00:00 2001 From: Nadia Pinaeva Date: Fri, 1 Apr 2022 17:01:35 +0200 Subject: [PATCH] Use /etc/os-release instead of /etc/redhat-release as the most compatible way to find host release Signed-off-by: Nadia Pinaeva --- sos/policies/distros/redhat.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sos/policies/distros/redhat.py b/sos/policies/distros/redhat.py index 0c72a5e4..2e117f37 100644 --- a/sos/policies/distros/redhat.py +++ b/sos/policies/distros/redhat.py @@ -40,7 +40,6 @@ class RedHatPolicy(LinuxPolicy): ('Distribution Website', 'https://www.redhat.com/'), ('Commercial Support', 'https://www.access.redhat.com/') ] - _redhat_release = '/etc/redhat-release' _tmp_dir = "/var/tmp" _in_container = False default_scl_prefix = '/opt/rh' @@ -471,7 +470,7 @@ support representative. atomic = False if ENV_HOST_SYSROOT not in os.environ: return atomic - host_release = os.environ[ENV_HOST_SYSROOT] + cls._redhat_release + host_release = os.environ[ENV_HOST_SYSROOT] + OS_RELEASE if not os.path.exists(host_release): return False try: @@ -558,7 +557,7 @@ support representative. coreos = False if ENV_HOST_SYSROOT not in os.environ: return coreos - host_release = os.environ[ENV_HOST_SYSROOT] + cls._redhat_release + host_release = os.environ[ENV_HOST_SYSROOT] + OS_RELEASE try: for line in open(host_release, 'r').read().splitlines(): coreos |= 'Red Hat Enterprise Linux CoreOS' in line -- 2.27.0 From ee0dd68199a2c9296eafe64ead5b2263c8270e4a Mon Sep 17 00:00:00 2001 From: Nadia Pinaeva Date: Wed, 6 Apr 2022 11:56:41 +0200 Subject: [PATCH] Use --force-pull-image option for pods created with oc. Set --force-pull-image=True by default; it can be turned off with --force-pull-image=False Signed-off-by: Nadia Pinaeva --- man/en/sos-collect.1 | 16 +++++++++++----- sos/collector/__init__.py | 9 +++++---- sos/collector/transports/oc.py | 2 ++ sos/options.py | 20 ++++++++++++++------ 4 files changed, 32 insertions(+), 15 deletions(-) diff --git a/man/en/sos-collect.1 b/man/en/sos-collect.1 index 9b0a5d7b..2f60332b 100644 --- a/man/en/sos-collect.1 +++ b/man/en/sos-collect.1 @@ -28,7 +28,7 @@ [\-\-no\-local] [\-\-master MASTER] [\-\-image IMAGE] - [\-\-force-pull-image] + [\-\-force-pull-image TOGGLE, --pull TOGGLE] [\-\-registry-user USER] [\-\-registry-password PASSWORD] [\-\-registry-authfile FILE] @@ -262,10 +262,16 @@ Specify an image to use for the temporary container created for collections on a containerized host, if you do not want to use the default image specified by the host's policy.
Note that this should include the registry. .TP -\fB\-\-force-pull-image\fR -Use this option to force the container runtime to pull the specified image (even -if it is the policy default image) even if the image already exists on the host. -This may be useful to update an older container image on containerized hosts. +\fB\-\-force-pull-image TOGGLE, \-\-pull TOGGLE\fR +When collecting an sos report from a containerized host, force the host to always +pull the specified image, even if that image already exists on the host. +This is useful to ensure that the latest version of that image is always in use. +Disabling this option will use whatever version of the image is present on the node, +and only attempt a pull if there is no copy of the image present at all. + +Enable with true/on/yes or disable with false/off/no + +Default: true .TP \fB\-\-registry-user USER\fR Specify the username to authenticate to the registry with in order to pull the container diff --git a/sos/collector/__init__.py b/sos/collector/__init__.py index d898ca34..66c3d932 100644 --- a/sos/collector/__init__.py +++ b/sos/collector/__init__.py @@ -27,7 +27,7 @@ from textwrap import fill from sos.cleaner import SoSCleaner from sos.collector.sosnode import SosNode -from sos.options import ClusterOption +from sos.options import ClusterOption, str_to_bool from sos.component import SoSComponent from sos import __version__ @@ -85,7 +85,7 @@ class SoSCollector(SoSComponent): 'encrypt_pass': '', 'group': None, 'image': '', - 'force_pull_image': False, + 'force_pull_image': True, 'jobs': 4, 'keywords': [], 'keyword_file': None, @@ -357,8 +357,9 @@ class SoSCollector(SoSComponent): collect_grp.add_argument('--image', help=('Specify the container image to use for' ' containerized hosts.')) - collect_grp.add_argument('--force-pull-image', '--pull', default=False, - action='store_true', + collect_grp.add_argument('--force-pull-image', '--pull', + default=True, choices=(True, False), + type=str_to_bool, help='Force pull the container image even if ' 'it already exists on the host') collect_grp.add_argument('--registry-user', default=None, diff --git a/sos/collector/transports/oc.py b/sos/collector/transports/oc.py index 90a802b2..8f6aa9b4 100644 --- a/sos/collector/transports/oc.py +++ b/sos/collector/transports/oc.py @@ -147,6 +147,8 @@ class OCTransport(RemoteTransport): "tty": True } ], + "imagePullPolicy": + "Always" if self.opts.force_pull_image else "IfNotPresent", "restartPolicy": "Never", "nodeName": self.address, "hostNetwork": True, diff --git a/sos/options.py b/sos/options.py index 4846a509..2d5a5135 100644 --- a/sos/options.py +++ b/sos/options.py @@ -18,6 +18,16 @@ def _is_seq(val): return val_type is list or val_type is tuple +def str_to_bool(val): + _val = val.lower() + if _val in ['true', 'on', 'yes']: + return True + elif _val in ['false', 'off', 'no']: + return False + else: + return None + + class SoSOptions(): def _merge_opt(self, opt, src, is_default): @@ -153,15 +163,13 @@ class SoSOptions(): if isinstance(self.arg_defaults[key], list): return [v for v in val.split(',')] if isinstance(self.arg_defaults[key], bool): - _val = val.lower() - if _val in ['true', 'on', 'yes']: - return True - elif _val in ['false', 'off', 'no']: - return False - else: + val = str_to_bool(val) + if val is None: raise Exception( "Value of '%s' in %s must be True or False or analagous" % (key, conf)) + else: + return val if isinstance(self.arg_defaults[key], int): try: return int(val) -- 2.27.0 From ade857c82926bb7de2c45232f00a3f346db4e59a 
Mon Sep 17 00:00:00 2001 From: Jake Hunsaker Date: Wed, 3 Nov 2021 16:28:10 -0400 Subject: [PATCH] [collect] Update docstrings for collect for help command Updates the docstrings for collect components for use by the new `help` command, including minor formatting changes to make the help output more human-friendly. This includes docstring updates and changes to transports and cluster profiles. Signed-off-by: Jake Hunsaker --- sos/collector/clusters/jbon.py | 13 +++++--- sos/collector/clusters/kubernetes.py | 7 ++++- sos/collector/clusters/ocp.py | 35 ++++++++++++++++++++- sos/collector/clusters/ovirt.py | 27 ++++++++++++++++ sos/collector/clusters/satellite.py | 9 +++++- sos/collector/transports/control_persist.py | 12 +++++-- sos/collector/transports/local.py | 7 +++-- sos/collector/transports/oc.py | 20 ++++++++++-- 8 files changed, 114 insertions(+), 16 deletions(-) diff --git a/sos/collector/clusters/jbon.py b/sos/collector/clusters/jbon.py index 8f083ac6..219e1901 100644 --- a/sos/collector/clusters/jbon.py +++ b/sos/collector/clusters/jbon.py @@ -12,11 +12,14 @@ from sos.collector.clusters import Cluster class jbon(Cluster): - '''Just a Bunch of Nodes - - Used when --cluster-type=none (or jbon), to avoid cluster checks, and just - use the provided --nodes list - ''' + """ + Used when --cluster-type=none (or jbon) to avoid cluster checks, and just + use the provided --nodes list. + + Using this profile will skip any and all operations that a cluster profile + normally performs, and will not set any plugins, plugin options, or presets + for the sos report generated on the nodes provided by --nodes. + """ cluster_name = 'Just a Bunch Of Nodes (no cluster)' packages = None diff --git a/sos/collector/clusters/kubernetes.py b/sos/collector/clusters/kubernetes.py index 99f788dc..04752977 100644 --- a/sos/collector/clusters/kubernetes.py +++ b/sos/collector/clusters/kubernetes.py @@ -13,7 +13,12 @@ from sos.collector.clusters import Cluster class kubernetes(Cluster): - + """ + The kubernetes cluster profile is intended to be used on kubernetes + clusters built from the upstream/source kubernetes (k8s) project. It is + not intended for use with other projects or platforms that are built on + top of kubernetes. + """ cluster_name = 'Community Kubernetes' packages = ('kubernetes-master',) sos_plugins = ['kubernetes'] diff --git a/sos/collector/clusters/ocp.py b/sos/collector/clusters/ocp.py index ae93ad58..f1714239 100644 --- a/sos/collector/clusters/ocp.py +++ b/sos/collector/clusters/ocp.py @@ -16,7 +16,40 @@ from sos.utilities import is_executable class ocp(Cluster): - """OpenShift Container Platform v4""" + """ + This profile is for use with OpenShift Container Platform (v4) clusters + instead of the kubernetes profile. + + This profile will favor using the `oc` transport type, which means it will + leverage a locally installed `oc` binary. This is also how node enumeration + is done. To instead use SSH to connect to the nodes, use the + '--transport=control_persist' option. + + Thus, a functional `oc` binary for the user executing sos collect is + required, where 'functional' means that the user can run `oc` commands + with clusterAdmin privileges. + + If this requires the use of a secondary configuration file, specify that + path with the 'kubeconfig' cluster option. + + Alternatively, provide a clusterAdmin access token either via the 'token' + cluster option or, preferably, the SOSOCPTOKEN environment variable.
+ + By default, this profile will enumerate only master nodes within the + cluster, and this may be changed by overriding the 'role' cluster option. + To collect from all nodes in the cluster regardless of role, use the form + -c ocp.role=''. + + Filtering nodes by a label applied to that node is also possible via the + label cluster option, though be aware that this is _combined_ with the role + option mentioned above. + + To avoid redundant collections of OCP API information (e.g. 'oc get' + commands), this profile will attempt to enable the openshift plugin on only + a single master node. If none of the master nodes have a functional + 'oc' binary available, *and* the --no-local option is used, that means that + no API data will be collected. + """ cluster_name = 'OpenShift Container Platform v4' packages = ('openshift-hyperkube', 'openshift-clients') diff --git a/sos/collector/clusters/ovirt.py b/sos/collector/clusters/ovirt.py index bd2d0c74..4587a1ca 100644 --- a/sos/collector/clusters/ovirt.py +++ b/sos/collector/clusters/ovirt.py @@ -17,6 +17,33 @@ ENGINE_KEY = '/etc/pki/ovirt-engine/keys/engine_id_rsa' class ovirt(Cluster): + """ + This cluster profile is for the oVirt/RHV project which provides for a + virtualization cluster built on top of KVM. + + Nodes enumerated will be hypervisors within the environment, not virtual + machines running on those hypervisors. By default, ALL hypervisors within + the environment are returned. This may be influenced by the 'cluster' and + 'datacenter' cluster options, which will limit enumeration to hypervisors + within the specific cluster and/or datacenter. The spm-only cluster option + may also be used to only collect from hypervisors currently holding the + SPM role. + + Optionally, to only collect an archive from the manager and the postgresql + database, use the no-hypervisors cluster option. + + By default, a second archive will be collected from the manager that + contains only the postgresql plugin, configured to produce a dump of the + manager's database which can be explored and restored on other systems. + + The ovirt profile focuses on the upstream, community ovirt project. + + The rhv profile is for Red Hat customers running RHV (formerly RHEV). + + The rhhi_virt profile is for Red Hat customers running RHV in a + hyper-converged setup and enables gluster collections. + """ cluster_name = 'Community oVirt' packages = ('ovirt-engine',) diff --git a/sos/collector/clusters/satellite.py b/sos/collector/clusters/satellite.py index 7c21e553..5e28531d 100644 --- a/sos/collector/clusters/satellite.py +++ b/sos/collector/clusters/satellite.py @@ -13,7 +13,14 @@ from sos.collector.clusters import Cluster class satellite(Cluster): - """Red Hat Satellite 6""" + """ + This profile is specifically for Red Hat Satellite 6, and not earlier + releases of Satellite. + + While not technically a 'cluster' in the traditional sense, Satellite + does provide for 'capsule' nodes, which are what this profile aims to + enumerate beyond the 'primary' Satellite system.
+ """ cluster_name = 'Red Hat Satellite 6' packages = ('satellite', 'satellite-installer') diff --git a/sos/collector/transports/control_persist.py b/sos/collector/transports/control_persist.py index 3e848b41..ae6c7e43 100644 --- a/sos/collector/transports/control_persist.py +++ b/sos/collector/transports/control_persist.py @@ -26,10 +26,18 @@ from sos.utilities import sos_get_command_output class SSHControlPersist(RemoteTransport): - """A transport for collect that leverages OpenSSH's Control Persist + """ + A transport for collect that leverages OpenSSH's ControlPersist functionality which uses control sockets to transparently keep a connection open to the remote host without needing to rebuild the SSH connection for - each and every command executed on the node + each and every command executed on the node. + + This transport will by default assume the use of SSH keys, meaning keys + have already been distributed to target nodes. If this is not the case, + users will need to provide a password using the --password or + --password-per-node option, depending on if the password to connect to all + nodes is the same or not. Note that these options prevent the use of the + --batch option, as they require user input. """ name = 'control_persist' diff --git a/sos/collector/transports/local.py b/sos/collector/transports/local.py index 2996d524..dd72053c 100644 --- a/sos/collector/transports/local.py +++ b/sos/collector/transports/local.py @@ -15,9 +15,10 @@ from sos.collector.transports import RemoteTransport class LocalTransport(RemoteTransport): - """A 'transport' to represent a local node. This allows us to more easily - extend SoSNode() without having a ton of 'if local' or similar checks in - more places than we actually need them + """ + A 'transport' to represent a local node. No remote connection is actually + made, and all commands set to be run by this transport are executed locally + without any wrappers. """ name = 'local_node' diff --git a/sos/collector/transports/oc.py b/sos/collector/transports/oc.py index 720dd61d..0fc9eee8 100644 --- a/sos/collector/transports/oc.py +++ b/sos/collector/transports/oc.py @@ -18,15 +18,29 @@ from sos.utilities import (is_executable, sos_get_command_output, class OCTransport(RemoteTransport): - """This transport leverages the execution of commands via a locally + """ + This transport leverages the execution of commands via a locally available and configured ``oc`` binary for OCPv4 environments. + The location of the oc binary MUST be in the $PATH used by the locally + loaded SoS policy. Specifically this means that the binary cannot be in the + running user's home directory, such as ~/.local/bin. + OCPv4 clusters generally discourage the use of SSH, so this transport may be used to remove our use of SSH in favor of the environment provided method of connecting to nodes and executing commands via debug pods. - Note that this approach will generate multiple debug pods over the course - of our execution + The debug pod created will be a privileged pod that mounts the host's + filesystem internally so that sos report collections reflect the host, and + not the container in which it runs. + + This transport will execute within a temporary 'sos-collect-tmp' project + created by the OCP cluster profile. The project will be removed at the end + of execution. + + In the event of failures due to a misbehaving OCP API or oc binary, it is + recommended to fallback to the control_persist transport by manually + setting the --transport option. 
""" name = 'oc' -- 2.27.0 From ce289a3ae7101a898efdb84ddfd575576ba5819b Mon Sep 17 00:00:00 2001 From: Jake Hunsaker Date: Tue, 5 Apr 2022 11:32:11 -0400 Subject: [PATCH] [ocp, openshift] Re-align API collection options and rename option Previously, in #2888, the `openshift` plugin was extended to allow API collections by using a default-available kubeconfig file rather than relying on user-provided tokens. This also included flipping the default value of the `no-oc` plugin option to `True` (meaning do not collect API output by default). This worked for the plugin, but it introduced a gap in `sos collect` whereby the cluster profile could no longer reliably enable API collections when trying to leverage the new functionality of not requiring a user token. Fix this by updating the cluster profile to align with the new default-off approach of API collections. Along with this, add a toggle to the cluster profile directly to allow users to toggle API collections on or off (default off) directly. This is done via a new `with-api` cluster option (e.g. `-c ocp.with-api`). Further, rename the `openshift` plugin option from `no-oc` to `with-api`. This change not only makes the option use case far more obvious, it will also align the use of the option to both `collect` and `report` so that users need only be aware of a single option for either method. The cluster profile also has logic to detect which plugin option, `no-oc` or `with-api` to use based on the (RHEL) sos version installed on the nodes being inspected by the `ocp` cluster profile. Signed-off-by: Jake Hunsaker --- sos/collector/clusters/ocp.py | 72 +++++++++++++++++++++++++++------ sos/report/plugins/openshift.py | 26 +++++++----- 2 files changed, 77 insertions(+), 21 deletions(-) diff --git a/sos/collector/clusters/ocp.py b/sos/collector/clusters/ocp.py index 9beb2f9b..e31d1903 100644 --- a/sos/collector/clusters/ocp.py +++ b/sos/collector/clusters/ocp.py @@ -30,7 +30,11 @@ class ocp(Cluster): clusterAdmin privileges. If this requires the use of a secondary configuration file, specify that - path with the 'kubeconfig' cluster option. + path with the 'kubeconfig' cluster option. This config file will also be + used on a single master node to perform API collections if the `with-api` + option is enabled (default disabled). If no `kubeconfig` option is given, + but `with-api` is enabled, the cluster profile will attempt to use a + well-known default kubeconfig file if it is available on the host. Alternatively, provide a clusterAdmin access token either via the 'token' cluster option or, preferably, the SOSOCPTOKEN environment variable. @@ -45,7 +49,7 @@ class ocp(Cluster): option mentioned above. To avoid redundant collections of OCP API information (e.g. 'oc get' - commands), this profile will attempt to enable the openshift plugin on only + commands), this profile will attempt to enable the API collections on only a single master node. If the none of the master nodes have a functional 'oc' binary available, *and* the --no-local option is used, that means that no API data will be collected. 
@@ -63,7 +67,8 @@ class ocp(Cluster): ('label', '', 'Colon delimited list of labels to select nodes with'), ('role', 'master', 'Colon delimited list of roles to filter on'), ('kubeconfig', '', 'Path to the kubeconfig file'), - ('token', '', 'Service account token to use for oc authorization') + ('token', '', 'Service account token to use for oc authorization'), + ('with-api', False, 'Collect OCP API data from a master node') ] def fmt_oc_cmd(self, cmd): @@ -219,13 +219,52 @@ return False return 'master' in self.node_dict[sosnode.address]['roles'] + def _toggle_api_opt(self, node, use_api): + """In earlier versions of sos, the openshift plugin option that is + used to toggle the API collections was called `no-oc` rather than + `with-api`. This older plugin option had the inverse logic of the + current `with-api` option. + + Use this to toggle the correct plugin option given the node's sos + version. Note that the use of version 4.2 here is tied to the RHEL + release (the only usecase for this cluster profile) rather than + the upstream version given the backports for that downstream. + + :param node: The node being inspected for API collections + :type node: ``SoSNode`` + + :param use_api: Should this node enable API collections? + :type use_api: ``bool`` + """ + if node.check_sos_version('4.2-16'): + _opt = 'with-api' + _val = 'on' if use_api else 'off' + else: + _opt = 'no-oc' + _val = 'off' if use_api else 'on' + node.plugopts.append("openshift.%s=%s" % (_opt, _val)) + def set_master_options(self, node): + node.enable_plugins.append('openshift') + if not self.get_option('with-api'): + self._toggle_api_opt(node, False) + return if self.api_collect_enabled: # a primary has already been enabled for API collection, disable # it among others - node.plugopts.append('openshift.no-oc=on') + self._toggle_api_opt(node, False) else: + # running in a container, so reference the /host mount point + master_kube = ( + '/host/etc/kubernetes/static-pod-resources/' + 'kube-apiserver-certs/secrets/node-kubeconfigs/' + 'localhost.kubeconfig' + ) + _optconfig = self.get_option('kubeconfig') + if _optconfig and not _optconfig.startswith('/host'): + _optconfig = '/host/' + _optconfig + _kubeconfig = _optconfig or master_kube _oc_cmd = 'oc' if node.host.containerized: _oc_cmd = '/host/bin/oc' @@ -244,17 +288,21 @@ class ocp(Cluster): need_root=True) if can_oc['status'] == 0: # the primary node can already access the API + self._toggle_api_opt(node, True) self.api_collect_enabled = True elif self.token: node.sos_env_vars['SOSOCPTOKEN'] = self.token + self._toggle_api_opt(node, True) + self.api_collect_enabled = True + elif node.file_exists(_kubeconfig): + # if the file exists, then the openshift sos plugin will use it + # if the with-api option is turned on + if not _kubeconfig == master_kube: + node.plugopts.append( + "openshift.kubeconfig=%s" % _kubeconfig + ) + self._toggle_api_opt(node, True) self.api_collect_enabled = True - elif self.get_option('kubeconfig'): - kc = self.get_option('kubeconfig') - if node.file_exists(kc): - if node.host.containerized: - kc = "/host/%s" % kc - node.sos_env_vars['KUBECONFIG'] = kc - self.api_collect_enabled = True if self.api_collect_enabled: msg = ("API collections will be performed on %s\nNote: API " "collections may extend runtime by 10s of minutes\n" @@ -264,6 +312,6 @@ class ocp(Cluster): def set_node_options(self, node): # don't attempt OC API collections on non-primary nodes - node.plugopts.append('openshift.no-oc=on') + self._toggle_api_opt(node, False) # vim: set et 
ts=4 sw=4 : diff --git a/sos/report/plugins/openshift.py b/sos/report/plugins/openshift.py index d643f04c..a41ab62b 100644 --- a/sos/report/plugins/openshift.py +++ b/sos/report/plugins/openshift.py @@ -19,7 +19,10 @@ class Openshift(Plugin, RedHatPlugin): further extending the kubernetes plugin (or the OCP 3.x extensions included in the Red Hat version of the kube plugin). - By default, this plugin will collect cluster information and inspect the + This plugin may collect OCP API information when the `with-api` option is + enabled. This option is disabled by default. + + When enabled, this plugin will collect cluster information and inspect the default namespaces/projects that are created during deployment - i.e. the namespaces of the cluster projects matching openshift.* and kube.*. At the time of this plugin's creation that number of default projects is already @@ -34,16 +37,20 @@ class Openshift(Plugin, RedHatPlugin): Users will need to either: - 1) Provide the bearer token via the `-k openshift.token` option - 2) Provide the bearer token via the `SOSOCPTOKEN` environment variable - 3) Otherwise ensure that the root user can successfully run `oc` and + 1) Accept the use of a well-known stock kubeconfig file provided via a + static pod resource for the kube-apiserver + 2) Provide the bearer token via the `-k openshift.token` option + 3) Provide the bearer token via the `SOSOCPTOKEN` environment variable + 4) Otherwise ensure that the root user can successfully run `oc` and get proper output prior to running this plugin - It is highly suggested that option #2 be used first, as this will prevent - the token from being recorded in output saved to the archive. Option #1 may + It is highly suggested that option #1 be used first, as this uses well + known configurations and requires the least information from the user. If + using a token, it is recommended to use option #3 as this will prevent + the token from being recorded in output saved to the archive. Option #2 may be used if this is considered an acceptable risk. It is not recommended to - rely on option #3, though it will provide the functionality needed. + rely on option #4, though it will provide the functionality needed. """ short_desc = 'Openshift Container Platform 4.x' @@ -71,6 +71,7 @@ 'fast', None), ('host', 'host address to use for oc login, including port', 'fast', 'https://localhost:6443'), + ('with-api', 'collect output from the OCP API', 'fast', False), ('no-oc', 'do not collect `oc` command output', 'fast', True), ('podlogs', 'collect logs from each pod', 'fast', True), ('podlogs-filter', ('limit podlogs collection to pods matching this ' @@ -212,7 +220,7 @@ class Openshift(Plugin, RedHatPlugin): self.add_copy_spec('/etc/kubernetes/*') # see if we run `oc` commands - if not self.get_option('no-oc'): + if self.get_option('with-api'): can_run_oc = self._check_oc_logged_in() else: can_run_oc = False -- 2.27.0 From 2b2ad04884cfdda78c02a33a73603d249c0a4dd5 Mon Sep 17 00:00:00 2001 From: Jake Hunsaker Date: Tue, 19 Oct 2021 11:51:56 -0400 Subject: [PATCH 1/2] [Plugin] Allow writing command output directly to disk This commit addresses a long standing ask in sos, regarding resource consumption when sos is run with `--all-logs`. For executions where `--all-logs` is used, or for specific commands where `sizelimit` has been set to 0, `add_cmd_output()` and `add_journal()` will now instruct `sos_get_command_output()` to write output directly to a file rather than saving the output in memory. 
When this occurs, the `output` key in the returned dict will be empty. Note that this does not extend to `collect_cmd_output()` or `exec_cmd()`. Resolves: #1506 Signed-off-by: Jake Hunsaker --- sos/archive.py | 14 +++---- sos/report/__init__.py | 10 ++++- sos/report/plugins/__init__.py | 70 +++++++++++++++++++++++----------- sos/utilities.py | 69 ++++++++++++++++++++++++++++----- 4 files changed, 123 insertions(+), 40 deletions(-) diff --git a/sos/archive.py b/sos/archive.py index e3c68b77..e92d5d60 100644 --- a/sos/archive.py +++ b/sos/archive.py @@ -251,7 +251,7 @@ class FileCacheArchive(Archive): return dest - def _check_path(self, src, path_type, dest=None, force=False): + def check_path(self, src, path_type, dest=None, force=False): """Check a new destination path in the archive. Since it is possible for multiple plugins to collect the same @@ -345,7 +345,7 @@ class FileCacheArchive(Archive): if not dest: dest = src - dest = self._check_path(dest, P_FILE) + dest = self.check_path(dest, P_FILE) if not dest: return @@ -384,7 +384,7 @@ class FileCacheArchive(Archive): # over any existing content in the archive, since it is used by # the Plugin postprocessing hooks to perform regex substitution # on file content. - dest = self._check_path(dest, P_FILE, force=True) + dest = self.check_path(dest, P_FILE, force=True) f = codecs.open(dest, mode, encoding='utf-8') if isinstance(content, bytes): @@ -397,7 +397,7 @@ class FileCacheArchive(Archive): def add_binary(self, content, dest): with self._path_lock: - dest = self._check_path(dest, P_FILE) + dest = self.check_path(dest, P_FILE) if not dest: return @@ -409,7 +409,7 @@ class FileCacheArchive(Archive): def add_link(self, source, link_name): self.log_debug("adding symlink at '%s' -> '%s'" % (link_name, source)) with self._path_lock: - dest = self._check_path(link_name, P_LINK) + dest = self.check_path(link_name, P_LINK) if not dest: return @@ -484,10 +484,10 @@ class FileCacheArchive(Archive): """ # Establish path structure with self._path_lock: - self._check_path(path, P_DIR) + self.check_path(path, P_DIR) def add_node(self, path, mode, device): - dest = self._check_path(path, P_NODE) + dest = self.check_path(path, P_NODE) if not dest: return diff --git a/sos/report/__init__.py b/sos/report/__init__.py index 249ff119..46c7f219 100644 --- a/sos/report/__init__.py +++ b/sos/report/__init__.py @@ -476,7 +476,8 @@ class SoSReport(SoSComponent): 'verbosity': self.opts.verbosity, 'cmdlineopts': self.opts, 'devices': self.devices, - 'namespaces': self.namespaces + 'namespaces': self.namespaces, + 'tempfile_util': self.tempfile_util } def get_temp_file(self): @@ -1075,7 +1076,12 @@ class SoSReport(SoSComponent): _plug.manifest.add_field('end_time', end) _plug.manifest.add_field('run_time', end - start) except TimeoutError: - self.ui_log.error("\n Plugin %s timed out\n" % plugin[1]) + msg = "Plugin %s timed out" % plugin[1] + # log to ui_log.error to show the user, log to soslog.info + # so that someone investigating the sos execution has it all + # in one place, but without double notifying the user.
+ self.ui_log.error("\n %s\n" % msg) + self.soslog.info(msg) self.running_plugs.remove(plugin[1]) self.loaded_plugins[plugin[0]-1][1].set_timeout_hit() pool.shutdown(wait=True) diff --git a/sos/report/plugins/__init__.py b/sos/report/plugins/__init__.py index 3ff7c191..38529a9d 100644 --- a/sos/report/plugins/__init__.py +++ b/sos/report/plugins/__init__.py @@ -15,6 +15,7 @@ from sos.utilities import (sos_get_command_output, import_module, grep, path_exists, path_isdir, path_isfile, path_islink, listdir, path_join) +from sos.archive import P_FILE import os import glob import re @@ -1686,6 +1687,8 @@ class Plugin(): kwargs['priority'] = 10 if 'changes' not in kwargs: kwargs['changes'] = False + if self.get_option('all_logs') or kwargs['sizelimit'] == 0: + kwargs['to_file'] = True soscmd = SoSCommand(**kwargs) self._log_debug("packed command: " + soscmd.__str__()) for _skip_cmd in self.skip_commands: @@ -1707,7 +1710,8 @@ class Plugin(): chroot=True, runat=None, env=None, binary=False, sizelimit=None, pred=None, subdir=None, changes=False, foreground=False, tags=[], - priority=10, cmd_as_tag=False, container=None): + priority=10, cmd_as_tag=False, container=None, + to_file=False): """Run a program or a list of programs and collect the output Output will be limited to `sizelimit`, collecting the last X amount @@ -1776,6 +1780,10 @@ class Plugin(): :param container: Run the specified `cmds` inside a container with this ID or name :type container: ``str`` + + :param to_file: Should command output be written directly to a new + file rather than stored in memory? + :type to_file: ``bool`` """ if isinstance(cmds, str): cmds = [cmds] @@ -1863,7 +1863,7 @@ pred=pred, subdir=subdir, tags=tags, changes=changes, foreground=foreground, priority=priority, cmd_as_tag=cmd_as_tag, - container_cmd=container_cmd) + to_file=to_file, container_cmd=container_cmd) def add_cmd_tags(self, tagdict): """Retroactively add tags to any commands that have been run by this @@ -2014,7 +2014,7 @@ stderr=True, chroot=True, runat=None, env=None, binary=False, sizelimit=None, subdir=None, changes=False, foreground=False, tags=[], - priority=10, cmd_as_tag=False, + priority=10, cmd_as_tag=False, to_file=False, container_cmd=False): """Execute a command and save the output to a file for inclusion in the report. 
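At the plugin level nothing new needs to be passed for the common cases; a sketch of what now triggers the direct-to-disk path (hypothetical plugin code):

    # sizelimit=0, or running sos with --all-logs, now implies to_file=True,
    # so the output streams to the archive instead of being held in memory
    self.add_cmd_output("journalctl --no-pager", sizelimit=0)
    # default-sized collections keep the existing in-memory path
    self.add_cmd_output("ps auxww")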
@@ -2041,6 +2041,8 @@ TTY :param tags: Add tags in the archive manifest :param cmd_as_tag: Format command string to tag + :param to_file: Write output directly to file instead + of saving in memory :returns: dict containing status, output, and filename in the archive for the executed cmd @@ -2074,27 +2074,46 @@ else: root = None + if suggest_filename: + outfn = self._make_command_filename(suggest_filename, subdir) + else: + outfn = self._make_command_filename(cmd, subdir) + + outfn_strip = outfn[len(self.commons['cmddir'])+1:] + + if to_file: + self._log_debug("collecting '%s' output directly to disk" + % cmd) + self.archive.check_path(outfn, P_FILE) + out_file = os.path.join(self.archive.get_archive_path(), outfn) + else: + out_file = False + start = time() result = sos_get_command_output( cmd, timeout=timeout, stderr=stderr, chroot=root, chdir=runat, env=_env, binary=binary, sizelimit=sizelimit, - poller=self.check_timeout, foreground=foreground + poller=self.check_timeout, foreground=foreground, + to_file=out_file ) end = time() run_time = end - start if result['status'] == 124: - self._log_warn( - "command '%s' timed out after %ds" % (cmd, timeout) - ) + warn = "command '%s' timed out after %ds" % (cmd, timeout) + self._log_warn(warn) + if to_file: + msg = (" - output up until the timeout may be available at " + "%s" % outfn) + self._log_debug("%s%s" % (warn, msg)) manifest_cmd = { 'command': cmd.split(' ')[0], 'parameters': cmd.split(' ')[1:], 'exec': cmd, - 'filepath': None, + 'filepath': outfn if to_file else None, 'truncated': result['truncated'], 'return_code': result['status'], 'priority': priority, @@ -2060,7 +2090,7 @@ class Plugin(): result = sos_get_command_output( cmd, timeout=timeout, chroot=False, chdir=runat, env=env, binary=binary, sizelimit=sizelimit, - poller=self.check_timeout + poller=self.check_timeout, to_file=out_file ) run_time = time() - start self._log_debug("could not run '%s': command not found" % cmd) @@ -2077,22 +2107,15 @@ class Plugin(): if result['truncated']: self._log_info("collected output of '%s' was truncated" % cmd.split()[0]) - - if suggest_filename: - outfn = self._make_command_filename(suggest_filename, subdir) - else: - outfn = self._make_command_filename(cmd, subdir) - - outfn_strip = outfn[len(self.commons['cmddir'])+1:] - - if result['truncated']: linkfn = outfn outfn = outfn.replace('sos_commands', 'sos_strings') + '.tailed' - if binary: - self.archive.add_binary(result['output'], outfn) - else: - self.archive.add_string(result['output'], outfn) + if not to_file: + if binary: + self.archive.add_binary(result['output'], outfn) + else: + self.archive.add_string(result['output'], outfn) + if result['truncated']: # we need to manually build the relative path from the paths that # exist within the build dir to properly drop these symlinks @@ -2543,6 +2566,9 @@ class Plugin(): all_logs = self.get_option("all_logs") log_size = sizelimit or self.get_option("log_size") log_size = max(log_size, journal_size) if not all_logs else 0 + if sizelimit == 0: + # allow for specific sizelimit overrides in plugins + log_size = 0 if isinstance(units, str): units = [units] diff --git a/sos/utilities.py b/sos/utilities.py index 70d5c0ab..f422d7a0 100644 --- a/sos/utilities.py +++ b/sos/utilities.py @@ -110,7 +110,8 @@ def is_executable(command, sysroot=None): def sos_get_command_output(command, timeout=TIMEOUT_DEFAULT, stderr=False, chroot=None, chdir=None, env=None, foreground=False, - binary=False, sizelimit=None, poller=None): + binary=False, sizelimit=None, 
poller=None, + to_file=False): """Execute a command and return a dictionary of status and output, optionally changing root or current working directory before executing command. @@ -124,6 +125,12 @@ def sos_get_command_output(command, timeout=TIMEOUT_DEFAULT, stderr=False, if (chdir): os.chdir(chdir) + def _check_poller(proc): + if poller(): + proc.terminate() + raise SoSTimeoutError + time.sleep(0.01) + cmd_env = os.environ.copy() # ensure consistent locale for collected command output cmd_env['LC_ALL'] = 'C' @@ -158,24 +158,46 @@ expanded_args.extend(expanded_arg) else: expanded_args.append(arg) + if to_file: + _output = open(to_file, 'w') + else: + _output = PIPE try: - p = Popen(expanded_args, shell=False, stdout=PIPE, + p = Popen(expanded_args, shell=False, stdout=_output, stderr=STDOUT if stderr else PIPE, bufsize=-1, env=cmd_env, close_fds=True, preexec_fn=_child_prep_fn) - reader = AsyncReader(p.stdout, sizelimit, binary) + if not to_file: + reader = AsyncReader(p.stdout, sizelimit, binary) + else: + reader = FakeReader(p, binary) + if poller: while reader.running: - if poller(): - p.terminate() - raise SoSTimeoutError - time.sleep(0.01) - stdout = reader.get_contents() - truncated = reader.is_full + _check_poller(p) + else: + try: + # override timeout=0 to timeout=None, as Popen will treat the + # former as a literal 0-second timeout + p.wait(timeout if timeout else None) + except Exception: + p.terminate() + _output.close() + # until we separate timeouts from the `timeout` command + # handle per-cmd timeouts via Plugin status checks + return {'status': 124, 'output': reader.get_contents(), + 'truncated': reader.is_full} + if to_file: + _output.close() + + # wait for Popen to set the returncode while p.poll() is None: pass + stdout = reader.get_contents() + truncated = reader.is_full + except OSError as e: if e.errno == errno.ENOENT: return {'status': 127, 'output': "", 'truncated': ''} @@ -256,6 +285,28 @@ def path_join(path, *p, sysroot=os.sep): return os.path.join(path, *p) +class FakeReader(): + """Used as a replacement AsyncReader for when we are writing directly to + disk, and allows us to keep more simplified flows for executing, + monitoring, and collecting command output. + """ + + def __init__(self, process, binary): + self.process = process + self.binary = binary + + @property + def is_full(self): + return False + + def get_contents(self): + return '' if not self.binary else b'' + + @property + def running(self): + return self.process.poll() is None + + class AsyncReader(threading.Thread): """Used to limit command output to a given size without deadlocking sos. -- 2.27.0
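Taken together, a minimal sketch of the new direct-to-disk path at the utilities level (command and destination path are hypothetical):

    from sos.utilities import sos_get_command_output

    res = sos_get_command_output(
        "journalctl --no-pager", timeout=300,
        to_file="/var/tmp/demo/journal.log"
    )
    # output was streamed to the file, so the in-memory copy is empty and
    # the FakeReader stand-in never reports truncation:
    # res['output'] == '' and res['truncated'] is False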