sos/SOURCES/sos-bz1930181-collect-clean...

From fc0218638f3e865c4315823e72aef2f46d012d07 Mon Sep 17 00:00:00 2001
From: Jake Hunsaker <jhunsake@redhat.com>
Date: Wed, 14 Apr 2021 11:55:03 -0400
Subject: [PATCH 1/2] [clean] Load maps from all archives before obfuscation
 loop

Previously, maps were being prepped via archives after extraction. This
reduced the amount of file IO being done, but made it so that necessary
obfuscations from later archives in a series would not be obfuscated in
the archives obfuscated before those later archives were extracted.

Fix this by extracting the map prep files into memory for each archive
to prep the maps before we enter the obfuscation loop entirely.

Closes: #2490
Related: RHBZ#1930181
Resolves: #2492

Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
---
 sos/cleaner/__init__.py                | 69 +++++++++++++++-----------
 sos/cleaner/parsers/username_parser.py | 13 +++--
 2 files changed, 45 insertions(+), 37 deletions(-)

diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
index b9eb61ef..d10cdc55 100644
--- a/sos/cleaner/__init__.py
+++ b/sos/cleaner/__init__.py
@@ -292,6 +292,7 @@ third party.

         # we have at least one valid target to obfuscate
         self.completed_reports = []
+        self.preload_all_archives_into_maps()
         self.obfuscate_report_paths()

         if not self.completed_reports:
@@ -473,6 +474,44 @@ third party.
             self.ui_log.info("Exiting on user cancel")
             os._exit(130)

+    def preload_all_archives_into_maps(self):
+        """Before doing the actual obfuscation, if we have multiple archives
+        to obfuscate then we need to preload each of them into the mappings
+        to ensure that node1 is obfuscated in node2 as well as node2 being
+        obfuscated in node1's archive.
+        """
+        self.log_info("Pre-loading multiple archives into obfuscation maps")
+        for _arc in self.report_paths:
+            is_dir = os.path.isdir(_arc)
+            if is_dir:
+                _arc_name = _arc
+            else:
+                archive = tarfile.open(_arc)
+                _arc_name = _arc.split('/')[-1].split('.tar')[0]
+            # for each parser, load the map_prep_file into memory, and then
+            # send that for obfuscation. We don't actually obfuscate the file
+            # here, do that in the normal archive loop
+            for _parser in self.parsers:
+                if not _parser.prep_map_file:
+                    continue
+                _arc_path = os.path.join(_arc_name, _parser.prep_map_file)
+                try:
+                    if is_dir:
+                        _pfile = open(_arc_path, 'r')
+                        content = _pfile.read()
+                    else:
+                        _pfile = archive.extractfile(_arc_path)
+                        content = _pfile.read().decode('utf-8')
+                    _pfile.close()
+                    if isinstance(_parser, SoSUsernameParser):
+                        _parser.load_usernames_into_map(content)
+                    for line in content.splitlines():
+                        if isinstance(_parser, SoSHostnameParser):
+                            _parser.load_hostname_into_map(line)
+                        self.obfuscate_line(line, _parser.prep_map_file)
+                except Exception as err:
+                    self.log_debug("Could not prep %s: %s" % (_arc_path, err))
+
     def obfuscate_report(self, report):
         """Individually handle each archive or directory we've discovered by
         running through each file therein.
@@ -493,7 +532,6 @@ third party.
             start_time = datetime.now()
             arc_md.add_field('start_time', start_time)
             archive.extract()
-            self.prep_maps_from_archive(archive)
             archive.report_msg("Beginning obfuscation...")

             file_list = archive.get_file_list()
@@ -542,35 +580,6 @@ third party.
             self.ui_log.info("Exception while processing %s: %s"
                              % (report, err))

-    def prep_maps_from_archive(self, archive):
-        """Open specific files from an archive and try to load those values
-        into our mappings before iterating through the entire archive.
-
-        Positional arguments:
-
-            :param archive SoSObfuscationArchive:   An open archive object
-        """
-        for parser in self.parsers:
-            if not parser.prep_map_file:
-                continue
-            prep_file = archive.get_file_path(parser.prep_map_file)
-            if not prep_file:
-                self.log_debug("Could not prepare %s: %s does not exist"
-                               % (parser.name, parser.prep_map_file),
-                               caller=archive.archive_name)
-                continue
-            # this is a bit clunky, but we need to load this particular
-            # parser in a different way due to how hostnames are validated for
-            # obfuscation
-            if isinstance(parser, SoSHostnameParser):
-                with open(prep_file, 'r') as host_file:
-                    hostname = host_file.readline().strip()
-                    parser.load_hostname_into_map(hostname)
-            if isinstance(parser, SoSUsernameParser):
-                parser.load_usernames_into_map(prep_file)
-            self.obfuscate_file(prep_file, parser.prep_map_file,
-                                archive.archive_name)
-
     def obfuscate_file(self, filename, short_name=None, arc_name=None):
         """Obfuscate and individual file, line by line.

diff --git a/sos/cleaner/parsers/username_parser.py b/sos/cleaner/parsers/username_parser.py
index 5223c018..2bb6c7f3 100644
--- a/sos/cleaner/parsers/username_parser.py
+++ b/sos/cleaner/parsers/username_parser.py
@@ -39,16 +39,15 @@ class SoSUsernameParser(SoSCleanerParser):
         super(SoSUsernameParser, self).__init__(conf_file)
         self.mapping.load_names_from_options(opt_names)

-    def load_usernames_into_map(self, fname):
+    def load_usernames_into_map(self, content):
         """Since we don't get the list of usernames from a straight regex for
         this parser, we need to override the initial parser prepping here.
         """
-        with open(fname, 'r') as lastfile:
-            for line in lastfile.read().splitlines()[1:]:
-                user = line.split()[0]
-                if user in self.skip_list:
-                    continue
-                self.mapping.get(user)
+        for line in content.splitlines()[1:]:
+            user = line.split()[0]
+            if user in self.skip_list:
+                continue
+            self.mapping.get(user)

     def parse_line(self, line):
         count = 0
--
2.26.3


From b713f458bfa92427147de754ea36054bfde53d71 Mon Sep 17 00:00:00 2001
From: Jake Hunsaker <jhunsake@redhat.com>
Date: Wed, 14 Apr 2021 12:22:28 -0400
Subject: [PATCH 2/2] [clean] Remove duplicate file skipping within
 obfuscate_line()

A redundant file skipping check was being executed within
`obfuscate_line()` that would cause subsequent archives being obfuscated
to skip line obfuscation within a file, despite iterating through the
entire file.

Remove this redundant check, thus allowing proper obfuscation.

Closes: #2490
Related: RHBZ#1930181
Resolves: #2492

Signed-off-by: Jake Hunsaker <jhunsake@redhat.com>
---
 sos/cleaner/__init__.py            | 11 +++--------
 sos/cleaner/obfuscation_archive.py |  2 --
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/sos/cleaner/__init__.py b/sos/cleaner/__init__.py
index d10cdc55..bdd24f95 100644
--- a/sos/cleaner/__init__.py
+++ b/sos/cleaner/__init__.py
@@ -508,7 +508,7 @@ third party.
                     for line in content.splitlines():
                         if isinstance(_parser, SoSHostnameParser):
                             _parser.load_hostname_into_map(line)
-                        self.obfuscate_line(line, _parser.prep_map_file)
+                        self.obfuscate_line(line)
                 except Exception as err:
                     self.log_debug("Could not prep %s: %s" % (_arc_path, err))

@@ -606,7 +606,7 @@ third party.
                 if not line.strip():
                     continue
                 try:
-                    line, count = self.obfuscate_line(line, short_name)
+                    line, count = self.obfuscate_line(line)
                     subs += count
                     tfile.write(line)
                 except Exception as err:
@@ -631,7 +631,7 @@ third party.
                 pass
         return string_data

-    def obfuscate_line(self, line, filename):
+    def obfuscate_line(self, line):
         """Run a line through each of the obfuscation parsers, keeping a
         cumulative total of substitutions done on that particular line.

@@ -639,16 +639,11 @@ third party.

             :param line str:        The raw line as read from the file being
                                     processed
-            :param filename str:    Filename the line was read from

         Returns the fully obfuscated line and the number of substitutions made
         """
         count = 0
         for parser in self.parsers:
-            if filename and any([
-                re.match(_s, filename) for _s in parser.skip_files
-            ]):
-                continue
             try:
                 line, _count = parser.parse_line(line)
                 count += _count
diff --git a/sos/cleaner/obfuscation_archive.py b/sos/cleaner/obfuscation_archive.py
index 84ca30cd..c64ab13b 100644
--- a/sos/cleaner/obfuscation_archive.py
+++ b/sos/cleaner/obfuscation_archive.py
@@ -219,8 +219,6 @@ class SoSObfuscationArchive():
             :param filename str:        Filename relative to the extracted
                                         archive root
         """
-        if filename in self.file_sub_list:
-            return True

         if not os.path.isfile(self.get_file_path(filename)):
             return True
--
2.26.3