From ab1b5b48eca8c58966ac244d97829bae1386b94d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lubom=C3=ADr=20Sedl=C3=A1=C5=99?= Date: Fri, 9 Apr 2021 13:53:31 +0200 Subject: [PATCH] hybrid: Optimize getting lookaside packages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original code ended up downloading all repodata from the lookaside repo. This could cause a lot of memory to be used. The new code only downloads the repomd.xml and then primary record, which is sufficient to obtain all needed information. A lot less memory is used and the code is also significantly faster. Here are some alternative ways of getting a list of packages from the lookaside repo and reasons why they did not work: * dnf repoquery - this doesn't include modular packages unless the stream is default * dnf reposync - requires `--urls` option to only print the names, which is not available on RHEL 7 JIRA: RHELCMP-4761 Signed-off-by: Lubomír Sedlář --- pungi/phases/gather/methods/method_hybrid.py | 42 +++++++++++++------- tests/test_gather_method_hybrid.py | 30 +++----------- 2 files changed, 33 insertions(+), 39 deletions(-) diff --git a/pungi/phases/gather/methods/method_hybrid.py b/pungi/phases/gather/methods/method_hybrid.py index 3c673c0f..3aa60cbd 100644 --- a/pungi/phases/gather/methods/method_hybrid.py +++ b/pungi/phases/gather/methods/method_hybrid.py @@ -499,6 +499,27 @@ def _make_result(paths): return [{"path": path, "flags": []} for path in sorted(paths)] +def get_repo_packages(path): + """Extract file names of all packages in the given repository.""" + + packages = set() + + def callback(pkg): + packages.add(os.path.basename(pkg.location_href)) + + repomd = os.path.join(path, "repodata/repomd.xml") + with as_local_file(repomd) as url_: + repomd = cr.Repomd(url_) + for rec in repomd.records: + if rec.type != "primary": + continue + record_url = os.path.join(path, rec.location_href) + with as_local_file(record_url) as url_: + cr.xml_parse_primary(url_, pkgcb=callback, do_files=False) + + return packages + + def expand_packages(nevra_to_pkg, lookasides, nvrs, filter_packages): """For each package add source RPM.""" # This will serve as the final result. We collect sets of paths to the @@ -509,25 +530,16 @@ def expand_packages(nevra_to_pkg, lookasides, nvrs, filter_packages): filters = set(filter_packages) - # Collect list of all packages in lookaside. These will not be added to the - # result. Fus handles this in part: if a package is explicitly mentioned as - # input (which can happen with comps group expansion), it will be in the - # output even if it's in lookaside. lookaside_packages = set() for repo in lookasides: - md = cr.Metadata() - md.locate_and_load_xml(repo) - for key in md.keys(): - pkg = md.get(key) - url = os.path.join(pkg.location_base or repo, pkg.location_href) - # Strip file:// prefix - lookaside_packages.add(url[7:]) + lookaside_packages.update(get_repo_packages(repo)) for nvr, pkg_arch, flags in nvrs: pkg = nevra_to_pkg["%s.%s" % (nvr, pkg_arch)] - if pkg.file_path in lookaside_packages: - # Package is in lookaside, don't add it and ignore sources and - # debuginfo too. + if os.path.basename(pkg.file_path) in lookaside_packages: + # Fus can return lookaside package in output if the package is + # explicitly listed as input. This can happen during comps + # expansion. continue if pkg_is_debug(pkg): debuginfo.add(pkg.file_path) @@ -540,7 +552,7 @@ def expand_packages(nevra_to_pkg, lookasides, nvrs, filter_packages): if (srpm.name, "src") in filters: # Filtered package, skipping continue - if srpm.file_path not in lookaside_packages: + if os.path.basename(srpm.file_path) not in lookaside_packages: srpms.add(srpm.file_path) except KeyError: # Didn't find source RPM.. this should be logged diff --git a/tests/test_gather_method_hybrid.py b/tests/test_gather_method_hybrid.py index b4e24a0d..bc0a283b 100644 --- a/tests/test_gather_method_hybrid.py +++ b/tests/test_gather_method_hybrid.py @@ -934,20 +934,11 @@ class TestExpandPackages(helpers.PungiTestCase): }, ) - @mock.patch("pungi.phases.gather.methods.method_hybrid.cr") - def test_skip_lookaside_source(self, cr): + @mock.patch("pungi.phases.gather.methods.method_hybrid.get_repo_packages") + def test_skip_lookaside_source(self, get_repo_packages): nevra_to_pkg = self._mk_packages(src=True) lookasides = [mock.Mock()] - repo = { - "abc": NamedMock( - name="pkg", - arch="src", - location_base="file:///tmp/", - location_href="pkg.src.rpm", - ), - } - cr.Metadata.return_value.keys.return_value = repo.keys() - cr.Metadata.return_value.get.side_effect = lambda key: repo[key] + get_repo_packages.return_value = ["pkg.src.rpm"] res = hybrid.expand_packages( nevra_to_pkg, lookasides, [("pkg-3:1-2", "x86_64", [])], [] @@ -962,20 +953,11 @@ class TestExpandPackages(helpers.PungiTestCase): }, ) - @mock.patch("pungi.phases.gather.methods.method_hybrid.cr") - def test_skip_lookaside_packages(self, cr): + @mock.patch("pungi.phases.gather.methods.method_hybrid.get_repo_packages") + def test_skip_lookaside_packages(self, get_repo_packages): nevra_to_pkg = self._mk_packages(debug_arch="x86_64") lookasides = [mock.Mock()] - repo = { - "abc": NamedMock( - name="pkg", - arch="x86_64", - location_base="file:///tmp/", - location_href="pkg.rpm", - ) - } - cr.Metadata.return_value.keys.return_value = repo.keys() - cr.Metadata.return_value.get.side_effect = lambda key: repo[key] + get_repo_packages.return_value = ["pkg.rpm"] res = hybrid.expand_packages( nevra_to_pkg, lookasides, [("pkg-3:1-2", "x86_64", [])], []