diff --git a/1796.patch b/1796.patch new file mode 100644 index 00000000..30d9a5c4 --- /dev/null +++ b/1796.patch @@ -0,0 +1,60 @@ +From c8fe99b1aa5a9a9b941b7515cda367d24829dedf Mon Sep 17 00:00:00 2001 +From: Adam Williamson +Date: Nov 19 2024 08:22:09 +0000 +Subject: pkgset: optimize cache check (saves 20 minutes) + + +The pkgset phase takes around 35 minutes in current composes. +Around 20 minutes of that is spent creating these per-arch +subsets of the global package set. In a rather roundabout way +(see #1794 ), I figured out that almost all of this time is +spent in this cache check, which is broken for a subtle reason. + +Python's `in` keyword works by first attempting to call the +container's magic `__contains__` method. If the container does +not implement `__contains__`, it falls back to iteration - it +tries to iterate over the container until it either hits what +it's looking for, or runs out. (If the container implements +neither, you get an error). + +The FileCache instance's `file_cache` is a plain Python dict. +dicts have a very efficient `__contains__` implementation, so +doing `foo in (somedict)` is basically always very fast no matter +how huge the dict is. FileCache itself, though, implements +`__iter__` by returning an iterator over the `file_cache` dict's +keys, but it does *not* implement `__contains__`. So when we do +`foo in self.file_cache`, Python has to iterate over every key +in the dict until it hits foo or runs out. This is massively +slower than `foo in self.file_cache.file_cache`, which uses the +efficient `__contains__` method. + +Because these package sets are so huge, and we're looping over +*one* huge set and checking each package from it against the cache +of another, increasingly huge, set, this effect becomes massive. +To make it even worse, I ran a few tests where I added a debug log +if we ever hit the cache, and it looks like we never actually do - +so every check has to iterate through the entire dict. + +We could probably remove this entirely, but changing it to check +the dict instead of the FileCache instance makes it just about as +fast as taking it out, so I figured let's go with that in case +there's some unusual scenario in which the cache does work here. + +Signed-off-by: Adam Williamson + +--- + +diff --git a/pungi/phases/pkgset/pkgsets.py b/pungi/phases/pkgset/pkgsets.py +index 81c090c..ee12c3c 100644 +--- a/pungi/phases/pkgset/pkgsets.py ++++ b/pungi/phases/pkgset/pkgsets.py +@@ -265,7 +265,7 @@ class PackageSetBase(kobo.log.LoggingBase): + for arch in arch_list: + self.rpms_by_arch.setdefault(arch, []) + for i in other.rpms_by_arch.get(arch, []): +- if i.file_path in self.file_cache: ++ if i.file_path in self.file_cache.file_cache: + # TODO: test if it really works + continue + if inherit_to_noarch and exclusivearch_list and arch == "noarch": + diff --git a/pungi.spec b/pungi.spec index d5fd73f1..016e5db3 100644 --- a/pungi.spec +++ b/pungi.spec @@ -13,6 +13,7 @@ Patch: https://pagure.io/pungi/pull-request/1782.patch Patch: https://pagure.io/pungi/pull-request/1788.patch Patch: https://pagure.io/pungi/pull-request/1789.patch Patch: https://pagure.io/pungi/pull-request/1790.patch +Patch: https://pagure.io/pungi/pull-request/1796.patch BuildRequires: make BuildRequires: python3-pytest @@ -179,6 +180,9 @@ rm %{buildroot}%{_bindir}/pungi %{_bindir}/%{name}-cache-cleanup %changelog +* Tue Nov 19 2024 Adam Williamson - 4.7.0-7 +- Backport #1796 to speed up compose some more + * Thu Nov 14 2024 Adam Williamson - 4.7.0-6 - Rebuild with no changes to bump past release used in infra tag