Retry buildinstall tasks on losetup error

JIRA: RHELCMP-1394
Signed-off-by: Haibo Lin <hlin@redhat.com>
2020-07-20 17:11:06 +08:00 · 2020-07-20 17:11:06 +08:00 · 7e6bed9713
commit 7e6bed9713
parent f7167fa3b6
3 changed files with 78 additions and 7 deletions
--- a/pungi/phases/buildinstall.py
+++ b/pungi/phases/buildinstall.py
@@ -801,6 +801,10 @@ class BuildinstallThread(WorkerThread):
                weight=compose.conf["runroot_weights"].get("buildinstall"),
            )
        else:
+            try:
+                lorax_log_dir = _get_log_dir(compose, variant, arch)
+            except Exception:
+                lorax_log_dir = None
            runroot.run(
                cmd,
                log_file=log_file,
@@ -809,6 +813,7 @@ class BuildinstallThread(WorkerThread):
                mounts=[compose.topdir],
                weight=compose.conf["runroot_weights"].get("buildinstall"),
                chown_paths=chown_paths,
+                log_dir=lorax_log_dir,
            )

        if final_output_dir != output_dir:
--- a/pungi/runroot.py
+++ b/pungi/runroot.py
@@ -74,12 +74,38 @@ class Runroot(kobo.log.LoggingBase):
        run(command, show_cmd=True, logfile=log_file)
        self._result = True

+    def _has_losetup_error(self, log_dir):
+        """
+        Return True if program.log in log_dir contains the losetup error.
+
+        This error happens if the Koji builder runs out of loopback devices.
+        This can happen if too many tasks that require them are scheduled on
+        the same builder. A retried task might end up on a different builder,
+        or maybe some other task will have finished already.
+
+        :param str log_dir: path to buildinstall log dir,
+            e.g. logs/s390x/buildinstall-BaseOS-logs/
+        """
+        if not log_dir:  # None or empty: there is no log to inspect
+            return False
+
+        log_file = os.path.join(log_dir, "program.log")
+        try:
+            with open(log_file) as f:
+                for line in f:
+                    if "losetup: cannot find an unused loop device" in line:
+                        return True
+        except Exception:  # a missing/unreadable log is treated as no error
+            pass
+        return False
+
    def _run_koji(self, command, log_file=None, packages=None, arch=None, **kwargs):
        """
        Runs the runroot command in Koji.
        """
        runroot_channel = self.compose.conf.get("runroot_channel")
        runroot_tag = self.compose.conf["runroot_tag"]
+        log_dir = kwargs.pop("log_dir", None)

        koji_wrapper = kojiwrapper.KojiWrapper(self.compose.conf["koji_profile"])
        koji_cmd = koji_wrapper.get_runroot_cmd(
@@ -92,13 +118,19 @@ class Runroot(kobo.log.LoggingBase):
            **kwargs
        )

-        output = koji_wrapper.run_runroot_cmd(koji_cmd, log_file=log_file)
-        if output["retcode"] != 0:
-            raise RuntimeError(
-                "Runroot task failed: %s. See %s for more details."
-                % (output["task_id"], log_file)
-            )
-        self._result = output
+        attempt = 0
+        max_retries = 3
+        while True:
+            output = koji_wrapper.run_runroot_cmd(koji_cmd, log_file=log_file)
+            if output["retcode"] == 0:
+                self._result = output
+                return
+            elif attempt >= max_retries or not self._has_losetup_error(log_dir):
+                raise RuntimeError(
+                    "Runroot task failed: %s. See %s for more details."
+                    % (output["task_id"], log_file)
+                )
+            attempt += 1

    def _ssh_run(self, hostname, user, command, fmt_dict=None, log_file=None):
        """
--- a/tests/test_runroot.py
+++ b/tests/test_runroot.py
@@ -198,3 +198,37 @@ class TestRunrootOpenSSH(helpers.PungiTestCase):
                ),
            ]
        )
+
+
+class TestRunrootKoji(helpers.PungiTestCase):
+    def setUp(self):
+        super(TestRunrootKoji, self).setUp()
+        self.compose = helpers.DummyCompose(
+            self.topdir, {"runroot": True, "runroot_tag": "f28-build"},
+        )
+
+        self.runroot = Runroot(self.compose)
+
+    def test_has_losetup_error(self):
+        self.assertFalse(self.runroot._has_losetup_error(None))  # no log dir given
+
+        with mock.patch("pungi.runroot.open", mock.mock_open(read_data="")):
+            self.assertFalse(self.runroot._has_losetup_error("/foo_log_dir"))
+
+        with mock.patch(
+            "pungi.runroot.open",
+            mock.mock_open(read_data="losetup: cannot find an unused loop device"),
+        ):
+            self.assertTrue(self.runroot._has_losetup_error("/bar_log_dir"))
+
+    @mock.patch("pungi.runroot.kojiwrapper.KojiWrapper")
+    def test_run_koji_retry(self, mock_kojiwrapper):
+        self.compose.conf["koji_profile"] = "test"
+        mock_kojiwrapper.return_value.get_runroot_cmd.return_value = ["df -h"]
+        mock_kojiwrapper.return_value.run_runroot_cmd.side_effect = [
+            {"retcode": 1, "task_id": 1},  # first attempt fails
+            {"retcode": 0, "task_id": 2},  # second attempt succeeds
+        ]
+        self.runroot._has_losetup_error = mock.Mock(side_effect=[True, False])
+        self.runroot._run_koji("")  # must retry instead of raising
+        self.assertEqual(mock_kojiwrapper.return_value.run_runroot_cmd.call_count, 2)