keylime/SOURCES/0011-revocations-Try-to-send-notifications-on-shutdown.patch

From af9ac50f5acf1a7d4ad285956b60e60c3c4416b7 Mon Sep 17 00:00:00 2001
From: Anderson Toshiyuki Sasaki <ansasaki@redhat.com>
Date: Wed, 23 Jul 2025 15:39:49 +0200
Subject: [PATCH 11/13] revocations: Try to send notifications on shutdown

During verifier shutdown, try to send any pending revocation
notification in a best-effort manner.  In future, the pending revocation
notifications should be persisted to be processed during next startup.

Assisted-by: Claude 4 Sonnet
Signed-off-by: Anderson Toshiyuki Sasaki <ansasaki@redhat.com>
---
 keylime/cloud_verifier_tornado.py |   7 +
 keylime/revocation_notifier.py    | 239 ++++++++++++++++++++++--------
 2 files changed, 184 insertions(+), 62 deletions(-)

diff --git a/keylime/cloud_verifier_tornado.py b/keylime/cloud_verifier_tornado.py
index 7065661..89aa703 100644
--- a/keylime/cloud_verifier_tornado.py
+++ b/keylime/cloud_verifier_tornado.py
@@ -2109,6 +2109,10 @@ def main() -> None:
             # Stop server to not accept new incoming connections
             server.stop()

+            # Gracefully shutdown webhook workers to prevent connection errors
+            if "webhook" in revocation_notifier.get_notifiers():
+                revocation_notifier.shutdown_webhook_workers()
+
             # Wait for all connections to be closed and then stop ioloop
             async def stop() -> None:
                 await server.close_all_connections()
@@ -2136,6 +2140,9 @@ def main() -> None:
     def sig_handler(*_: Any) -> None:
         if run_revocation_notifier:
             revocation_notifier.stop_broker()
+        # Gracefully shutdown webhook workers to prevent connection errors
+        if "webhook" in revocation_notifier.get_notifiers():
+            revocation_notifier.shutdown_webhook_workers()
         for p in processes:
             p.join()
         # Do not call sys.exit(0) here as it interferes with multiprocessing cleanup
diff --git a/keylime/revocation_notifier.py b/keylime/revocation_notifier.py
index 5a7cc4b..c154028 100644
--- a/keylime/revocation_notifier.py
+++ b/keylime/revocation_notifier.py
@@ -18,6 +18,174 @@ broker_proc: Optional[Process] = None

 _SOCKET_PATH = "/var/run/keylime/keylime.verifier.ipc"

+# Global webhook manager instance (initialized when needed)
+_webhook_manager: Optional["WebhookNotificationManager"] = None
+
+
+class WebhookNotificationManager:
+    """Manages webhook worker threads and graceful shutdown for revocation notifications."""
+
+    def __init__(self) -> None:
+        self._shutdown_event = threading.Event()
+        self._workers: Set[threading.Thread] = set()
+        self._workers_lock = threading.Lock()
+
+    def notify_webhook(self, tosend: Dict[str, Any]) -> None:
+        """Send webhook notification with worker thread management."""
+        url = config.get("verifier", "webhook_url", section="revocations", fallback="")
+        # Check if a url was specified
+        if url == "":
+            return
+
+        # Similarly to notify(), let's convert `tosend' to str to prevent
+        # possible issues with json handling by python-requests.
+        tosend = json.bytes_to_str(tosend)
+
+        def worker_webhook(tosend: Dict[str, Any], url: str) -> None:
+            is_shutdown_mode = False
+            try:
+                interval = config.getfloat("verifier", "retry_interval")
+                exponential_backoff = config.getboolean("verifier", "exponential_backoff")
+
+                max_retries = config.getint("verifier", "max_retries")
+                if max_retries <= 0:
+                    logger.info("Invalid value found in 'max_retries' option for verifier, using default value")
+                    max_retries = 5
+
+                # During shutdown, use fewer retries but still make best effort
+                if self._shutdown_event.is_set():
+                    is_shutdown_mode = True
+                    max_retries = min(max_retries, 3)  # Reduce retries during shutdown but still try
+                    logger.info(
+                        "Shutdown mode: attempting to send critical revocation notification with %d retries",
+                        max_retries,
+                    )
+
+                # Get TLS options from the configuration
+                (cert, key, trusted_ca, key_password), verify_server_cert = web_util.get_tls_options(
+                    "verifier", is_client=True, logger=logger
+                )
+
+                # Generate the TLS context using the obtained options
+                tls_context = web_util.generate_tls_context(
+                    cert, key, trusted_ca, key_password, is_client=True, logger=logger
+                )
+
+                logger.info("Sending revocation event via webhook to %s ...", url)
+                for i in range(max_retries):
+                    next_retry = retry.retry_time(exponential_backoff, interval, i, logger)
+
+                    with RequestsClient(
+                        url,
+                        verify_server_cert,
+                        tls_context,
+                    ) as client:
+                        try:
+                            res = client.post("", json=tosend, timeout=5)
+                        except requests.exceptions.SSLError as ssl_error:
+                            if "TLSV1_ALERT_UNKNOWN_CA" in str(ssl_error):
+                                logger.warning(
+                                    "Keylime does not recognize certificate from peer. Check if verifier 'trusted_server_ca' is configured correctly"
+                                )
+
+                            raise ssl_error from ssl_error
+                        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
+                            # During shutdown, only suppress errors on the final attempt after all retries exhausted
+                            if is_shutdown_mode and i == max_retries - 1:
+                                logger.warning(
+                                    "Final attempt to send revocation notification failed during shutdown: %s", e
+                                )
+                                return
+                            # Otherwise, let the retry logic handle it
+                            raise e
+
+                        if res and res.status_code in [200, 202]:
+                            if is_shutdown_mode:
+                                logger.info("Successfully sent revocation notification during shutdown")
+                            break
+
+                        logger.debug(
+                            "Unable to publish revocation message %d times via webhook, "
+                            "trying again in %d seconds. "
+                            "Server returned status code: %s",
+                            i + 1,
+                            next_retry,
+                            res.status_code,
+                        )
+
+                        # During shutdown, use shorter retry intervals to complete faster
+                        if is_shutdown_mode:
+                            next_retry = min(next_retry, 2.0)  # Cap retry interval during shutdown
+
+                        time.sleep(next_retry)
+
+            except Exception as e:
+                # Only suppress errors during final shutdown phase and log appropriately
+                if is_shutdown_mode:
+                    logger.warning("Failed to send revocation notification during shutdown: %s", e)
+                else:
+                    logger.error("Error in webhook worker: %s", e)
+            finally:
+                # Remove this worker from the active set
+                current_thread = threading.current_thread()
+                with self._workers_lock:
+                    self._workers.discard(current_thread)
+
+        w = functools.partial(worker_webhook, tosend, url)
+        t = threading.Thread(target=w, daemon=True)
+
+        # Add this worker to the active set
+        with self._workers_lock:
+            self._workers.add(t)
+
+        t.start()
+
+    def shutdown_workers(self) -> None:
+        """Signal webhook workers to shut down gracefully and wait for them to complete.
+
+        This gives workers time to complete their critical revocation notifications
+        before the service shuts down completely.
+        """
+        logger.info("Shutting down webhook workers gracefully...")
+        self._shutdown_event.set()
+
+        # Give workers generous time to complete critical revocation notifications
+        timeout = 30.0  # Increased timeout for critical security notifications
+        end_time = time.time() + timeout
+
+        with self._workers_lock:
+            workers_to_wait = list(self._workers)
+
+        if workers_to_wait:
+            logger.info("Waiting for %d webhook workers to complete revocation notifications...", len(workers_to_wait))
+
+        for worker in workers_to_wait:
+            remaining_time = max(0, end_time - time.time())
+            if remaining_time > 0:
+                logger.debug(
+                    "Waiting for webhook worker %s to complete (timeout: %.1f seconds)", worker.name, remaining_time
+                )
+                worker.join(timeout=remaining_time)
+                if worker.is_alive():
+                    logger.warning("Webhook worker %s did not complete within timeout", worker.name)
+            else:
+                logger.warning("Timeout exceeded while waiting for webhook workers")
+                break
+
+        # Clean up completed workers
+        with self._workers_lock:
+            self._workers.clear()
+
+        logger.info("Webhook workers shutdown complete")
+
+
+def _get_webhook_manager() -> WebhookNotificationManager:
+    """Get the global webhook manager instance, creating it if needed."""
+    global _webhook_manager
+    if _webhook_manager is None:
+        _webhook_manager = WebhookNotificationManager()
+    return _webhook_manager
+

 # return the revocation notification methods for cloud verifier
 def get_notifiers() -> Set[str]:
@@ -83,6 +251,12 @@ def stop_broker() -> None:
             broker_proc.kill()  # pylint: disable=E1101


+def shutdown_webhook_workers() -> None:
+    """Convenience function to shutdown webhook workers using the global manager."""
+    manager = _get_webhook_manager()
+    manager.shutdown_workers()
+
+
 def notify(tosend: Dict[str, Any]) -> None:
     assert "zeromq" in get_notifiers()
     try:
@@ -127,68 +301,9 @@ def notify(tosend: Dict[str, Any]) -> None:


 def notify_webhook(tosend: Dict[str, Any]) -> None:
-    url = config.get("verifier", "webhook_url", section="revocations", fallback="")
-    # Check if a url was specified
-    if url == "":
-        return
-
-    # Similarly to notify(), let's convert `tosend' to str to prevent
-    # possible issues with json handling by python-requests.
-    tosend = json.bytes_to_str(tosend)
-
-    def worker_webhook(tosend: Dict[str, Any], url: str) -> None:
-        interval = config.getfloat("verifier", "retry_interval")
-        exponential_backoff = config.getboolean("verifier", "exponential_backoff")
-
-        max_retries = config.getint("verifier", "max_retries")
-        if max_retries <= 0:
-            logger.info("Invalid value found in 'max_retries' option for verifier, using default value")
-            max_retries = 5
-
-        # Get TLS options from the configuration
-        (cert, key, trusted_ca, key_password), verify_server_cert = web_util.get_tls_options(
-            "verifier", is_client=True, logger=logger
-        )
-
-        # Generate the TLS context using the obtained options
-        tls_context = web_util.generate_tls_context(cert, key, trusted_ca, key_password, is_client=True, logger=logger)
-
-        logger.info("Sending revocation event via webhook to %s ...", url)
-        for i in range(max_retries):
-            next_retry = retry.retry_time(exponential_backoff, interval, i, logger)
-
-            with RequestsClient(
-                url,
-                verify_server_cert,
-                tls_context,
-            ) as client:
-                try:
-                    res = client.post("", json=tosend, timeout=5)
-                except requests.exceptions.SSLError as ssl_error:
-                    if "TLSV1_ALERT_UNKNOWN_CA" in str(ssl_error):
-                        logger.warning(
-                            "Keylime does not recognize certificate from peer. Check if verifier 'trusted_server_ca' is configured correctly"
-                        )
-
-                    raise ssl_error from ssl_error
-
-                if res and res.status_code in [200, 202]:
-                    break
-
-                logger.debug(
-                    "Unable to publish revocation message %d times via webhook, "
-                    "trying again in %d seconds. "
-                    "Server returned status code: %s",
-                    i + 1,
-                    next_retry,
-                    res.status_code,
-                )
-
-                time.sleep(next_retry)
-
-    w = functools.partial(worker_webhook, tosend, url)
-    t = threading.Thread(target=w, daemon=True)
-    t.start()
+    """Send webhook notification using the global webhook manager."""
+    manager = _get_webhook_manager()
+    manager.notify_webhook(tosend)


 cert_key = None
--
2.47.3