From 302e60d9ffebc04cdd7166db7b9f0b1ee6c0f8a2 Mon Sep 17 00:00:00 2001 From: Sergio Correia Date: Mon, 18 Aug 2025 12:26:23 +0000 Subject: [PATCH] Fix for revocation notifier not closing TLS session correctly Resolves: RHEL-109656 Signed-off-by: Sergio Correia --- ...rifier-Gracefully-shutdown-on-signal.patch | 42 +++ ...ry-to-send-notifications-on-shutdown.patch | 308 ++++++++++++++++++ ...close-the-session-at-the-end-of-the-.patch | 45 +++ keylime.spec | 12 +- 4 files changed, 406 insertions(+), 1 deletion(-) create mode 100644 0010-verifier-Gracefully-shutdown-on-signal.patch create mode 100644 0011-revocations-Try-to-send-notifications-on-shutdown.patch create mode 100644 0012-requests_client-close-the-session-at-the-end-of-the-.patch diff --git a/0010-verifier-Gracefully-shutdown-on-signal.patch b/0010-verifier-Gracefully-shutdown-on-signal.patch new file mode 100644 index 0000000..df7bb23 --- /dev/null +++ b/0010-verifier-Gracefully-shutdown-on-signal.patch @@ -0,0 +1,42 @@ +From 1b7191098ca3f6d72c6ad218564ae0938a87efd4 Mon Sep 17 00:00:00 2001 +From: Anderson Toshiyuki Sasaki +Date: Mon, 18 Aug 2025 12:22:55 +0000 +Subject: [PATCH 10/13] verifier: Gracefully shutdown on signal + +Wait for the processes to finish when interrupted by a signal. Do not +call exit(0) in the signal handler. + +Assisted-by: Claude 4 Sonnet +Signed-off-by: Anderson Toshiyuki Sasaki +--- + keylime/cloud_verifier_tornado.py | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +diff --git a/keylime/cloud_verifier_tornado.py b/keylime/cloud_verifier_tornado.py +index 7553ac8..7065661 100644 +--- a/keylime/cloud_verifier_tornado.py ++++ b/keylime/cloud_verifier_tornado.py +@@ -2138,7 +2138,7 @@ def main() -> None: + revocation_notifier.stop_broker() + for p in processes: + p.join() +- sys.exit(0) ++ # Do not call sys.exit(0) here as it interferes with multiprocessing cleanup + + signal.signal(signal.SIGINT, sig_handler) + signal.signal(signal.SIGTERM, sig_handler) +@@ -2159,3 +2159,11 @@ def main() -> None: + process = Process(target=server_process, args=(task_id, active_agents)) + process.start() + processes.append(process) ++ ++ # Wait for all worker processes to complete ++ try: ++ for p in processes: ++ p.join() ++ except KeyboardInterrupt: ++ # Signal handler will take care of cleanup ++ pass +-- +2.47.3 + diff --git a/0011-revocations-Try-to-send-notifications-on-shutdown.patch b/0011-revocations-Try-to-send-notifications-on-shutdown.patch new file mode 100644 index 0000000..b36836e --- /dev/null +++ b/0011-revocations-Try-to-send-notifications-on-shutdown.patch @@ -0,0 +1,308 @@ +From af9ac50f5acf1a7d4ad285956b60e60c3c4416b7 Mon Sep 17 00:00:00 2001 +From: Anderson Toshiyuki Sasaki +Date: Wed, 23 Jul 2025 15:39:49 +0200 +Subject: [PATCH 11/13] revocations: Try to send notifications on shutdown + +During verifier shutdown, try to send any pending revocation +notification in a best-effort manner. In future, the pending revocation +notifications should be persisted to be processed during next startup. + +Assisted-by: Claude 4 Sonnet +Signed-off-by: Anderson Toshiyuki Sasaki +--- + keylime/cloud_verifier_tornado.py | 7 + + keylime/revocation_notifier.py | 239 ++++++++++++++++++++++-------- + 2 files changed, 184 insertions(+), 62 deletions(-) + +diff --git a/keylime/cloud_verifier_tornado.py b/keylime/cloud_verifier_tornado.py +index 7065661..89aa703 100644 +--- a/keylime/cloud_verifier_tornado.py ++++ b/keylime/cloud_verifier_tornado.py +@@ -2109,6 +2109,10 @@ def main() -> None: + # Stop server to not accept new incoming connections + server.stop() + ++ # Gracefully shutdown webhook workers to prevent connection errors ++ if "webhook" in revocation_notifier.get_notifiers(): ++ revocation_notifier.shutdown_webhook_workers() ++ + # Wait for all connections to be closed and then stop ioloop + async def stop() -> None: + await server.close_all_connections() +@@ -2136,6 +2140,9 @@ def main() -> None: + def sig_handler(*_: Any) -> None: + if run_revocation_notifier: + revocation_notifier.stop_broker() ++ # Gracefully shutdown webhook workers to prevent connection errors ++ if "webhook" in revocation_notifier.get_notifiers(): ++ revocation_notifier.shutdown_webhook_workers() + for p in processes: + p.join() + # Do not call sys.exit(0) here as it interferes with multiprocessing cleanup +diff --git a/keylime/revocation_notifier.py b/keylime/revocation_notifier.py +index 5a7cc4b..c154028 100644 +--- a/keylime/revocation_notifier.py ++++ b/keylime/revocation_notifier.py +@@ -18,6 +18,174 @@ broker_proc: Optional[Process] = None + + _SOCKET_PATH = "/var/run/keylime/keylime.verifier.ipc" + ++# Global webhook manager instance (initialized when needed) ++_webhook_manager: Optional["WebhookNotificationManager"] = None ++ ++ ++class WebhookNotificationManager: ++ """Manages webhook worker threads and graceful shutdown for revocation notifications.""" ++ ++ def __init__(self) -> None: ++ self._shutdown_event = threading.Event() ++ self._workers: Set[threading.Thread] = set() ++ self._workers_lock = threading.Lock() ++ ++ def notify_webhook(self, tosend: Dict[str, Any]) -> None: ++ """Send webhook notification with worker thread management.""" ++ url = config.get("verifier", "webhook_url", section="revocations", fallback="") ++ # Check if a url was specified ++ if url == "": ++ return ++ ++ # Similarly to notify(), let's convert `tosend' to str to prevent ++ # possible issues with json handling by python-requests. ++ tosend = json.bytes_to_str(tosend) ++ ++ def worker_webhook(tosend: Dict[str, Any], url: str) -> None: ++ is_shutdown_mode = False ++ try: ++ interval = config.getfloat("verifier", "retry_interval") ++ exponential_backoff = config.getboolean("verifier", "exponential_backoff") ++ ++ max_retries = config.getint("verifier", "max_retries") ++ if max_retries <= 0: ++ logger.info("Invalid value found in 'max_retries' option for verifier, using default value") ++ max_retries = 5 ++ ++ # During shutdown, use fewer retries but still make best effort ++ if self._shutdown_event.is_set(): ++ is_shutdown_mode = True ++ max_retries = min(max_retries, 3) # Reduce retries during shutdown but still try ++ logger.info( ++ "Shutdown mode: attempting to send critical revocation notification with %d retries", ++ max_retries, ++ ) ++ ++ # Get TLS options from the configuration ++ (cert, key, trusted_ca, key_password), verify_server_cert = web_util.get_tls_options( ++ "verifier", is_client=True, logger=logger ++ ) ++ ++ # Generate the TLS context using the obtained options ++ tls_context = web_util.generate_tls_context( ++ cert, key, trusted_ca, key_password, is_client=True, logger=logger ++ ) ++ ++ logger.info("Sending revocation event via webhook to %s ...", url) ++ for i in range(max_retries): ++ next_retry = retry.retry_time(exponential_backoff, interval, i, logger) ++ ++ with RequestsClient( ++ url, ++ verify_server_cert, ++ tls_context, ++ ) as client: ++ try: ++ res = client.post("", json=tosend, timeout=5) ++ except requests.exceptions.SSLError as ssl_error: ++ if "TLSV1_ALERT_UNKNOWN_CA" in str(ssl_error): ++ logger.warning( ++ "Keylime does not recognize certificate from peer. Check if verifier 'trusted_server_ca' is configured correctly" ++ ) ++ ++ raise ssl_error from ssl_error ++ except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e: ++ # During shutdown, only suppress errors on the final attempt after all retries exhausted ++ if is_shutdown_mode and i == max_retries - 1: ++ logger.warning( ++ "Final attempt to send revocation notification failed during shutdown: %s", e ++ ) ++ return ++ # Otherwise, let the retry logic handle it ++ raise e ++ ++ if res and res.status_code in [200, 202]: ++ if is_shutdown_mode: ++ logger.info("Successfully sent revocation notification during shutdown") ++ break ++ ++ logger.debug( ++ "Unable to publish revocation message %d times via webhook, " ++ "trying again in %d seconds. " ++ "Server returned status code: %s", ++ i + 1, ++ next_retry, ++ res.status_code, ++ ) ++ ++ # During shutdown, use shorter retry intervals to complete faster ++ if is_shutdown_mode: ++ next_retry = min(next_retry, 2.0) # Cap retry interval during shutdown ++ ++ time.sleep(next_retry) ++ ++ except Exception as e: ++ # Only suppress errors during final shutdown phase and log appropriately ++ if is_shutdown_mode: ++ logger.warning("Failed to send revocation notification during shutdown: %s", e) ++ else: ++ logger.error("Error in webhook worker: %s", e) ++ finally: ++ # Remove this worker from the active set ++ current_thread = threading.current_thread() ++ with self._workers_lock: ++ self._workers.discard(current_thread) ++ ++ w = functools.partial(worker_webhook, tosend, url) ++ t = threading.Thread(target=w, daemon=True) ++ ++ # Add this worker to the active set ++ with self._workers_lock: ++ self._workers.add(t) ++ ++ t.start() ++ ++ def shutdown_workers(self) -> None: ++ """Signal webhook workers to shut down gracefully and wait for them to complete. ++ ++ This gives workers time to complete their critical revocation notifications ++ before the service shuts down completely. ++ """ ++ logger.info("Shutting down webhook workers gracefully...") ++ self._shutdown_event.set() ++ ++ # Give workers generous time to complete critical revocation notifications ++ timeout = 30.0 # Increased timeout for critical security notifications ++ end_time = time.time() + timeout ++ ++ with self._workers_lock: ++ workers_to_wait = list(self._workers) ++ ++ if workers_to_wait: ++ logger.info("Waiting for %d webhook workers to complete revocation notifications...", len(workers_to_wait)) ++ ++ for worker in workers_to_wait: ++ remaining_time = max(0, end_time - time.time()) ++ if remaining_time > 0: ++ logger.debug( ++ "Waiting for webhook worker %s to complete (timeout: %.1f seconds)", worker.name, remaining_time ++ ) ++ worker.join(timeout=remaining_time) ++ if worker.is_alive(): ++ logger.warning("Webhook worker %s did not complete within timeout", worker.name) ++ else: ++ logger.warning("Timeout exceeded while waiting for webhook workers") ++ break ++ ++ # Clean up completed workers ++ with self._workers_lock: ++ self._workers.clear() ++ ++ logger.info("Webhook workers shutdown complete") ++ ++ ++def _get_webhook_manager() -> WebhookNotificationManager: ++ """Get the global webhook manager instance, creating it if needed.""" ++ global _webhook_manager ++ if _webhook_manager is None: ++ _webhook_manager = WebhookNotificationManager() ++ return _webhook_manager ++ + + # return the revocation notification methods for cloud verifier + def get_notifiers() -> Set[str]: +@@ -83,6 +251,12 @@ def stop_broker() -> None: + broker_proc.kill() # pylint: disable=E1101 + + ++def shutdown_webhook_workers() -> None: ++ """Convenience function to shutdown webhook workers using the global manager.""" ++ manager = _get_webhook_manager() ++ manager.shutdown_workers() ++ ++ + def notify(tosend: Dict[str, Any]) -> None: + assert "zeromq" in get_notifiers() + try: +@@ -127,68 +301,9 @@ def notify(tosend: Dict[str, Any]) -> None: + + + def notify_webhook(tosend: Dict[str, Any]) -> None: +- url = config.get("verifier", "webhook_url", section="revocations", fallback="") +- # Check if a url was specified +- if url == "": +- return +- +- # Similarly to notify(), let's convert `tosend' to str to prevent +- # possible issues with json handling by python-requests. +- tosend = json.bytes_to_str(tosend) +- +- def worker_webhook(tosend: Dict[str, Any], url: str) -> None: +- interval = config.getfloat("verifier", "retry_interval") +- exponential_backoff = config.getboolean("verifier", "exponential_backoff") +- +- max_retries = config.getint("verifier", "max_retries") +- if max_retries <= 0: +- logger.info("Invalid value found in 'max_retries' option for verifier, using default value") +- max_retries = 5 +- +- # Get TLS options from the configuration +- (cert, key, trusted_ca, key_password), verify_server_cert = web_util.get_tls_options( +- "verifier", is_client=True, logger=logger +- ) +- +- # Generate the TLS context using the obtained options +- tls_context = web_util.generate_tls_context(cert, key, trusted_ca, key_password, is_client=True, logger=logger) +- +- logger.info("Sending revocation event via webhook to %s ...", url) +- for i in range(max_retries): +- next_retry = retry.retry_time(exponential_backoff, interval, i, logger) +- +- with RequestsClient( +- url, +- verify_server_cert, +- tls_context, +- ) as client: +- try: +- res = client.post("", json=tosend, timeout=5) +- except requests.exceptions.SSLError as ssl_error: +- if "TLSV1_ALERT_UNKNOWN_CA" in str(ssl_error): +- logger.warning( +- "Keylime does not recognize certificate from peer. Check if verifier 'trusted_server_ca' is configured correctly" +- ) +- +- raise ssl_error from ssl_error +- +- if res and res.status_code in [200, 202]: +- break +- +- logger.debug( +- "Unable to publish revocation message %d times via webhook, " +- "trying again in %d seconds. " +- "Server returned status code: %s", +- i + 1, +- next_retry, +- res.status_code, +- ) +- +- time.sleep(next_retry) +- +- w = functools.partial(worker_webhook, tosend, url) +- t = threading.Thread(target=w, daemon=True) +- t.start() ++ """Send webhook notification using the global webhook manager.""" ++ manager = _get_webhook_manager() ++ manager.notify_webhook(tosend) + + + cert_key = None +-- +2.47.3 + diff --git a/0012-requests_client-close-the-session-at-the-end-of-the-.patch b/0012-requests_client-close-the-session-at-the-end-of-the-.patch new file mode 100644 index 0000000..ba24d60 --- /dev/null +++ b/0012-requests_client-close-the-session-at-the-end-of-the-.patch @@ -0,0 +1,45 @@ +From 5fb4484b07a7ba3fcdf451bf816b5f07a40d6d97 Mon Sep 17 00:00:00 2001 +From: Sergio Correia +Date: Wed, 4 Jun 2025 19:52:37 +0100 +Subject: [PATCH 12/13] requests_client: close the session at the end of the + resource manager + +We had an issue in the past in which the webhook worker would not +properly close the opened session. This was fixed in #1456 (Close +session in worker_webhook function). + +At some later point, in #1566 (revocation_notifier: Take into account CA +certificates added via configuration), some refactoring around the +webhook_worker() in revocation_notifier happened and it started using +the RequestsClient resource manager. + +However, the RequestsClient does not close the session at its end, which +in turns makes that the old issue of not closing properly the session +in the webhook_worker() returned. + +We now issue a session.close() at the end of the RequestsClient. + +Signed-off-by: Sergio Correia +--- + keylime/requests_client.py | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/keylime/requests_client.py b/keylime/requests_client.py +index 16615f7..b7da484 100644 +--- a/keylime/requests_client.py ++++ b/keylime/requests_client.py +@@ -40,7 +40,10 @@ class RequestsClient: + return self + + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: +- pass ++ try: ++ self.session.close() ++ except Exception: ++ pass + + def request(self, method: str, url: str, **kwargs: Any) -> requests.Response: + return self.session.request(method, self.base_url + url, **kwargs) +-- +2.47.3 + diff --git a/keylime.spec b/keylime.spec index f007ad5..a31de45 100644 --- a/keylime.spec +++ b/keylime.spec @@ -9,7 +9,7 @@ Name: keylime Version: 7.12.1 -Release: 10%{?dist} +Release: 11%{?dist} Summary: Open source TPM software for Bootstrapping and Maintaining Trust URL: https://github.com/keylime/keylime @@ -34,6 +34,12 @@ Patch: 0007-fix_db_connection_leaks.patch Patch: 0008-mb-support-EV_EFI_HANDOFF_TABLES-events-on-PCR1.patch Patch: 0009-mb-support-vendor_db-as-logged-by-newer-shim-version.patch +# Backported from https://github.com/keylime/keylime/pull/1784 +# and https://github.com/keylime/keylime/pull/1785. +Patch: 0010-verifier-Gracefully-shutdown-on-signal.patch +Patch: 0011-revocations-Try-to-send-notifications-on-shutdown.patch +Patch: 0012-requests_client-close-the-session-at-the-end-of-the-.patch + License: ASL 2.0 and MIT BuildRequires: git-core @@ -427,6 +433,10 @@ fi %license LICENSE %changelog +* Mon Aug 18 2025 Sergio Correia - 7.12.1-11 +- Fix for revocation notifier not closing TLS session correctly + Resolves: RHEL-109656 + * Wed Aug 13 2025 Sergio Correia - 7.12.1-10 - Support vendor_db: follow-up fix Related: RHEL-80455