Fix for revocation notifier not closing TLS session correctly

Resolves: RHEL-109656

Signed-off-by: Sergio Correia <scorreia@redhat.com>
This commit is contained in:
Sergio Correia 2025-08-18 12:26:23 +00:00
parent f280e120cd
commit 302e60d9ff
No known key found for this signature in database
GPG Key ID: D0D219ED1F7E762C
4 changed files with 406 additions and 1 deletions

View File

@ -0,0 +1,42 @@
From 1b7191098ca3f6d72c6ad218564ae0938a87efd4 Mon Sep 17 00:00:00 2001
From: Anderson Toshiyuki Sasaki <ansasaki@redhat.com>
Date: Mon, 18 Aug 2025 12:22:55 +0000
Subject: [PATCH 10/13] verifier: Gracefully shutdown on signal
Wait for the processes to finish when interrupted by a signal. Do not
call exit(0) in the signal handler.
Assisted-by: Claude 4 Sonnet
Signed-off-by: Anderson Toshiyuki Sasaki <ansasaki@redhat.com>
---
keylime/cloud_verifier_tornado.py | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/keylime/cloud_verifier_tornado.py b/keylime/cloud_verifier_tornado.py
index 7553ac8..7065661 100644
--- a/keylime/cloud_verifier_tornado.py
+++ b/keylime/cloud_verifier_tornado.py
@@ -2138,7 +2138,7 @@ def main() -> None:
revocation_notifier.stop_broker()
for p in processes:
p.join()
- sys.exit(0)
+ # Do not call sys.exit(0) here as it interferes with multiprocessing cleanup
signal.signal(signal.SIGINT, sig_handler)
signal.signal(signal.SIGTERM, sig_handler)
@@ -2159,3 +2159,11 @@ def main() -> None:
process = Process(target=server_process, args=(task_id, active_agents))
process.start()
processes.append(process)
+
+ # Wait for all worker processes to complete
+ try:
+ for p in processes:
+ p.join()
+ except KeyboardInterrupt:
+ # Signal handler will take care of cleanup
+ pass
--
2.47.3

View File

@ -0,0 +1,308 @@
From af9ac50f5acf1a7d4ad285956b60e60c3c4416b7 Mon Sep 17 00:00:00 2001
From: Anderson Toshiyuki Sasaki <ansasaki@redhat.com>
Date: Wed, 23 Jul 2025 15:39:49 +0200
Subject: [PATCH 11/13] revocations: Try to send notifications on shutdown
During verifier shutdown, try to send any pending revocation
notification in a best-effort manner. In future, the pending revocation
notifications should be persisted to be processed during next startup.
Assisted-by: Claude 4 Sonnet
Signed-off-by: Anderson Toshiyuki Sasaki <ansasaki@redhat.com>
---
keylime/cloud_verifier_tornado.py | 7 +
keylime/revocation_notifier.py | 239 ++++++++++++++++++++++--------
2 files changed, 184 insertions(+), 62 deletions(-)
diff --git a/keylime/cloud_verifier_tornado.py b/keylime/cloud_verifier_tornado.py
index 7065661..89aa703 100644
--- a/keylime/cloud_verifier_tornado.py
+++ b/keylime/cloud_verifier_tornado.py
@@ -2109,6 +2109,10 @@ def main() -> None:
# Stop server to not accept new incoming connections
server.stop()
+ # Gracefully shutdown webhook workers to prevent connection errors
+ if "webhook" in revocation_notifier.get_notifiers():
+ revocation_notifier.shutdown_webhook_workers()
+
# Wait for all connections to be closed and then stop ioloop
async def stop() -> None:
await server.close_all_connections()
@@ -2136,6 +2140,9 @@ def main() -> None:
def sig_handler(*_: Any) -> None:
if run_revocation_notifier:
revocation_notifier.stop_broker()
+ # Gracefully shutdown webhook workers to prevent connection errors
+ if "webhook" in revocation_notifier.get_notifiers():
+ revocation_notifier.shutdown_webhook_workers()
for p in processes:
p.join()
# Do not call sys.exit(0) here as it interferes with multiprocessing cleanup
diff --git a/keylime/revocation_notifier.py b/keylime/revocation_notifier.py
index 5a7cc4b..c154028 100644
--- a/keylime/revocation_notifier.py
+++ b/keylime/revocation_notifier.py
@@ -18,6 +18,174 @@ broker_proc: Optional[Process] = None
_SOCKET_PATH = "/var/run/keylime/keylime.verifier.ipc"
+# Global webhook manager instance (initialized when needed)
+_webhook_manager: Optional["WebhookNotificationManager"] = None
+
+
+class WebhookNotificationManager:
+ """Manages webhook worker threads and graceful shutdown for revocation notifications."""
+
+ def __init__(self) -> None:
+ self._shutdown_event = threading.Event()
+ self._workers: Set[threading.Thread] = set()
+ self._workers_lock = threading.Lock()
+
+ def notify_webhook(self, tosend: Dict[str, Any]) -> None:
+ """Send webhook notification with worker thread management."""
+ url = config.get("verifier", "webhook_url", section="revocations", fallback="")
+ # Check if a url was specified
+ if url == "":
+ return
+
+ # Similarly to notify(), let's convert `tosend' to str to prevent
+ # possible issues with json handling by python-requests.
+ tosend = json.bytes_to_str(tosend)
+
+ def worker_webhook(tosend: Dict[str, Any], url: str) -> None:
+ is_shutdown_mode = False
+ try:
+ interval = config.getfloat("verifier", "retry_interval")
+ exponential_backoff = config.getboolean("verifier", "exponential_backoff")
+
+ max_retries = config.getint("verifier", "max_retries")
+ if max_retries <= 0:
+ logger.info("Invalid value found in 'max_retries' option for verifier, using default value")
+ max_retries = 5
+
+ # During shutdown, use fewer retries but still make best effort
+ if self._shutdown_event.is_set():
+ is_shutdown_mode = True
+ max_retries = min(max_retries, 3) # Reduce retries during shutdown but still try
+ logger.info(
+ "Shutdown mode: attempting to send critical revocation notification with %d retries",
+ max_retries,
+ )
+
+ # Get TLS options from the configuration
+ (cert, key, trusted_ca, key_password), verify_server_cert = web_util.get_tls_options(
+ "verifier", is_client=True, logger=logger
+ )
+
+ # Generate the TLS context using the obtained options
+ tls_context = web_util.generate_tls_context(
+ cert, key, trusted_ca, key_password, is_client=True, logger=logger
+ )
+
+ logger.info("Sending revocation event via webhook to %s ...", url)
+ for i in range(max_retries):
+ next_retry = retry.retry_time(exponential_backoff, interval, i, logger)
+
+ with RequestsClient(
+ url,
+ verify_server_cert,
+ tls_context,
+ ) as client:
+ try:
+ res = client.post("", json=tosend, timeout=5)
+ except requests.exceptions.SSLError as ssl_error:
+ if "TLSV1_ALERT_UNKNOWN_CA" in str(ssl_error):
+ logger.warning(
+ "Keylime does not recognize certificate from peer. Check if verifier 'trusted_server_ca' is configured correctly"
+ )
+
+ raise ssl_error from ssl_error
+ except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
+ # During shutdown, only suppress errors on the final attempt after all retries exhausted
+ if is_shutdown_mode and i == max_retries - 1:
+ logger.warning(
+ "Final attempt to send revocation notification failed during shutdown: %s", e
+ )
+ return
+ # Otherwise, let the retry logic handle it
+ raise e
+
+ if res and res.status_code in [200, 202]:
+ if is_shutdown_mode:
+ logger.info("Successfully sent revocation notification during shutdown")
+ break
+
+ logger.debug(
+ "Unable to publish revocation message %d times via webhook, "
+ "trying again in %d seconds. "
+ "Server returned status code: %s",
+ i + 1,
+ next_retry,
+ res.status_code,
+ )
+
+ # During shutdown, use shorter retry intervals to complete faster
+ if is_shutdown_mode:
+ next_retry = min(next_retry, 2.0) # Cap retry interval during shutdown
+
+ time.sleep(next_retry)
+
+ except Exception as e:
+ # Only suppress errors during final shutdown phase and log appropriately
+ if is_shutdown_mode:
+ logger.warning("Failed to send revocation notification during shutdown: %s", e)
+ else:
+ logger.error("Error in webhook worker: %s", e)
+ finally:
+ # Remove this worker from the active set
+ current_thread = threading.current_thread()
+ with self._workers_lock:
+ self._workers.discard(current_thread)
+
+ w = functools.partial(worker_webhook, tosend, url)
+ t = threading.Thread(target=w, daemon=True)
+
+ # Add this worker to the active set
+ with self._workers_lock:
+ self._workers.add(t)
+
+ t.start()
+
+ def shutdown_workers(self) -> None:
+ """Signal webhook workers to shut down gracefully and wait for them to complete.
+
+ This gives workers time to complete their critical revocation notifications
+ before the service shuts down completely.
+ """
+ logger.info("Shutting down webhook workers gracefully...")
+ self._shutdown_event.set()
+
+ # Give workers generous time to complete critical revocation notifications
+ timeout = 30.0 # Increased timeout for critical security notifications
+ end_time = time.time() + timeout
+
+ with self._workers_lock:
+ workers_to_wait = list(self._workers)
+
+ if workers_to_wait:
+ logger.info("Waiting for %d webhook workers to complete revocation notifications...", len(workers_to_wait))
+
+ for worker in workers_to_wait:
+ remaining_time = max(0, end_time - time.time())
+ if remaining_time > 0:
+ logger.debug(
+ "Waiting for webhook worker %s to complete (timeout: %.1f seconds)", worker.name, remaining_time
+ )
+ worker.join(timeout=remaining_time)
+ if worker.is_alive():
+ logger.warning("Webhook worker %s did not complete within timeout", worker.name)
+ else:
+ logger.warning("Timeout exceeded while waiting for webhook workers")
+ break
+
+ # Clean up completed workers
+ with self._workers_lock:
+ self._workers.clear()
+
+ logger.info("Webhook workers shutdown complete")
+
+
+def _get_webhook_manager() -> WebhookNotificationManager:
+ """Get the global webhook manager instance, creating it if needed."""
+ global _webhook_manager
+ if _webhook_manager is None:
+ _webhook_manager = WebhookNotificationManager()
+ return _webhook_manager
+
# return the revocation notification methods for cloud verifier
def get_notifiers() -> Set[str]:
@@ -83,6 +251,12 @@ def stop_broker() -> None:
broker_proc.kill() # pylint: disable=E1101
+def shutdown_webhook_workers() -> None:
+ """Convenience function to shutdown webhook workers using the global manager."""
+ manager = _get_webhook_manager()
+ manager.shutdown_workers()
+
+
def notify(tosend: Dict[str, Any]) -> None:
assert "zeromq" in get_notifiers()
try:
@@ -127,68 +301,9 @@ def notify(tosend: Dict[str, Any]) -> None:
def notify_webhook(tosend: Dict[str, Any]) -> None:
- url = config.get("verifier", "webhook_url", section="revocations", fallback="")
- # Check if a url was specified
- if url == "":
- return
-
- # Similarly to notify(), let's convert `tosend' to str to prevent
- # possible issues with json handling by python-requests.
- tosend = json.bytes_to_str(tosend)
-
- def worker_webhook(tosend: Dict[str, Any], url: str) -> None:
- interval = config.getfloat("verifier", "retry_interval")
- exponential_backoff = config.getboolean("verifier", "exponential_backoff")
-
- max_retries = config.getint("verifier", "max_retries")
- if max_retries <= 0:
- logger.info("Invalid value found in 'max_retries' option for verifier, using default value")
- max_retries = 5
-
- # Get TLS options from the configuration
- (cert, key, trusted_ca, key_password), verify_server_cert = web_util.get_tls_options(
- "verifier", is_client=True, logger=logger
- )
-
- # Generate the TLS context using the obtained options
- tls_context = web_util.generate_tls_context(cert, key, trusted_ca, key_password, is_client=True, logger=logger)
-
- logger.info("Sending revocation event via webhook to %s ...", url)
- for i in range(max_retries):
- next_retry = retry.retry_time(exponential_backoff, interval, i, logger)
-
- with RequestsClient(
- url,
- verify_server_cert,
- tls_context,
- ) as client:
- try:
- res = client.post("", json=tosend, timeout=5)
- except requests.exceptions.SSLError as ssl_error:
- if "TLSV1_ALERT_UNKNOWN_CA" in str(ssl_error):
- logger.warning(
- "Keylime does not recognize certificate from peer. Check if verifier 'trusted_server_ca' is configured correctly"
- )
-
- raise ssl_error from ssl_error
-
- if res and res.status_code in [200, 202]:
- break
-
- logger.debug(
- "Unable to publish revocation message %d times via webhook, "
- "trying again in %d seconds. "
- "Server returned status code: %s",
- i + 1,
- next_retry,
- res.status_code,
- )
-
- time.sleep(next_retry)
-
- w = functools.partial(worker_webhook, tosend, url)
- t = threading.Thread(target=w, daemon=True)
- t.start()
+ """Send webhook notification using the global webhook manager."""
+ manager = _get_webhook_manager()
+ manager.notify_webhook(tosend)
cert_key = None
--
2.47.3

View File

@ -0,0 +1,45 @@
From 5fb4484b07a7ba3fcdf451bf816b5f07a40d6d97 Mon Sep 17 00:00:00 2001
From: Sergio Correia <scorreia@redhat.com>
Date: Wed, 4 Jun 2025 19:52:37 +0100
Subject: [PATCH 12/13] requests_client: close the session at the end of the
resource manager
We had an issue in the past in which the webhook worker would not
properly close the opened session. This was fixed in #1456 (Close
session in worker_webhook function).
At some later point, in #1566 (revocation_notifier: Take into account CA
certificates added via configuration), some refactoring around the
webhook_worker() in revocation_notifier happened and it started using
the RequestsClient resource manager.
However, the RequestsClient does not close the session at its end, which
in turns makes that the old issue of not closing properly the session
in the webhook_worker() returned.
We now issue a session.close() at the end of the RequestsClient.
Signed-off-by: Sergio Correia <scorreia@redhat.com>
---
keylime/requests_client.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/keylime/requests_client.py b/keylime/requests_client.py
index 16615f7..b7da484 100644
--- a/keylime/requests_client.py
+++ b/keylime/requests_client.py
@@ -40,7 +40,10 @@ class RequestsClient:
return self
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
- pass
+ try:
+ self.session.close()
+ except Exception:
+ pass
def request(self, method: str, url: str, **kwargs: Any) -> requests.Response:
return self.session.request(method, self.base_url + url, **kwargs)
--
2.47.3

View File

@ -9,7 +9,7 @@
Name: keylime
Version: 7.12.1
Release: 10%{?dist}
Release: 11%{?dist}
Summary: Open source TPM software for Bootstrapping and Maintaining Trust
URL: https://github.com/keylime/keylime
@ -34,6 +34,12 @@ Patch: 0007-fix_db_connection_leaks.patch
Patch: 0008-mb-support-EV_EFI_HANDOFF_TABLES-events-on-PCR1.patch
Patch: 0009-mb-support-vendor_db-as-logged-by-newer-shim-version.patch
# Backported from https://github.com/keylime/keylime/pull/1784
# and https://github.com/keylime/keylime/pull/1785.
Patch: 0010-verifier-Gracefully-shutdown-on-signal.patch
Patch: 0011-revocations-Try-to-send-notifications-on-shutdown.patch
Patch: 0012-requests_client-close-the-session-at-the-end-of-the-.patch
License: ASL 2.0 and MIT
BuildRequires: git-core
@ -427,6 +433,10 @@ fi
%license LICENSE
%changelog
* Mon Aug 18 2025 Sergio Correia <scorreia@redhat.com> - 7.12.1-11
- Fix for revocation notifier not closing TLS session correctly
Resolves: RHEL-109656
* Wed Aug 13 2025 Sergio Correia <scorreia@redhat.com> - 7.12.1-10
- Support vendor_db: follow-up fix
Related: RHEL-80455