Fix for revocation notifier not closing TLS session correctly
Resolves: RHEL-109656 Signed-off-by: Sergio Correia <scorreia@redhat.com>
This commit is contained in:
parent
f280e120cd
commit
302e60d9ff
42
0010-verifier-Gracefully-shutdown-on-signal.patch
Normal file
42
0010-verifier-Gracefully-shutdown-on-signal.patch
Normal file
@ -0,0 +1,42 @@
|
||||
From 1b7191098ca3f6d72c6ad218564ae0938a87efd4 Mon Sep 17 00:00:00 2001
|
||||
From: Anderson Toshiyuki Sasaki <ansasaki@redhat.com>
|
||||
Date: Mon, 18 Aug 2025 12:22:55 +0000
|
||||
Subject: [PATCH 10/13] verifier: Gracefully shutdown on signal
|
||||
|
||||
Wait for the processes to finish when interrupted by a signal. Do not
|
||||
call exit(0) in the signal handler.
|
||||
|
||||
Assisted-by: Claude 4 Sonnet
|
||||
Signed-off-by: Anderson Toshiyuki Sasaki <ansasaki@redhat.com>
|
||||
---
|
||||
keylime/cloud_verifier_tornado.py | 10 +++++++++-
|
||||
1 file changed, 9 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/keylime/cloud_verifier_tornado.py b/keylime/cloud_verifier_tornado.py
|
||||
index 7553ac8..7065661 100644
|
||||
--- a/keylime/cloud_verifier_tornado.py
|
||||
+++ b/keylime/cloud_verifier_tornado.py
|
||||
@@ -2138,7 +2138,7 @@ def main() -> None:
|
||||
revocation_notifier.stop_broker()
|
||||
for p in processes:
|
||||
p.join()
|
||||
- sys.exit(0)
|
||||
+ # Do not call sys.exit(0) here as it interferes with multiprocessing cleanup
|
||||
|
||||
signal.signal(signal.SIGINT, sig_handler)
|
||||
signal.signal(signal.SIGTERM, sig_handler)
|
||||
@@ -2159,3 +2159,11 @@ def main() -> None:
|
||||
process = Process(target=server_process, args=(task_id, active_agents))
|
||||
process.start()
|
||||
processes.append(process)
|
||||
+
|
||||
+ # Wait for all worker processes to complete
|
||||
+ try:
|
||||
+ for p in processes:
|
||||
+ p.join()
|
||||
+ except KeyboardInterrupt:
|
||||
+ # Signal handler will take care of cleanup
|
||||
+ pass
|
||||
--
|
||||
2.47.3
|
||||
|
||||
308
0011-revocations-Try-to-send-notifications-on-shutdown.patch
Normal file
308
0011-revocations-Try-to-send-notifications-on-shutdown.patch
Normal file
@ -0,0 +1,308 @@
|
||||
From af9ac50f5acf1a7d4ad285956b60e60c3c4416b7 Mon Sep 17 00:00:00 2001
|
||||
From: Anderson Toshiyuki Sasaki <ansasaki@redhat.com>
|
||||
Date: Wed, 23 Jul 2025 15:39:49 +0200
|
||||
Subject: [PATCH 11/13] revocations: Try to send notifications on shutdown
|
||||
|
||||
During verifier shutdown, try to send any pending revocation
|
||||
notification in a best-effort manner. In future, the pending revocation
|
||||
notifications should be persisted to be processed during next startup.
|
||||
|
||||
Assisted-by: Claude 4 Sonnet
|
||||
Signed-off-by: Anderson Toshiyuki Sasaki <ansasaki@redhat.com>
|
||||
---
|
||||
keylime/cloud_verifier_tornado.py | 7 +
|
||||
keylime/revocation_notifier.py | 239 ++++++++++++++++++++++--------
|
||||
2 files changed, 184 insertions(+), 62 deletions(-)
|
||||
|
||||
diff --git a/keylime/cloud_verifier_tornado.py b/keylime/cloud_verifier_tornado.py
|
||||
index 7065661..89aa703 100644
|
||||
--- a/keylime/cloud_verifier_tornado.py
|
||||
+++ b/keylime/cloud_verifier_tornado.py
|
||||
@@ -2109,6 +2109,10 @@ def main() -> None:
|
||||
# Stop server to not accept new incoming connections
|
||||
server.stop()
|
||||
|
||||
+ # Gracefully shutdown webhook workers to prevent connection errors
|
||||
+ if "webhook" in revocation_notifier.get_notifiers():
|
||||
+ revocation_notifier.shutdown_webhook_workers()
|
||||
+
|
||||
# Wait for all connections to be closed and then stop ioloop
|
||||
async def stop() -> None:
|
||||
await server.close_all_connections()
|
||||
@@ -2136,6 +2140,9 @@ def main() -> None:
|
||||
def sig_handler(*_: Any) -> None:
|
||||
if run_revocation_notifier:
|
||||
revocation_notifier.stop_broker()
|
||||
+ # Gracefully shutdown webhook workers to prevent connection errors
|
||||
+ if "webhook" in revocation_notifier.get_notifiers():
|
||||
+ revocation_notifier.shutdown_webhook_workers()
|
||||
for p in processes:
|
||||
p.join()
|
||||
# Do not call sys.exit(0) here as it interferes with multiprocessing cleanup
|
||||
diff --git a/keylime/revocation_notifier.py b/keylime/revocation_notifier.py
|
||||
index 5a7cc4b..c154028 100644
|
||||
--- a/keylime/revocation_notifier.py
|
||||
+++ b/keylime/revocation_notifier.py
|
||||
@@ -18,6 +18,174 @@ broker_proc: Optional[Process] = None
|
||||
|
||||
_SOCKET_PATH = "/var/run/keylime/keylime.verifier.ipc"
|
||||
|
||||
+# Global webhook manager instance (initialized when needed)
|
||||
+_webhook_manager: Optional["WebhookNotificationManager"] = None
|
||||
+
|
||||
+
|
||||
+class WebhookNotificationManager:
|
||||
+ """Manages webhook worker threads and graceful shutdown for revocation notifications."""
|
||||
+
|
||||
+ def __init__(self) -> None:
|
||||
+ self._shutdown_event = threading.Event()
|
||||
+ self._workers: Set[threading.Thread] = set()
|
||||
+ self._workers_lock = threading.Lock()
|
||||
+
|
||||
+ def notify_webhook(self, tosend: Dict[str, Any]) -> None:
|
||||
+ """Send webhook notification with worker thread management."""
|
||||
+ url = config.get("verifier", "webhook_url", section="revocations", fallback="")
|
||||
+ # Check if a url was specified
|
||||
+ if url == "":
|
||||
+ return
|
||||
+
|
||||
+ # Similarly to notify(), let's convert `tosend' to str to prevent
|
||||
+ # possible issues with json handling by python-requests.
|
||||
+ tosend = json.bytes_to_str(tosend)
|
||||
+
|
||||
+ def worker_webhook(tosend: Dict[str, Any], url: str) -> None:
|
||||
+ is_shutdown_mode = False
|
||||
+ try:
|
||||
+ interval = config.getfloat("verifier", "retry_interval")
|
||||
+ exponential_backoff = config.getboolean("verifier", "exponential_backoff")
|
||||
+
|
||||
+ max_retries = config.getint("verifier", "max_retries")
|
||||
+ if max_retries <= 0:
|
||||
+ logger.info("Invalid value found in 'max_retries' option for verifier, using default value")
|
||||
+ max_retries = 5
|
||||
+
|
||||
+ # During shutdown, use fewer retries but still make best effort
|
||||
+ if self._shutdown_event.is_set():
|
||||
+ is_shutdown_mode = True
|
||||
+ max_retries = min(max_retries, 3) # Reduce retries during shutdown but still try
|
||||
+ logger.info(
|
||||
+ "Shutdown mode: attempting to send critical revocation notification with %d retries",
|
||||
+ max_retries,
|
||||
+ )
|
||||
+
|
||||
+ # Get TLS options from the configuration
|
||||
+ (cert, key, trusted_ca, key_password), verify_server_cert = web_util.get_tls_options(
|
||||
+ "verifier", is_client=True, logger=logger
|
||||
+ )
|
||||
+
|
||||
+ # Generate the TLS context using the obtained options
|
||||
+ tls_context = web_util.generate_tls_context(
|
||||
+ cert, key, trusted_ca, key_password, is_client=True, logger=logger
|
||||
+ )
|
||||
+
|
||||
+ logger.info("Sending revocation event via webhook to %s ...", url)
|
||||
+ for i in range(max_retries):
|
||||
+ next_retry = retry.retry_time(exponential_backoff, interval, i, logger)
|
||||
+
|
||||
+ with RequestsClient(
|
||||
+ url,
|
||||
+ verify_server_cert,
|
||||
+ tls_context,
|
||||
+ ) as client:
|
||||
+ try:
|
||||
+ res = client.post("", json=tosend, timeout=5)
|
||||
+ except requests.exceptions.SSLError as ssl_error:
|
||||
+ if "TLSV1_ALERT_UNKNOWN_CA" in str(ssl_error):
|
||||
+ logger.warning(
|
||||
+ "Keylime does not recognize certificate from peer. Check if verifier 'trusted_server_ca' is configured correctly"
|
||||
+ )
|
||||
+
|
||||
+ raise ssl_error from ssl_error
|
||||
+ except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
|
||||
+ # During shutdown, only suppress errors on the final attempt after all retries exhausted
|
||||
+ if is_shutdown_mode and i == max_retries - 1:
|
||||
+ logger.warning(
|
||||
+ "Final attempt to send revocation notification failed during shutdown: %s", e
|
||||
+ )
|
||||
+ return
|
||||
+ # Otherwise, let the retry logic handle it
|
||||
+ raise e
|
||||
+
|
||||
+ if res and res.status_code in [200, 202]:
|
||||
+ if is_shutdown_mode:
|
||||
+ logger.info("Successfully sent revocation notification during shutdown")
|
||||
+ break
|
||||
+
|
||||
+ logger.debug(
|
||||
+ "Unable to publish revocation message %d times via webhook, "
|
||||
+ "trying again in %d seconds. "
|
||||
+ "Server returned status code: %s",
|
||||
+ i + 1,
|
||||
+ next_retry,
|
||||
+ res.status_code,
|
||||
+ )
|
||||
+
|
||||
+ # During shutdown, use shorter retry intervals to complete faster
|
||||
+ if is_shutdown_mode:
|
||||
+ next_retry = min(next_retry, 2.0) # Cap retry interval during shutdown
|
||||
+
|
||||
+ time.sleep(next_retry)
|
||||
+
|
||||
+ except Exception as e:
|
||||
+ # Only suppress errors during final shutdown phase and log appropriately
|
||||
+ if is_shutdown_mode:
|
||||
+ logger.warning("Failed to send revocation notification during shutdown: %s", e)
|
||||
+ else:
|
||||
+ logger.error("Error in webhook worker: %s", e)
|
||||
+ finally:
|
||||
+ # Remove this worker from the active set
|
||||
+ current_thread = threading.current_thread()
|
||||
+ with self._workers_lock:
|
||||
+ self._workers.discard(current_thread)
|
||||
+
|
||||
+ w = functools.partial(worker_webhook, tosend, url)
|
||||
+ t = threading.Thread(target=w, daemon=True)
|
||||
+
|
||||
+ # Add this worker to the active set
|
||||
+ with self._workers_lock:
|
||||
+ self._workers.add(t)
|
||||
+
|
||||
+ t.start()
|
||||
+
|
||||
+ def shutdown_workers(self) -> None:
|
||||
+ """Signal webhook workers to shut down gracefully and wait for them to complete.
|
||||
+
|
||||
+ This gives workers time to complete their critical revocation notifications
|
||||
+ before the service shuts down completely.
|
||||
+ """
|
||||
+ logger.info("Shutting down webhook workers gracefully...")
|
||||
+ self._shutdown_event.set()
|
||||
+
|
||||
+ # Give workers generous time to complete critical revocation notifications
|
||||
+ timeout = 30.0 # Increased timeout for critical security notifications
|
||||
+ end_time = time.time() + timeout
|
||||
+
|
||||
+ with self._workers_lock:
|
||||
+ workers_to_wait = list(self._workers)
|
||||
+
|
||||
+ if workers_to_wait:
|
||||
+ logger.info("Waiting for %d webhook workers to complete revocation notifications...", len(workers_to_wait))
|
||||
+
|
||||
+ for worker in workers_to_wait:
|
||||
+ remaining_time = max(0, end_time - time.time())
|
||||
+ if remaining_time > 0:
|
||||
+ logger.debug(
|
||||
+ "Waiting for webhook worker %s to complete (timeout: %.1f seconds)", worker.name, remaining_time
|
||||
+ )
|
||||
+ worker.join(timeout=remaining_time)
|
||||
+ if worker.is_alive():
|
||||
+ logger.warning("Webhook worker %s did not complete within timeout", worker.name)
|
||||
+ else:
|
||||
+ logger.warning("Timeout exceeded while waiting for webhook workers")
|
||||
+ break
|
||||
+
|
||||
+ # Clean up completed workers
|
||||
+ with self._workers_lock:
|
||||
+ self._workers.clear()
|
||||
+
|
||||
+ logger.info("Webhook workers shutdown complete")
|
||||
+
|
||||
+
|
||||
+def _get_webhook_manager() -> WebhookNotificationManager:
|
||||
+ """Get the global webhook manager instance, creating it if needed."""
|
||||
+ global _webhook_manager
|
||||
+ if _webhook_manager is None:
|
||||
+ _webhook_manager = WebhookNotificationManager()
|
||||
+ return _webhook_manager
|
||||
+
|
||||
|
||||
# return the revocation notification methods for cloud verifier
|
||||
def get_notifiers() -> Set[str]:
|
||||
@@ -83,6 +251,12 @@ def stop_broker() -> None:
|
||||
broker_proc.kill() # pylint: disable=E1101
|
||||
|
||||
|
||||
+def shutdown_webhook_workers() -> None:
|
||||
+ """Convenience function to shutdown webhook workers using the global manager."""
|
||||
+ manager = _get_webhook_manager()
|
||||
+ manager.shutdown_workers()
|
||||
+
|
||||
+
|
||||
def notify(tosend: Dict[str, Any]) -> None:
|
||||
assert "zeromq" in get_notifiers()
|
||||
try:
|
||||
@@ -127,68 +301,9 @@ def notify(tosend: Dict[str, Any]) -> None:
|
||||
|
||||
|
||||
def notify_webhook(tosend: Dict[str, Any]) -> None:
|
||||
- url = config.get("verifier", "webhook_url", section="revocations", fallback="")
|
||||
- # Check if a url was specified
|
||||
- if url == "":
|
||||
- return
|
||||
-
|
||||
- # Similarly to notify(), let's convert `tosend' to str to prevent
|
||||
- # possible issues with json handling by python-requests.
|
||||
- tosend = json.bytes_to_str(tosend)
|
||||
-
|
||||
- def worker_webhook(tosend: Dict[str, Any], url: str) -> None:
|
||||
- interval = config.getfloat("verifier", "retry_interval")
|
||||
- exponential_backoff = config.getboolean("verifier", "exponential_backoff")
|
||||
-
|
||||
- max_retries = config.getint("verifier", "max_retries")
|
||||
- if max_retries <= 0:
|
||||
- logger.info("Invalid value found in 'max_retries' option for verifier, using default value")
|
||||
- max_retries = 5
|
||||
-
|
||||
- # Get TLS options from the configuration
|
||||
- (cert, key, trusted_ca, key_password), verify_server_cert = web_util.get_tls_options(
|
||||
- "verifier", is_client=True, logger=logger
|
||||
- )
|
||||
-
|
||||
- # Generate the TLS context using the obtained options
|
||||
- tls_context = web_util.generate_tls_context(cert, key, trusted_ca, key_password, is_client=True, logger=logger)
|
||||
-
|
||||
- logger.info("Sending revocation event via webhook to %s ...", url)
|
||||
- for i in range(max_retries):
|
||||
- next_retry = retry.retry_time(exponential_backoff, interval, i, logger)
|
||||
-
|
||||
- with RequestsClient(
|
||||
- url,
|
||||
- verify_server_cert,
|
||||
- tls_context,
|
||||
- ) as client:
|
||||
- try:
|
||||
- res = client.post("", json=tosend, timeout=5)
|
||||
- except requests.exceptions.SSLError as ssl_error:
|
||||
- if "TLSV1_ALERT_UNKNOWN_CA" in str(ssl_error):
|
||||
- logger.warning(
|
||||
- "Keylime does not recognize certificate from peer. Check if verifier 'trusted_server_ca' is configured correctly"
|
||||
- )
|
||||
-
|
||||
- raise ssl_error from ssl_error
|
||||
-
|
||||
- if res and res.status_code in [200, 202]:
|
||||
- break
|
||||
-
|
||||
- logger.debug(
|
||||
- "Unable to publish revocation message %d times via webhook, "
|
||||
- "trying again in %d seconds. "
|
||||
- "Server returned status code: %s",
|
||||
- i + 1,
|
||||
- next_retry,
|
||||
- res.status_code,
|
||||
- )
|
||||
-
|
||||
- time.sleep(next_retry)
|
||||
-
|
||||
- w = functools.partial(worker_webhook, tosend, url)
|
||||
- t = threading.Thread(target=w, daemon=True)
|
||||
- t.start()
|
||||
+ """Send webhook notification using the global webhook manager."""
|
||||
+ manager = _get_webhook_manager()
|
||||
+ manager.notify_webhook(tosend)
|
||||
|
||||
|
||||
cert_key = None
|
||||
--
|
||||
2.47.3
|
||||
|
||||
@ -0,0 +1,45 @@
|
||||
From 5fb4484b07a7ba3fcdf451bf816b5f07a40d6d97 Mon Sep 17 00:00:00 2001
|
||||
From: Sergio Correia <scorreia@redhat.com>
|
||||
Date: Wed, 4 Jun 2025 19:52:37 +0100
|
||||
Subject: [PATCH 12/13] requests_client: close the session at the end of the
|
||||
resource manager
|
||||
|
||||
We had an issue in the past in which the webhook worker would not
|
||||
properly close the opened session. This was fixed in #1456 (Close
|
||||
session in worker_webhook function).
|
||||
|
||||
At some later point, in #1566 (revocation_notifier: Take into account CA
|
||||
certificates added via configuration), some refactoring around the
|
||||
webhook_worker() in revocation_notifier happened and it started using
|
||||
the RequestsClient resource manager.
|
||||
|
||||
However, the RequestsClient does not close the session at its end, which
|
||||
in turns makes that the old issue of not closing properly the session
|
||||
in the webhook_worker() returned.
|
||||
|
||||
We now issue a session.close() at the end of the RequestsClient.
|
||||
|
||||
Signed-off-by: Sergio Correia <scorreia@redhat.com>
|
||||
---
|
||||
keylime/requests_client.py | 5 ++++-
|
||||
1 file changed, 4 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/keylime/requests_client.py b/keylime/requests_client.py
|
||||
index 16615f7..b7da484 100644
|
||||
--- a/keylime/requests_client.py
|
||||
+++ b/keylime/requests_client.py
|
||||
@@ -40,7 +40,10 @@ class RequestsClient:
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
|
||||
- pass
|
||||
+ try:
|
||||
+ self.session.close()
|
||||
+ except Exception:
|
||||
+ pass
|
||||
|
||||
def request(self, method: str, url: str, **kwargs: Any) -> requests.Response:
|
||||
return self.session.request(method, self.base_url + url, **kwargs)
|
||||
--
|
||||
2.47.3
|
||||
|
||||
12
keylime.spec
12
keylime.spec
@ -9,7 +9,7 @@
|
||||
|
||||
Name: keylime
|
||||
Version: 7.12.1
|
||||
Release: 10%{?dist}
|
||||
Release: 11%{?dist}
|
||||
Summary: Open source TPM software for Bootstrapping and Maintaining Trust
|
||||
|
||||
URL: https://github.com/keylime/keylime
|
||||
@ -34,6 +34,12 @@ Patch: 0007-fix_db_connection_leaks.patch
|
||||
Patch: 0008-mb-support-EV_EFI_HANDOFF_TABLES-events-on-PCR1.patch
|
||||
Patch: 0009-mb-support-vendor_db-as-logged-by-newer-shim-version.patch
|
||||
|
||||
# Backported from https://github.com/keylime/keylime/pull/1784
|
||||
# and https://github.com/keylime/keylime/pull/1785.
|
||||
Patch: 0010-verifier-Gracefully-shutdown-on-signal.patch
|
||||
Patch: 0011-revocations-Try-to-send-notifications-on-shutdown.patch
|
||||
Patch: 0012-requests_client-close-the-session-at-the-end-of-the-.patch
|
||||
|
||||
License: ASL 2.0 and MIT
|
||||
|
||||
BuildRequires: git-core
|
||||
@ -427,6 +433,10 @@ fi
|
||||
%license LICENSE
|
||||
|
||||
%changelog
|
||||
* Mon Aug 18 2025 Sergio Correia <scorreia@redhat.com> - 7.12.1-11
|
||||
- Fix for revocation notifier not closing TLS session correctly
|
||||
Resolves: RHEL-109656
|
||||
|
||||
* Wed Aug 13 2025 Sergio Correia <scorreia@redhat.com> - 7.12.1-10
|
||||
- Support vendor_db: follow-up fix
|
||||
Related: RHEL-80455
|
||||
|
||||
Loading…
Reference in New Issue
Block a user