keylime/SOURCES/0011-revocations-Try-to-send-notifications-on-shutdown.patch

309 lines
13 KiB
Diff

From af9ac50f5acf1a7d4ad285956b60e60c3c4416b7 Mon Sep 17 00:00:00 2001
From: Anderson Toshiyuki Sasaki <ansasaki@redhat.com>
Date: Wed, 23 Jul 2025 15:39:49 +0200
Subject: [PATCH 11/13] revocations: Try to send notifications on shutdown

During verifier shutdown, try to send any pending revocation
notifications in a best-effort manner. In the future, pending revocation
notifications should be persisted so that they can be processed during
the next startup.

Assisted-by: Claude 4 Sonnet
Signed-off-by: Anderson Toshiyuki Sasaki <ansasaki@redhat.com>
---
keylime/cloud_verifier_tornado.py | 7 +
keylime/revocation_notifier.py | 239 ++++++++++++++++++++++--------
2 files changed, 184 insertions(+), 62 deletions(-)
diff --git a/keylime/cloud_verifier_tornado.py b/keylime/cloud_verifier_tornado.py
index 7065661..89aa703 100644
--- a/keylime/cloud_verifier_tornado.py
+++ b/keylime/cloud_verifier_tornado.py
@@ -2109,6 +2109,10 @@ def main() -> None:
# Stop server to not accept new incoming connections
server.stop()
+ # Gracefully shutdown webhook workers to prevent connection errors
+ if "webhook" in revocation_notifier.get_notifiers():
+ revocation_notifier.shutdown_webhook_workers()
+
# Wait for all connections to be closed and then stop ioloop
async def stop() -> None:
await server.close_all_connections()
@@ -2136,6 +2140,9 @@ def main() -> None:
def sig_handler(*_: Any) -> None:
if run_revocation_notifier:
revocation_notifier.stop_broker()
+ # Gracefully shutdown webhook workers to prevent connection errors
+ if "webhook" in revocation_notifier.get_notifiers():
+ revocation_notifier.shutdown_webhook_workers()
for p in processes:
p.join()
# Do not call sys.exit(0) here as it interferes with multiprocessing cleanup
diff --git a/keylime/revocation_notifier.py b/keylime/revocation_notifier.py
index 5a7cc4b..c154028 100644
--- a/keylime/revocation_notifier.py
+++ b/keylime/revocation_notifier.py
@@ -18,6 +18,174 @@ broker_proc: Optional[Process] = None
_SOCKET_PATH = "/var/run/keylime/keylime.verifier.ipc"
+# Global webhook manager instance (initialized when needed)
+_webhook_manager: Optional["WebhookNotificationManager"] = None
+
+
+class WebhookNotificationManager:
+ """Manages webhook worker threads and graceful shutdown for revocation notifications."""
+
+ def __init__(self) -> None:
+ self._shutdown_event = threading.Event()
+ self._workers: Set[threading.Thread] = set()
+ self._workers_lock = threading.Lock()
+
+ def notify_webhook(self, tosend: Dict[str, Any]) -> None:
+ """Send webhook notification with worker thread management."""
+ url = config.get("verifier", "webhook_url", section="revocations", fallback="")
+ # Check if a url was specified
+ if url == "":
+ return
+
+ # Similarly to notify(), let's convert `tosend' to str to prevent
+ # possible issues with json handling by python-requests.
+ tosend = json.bytes_to_str(tosend)
+
+ def worker_webhook(tosend: Dict[str, Any], url: str) -> None:
+ is_shutdown_mode = False
+ try:
+ interval = config.getfloat("verifier", "retry_interval")
+ exponential_backoff = config.getboolean("verifier", "exponential_backoff")
+
+ max_retries = config.getint("verifier", "max_retries")
+ if max_retries <= 0:
+ logger.info("Invalid value found in 'max_retries' option for verifier, using default value")
+ max_retries = 5
+
+ # During shutdown, use fewer retries but still make best effort
+ if self._shutdown_event.is_set():
+ is_shutdown_mode = True
+ max_retries = min(max_retries, 3) # Reduce retries during shutdown but still try
+ logger.info(
+ "Shutdown mode: attempting to send critical revocation notification with %d retries",
+ max_retries,
+ )
+
+ # Get TLS options from the configuration
+ (cert, key, trusted_ca, key_password), verify_server_cert = web_util.get_tls_options(
+ "verifier", is_client=True, logger=logger
+ )
+
+ # Generate the TLS context using the obtained options
+ tls_context = web_util.generate_tls_context(
+ cert, key, trusted_ca, key_password, is_client=True, logger=logger
+ )
+
+ logger.info("Sending revocation event via webhook to %s ...", url)
+ for i in range(max_retries):
+ next_retry = retry.retry_time(exponential_backoff, interval, i, logger)
+
+ with RequestsClient(
+ url,
+ verify_server_cert,
+ tls_context,
+ ) as client:
+ try:
+ res = client.post("", json=tosend, timeout=5)
+ except requests.exceptions.SSLError as ssl_error:
+ if "TLSV1_ALERT_UNKNOWN_CA" in str(ssl_error):
+ logger.warning(
+ "Keylime does not recognize certificate from peer. Check if verifier 'trusted_server_ca' is configured correctly"
+ )
+
+ raise ssl_error from ssl_error
+ except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
+ # During shutdown, only suppress errors on the final attempt after all retries exhausted
+ if is_shutdown_mode and i == max_retries - 1:
+ logger.warning(
+ "Final attempt to send revocation notification failed during shutdown: %s", e
+ )
+ return
+ # Otherwise, let the retry logic handle it
+ raise e
+
+ if res and res.status_code in [200, 202]:
+ if is_shutdown_mode:
+ logger.info("Successfully sent revocation notification during shutdown")
+ break
+
+ logger.debug(
+ "Unable to publish revocation message %d times via webhook, "
+ "trying again in %d seconds. "
+ "Server returned status code: %s",
+ i + 1,
+ next_retry,
+ res.status_code,
+ )
+
+ # During shutdown, use shorter retry intervals to complete faster
+ if is_shutdown_mode:
+ next_retry = min(next_retry, 2.0) # Cap retry interval during shutdown
+
+ time.sleep(next_retry)
+
+ except Exception as e:
+ # Only suppress errors during final shutdown phase and log appropriately
+ if is_shutdown_mode:
+ logger.warning("Failed to send revocation notification during shutdown: %s", e)
+ else:
+ logger.error("Error in webhook worker: %s", e)
+ finally:
+ # Remove this worker from the active set
+ current_thread = threading.current_thread()
+ with self._workers_lock:
+ self._workers.discard(current_thread)
+
+ w = functools.partial(worker_webhook, tosend, url)
+ t = threading.Thread(target=w, daemon=True)
+
+ # Add this worker to the active set
+ with self._workers_lock:
+ self._workers.add(t)
+
+ t.start()
+
+ def shutdown_workers(self) -> None:
+ """Signal webhook workers to shut down gracefully and wait for them to complete.
+
+ This gives workers time to complete their critical revocation notifications
+ before the service shuts down completely.
+ """
+ logger.info("Shutting down webhook workers gracefully...")
+ self._shutdown_event.set()
+
+ # Give workers generous time to complete critical revocation notifications
+ timeout = 30.0 # Increased timeout for critical security notifications
+ end_time = time.time() + timeout
+
+ with self._workers_lock:
+ workers_to_wait = list(self._workers)
+
+ if workers_to_wait:
+ logger.info("Waiting for %d webhook workers to complete revocation notifications...", len(workers_to_wait))
+
+ for worker in workers_to_wait:
+ remaining_time = max(0, end_time - time.time())
+ if remaining_time > 0:
+ logger.debug(
+ "Waiting for webhook worker %s to complete (timeout: %.1f seconds)", worker.name, remaining_time
+ )
+ worker.join(timeout=remaining_time)
+ if worker.is_alive():
+ logger.warning("Webhook worker %s did not complete within timeout", worker.name)
+ else:
+ logger.warning("Timeout exceeded while waiting for webhook workers")
+ break
+
+ # Clean up completed workers
+ with self._workers_lock:
+ self._workers.clear()
+
+ logger.info("Webhook workers shutdown complete")
+
+
+def _get_webhook_manager() -> WebhookNotificationManager:
+ """Get the global webhook manager instance, creating it if needed."""
+ global _webhook_manager
+ if _webhook_manager is None:
+ _webhook_manager = WebhookNotificationManager()
+ return _webhook_manager
+
# return the revocation notification methods for cloud verifier
def get_notifiers() -> Set[str]:
@@ -83,6 +251,12 @@ def stop_broker() -> None:
broker_proc.kill() # pylint: disable=E1101
+def shutdown_webhook_workers() -> None:
+ """Convenience function to shutdown webhook workers using the global manager."""
+ manager = _get_webhook_manager()
+ manager.shutdown_workers()
+
+
def notify(tosend: Dict[str, Any]) -> None:
assert "zeromq" in get_notifiers()
try:
@@ -127,68 +301,9 @@ def notify(tosend: Dict[str, Any]) -> None:
def notify_webhook(tosend: Dict[str, Any]) -> None:
- url = config.get("verifier", "webhook_url", section="revocations", fallback="")
- # Check if a url was specified
- if url == "":
- return
-
- # Similarly to notify(), let's convert `tosend' to str to prevent
- # possible issues with json handling by python-requests.
- tosend = json.bytes_to_str(tosend)
-
- def worker_webhook(tosend: Dict[str, Any], url: str) -> None:
- interval = config.getfloat("verifier", "retry_interval")
- exponential_backoff = config.getboolean("verifier", "exponential_backoff")
-
- max_retries = config.getint("verifier", "max_retries")
- if max_retries <= 0:
- logger.info("Invalid value found in 'max_retries' option for verifier, using default value")
- max_retries = 5
-
- # Get TLS options from the configuration
- (cert, key, trusted_ca, key_password), verify_server_cert = web_util.get_tls_options(
- "verifier", is_client=True, logger=logger
- )
-
- # Generate the TLS context using the obtained options
- tls_context = web_util.generate_tls_context(cert, key, trusted_ca, key_password, is_client=True, logger=logger)
-
- logger.info("Sending revocation event via webhook to %s ...", url)
- for i in range(max_retries):
- next_retry = retry.retry_time(exponential_backoff, interval, i, logger)
-
- with RequestsClient(
- url,
- verify_server_cert,
- tls_context,
- ) as client:
- try:
- res = client.post("", json=tosend, timeout=5)
- except requests.exceptions.SSLError as ssl_error:
- if "TLSV1_ALERT_UNKNOWN_CA" in str(ssl_error):
- logger.warning(
- "Keylime does not recognize certificate from peer. Check if verifier 'trusted_server_ca' is configured correctly"
- )
-
- raise ssl_error from ssl_error
-
- if res and res.status_code in [200, 202]:
- break
-
- logger.debug(
- "Unable to publish revocation message %d times via webhook, "
- "trying again in %d seconds. "
- "Server returned status code: %s",
- i + 1,
- next_retry,
- res.status_code,
- )
-
- time.sleep(next_retry)
-
- w = functools.partial(worker_webhook, tosend, url)
- t = threading.Thread(target=w, daemon=True)
- t.start()
+ """Send webhook notification using the global webhook manager."""
+ manager = _get_webhook_manager()
+ manager.notify_webhook(tosend)
cert_key = None
--
2.47.3