From d7b9d5aca98ae8623258c1ae3fc643e7b3bb6fcf Mon Sep 17 00:00:00 2001 From: Sergio Arroutbi Date: Tue, 14 Apr 2026 11:00:29 +0200 Subject: [PATCH] Fix verifier race condition on agent delete Resolves: RHEL-153121 Backport upstream PR https://github.com/keylime/keylime/pull/1874 to fix a race condition in the pull-mode DELETE handler where an agent could be deleted from the database while an in-flight attestation cycle was still running, causing 'tenant -c update' to intermittently fail with "Agent was not deleted from Verifier after 5 tries". Signed-off-by: Sergio Arroutbi --- ...ifier-race-condition-on-agent-delete.patch | 449 ++++++++++++++++++ keylime.spec | 6 +- 2 files changed, 454 insertions(+), 1 deletion(-) create mode 100644 0013-fix-verifier-race-condition-on-agent-delete.patch diff --git a/0013-fix-verifier-race-condition-on-agent-delete.patch b/0013-fix-verifier-race-condition-on-agent-delete.patch new file mode 100644 index 0000000..e6d6d33 --- /dev/null +++ b/0013-fix-verifier-race-condition-on-agent-delete.patch @@ -0,0 +1,449 @@ +From 739a8a97357f1b52c3944706479740fb7b71fb33 Mon Sep 17 00:00:00 2001 +From: Anderson Toshiyuki Sasaki +Date: Thu, 26 Mar 2026 11:28:52 +0100 +Subject: [PATCH 13/13] fix: verifier race condition on agent delete + +Fix a race condition in the pull-mode DELETE handler where an agent +could be deleted from the database while an in-flight attestation cycle +was still running, causing 'tenant -c update' to intermittently fail +with "Agent was not deleted from Verifier after 5 tries". + +The race had two interacting causes: + +1. TERMINATED was in the immediate-deletion states list. When a second + DELETE arrived and found the agent in TERMINATED state, it deleted + immediately (200), even though an invoke_get_quote() coroutine from + the first DELETE's cycle was still in-flight. The orphaned coroutine + then crashed in store_attestation_state() with AssertionError. + +2. 
The DELETE handler did not cancel the scheduled IOLoop poll timer, so + new attestation cycles could start even after deletion was requested. + +Changes: +- Add _pending_events registry and _register_pending_event / + _cancel_pending_event helpers to track IOLoop timer handles (from + upstream commit 59ac386). +- Replace assert statements in store_attestation_state() and the DELETE + handler with proper error handling (graceful log+return and 404). +- Remove TERMINATED from the immediate-deletion states list so that a + second DELETE returns 202 instead of deleting while in-flight work + exists. +- Cancel the pending poll timer via _pending_events on DELETE to prevent + new attestation cycles from starting. +- Fix the tenant's do_cvdelete() to handle 200/202/404 response codes + properly and fix a typo (reponse_json -> response_json). +- Cancel the pending poll timer in the PUT "stop" handler to prevent + new attestation cycles after agent stop. +- Suppress a mypy false positive in keylime/json.py. +- Add unit tests for pending-event management and store_attestation_state + graceful handling when agent is deleted. + +Co-Authored-By: Claude Opus 4.6 +Signed-off-by: Anderson Toshiyuki Sasaki +--- + keylime/cloud_verifier_tornado.py | 180 ++++++++++++++++++++++++++++++------ + keylime/json.py | 2 +- + keylime/tenant.py | 26 +++--- + test/test_cloud_verifier_tornado.py | 114 +++++++++++++++++++++++ + 4 files changed, 282 insertions(+), 40 deletions(-) + +diff --git a/keylime/cloud_verifier_tornado.py b/keylime/cloud_verifier_tornado.py +index 17bec75..75f117b 100644 +--- a/keylime/cloud_verifier_tornado.py ++++ b/keylime/cloud_verifier_tornado.py +@@ -171,6 +171,35 @@ exclude_db: Dict[str, Any] = { + "ssl_context": None, + } + ++# Registry of agent_id -> IOLoop timeout handle for all scheduled pending ++# events (quote polls, retries). Used to cancel them all on shutdown. 
++_pending_events: Dict[str, object] = {} ++ ++ ++def _register_pending_event(agent: Dict[str, Any], handle: object) -> None: ++ """Track a pending IOLoop timeout in both the agent dict and the global registry. ++ ++ The agent dict field ``pending_event`` is the per-agent reference used during ++ normal operation (e.g. cancelling on state change). The module-level ++ ``_pending_events`` dict mirrors it so that *all* handles can be ++ bulk-cancelled on shutdown without iterating over every agent. ++ """ ++ agent["pending_event"] = handle ++ _pending_events[agent["agent_id"]] = handle ++ ++ ++def _cancel_pending_event(agent: Dict[str, Any]) -> None: ++ """Cancel and unregister the pending IOLoop timeout for *agent*, if any.""" ++ handle = agent.get("pending_event") ++ if handle is None: ++ return ++ agent["pending_event"] = None ++ _pending_events.pop(agent["agent_id"], None) ++ try: ++ tornado.ioloop.IOLoop.current().remove_timeout(handle) ++ except Exception as e: ++ logger.debug("Could not remove pending event for agent %s: %s", agent["agent_id"], e) ++ + + def _from_db_obj(agent_db_obj: VerfierMain) -> Dict[str, Any]: + fields = [ +@@ -286,7 +315,12 @@ def store_attestation_state(agentAttestState: AgentAttestState) -> None: + try: + with session_context() as session: + update_agent = session.get(VerfierMain, agentAttestState.get_agent_id()) # type: ignore[attr-defined] +- assert update_agent ++ if update_agent is None: ++ logger.warning( ++ "Agent %s no longer in database, skipping attestation state storage", ++ agent_id, ++ ) ++ return + update_agent.boottime = agentAttestState.get_boottime() # pyright: ignore + update_agent.next_ima_ml_entry = agentAttestState.get_next_ima_ml_entry() # pyright: ignore + ima_pcrs_dict = agentAttestState.get_ima_pcrs() +@@ -605,34 +639,115 @@ class AgentsHandler(BaseHandler): + except SQLAlchemyError as e: + logger.error("SQLAlchemy Error deleting agent in push mode: %s", e) + web_util.echo_json_response(self.req_handler, 500, 
"Internal Server Error") +- else: +- # Pull mode: Use operational_state to determine deletion behavior +- op_state = agent.operational_state +- if op_state in ( +- states.SAVED, +- states.FAILED, +- states.TERMINATED, +- states.TENANT_FAILED, +- states.INVALID_QUOTE, +- ): +- try: +- verifier_db_delete_agent(session, agent_id) +- web_util.echo_json_response(self.req_handler, 200, "Success") +- logger.info("DELETE (pull mode) returning 200 response for agent id: %s", agent_id) +- except SQLAlchemyError as e: +- logger.error("SQLAlchemy Error deleting agent in pull mode: %s", e) +- web_util.echo_json_response(self.req_handler, 500, "Internal Server Error") ++ return ++ ++ # Pull mode: Use operational_state to determine deletion behavior. ++ # ++ # Terminal states with no in-flight work can be deleted ++ # immediately (200). Note that TERMINATED is intentionally ++ # excluded: it means a previous DELETE was accepted but the ++ # attestation cycle has not yet finished. Deleting immediately ++ # while in-flight work exists causes store_attestation_state() ++ # to fail when it tries to persist results for the now-gone ++ # agent. ++ op_state = agent.operational_state ++ if op_state in ( ++ states.SAVED, ++ states.FAILED, ++ states.TENANT_FAILED, ++ states.INVALID_QUOTE, ++ ): ++ # Agent is in a terminal state with no in-flight work — delete immediately. ++ # Cancel any local pending poll timer first (same-worker ++ # defensive cleanup). This matters when a cross-worker ++ # PUT /stop sets TENANT_FAILED in the DB but cannot cancel ++ # the timer in this worker's _pending_events. 
++ pending_handle = _pending_events.pop(agent_id, None) ++ if pending_handle is not None: ++ tornado.ioloop.IOLoop.current().remove_timeout(pending_handle) ++ try: ++ verifier_db_delete_agent(session, agent_id) ++ web_util.echo_json_response(self.req_handler, 200, "Success") ++ logger.info("DELETE (pull mode) returning 200 response for agent id: %s", agent_id) ++ except SQLAlchemyError as e: ++ logger.error("SQLAlchemy Error deleting agent in pull mode: %s", e) ++ web_util.echo_json_response(self.req_handler, 500, "Internal Server Error") ++ return ++ ++ # Agent is in an active state or already TERMINATED from a ++ # previous DELETE. ++ # ++ # Multi-worker note: _pending_events is process-local. Each ++ # agent's attestation cycle runs in the worker process it was ++ # assigned to at startup (round-robin), but this DELETE ++ # request may arrive at any worker. ++ # ++ # - Same worker: pending_handle is accurate — if found, the ++ # agent was idle (timer pending) and we can delete ++ # immediately since no coroutine is in-flight. ++ # - Different worker: pending_handle is always None, so we ++ # fall through to the 202/TERMINATED path. The managing ++ # worker's timer fires normally, process_agent() detects ++ # TERMINATED, and completes the deletion. ++ # ++ # Important: when the agent is already TERMINATED, do NOT ++ # cancel the pending poll timer — it is the only mechanism ++ # that will trigger process_agent() to detect TERMINATED and ++ # complete the deletion. ++ if op_state == states.TERMINATED: # pyright: ignore ++ # Agent is already TERMINATED from a previous DELETE. ++ # Leave the pending poll timer alone so process_agent() ++ # can detect TERMINATED and complete the deletion. ++ web_util.echo_json_response(self.req_handler, 202, "Accepted") ++ logger.info( ++ "DELETE (pull mode) returning 202 response for agent id: %s " ++ "(already TERMINATED, waiting for deletion to complete)", ++ agent_id, ++ ) ++ return ++ ++ # First DELETE for this agent. 
Try to cancel the pending ++ # poll timer (same-worker optimization). ++ # ++ # Pop the handle first but do NOT cancel the timer yet — ++ # if the DB operation fails we restore the handle so the ++ # attestation cycle can continue. ++ pending_handle = _pending_events.pop(agent_id, None) ++ try: ++ if pending_handle is not None: ++ # Same-worker optimization: the agent was idle ++ # (waiting for the next poll timer) — no in-flight ++ # coroutine will come along to detect TERMINATED and ++ # complete the deletion, so delete immediately. ++ verifier_db_delete_agent(session, agent_id) ++ # DB succeeded — now safe to cancel the timer. ++ tornado.ioloop.IOLoop.current().remove_timeout(pending_handle) ++ web_util.echo_json_response(self.req_handler, 200, "Success") ++ logger.info("DELETE (pull mode) returning 200 response for agent id: %s", agent_id) + else: +- try: +- update_agent = session.get(VerfierMain, agent_id) # type: ignore[attr-defined] +- assert update_agent +- update_agent.operational_state = states.TERMINATED # pyright: ignore +- session.add(update_agent) +- # session.commit() is automatically called by context manager +- web_util.echo_json_response(self.req_handler, 202, "Accepted") +- logger.info("DELETE (pull mode) returning 202 response for agent id: %s", agent_id) +- except SQLAlchemyError as e: +- logger.error("SQLAlchemy Error for agent ID %s: %s", agent_id, e) ++ # Either an invoke_get_quote() / invoke_provide_v() ++ # coroutine is in-flight (no pending_handle — the ++ # timer already fired), or this DELETE arrived at a ++ # different worker process. Mark as TERMINATED and ++ # let process_agent() perform the actual deletion ++ # when the in-flight work finishes (or the timer ++ # fires in the managing worker). 
++ update_agent = session.get(VerfierMain, agent_id) # type: ignore[attr-defined] ++ if update_agent is None: ++ web_util.echo_json_response(self.req_handler, 404, "agent id not found") ++ return ++ update_agent.operational_state = states.TERMINATED # pyright: ignore ++ session.add(update_agent) ++ web_util.echo_json_response(self.req_handler, 202, "Accepted") ++ logger.info("DELETE (pull mode) returning 202 response for agent id: %s", agent_id) ++ except SQLAlchemyError as e: ++ logger.error("SQLAlchemy Error for agent ID %s: %s", agent_id, e) ++ if pending_handle is not None: ++ # Restore the timer so the attestation cycle can ++ # continue — the DB operation failed so the agent ++ # is still there. ++ _pending_events[agent_id] = pending_handle ++ web_util.echo_json_response(self.req_handler, 500, "Internal server error") + + def post(self) -> None: + """This method handles the POST requests to add agents to the Cloud Verifier. +@@ -1045,6 +1160,15 @@ class AgentsHandler(BaseHandler): + # session.commit() is automatically called by context manager + except SQLAlchemyError as e: + logger.error("SQLAlchemy Error: %s", e) ++ web_util.echo_json_response(self.req_handler, 500, "Internal server error") ++ return ++ ++ # DB succeeded — now safe to cancel the pending poll ++ # timer to prevent new attestation cycles. 
++ if agent_id is not None: ++ pending_handle = _pending_events.pop(agent_id, None) ++ if pending_handle is not None: ++ tornado.ioloop.IOLoop.current().remove_timeout(pending_handle) + + web_util.echo_json_response(self.req_handler, 200, "Success") + logger.info("PUT returning 200 response for agent id: %s", agent_id) +diff --git a/keylime/json.py b/keylime/json.py +index 82292a1..4c4b897 100644 +--- a/keylime/json.py ++++ b/keylime/json.py +@@ -24,7 +24,7 @@ def bytes_to_str(data: Any) -> Any: + for _k, _v in data.items(): + data[_k] = bytes_to_str(_v) + elif isinstance(data, tuple(_list_types)): +- _l = list(data) ++ _l = list(data) # type: ignore[call-overload] + for _k, _v in enumerate(_l): + _l[_k] = bytes_to_str(_v) + data = _l +diff --git a/keylime/tenant.py b/keylime/tenant.py +index 0cdeada..4cb3698 100644 +--- a/keylime/tenant.py ++++ b/keylime/tenant.py +@@ -1074,22 +1074,17 @@ class Tenant: + # keylime_logging.log_http_response(logger, logging.ERROR, response_json) + raise UserError(f"{self.verifier_fid_str} timed out while deleting {self.agent_fid_str}.") + +- if response_json["code"] == 202: ++ if response_json["code"] == 200: ++ logger.info("Agent %s deleted from the CV", self.agent_uuid) ++ elif response_json["code"] == 202: + numtries = 0 + deleted = False + + while not deleted: +- reponse_json = self.do_cvstatus(not_found_fail=False) +- if reponse_json["code"] != 404: ++ response_json = self.do_cvstatus(not_found_fail=False) ++ if response_json["code"] != 404: + numtries += 1 + if numtries >= self.maxr: +- # EVALUATE DELETION +- # logger.error( +- # "%s was not deleted from %s after %d tries", +- # self.agent_fid_str, +- # self.verifier_fid_str, +- # numtries, +- # ) + raise UserError( + f"{self.agent_fid_str} was not deleted from {self.verifier_fid_str} after {numtries} tries" + ) +@@ -1114,8 +1109,17 @@ class Tenant: + self.verifier_fid_str, + numtries, + ) +- # Marked for deletion (need to modify the code on CI tests) + logger.info("Agent %s 
deleted from the CV", self.agent_uuid) ++ elif response_json["code"] == 404: ++ # The agent was already deleted (e.g. by another caller ++ # between the do_cvstatus check and the DELETE request). ++ # The desired end state is reached — treat as success. ++ logger.info("Agent %s is already absent from the CV", self.agent_uuid) ++ else: ++ raise UserError( ++ f"Unexpected response code {response_json['code']} from " ++ f"{self.verifier_fid_str} while deleting {self.agent_fid_str}" ++ ) + + def do_regstatus(self) -> Dict[str, Any]: + if not self.registrar_ip or not self.registrar_port: +diff --git a/test/test_cloud_verifier_tornado.py b/test/test_cloud_verifier_tornado.py +new file mode 100644 +index 0000000..7515b2e +--- /dev/null ++++ b/test/test_cloud_verifier_tornado.py +@@ -0,0 +1,114 @@ ++"""Unit tests for cloud_verifier_tornado deletion and pending-event management. ++ ++Tests cover: ++1. _register_pending_event / _cancel_pending_event helpers ++2. store_attestation_state graceful handling when agent is deleted ++""" ++ ++# pylint: disable=protected-access ++ ++import unittest ++from unittest.mock import MagicMock, patch ++ ++from keylime import cloud_verifier_tornado ++ ++ ++class TestPendingEventRegistry(unittest.TestCase): ++ """Test the _pending_events registry helpers.""" ++ ++ def setUp(self): ++ cloud_verifier_tornado._pending_events.clear() ++ ++ def tearDown(self): ++ cloud_verifier_tornado._pending_events.clear() ++ ++ def test_register_pending_event(self): ++ """_register_pending_event stores handle in agent dict and global registry.""" ++ agent = {"agent_id": "test-agent-1", "pending_event": None} ++ handle = object() ++ ++ cloud_verifier_tornado._register_pending_event(agent, handle) ++ ++ self.assertIs(agent["pending_event"], handle) ++ self.assertIs(cloud_verifier_tornado._pending_events["test-agent-1"], handle) ++ ++ def test_cancel_pending_event_removes_from_both(self): ++ """_cancel_pending_event clears agent dict and global registry.""" ++ 
agent = {"agent_id": "test-agent-1", "pending_event": None} ++ handle = object() ++ cloud_verifier_tornado._register_pending_event(agent, handle) ++ ++ with patch("tornado.ioloop.IOLoop") as mock_ioloop_cls: ++ mock_ioloop = MagicMock() ++ mock_ioloop_cls.current.return_value = mock_ioloop ++ ++ cloud_verifier_tornado._cancel_pending_event(agent) ++ ++ self.assertIsNone(agent["pending_event"]) ++ self.assertNotIn("test-agent-1", cloud_verifier_tornado._pending_events) ++ mock_ioloop.remove_timeout.assert_called_once_with(handle) ++ ++ def test_cancel_pending_event_noop_when_none(self): ++ """_cancel_pending_event is a no-op when no pending event exists.""" ++ agent = {"agent_id": "test-agent-1", "pending_event": None} ++ ++ # Should not raise ++ cloud_verifier_tornado._cancel_pending_event(agent) ++ ++ self.assertIsNone(agent["pending_event"]) ++ ++ def test_cancel_pending_event_handles_remove_timeout_error(self): ++ """_cancel_pending_event logs but doesn't raise on remove_timeout failure.""" ++ agent = {"agent_id": "test-agent-1", "pending_event": None} ++ handle = object() ++ cloud_verifier_tornado._register_pending_event(agent, handle) ++ ++ with patch("tornado.ioloop.IOLoop") as mock_ioloop_cls: ++ mock_ioloop = MagicMock() ++ mock_ioloop_cls.current.return_value = mock_ioloop ++ mock_ioloop.remove_timeout.side_effect = RuntimeError("IOLoop stopped") ++ ++ # Should not raise ++ cloud_verifier_tornado._cancel_pending_event(agent) ++ ++ self.assertIsNone(agent["pending_event"]) ++ self.assertNotIn("test-agent-1", cloud_verifier_tornado._pending_events) ++ ++ def test_register_replaces_previous_handle(self): ++ """_register_pending_event replaces a previously registered handle.""" ++ agent = {"agent_id": "test-agent-1", "pending_event": None} ++ handle1 = object() ++ handle2 = object() ++ ++ cloud_verifier_tornado._register_pending_event(agent, handle1) ++ cloud_verifier_tornado._register_pending_event(agent, handle2) ++ ++ self.assertIs(agent["pending_event"], 
handle2) ++ self.assertIs(cloud_verifier_tornado._pending_events["test-agent-1"], handle2) ++ ++ ++class TestStoreAttestationState(unittest.TestCase): ++ """Test store_attestation_state graceful handling of deleted agents.""" ++ ++ @patch("keylime.cloud_verifier_tornado.session_context") ++ def test_skips_when_agent_not_in_db(self, mock_session_ctx): ++ """store_attestation_state returns gracefully when agent is deleted from DB.""" ++ mock_session = MagicMock() ++ mock_session.get.return_value = None ++ mock_session_ctx.return_value.__enter__ = MagicMock(return_value=mock_session) ++ mock_session_ctx.return_value.__exit__ = MagicMock(return_value=False) ++ ++ mock_attest_state = MagicMock() ++ mock_attest_state.get_ima_pcrs.return_value = {"10": "some_value"} ++ mock_attest_state.agent_id = "deleted-agent" ++ mock_attest_state.get_agent_id.return_value = "deleted-agent" ++ ++ # Should not raise (previously would AssertionError) ++ cloud_verifier_tornado.store_attestation_state(mock_attest_state) ++ ++ # Verify no attempt to set attributes on None ++ mock_session.add.assert_not_called() ++ ++ ++if __name__ == "__main__": ++ unittest.main() +-- +2.49.0 + diff --git a/keylime.spec b/keylime.spec index 2cf173e..30e364c 100644 --- a/keylime.spec +++ b/keylime.spec @@ -9,7 +9,7 @@ Name: keylime Version: 7.14.1 -Release: 3%{?dist} +Release: 4%{?dist} Summary: Open source TPM software for Bootstrapping and Maintaining Trust URL: https://github.com/keylime/keylime @@ -35,6 +35,10 @@ Patch: 0011-fix-Add-fork-safety-to-DBManager-via-dispose.patch # Backport https://github.com/keylime/keylime/pull/1866 Patch: 0012-fix-mem-leak-remove-unbounded-functools.cache-from-l.patch +# RHEL-153121 - fix verifier race condition on agent delete. +# Backport https://github.com/keylime/keylime/pull/1874 +Patch: 0013-fix-verifier-race-condition-on-agent-delete.patch + # Main program: Apache-2.0 # Icons: MIT License: Apache-2.0 AND MIT