Fix verifier race condition on agent delete

Resolves: RHEL-153121

Backport upstream PR https://github.com/keylime/keylime/pull/1874
to fix a race condition in the pull-mode DELETE handler where an
agent could be deleted from the database while an in-flight
attestation cycle was still running, causing 'tenant -c update'
to intermittently fail with "Agent was not deleted from Verifier
after 5 tries".

Signed-off-by: Sergio Arroutbi <sarroutb@redhat.com>
This commit is contained in:
Sergio Arroutbi 2026-04-14 11:00:29 +02:00
parent a78791fcf3
commit d7b9d5aca9
No known key found for this signature in database
GPG Key ID: A3E819DEA8102235
2 changed files with 454 additions and 1 deletion

View File

@ -0,0 +1,449 @@
From 739a8a97357f1b52c3944706479740fb7b71fb33 Mon Sep 17 00:00:00 2001
From: Anderson Toshiyuki Sasaki <ansasaki@redhat.com>
Date: Thu, 26 Mar 2026 11:28:52 +0100
Subject: [PATCH 13/13] fix: verifier race condition on agent delete
Fix a race condition in the pull-mode DELETE handler where an agent
could be deleted from the database while an in-flight attestation cycle
was still running, causing 'tenant -c update' to intermittently fail
with "Agent was not deleted from Verifier after 5 tries".
The race had two interacting causes:
1. TERMINATED was in the immediate-deletion states list. When a second
DELETE arrived and found the agent in TERMINATED state, it deleted
immediately (200), even though an invoke_get_quote() coroutine from
the first DELETE's cycle was still in-flight. The orphaned coroutine
then crashed in store_attestation_state() with AssertionError.
2. The DELETE handler did not cancel the scheduled IOLoop poll timer, so
new attestation cycles could start even after deletion was requested.
Changes:
- Add _pending_events registry and _register_pending_event /
_cancel_pending_event helpers to track IOLoop timer handles (from
upstream commit 59ac386).
- Replace assert statements in store_attestation_state() and the DELETE
handler with proper error handling (graceful log+return and 404).
- Remove TERMINATED from the immediate-deletion states list so that a
second DELETE returns 202 instead of deleting while in-flight work
exists.
- Cancel the pending poll timer via _pending_events on DELETE to prevent
new attestation cycles from starting.
- Fix the tenant's do_cvdelete() to handle 200/202/404 response codes
properly and fix a typo (reponse_json -> response_json).
- Cancel the pending poll timer in the PUT "stop" handler to prevent
new attestation cycles after agent stop.
- Suppress a mypy false positive in keylime/json.py.
- Add unit tests for pending-event management and store_attestation_state
graceful handling when agent is deleted.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Anderson Toshiyuki Sasaki <ansasaki@redhat.com>
---
keylime/cloud_verifier_tornado.py | 180 ++++++++++++++++++++++++++++++------
keylime/json.py | 2 +-
keylime/tenant.py | 26 +++---
test/test_cloud_verifier_tornado.py | 114 +++++++++++++++++++++++
4 files changed, 282 insertions(+), 40 deletions(-)
diff --git a/keylime/cloud_verifier_tornado.py b/keylime/cloud_verifier_tornado.py
index 17bec75..75f117b 100644
--- a/keylime/cloud_verifier_tornado.py
+++ b/keylime/cloud_verifier_tornado.py
@@ -171,6 +171,35 @@ exclude_db: Dict[str, Any] = {
"ssl_context": None,
}
+# Registry of agent_id -> IOLoop timeout handle for all scheduled pending
+# events (quote polls, retries). Used to cancel them all on shutdown.
+_pending_events: Dict[str, object] = {}
+
+
+def _register_pending_event(agent: Dict[str, Any], handle: object) -> None:
+ """Track a pending IOLoop timeout in both the agent dict and the global registry.
+
+ The agent dict field ``pending_event`` is the per-agent reference used during
+ normal operation (e.g. cancelling on state change). The module-level
+ ``_pending_events`` dict mirrors it so that *all* handles can be
+ bulk-cancelled on shutdown without iterating over every agent.
+ """
+ agent["pending_event"] = handle
+ _pending_events[agent["agent_id"]] = handle
+
+
+def _cancel_pending_event(agent: Dict[str, Any]) -> None:
+ """Cancel and unregister the pending IOLoop timeout for *agent*, if any."""
+ handle = agent.get("pending_event")
+ if handle is None:
+ return
+ agent["pending_event"] = None
+ _pending_events.pop(agent["agent_id"], None)
+ try:
+ tornado.ioloop.IOLoop.current().remove_timeout(handle)
+ except Exception as e:
+ logger.debug("Could not remove pending event for agent %s: %s", agent["agent_id"], e)
+
def _from_db_obj(agent_db_obj: VerfierMain) -> Dict[str, Any]:
fields = [
@@ -286,7 +315,12 @@ def store_attestation_state(agentAttestState: AgentAttestState) -> None:
try:
with session_context() as session:
update_agent = session.get(VerfierMain, agentAttestState.get_agent_id()) # type: ignore[attr-defined]
- assert update_agent
+ if update_agent is None:
+ logger.warning(
+ "Agent %s no longer in database, skipping attestation state storage",
+ agent_id,
+ )
+ return
update_agent.boottime = agentAttestState.get_boottime() # pyright: ignore
update_agent.next_ima_ml_entry = agentAttestState.get_next_ima_ml_entry() # pyright: ignore
ima_pcrs_dict = agentAttestState.get_ima_pcrs()
@@ -605,34 +639,115 @@ class AgentsHandler(BaseHandler):
except SQLAlchemyError as e:
logger.error("SQLAlchemy Error deleting agent in push mode: %s", e)
web_util.echo_json_response(self.req_handler, 500, "Internal Server Error")
- else:
- # Pull mode: Use operational_state to determine deletion behavior
- op_state = agent.operational_state
- if op_state in (
- states.SAVED,
- states.FAILED,
- states.TERMINATED,
- states.TENANT_FAILED,
- states.INVALID_QUOTE,
- ):
- try:
- verifier_db_delete_agent(session, agent_id)
- web_util.echo_json_response(self.req_handler, 200, "Success")
- logger.info("DELETE (pull mode) returning 200 response for agent id: %s", agent_id)
- except SQLAlchemyError as e:
- logger.error("SQLAlchemy Error deleting agent in pull mode: %s", e)
- web_util.echo_json_response(self.req_handler, 500, "Internal Server Error")
+ return
+
+ # Pull mode: Use operational_state to determine deletion behavior.
+ #
+ # Terminal states with no in-flight work can be deleted
+ # immediately (200). Note that TERMINATED is intentionally
+ # excluded: it means a previous DELETE was accepted but the
+ # attestation cycle has not yet finished. Deleting immediately
+ # while in-flight work exists causes store_attestation_state()
+ # to fail when it tries to persist results for the now-gone
+ # agent.
+ op_state = agent.operational_state
+ if op_state in (
+ states.SAVED,
+ states.FAILED,
+ states.TENANT_FAILED,
+ states.INVALID_QUOTE,
+ ):
+ # Agent is in a terminal state with no in-flight work — delete immediately.
+ # Cancel any local pending poll timer first (same-worker
+ # defensive cleanup). This matters when a cross-worker
+ # PUT /stop sets TENANT_FAILED in the DB but cannot cancel
+ # the timer in this worker's _pending_events.
+ pending_handle = _pending_events.pop(agent_id, None)
+ if pending_handle is not None:
+ tornado.ioloop.IOLoop.current().remove_timeout(pending_handle)
+ try:
+ verifier_db_delete_agent(session, agent_id)
+ web_util.echo_json_response(self.req_handler, 200, "Success")
+ logger.info("DELETE (pull mode) returning 200 response for agent id: %s", agent_id)
+ except SQLAlchemyError as e:
+ logger.error("SQLAlchemy Error deleting agent in pull mode: %s", e)
+ web_util.echo_json_response(self.req_handler, 500, "Internal Server Error")
+ return
+
+ # Agent is in an active state or already TERMINATED from a
+ # previous DELETE.
+ #
+ # Multi-worker note: _pending_events is process-local. Each
+ # agent's attestation cycle runs in the worker process it was
+ # assigned to at startup (round-robin), but this DELETE
+ # request may arrive at any worker.
+ #
+ # - Same worker: pending_handle is accurate — if found, the
+ # agent was idle (timer pending) and we can delete
+ # immediately since no coroutine is in-flight.
+ # - Different worker: pending_handle is always None, so we
+ # fall through to the 202/TERMINATED path. The managing
+ # worker's timer fires normally, process_agent() detects
+ # TERMINATED, and completes the deletion.
+ #
+ # Important: when the agent is already TERMINATED, do NOT
+ # cancel the pending poll timer — it is the only mechanism
+ # that will trigger process_agent() to detect TERMINATED and
+ # complete the deletion.
+ if op_state == states.TERMINATED: # pyright: ignore
+ # Agent is already TERMINATED from a previous DELETE.
+ # Leave the pending poll timer alone so process_agent()
+ # can detect TERMINATED and complete the deletion.
+ web_util.echo_json_response(self.req_handler, 202, "Accepted")
+ logger.info(
+ "DELETE (pull mode) returning 202 response for agent id: %s "
+ "(already TERMINATED, waiting for deletion to complete)",
+ agent_id,
+ )
+ return
+
+ # First DELETE for this agent. Try to cancel the pending
+ # poll timer (same-worker optimization).
+ #
+ # Pop the handle first but do NOT cancel the timer yet —
+ # if the DB operation fails we restore the handle so the
+ # attestation cycle can continue.
+ pending_handle = _pending_events.pop(agent_id, None)
+ try:
+ if pending_handle is not None:
+ # Same-worker optimization: the agent was idle
+ # (waiting for the next poll timer) — no in-flight
+ # coroutine will come along to detect TERMINATED and
+ # complete the deletion, so delete immediately.
+ verifier_db_delete_agent(session, agent_id)
+ # DB succeeded — now safe to cancel the timer.
+ tornado.ioloop.IOLoop.current().remove_timeout(pending_handle)
+ web_util.echo_json_response(self.req_handler, 200, "Success")
+ logger.info("DELETE (pull mode) returning 200 response for agent id: %s", agent_id)
else:
- try:
- update_agent = session.get(VerfierMain, agent_id) # type: ignore[attr-defined]
- assert update_agent
- update_agent.operational_state = states.TERMINATED # pyright: ignore
- session.add(update_agent)
- # session.commit() is automatically called by context manager
- web_util.echo_json_response(self.req_handler, 202, "Accepted")
- logger.info("DELETE (pull mode) returning 202 response for agent id: %s", agent_id)
- except SQLAlchemyError as e:
- logger.error("SQLAlchemy Error for agent ID %s: %s", agent_id, e)
+ # Either an invoke_get_quote() / invoke_provide_v()
+ # coroutine is in-flight (no pending_handle — the
+ # timer already fired), or this DELETE arrived at a
+ # different worker process. Mark as TERMINATED and
+ # let process_agent() perform the actual deletion
+ # when the in-flight work finishes (or the timer
+ # fires in the managing worker).
+ update_agent = session.get(VerfierMain, agent_id) # type: ignore[attr-defined]
+ if update_agent is None:
+ web_util.echo_json_response(self.req_handler, 404, "agent id not found")
+ return
+ update_agent.operational_state = states.TERMINATED # pyright: ignore
+ session.add(update_agent)
+ web_util.echo_json_response(self.req_handler, 202, "Accepted")
+ logger.info("DELETE (pull mode) returning 202 response for agent id: %s", agent_id)
+ except SQLAlchemyError as e:
+ logger.error("SQLAlchemy Error for agent ID %s: %s", agent_id, e)
+ if pending_handle is not None:
+ # Restore the timer so the attestation cycle can
+ # continue — the DB operation failed so the agent
+ # is still there.
+ _pending_events[agent_id] = pending_handle
+ web_util.echo_json_response(self.req_handler, 500, "Internal server error")
def post(self) -> None:
"""This method handles the POST requests to add agents to the Cloud Verifier.
@@ -1045,6 +1160,15 @@ class AgentsHandler(BaseHandler):
# session.commit() is automatically called by context manager
except SQLAlchemyError as e:
logger.error("SQLAlchemy Error: %s", e)
+ web_util.echo_json_response(self.req_handler, 500, "Internal server error")
+ return
+
+ # DB succeeded — now safe to cancel the pending poll
+ # timer to prevent new attestation cycles.
+ if agent_id is not None:
+ pending_handle = _pending_events.pop(agent_id, None)
+ if pending_handle is not None:
+ tornado.ioloop.IOLoop.current().remove_timeout(pending_handle)
web_util.echo_json_response(self.req_handler, 200, "Success")
logger.info("PUT returning 200 response for agent id: %s", agent_id)
diff --git a/keylime/json.py b/keylime/json.py
index 82292a1..4c4b897 100644
--- a/keylime/json.py
+++ b/keylime/json.py
@@ -24,7 +24,7 @@ def bytes_to_str(data: Any) -> Any:
for _k, _v in data.items():
data[_k] = bytes_to_str(_v)
elif isinstance(data, tuple(_list_types)):
- _l = list(data)
+ _l = list(data) # type: ignore[call-overload]
for _k, _v in enumerate(_l):
_l[_k] = bytes_to_str(_v)
data = _l
diff --git a/keylime/tenant.py b/keylime/tenant.py
index 0cdeada..4cb3698 100644
--- a/keylime/tenant.py
+++ b/keylime/tenant.py
@@ -1074,22 +1074,17 @@ class Tenant:
# keylime_logging.log_http_response(logger, logging.ERROR, response_json)
raise UserError(f"{self.verifier_fid_str} timed out while deleting {self.agent_fid_str}.")
- if response_json["code"] == 202:
+ if response_json["code"] == 200:
+ logger.info("Agent %s deleted from the CV", self.agent_uuid)
+ elif response_json["code"] == 202:
numtries = 0
deleted = False
while not deleted:
- reponse_json = self.do_cvstatus(not_found_fail=False)
- if reponse_json["code"] != 404:
+ response_json = self.do_cvstatus(not_found_fail=False)
+ if response_json["code"] != 404:
numtries += 1
if numtries >= self.maxr:
- # EVALUATE DELETION
- # logger.error(
- # "%s was not deleted from %s after %d tries",
- # self.agent_fid_str,
- # self.verifier_fid_str,
- # numtries,
- # )
raise UserError(
f"{self.agent_fid_str} was not deleted from {self.verifier_fid_str} after {numtries} tries"
)
@@ -1114,8 +1109,17 @@ class Tenant:
self.verifier_fid_str,
numtries,
)
- # Marked for deletion (need to modify the code on CI tests)
logger.info("Agent %s deleted from the CV", self.agent_uuid)
+ elif response_json["code"] == 404:
+ # The agent was already deleted (e.g. by another caller
+ # between the do_cvstatus check and the DELETE request).
+ # The desired end state is reached — treat as success.
+ logger.info("Agent %s is already absent from the CV", self.agent_uuid)
+ else:
+ raise UserError(
+ f"Unexpected response code {response_json['code']} from "
+ f"{self.verifier_fid_str} while deleting {self.agent_fid_str}"
+ )
def do_regstatus(self) -> Dict[str, Any]:
if not self.registrar_ip or not self.registrar_port:
diff --git a/test/test_cloud_verifier_tornado.py b/test/test_cloud_verifier_tornado.py
new file mode 100644
index 0000000..7515b2e
--- /dev/null
+++ b/test/test_cloud_verifier_tornado.py
@@ -0,0 +1,114 @@
+"""Unit tests for cloud_verifier_tornado deletion and pending-event management.
+
+Tests cover:
+1. _register_pending_event / _cancel_pending_event helpers
+2. store_attestation_state graceful handling when agent is deleted
+"""
+
+# pylint: disable=protected-access
+
+import unittest
+from unittest.mock import MagicMock, patch
+
+from keylime import cloud_verifier_tornado
+
+
+class TestPendingEventRegistry(unittest.TestCase):
+ """Test the _pending_events registry helpers."""
+
+ def setUp(self):
+ cloud_verifier_tornado._pending_events.clear()
+
+ def tearDown(self):
+ cloud_verifier_tornado._pending_events.clear()
+
+ def test_register_pending_event(self):
+ """_register_pending_event stores handle in agent dict and global registry."""
+ agent = {"agent_id": "test-agent-1", "pending_event": None}
+ handle = object()
+
+ cloud_verifier_tornado._register_pending_event(agent, handle)
+
+ self.assertIs(agent["pending_event"], handle)
+ self.assertIs(cloud_verifier_tornado._pending_events["test-agent-1"], handle)
+
+ def test_cancel_pending_event_removes_from_both(self):
+ """_cancel_pending_event clears agent dict and global registry."""
+ agent = {"agent_id": "test-agent-1", "pending_event": None}
+ handle = object()
+ cloud_verifier_tornado._register_pending_event(agent, handle)
+
+ with patch("tornado.ioloop.IOLoop") as mock_ioloop_cls:
+ mock_ioloop = MagicMock()
+ mock_ioloop_cls.current.return_value = mock_ioloop
+
+ cloud_verifier_tornado._cancel_pending_event(agent)
+
+ self.assertIsNone(agent["pending_event"])
+ self.assertNotIn("test-agent-1", cloud_verifier_tornado._pending_events)
+ mock_ioloop.remove_timeout.assert_called_once_with(handle)
+
+ def test_cancel_pending_event_noop_when_none(self):
+ """_cancel_pending_event is a no-op when no pending event exists."""
+ agent = {"agent_id": "test-agent-1", "pending_event": None}
+
+ # Should not raise
+ cloud_verifier_tornado._cancel_pending_event(agent)
+
+ self.assertIsNone(agent["pending_event"])
+
+ def test_cancel_pending_event_handles_remove_timeout_error(self):
+ """_cancel_pending_event logs but doesn't raise on remove_timeout failure."""
+ agent = {"agent_id": "test-agent-1", "pending_event": None}
+ handle = object()
+ cloud_verifier_tornado._register_pending_event(agent, handle)
+
+ with patch("tornado.ioloop.IOLoop") as mock_ioloop_cls:
+ mock_ioloop = MagicMock()
+ mock_ioloop_cls.current.return_value = mock_ioloop
+ mock_ioloop.remove_timeout.side_effect = RuntimeError("IOLoop stopped")
+
+ # Should not raise
+ cloud_verifier_tornado._cancel_pending_event(agent)
+
+ self.assertIsNone(agent["pending_event"])
+ self.assertNotIn("test-agent-1", cloud_verifier_tornado._pending_events)
+
+ def test_register_replaces_previous_handle(self):
+ """_register_pending_event replaces a previously registered handle."""
+ agent = {"agent_id": "test-agent-1", "pending_event": None}
+ handle1 = object()
+ handle2 = object()
+
+ cloud_verifier_tornado._register_pending_event(agent, handle1)
+ cloud_verifier_tornado._register_pending_event(agent, handle2)
+
+ self.assertIs(agent["pending_event"], handle2)
+ self.assertIs(cloud_verifier_tornado._pending_events["test-agent-1"], handle2)
+
+
+class TestStoreAttestationState(unittest.TestCase):
+ """Test store_attestation_state graceful handling of deleted agents."""
+
+ @patch("keylime.cloud_verifier_tornado.session_context")
+ def test_skips_when_agent_not_in_db(self, mock_session_ctx):
+ """store_attestation_state returns gracefully when agent is deleted from DB."""
+ mock_session = MagicMock()
+ mock_session.get.return_value = None
+ mock_session_ctx.return_value.__enter__ = MagicMock(return_value=mock_session)
+ mock_session_ctx.return_value.__exit__ = MagicMock(return_value=False)
+
+ mock_attest_state = MagicMock()
+ mock_attest_state.get_ima_pcrs.return_value = {"10": "some_value"}
+ mock_attest_state.agent_id = "deleted-agent"
+ mock_attest_state.get_agent_id.return_value = "deleted-agent"
+
+ # Should not raise (previously would AssertionError)
+ cloud_verifier_tornado.store_attestation_state(mock_attest_state)
+
+ # Verify no attempt to set attributes on None
+ mock_session.add.assert_not_called()
+
+
+if __name__ == "__main__":
+ unittest.main()
--
2.49.0

View File

@ -9,7 +9,7 @@
Name: keylime
Version: 7.14.1
Release: 3%{?dist}
Release: 4%{?dist}
Summary: Open source TPM software for Bootstrapping and Maintaining Trust
URL: https://github.com/keylime/keylime
@ -35,6 +35,10 @@ Patch: 0011-fix-Add-fork-safety-to-DBManager-via-dispose.patch
# Backport https://github.com/keylime/keylime/pull/1866
Patch: 0012-fix-mem-leak-remove-unbounded-functools.cache-from-l.patch
# RHEL-153121 - fix verifier race condition on agent delete.
# Backport https://github.com/keylime/keylime/pull/1874
Patch: 0013-fix-verifier-race-condition-on-agent-delete.patch
# Main program: Apache-2.0
# Icons: MIT
License: Apache-2.0 AND MIT