- Update to cryptsetup 2.8.6 - Add upstream patches for jq test fixes - Add upstream patches for reencryption error path improvements Resolves: RHEL-163434
171 lines
6.2 KiB
Diff
171 lines
6.2 KiB
Diff
From 303b319488b652efc68472e1580f5ec0df3e8eba Mon Sep 17 00:00:00 2001
|
|
Message-ID: <303b319488b652efc68472e1580f5ec0df3e8eba.1778441857.git.khanicov@redhat.com>
|
|
From: Ondrej Kozina <okozina@redhat.com>
|
|
Date: Wed, 10 Dec 2025 16:37:20 +0100
|
|
Subject: [PATCH] reencrypt: add more graceful reencryption error path.
|
|
|
|
This adds proper reencryption error path for non critical
|
|
failures not requiring full LUKS2 reencryption recovery.
|
|
|
|
While non-critical reencryption failures were properly identified
|
|
in former code the graceful recovery was never implemented.
|
|
This affected the state of live device mappings after failed
|
|
reencryption. For example, read error on data device did not trigger
|
|
LUKS2 recovery scenario (correctly), but the overlay reencryption
|
|
device stack for online reencryption remained stuck with hotzone layer
|
|
suspended.
|
|
|
|
This patch addresses the issue.
|
|
---
|
|
lib/luks2/luks2_reencrypt.c | 86 +++++++++++++++++++++++++++++++++----
|
|
1 file changed, 78 insertions(+), 8 deletions(-)
|
|
|
|
diff --git a/lib/luks2/luks2_reencrypt.c b/lib/luks2/luks2_reencrypt.c
|
|
index 21bd6674..1845a782 100644
|
|
--- a/lib/luks2/luks2_reencrypt.c
|
|
+++ b/lib/luks2/luks2_reencrypt.c
|
|
@@ -2374,11 +2374,6 @@ err:
|
|
return r;
|
|
}
|
|
|
|
-/* TODO:
|
|
- * 1) audit error path. any error in this routine is fatal and should be unlikely.
|
|
- * usually it would hint some collision with another userspace process touching
|
|
- * dm devices directly.
|
|
- */
|
|
static reenc_status_t reenc_refresh_helper_devices(struct crypt_device *cd, const char *overlay,
|
|
const char *hotzone)
|
|
{
|
|
@@ -4125,14 +4120,24 @@ static reenc_status_t reencrypt_step(struct crypt_device *cd,
|
|
/* metadata commit point */
|
|
r = reencrypt_hotzone_protect_final(cd, hdr, rh->reenc_keyslot, rp, rh->reenc_buffer, rh->read);
|
|
if (r < 0) {
|
|
- /* severity normal */
|
|
+ /*
|
|
+ * Nothing was written in hotzone area yet. Even if metadata write failed the previous
|
|
+ * state is still valid. If the metadata write passed and there was another
|
|
+ * error it's harmless to do recovery. Recovery may be run several times with no
|
|
+ * negative side effect.
|
|
+ */
|
|
log_err(cd, _("Failed to write reencryption resilience metadata."));
|
|
return REENC_ERR_ROLLBACK_MEMORY;
|
|
}
|
|
|
|
r = crypt_storage_wrapper_decrypt(rh->cw1, rh->offset, rh->reenc_buffer, rh->read);
|
|
if (r) {
|
|
- /* severity normal */
|
|
+ /*
|
|
+ * Ideally, this would be specific error (REENC_ERR_ROLLBACK_METADATA) case where
|
|
+ * it would rollback on-disk metadata to the last valid state (still no write in
|
|
+ * hotzone area). But it's not worth the effort. This will trigger full LUKS2
|
|
+ * reencryption recovery despite not being necessary.
|
|
+ */
|
|
log_err(cd, _("Decryption failed."));
|
|
return REENC_ERR_ROLLBACK_MEMORY;
|
|
}
|
|
@@ -4156,7 +4161,6 @@ static reenc_status_t reencrypt_step(struct crypt_device *cd,
|
|
}
|
|
|
|
if (online) {
|
|
- /* severity normal */
|
|
log_dbg(cd, "Resuming device %s", rh->hotzone_name);
|
|
r = dm_resume_device(cd, rh->hotzone_name, DM_RESUME_PRIVATE);
|
|
if (r) {
|
|
@@ -4246,6 +4250,7 @@ static int replace_hotzone_device_with_error(struct crypt_device *cd, struct luk
|
|
|
|
static int teardown_overlay_devices(struct crypt_device *cd, struct luks2_reencrypt *rh)
|
|
{
|
|
+ bool overlay_suspended, hotzone_suspended;
|
|
int r;
|
|
|
|
/* Reload device with current LUKS2 segments */
|
|
@@ -4255,6 +4260,44 @@ static int teardown_overlay_devices(struct crypt_device *cd, struct luks2_reencr
|
|
return r;
|
|
}
|
|
|
|
+ overlay_suspended = dm_status_suspended(cd, rh->overlay_name) > 0;
|
|
+ hotzone_suspended = dm_status_suspended(cd, rh->hotzone_name) > 0;
|
|
+
|
|
+ /*
|
|
+ * The overlay (if suspended) may hold already queued I/Os.
|
|
+ * Reload the overlay device with the table identical to the one
|
|
+ * loaded to the top level device. The overlay device will be dropped
|
|
+ * shortly after successful top level device resume.
|
|
+ */
|
|
+ if (overlay_suspended) {
|
|
+ log_dbg(cd, "Reverting suspended device %s to previous metadata segments", rh->overlay_name);
|
|
+ r = LUKS2_reload(cd, rh->overlay_name, rh->vks, rh->device_size, rh->flags);
|
|
+ if (r) {
|
|
+ log_err(cd, _("Failed to reload device %s."), rh->overlay_name);
|
|
+ return r;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * if the hotzone is suspended we must error all pending I/O waiting in the device. The
|
|
+ * reencryption step was not completed and the pending I/O would corrupt the data on data
|
|
+ * device.
|
|
+ *
|
|
+ * If the hotzone table replacement fails we must abort!
|
|
+ */
|
|
+ if (hotzone_suspended && (r = replace_hotzone_device_with_error(cd, rh)))
|
|
+ return r;
|
|
+
|
|
+ if (overlay_suspended) {
|
|
+ /* Resume will pass since the hotzone (if previously suspended) is now
|
|
+ * replaced with live dm-error table */
|
|
+ r = dm_resume_device(cd, rh->overlay_name, DM_RESUME_PRIVATE);
|
|
+ if (r) {
|
|
+ log_err(cd, _("Failed to resume device %s."), rh->overlay_name);
|
|
+ return r;
|
|
+ }
|
|
+ }
|
|
+
|
|
/* Now we can switch original top level device away from overlay device */
|
|
r = dm_resume_device(cd, rh->device_name, DM_SUSPEND_SKIP_LOCKFS | DM_SUSPEND_NOFLUSH);
|
|
if (r) {
|
|
@@ -4322,6 +4365,29 @@ static int reencrypt_teardown_ok(struct crypt_device *cd, struct luks2_hdr *hdr,
|
|
return 0;
|
|
}
|
|
|
|
+static void reencrypt_teardown_rollback(struct crypt_device *cd, struct luks2_hdr *hdr,
|
|
+ struct luks2_reencrypt *rh)
|
|
+{
|
|
+ /*
|
|
+ * We cannot rollback for REENC_PROTECTION_NONE. It does not commit metadata as
|
|
+ * it progresses. In this case, the device stack is intentionally left as-is.
|
|
+ */
|
|
+ if (rh->rp.type <= REENC_PROTECTION_NONE)
|
|
+ return;
|
|
+
|
|
+ /*
|
|
+ * If metadata rollback fails, we cannot proceed with device teardown
|
|
+ * as we do not have proper metadata snapshot for LUKS2_reload().
|
|
+ */
|
|
+ if (LUKS2_hdr_rollback(cd, hdr))
|
|
+ return;
|
|
+
|
|
+ if (!rh->online)
|
|
+ return;
|
|
+
|
|
+ teardown_overlay_devices(cd, rh);
|
|
+}
|
|
+
|
|
static void reencrypt_teardown_fatal(struct crypt_device *cd, struct luks2_reencrypt *rh)
|
|
{
|
|
log_err(cd, _("Fatal error while reencrypting chunk starting at %" PRIu64 ", %" PRIu64 " sectors long."),
|
|
@@ -4347,6 +4413,10 @@ static int reencrypt_teardown(struct crypt_device *cd, struct luks2_hdr *hdr,
|
|
progress(rh->device_size, rh->progress, usrptr);
|
|
r = reencrypt_teardown_ok(cd, hdr, rh);
|
|
break;
|
|
+ case REENC_ERR_ROLLBACK_MEMORY:
|
|
+ reencrypt_teardown_rollback(cd, hdr, rh);
|
|
+ r = -EINVAL;
|
|
+ break;
|
|
case REENC_ERR_FATAL:
|
|
reencrypt_teardown_fatal(cd, rh);
|
|
/* fall-through */
|
|
--
|
|
2.53.0
|
|
|