From 8d2acfa55b9c9f522c848439e8bcdad303681658 Mon Sep 17 00:00:00 2001 Message-ID: <8d2acfa55b9c9f522c848439e8bcdad303681658.1730734026.git.jdenemar@redhat.com> From: Jiri Denemark Date: Thu, 8 Aug 2024 13:02:08 +0200 Subject: [PATCH] qemu: Avoid false failure when resuming post-copy migration Depending on timing between QEMU and libvirt, an attempt to resume a failed post-copy migration could immediately report a failure in the post-copy phase again even though the migration actually resumed and is progressing just fine. This is caused by QEMU reporting the original migration state (i.e., postcopy-paused) until migration is successfully resumed and QEMU switches to postcopy-active. QEMU 9.1 introduced a new postcopy-recover-setup migration state, which is entered immediately after requesting migration to be resumed, so we can reliably wait for the migration to either continue or fail without being confused by the old state. https://issues.redhat.com/browse/RHEL-22166 Signed-off-by: Jiri Denemark Reviewed-by: Michal Privoznik (cherry picked from commit 11f6773f198636b80e73fb3f69adc83554860172) https://issues.redhat.com/browse/RHEL-63877 Signed-off-by: Jiri Denemark --- src/qemu/qemu_migration.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 70318e26f3..a1ebb621d1 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -1972,6 +1972,7 @@ enum qemuMigrationCompletedFlags { QEMU_MIGRATION_COMPLETED_CHECK_STORAGE = (1 << 1), QEMU_MIGRATION_COMPLETED_POSTCOPY = (1 << 2), QEMU_MIGRATION_COMPLETED_PRE_SWITCHOVER = (1 << 3), + QEMU_MIRGATION_COMPLETED_RECOVERY = (1 << 4), }; @@ -2033,6 +2034,16 @@ qemuMigrationAnyCompleted(virDomainObj *vm, return 1; } + /* When QEMU is new enough to enter postcopy-recover-setup state during + * post-copy recovery, the source waits for the recovery to start + * before letting the destination wait for migration to complete. 
+ */ + if (flags & QEMU_MIRGATION_COMPLETED_RECOVERY && + jobData->status == VIR_DOMAIN_JOB_STATUS_POSTCOPY) { + VIR_DEBUG("Post-copy recovery active"); + return 1; + } + if (jobData->status == VIR_DOMAIN_JOB_STATUS_HYPERVISOR_COMPLETED) return 1; else @@ -5131,6 +5142,7 @@ qemuMigrationSrcResume(virDomainObj *vm, char **cookieout, int *cookieoutlen, qemuMigrationSpec *spec, + virConnectPtr dconn, unsigned int flags) { qemuDomainObjPrivate *priv = vm->privateData; @@ -5161,6 +5173,17 @@ qemuMigrationSrcResume(virDomainObj *vm, if (rc < 0) return -1; + /* Wait for postcopy recovery to start (or fail) if QEMU is new enough to + * support postcopy-recover-setup migration state. */ + if (priv->migrationRecoverSetup) { + VIR_DEBUG("Waiting for post-copy recovery to start"); + if (qemuMigrationSrcWaitForCompletion(vm, VIR_ASYNC_JOB_MIGRATION_OUT, dconn, + QEMU_MIRGATION_COMPLETED_RECOVERY) < 0) + return -1; + } else { + VIR_WARN("QEMU is too old, we may report a failure in post-copy phase even though the migration may be running just fine"); + } + if (qemuMigrationCookieFormat(mig, driver, vm, QEMU_MIGRATION_SOURCE, cookieout, cookieoutlen, @@ -5265,7 +5288,7 @@ qemuMigrationSrcPerformNative(virQEMUDriver *driver, if (flags & VIR_MIGRATE_POSTCOPY_RESUME) { ret = qemuMigrationSrcResume(vm, migParams, cookiein, cookieinlen, - cookieout, cookieoutlen, &spec, flags); + cookieout, cookieoutlen, &spec, dconn, flags); } else { ret = qemuMigrationSrcRun(driver, vm, xmlin, persist_xml, cookiein, cookieinlen, cookieout, cookieoutlen, flags, resource, -- 2.47.0