diff --git a/kvm-migration-Attempt-disk-reactivation-in-more-failure-.patch b/kvm-migration-Attempt-disk-reactivation-in-more-failure-.patch
new file mode 100644
index 0000000..c3a39e3
--- /dev/null
+++ b/kvm-migration-Attempt-disk-reactivation-in-more-failure-.patch
@@ -0,0 +1,111 @@
+From a1f2a51d1a789c46e806adb332236ca16d538bf9 Mon Sep 17 00:00:00 2001
+From: Eric Blake
+Date: Tue, 2 May 2023 15:52:12 -0500
+Subject: [PATCH 3/5] migration: Attempt disk reactivation in more failure
+ scenarios
+
+RH-Author: Eric Blake
+RH-MergeRequest: 273: migration: prevent source core dump if NFS dies mid-migration
+RH-Bugzilla: 2177957
+RH-Acked-by: Miroslav Rezanina
+RH-Acked-by: quintela1
+RH-Acked-by: Kevin Wolf
+RH-Commit: [3/3] e84bf1e7233c0273ca3136ecaa6b2cfc9c0efacb (ebblake/qemu-kvm)
+
+Commit fe904ea824 added a fail_inactivate label, which tries to
+reactivate disks on the source after a failure while s->state ==
+MIGRATION_STATUS_ACTIVE, but didn't actually use the label if
+qemu_savevm_state_complete_precopy() failed. This failure to
+reactivate is also present in commit 6039dd5b1c (also covering the new
+s->state == MIGRATION_STATUS_DEVICE state) and 403d18ae (ensuring
+s->block_inactive is set more reliably).
+
+Consolidate the two labels back into one - no matter HOW migration is
+failed, if there is any chance we can reach vm_start() after having
+attempted inactivation, it is essential that we have tried to restart
+disks before then. This also makes the cleanup more like
+migrate_fd_cancel().
+
+Suggested-by: Kevin Wolf
+Signed-off-by: Eric Blake
+Message-Id: <20230502205212.134680-1-eblake@redhat.com>
+Acked-by: Peter Xu
+Reviewed-by: Juan Quintela
+Reviewed-by: Kevin Wolf
+Signed-off-by: Kevin Wolf
+(cherry picked from commit 6dab4c93ecfae48e2e67b984d1032c1e988d3005)
+[eblake: downstream migrate_colo() => migrate_colo_enabled()]
+Signed-off-by: Eric Blake
+---
+ migration/migration.c | 24 ++++++++++++++----------
+ 1 file changed, 14 insertions(+), 10 deletions(-)
+
+diff --git a/migration/migration.c b/migration/migration.c
+index 6ba8eb0fdf..817170d52d 100644
+--- a/migration/migration.c
++++ b/migration/migration.c
+@@ -3255,6 +3255,11 @@ static void migration_completion(MigrationState *s)
+                                            MIGRATION_STATUS_DEVICE);
+         }
+         if (ret >= 0) {
++            /*
++             * Inactivate disks except in COLO, and track that we
++             * have done so in order to remember to reactivate
++             * them if migration fails or is cancelled.
++             */
+             s->block_inactive = !migrate_colo_enabled();
+             qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
+             ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
+                                                      s->block_inactive);
+         }
+     }
+     qemu_mutex_unlock_iothread();
+@@ -3290,13 +3295,13 @@ static void migration_completion(MigrationState *s)
+         rp_error = await_return_path_close_on_source(s);
+         trace_migration_return_path_end_after(rp_error);
+         if (rp_error) {
+-            goto fail_invalidate;
++            goto fail;
+         }
+     }
+ 
+     if (qemu_file_get_error(s->to_dst_file)) {
+         trace_migration_completion_file_err();
+-        goto fail_invalidate;
++        goto fail;
+     }
+ 
+     if (!migrate_colo_enabled()) {
+@@ -3306,26 +3311,25 @@ static void migration_completion(MigrationState *s)
+ 
+     return;
+ 
+-fail_invalidate:
+-    /* If not doing postcopy, vm_start() will be called: let's regain
+-     * control on images.
+-     */
+-    if (s->state == MIGRATION_STATUS_ACTIVE ||
+-        s->state == MIGRATION_STATUS_DEVICE) {
++fail:
++    if (s->block_inactive && (s->state == MIGRATION_STATUS_ACTIVE ||
++                              s->state == MIGRATION_STATUS_DEVICE)) {
++        /*
++         * If not doing postcopy, vm_start() will be called: let's
++         * regain control on images.
++         */
+         Error *local_err = NULL;
+ 
+         qemu_mutex_lock_iothread();
+         bdrv_invalidate_cache_all(&local_err);
+         if (local_err) {
+             error_report_err(local_err);
+-            s->block_inactive = true;
+         } else {
+             s->block_inactive = false;
+         }
+         qemu_mutex_unlock_iothread();
+     }
+ 
+-fail:
+     migrate_set_state(&s->state, current_active_state,
+                       MIGRATION_STATUS_FAILED);
+ }
+-- 
+2.39.1
+
diff --git a/kvm-migration-Handle-block-device-inactivation-failures-.patch b/kvm-migration-Handle-block-device-inactivation-failures-.patch
new file mode 100644
index 0000000..2e863bb
--- /dev/null
+++ b/kvm-migration-Handle-block-device-inactivation-failures-.patch
@@ -0,0 +1,117 @@
+From 1b07c7663b6a5c19c9303088d63c39dba7e3bb36 Mon Sep 17 00:00:00 2001
+From: Eric Blake
+Date: Fri, 14 Apr 2023 10:33:58 -0500
+Subject: [PATCH 1/5] migration: Handle block device inactivation failures
+ better
+
+RH-Author: Eric Blake
+RH-MergeRequest: 273: migration: prevent source core dump if NFS dies mid-migration
+RH-Bugzilla: 2177957
+RH-Acked-by: Miroslav Rezanina
+RH-Acked-by: quintela1
+RH-Acked-by: Kevin Wolf
+RH-Commit: [1/3] 5892c17ca0a21d824d176e7398d12f7cf991651d (ebblake/qemu-kvm)
+
+Consider what happens when performing a migration between two host
+machines connected to an NFS server serving multiple block devices to
+the guest, when the NFS server becomes unavailable. The migration
+attempts to inactivate all block devices on the source (a necessary
+step before the destination can take over); but if the NFS server is
+non-responsive, the attempt to inactivate can itself fail. When that
+happens, the destination fails to get the migrated guest (good,
+because the source wasn't able to flush everything properly):
+
+  (qemu) qemu-kvm: load of migration failed: Input/output error
+
+at which point, our only hope for the guest is for the source to take
+back control. With the current code base, the source host outputs a message, but then appears to resume:
+
+  (qemu) qemu-kvm: qemu_savevm_state_complete_precopy_non_iterable: bdrv_inactivate_all() failed (-1)
+
+  (src qemu) info status
+  VM status: running
+
+but a second migration attempt now asserts:
+
+  (src qemu) qemu-kvm: ../block.c:6738: int bdrv_inactivate_recurse(BlockDriverState *): Assertion `!(bs->open_flags & BDRV_O_INACTIVE)' failed.
+
+Whether the guest is recoverable on the source after the first failure
+is debatable, but what we do not want is to have qemu itself fail due
+to an assertion. It looks like the problem is as follows:
+
+In migration.c:migration_completion(), the source sets 'inactivate' to
+true (since COLO is not enabled), then tries
+savevm.c:qemu_savevm_state_complete_precopy() with a request to
+inactivate block devices. In turn, this calls
+block.c:bdrv_inactivate_all(), which fails when flushing runs up
+against the non-responsive NFS server. With savevm failing, we are
+now left in a state where some, but not all, of the block devices have
+been inactivated; but migration_completion() then jumps to 'fail'
+rather than 'fail_invalidate' and skips an attempt to reclaim those
+disks by calling bdrv_activate_all(). Even if we do attempt to
+reclaim disks, we aren't taking note of failure there, either.
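+
+As a rough sketch of the recovery pattern this series converges on
+(condensed for illustration only; in the real code the inactivation
+request travels through qemu_savevm_state_complete_precopy() rather
+than a direct call, and ret is the usual migration error code):
+
+    s->block_inactive = true;                 /* remember BEFORE trying */
+    ret = bdrv_inactivate_all();
+    if (ret < 0) {
+        Error *local_err = NULL;
+
+        qemu_mutex_lock_iothread();
+        bdrv_invalidate_cache_all(&local_err);  /* try to reclaim disks */
+        if (local_err) {
+            error_report_err(local_err);  /* still inactive; keep the flag */
+        } else {
+            s->block_inactive = false;    /* safe to reach vm_start() again */
+        }
+        qemu_mutex_unlock_iothread();
+    }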
+
+Thus, we have reached a state where the migration engine has forgotten
+all state about whether a block device is inactive, because we did not
+set s->block_inactive in enough places; so migration allows the source
+to reach vm_start() and resume execution, violating the block layer
+invariant that the guest CPUs should not be restarted while a device
+is inactive. Note that the code in migration.c:migrate_fd_cancel()
+will also try to reactivate all block devices if s->block_inactive was
+set, but because we failed to set that flag after the first failure,
+the source assumes it has reclaimed all devices, even though it still
+has remaining inactivated devices and does not try again. Normally,
+qmp_cont() will also try to reactivate all disks (or correctly fail if
+the disks are not reclaimable because NFS is not yet back up), but the
+auto-resumption of the source after a migration failure does not go
+through qmp_cont(). And because we have left the block layer in an
+inconsistent state with devices still inactivated, the later migration
+attempt is hitting the assertion failure.
+
+Since it is important to not resume the source with inactive disks,
+this patch marks s->block_inactive before attempting inactivation,
+rather than after succeeding, in order to prevent any vm_start() until
+it has successfully reactivated all devices.
+
+See also https://bugzilla.redhat.com/show_bug.cgi?id=2058982
+
+Signed-off-by: Eric Blake
+Reviewed-by: Juan Quintela
+Acked-by: Lukas Straub
+Tested-by: Lukas Straub
+Signed-off-by: Juan Quintela
+(cherry picked from commit 403d18ae384239876764bbfa111d6cc5dcb673d1)
+---
+ migration/migration.c | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+diff --git a/migration/migration.c b/migration/migration.c
+index 0885549de0..08e5e8f013 100644
+--- a/migration/migration.c
++++ b/migration/migration.c
+@@ -3256,13 +3256,11 @@ static void migration_completion(MigrationState *s)
+                                            MIGRATION_STATUS_DEVICE);
+         }
+         if (ret >= 0) {
++            s->block_inactive = inactivate;
+             qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
+             ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
+                                                      inactivate);
+         }
+-        if (inactivate && ret >= 0) {
+-            s->block_inactive = true;
+-        }
+     }
+     qemu_mutex_unlock_iothread();
+ 
+@@ -3321,6 +3319,7 @@ fail_invalidate:
+         bdrv_invalidate_cache_all(&local_err);
+         if (local_err) {
+             error_report_err(local_err);
++            s->block_inactive = true;
+         } else {
+             s->block_inactive = false;
+         }
+-- 
+2.39.1
+
diff --git a/kvm-migration-Minor-control-flow-simplification.patch b/kvm-migration-Minor-control-flow-simplification.patch
new file mode 100644
index 0000000..f1a142a
--- /dev/null
+++ b/kvm-migration-Minor-control-flow-simplification.patch
@@ -0,0 +1,53 @@
+From e79d0506184e861350d2a3e62dd986aa03d30aa8 Mon Sep 17 00:00:00 2001
+From: Eric Blake
+Date: Thu, 20 Apr 2023 09:35:51 -0500
+Subject: [PATCH 2/5] migration: Minor control flow simplification
+
+RH-Author: Eric Blake
+RH-MergeRequest: 273: migration: prevent source core dump if NFS dies mid-migration
+RH-Bugzilla: 2177957
+RH-Acked-by: Miroslav Rezanina
+RH-Acked-by: quintela1
+RH-Acked-by: Kevin Wolf
+RH-Commit: [2/3] f00b21b6ebd377af79af93ac18f103f8dc0309d6 (ebblake/qemu-kvm)
+
+No need to declare a temporary variable.
+
+Suggested-by: Juan Quintela
+Fixes: 1df36e8c6289 ("migration: Handle block device inactivation failures better")
+Signed-off-by: Eric Blake
+Reviewed-by: Juan Quintela
+Signed-off-by: Juan Quintela
+(cherry picked from commit 5d39f44d7ac5c63f53d4d0900ceba9521bc27e49)
+---
+ migration/migration.c | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+diff --git a/migration/migration.c b/migration/migration.c
+index 08e5e8f013..6ba8eb0fdf 100644
+--- a/migration/migration.c
++++ b/migration/migration.c
+@@ -3248,7 +3248,6 @@ static void migration_completion(MigrationState *s)
+         ret = global_state_store();
+ 
+         if (!ret) {
+-            bool inactivate = !migrate_colo_enabled();
+             ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+             trace_migration_completion_vm_stop(ret);
+             if (ret >= 0) {
+@@ -3256,10 +3255,10 @@ static void migration_completion(MigrationState *s)
+                                            MIGRATION_STATUS_DEVICE);
+         }
+         if (ret >= 0) {
+-            s->block_inactive = inactivate;
++            s->block_inactive = !migrate_colo_enabled();
+             qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
+             ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
+-                                                     inactivate);
++                                                     s->block_inactive);
+         }
+     }
+     qemu_mutex_unlock_iothread();
+-- 
+2.39.1
+
diff --git a/kvm-nbd-server-Request-TCP_NODELAY.patch b/kvm-nbd-server-Request-TCP_NODELAY.patch
new file mode 100644
index 0000000..26a3ca5
--- /dev/null
+++ b/kvm-nbd-server-Request-TCP_NODELAY.patch
@@ -0,0 +1,55 @@
+From 17c5524ada3f2ca9a9c645f540bedc5575302059 Mon Sep 17 00:00:00 2001
+From: Eric Blake
+Date: Mon, 3 Apr 2023 19:40:47 -0500
+Subject: [PATCH 5/5] nbd/server: Request TCP_NODELAY
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+RH-Author: Eric Blake
+RH-MergeRequest: 274: nbd: improve TLS performance of NBD server
+RH-Bugzilla: 2035712
+RH-Acked-by: Miroslav Rezanina
+RH-Acked-by: Kevin Wolf
+RH-Acked-by: Stefano Garzarella
+RH-Commit: [2/2] 092145077756cda2a4f849c5911031b0fc4a2134 (ebblake/qemu-kvm)
+
+Nagle's algorithm adds latency in order to reduce network packet
+overhead on small packets. But when we are already using corking to
+merge smaller packets into transactional requests, the extra delay
+from TCP defaults just gets in the way (see recent commit bd2cd4a4).
+
+For reference, qemu as an NBD client already requests TCP_NODELAY (see
+nbd_connect() in nbd/client-connection.c); as does libnbd as a client
+[1], and nbdkit as a server [2]. Furthermore, the NBD spec recommends
+the use of TCP_NODELAY [3].
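+
+For illustration only (not part of the change itself): on a plain TCP
+socket, requesting TCP_NODELAY is a one-line setsockopt(); for socket
+channels, qemu gets the same effect through qio_channel_set_delay(...,
+false), as in the hunk below.
+
+    int fd = socket(AF_INET, SOCK_STREAM, 0);   /* illustrative fd */
+    int one = 1;
+    /* Disable Nagle's algorithm: transmit small writes immediately. */
+    setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));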
+
+[1] https://gitlab.com/nbdkit/libnbd/-/blob/a48a1142/generator/states-connect.c#L39
+[2] https://gitlab.com/nbdkit/nbdkit/-/blob/45b72f5b/server/sockets.c#L430
+[3] https://github.com/NetworkBlockDevice/nbd/blob/master/doc/proto.md#protocol-phases
+
+CC: Florian Westphal
+Signed-off-by: Eric Blake
+Message-Id: <20230404004047.142086-1-eblake@redhat.com>
+Reviewed-by: Philippe Mathieu-Daudé
+(cherry picked from commit f1426881a827a6d3f31b65616c4a8db1e9e7c45e)
+Signed-off-by: Eric Blake
+---
+ nbd/server.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/nbd/server.c b/nbd/server.c
+index a5edc7f681..6db124cf53 100644
+--- a/nbd/server.c
++++ b/nbd/server.c
+@@ -2738,6 +2738,7 @@ void nbd_client_new(QIOChannelSocket *sioc,
+     }
+     client->tlsauthz = g_strdup(tlsauthz);
+     client->sioc = sioc;
++    qio_channel_set_delay(QIO_CHANNEL(sioc), false);
+     object_ref(OBJECT(client->sioc));
+     client->ioc = QIO_CHANNEL(sioc);
+     object_ref(OBJECT(client->ioc));
+-- 
+2.39.1
+
diff --git a/kvm-nbd-server-push-pending-frames-after-sending-reply.patch b/kvm-nbd-server-push-pending-frames-after-sending-reply.patch
new file mode 100644
index 0000000..4f241a8
--- /dev/null
+++ b/kvm-nbd-server-push-pending-frames-after-sending-reply.patch
@@ -0,0 +1,72 @@
+From 170872370c6f3c916e741eb32d80431995d7a870 Mon Sep 17 00:00:00 2001
+From: Florian Westphal
+Date: Fri, 24 Mar 2023 11:47:20 +0100
+Subject: [PATCH 4/5] nbd/server: push pending frames after sending reply
+
+RH-Author: Eric Blake
+RH-MergeRequest: 274: nbd: improve TLS performance of NBD server
+RH-Bugzilla: 2035712
+RH-Acked-by: Miroslav Rezanina
+RH-Acked-by: Kevin Wolf
+RH-Acked-by: Stefano Garzarella
+RH-Commit: [1/2] ab92c06c48810aa40380de0433dcac4c6e4be9a5 (ebblake/qemu-kvm)
+
+qemu-nbd doesn't set TCP_NODELAY on the tcp socket.
+
+Kernel waits for more data and avoids transmission of small packets.
+Without TLS this is barely noticeable, but with TLS this really shows.
+
+Booting a VM via qemu-nbd on localhost (with tls) takes more than
+2 minutes on my system. tcpdump shows frequent wait periods, where no
+packets get sent for a 40ms period.
+
+Add explicit (un)corking when processing (and responding to) requests.
+"TCP_CORK, &zero" after earlier "CORK, &one" will flush pending data.
+
+VM Boot time:
+main: no tls: 23s, with tls: 2m45s
+patched: no tls: 14s, with tls: 15s
+
+VM Boot time, qemu-nbd via network (same lan):
+main: no tls: 18s, with tls: 1m50s
+patched: no tls: 17s, with tls: 18s
+
+Future optimization: if we could detect if there is another pending
+request we could defer the uncork operation because more data would be
+appended.
+
+Signed-off-by: Florian Westphal
+Message-Id: <20230324104720.2498-1-fw@strlen.de>
+Reviewed-by: Eric Blake
+Reviewed-by: Kevin Wolf
+Signed-off-by: Kevin Wolf
+(cherry picked from commit bd2cd4a441ded163b62371790876f28a9b834317)
+Signed-off-by: Eric Blake
+---
+ nbd/server.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/nbd/server.c b/nbd/server.c
+index 4630dd7322..a5edc7f681 100644
+--- a/nbd/server.c
++++ b/nbd/server.c
+@@ -2647,6 +2647,8 @@ static coroutine_fn void nbd_trip(void *opaque)
+         goto disconnect;
+     }
+ 
++    qio_channel_set_cork(client->ioc, true);
++
+     if (ret < 0) {
+         /* It wans't -EIO, so, according to nbd_co_receive_request()
+          * semantics, we should return the error to the client. */
+@@ -2672,6 +2674,7 @@ static coroutine_fn void nbd_trip(void *opaque)
+         goto disconnect;
+     }
+ 
++    qio_channel_set_cork(client->ioc, false);
+ done:
+     nbd_request_put(req);
+     nbd_client_put(client);
+-- 
+2.39.1
+
diff --git a/qemu-kvm.spec b/qemu-kvm.spec
index 75e3f75..99a8345 100644
--- a/qemu-kvm.spec
+++ b/qemu-kvm.spec
@@ -83,7 +83,7 @@ Obsoletes: %1-rhev <= %{epoch}:%{version}-%{release}
 Summary: QEMU is a machine emulator and virtualizer
 Name: qemu-kvm
 Version: 6.2.0
-Release: 33%{?rcrel}%{?dist}
+Release: 34%{?rcrel}%{?dist}
 # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
 Epoch: 15
 License: GPLv2 and GPLv2+ and CC-BY
@@ -654,6 +654,16 @@ Patch256: kvm-dma-helpers-prevent-dma_blk_cb-vs-dma_aio_cancel-rac.patch
 # For bz#2187159 - RHEL8.8 - KVM - Secure Guest crashed during booting with 248 vcpus
 Patch257: kvm-virtio-scsi-reset-SCSI-devices-from-main-loop-thread.patch
 Patch258: kvm-s390x-pv-Implement-a-CGS-check-helper.patch
+# For bz#2177957 - Qemu core dump if cut off nfs storage during migration
+Patch259: kvm-migration-Handle-block-device-inactivation-failures-.patch
+# For bz#2177957 - Qemu core dump if cut off nfs storage during migration
+Patch260: kvm-migration-Minor-control-flow-simplification.patch
+# For bz#2177957 - Qemu core dump if cut off nfs storage during migration
+Patch261: kvm-migration-Attempt-disk-reactivation-in-more-failure-.patch
+# For bz#2035712 - [qemu] Booting from Guest Image over NBD with TLS Is Slow
+Patch262: kvm-nbd-server-push-pending-frames-after-sending-reply.patch
+# For bz#2035712 - [qemu] Booting from Guest Image over NBD with TLS Is Slow
+Patch263: kvm-nbd-server-Request-TCP_NODELAY.patch
 
 BuildRequires: wget
 BuildRequires: rpm-build
@@ -1823,6 +1833,17 @@ sh %{_sysconfdir}/sysconfig/modules/kvm.modules &> /dev/null || :
 
 %changelog
+* Fri May 19 2023 Miroslav Rezanina - 6.2.0-34
+- kvm-migration-Handle-block-device-inactivation-failures-.patch [bz#2177957]
+- kvm-migration-Minor-control-flow-simplification.patch [bz#2177957]
+- kvm-migration-Attempt-disk-reactivation-in-more-failure-.patch [bz#2177957]
+- kvm-nbd-server-push-pending-frames-after-sending-reply.patch [bz#2035712]
+- kvm-nbd-server-Request-TCP_NODELAY.patch [bz#2035712]
+- Resolves: bz#2177957
+  (Qemu core dump if cut off nfs storage during migration)
+- Resolves: bz#2035712
+  ([qemu] Booting from Guest Image over NBD with TLS Is Slow)
+
 * Tue Apr 25 2023 Miroslav Rezanina - 6.2.0-33
 - kvm-s390x-pv-Implement-a-CGS-check-helper.patch [bz#2187159]
 - Resolves: bz#2187159