From 0552c42c392f9ef0d01b98e248d64a86401c0e59 Mon Sep 17 00:00:00 2001 From: Miroslav Rezanina Date: Mon, 13 Jun 2022 00:34:04 -0400 Subject: [PATCH] * Mon Jun 13 2022 Miroslav Rezanina - 7.0.0-6 - kvm-Introduce-event-loop-base-abstract-class.patch [bz#2031024] - kvm-util-main-loop-Introduce-the-main-loop-into-QOM.patch [bz#2031024] - kvm-util-event-loop-base-Introduce-options-to-set-the-th.patch [bz#2031024] - kvm-qcow2-Improve-refcount-structure-rebuilding.patch [bz#2072379] - kvm-iotests-108-Test-new-refcount-rebuild-algorithm.patch [bz#2072379] - kvm-qcow2-Add-errp-to-rebuild_refcount_structure.patch [bz#2072379] - kvm-iotests-108-Fix-when-missing-user_allow_other.patch [bz#2072379] - kvm-virtio-net-setup-vhost_dev-and-notifiers-for-cvq-onl.patch [bz#2070804] - kvm-virtio-net-align-ctrl_vq-index-for-non-mq-guest-for-.patch [bz#2070804] - kvm-vhost-vdpa-fix-improper-cleanup-in-net_init_vhost_vd.patch [bz#2070804] - kvm-vhost-net-fix-improper-cleanup-in-vhost_net_start.patch [bz#2070804] - kvm-vhost-vdpa-backend-feature-should-set-only-once.patch [bz#2070804] - kvm-vhost-vdpa-change-name-and-polarity-for-vhost_vdpa_o.patch [bz#2070804] - kvm-virtio-net-don-t-handle-mq-request-in-userspace-hand.patch [bz#2070804] - kvm-Revert-globally-limit-the-maximum-number-of-CPUs.patch [bz#2094270] - kvm-vfio-common-remove-spurious-warning-on-vfio_listener.patch [bz#2086262] - Resolves: bz#2031024 (Add support for fixing thread pool size [QEMU]) - Resolves: bz#2072379 (Fail to rebuild the reference count tables of qcow2 image on host block devices (e.g. LVs)) - Resolves: bz#2070804 (PXE boot crash qemu when using multiqueue vDPA) - Resolves: bz#2094270 (Do not set the hard vCPU limit to the soft vCPU limit in downstream qemu-kvm anymore) - Resolves: bz#2086262 ([Win11][tpm]vfio_listener_region_del received unaligned region) --- ...oduce-event-loop-base-abstract-class.patch | 503 ++++++++++++++++++ ...lly-limit-the-maximum-number-of-CPUs.patch | 58 ++ ...08-Fix-when-missing-user_allow_other.patch | 52 ++ ...-Test-new-refcount-rebuild-algorithm.patch | 445 ++++++++++++++++ ...d-errp-to-rebuild_refcount_structure.patch | 162 ++++++ ...mprove-refcount-structure-rebuilding.patch | 465 ++++++++++++++++ ...base-Introduce-options-to-set-the-th.patch | 385 ++++++++++++++ ...oop-Introduce-the-main-loop-into-QOM.patch | 233 ++++++++ ...ve-spurious-warning-on-vfio_listener.patch | 78 +++ ...-improper-cleanup-in-vhost_net_start.patch | 56 ++ ...backend-feature-should-set-only-once.patch | 58 ++ ...e-name-and-polarity-for-vhost_vdpa_o.patch | 123 +++++ ...mproper-cleanup-in-net_init_vhost_vd.patch | 48 ++ ...-ctrl_vq-index-for-non-mq-guest-for-.patch | 143 +++++ ...-handle-mq-request-in-userspace-hand.patch | 109 ++++ ...-vhost_dev-and-notifiers-for-cvq-onl.patch | 52 ++ qemu-kvm.spec | 62 ++- 17 files changed, 3031 insertions(+), 1 deletion(-) create mode 100644 kvm-Introduce-event-loop-base-abstract-class.patch create mode 100644 kvm-Revert-globally-limit-the-maximum-number-of-CPUs.patch create mode 100644 kvm-iotests-108-Fix-when-missing-user_allow_other.patch create mode 100644 kvm-iotests-108-Test-new-refcount-rebuild-algorithm.patch create mode 100644 kvm-qcow2-Add-errp-to-rebuild_refcount_structure.patch create mode 100644 kvm-qcow2-Improve-refcount-structure-rebuilding.patch create mode 100644 kvm-util-event-loop-base-Introduce-options-to-set-the-th.patch create mode 100644 kvm-util-main-loop-Introduce-the-main-loop-into-QOM.patch create mode 100644 kvm-vfio-common-remove-spurious-warning-on-vfio_listener.patch create mode 100644 kvm-vhost-net-fix-improper-cleanup-in-vhost_net_start.patch create mode 100644 kvm-vhost-vdpa-backend-feature-should-set-only-once.patch create mode 100644 kvm-vhost-vdpa-change-name-and-polarity-for-vhost_vdpa_o.patch create mode 100644 kvm-vhost-vdpa-fix-improper-cleanup-in-net_init_vhost_vd.patch create mode 100644 kvm-virtio-net-align-ctrl_vq-index-for-non-mq-guest-for-.patch create mode 100644 kvm-virtio-net-don-t-handle-mq-request-in-userspace-hand.patch create mode 100644 kvm-virtio-net-setup-vhost_dev-and-notifiers-for-cvq-onl.patch diff --git a/kvm-Introduce-event-loop-base-abstract-class.patch b/kvm-Introduce-event-loop-base-abstract-class.patch new file mode 100644 index 0000000..9f987ea --- /dev/null +++ b/kvm-Introduce-event-loop-base-abstract-class.patch @@ -0,0 +1,503 @@ +From 1163da281c178359dd7e1cf1ced5c98caa600f8e Mon Sep 17 00:00:00 2001 +From: Nicolas Saenz Julienne +Date: Mon, 25 Apr 2022 09:57:21 +0200 +Subject: [PATCH 01/16] Introduce event-loop-base abstract class + +RH-Author: Nicolas Saenz Julienne +RH-MergeRequest: 93: util/thread-pool: Expose minimum and maximum size +RH-Commit: [1/3] 5817205d8f56cc4aa98bd5963ecac54a59bad990 +RH-Bugzilla: 2031024 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi + +Introduce the 'event-loop-base' abstract class, it'll hold the +properties common to all event loops and provide the necessary hooks for +their creation and maintenance. Then have iothread inherit from it. + +EventLoopBaseClass is defined as user creatable and provides a hook for +its children to attach themselves to the user creatable class 'complete' +function. It also provides an update_params() callback to propagate +property changes onto its children. + +The new 'event-loop-base' class will live in the root directory. It is +built on its own using the 'link_whole' option (there are no direct +function dependencies between the class and its children, it all happens +trough 'constructor' magic). And also imposes new compilation +dependencies: + + qom <- event-loop-base <- blockdev (iothread.c) + +And in subsequent patches: + + qom <- event-loop-base <- qemuutil (util/main-loop.c) + +All this forced some amount of reordering in meson.build: + + - Moved qom build definition before qemuutil. Doing it the other way + around (i.e. moving qemuutil after qom) isn't possible as a lot of + core libraries that live in between the two depend on it. + + - Process the 'hw' subdir earlier, as it introduces files into the + 'qom' source set. + +No functional changes intended. + +Signed-off-by: Nicolas Saenz Julienne +Reviewed-by: Stefan Hajnoczi +Acked-by: Markus Armbruster +Message-id: 20220425075723.20019-2-nsaenzju@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 7d5983e3c8c40b1d0668faba31d79905c4fadd7d) +--- + event-loop-base.c | 104 +++++++++++++++++++++++++++++++ + include/sysemu/event-loop-base.h | 36 +++++++++++ + include/sysemu/iothread.h | 6 +- + iothread.c | 65 ++++++------------- + meson.build | 23 ++++--- + qapi/qom.json | 22 +++++-- + 6 files changed, 192 insertions(+), 64 deletions(-) + create mode 100644 event-loop-base.c + create mode 100644 include/sysemu/event-loop-base.h + +diff --git a/event-loop-base.c b/event-loop-base.c +new file mode 100644 +index 0000000000..a924c73a7c +--- /dev/null ++++ b/event-loop-base.c +@@ -0,0 +1,104 @@ ++/* ++ * QEMU event-loop base ++ * ++ * Copyright (C) 2022 Red Hat Inc ++ * ++ * Authors: ++ * Stefan Hajnoczi ++ * Nicolas Saenz Julienne ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or later. ++ * See the COPYING file in the top-level directory. ++ */ ++ ++#include "qemu/osdep.h" ++#include "qom/object_interfaces.h" ++#include "qapi/error.h" ++#include "sysemu/event-loop-base.h" ++ ++typedef struct { ++ const char *name; ++ ptrdiff_t offset; /* field's byte offset in EventLoopBase struct */ ++} EventLoopBaseParamInfo; ++ ++static EventLoopBaseParamInfo aio_max_batch_info = { ++ "aio-max-batch", offsetof(EventLoopBase, aio_max_batch), ++}; ++ ++static void event_loop_base_get_param(Object *obj, Visitor *v, ++ const char *name, void *opaque, Error **errp) ++{ ++ EventLoopBase *event_loop_base = EVENT_LOOP_BASE(obj); ++ EventLoopBaseParamInfo *info = opaque; ++ int64_t *field = (void *)event_loop_base + info->offset; ++ ++ visit_type_int64(v, name, field, errp); ++} ++ ++static void event_loop_base_set_param(Object *obj, Visitor *v, ++ const char *name, void *opaque, Error **errp) ++{ ++ EventLoopBaseClass *bc = EVENT_LOOP_BASE_GET_CLASS(obj); ++ EventLoopBase *base = EVENT_LOOP_BASE(obj); ++ EventLoopBaseParamInfo *info = opaque; ++ int64_t *field = (void *)base + info->offset; ++ int64_t value; ++ ++ if (!visit_type_int64(v, name, &value, errp)) { ++ return; ++ } ++ ++ if (value < 0) { ++ error_setg(errp, "%s value must be in range [0, %" PRId64 "]", ++ info->name, INT64_MAX); ++ return; ++ } ++ ++ *field = value; ++ ++ if (bc->update_params) { ++ bc->update_params(base, errp); ++ } ++ ++ return; ++} ++ ++static void event_loop_base_complete(UserCreatable *uc, Error **errp) ++{ ++ EventLoopBaseClass *bc = EVENT_LOOP_BASE_GET_CLASS(uc); ++ EventLoopBase *base = EVENT_LOOP_BASE(uc); ++ ++ if (bc->init) { ++ bc->init(base, errp); ++ } ++} ++ ++static void event_loop_base_class_init(ObjectClass *klass, void *class_data) ++{ ++ UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass); ++ ucc->complete = event_loop_base_complete; ++ ++ object_class_property_add(klass, "aio-max-batch", "int", ++ event_loop_base_get_param, ++ event_loop_base_set_param, ++ NULL, &aio_max_batch_info); ++} ++ ++static const TypeInfo event_loop_base_info = { ++ .name = TYPE_EVENT_LOOP_BASE, ++ .parent = TYPE_OBJECT, ++ .instance_size = sizeof(EventLoopBase), ++ .class_size = sizeof(EventLoopBaseClass), ++ .class_init = event_loop_base_class_init, ++ .abstract = true, ++ .interfaces = (InterfaceInfo[]) { ++ { TYPE_USER_CREATABLE }, ++ { } ++ } ++}; ++ ++static void register_types(void) ++{ ++ type_register_static(&event_loop_base_info); ++} ++type_init(register_types); +diff --git a/include/sysemu/event-loop-base.h b/include/sysemu/event-loop-base.h +new file mode 100644 +index 0000000000..8e77d8b69f +--- /dev/null ++++ b/include/sysemu/event-loop-base.h +@@ -0,0 +1,36 @@ ++/* ++ * QEMU event-loop backend ++ * ++ * Copyright (C) 2022 Red Hat Inc ++ * ++ * Authors: ++ * Nicolas Saenz Julienne ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or later. ++ * See the COPYING file in the top-level directory. ++ */ ++#ifndef QEMU_EVENT_LOOP_BASE_H ++#define QEMU_EVENT_LOOP_BASE_H ++ ++#include "qom/object.h" ++#include "block/aio.h" ++#include "qemu/typedefs.h" ++ ++#define TYPE_EVENT_LOOP_BASE "event-loop-base" ++OBJECT_DECLARE_TYPE(EventLoopBase, EventLoopBaseClass, ++ EVENT_LOOP_BASE) ++ ++struct EventLoopBaseClass { ++ ObjectClass parent_class; ++ ++ void (*init)(EventLoopBase *base, Error **errp); ++ void (*update_params)(EventLoopBase *base, Error **errp); ++}; ++ ++struct EventLoopBase { ++ Object parent; ++ ++ /* AioContext AIO engine parameters */ ++ int64_t aio_max_batch; ++}; ++#endif +diff --git a/include/sysemu/iothread.h b/include/sysemu/iothread.h +index 7f714bd136..8f8601d6ab 100644 +--- a/include/sysemu/iothread.h ++++ b/include/sysemu/iothread.h +@@ -17,11 +17,12 @@ + #include "block/aio.h" + #include "qemu/thread.h" + #include "qom/object.h" ++#include "sysemu/event-loop-base.h" + + #define TYPE_IOTHREAD "iothread" + + struct IOThread { +- Object parent_obj; ++ EventLoopBase parent_obj; + + QemuThread thread; + AioContext *ctx; +@@ -37,9 +38,6 @@ struct IOThread { + int64_t poll_max_ns; + int64_t poll_grow; + int64_t poll_shrink; +- +- /* AioContext AIO engine parameters */ +- int64_t aio_max_batch; + }; + typedef struct IOThread IOThread; + +diff --git a/iothread.c b/iothread.c +index 0f98af0f2a..8fa2f3bfb8 100644 +--- a/iothread.c ++++ b/iothread.c +@@ -17,6 +17,7 @@ + #include "qemu/module.h" + #include "block/aio.h" + #include "block/block.h" ++#include "sysemu/event-loop-base.h" + #include "sysemu/iothread.h" + #include "qapi/error.h" + #include "qapi/qapi-commands-misc.h" +@@ -152,10 +153,15 @@ static void iothread_init_gcontext(IOThread *iothread) + iothread->main_loop = g_main_loop_new(iothread->worker_context, TRUE); + } + +-static void iothread_set_aio_context_params(IOThread *iothread, Error **errp) ++static void iothread_set_aio_context_params(EventLoopBase *base, Error **errp) + { ++ IOThread *iothread = IOTHREAD(base); + ERRP_GUARD(); + ++ if (!iothread->ctx) { ++ return; ++ } ++ + aio_context_set_poll_params(iothread->ctx, + iothread->poll_max_ns, + iothread->poll_grow, +@@ -166,14 +172,15 @@ static void iothread_set_aio_context_params(IOThread *iothread, Error **errp) + } + + aio_context_set_aio_params(iothread->ctx, +- iothread->aio_max_batch, ++ iothread->parent_obj.aio_max_batch, + errp); + } + +-static void iothread_complete(UserCreatable *obj, Error **errp) ++ ++static void iothread_init(EventLoopBase *base, Error **errp) + { + Error *local_error = NULL; +- IOThread *iothread = IOTHREAD(obj); ++ IOThread *iothread = IOTHREAD(base); + char *thread_name; + + iothread->stopping = false; +@@ -189,7 +196,7 @@ static void iothread_complete(UserCreatable *obj, Error **errp) + */ + iothread_init_gcontext(iothread); + +- iothread_set_aio_context_params(iothread, &local_error); ++ iothread_set_aio_context_params(base, &local_error); + if (local_error) { + error_propagate(errp, local_error); + aio_context_unref(iothread->ctx); +@@ -201,7 +208,7 @@ static void iothread_complete(UserCreatable *obj, Error **errp) + * to inherit. + */ + thread_name = g_strdup_printf("IO %s", +- object_get_canonical_path_component(OBJECT(obj))); ++ object_get_canonical_path_component(OBJECT(base))); + qemu_thread_create(&iothread->thread, thread_name, iothread_run, + iothread, QEMU_THREAD_JOINABLE); + g_free(thread_name); +@@ -226,9 +233,6 @@ static IOThreadParamInfo poll_grow_info = { + static IOThreadParamInfo poll_shrink_info = { + "poll-shrink", offsetof(IOThread, poll_shrink), + }; +-static IOThreadParamInfo aio_max_batch_info = { +- "aio-max-batch", offsetof(IOThread, aio_max_batch), +-}; + + static void iothread_get_param(Object *obj, Visitor *v, + const char *name, IOThreadParamInfo *info, Error **errp) +@@ -288,35 +292,12 @@ static void iothread_set_poll_param(Object *obj, Visitor *v, + } + } + +-static void iothread_get_aio_param(Object *obj, Visitor *v, +- const char *name, void *opaque, Error **errp) +-{ +- IOThreadParamInfo *info = opaque; +- +- iothread_get_param(obj, v, name, info, errp); +-} +- +-static void iothread_set_aio_param(Object *obj, Visitor *v, +- const char *name, void *opaque, Error **errp) +-{ +- IOThread *iothread = IOTHREAD(obj); +- IOThreadParamInfo *info = opaque; +- +- if (!iothread_set_param(obj, v, name, info, errp)) { +- return; +- } +- +- if (iothread->ctx) { +- aio_context_set_aio_params(iothread->ctx, +- iothread->aio_max_batch, +- errp); +- } +-} +- + static void iothread_class_init(ObjectClass *klass, void *class_data) + { +- UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass); +- ucc->complete = iothread_complete; ++ EventLoopBaseClass *bc = EVENT_LOOP_BASE_CLASS(klass); ++ ++ bc->init = iothread_init; ++ bc->update_params = iothread_set_aio_context_params; + + object_class_property_add(klass, "poll-max-ns", "int", + iothread_get_poll_param, +@@ -330,23 +311,15 @@ static void iothread_class_init(ObjectClass *klass, void *class_data) + iothread_get_poll_param, + iothread_set_poll_param, + NULL, &poll_shrink_info); +- object_class_property_add(klass, "aio-max-batch", "int", +- iothread_get_aio_param, +- iothread_set_aio_param, +- NULL, &aio_max_batch_info); + } + + static const TypeInfo iothread_info = { + .name = TYPE_IOTHREAD, +- .parent = TYPE_OBJECT, ++ .parent = TYPE_EVENT_LOOP_BASE, + .class_init = iothread_class_init, + .instance_size = sizeof(IOThread), + .instance_init = iothread_instance_init, + .instance_finalize = iothread_instance_finalize, +- .interfaces = (InterfaceInfo[]) { +- {TYPE_USER_CREATABLE}, +- {} +- }, + }; + + static void iothread_register_types(void) +@@ -383,7 +356,7 @@ static int query_one_iothread(Object *object, void *opaque) + info->poll_max_ns = iothread->poll_max_ns; + info->poll_grow = iothread->poll_grow; + info->poll_shrink = iothread->poll_shrink; +- info->aio_max_batch = iothread->aio_max_batch; ++ info->aio_max_batch = iothread->parent_obj.aio_max_batch; + + QAPI_LIST_APPEND(*tail, info); + return 0; +diff --git a/meson.build b/meson.build +index 6f7e430f0f..b9c919a55e 100644 +--- a/meson.build ++++ b/meson.build +@@ -2804,6 +2804,7 @@ subdir('qom') + subdir('authz') + subdir('crypto') + subdir('ui') ++subdir('hw') + + + if enable_modules +@@ -2811,6 +2812,18 @@ if enable_modules + modulecommon = declare_dependency(link_whole: libmodulecommon, compile_args: '-DBUILD_DSO') + endif + ++qom_ss = qom_ss.apply(config_host, strict: false) ++libqom = static_library('qom', qom_ss.sources() + genh, ++ dependencies: [qom_ss.dependencies()], ++ name_suffix: 'fa') ++qom = declare_dependency(link_whole: libqom) ++ ++event_loop_base = files('event-loop-base.c') ++event_loop_base = static_library('event-loop-base', sources: event_loop_base + genh, ++ build_by_default: true) ++event_loop_base = declare_dependency(link_whole: event_loop_base, ++ dependencies: [qom]) ++ + stub_ss = stub_ss.apply(config_all, strict: false) + + util_ss.add_all(trace_ss) +@@ -2897,7 +2910,6 @@ subdir('monitor') + subdir('net') + subdir('replay') + subdir('semihosting') +-subdir('hw') + subdir('tcg') + subdir('fpu') + subdir('accel') +@@ -3022,13 +3034,6 @@ qemu_syms = custom_target('qemu.syms', output: 'qemu.syms', + capture: true, + command: [undefsym, nm, '@INPUT@']) + +-qom_ss = qom_ss.apply(config_host, strict: false) +-libqom = static_library('qom', qom_ss.sources() + genh, +- dependencies: [qom_ss.dependencies()], +- name_suffix: 'fa') +- +-qom = declare_dependency(link_whole: libqom) +- + authz_ss = authz_ss.apply(config_host, strict: false) + libauthz = static_library('authz', authz_ss.sources() + genh, + dependencies: [authz_ss.dependencies()], +@@ -3081,7 +3086,7 @@ libblockdev = static_library('blockdev', blockdev_ss.sources() + genh, + build_by_default: false) + + blockdev = declare_dependency(link_whole: [libblockdev], +- dependencies: [block]) ++ dependencies: [block, event_loop_base]) + + qmp_ss = qmp_ss.apply(config_host, strict: false) + libqmp = static_library('qmp', qmp_ss.sources() + genh, +diff --git a/qapi/qom.json b/qapi/qom.json +index eeb5395ff3..a2439533c5 100644 +--- a/qapi/qom.json ++++ b/qapi/qom.json +@@ -499,6 +499,20 @@ + '*repeat': 'bool', + '*grab-toggle': 'GrabToggleKeys' } } + ++## ++# @EventLoopBaseProperties: ++# ++# Common properties for event loops ++# ++# @aio-max-batch: maximum number of requests in a batch for the AIO engine, ++# 0 means that the engine will use its default. ++# (default: 0) ++# ++# Since: 7.1 ++## ++{ 'struct': 'EventLoopBaseProperties', ++ 'data': { '*aio-max-batch': 'int' } } ++ + ## + # @IothreadProperties: + # +@@ -516,17 +530,15 @@ + # algorithm detects it is spending too long polling without + # encountering events. 0 selects a default behaviour (default: 0) + # +-# @aio-max-batch: maximum number of requests in a batch for the AIO engine, +-# 0 means that the engine will use its default +-# (default:0, since 6.1) ++# The @aio-max-batch option is available since 6.1. + # + # Since: 2.0 + ## + { 'struct': 'IothreadProperties', ++ 'base': 'EventLoopBaseProperties', + 'data': { '*poll-max-ns': 'int', + '*poll-grow': 'int', +- '*poll-shrink': 'int', +- '*aio-max-batch': 'int' } } ++ '*poll-shrink': 'int' } } + + ## + # @MemoryBackendProperties: +-- +2.31.1 + diff --git a/kvm-Revert-globally-limit-the-maximum-number-of-CPUs.patch b/kvm-Revert-globally-limit-the-maximum-number-of-CPUs.patch new file mode 100644 index 0000000..7740d0b --- /dev/null +++ b/kvm-Revert-globally-limit-the-maximum-number-of-CPUs.patch @@ -0,0 +1,58 @@ +From 5ab8613582fd56b847fe75750acb5b7255900b35 Mon Sep 17 00:00:00 2001 +From: Vitaly Kuznetsov +Date: Thu, 9 Jun 2022 11:55:15 +0200 +Subject: [PATCH 15/16] Revert "globally limit the maximum number of CPUs" + +RH-Author: Vitaly Kuznetsov +RH-MergeRequest: 99: Revert "globally limit the maximum number of CPUs" +RH-Commit: [1/1] 13100d4a2209b2190a3654c1f9cf4ebade1e8d24 (vkuznets/qemu-kvm-c9s) +RH-Bugzilla: 2094270 +RH-Acked-by: Andrew Jones +RH-Acked-by: Cornelia Huck +RH-Acked-by: Thomas Huth + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2094270 +Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=45871149 +Upstream Status: RHEL-only +Tested: with upstream kernel + +Downstream QEMU carries a patch that sets the hard limit of possible vCPUs +to the value that the KVM code of the kernel recommends as soft limit. +Upstream KVM code has been changed recently to not use an arbitrary soft +limit anymore, but to cap the value on the amount of available physical +CPUs of the host. This defeats the purpose of the downstream change in +QEMU completely. Drop the downstream-only patch to allow CPU overcommit. + +This reverts commit 6669f6fa677d43144f39d6ad59725b7ba622f1c2. + +Signed-off-by: Vitaly Kuznetsov +--- + accel/kvm/kvm-all.c | 12 ------------ + 1 file changed, 12 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index fdf0e4d429..5f1377ca04 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -2430,18 +2430,6 @@ static int kvm_init(MachineState *ms) + soft_vcpus_limit = kvm_recommended_vcpus(s); + hard_vcpus_limit = kvm_max_vcpus(s); + +-#ifdef HOST_PPC64 +- /* +- * On POWER, the kernel advertises a soft limit based on the +- * number of CPU threads on the host. We want to allow exceeding +- * this for testing purposes, so we don't want to set hard limit +- * to soft limit as on x86. +- */ +-#else +- /* RHEL doesn't support nr_vcpus > soft_vcpus_limit */ +- hard_vcpus_limit = soft_vcpus_limit; +-#endif +- + while (nc->name) { + if (nc->num > soft_vcpus_limit) { + warn_report("Number of %s cpus requested (%d) exceeds " +-- +2.31.1 + diff --git a/kvm-iotests-108-Fix-when-missing-user_allow_other.patch b/kvm-iotests-108-Fix-when-missing-user_allow_other.patch new file mode 100644 index 0000000..a37ea6f --- /dev/null +++ b/kvm-iotests-108-Fix-when-missing-user_allow_other.patch @@ -0,0 +1,52 @@ +From 447bca651c9156d7aba6b7495c75f19b5e4ed53f Mon Sep 17 00:00:00 2001 +From: Hanna Reitz +Date: Thu, 21 Apr 2022 16:24:35 +0200 +Subject: [PATCH 07/16] iotests/108: Fix when missing user_allow_other + +RH-Author: Hanna Reitz +RH-MergeRequest: 96: qcow2: Improve refcount structure rebuilding +RH-Commit: [4/4] a51ab8606fc9d8dea2b6539f4e795d5813892a5c (hreitz/qemu-kvm-c-9-s) +RH-Bugzilla: 2072379 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +FUSE exports' allow-other option defaults to "auto", which means that it +will try passing allow_other as a mount option, and fall back to not +using it when an error occurs. We make no effort to hide fusermount's +error message (because it would be difficult, and because users might +want to know about the fallback occurring), and so when allow_other does +not work (primarily when /etc/fuse.conf does not contain +user_allow_other), this error message will appear and break the +reference output. + +We do not need allow_other here, though, so we can just pass +allow-other=off to fix that. + +Reported-by: Markus Armbruster +Signed-off-by: Hanna Reitz +Message-Id: <20220421142435.569600-1-hreitz@redhat.com> +Tested-by: Markus Armbruster +Tested-by: Eric Blake +(cherry picked from commit 348a0740afc5b313599533eb69bbb2b95d2f1bba) +Signed-off-by: Hanna Reitz +--- + tests/qemu-iotests/108 | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tests/qemu-iotests/108 b/tests/qemu-iotests/108 +index a3090e2875..4681c7c769 100755 +--- a/tests/qemu-iotests/108 ++++ b/tests/qemu-iotests/108 +@@ -326,7 +326,7 @@ else + + $QSD \ + --blockdev file,node-name=export-node,filename="$TEST_IMG" \ +- --export fuse,id=fuse-export,node-name=export-node,mountpoint="$export_mp",writable=on,growable=off \ ++ --export fuse,id=fuse-export,node-name=export-node,mountpoint="$export_mp",writable=on,growable=off,allow-other=off \ + --pidfile "$TEST_DIR/qsd.pid" \ + & + +-- +2.31.1 + diff --git a/kvm-iotests-108-Test-new-refcount-rebuild-algorithm.patch b/kvm-iotests-108-Test-new-refcount-rebuild-algorithm.patch new file mode 100644 index 0000000..7a968f6 --- /dev/null +++ b/kvm-iotests-108-Test-new-refcount-rebuild-algorithm.patch @@ -0,0 +1,445 @@ +From ed69e01352b5e9a06173daab53bfa373c8535732 Mon Sep 17 00:00:00 2001 +From: Hanna Reitz +Date: Tue, 5 Apr 2022 15:46:51 +0200 +Subject: [PATCH 05/16] iotests/108: Test new refcount rebuild algorithm + +RH-Author: Hanna Reitz +RH-MergeRequest: 96: qcow2: Improve refcount structure rebuilding +RH-Commit: [2/4] b68310a9fee8465dd3f568c8e867e1b7ae52bdaf (hreitz/qemu-kvm-c-9-s) +RH-Bugzilla: 2072379 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +One clear problem with how qcow2's refcount structure rebuild algorithm +used to be before "qcow2: Improve refcount structure rebuilding" was +that it is prone to failure for qcow2 images on block devices: There is +generally unused space after the actual image, and if that exceeds what +one refblock covers, the old algorithm would invariably write the +reftable past the block device's end, which cannot work. The new +algorithm does not have this problem. + +Test it with three tests: +(1) Create an image with more empty space at the end than what one + refblock covers, see whether rebuilding the refcount structures + results in a change in the image file length. (It should not.) + +(2) Leave precisely enough space somewhere at the beginning of the image + for the new reftable (and the refblock for that place), see whether + the new algorithm puts the reftable there. (It should.) + +(3) Test the original problem: Create (something like) a block device + with a fixed size, then create a qcow2 image in there, write some + data, and then have qemu-img check rebuild the refcount structures. + Before HEAD^, the reftable would have been written past the image + file end, i.e. outside of what the block device provides, which + cannot work. HEAD^ should have fixed that. + ("Something like a block device" means a loop device if we can use + one ("sudo -n losetup" works), or a FUSE block export with + growable=false otherwise.) + +Reviewed-by: Eric Blake +Signed-off-by: Hanna Reitz +Message-Id: <20220405134652.19278-3-hreitz@redhat.com> +(cherry picked from commit 9ffd6d646d1d5ee9087a8cbf0b7d2f96c5656162) + +Conflicts: +- 108: The downstream qemu-storage-daemon does not support --daemonize, + so this switch has been replaced by a loop waiting for the PID file to + appear + +Signed-off-by: Hanna Reitz +--- + tests/qemu-iotests/108 | 263 ++++++++++++++++++++++++++++++++++++- + tests/qemu-iotests/108.out | 81 ++++++++++++ + 2 files changed, 343 insertions(+), 1 deletion(-) + +diff --git a/tests/qemu-iotests/108 b/tests/qemu-iotests/108 +index 56339ab2c5..a3090e2875 100755 +--- a/tests/qemu-iotests/108 ++++ b/tests/qemu-iotests/108 +@@ -30,13 +30,20 @@ status=1 # failure is the default! + + _cleanup() + { +- _cleanup_test_img ++ _cleanup_test_img ++ if [ -f "$TEST_DIR/qsd.pid" ]; then ++ qsd_pid=$(cat "$TEST_DIR/qsd.pid") ++ kill -KILL "$qsd_pid" ++ fusermount -u "$TEST_DIR/fuse-export" &>/dev/null ++ fi ++ rm -f "$TEST_DIR/fuse-export" + } + trap "_cleanup; exit \$status" 0 1 2 3 15 + + # get standard environment, filters and checks + . ./common.rc + . ./common.filter ++. ./common.qemu + + # This tests qcow2-specific low-level functionality + _supported_fmt qcow2 +@@ -47,6 +54,22 @@ _supported_os Linux + # files + _unsupported_imgopts 'refcount_bits=\([^1]\|.\([^6]\|$\)\)' data_file + ++# This test either needs sudo -n losetup or FUSE exports to work ++if sudo -n losetup &>/dev/null; then ++ loopdev=true ++else ++ loopdev=false ++ ++ # QSD --export fuse will either yield "Parameter 'id' is missing" ++ # or "Invalid parameter 'fuse'", depending on whether there is ++ # FUSE support or not. ++ error=$($QSD --export fuse 2>&1) ++ if [[ $error = *"'fuse'"* ]]; then ++ _notrun 'Passwordless sudo for losetup or FUSE support required, but' \ ++ 'neither is available' ++ fi ++fi ++ + echo + echo '=== Repairing an image without any refcount table ===' + echo +@@ -138,6 +161,244 @@ _make_test_img 64M + poke_file "$TEST_IMG" $((0x10008)) "\xff\xff\xff\xff\xff\xff\x00\x00" + _check_test_img -r all + ++echo ++echo '=== Check rebuilt reftable location ===' ++ ++# In an earlier version of the refcount rebuild algorithm, the ++# reftable was generally placed at the image end (unless something was ++# allocated in the area covered by the refblock right before the image ++# file end, then we would try to place the reftable in that refblock). ++# This was later changed so the reftable would be placed in the ++# earliest possible location. Test this. ++ ++echo ++echo '--- Does the image size increase? ---' ++echo ++ ++# First test: Just create some image, write some data to it, and ++# resize it so there is free space at the end of the image (enough ++# that it spans at least one full refblock, which for cluster_size=512 ++# images, spans 128k). With the old algorithm, the reftable would ++# have then been placed at the end of the image file, but with the new ++# one, it will be put in that free space. ++# We want to check whether the size of the image file increases due to ++# rebuilding the refcount structures (it should not). ++ ++_make_test_img -o 'cluster_size=512' 1M ++# Write something ++$QEMU_IO -c 'write 0 64k' "$TEST_IMG" | _filter_qemu_io ++ ++# Add free space ++file_len=$(stat -c '%s' "$TEST_IMG") ++truncate -s $((file_len + 256 * 1024)) "$TEST_IMG" ++ ++# Corrupt the image by saying the image header was not allocated ++rt_offset=$(peek_file_be "$TEST_IMG" 48 8) ++rb_offset=$(peek_file_be "$TEST_IMG" $rt_offset 8) ++poke_file "$TEST_IMG" $rb_offset "\x00\x00" ++ ++# Check whether rebuilding the refcount structures increases the image ++# file size ++file_len=$(stat -c '%s' "$TEST_IMG") ++echo ++# The only leaks there can be are the old refcount structures that are ++# leaked during rebuilding, no need to clutter the output with them ++_check_test_img -r all | grep -v '^Repairing cluster.*refcount=1 reference=0' ++echo ++post_repair_file_len=$(stat -c '%s' "$TEST_IMG") ++ ++if [[ $file_len -eq $post_repair_file_len ]]; then ++ echo 'OK: Image size did not change' ++else ++ echo 'ERROR: Image size differs' \ ++ "($file_len before, $post_repair_file_len after)" ++fi ++ ++echo ++echo '--- Will the reftable occupy a hole specifically left for it? ---' ++echo ++ ++# Note: With cluster_size=512, every refblock covers 128k. ++# The reftable covers 8M per reftable cluster. ++ ++# Create an image that requires two reftable clusters (just because ++# this is more interesting than a single-clustered reftable). ++_make_test_img -o 'cluster_size=512' 9M ++$QEMU_IO -c 'write 0 8M' "$TEST_IMG" | _filter_qemu_io ++ ++# Writing 8M will have resized the reftable. Unfortunately, doing so ++# will leave holes in the file, so we need to fill them up so we can ++# be sure the whole file is allocated. Do that by writing ++# consecutively smaller chunks starting from 8 MB, until the file ++# length increases even with a chunk size of 512. Then we must have ++# filled all holes. ++ofs=$((8 * 1024 * 1024)) ++block_len=$((16 * 1024)) ++while [[ $block_len -ge 512 ]]; do ++ file_len=$(stat -c '%s' "$TEST_IMG") ++ while [[ $(stat -c '%s' "$TEST_IMG") -eq $file_len ]]; do ++ # Do not include this in the reference output, it does not ++ # really matter which qemu-io calls we do here exactly ++ $QEMU_IO -c "write $ofs $block_len" "$TEST_IMG" >/dev/null ++ ofs=$((ofs + block_len)) ++ done ++ block_len=$((block_len / 2)) ++done ++ ++# Fill up to 9M (do not include this in the reference output either, ++# $ofs is random for all we know) ++$QEMU_IO -c "write $ofs $((9 * 1024 * 1024 - ofs))" "$TEST_IMG" >/dev/null ++ ++# Make space as follows: ++# - For the first refblock: Right at the beginning of the image (this ++# refblock is placed in the first place possible), ++# - For the reftable somewhere soon afterwards, still near the ++# beginning of the image (i.e. covered by the first refblock); the ++# reftable too is placed in the first place possible, but only after ++# all refblocks have been placed) ++# No space is needed for the other refblocks, because no refblock is ++# put before the space it covers. In this test case, we do not mind ++# if they are placed at the image file's end. ++ ++# Before we make that space, we have to find out the host offset of ++# the area that belonged to the two data clusters at guest offset 4k, ++# because we expect the reftable to be placed there, and we will have ++# to verify that it is. ++ ++l1_offset=$(peek_file_be "$TEST_IMG" 40 8) ++l2_offset=$(peek_file_be "$TEST_IMG" $l1_offset 8) ++l2_offset=$((l2_offset & 0x00fffffffffffe00)) ++data_4k_offset=$(peek_file_be "$TEST_IMG" \ ++ $((l2_offset + 4096 / 512 * 8)) 8) ++data_4k_offset=$((data_4k_offset & 0x00fffffffffffe00)) ++ ++$QEMU_IO -c "discard 0 512" -c "discard 4k 1k" "$TEST_IMG" | _filter_qemu_io ++ ++# Corrupt the image by saying the image header was not allocated ++rt_offset=$(peek_file_be "$TEST_IMG" 48 8) ++rb_offset=$(peek_file_be "$TEST_IMG" $rt_offset 8) ++poke_file "$TEST_IMG" $rb_offset "\x00\x00" ++ ++echo ++# The only leaks there can be are the old refcount structures that are ++# leaked during rebuilding, no need to clutter the output with them ++_check_test_img -r all | grep -v '^Repairing cluster.*refcount=1 reference=0' ++echo ++ ++# Check whether the reftable was put where we expected ++rt_offset=$(peek_file_be "$TEST_IMG" 48 8) ++if [[ $rt_offset -eq $data_4k_offset ]]; then ++ echo 'OK: Reftable is where we expect it' ++else ++ echo "ERROR: Reftable is at $rt_offset, but was expected at $data_4k_offset" ++fi ++ ++echo ++echo '--- Rebuilding refcount structures on block devices ---' ++echo ++ ++# A block device cannot really grow, at least not during qemu-img ++# check. As mentioned in the above cases, rebuilding the refcount ++# structure may lead to new refcount structures being written after ++# the end of the image, and in the past that happened even if there ++# was more than sufficient space in the image. Such post-EOF writes ++# will not work on block devices, so test that the new algorithm ++# avoids it. ++ ++# If we have passwordless sudo and losetup, we can use those to create ++# a block device. Otherwise, we can resort to qemu's FUSE export to ++# create a file that isn't growable, which effectively tests the same ++# thing. ++ ++_cleanup_test_img ++truncate -s $((64 * 1024 * 1024)) "$TEST_IMG" ++ ++if $loopdev; then ++ export_mp=$(sudo -n losetup --show -f "$TEST_IMG") ++ export_mp_driver=host_device ++ sudo -n chmod go+rw "$export_mp" ++else ++ # Create non-growable FUSE export that is a bit like an empty ++ # block device ++ export_mp="$TEST_DIR/fuse-export" ++ export_mp_driver=file ++ touch "$export_mp" ++ ++ $QSD \ ++ --blockdev file,node-name=export-node,filename="$TEST_IMG" \ ++ --export fuse,id=fuse-export,node-name=export-node,mountpoint="$export_mp",writable=on,growable=off \ ++ --pidfile "$TEST_DIR/qsd.pid" \ ++ & ++ ++ while [ ! -f "$TEST_DIR/qsd.pid" ]; do ++ sleep 0.1 ++ done ++fi ++ ++# Now create a qcow2 image on the device -- unfortunately, qemu-img ++# create force-creates the file, so we have to resort to the ++# blockdev-create job. ++_launch_qemu \ ++ --blockdev $export_mp_driver,node-name=file,filename="$export_mp" ++ ++_send_qemu_cmd \ ++ $QEMU_HANDLE \ ++ '{ "execute": "qmp_capabilities" }' \ ++ 'return' ++ ++# Small cluster size again, so the image needs multiple refblocks ++_send_qemu_cmd \ ++ $QEMU_HANDLE \ ++ '{ "execute": "blockdev-create", ++ "arguments": { ++ "job-id": "create", ++ "options": { ++ "driver": "qcow2", ++ "file": "file", ++ "size": '$((64 * 1024 * 1024))', ++ "cluster-size": 512 ++ } } }' \ ++ '"concluded"' ++ ++_send_qemu_cmd \ ++ $QEMU_HANDLE \ ++ '{ "execute": "job-dismiss", "arguments": { "id": "create" } }' \ ++ 'return' ++ ++_send_qemu_cmd \ ++ $QEMU_HANDLE \ ++ '{ "execute": "quit" }' \ ++ 'return' ++ ++wait=y _cleanup_qemu ++echo ++ ++# Write some data ++$QEMU_IO -c 'write 0 64k' "$export_mp" | _filter_qemu_io ++ ++# Corrupt the image by saying the image header was not allocated ++rt_offset=$(peek_file_be "$export_mp" 48 8) ++rb_offset=$(peek_file_be "$export_mp" $rt_offset 8) ++poke_file "$export_mp" $rb_offset "\x00\x00" ++ ++# Repairing such a simple case should just work ++# (We used to put the reftable at the end of the image file, which can ++# never work for non-growable devices.) ++echo ++TEST_IMG="$export_mp" _check_test_img -r all \ ++ | grep -v '^Repairing cluster.*refcount=1 reference=0' ++ ++if $loopdev; then ++ sudo -n losetup -d "$export_mp" ++else ++ qsd_pid=$(cat "$TEST_DIR/qsd.pid") ++ kill -TERM "$qsd_pid" ++ # Wait for process to exit (cannot `wait` because the QSD is daemonized) ++ while [ -f "$TEST_DIR/qsd.pid" ]; do ++ true ++ done ++fi ++ + # success, all done + echo '*** done' + rm -f $seq.full +diff --git a/tests/qemu-iotests/108.out b/tests/qemu-iotests/108.out +index 75bab8dc84..b5401d788d 100644 +--- a/tests/qemu-iotests/108.out ++++ b/tests/qemu-iotests/108.out +@@ -105,6 +105,87 @@ The following inconsistencies were found and repaired: + 0 leaked clusters + 1 corruptions + ++Double checking the fixed image now... ++No errors were found on the image. ++ ++=== Check rebuilt reftable location === ++ ++--- Does the image size increase? --- ++ ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 ++wrote 65536/65536 bytes at offset 0 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++ERROR cluster 0 refcount=0 reference=1 ++Rebuilding refcount structure ++The following inconsistencies were found and repaired: ++ ++ 0 leaked clusters ++ 1 corruptions ++ ++Double checking the fixed image now... ++No errors were found on the image. ++ ++OK: Image size did not change ++ ++--- Will the reftable occupy a hole specifically left for it? --- ++ ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=9437184 ++wrote 8388608/8388608 bytes at offset 0 ++8 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++discard 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++discard 1024/1024 bytes at offset 4096 ++1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++ERROR cluster 0 refcount=0 reference=1 ++Rebuilding refcount structure ++The following inconsistencies were found and repaired: ++ ++ 0 leaked clusters ++ 1 corruptions ++ ++Double checking the fixed image now... ++No errors were found on the image. ++ ++OK: Reftable is where we expect it ++ ++--- Rebuilding refcount structures on block devices --- ++ ++{ "execute": "qmp_capabilities" } ++{"return": {}} ++{ "execute": "blockdev-create", ++ "arguments": { ++ "job-id": "create", ++ "options": { ++ "driver": "IMGFMT", ++ "file": "file", ++ "size": 67108864, ++ "cluster-size": 512 ++ } } } ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "create"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "create"}} ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "create"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "create"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "create"}} ++{ "execute": "job-dismiss", "arguments": { "id": "create" } } ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "create"}} ++{"return": {}} ++{ "execute": "quit" } ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} ++ ++wrote 65536/65536 bytes at offset 0 ++64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++ERROR cluster 0 refcount=0 reference=1 ++Rebuilding refcount structure ++The following inconsistencies were found and repaired: ++ ++ 0 leaked clusters ++ 1 corruptions ++ + Double checking the fixed image now... + No errors were found on the image. + *** done +-- +2.31.1 + diff --git a/kvm-qcow2-Add-errp-to-rebuild_refcount_structure.patch b/kvm-qcow2-Add-errp-to-rebuild_refcount_structure.patch new file mode 100644 index 0000000..9010d3d --- /dev/null +++ b/kvm-qcow2-Add-errp-to-rebuild_refcount_structure.patch @@ -0,0 +1,162 @@ +From 5e385a0e49a520550a83299632be175857b63f19 Mon Sep 17 00:00:00 2001 +From: Hanna Reitz +Date: Tue, 5 Apr 2022 15:46:52 +0200 +Subject: [PATCH 06/16] qcow2: Add errp to rebuild_refcount_structure() + +RH-Author: Hanna Reitz +RH-MergeRequest: 96: qcow2: Improve refcount structure rebuilding +RH-Commit: [3/4] 937b89a7eab6ec6b18618d59bc1526976ad03290 (hreitz/qemu-kvm-c-9-s) +RH-Bugzilla: 2072379 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +Instead of fprint()-ing error messages in rebuild_refcount_structure() +and its rebuild_refcounts_write_refblocks() helper, pass them through an +Error object to qcow2_check_refcounts() (which will then print it). + +Suggested-by: Eric Blake +Signed-off-by: Hanna Reitz +Message-Id: <20220405134652.19278-4-hreitz@redhat.com> +Reviewed-by: Eric Blake +(cherry picked from commit 0423f75351ab83b844a31349218b0eadd830e07a) +Signed-off-by: Hanna Reitz +--- + block/qcow2-refcount.c | 33 +++++++++++++++++++-------------- + 1 file changed, 19 insertions(+), 14 deletions(-) + +diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c +index c5669eaa51..ed0ecfaa89 100644 +--- a/block/qcow2-refcount.c ++++ b/block/qcow2-refcount.c +@@ -2465,7 +2465,8 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs, + static int rebuild_refcounts_write_refblocks( + BlockDriverState *bs, void **refcount_table, int64_t *nb_clusters, + int64_t first_cluster, int64_t end_cluster, +- uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr ++ uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr, ++ Error **errp + ) + { + BDRVQcow2State *s = bs->opaque; +@@ -2516,8 +2517,8 @@ static int rebuild_refcounts_write_refblocks( + nb_clusters, + &first_free_cluster); + if (refblock_offset < 0) { +- fprintf(stderr, "ERROR allocating refblock: %s\n", +- strerror(-refblock_offset)); ++ error_setg_errno(errp, -refblock_offset, ++ "ERROR allocating refblock"); + return refblock_offset; + } + +@@ -2539,6 +2540,7 @@ static int rebuild_refcounts_write_refblocks( + on_disk_reftable_entries * + REFTABLE_ENTRY_SIZE); + if (!on_disk_reftable) { ++ error_setg(errp, "ERROR allocating reftable memory"); + return -ENOMEM; + } + +@@ -2562,7 +2564,7 @@ static int rebuild_refcounts_write_refblocks( + ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset, + s->cluster_size, false); + if (ret < 0) { +- fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); ++ error_setg_errno(errp, -ret, "ERROR writing refblock"); + return ret; + } + +@@ -2578,7 +2580,7 @@ static int rebuild_refcounts_write_refblocks( + ret = bdrv_pwrite(bs->file, refblock_offset, on_disk_refblock, + s->cluster_size); + if (ret < 0) { +- fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); ++ error_setg_errno(errp, -ret, "ERROR writing refblock"); + return ret; + } + +@@ -2601,7 +2603,8 @@ static int rebuild_refcounts_write_refblocks( + static int rebuild_refcount_structure(BlockDriverState *bs, + BdrvCheckResult *res, + void **refcount_table, +- int64_t *nb_clusters) ++ int64_t *nb_clusters, ++ Error **errp) + { + BDRVQcow2State *s = bs->opaque; + int64_t reftable_offset = -1; +@@ -2652,7 +2655,7 @@ static int rebuild_refcount_structure(BlockDriverState *bs, + rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters, + 0, *nb_clusters, + &on_disk_reftable, +- &on_disk_reftable_entries); ++ &on_disk_reftable_entries, errp); + if (reftable_size_changed < 0) { + res->check_errors++; + ret = reftable_size_changed; +@@ -2676,8 +2679,8 @@ static int rebuild_refcount_structure(BlockDriverState *bs, + refcount_table, nb_clusters, + &first_free_cluster); + if (reftable_offset < 0) { +- fprintf(stderr, "ERROR allocating reftable: %s\n", +- strerror(-reftable_offset)); ++ error_setg_errno(errp, -reftable_offset, ++ "ERROR allocating reftable"); + res->check_errors++; + ret = reftable_offset; + goto fail; +@@ -2695,7 +2698,7 @@ static int rebuild_refcount_structure(BlockDriverState *bs, + reftable_start_cluster, + reftable_end_cluster, + &on_disk_reftable, +- &on_disk_reftable_entries); ++ &on_disk_reftable_entries, errp); + if (reftable_size_changed < 0) { + res->check_errors++; + ret = reftable_size_changed; +@@ -2725,7 +2728,7 @@ static int rebuild_refcount_structure(BlockDriverState *bs, + ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, reftable_length, + false); + if (ret < 0) { +- fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); ++ error_setg_errno(errp, -ret, "ERROR writing reftable"); + goto fail; + } + +@@ -2733,7 +2736,7 @@ static int rebuild_refcount_structure(BlockDriverState *bs, + ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable, + reftable_length); + if (ret < 0) { +- fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); ++ error_setg_errno(errp, -ret, "ERROR writing reftable"); + goto fail; + } + +@@ -2746,7 +2749,7 @@ static int rebuild_refcount_structure(BlockDriverState *bs, + &reftable_offset_and_clusters, + sizeof(reftable_offset_and_clusters)); + if (ret < 0) { +- fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret)); ++ error_setg_errno(errp, -ret, "ERROR setting reftable"); + goto fail; + } + +@@ -2814,11 +2817,13 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, + if (rebuild && (fix & BDRV_FIX_ERRORS)) { + BdrvCheckResult old_res = *res; + int fresh_leaks = 0; ++ Error *local_err = NULL; + + fprintf(stderr, "Rebuilding refcount structure\n"); + ret = rebuild_refcount_structure(bs, res, &refcount_table, +- &nb_clusters); ++ &nb_clusters, &local_err); + if (ret < 0) { ++ error_report_err(local_err); + goto fail; + } + +-- +2.31.1 + diff --git a/kvm-qcow2-Improve-refcount-structure-rebuilding.patch b/kvm-qcow2-Improve-refcount-structure-rebuilding.patch new file mode 100644 index 0000000..cdc92b8 --- /dev/null +++ b/kvm-qcow2-Improve-refcount-structure-rebuilding.patch @@ -0,0 +1,465 @@ +From b453cf6be8429f4438d51eb24fcf49e7d9f14db6 Mon Sep 17 00:00:00 2001 +From: Hanna Reitz +Date: Tue, 5 Apr 2022 15:46:50 +0200 +Subject: [PATCH 04/16] qcow2: Improve refcount structure rebuilding + +RH-Author: Hanna Reitz +RH-MergeRequest: 96: qcow2: Improve refcount structure rebuilding +RH-Commit: [1/4] a3606b7abcaebb4930b566e95b1090aead62dfae (hreitz/qemu-kvm-c-9-s) +RH-Bugzilla: 2072379 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Eric Blake +RH-Acked-by: Stefan Hajnoczi + +When rebuilding the refcount structures (when qemu-img check -r found +errors with refcount = 0, but reference count > 0), the new refcount +table defaults to being put at the image file end[1]. There is no good +reason for that except that it means we will not have to rewrite any +refblocks we already wrote to disk. + +Changing the code to rewrite those refblocks is not too difficult, +though, so let us do that. That is beneficial for images on block +devices, where we cannot really write beyond the end of the image file. + +Use this opportunity to add extensive comments to the code, and refactor +it a bit, getting rid of the backwards-jumping goto. + +[1] Unless there is something allocated in the area pointed to by the + last refblock, so we have to write that refblock. In that case, we + try to put the reftable in there. + +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1519071 +Closes: https://gitlab.com/qemu-project/qemu/-/issues/941 +Reviewed-by: Eric Blake +Signed-off-by: Hanna Reitz +Message-Id: <20220405134652.19278-2-hreitz@redhat.com> +(cherry picked from commit a8c07ec287554dcefd33733f0e5888a281ddc95e) +Signed-off-by: Hanna Reitz +--- + block/qcow2-refcount.c | 332 +++++++++++++++++++++++++++++------------ + 1 file changed, 235 insertions(+), 97 deletions(-) + +diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c +index b91499410c..c5669eaa51 100644 +--- a/block/qcow2-refcount.c ++++ b/block/qcow2-refcount.c +@@ -2438,111 +2438,140 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs, + } + + /* +- * Creates a new refcount structure based solely on the in-memory information +- * given through *refcount_table. All necessary allocations will be reflected +- * in that array. ++ * Helper function for rebuild_refcount_structure(). + * +- * On success, the old refcount structure is leaked (it will be covered by the +- * new refcount structure). ++ * Scan the range of clusters [first_cluster, end_cluster) for allocated ++ * clusters and write all corresponding refblocks to disk. The refblock ++ * and allocation data is taken from the in-memory refcount table ++ * *refcount_table[] (of size *nb_clusters), which is basically one big ++ * (unlimited size) refblock for the whole image. ++ * ++ * For these refblocks, clusters are allocated using said in-memory ++ * refcount table. Care is taken that these allocations are reflected ++ * in the refblocks written to disk. ++ * ++ * The refblocks' offsets are written into a reftable, which is ++ * *on_disk_reftable_ptr[] (of size *on_disk_reftable_entries_ptr). If ++ * that reftable is of insufficient size, it will be resized to fit. ++ * This reftable is not written to disk. ++ * ++ * (If *on_disk_reftable_ptr is not NULL, the entries within are assumed ++ * to point to existing valid refblocks that do not need to be allocated ++ * again.) ++ * ++ * Return whether the on-disk reftable array was resized (true/false), ++ * or -errno on error. + */ +-static int rebuild_refcount_structure(BlockDriverState *bs, +- BdrvCheckResult *res, +- void **refcount_table, +- int64_t *nb_clusters) ++static int rebuild_refcounts_write_refblocks( ++ BlockDriverState *bs, void **refcount_table, int64_t *nb_clusters, ++ int64_t first_cluster, int64_t end_cluster, ++ uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr ++ ) + { + BDRVQcow2State *s = bs->opaque; +- int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0; ++ int64_t cluster; + int64_t refblock_offset, refblock_start, refblock_index; +- uint32_t reftable_size = 0; +- uint64_t *on_disk_reftable = NULL; ++ int64_t first_free_cluster = 0; ++ uint64_t *on_disk_reftable = *on_disk_reftable_ptr; ++ uint32_t on_disk_reftable_entries = *on_disk_reftable_entries_ptr; + void *on_disk_refblock; +- int ret = 0; +- struct { +- uint64_t reftable_offset; +- uint32_t reftable_clusters; +- } QEMU_PACKED reftable_offset_and_clusters; +- +- qcow2_cache_empty(bs, s->refcount_block_cache); ++ bool reftable_grown = false; ++ int ret; + +-write_refblocks: +- for (; cluster < *nb_clusters; cluster++) { ++ for (cluster = first_cluster; cluster < end_cluster; cluster++) { ++ /* Check all clusters to find refblocks that contain non-zero entries */ + if (!s->get_refcount(*refcount_table, cluster)) { + continue; + } + ++ /* ++ * This cluster is allocated, so we need to create a refblock ++ * for it. The data we will write to disk is just the ++ * respective slice from *refcount_table, so it will contain ++ * accurate refcounts for all clusters belonging to this ++ * refblock. After we have written it, we will therefore skip ++ * all remaining clusters in this refblock. ++ */ ++ + refblock_index = cluster >> s->refcount_block_bits; + refblock_start = refblock_index << s->refcount_block_bits; + +- /* Don't allocate a cluster in a refblock already written to disk */ +- if (first_free_cluster < refblock_start) { +- first_free_cluster = refblock_start; +- } +- refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table, +- nb_clusters, &first_free_cluster); +- if (refblock_offset < 0) { +- fprintf(stderr, "ERROR allocating refblock: %s\n", +- strerror(-refblock_offset)); +- res->check_errors++; +- ret = refblock_offset; +- goto fail; +- } ++ if (on_disk_reftable_entries > refblock_index && ++ on_disk_reftable[refblock_index]) ++ { ++ /* ++ * We can get here after a `goto write_refblocks`: We have a ++ * reftable from a previous run, and the refblock is already ++ * allocated. No need to allocate it again. ++ */ ++ refblock_offset = on_disk_reftable[refblock_index]; ++ } else { ++ int64_t refblock_cluster_index; + +- if (reftable_size <= refblock_index) { +- uint32_t old_reftable_size = reftable_size; +- uint64_t *new_on_disk_reftable; ++ /* Don't allocate a cluster in a refblock already written to disk */ ++ if (first_free_cluster < refblock_start) { ++ first_free_cluster = refblock_start; ++ } ++ refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table, ++ nb_clusters, ++ &first_free_cluster); ++ if (refblock_offset < 0) { ++ fprintf(stderr, "ERROR allocating refblock: %s\n", ++ strerror(-refblock_offset)); ++ return refblock_offset; ++ } + +- reftable_size = ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE, +- s->cluster_size) / REFTABLE_ENTRY_SIZE; +- new_on_disk_reftable = g_try_realloc(on_disk_reftable, +- reftable_size * +- REFTABLE_ENTRY_SIZE); +- if (!new_on_disk_reftable) { +- res->check_errors++; +- ret = -ENOMEM; +- goto fail; ++ refblock_cluster_index = refblock_offset / s->cluster_size; ++ if (refblock_cluster_index >= end_cluster) { ++ /* ++ * We must write the refblock that holds this refblock's ++ * refcount ++ */ ++ end_cluster = refblock_cluster_index + 1; + } +- on_disk_reftable = new_on_disk_reftable; + +- memset(on_disk_reftable + old_reftable_size, 0, +- (reftable_size - old_reftable_size) * REFTABLE_ENTRY_SIZE); ++ if (on_disk_reftable_entries <= refblock_index) { ++ on_disk_reftable_entries = ++ ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE, ++ s->cluster_size) / REFTABLE_ENTRY_SIZE; ++ on_disk_reftable = ++ g_try_realloc(on_disk_reftable, ++ on_disk_reftable_entries * ++ REFTABLE_ENTRY_SIZE); ++ if (!on_disk_reftable) { ++ return -ENOMEM; ++ } + +- /* The offset we have for the reftable is now no longer valid; +- * this will leak that range, but we can easily fix that by running +- * a leak-fixing check after this rebuild operation */ +- reftable_offset = -1; +- } else { +- assert(on_disk_reftable); +- } +- on_disk_reftable[refblock_index] = refblock_offset; ++ memset(on_disk_reftable + *on_disk_reftable_entries_ptr, 0, ++ (on_disk_reftable_entries - ++ *on_disk_reftable_entries_ptr) * ++ REFTABLE_ENTRY_SIZE); + +- /* If this is apparently the last refblock (for now), try to squeeze the +- * reftable in */ +- if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits && +- reftable_offset < 0) +- { +- uint64_t reftable_clusters = size_to_clusters(s, reftable_size * +- REFTABLE_ENTRY_SIZE); +- reftable_offset = alloc_clusters_imrt(bs, reftable_clusters, +- refcount_table, nb_clusters, +- &first_free_cluster); +- if (reftable_offset < 0) { +- fprintf(stderr, "ERROR allocating reftable: %s\n", +- strerror(-reftable_offset)); +- res->check_errors++; +- ret = reftable_offset; +- goto fail; ++ *on_disk_reftable_ptr = on_disk_reftable; ++ *on_disk_reftable_entries_ptr = on_disk_reftable_entries; ++ ++ reftable_grown = true; ++ } else { ++ assert(on_disk_reftable); + } ++ on_disk_reftable[refblock_index] = refblock_offset; + } + ++ /* Refblock is allocated, write it to disk */ ++ + ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset, + s->cluster_size, false); + if (ret < 0) { + fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); +- goto fail; ++ return ret; + } + +- /* The size of *refcount_table is always cluster-aligned, therefore the +- * write operation will not overflow */ ++ /* ++ * The refblock is simply a slice of *refcount_table. ++ * Note that the size of *refcount_table is always aligned to ++ * whole clusters, so the write operation will not result in ++ * out-of-bounds accesses. ++ */ + on_disk_refblock = (void *)((char *) *refcount_table + + refblock_index * s->cluster_size); + +@@ -2550,23 +2579,99 @@ write_refblocks: + s->cluster_size); + if (ret < 0) { + fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); +- goto fail; ++ return ret; + } + +- /* Go to the end of this refblock */ ++ /* This refblock is done, skip to its end */ + cluster = refblock_start + s->refcount_block_size - 1; + } + +- if (reftable_offset < 0) { +- uint64_t post_refblock_start, reftable_clusters; ++ return reftable_grown; ++} ++ ++/* ++ * Creates a new refcount structure based solely on the in-memory information ++ * given through *refcount_table (this in-memory information is basically just ++ * the concatenation of all refblocks). All necessary allocations will be ++ * reflected in that array. ++ * ++ * On success, the old refcount structure is leaked (it will be covered by the ++ * new refcount structure). ++ */ ++static int rebuild_refcount_structure(BlockDriverState *bs, ++ BdrvCheckResult *res, ++ void **refcount_table, ++ int64_t *nb_clusters) ++{ ++ BDRVQcow2State *s = bs->opaque; ++ int64_t reftable_offset = -1; ++ int64_t reftable_length = 0; ++ int64_t reftable_clusters; ++ int64_t refblock_index; ++ uint32_t on_disk_reftable_entries = 0; ++ uint64_t *on_disk_reftable = NULL; ++ int ret = 0; ++ int reftable_size_changed = 0; ++ struct { ++ uint64_t reftable_offset; ++ uint32_t reftable_clusters; ++ } QEMU_PACKED reftable_offset_and_clusters; ++ ++ qcow2_cache_empty(bs, s->refcount_block_cache); ++ ++ /* ++ * For each refblock containing entries, we try to allocate a ++ * cluster (in the in-memory refcount table) and write its offset ++ * into on_disk_reftable[]. We then write the whole refblock to ++ * disk (as a slice of the in-memory refcount table). ++ * This is done by rebuild_refcounts_write_refblocks(). ++ * ++ * Once we have scanned all clusters, we try to find space for the ++ * reftable. This will dirty the in-memory refcount table (i.e. ++ * make it differ from the refblocks we have already written), so we ++ * need to run rebuild_refcounts_write_refblocks() again for the ++ * range of clusters where the reftable has been allocated. ++ * ++ * This second run might make the reftable grow again, in which case ++ * we will need to allocate another space for it, which is why we ++ * repeat all this until the reftable stops growing. ++ * ++ * (This loop will terminate, because with every cluster the ++ * reftable grows, it can accomodate a multitude of more refcounts, ++ * so that at some point this must be able to cover the reftable ++ * and all refblocks describing it.) ++ * ++ * We then convert the reftable to big-endian and write it to disk. ++ * ++ * Note that we never free any reftable allocations. Doing so would ++ * needlessly complicate the algorithm: The eventual second check ++ * run we do will clean up all leaks we have caused. ++ */ ++ ++ reftable_size_changed = ++ rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters, ++ 0, *nb_clusters, ++ &on_disk_reftable, ++ &on_disk_reftable_entries); ++ if (reftable_size_changed < 0) { ++ res->check_errors++; ++ ret = reftable_size_changed; ++ goto fail; ++ } ++ ++ /* ++ * There was no reftable before, so rebuild_refcounts_write_refblocks() ++ * must have increased its size (from 0 to something). ++ */ ++ assert(reftable_size_changed); ++ ++ do { ++ int64_t reftable_start_cluster, reftable_end_cluster; ++ int64_t first_free_cluster = 0; ++ ++ reftable_length = on_disk_reftable_entries * REFTABLE_ENTRY_SIZE; ++ reftable_clusters = size_to_clusters(s, reftable_length); + +- post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size); +- reftable_clusters = +- size_to_clusters(s, reftable_size * REFTABLE_ENTRY_SIZE); +- /* Not pretty but simple */ +- if (first_free_cluster < post_refblock_start) { +- first_free_cluster = post_refblock_start; +- } + reftable_offset = alloc_clusters_imrt(bs, reftable_clusters, + refcount_table, nb_clusters, + &first_free_cluster); +@@ -2578,24 +2683,55 @@ write_refblocks: + goto fail; + } + +- goto write_refblocks; +- } ++ /* ++ * We need to update the affected refblocks, so re-run the ++ * write_refblocks loop for the reftable's range of clusters. ++ */ ++ assert(offset_into_cluster(s, reftable_offset) == 0); ++ reftable_start_cluster = reftable_offset / s->cluster_size; ++ reftable_end_cluster = reftable_start_cluster + reftable_clusters; ++ reftable_size_changed = ++ rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters, ++ reftable_start_cluster, ++ reftable_end_cluster, ++ &on_disk_reftable, ++ &on_disk_reftable_entries); ++ if (reftable_size_changed < 0) { ++ res->check_errors++; ++ ret = reftable_size_changed; ++ goto fail; ++ } ++ ++ /* ++ * If the reftable size has changed, we will need to find a new ++ * allocation, repeating the loop. ++ */ ++ } while (reftable_size_changed); + +- for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) { ++ /* The above loop must have run at least once */ ++ assert(reftable_offset >= 0); ++ ++ /* ++ * All allocations are done, all refblocks are written, convert the ++ * reftable to big-endian and write it to disk. ++ */ ++ ++ for (refblock_index = 0; refblock_index < on_disk_reftable_entries; ++ refblock_index++) ++ { + cpu_to_be64s(&on_disk_reftable[refblock_index]); + } + +- ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, +- reftable_size * REFTABLE_ENTRY_SIZE, ++ ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, reftable_length, + false); + if (ret < 0) { + fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); + goto fail; + } + +- assert(reftable_size < INT_MAX / REFTABLE_ENTRY_SIZE); ++ assert(reftable_length < INT_MAX); + ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable, +- reftable_size * REFTABLE_ENTRY_SIZE); ++ reftable_length); + if (ret < 0) { + fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); + goto fail; +@@ -2604,7 +2740,7 @@ write_refblocks: + /* Enter new reftable into the image header */ + reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset); + reftable_offset_and_clusters.reftable_clusters = +- cpu_to_be32(size_to_clusters(s, reftable_size * REFTABLE_ENTRY_SIZE)); ++ cpu_to_be32(reftable_clusters); + ret = bdrv_pwrite_sync(bs->file, + offsetof(QCowHeader, refcount_table_offset), + &reftable_offset_and_clusters, +@@ -2614,12 +2750,14 @@ write_refblocks: + goto fail; + } + +- for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) { ++ for (refblock_index = 0; refblock_index < on_disk_reftable_entries; ++ refblock_index++) ++ { + be64_to_cpus(&on_disk_reftable[refblock_index]); + } + s->refcount_table = on_disk_reftable; + s->refcount_table_offset = reftable_offset; +- s->refcount_table_size = reftable_size; ++ s->refcount_table_size = on_disk_reftable_entries; + update_max_refcount_table_index(s); + + return 0; +-- +2.31.1 + diff --git a/kvm-util-event-loop-base-Introduce-options-to-set-the-th.patch b/kvm-util-event-loop-base-Introduce-options-to-set-the-th.patch new file mode 100644 index 0000000..77929a6 --- /dev/null +++ b/kvm-util-event-loop-base-Introduce-options-to-set-the-th.patch @@ -0,0 +1,385 @@ +From 7a6fa42d4a4263c94b9bf18290f9e7680ea9e7f4 Mon Sep 17 00:00:00 2001 +From: Nicolas Saenz Julienne +Date: Mon, 25 Apr 2022 09:57:23 +0200 +Subject: [PATCH 03/16] util/event-loop-base: Introduce options to set the + thread pool size + +RH-Author: Nicolas Saenz Julienne +RH-MergeRequest: 93: util/thread-pool: Expose minimum and maximum size +RH-Commit: [3/3] af78a88ff3c69701cbb5f9e980c3d6ebbd13ff98 +RH-Bugzilla: 2031024 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi + +The thread pool regulates itself: when idle, it kills threads until +empty, when in demand, it creates new threads until full. This behaviour +doesn't play well with latency sensitive workloads where the price of +creating a new thread is too high. For example, when paired with qemu's +'-mlock', or using safety features like SafeStack, creating a new thread +has been measured take multiple milliseconds. + +In order to mitigate this let's introduce a new 'EventLoopBase' +property to set the thread pool size. The threads will be created during +the pool's initialization or upon updating the property's value, remain +available during its lifetime regardless of demand, and destroyed upon +freeing it. A properly characterized workload will then be able to +configure the pool to avoid any latency spikes. + +Signed-off-by: Nicolas Saenz Julienne +Reviewed-by: Stefan Hajnoczi +Acked-by: Markus Armbruster +Message-id: 20220425075723.20019-4-nsaenzju@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 71ad4713cc1d7fca24388b828ef31ae6cb38a31c) +--- + event-loop-base.c | 23 +++++++++++++ + include/block/aio.h | 10 ++++++ + include/block/thread-pool.h | 3 ++ + include/sysemu/event-loop-base.h | 4 +++ + iothread.c | 3 ++ + qapi/qom.json | 10 +++++- + util/aio-posix.c | 1 + + util/async.c | 20 ++++++++++++ + util/main-loop.c | 9 ++++++ + util/thread-pool.c | 55 +++++++++++++++++++++++++++++--- + 10 files changed, 133 insertions(+), 5 deletions(-) + +diff --git a/event-loop-base.c b/event-loop-base.c +index e7f99a6ec8..d5be4dc6fc 100644 +--- a/event-loop-base.c ++++ b/event-loop-base.c +@@ -14,6 +14,7 @@ + #include "qemu/osdep.h" + #include "qom/object_interfaces.h" + #include "qapi/error.h" ++#include "block/thread-pool.h" + #include "sysemu/event-loop-base.h" + + typedef struct { +@@ -21,9 +22,22 @@ typedef struct { + ptrdiff_t offset; /* field's byte offset in EventLoopBase struct */ + } EventLoopBaseParamInfo; + ++static void event_loop_base_instance_init(Object *obj) ++{ ++ EventLoopBase *base = EVENT_LOOP_BASE(obj); ++ ++ base->thread_pool_max = THREAD_POOL_MAX_THREADS_DEFAULT; ++} ++ + static EventLoopBaseParamInfo aio_max_batch_info = { + "aio-max-batch", offsetof(EventLoopBase, aio_max_batch), + }; ++static EventLoopBaseParamInfo thread_pool_min_info = { ++ "thread-pool-min", offsetof(EventLoopBase, thread_pool_min), ++}; ++static EventLoopBaseParamInfo thread_pool_max_info = { ++ "thread-pool-max", offsetof(EventLoopBase, thread_pool_max), ++}; + + static void event_loop_base_get_param(Object *obj, Visitor *v, + const char *name, void *opaque, Error **errp) +@@ -95,12 +109,21 @@ static void event_loop_base_class_init(ObjectClass *klass, void *class_data) + event_loop_base_get_param, + event_loop_base_set_param, + NULL, &aio_max_batch_info); ++ object_class_property_add(klass, "thread-pool-min", "int", ++ event_loop_base_get_param, ++ event_loop_base_set_param, ++ NULL, &thread_pool_min_info); ++ object_class_property_add(klass, "thread-pool-max", "int", ++ event_loop_base_get_param, ++ event_loop_base_set_param, ++ NULL, &thread_pool_max_info); + } + + static const TypeInfo event_loop_base_info = { + .name = TYPE_EVENT_LOOP_BASE, + .parent = TYPE_OBJECT, + .instance_size = sizeof(EventLoopBase), ++ .instance_init = event_loop_base_instance_init, + .class_size = sizeof(EventLoopBaseClass), + .class_init = event_loop_base_class_init, + .abstract = true, +diff --git a/include/block/aio.h b/include/block/aio.h +index 5634173b12..d128558f1d 100644 +--- a/include/block/aio.h ++++ b/include/block/aio.h +@@ -192,6 +192,8 @@ struct AioContext { + QSLIST_HEAD(, Coroutine) scheduled_coroutines; + QEMUBH *co_schedule_bh; + ++ int thread_pool_min; ++ int thread_pool_max; + /* Thread pool for performing work and receiving completion callbacks. + * Has its own locking. + */ +@@ -769,4 +771,12 @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns, + void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch, + Error **errp); + ++/** ++ * aio_context_set_thread_pool_params: ++ * @ctx: the aio context ++ * @min: min number of threads to have readily available in the thread pool ++ * @min: max number of threads the thread pool can contain ++ */ ++void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min, ++ int64_t max, Error **errp); + #endif +diff --git a/include/block/thread-pool.h b/include/block/thread-pool.h +index 7dd7d730a0..2020bcc92d 100644 +--- a/include/block/thread-pool.h ++++ b/include/block/thread-pool.h +@@ -20,6 +20,8 @@ + + #include "block/block.h" + ++#define THREAD_POOL_MAX_THREADS_DEFAULT 64 ++ + typedef int ThreadPoolFunc(void *opaque); + + typedef struct ThreadPool ThreadPool; +@@ -33,5 +35,6 @@ BlockAIOCB *thread_pool_submit_aio(ThreadPool *pool, + int coroutine_fn thread_pool_submit_co(ThreadPool *pool, + ThreadPoolFunc *func, void *arg); + void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg); ++void thread_pool_update_params(ThreadPool *pool, struct AioContext *ctx); + + #endif +diff --git a/include/sysemu/event-loop-base.h b/include/sysemu/event-loop-base.h +index fced4c9fea..2748bf6ae1 100644 +--- a/include/sysemu/event-loop-base.h ++++ b/include/sysemu/event-loop-base.h +@@ -33,5 +33,9 @@ struct EventLoopBase { + + /* AioContext AIO engine parameters */ + int64_t aio_max_batch; ++ ++ /* AioContext thread pool parameters */ ++ int64_t thread_pool_min; ++ int64_t thread_pool_max; + }; + #endif +diff --git a/iothread.c b/iothread.c +index 8fa2f3bfb8..529194a566 100644 +--- a/iothread.c ++++ b/iothread.c +@@ -174,6 +174,9 @@ static void iothread_set_aio_context_params(EventLoopBase *base, Error **errp) + aio_context_set_aio_params(iothread->ctx, + iothread->parent_obj.aio_max_batch, + errp); ++ ++ aio_context_set_thread_pool_params(iothread->ctx, base->thread_pool_min, ++ base->thread_pool_max, errp); + } + + +diff --git a/qapi/qom.json b/qapi/qom.json +index 7d4a2ac1b9..6a653c6636 100644 +--- a/qapi/qom.json ++++ b/qapi/qom.json +@@ -508,10 +508,18 @@ + # 0 means that the engine will use its default. + # (default: 0) + # ++# @thread-pool-min: minimum number of threads reserved in the thread pool ++# (default:0) ++# ++# @thread-pool-max: maximum number of threads the thread pool can contain ++# (default:64) ++# + # Since: 7.1 + ## + { 'struct': 'EventLoopBaseProperties', +- 'data': { '*aio-max-batch': 'int' } } ++ 'data': { '*aio-max-batch': 'int', ++ '*thread-pool-min': 'int', ++ '*thread-pool-max': 'int' } } + + ## + # @IothreadProperties: +diff --git a/util/aio-posix.c b/util/aio-posix.c +index be0182a3c6..731f3826c0 100644 +--- a/util/aio-posix.c ++++ b/util/aio-posix.c +@@ -15,6 +15,7 @@ + + #include "qemu/osdep.h" + #include "block/block.h" ++#include "block/thread-pool.h" + #include "qemu/main-loop.h" + #include "qemu/rcu.h" + #include "qemu/rcu_queue.h" +diff --git a/util/async.c b/util/async.c +index 2ea1172f3e..554ba70cca 100644 +--- a/util/async.c ++++ b/util/async.c +@@ -563,6 +563,9 @@ AioContext *aio_context_new(Error **errp) + + ctx->aio_max_batch = 0; + ++ ctx->thread_pool_min = 0; ++ ctx->thread_pool_max = THREAD_POOL_MAX_THREADS_DEFAULT; ++ + return ctx; + fail: + g_source_destroy(&ctx->source); +@@ -696,3 +699,20 @@ void qemu_set_current_aio_context(AioContext *ctx) + assert(!get_my_aiocontext()); + set_my_aiocontext(ctx); + } ++ ++void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min, ++ int64_t max, Error **errp) ++{ ++ ++ if (min > max || !max || min > INT_MAX || max > INT_MAX) { ++ error_setg(errp, "bad thread-pool-min/thread-pool-max values"); ++ return; ++ } ++ ++ ctx->thread_pool_min = min; ++ ctx->thread_pool_max = max; ++ ++ if (ctx->thread_pool) { ++ thread_pool_update_params(ctx->thread_pool, ctx); ++ } ++} +diff --git a/util/main-loop.c b/util/main-loop.c +index 5b13f456fa..a0f48186ab 100644 +--- a/util/main-loop.c ++++ b/util/main-loop.c +@@ -30,6 +30,7 @@ + #include "sysemu/replay.h" + #include "qemu/main-loop.h" + #include "block/aio.h" ++#include "block/thread-pool.h" + #include "qemu/error-report.h" + #include "qemu/queue.h" + #include "qemu/compiler.h" +@@ -187,12 +188,20 @@ int qemu_init_main_loop(Error **errp) + + static void main_loop_update_params(EventLoopBase *base, Error **errp) + { ++ ERRP_GUARD(); ++ + if (!qemu_aio_context) { + error_setg(errp, "qemu aio context not ready"); + return; + } + + aio_context_set_aio_params(qemu_aio_context, base->aio_max_batch, errp); ++ if (*errp) { ++ return; ++ } ++ ++ aio_context_set_thread_pool_params(qemu_aio_context, base->thread_pool_min, ++ base->thread_pool_max, errp); + } + + MainLoop *mloop; +diff --git a/util/thread-pool.c b/util/thread-pool.c +index d763cea505..196835b4d3 100644 +--- a/util/thread-pool.c ++++ b/util/thread-pool.c +@@ -58,7 +58,6 @@ struct ThreadPool { + QemuMutex lock; + QemuCond worker_stopped; + QemuSemaphore sem; +- int max_threads; + QEMUBH *new_thread_bh; + + /* The following variables are only accessed from one AioContext. */ +@@ -71,8 +70,27 @@ struct ThreadPool { + int new_threads; /* backlog of threads we need to create */ + int pending_threads; /* threads created but not running yet */ + bool stopping; ++ int min_threads; ++ int max_threads; + }; + ++static inline bool back_to_sleep(ThreadPool *pool, int ret) ++{ ++ /* ++ * The semaphore timed out, we should exit the loop except when: ++ * - There is work to do, we raced with the signal. ++ * - The max threads threshold just changed, we raced with the signal. ++ * - The thread pool forces a minimum number of readily available threads. ++ */ ++ if (ret == -1 && (!QTAILQ_EMPTY(&pool->request_list) || ++ pool->cur_threads > pool->max_threads || ++ pool->cur_threads <= pool->min_threads)) { ++ return true; ++ } ++ ++ return false; ++} ++ + static void *worker_thread(void *opaque) + { + ThreadPool *pool = opaque; +@@ -91,8 +109,9 @@ static void *worker_thread(void *opaque) + ret = qemu_sem_timedwait(&pool->sem, 10000); + qemu_mutex_lock(&pool->lock); + pool->idle_threads--; +- } while (ret == -1 && !QTAILQ_EMPTY(&pool->request_list)); +- if (ret == -1 || pool->stopping) { ++ } while (back_to_sleep(pool, ret)); ++ if (ret == -1 || pool->stopping || ++ pool->cur_threads > pool->max_threads) { + break; + } + +@@ -294,6 +313,33 @@ void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg) + thread_pool_submit_aio(pool, func, arg, NULL, NULL); + } + ++void thread_pool_update_params(ThreadPool *pool, AioContext *ctx) ++{ ++ qemu_mutex_lock(&pool->lock); ++ ++ pool->min_threads = ctx->thread_pool_min; ++ pool->max_threads = ctx->thread_pool_max; ++ ++ /* ++ * We either have to: ++ * - Increase the number available of threads until over the min_threads ++ * threshold. ++ * - Decrease the number of available threads until under the max_threads ++ * threshold. ++ * - Do nothing. The current number of threads fall in between the min and ++ * max thresholds. We'll let the pool manage itself. ++ */ ++ for (int i = pool->cur_threads; i < pool->min_threads; i++) { ++ spawn_thread(pool); ++ } ++ ++ for (int i = pool->cur_threads; i > pool->max_threads; i--) { ++ qemu_sem_post(&pool->sem); ++ } ++ ++ qemu_mutex_unlock(&pool->lock); ++} ++ + static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) + { + if (!ctx) { +@@ -306,11 +352,12 @@ static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) + qemu_mutex_init(&pool->lock); + qemu_cond_init(&pool->worker_stopped); + qemu_sem_init(&pool->sem, 0); +- pool->max_threads = 64; + pool->new_thread_bh = aio_bh_new(ctx, spawn_thread_bh_fn, pool); + + QLIST_INIT(&pool->head); + QTAILQ_INIT(&pool->request_list); ++ ++ thread_pool_update_params(pool, ctx); + } + + ThreadPool *thread_pool_new(AioContext *ctx) +-- +2.31.1 + diff --git a/kvm-util-main-loop-Introduce-the-main-loop-into-QOM.patch b/kvm-util-main-loop-Introduce-the-main-loop-into-QOM.patch new file mode 100644 index 0000000..2104424 --- /dev/null +++ b/kvm-util-main-loop-Introduce-the-main-loop-into-QOM.patch @@ -0,0 +1,233 @@ +From b4969662de01848f887a3918e97e516efc213f71 Mon Sep 17 00:00:00 2001 +From: Nicolas Saenz Julienne +Date: Mon, 25 Apr 2022 09:57:22 +0200 +Subject: [PATCH 02/16] util/main-loop: Introduce the main loop into QOM + +RH-Author: Nicolas Saenz Julienne +RH-MergeRequest: 93: util/thread-pool: Expose minimum and maximum size +RH-Commit: [2/3] a481b77e25ad50d13dcbe26b36c551b18c89bddd +RH-Bugzilla: 2031024 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Stefan Hajnoczi + +'event-loop-base' provides basic property handling for all 'AioContext' +based event loops. So let's define a new 'MainLoopClass' that inherits +from it. This will permit tweaking the main loop's properties through +qapi as well as through the command line using the '-object' keyword[1]. +Only one instance of 'MainLoopClass' might be created at any time. + +'EventLoopBaseClass' learns a new callback, 'can_be_deleted()' so as to +mark 'MainLoop' as non-deletable. + +[1] For example: + -object main-loop,id=main-loop,aio-max-batch= + +Signed-off-by: Nicolas Saenz Julienne +Reviewed-by: Stefan Hajnoczi +Acked-by: Markus Armbruster +Message-id: 20220425075723.20019-3-nsaenzju@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 70ac26b9e5ca8374bb3ef3f30b871726673c9f27) +--- + event-loop-base.c | 13 ++++++++ + include/qemu/main-loop.h | 10 ++++++ + include/sysemu/event-loop-base.h | 1 + + meson.build | 3 +- + qapi/qom.json | 13 ++++++++ + util/main-loop.c | 56 ++++++++++++++++++++++++++++++++ + 6 files changed, 95 insertions(+), 1 deletion(-) + +diff --git a/event-loop-base.c b/event-loop-base.c +index a924c73a7c..e7f99a6ec8 100644 +--- a/event-loop-base.c ++++ b/event-loop-base.c +@@ -73,10 +73,23 @@ static void event_loop_base_complete(UserCreatable *uc, Error **errp) + } + } + ++static bool event_loop_base_can_be_deleted(UserCreatable *uc) ++{ ++ EventLoopBaseClass *bc = EVENT_LOOP_BASE_GET_CLASS(uc); ++ EventLoopBase *backend = EVENT_LOOP_BASE(uc); ++ ++ if (bc->can_be_deleted) { ++ return bc->can_be_deleted(backend); ++ } ++ ++ return true; ++} ++ + static void event_loop_base_class_init(ObjectClass *klass, void *class_data) + { + UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass); + ucc->complete = event_loop_base_complete; ++ ucc->can_be_deleted = event_loop_base_can_be_deleted; + + object_class_property_add(klass, "aio-max-batch", "int", + event_loop_base_get_param, +diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h +index d3750c8e76..20c9387654 100644 +--- a/include/qemu/main-loop.h ++++ b/include/qemu/main-loop.h +@@ -26,9 +26,19 @@ + #define QEMU_MAIN_LOOP_H + + #include "block/aio.h" ++#include "qom/object.h" ++#include "sysemu/event-loop-base.h" + + #define SIG_IPI SIGUSR1 + ++#define TYPE_MAIN_LOOP "main-loop" ++OBJECT_DECLARE_TYPE(MainLoop, MainLoopClass, MAIN_LOOP) ++ ++struct MainLoop { ++ EventLoopBase parent_obj; ++}; ++typedef struct MainLoop MainLoop; ++ + /** + * qemu_init_main_loop: Set up the process so that it can run the main loop. + * +diff --git a/include/sysemu/event-loop-base.h b/include/sysemu/event-loop-base.h +index 8e77d8b69f..fced4c9fea 100644 +--- a/include/sysemu/event-loop-base.h ++++ b/include/sysemu/event-loop-base.h +@@ -25,6 +25,7 @@ struct EventLoopBaseClass { + + void (*init)(EventLoopBase *base, Error **errp); + void (*update_params)(EventLoopBase *base, Error **errp); ++ bool (*can_be_deleted)(EventLoopBase *base); + }; + + struct EventLoopBase { +diff --git a/meson.build b/meson.build +index b9c919a55e..5a7c10e639 100644 +--- a/meson.build ++++ b/meson.build +@@ -2832,7 +2832,8 @@ libqemuutil = static_library('qemuutil', + sources: util_ss.sources() + stub_ss.sources() + genh, + dependencies: [util_ss.dependencies(), libm, threads, glib, socket, malloc, pixman]) + qemuutil = declare_dependency(link_with: libqemuutil, +- sources: genh + version_res) ++ sources: genh + version_res, ++ dependencies: [event_loop_base]) + + if have_system or have_user + decodetree = generator(find_program('scripts/decodetree.py'), +diff --git a/qapi/qom.json b/qapi/qom.json +index a2439533c5..7d4a2ac1b9 100644 +--- a/qapi/qom.json ++++ b/qapi/qom.json +@@ -540,6 +540,17 @@ + '*poll-grow': 'int', + '*poll-shrink': 'int' } } + ++## ++# @MainLoopProperties: ++# ++# Properties for the main-loop object. ++# ++# Since: 7.1 ++## ++{ 'struct': 'MainLoopProperties', ++ 'base': 'EventLoopBaseProperties', ++ 'data': {} } ++ + ## + # @MemoryBackendProperties: + # +@@ -830,6 +841,7 @@ + { 'name': 'input-linux', + 'if': 'CONFIG_LINUX' }, + 'iothread', ++ 'main-loop', + { 'name': 'memory-backend-epc', + 'if': 'CONFIG_LINUX' }, + 'memory-backend-file', +@@ -895,6 +907,7 @@ + 'input-linux': { 'type': 'InputLinuxProperties', + 'if': 'CONFIG_LINUX' }, + 'iothread': 'IothreadProperties', ++ 'main-loop': 'MainLoopProperties', + 'memory-backend-epc': { 'type': 'MemoryBackendEpcProperties', + 'if': 'CONFIG_LINUX' }, + 'memory-backend-file': 'MemoryBackendFileProperties', +diff --git a/util/main-loop.c b/util/main-loop.c +index b7b0ce4ca0..5b13f456fa 100644 +--- a/util/main-loop.c ++++ b/util/main-loop.c +@@ -33,6 +33,7 @@ + #include "qemu/error-report.h" + #include "qemu/queue.h" + #include "qemu/compiler.h" ++#include "qom/object.h" + + #ifndef _WIN32 + #include +@@ -184,6 +185,61 @@ int qemu_init_main_loop(Error **errp) + return 0; + } + ++static void main_loop_update_params(EventLoopBase *base, Error **errp) ++{ ++ if (!qemu_aio_context) { ++ error_setg(errp, "qemu aio context not ready"); ++ return; ++ } ++ ++ aio_context_set_aio_params(qemu_aio_context, base->aio_max_batch, errp); ++} ++ ++MainLoop *mloop; ++ ++static void main_loop_init(EventLoopBase *base, Error **errp) ++{ ++ MainLoop *m = MAIN_LOOP(base); ++ ++ if (mloop) { ++ error_setg(errp, "only one main-loop instance allowed"); ++ return; ++ } ++ ++ main_loop_update_params(base, errp); ++ ++ mloop = m; ++ return; ++} ++ ++static bool main_loop_can_be_deleted(EventLoopBase *base) ++{ ++ return false; ++} ++ ++static void main_loop_class_init(ObjectClass *oc, void *class_data) ++{ ++ EventLoopBaseClass *bc = EVENT_LOOP_BASE_CLASS(oc); ++ ++ bc->init = main_loop_init; ++ bc->update_params = main_loop_update_params; ++ bc->can_be_deleted = main_loop_can_be_deleted; ++} ++ ++static const TypeInfo main_loop_info = { ++ .name = TYPE_MAIN_LOOP, ++ .parent = TYPE_EVENT_LOOP_BASE, ++ .class_init = main_loop_class_init, ++ .instance_size = sizeof(MainLoop), ++}; ++ ++static void main_loop_register_types(void) ++{ ++ type_register_static(&main_loop_info); ++} ++ ++type_init(main_loop_register_types) ++ + static int max_priority; + + #ifndef _WIN32 +-- +2.31.1 + diff --git a/kvm-vfio-common-remove-spurious-warning-on-vfio_listener.patch b/kvm-vfio-common-remove-spurious-warning-on-vfio_listener.patch new file mode 100644 index 0000000..7e644c5 --- /dev/null +++ b/kvm-vfio-common-remove-spurious-warning-on-vfio_listener.patch @@ -0,0 +1,78 @@ +From 3de8fb9f3dba18d04efa10b70bcec641035effc5 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 24 May 2022 05:14:05 -0400 +Subject: [PATCH 16/16] vfio/common: remove spurious warning on + vfio_listener_region_del + +RH-Author: Eric Auger +RH-MergeRequest: 101: vfio/common: remove spurious warning on vfio_listener_region_del +RH-Commit: [1/1] dac688b8a981ebb964fea79ea198c329b9cdb551 (eauger1/centos-qemu-kvm) +RH-Bugzilla: 2086262 +RH-Acked-by: Miroslav Rezanina +RH-Acked-by: Cornelia Huck +RH-Acked-by: Alex Williamson + + Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2086262 + Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=45876133 + Upstream Status: YES + Tested: With TPM-CRB and VFIO + +851d6d1a0f ("vfio/common: remove spurious tpm-crb-cmd misalignment +warning") removed the warning on vfio_listener_region_add() path. + +However the same warning also hits on region_del path. Let's remove +it and reword the dynamic trace as this can be called on both +map and unmap path. + +Contextual Conflict in hw/vfio/common.c +We don't have 8e3b0cbb721 ("Replace qemu_real_host_page variables with inlined functions") + +Signed-off-by: Eric Auger +Reviewed-by: Cornelia Huck +Link: https://lore.kernel.org/r/20220524091405.416256-1-eric.auger@redhat.com +Fixes: 851d6d1a0ff2 ("vfio/common: remove spurious tpm-crb-cmd misalignment warning") +Signed-off-by: Alex Williamson +(cherry picked from commit ec6600be0dc16982181c7ad80d94c143c0807dd2) +Signed-off-by: Eric Auger +--- + hw/vfio/common.c | 10 +++++++++- + hw/vfio/trace-events | 2 +- + 2 files changed, 10 insertions(+), 2 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 0fbe0d47af..637981f9a1 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1145,7 +1145,15 @@ static void vfio_listener_region_del(MemoryListener *listener, + if (unlikely((section->offset_within_address_space & + ~qemu_real_host_page_mask) != + (section->offset_within_region & ~qemu_real_host_page_mask))) { +- error_report("%s received unaligned region", __func__); ++ if (!vfio_known_safe_misalignment(section)) { ++ error_report("%s received unaligned region %s iova=0x%"PRIx64 ++ " offset_within_region=0x%"PRIx64 ++ " qemu_real_host_page_size=0x%"PRIxPTR, ++ __func__, memory_region_name(section->mr), ++ section->offset_within_address_space, ++ section->offset_within_region, ++ qemu_real_host_page_size); ++ } + return; + } + +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index 582882db91..73dffe9e00 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -100,7 +100,7 @@ vfio_listener_region_add_skip(uint64_t start, uint64_t end) "SKIPPING region_add + vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d" + vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add [iommu] 0x%"PRIx64" - 0x%"PRIx64 + vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]" +-vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_within_region, uintptr_t page_size) "Region \"%s\" iova=0x%"PRIx64" offset_within_region=0x%"PRIx64" qemu_real_host_page_size=0x%"PRIxPTR ": cannot be mapped for DMA" ++vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_within_region, uintptr_t page_size) "Region \"%s\" iova=0x%"PRIx64" offset_within_region=0x%"PRIx64" qemu_real_host_page_size=0x%"PRIxPTR + vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA" + vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64 + vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64 +-- +2.31.1 + diff --git a/kvm-vhost-net-fix-improper-cleanup-in-vhost_net_start.patch b/kvm-vhost-net-fix-improper-cleanup-in-vhost_net_start.patch new file mode 100644 index 0000000..70e8f59 --- /dev/null +++ b/kvm-vhost-net-fix-improper-cleanup-in-vhost_net_start.patch @@ -0,0 +1,56 @@ +From edb2bd99355f300b512c040e91f5870ea14a5d7e Mon Sep 17 00:00:00 2001 +From: Si-Wei Liu +Date: Fri, 6 May 2022 19:28:15 -0700 +Subject: [PATCH 11/16] vhost-net: fix improper cleanup in vhost_net_start +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jason Wang +RH-MergeRequest: 98: Multiqueue fixes for vhost-vDPA +RH-Commit: [4/7] e88e482dd4b344f0cc887a358268beaed4d62917 (jasowang/qemu-kvm-cs) +RH-Bugzilla: 2070804 +RH-Acked-by: Eugenio Pérez +RH-Acked-by: Laurent Vivier +RH-Acked-by: Cindy Lu + +vhost_net_start() missed a corresponding stop_one() upon error from +vhost_set_vring_enable(). While at it, make the error handling for +err_start more robust. No real issue was found due to this though. + +Signed-off-by: Si-Wei Liu +Acked-by: Jason Wang +Message-Id: <1651890498-24478-5-git-send-email-si-wei.liu@oracle.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 6f3910b5eee00b8cc959e94659c0d524c482a418) +Signed-off-by: Jason Wang +--- + hw/net/vhost_net.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c +index 30379d2ca4..d6d7c51f62 100644 +--- a/hw/net/vhost_net.c ++++ b/hw/net/vhost_net.c +@@ -381,6 +381,7 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs, + r = vhost_set_vring_enable(peer, peer->vring_enable); + + if (r < 0) { ++ vhost_net_stop_one(get_vhost_net(peer), dev); + goto err_start; + } + } +@@ -390,7 +391,8 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs, + + err_start: + while (--i >= 0) { +- peer = qemu_get_peer(ncs , i); ++ peer = qemu_get_peer(ncs, i < data_queue_pairs ? ++ i : n->max_queue_pairs); + vhost_net_stop_one(get_vhost_net(peer), dev); + } + e = k->set_guest_notifiers(qbus->parent, total_notifiers, false); +-- +2.31.1 + diff --git a/kvm-vhost-vdpa-backend-feature-should-set-only-once.patch b/kvm-vhost-vdpa-backend-feature-should-set-only-once.patch new file mode 100644 index 0000000..747bf5f --- /dev/null +++ b/kvm-vhost-vdpa-backend-feature-should-set-only-once.patch @@ -0,0 +1,58 @@ +From 46c5a35aa56cf0dd55376638dbf7d46e85f497e1 Mon Sep 17 00:00:00 2001 +From: Si-Wei Liu +Date: Fri, 6 May 2022 19:28:16 -0700 +Subject: [PATCH 12/16] vhost-vdpa: backend feature should set only once +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jason Wang +RH-MergeRequest: 98: Multiqueue fixes for vhost-vDPA +RH-Commit: [5/7] 7531bb8da0c99b29997e8bfc6d1e811daf3cdd38 (jasowang/qemu-kvm-cs) +RH-Bugzilla: 2070804 +RH-Acked-by: Eugenio Pérez +RH-Acked-by: Laurent Vivier +RH-Acked-by: Cindy Lu + +The vhost_vdpa_one_time_request() branch in +vhost_vdpa_set_backend_cap() incorrectly sends down +ioctls on vhost_dev with non-zero index. This may +end up with multiple VHOST_SET_BACKEND_FEATURES +ioctl calls sent down on the vhost-vdpa fd that is +shared between all these vhost_dev's. + +To fix it, send down ioctl only once via the first +vhost_dev with index 0. Toggle the polarity of the +vhost_vdpa_one_time_request() test should do the +trick. + +Fixes: 4d191cfdc7de ("vhost-vdpa: classify one time request") +Signed-off-by: Si-Wei Liu +Reviewed-by: Stefano Garzarella +Acked-by: Jason Wang +Acked-by: Eugenio Pérez +Message-Id: <1651890498-24478-6-git-send-email-si-wei.liu@oracle.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 6aee7e4233f6467f69531fcd352adff028f3f5ea) +Signed-off-by: Jason Wang +--- + hw/virtio/vhost-vdpa.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c +index 8adf7c0b92..6e3dbd9e89 100644 +--- a/hw/virtio/vhost-vdpa.c ++++ b/hw/virtio/vhost-vdpa.c +@@ -665,7 +665,7 @@ static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev) + + features &= f; + +- if (vhost_vdpa_one_time_request(dev)) { ++ if (!vhost_vdpa_one_time_request(dev)) { + r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features); + if (r) { + return -EFAULT; +-- +2.31.1 + diff --git a/kvm-vhost-vdpa-change-name-and-polarity-for-vhost_vdpa_o.patch b/kvm-vhost-vdpa-change-name-and-polarity-for-vhost_vdpa_o.patch new file mode 100644 index 0000000..2466557 --- /dev/null +++ b/kvm-vhost-vdpa-change-name-and-polarity-for-vhost_vdpa_o.patch @@ -0,0 +1,123 @@ +From 58acdab17ec00ab76105ab92a51c5ba4dec3df5a Mon Sep 17 00:00:00 2001 +From: Si-Wei Liu +Date: Fri, 6 May 2022 19:28:17 -0700 +Subject: [PATCH 13/16] vhost-vdpa: change name and polarity for + vhost_vdpa_one_time_request() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jason Wang +RH-MergeRequest: 98: Multiqueue fixes for vhost-vDPA +RH-Commit: [6/7] 7029778f463a136ff412c63b86b6953390e47bf8 (jasowang/qemu-kvm-cs) +RH-Bugzilla: 2070804 +RH-Acked-by: Eugenio Pérez +RH-Acked-by: Laurent Vivier +RH-Acked-by: Cindy Lu + +The name vhost_vdpa_one_time_request() was confusing. No +matter whatever it returns, its typical occurrence had +always been at requests that only need to be applied once. +And the name didn't suggest what it actually checks for. +Change it to vhost_vdpa_first_dev() with polarity flipped +for better readibility of code. That way it is able to +reflect what the check is really about. + +This call is applicable to request which performs operation +only once, before queues are set up, and usually at the beginning +of the caller function. Document the requirement for it in place. + +Signed-off-by: Si-Wei Liu +Message-Id: <1651890498-24478-7-git-send-email-si-wei.liu@oracle.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +Reviewed-by: Stefano Garzarella +Acked-by: Jason Wang +(cherry picked from commit d71b0609fc04217e28d17009f04d74b08be6f466) +Signed-off-by: Jason Wang +--- + hw/virtio/vhost-vdpa.c | 23 +++++++++++++++-------- + 1 file changed, 15 insertions(+), 8 deletions(-) + +diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c +index 6e3dbd9e89..33dcaa135e 100644 +--- a/hw/virtio/vhost-vdpa.c ++++ b/hw/virtio/vhost-vdpa.c +@@ -366,11 +366,18 @@ static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v) + v->iova_range.last); + } + +-static bool vhost_vdpa_one_time_request(struct vhost_dev *dev) ++/* ++ * The use of this function is for requests that only need to be ++ * applied once. Typically such request occurs at the beginning ++ * of operation, and before setting up queues. It should not be ++ * used for request that performs operation until all queues are ++ * set, which would need to check dev->vq_index_end instead. ++ */ ++static bool vhost_vdpa_first_dev(struct vhost_dev *dev) + { + struct vhost_vdpa *v = dev->opaque; + +- return v->index != 0; ++ return v->index == 0; + } + + static int vhost_vdpa_get_dev_features(struct vhost_dev *dev, +@@ -451,7 +458,7 @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) + + vhost_vdpa_get_iova_range(v); + +- if (vhost_vdpa_one_time_request(dev)) { ++ if (!vhost_vdpa_first_dev(dev)) { + return 0; + } + +@@ -594,7 +601,7 @@ static int vhost_vdpa_memslots_limit(struct vhost_dev *dev) + static int vhost_vdpa_set_mem_table(struct vhost_dev *dev, + struct vhost_memory *mem) + { +- if (vhost_vdpa_one_time_request(dev)) { ++ if (!vhost_vdpa_first_dev(dev)) { + return 0; + } + +@@ -623,7 +630,7 @@ static int vhost_vdpa_set_features(struct vhost_dev *dev, + struct vhost_vdpa *v = dev->opaque; + int ret; + +- if (vhost_vdpa_one_time_request(dev)) { ++ if (!vhost_vdpa_first_dev(dev)) { + return 0; + } + +@@ -665,7 +672,7 @@ static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev) + + features &= f; + +- if (!vhost_vdpa_one_time_request(dev)) { ++ if (vhost_vdpa_first_dev(dev)) { + r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features); + if (r) { + return -EFAULT; +@@ -1118,7 +1125,7 @@ static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, + struct vhost_log *log) + { + struct vhost_vdpa *v = dev->opaque; +- if (v->shadow_vqs_enabled || vhost_vdpa_one_time_request(dev)) { ++ if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) { + return 0; + } + +@@ -1240,7 +1247,7 @@ static int vhost_vdpa_get_features(struct vhost_dev *dev, + + static int vhost_vdpa_set_owner(struct vhost_dev *dev) + { +- if (vhost_vdpa_one_time_request(dev)) { ++ if (!vhost_vdpa_first_dev(dev)) { + return 0; + } + +-- +2.31.1 + diff --git a/kvm-vhost-vdpa-fix-improper-cleanup-in-net_init_vhost_vd.patch b/kvm-vhost-vdpa-fix-improper-cleanup-in-net_init_vhost_vd.patch new file mode 100644 index 0000000..7716cbf --- /dev/null +++ b/kvm-vhost-vdpa-fix-improper-cleanup-in-net_init_vhost_vd.patch @@ -0,0 +1,48 @@ +From 3142102adb98f46518c0ac1773b0c48710c6bed6 Mon Sep 17 00:00:00 2001 +From: Si-Wei Liu +Date: Fri, 6 May 2022 19:28:14 -0700 +Subject: [PATCH 10/16] vhost-vdpa: fix improper cleanup in net_init_vhost_vdpa +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jason Wang +RH-MergeRequest: 98: Multiqueue fixes for vhost-vDPA +RH-Commit: [3/7] c83ff6c97d34cfae3c3447edde934b42a9ace75f (jasowang/qemu-kvm-cs) +RH-Bugzilla: 2070804 +RH-Acked-by: Eugenio Pérez +RH-Acked-by: Laurent Vivier +RH-Acked-by: Cindy Lu + +... such that no memory leaks on dangling net clients in case of +error. + +Signed-off-by: Si-Wei Liu +Acked-by: Jason Wang +Message-Id: <1651890498-24478-4-git-send-email-si-wei.liu@oracle.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 9bd055073e375c8a0d7ebce925e05d914d69fc7f) +Signed-off-by: Jason Wang +--- + net/vhost-vdpa.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c +index 1e9fe47c03..df1e69ee72 100644 +--- a/net/vhost-vdpa.c ++++ b/net/vhost-vdpa.c +@@ -306,7 +306,9 @@ int net_init_vhost_vdpa(const Netdev *netdev, const char *name, + + err: + if (i) { +- qemu_del_net_client(ncs[0]); ++ for (i--; i >= 0; i--) { ++ qemu_del_net_client(ncs[i]); ++ } + } + qemu_close(vdpa_device_fd); + +-- +2.31.1 + diff --git a/kvm-virtio-net-align-ctrl_vq-index-for-non-mq-guest-for-.patch b/kvm-virtio-net-align-ctrl_vq-index-for-non-mq-guest-for-.patch new file mode 100644 index 0000000..9da7ea7 --- /dev/null +++ b/kvm-virtio-net-align-ctrl_vq-index-for-non-mq-guest-for-.patch @@ -0,0 +1,143 @@ +From 316b73277de233c7a9b6917077c00d7012060944 Mon Sep 17 00:00:00 2001 +From: Si-Wei Liu +Date: Fri, 6 May 2022 19:28:13 -0700 +Subject: [PATCH 09/16] virtio-net: align ctrl_vq index for non-mq guest for + vhost_vdpa +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jason Wang +RH-MergeRequest: 98: Multiqueue fixes for vhost-vDPA +RH-Commit: [2/7] 7f764bbb579c7b473ad67fc25b46e698d277e781 (jasowang/qemu-kvm-cs) +RH-Bugzilla: 2070804 +RH-Acked-by: Eugenio Pérez +RH-Acked-by: Laurent Vivier +RH-Acked-by: Cindy Lu + +With MQ enabled vdpa device and non-MQ supporting guest e.g. +booting vdpa with mq=on over OVMF of single vqp, below assert +failure is seen: + +../hw/virtio/vhost-vdpa.c:560: vhost_vdpa_get_vq_index: Assertion `idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs' failed. + +0 0x00007f8ce3ff3387 in raise () at /lib64/libc.so.6 +1 0x00007f8ce3ff4a78 in abort () at /lib64/libc.so.6 +2 0x00007f8ce3fec1a6 in __assert_fail_base () at /lib64/libc.so.6 +3 0x00007f8ce3fec252 in () at /lib64/libc.so.6 +4 0x0000558f52d79421 in vhost_vdpa_get_vq_index (dev=, idx=) at ../hw/virtio/vhost-vdpa.c:563 +5 0x0000558f52d79421 in vhost_vdpa_get_vq_index (dev=, idx=) at ../hw/virtio/vhost-vdpa.c:558 +6 0x0000558f52d7329a in vhost_virtqueue_mask (hdev=0x558f55c01800, vdev=0x558f568f91f0, n=2, mask=) at ../hw/virtio/vhost.c:1557 +7 0x0000558f52c6b89a in virtio_pci_set_guest_notifier (d=d@entry=0x558f568f0f60, n=n@entry=2, assign=assign@entry=true, with_irqfd=with_irqfd@entry=false) + at ../hw/virtio/virtio-pci.c:974 +8 0x0000558f52c6c0d8 in virtio_pci_set_guest_notifiers (d=0x558f568f0f60, nvqs=3, assign=true) at ../hw/virtio/virtio-pci.c:1019 +9 0x0000558f52bf091d in vhost_net_start (dev=dev@entry=0x558f568f91f0, ncs=0x558f56937cd0, data_queue_pairs=data_queue_pairs@entry=1, cvq=cvq@entry=1) + at ../hw/net/vhost_net.c:361 +10 0x0000558f52d4e5e7 in virtio_net_set_status (status=, n=0x558f568f91f0) at ../hw/net/virtio-net.c:289 +11 0x0000558f52d4e5e7 in virtio_net_set_status (vdev=0x558f568f91f0, status=15 '\017') at ../hw/net/virtio-net.c:370 +12 0x0000558f52d6c4b2 in virtio_set_status (vdev=vdev@entry=0x558f568f91f0, val=val@entry=15 '\017') at ../hw/virtio/virtio.c:1945 +13 0x0000558f52c69eff in virtio_pci_common_write (opaque=0x558f568f0f60, addr=, val=, size=) at ../hw/virtio/virtio-pci.c:1292 +14 0x0000558f52d15d6e in memory_region_write_accessor (mr=0x558f568f19d0, addr=20, value=, size=1, shift=, mask=, attrs=...) + at ../softmmu/memory.c:492 +15 0x0000558f52d127de in access_with_adjusted_size (addr=addr@entry=20, value=value@entry=0x7f8cdbffe748, size=size@entry=1, access_size_min=, access_size_max=, access_fn=0x558f52d15cf0 , mr=0x558f568f19d0, attrs=...) at ../softmmu/memory.c:554 +16 0x0000558f52d157ef in memory_region_dispatch_write (mr=mr@entry=0x558f568f19d0, addr=20, data=, op=, attrs=attrs@entry=...) + at ../softmmu/memory.c:1504 +17 0x0000558f52d078e7 in flatview_write_continue (fv=fv@entry=0x7f8accbc3b90, addr=addr@entry=103079215124, attrs=..., ptr=ptr@entry=0x7f8ce6300028, len=len@entry=1, addr1=, l=, mr=0x558f568f19d0) at /home/opc/qemu-upstream/include/qemu/host-utils.h:165 +18 0x0000558f52d07b06 in flatview_write (fv=0x7f8accbc3b90, addr=103079215124, attrs=..., buf=0x7f8ce6300028, len=1) at ../softmmu/physmem.c:2822 +19 0x0000558f52d0b36b in address_space_write (as=, addr=, attrs=..., buf=buf@entry=0x7f8ce6300028, len=) + at ../softmmu/physmem.c:2914 +20 0x0000558f52d0b3da in address_space_rw (as=, addr=, attrs=..., + attrs@entry=..., buf=buf@entry=0x7f8ce6300028, len=, is_write=) at ../softmmu/physmem.c:2924 +21 0x0000558f52dced09 in kvm_cpu_exec (cpu=cpu@entry=0x558f55c2da60) at ../accel/kvm/kvm-all.c:2903 +22 0x0000558f52dcfabd in kvm_vcpu_thread_fn (arg=arg@entry=0x558f55c2da60) at ../accel/kvm/kvm-accel-ops.c:49 +23 0x0000558f52f9f04a in qemu_thread_start (args=) at ../util/qemu-thread-posix.c:556 +24 0x00007f8ce4392ea5 in start_thread () at /lib64/libpthread.so.0 +25 0x00007f8ce40bb9fd in clone () at /lib64/libc.so.6 + +The cause for the assert failure is due to that the vhost_dev index +for the ctrl vq was not aligned with actual one in use by the guest. +Upon multiqueue feature negotiation in virtio_net_set_multiqueue(), +if guest doesn't support multiqueue, the guest vq layout would shrink +to a single queue pair, consisting of 3 vqs in total (rx, tx and ctrl). +This results in ctrl_vq taking a different vhost_dev group index than +the default. We can map vq to the correct vhost_dev group by checking +if MQ is supported by guest and successfully negotiated. Since the +MQ feature is only present along with CTRL_VQ, we ensure the index +2 is only meant for the control vq while MQ is not supported by guest. + +Fixes: 22288fe ("virtio-net: vhost control virtqueue support") +Suggested-by: Jason Wang +Signed-off-by: Si-Wei Liu +Acked-by: Jason Wang +Message-Id: <1651890498-24478-3-git-send-email-si-wei.liu@oracle.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 68b0a6395f36a8f48f56f46d05f30be2067598b0) +Signed-off-by: Jason Wang +--- + hw/net/virtio-net.c | 33 +++++++++++++++++++++++++++++++-- + 1 file changed, 31 insertions(+), 2 deletions(-) + +diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c +index ffb3475201..f0bb29c741 100644 +--- a/hw/net/virtio-net.c ++++ b/hw/net/virtio-net.c +@@ -14,6 +14,7 @@ + #include "qemu/osdep.h" + #include "qemu/atomic.h" + #include "qemu/iov.h" ++#include "qemu/log.h" + #include "qemu/main-loop.h" + #include "qemu/module.h" + #include "hw/virtio/virtio.h" +@@ -3171,8 +3172,22 @@ static NetClientInfo net_virtio_info = { + static bool virtio_net_guest_notifier_pending(VirtIODevice *vdev, int idx) + { + VirtIONet *n = VIRTIO_NET(vdev); +- NetClientState *nc = qemu_get_subqueue(n->nic, vq2q(idx)); ++ NetClientState *nc; + assert(n->vhost_started); ++ if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ) && idx == 2) { ++ /* Must guard against invalid features and bogus queue index ++ * from being set by malicious guest, or penetrated through ++ * buggy migration stream. ++ */ ++ if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) { ++ qemu_log_mask(LOG_GUEST_ERROR, ++ "%s: bogus vq index ignored\n", __func__); ++ return false; ++ } ++ nc = qemu_get_subqueue(n->nic, n->max_queue_pairs); ++ } else { ++ nc = qemu_get_subqueue(n->nic, vq2q(idx)); ++ } + return vhost_net_virtqueue_pending(get_vhost_net(nc->peer), idx); + } + +@@ -3180,8 +3195,22 @@ static void virtio_net_guest_notifier_mask(VirtIODevice *vdev, int idx, + bool mask) + { + VirtIONet *n = VIRTIO_NET(vdev); +- NetClientState *nc = qemu_get_subqueue(n->nic, vq2q(idx)); ++ NetClientState *nc; + assert(n->vhost_started); ++ if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ) && idx == 2) { ++ /* Must guard against invalid features and bogus queue index ++ * from being set by malicious guest, or penetrated through ++ * buggy migration stream. ++ */ ++ if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) { ++ qemu_log_mask(LOG_GUEST_ERROR, ++ "%s: bogus vq index ignored\n", __func__); ++ return; ++ } ++ nc = qemu_get_subqueue(n->nic, n->max_queue_pairs); ++ } else { ++ nc = qemu_get_subqueue(n->nic, vq2q(idx)); ++ } + vhost_net_virtqueue_mask(get_vhost_net(nc->peer), + vdev, idx, mask); + } +-- +2.31.1 + diff --git a/kvm-virtio-net-don-t-handle-mq-request-in-userspace-hand.patch b/kvm-virtio-net-don-t-handle-mq-request-in-userspace-hand.patch new file mode 100644 index 0000000..3930cc2 --- /dev/null +++ b/kvm-virtio-net-don-t-handle-mq-request-in-userspace-hand.patch @@ -0,0 +1,109 @@ +From 521a1953bc11ab6823dcbbee773bcf86e926a9e7 Mon Sep 17 00:00:00 2001 +From: Si-Wei Liu +Date: Fri, 6 May 2022 19:28:18 -0700 +Subject: [PATCH 14/16] virtio-net: don't handle mq request in userspace + handler for vhost-vdpa +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jason Wang +RH-MergeRequest: 98: Multiqueue fixes for vhost-vDPA +RH-Commit: [7/7] 9781cab45448ae16a00fbf10cf7995df6b984a0a (jasowang/qemu-kvm-cs) +RH-Bugzilla: 2070804 +RH-Acked-by: Eugenio Pérez +RH-Acked-by: Laurent Vivier +RH-Acked-by: Cindy Lu + +virtio_queue_host_notifier_read() tends to read pending event +left behind on ioeventfd in the vhost_net_stop() path, and +attempts to handle outstanding kicks from userspace vq handler. +However, in the ctrl_vq handler, virtio_net_handle_mq() has a +recursive call into virtio_net_set_status(), which may lead to +segmentation fault as shown in below stack trace: + +0 0x000055f800df1780 in qdev_get_parent_bus (dev=0x0) at ../hw/core/qdev.c:376 +1 0x000055f800c68ad8 in virtio_bus_device_iommu_enabled (vdev=vdev@entry=0x0) at ../hw/virtio/virtio-bus.c:331 +2 0x000055f800d70d7f in vhost_memory_unmap (dev=) at ../hw/virtio/vhost.c:318 +3 0x000055f800d70d7f in vhost_memory_unmap (dev=, buffer=0x7fc19bec5240, len=2052, is_write=1, access_len=2052) at ../hw/virtio/vhost.c:336 +4 0x000055f800d71867 in vhost_virtqueue_stop (dev=dev@entry=0x55f8037ccc30, vdev=vdev@entry=0x55f8044ec590, vq=0x55f8037cceb0, idx=0) at ../hw/virtio/vhost.c:1241 +5 0x000055f800d7406c in vhost_dev_stop (hdev=hdev@entry=0x55f8037ccc30, vdev=vdev@entry=0x55f8044ec590) at ../hw/virtio/vhost.c:1839 +6 0x000055f800bf00a7 in vhost_net_stop_one (net=0x55f8037ccc30, dev=0x55f8044ec590) at ../hw/net/vhost_net.c:315 +7 0x000055f800bf0678 in vhost_net_stop (dev=dev@entry=0x55f8044ec590, ncs=0x55f80452bae0, data_queue_pairs=data_queue_pairs@entry=7, cvq=cvq@entry=1) + at ../hw/net/vhost_net.c:423 +8 0x000055f800d4e628 in virtio_net_set_status (status=, n=0x55f8044ec590) at ../hw/net/virtio-net.c:296 +9 0x000055f800d4e628 in virtio_net_set_status (vdev=vdev@entry=0x55f8044ec590, status=15 '\017') at ../hw/net/virtio-net.c:370 +10 0x000055f800d534d8 in virtio_net_handle_ctrl (iov_cnt=, iov=, cmd=0 '\000', n=0x55f8044ec590) at ../hw/net/virtio-net.c:1408 +11 0x000055f800d534d8 in virtio_net_handle_ctrl (vdev=0x55f8044ec590, vq=0x7fc1a7e888d0) at ../hw/net/virtio-net.c:1452 +12 0x000055f800d69f37 in virtio_queue_host_notifier_read (vq=0x7fc1a7e888d0) at ../hw/virtio/virtio.c:2331 +13 0x000055f800d69f37 in virtio_queue_host_notifier_read (n=n@entry=0x7fc1a7e8894c) at ../hw/virtio/virtio.c:3575 +14 0x000055f800c688e6 in virtio_bus_cleanup_host_notifier (bus=, n=n@entry=14) at ../hw/virtio/virtio-bus.c:312 +15 0x000055f800d73106 in vhost_dev_disable_notifiers (hdev=hdev@entry=0x55f8035b51b0, vdev=vdev@entry=0x55f8044ec590) + at ../../../include/hw/virtio/virtio-bus.h:35 +16 0x000055f800bf00b2 in vhost_net_stop_one (net=0x55f8035b51b0, dev=0x55f8044ec590) at ../hw/net/vhost_net.c:316 +17 0x000055f800bf0678 in vhost_net_stop (dev=dev@entry=0x55f8044ec590, ncs=0x55f80452bae0, data_queue_pairs=data_queue_pairs@entry=7, cvq=cvq@entry=1) + at ../hw/net/vhost_net.c:423 +18 0x000055f800d4e628 in virtio_net_set_status (status=, n=0x55f8044ec590) at ../hw/net/virtio-net.c:296 +19 0x000055f800d4e628 in virtio_net_set_status (vdev=0x55f8044ec590, status=15 '\017') at ../hw/net/virtio-net.c:370 +20 0x000055f800d6c4b2 in virtio_set_status (vdev=0x55f8044ec590, val=) at ../hw/virtio/virtio.c:1945 +21 0x000055f800d11d9d in vm_state_notify (running=running@entry=false, state=state@entry=RUN_STATE_SHUTDOWN) at ../softmmu/runstate.c:333 +22 0x000055f800d04e7a in do_vm_stop (state=state@entry=RUN_STATE_SHUTDOWN, send_stop=send_stop@entry=false) at ../softmmu/cpus.c:262 +23 0x000055f800d04e99 in vm_shutdown () at ../softmmu/cpus.c:280 +24 0x000055f800d126af in qemu_cleanup () at ../softmmu/runstate.c:812 +25 0x000055f800ad5b13 in main (argc=, argv=, envp=) at ../softmmu/main.c:51 + +For now, temporarily disable handling MQ request from the ctrl_vq +userspace hanlder to avoid the recursive virtio_net_set_status() +call. Some rework is needed to allow changing the number of +queues without going through a full virtio_net_set_status cycle, +particularly for vhost-vdpa backend. + +This patch will need to be reverted as soon as future patches of +having the change of #queues handled in userspace is merged. + +Fixes: 402378407db ("vhost-vdpa: multiqueue support") +Signed-off-by: Si-Wei Liu +Acked-by: Jason Wang +Message-Id: <1651890498-24478-8-git-send-email-si-wei.liu@oracle.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 2a7888cc3aa31faee839fa5dddad354ff8941f4c) +Signed-off-by: Jason Wang +--- + hw/net/virtio-net.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c +index f0bb29c741..099e65036d 100644 +--- a/hw/net/virtio-net.c ++++ b/hw/net/virtio-net.c +@@ -1381,6 +1381,7 @@ static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd, + { + VirtIODevice *vdev = VIRTIO_DEVICE(n); + uint16_t queue_pairs; ++ NetClientState *nc = qemu_get_queue(n->nic); + + virtio_net_disable_rss(n); + if (cmd == VIRTIO_NET_CTRL_MQ_HASH_CONFIG) { +@@ -1412,6 +1413,18 @@ static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd, + return VIRTIO_NET_ERR; + } + ++ /* Avoid changing the number of queue_pairs for vdpa device in ++ * userspace handler. A future fix is needed to handle the mq ++ * change in userspace handler with vhost-vdpa. Let's disable ++ * the mq handling from userspace for now and only allow get ++ * done through the kernel. Ripples may be seen when falling ++ * back to userspace, but without doing it qemu process would ++ * crash on a recursive entry to virtio_net_set_status(). ++ */ ++ if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) { ++ return VIRTIO_NET_ERR; ++ } ++ + n->curr_queue_pairs = queue_pairs; + /* stop the backend before changing the number of queue_pairs to avoid handling a + * disabled queue */ +-- +2.31.1 + diff --git a/kvm-virtio-net-setup-vhost_dev-and-notifiers-for-cvq-onl.patch b/kvm-virtio-net-setup-vhost_dev-and-notifiers-for-cvq-onl.patch new file mode 100644 index 0000000..f6072d2 --- /dev/null +++ b/kvm-virtio-net-setup-vhost_dev-and-notifiers-for-cvq-onl.patch @@ -0,0 +1,52 @@ +From 9e737aba614e94da4458f02d4ff97e95ffffd19f Mon Sep 17 00:00:00 2001 +From: Si-Wei Liu +Date: Fri, 6 May 2022 19:28:12 -0700 +Subject: [PATCH 08/16] virtio-net: setup vhost_dev and notifiers for cvq only + when feature is negotiated +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Jason Wang +RH-MergeRequest: 98: Multiqueue fixes for vhost-vDPA +RH-Commit: [1/7] a5c5a2862b2e4d15ef7c09da3e4234fdef37cc66 (jasowang/qemu-kvm-cs) +RH-Bugzilla: 2070804 +RH-Acked-by: Eugenio Pérez +RH-Acked-by: Laurent Vivier +RH-Acked-by: Cindy Lu + +When the control virtqueue feature is absent or not negotiated, +vhost_net_start() still tries to set up vhost_dev and install +vhost notifiers for the control virtqueue, which results in +erroneous ioctl calls with incorrect queue index sending down +to driver. Do that only when needed. + +Fixes: 22288fe ("virtio-net: vhost control virtqueue support") +Signed-off-by: Si-Wei Liu +Acked-by: Jason Wang +Message-Id: <1651890498-24478-2-git-send-email-si-wei.liu@oracle.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit aa8581945a13712ff3eed0ad3ba7a9664fc1604b) +Signed-off-by: Jason Wang +--- + hw/net/virtio-net.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c +index 1067e72b39..ffb3475201 100644 +--- a/hw/net/virtio-net.c ++++ b/hw/net/virtio-net.c +@@ -245,7 +245,8 @@ static void virtio_net_vhost_status(VirtIONet *n, uint8_t status) + VirtIODevice *vdev = VIRTIO_DEVICE(n); + NetClientState *nc = qemu_get_queue(n->nic); + int queue_pairs = n->multiqueue ? n->max_queue_pairs : 1; +- int cvq = n->max_ncs - n->max_queue_pairs; ++ int cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ? ++ n->max_ncs - n->max_queue_pairs : 0; + + if (!get_vhost_net(nc->peer)) { + return; +-- +2.31.1 + diff --git a/qemu-kvm.spec b/qemu-kvm.spec index da9fadb..26f0c0a 100644 --- a/qemu-kvm.spec +++ b/qemu-kvm.spec @@ -151,7 +151,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}:%{version} \ Summary: QEMU is a machine emulator and virtualizer Name: qemu-kvm Version: 7.0.0 -Release: 5%{?rcrel}%{?dist}%{?cc_suffix} +Release: 6%{?rcrel}%{?dist}%{?cc_suffix} # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped # Epoch 15 used for RHEL 8 # Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5) @@ -244,6 +244,38 @@ Patch44: kvm-migration-Fix-operator-type.patch Patch45: kvm-qemu-nbd-Pass-max-connections-to-blockdev-layer.patch # For bz#1708300 - RFE: qemu-nbd vs NBD_FLAG_CAN_MULTI_CONN Patch46: kvm-nbd-server-Allow-MULTI_CONN-for-shared-writable-expo.patch +# For bz#2031024 - Add support for fixing thread pool size [QEMU] +Patch47: kvm-Introduce-event-loop-base-abstract-class.patch +# For bz#2031024 - Add support for fixing thread pool size [QEMU] +Patch48: kvm-util-main-loop-Introduce-the-main-loop-into-QOM.patch +# For bz#2031024 - Add support for fixing thread pool size [QEMU] +Patch49: kvm-util-event-loop-base-Introduce-options-to-set-the-th.patch +# For bz#2072379 - Fail to rebuild the reference count tables of qcow2 image on host block devices (e.g. LVs) +Patch50: kvm-qcow2-Improve-refcount-structure-rebuilding.patch +# For bz#2072379 - Fail to rebuild the reference count tables of qcow2 image on host block devices (e.g. LVs) +Patch51: kvm-iotests-108-Test-new-refcount-rebuild-algorithm.patch +# For bz#2072379 - Fail to rebuild the reference count tables of qcow2 image on host block devices (e.g. LVs) +Patch52: kvm-qcow2-Add-errp-to-rebuild_refcount_structure.patch +# For bz#2072379 - Fail to rebuild the reference count tables of qcow2 image on host block devices (e.g. LVs) +Patch53: kvm-iotests-108-Fix-when-missing-user_allow_other.patch +# For bz#2070804 - PXE boot crash qemu when using multiqueue vDPA +Patch54: kvm-virtio-net-setup-vhost_dev-and-notifiers-for-cvq-onl.patch +# For bz#2070804 - PXE boot crash qemu when using multiqueue vDPA +Patch55: kvm-virtio-net-align-ctrl_vq-index-for-non-mq-guest-for-.patch +# For bz#2070804 - PXE boot crash qemu when using multiqueue vDPA +Patch56: kvm-vhost-vdpa-fix-improper-cleanup-in-net_init_vhost_vd.patch +# For bz#2070804 - PXE boot crash qemu when using multiqueue vDPA +Patch57: kvm-vhost-net-fix-improper-cleanup-in-vhost_net_start.patch +# For bz#2070804 - PXE boot crash qemu when using multiqueue vDPA +Patch58: kvm-vhost-vdpa-backend-feature-should-set-only-once.patch +# For bz#2070804 - PXE boot crash qemu when using multiqueue vDPA +Patch59: kvm-vhost-vdpa-change-name-and-polarity-for-vhost_vdpa_o.patch +# For bz#2070804 - PXE boot crash qemu when using multiqueue vDPA +Patch60: kvm-virtio-net-don-t-handle-mq-request-in-userspace-hand.patch +# For bz#2094270 - Do not set the hard vCPU limit to the soft vCPU limit in downstream qemu-kvm anymore +Patch61: kvm-Revert-globally-limit-the-maximum-number-of-CPUs.patch +# For bz#2086262 - [Win11][tpm]vfio_listener_region_del received unaligned region +Patch62: kvm-vfio-common-remove-spurious-warning-on-vfio_listener.patch # Source-git patches @@ -1279,6 +1311,34 @@ useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \ %endif %changelog +* Mon Jun 13 2022 Miroslav Rezanina - 7.0.0-6 +- kvm-Introduce-event-loop-base-abstract-class.patch [bz#2031024] +- kvm-util-main-loop-Introduce-the-main-loop-into-QOM.patch [bz#2031024] +- kvm-util-event-loop-base-Introduce-options-to-set-the-th.patch [bz#2031024] +- kvm-qcow2-Improve-refcount-structure-rebuilding.patch [bz#2072379] +- kvm-iotests-108-Test-new-refcount-rebuild-algorithm.patch [bz#2072379] +- kvm-qcow2-Add-errp-to-rebuild_refcount_structure.patch [bz#2072379] +- kvm-iotests-108-Fix-when-missing-user_allow_other.patch [bz#2072379] +- kvm-virtio-net-setup-vhost_dev-and-notifiers-for-cvq-onl.patch [bz#2070804] +- kvm-virtio-net-align-ctrl_vq-index-for-non-mq-guest-for-.patch [bz#2070804] +- kvm-vhost-vdpa-fix-improper-cleanup-in-net_init_vhost_vd.patch [bz#2070804] +- kvm-vhost-net-fix-improper-cleanup-in-vhost_net_start.patch [bz#2070804] +- kvm-vhost-vdpa-backend-feature-should-set-only-once.patch [bz#2070804] +- kvm-vhost-vdpa-change-name-and-polarity-for-vhost_vdpa_o.patch [bz#2070804] +- kvm-virtio-net-don-t-handle-mq-request-in-userspace-hand.patch [bz#2070804] +- kvm-Revert-globally-limit-the-maximum-number-of-CPUs.patch [bz#2094270] +- kvm-vfio-common-remove-spurious-warning-on-vfio_listener.patch [bz#2086262] +- Resolves: bz#2031024 + (Add support for fixing thread pool size [QEMU]) +- Resolves: bz#2072379 + (Fail to rebuild the reference count tables of qcow2 image on host block devices (e.g. LVs)) +- Resolves: bz#2070804 + (PXE boot crash qemu when using multiqueue vDPA) +- Resolves: bz#2094270 + (Do not set the hard vCPU limit to the soft vCPU limit in downstream qemu-kvm anymore) +- Resolves: bz#2086262 + ([Win11][tpm]vfio_listener_region_del received unaligned region) + * Mon May 30 2022 Miroslav Rezanina - 7.0.0-5 - kvm-qemu-nbd-Pass-max-connections-to-blockdev-layer.patch [bz#1708300] - kvm-nbd-server-Allow-MULTI_CONN-for-shared-writable-expo.patch [bz#1708300]