diff --git a/kvm-KVM-Define-KVM_MEMSLOTS_NUM_MAX_DEFAULT.patch b/kvm-KVM-Define-KVM_MEMSLOTS_NUM_MAX_DEFAULT.patch new file mode 100644 index 0000000..14bd08c --- /dev/null +++ b/kvm-KVM-Define-KVM_MEMSLOTS_NUM_MAX_DEFAULT.patch @@ -0,0 +1,50 @@ +From 111d70a5bdc3ee0dde0a6def9e0c75ed20b4f093 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 17 Sep 2024 12:38:33 -0400 +Subject: [PATCH 6/9] KVM: Define KVM_MEMSLOTS_NUM_MAX_DEFAULT + +RH-Author: Peter Xu +RH-MergeRequest: 285: KVM: Dynamic sized kvm memslots array +RH-Jira: RHEL-57685 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [5/7] e4c2a2c2f3a809c8efb709521c7a94ba0627c69b (peterx/qemu-kvm) + +Make the default max nr_slots a macro, it's only used when KVM reports +nothing. + +Reviewed-by: David Hildenbrand +Signed-off-by: Peter Xu +Link: https://lore.kernel.org/r/20240917163835.194664-3-peterx@redhat.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit b34a908c8f24eedb0a8e5ff486b059b58fd793f4) +Signed-off-by: Peter Xu +--- + accel/kvm/kvm-all.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index 38393bc86b..87db0f9494 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -71,6 +71,8 @@ + + /* Default num of memslots to be allocated when VM starts */ + #define KVM_MEMSLOTS_NR_ALLOC_DEFAULT 16 ++/* Default max allowed memslots if kernel reported nothing */ ++#define KVM_MEMSLOTS_NR_MAX_DEFAULT 32 + + struct KVMParkedVcpu { + unsigned long vcpu_id; +@@ -2617,7 +2619,7 @@ static int kvm_init(MachineState *ms) + + /* If unspecified, use the default value */ + if (!s->nr_slots) { +- s->nr_slots = 32; ++ s->nr_slots_max = KVM_MEMSLOTS_NR_MAX_DEFAULT; + } + + s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE); +-- +2.39.3 + diff --git a/kvm-KVM-Dynamic-sized-kvm-memslots-array.patch b/kvm-KVM-Dynamic-sized-kvm-memslots-array.patch new file mode 100644 index 0000000..8fc648d --- /dev/null +++ b/kvm-KVM-Dynamic-sized-kvm-memslots-array.patch @@ -0,0 +1,251 @@ +From c77a30265b8d0db43174b040ea82103f8fdb9911 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 17 Sep 2024 12:38:32 -0400 +Subject: [PATCH 5/9] KVM: Dynamic sized kvm memslots array + +RH-Author: Peter Xu +RH-MergeRequest: 285: KVM: Dynamic sized kvm memslots array +RH-Jira: RHEL-57685 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [4/7] 46d4abec352a92112e593ea61b7cbf5ce5f94cdc (peterx/qemu-kvm) + +Zhiyi reported an infinite loop issue in VFIO use case. The cause of that +was a separate discussion, however during that I found a regression of +dirty sync slowness when profiling. + +Each KVMMemoryListerner maintains an array of kvm memslots. Currently it's +statically allocated to be the max supported by the kernel. However after +Linux commit 4fc096a99e ("KVM: Raise the maximum number of user memslots"), +the max supported memslots reported now grows to some number large enough +so that it may not be wise to always statically allocate with the max +reported. + +What's worse, QEMU kvm code still walks all the allocated memslots entries +to do any form of lookups. It can drastically slow down all memslot +operations because each of such loop can run over 32K times on the new +kernels. + +Fix this issue by making the memslots to be allocated dynamically. + +Here the initial size was set to 16 because it should cover the basic VM +usages, so that the hope is the majority VM use case may not even need to +grow at all (e.g. if one starts a VM with ./qemu-system-x86_64 by default +it'll consume 9 memslots), however not too large to waste memory. + +There can also be even better way to address this, but so far this is the +simplest and should be already better even than before we grow the max +supported memslots. For example, in the case of above issue when VFIO was +attached on a 32GB system, there are only ~10 memslots used. So it could +be good enough as of now. + +In the above VFIO context, measurement shows that the precopy dirty sync +shrinked from ~86ms to ~3ms after this patch applied. It should also apply +to any KVM enabled VM even without VFIO. + +NOTE: we don't have a FIXES tag for this patch because there's no real +commit that regressed this in QEMU. Such behavior existed for a long time, +but only start to be a problem when the kernel reports very large +nr_slots_max value. However that's pretty common now (the kernel change +was merged in 2021) so we attached cc:stable because we'll want this change +to be backported to stable branches. + +Cc: qemu-stable +Reported-by: Zhiyi Guo +Tested-by: Zhiyi Guo +Signed-off-by: Peter Xu +Acked-by: David Hildenbrand +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240917163835.194664-2-peterx@redhat.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit 5504a8126115d173687b37e657312a8ffe29fc0c) +Signed-off-by: Peter Xu +--- + accel/kvm/kvm-all.c | 87 +++++++++++++++++++++++++++++++++------- + accel/kvm/trace-events | 1 + + include/sysemu/kvm_int.h | 1 + + 3 files changed, 74 insertions(+), 15 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index 8187ad3964..38393bc86b 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -69,6 +69,9 @@ + #define KVM_GUESTDBG_BLOCKIRQ 0 + #endif + ++/* Default num of memslots to be allocated when VM starts */ ++#define KVM_MEMSLOTS_NR_ALLOC_DEFAULT 16 ++ + struct KVMParkedVcpu { + unsigned long vcpu_id; + int kvm_fd; +@@ -165,6 +168,57 @@ void kvm_resample_fd_notify(int gsi) + } + } + ++/** ++ * kvm_slots_grow(): Grow the slots[] array in the KVMMemoryListener ++ * ++ * @kml: The KVMMemoryListener* to grow the slots[] array ++ * @nr_slots_new: The new size of slots[] array ++ * ++ * Returns: True if the array grows larger, false otherwise. ++ */ ++static bool kvm_slots_grow(KVMMemoryListener *kml, unsigned int nr_slots_new) ++{ ++ unsigned int i, cur = kml->nr_slots_allocated; ++ KVMSlot *slots; ++ ++ if (nr_slots_new > kvm_state->nr_slots) { ++ nr_slots_new = kvm_state->nr_slots; ++ } ++ ++ if (cur >= nr_slots_new) { ++ /* Big enough, no need to grow, or we reached max */ ++ return false; ++ } ++ ++ if (cur == 0) { ++ slots = g_new0(KVMSlot, nr_slots_new); ++ } else { ++ assert(kml->slots); ++ slots = g_renew(KVMSlot, kml->slots, nr_slots_new); ++ /* ++ * g_renew() doesn't initialize extended buffers, however kvm ++ * memslots require fields to be zero-initialized. E.g. pointers, ++ * memory_size field, etc. ++ */ ++ memset(&slots[cur], 0x0, sizeof(slots[0]) * (nr_slots_new - cur)); ++ } ++ ++ for (i = cur; i < nr_slots_new; i++) { ++ slots[i].slot = i; ++ } ++ ++ kml->slots = slots; ++ kml->nr_slots_allocated = nr_slots_new; ++ trace_kvm_slots_grow(cur, nr_slots_new); ++ ++ return true; ++} ++ ++static bool kvm_slots_double(KVMMemoryListener *kml) ++{ ++ return kvm_slots_grow(kml, kml->nr_slots_allocated * 2); ++} ++ + unsigned int kvm_get_max_memslots(void) + { + KVMState *s = KVM_STATE(current_accel()); +@@ -193,15 +247,26 @@ unsigned int kvm_get_free_memslots(void) + /* Called with KVMMemoryListener.slots_lock held */ + static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml) + { +- KVMState *s = kvm_state; ++ unsigned int n; + int i; + +- for (i = 0; i < s->nr_slots; i++) { ++ for (i = 0; i < kml->nr_slots_allocated; i++) { + if (kml->slots[i].memory_size == 0) { + return &kml->slots[i]; + } + } + ++ /* ++ * If no free slots, try to grow first by doubling. Cache the old size ++ * here to avoid another round of search: if the grow succeeded, it ++ * means slots[] now must have the existing "n" slots occupied, ++ * followed by one or more free slots starting from slots[n]. ++ */ ++ n = kml->nr_slots_allocated; ++ if (kvm_slots_double(kml)) { ++ return &kml->slots[n]; ++ } ++ + return NULL; + } + +@@ -222,10 +287,9 @@ static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml, + hwaddr start_addr, + hwaddr size) + { +- KVMState *s = kvm_state; + int i; + +- for (i = 0; i < s->nr_slots; i++) { ++ for (i = 0; i < kml->nr_slots_allocated; i++) { + KVMSlot *mem = &kml->slots[i]; + + if (start_addr == mem->start_addr && size == mem->memory_size) { +@@ -267,7 +331,7 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, + int i, ret = 0; + + kvm_slots_lock(); +- for (i = 0; i < s->nr_slots; i++) { ++ for (i = 0; i < kml->nr_slots_allocated; i++) { + KVMSlot *mem = &kml->slots[i]; + + if (ram >= mem->ram && ram < mem->ram + mem->memory_size) { +@@ -1071,7 +1135,7 @@ static int kvm_physical_log_clear(KVMMemoryListener *kml, + + kvm_slots_lock(); + +- for (i = 0; i < s->nr_slots; i++) { ++ for (i = 0; i < kml->nr_slots_allocated; i++) { + mem = &kml->slots[i]; + /* Discard slots that are empty or do not overlap the section */ + if (!mem->memory_size || +@@ -1719,12 +1783,8 @@ static void kvm_log_sync_global(MemoryListener *l, bool last_stage) + /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */ + kvm_dirty_ring_flush(); + +- /* +- * TODO: make this faster when nr_slots is big while there are +- * only a few used slots (small VMs). +- */ + kvm_slots_lock(); +- for (i = 0; i < s->nr_slots; i++) { ++ for (i = 0; i < kml->nr_slots_allocated; i++) { + mem = &kml->slots[i]; + if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { + kvm_slot_sync_dirty_pages(mem); +@@ -1839,12 +1899,9 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, + { + int i; + +- kml->slots = g_new0(KVMSlot, s->nr_slots); + kml->as_id = as_id; + +- for (i = 0; i < s->nr_slots; i++) { +- kml->slots[i].slot = i; +- } ++ kvm_slots_grow(kml, KVM_MEMSLOTS_NR_ALLOC_DEFAULT); + + QSIMPLEQ_INIT(&kml->transaction_add); + QSIMPLEQ_INIT(&kml->transaction_del); +diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events +index 37626c1ac5..ad2ae6fca5 100644 +--- a/accel/kvm/trace-events ++++ b/accel/kvm/trace-events +@@ -36,3 +36,4 @@ kvm_io_window_exit(void) "" + kvm_run_exit_system_event(int cpu_index, uint32_t event_type) "cpu_index %d, system_even_type %"PRIu32 + kvm_convert_memory(uint64_t start, uint64_t size, const char *msg) "start 0x%" PRIx64 " size 0x%" PRIx64 " %s" + kvm_memory_fault(uint64_t start, uint64_t size, uint64_t flags) "start 0x%" PRIx64 " size 0x%" PRIx64 " flags 0x%" PRIx64 ++kvm_slots_grow(unsigned int old, unsigned int new) "%u -> %u" +diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h +index 1d8fb1473b..48e496b3d4 100644 +--- a/include/sysemu/kvm_int.h ++++ b/include/sysemu/kvm_int.h +@@ -46,6 +46,7 @@ typedef struct KVMMemoryListener { + MemoryListener listener; + KVMSlot *slots; + unsigned int nr_used_slots; ++ unsigned int nr_slots_allocated; + int as_id; + QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_add; + QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_del; +-- +2.39.3 + diff --git a/kvm-KVM-Rename-KVMMemoryListener.nr_used_slots-to-nr_slo.patch b/kvm-KVM-Rename-KVMMemoryListener.nr_used_slots-to-nr_slo.patch new file mode 100644 index 0000000..544247a --- /dev/null +++ b/kvm-KVM-Rename-KVMMemoryListener.nr_used_slots-to-nr_slo.patch @@ -0,0 +1,73 @@ +From b1d082cfad79245ac0ffed45f723092388d1cf45 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 17 Sep 2024 12:38:34 -0400 +Subject: [PATCH 7/9] KVM: Rename KVMMemoryListener.nr_used_slots to + nr_slots_used + +RH-Author: Peter Xu +RH-MergeRequest: 285: KVM: Dynamic sized kvm memslots array +RH-Jira: RHEL-57685 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [6/7] ed173123ee23edcf62a6c1940ca74cdfd6b545e9 (peterx/qemu-kvm) + +This will make all nr_slots counters to be named in the same manner. + +Reviewed-by: David Hildenbrand +Signed-off-by: Peter Xu +Link: https://lore.kernel.org/r/20240917163835.194664-4-peterx@redhat.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit dbdc00ba5b136bba80d850f61cc79a9cafaae1cd) +Signed-off-by: Peter Xu +--- + accel/kvm/kvm-all.c | 6 +++--- + include/sysemu/kvm_int.h | 2 +- + 2 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index 87db0f9494..e99aaba486 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -239,7 +239,7 @@ unsigned int kvm_get_free_memslots(void) + if (!s->as[i].ml) { + continue; + } +- used_slots = MAX(used_slots, s->as[i].ml->nr_used_slots); ++ used_slots = MAX(used_slots, s->as[i].ml->nr_slots_used); + } + kvm_slots_unlock(); + +@@ -1516,7 +1516,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, + } + start_addr += slot_size; + size -= slot_size; +- kml->nr_used_slots--; ++ kml->nr_slots_used--; + } while (size); + return; + } +@@ -1555,7 +1555,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, + ram_start_offset += slot_size; + ram += slot_size; + size -= slot_size; +- kml->nr_used_slots++; ++ kml->nr_slots_used++; + } while (size); + } + +diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h +index 48e496b3d4..b705dfc9b4 100644 +--- a/include/sysemu/kvm_int.h ++++ b/include/sysemu/kvm_int.h +@@ -45,7 +45,7 @@ typedef struct KVMMemoryUpdate { + typedef struct KVMMemoryListener { + MemoryListener listener; + KVMSlot *slots; +- unsigned int nr_used_slots; ++ unsigned int nr_slots_used; + unsigned int nr_slots_allocated; + int as_id; + QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_add; +-- +2.39.3 + diff --git a/kvm-KVM-Rename-KVMState-nr_slots-to-nr_slots_max.patch b/kvm-KVM-Rename-KVMState-nr_slots-to-nr_slots_max.patch new file mode 100644 index 0000000..92b46cc --- /dev/null +++ b/kvm-KVM-Rename-KVMState-nr_slots-to-nr_slots_max.patch @@ -0,0 +1,90 @@ +From 891fb13363d168760cd21d0c57368e1a413cad27 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 17 Sep 2024 12:38:35 -0400 +Subject: [PATCH 8/9] KVM: Rename KVMState->nr_slots to nr_slots_max + +RH-Author: Peter Xu +RH-MergeRequest: 285: KVM: Dynamic sized kvm memslots array +RH-Jira: RHEL-57685 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [7/7] 7a1b28f04ee6a2c80b07db241fc88cb40f54e376 (peterx/qemu-kvm) + +This value used to reflect the maximum supported memslots from KVM kernel. +Rename it to be clearer. + +Reviewed-by: David Hildenbrand +Signed-off-by: Peter Xu +Link: https://lore.kernel.org/r/20240917163835.194664-5-peterx@redhat.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit 943c742868c739c0b14fd996bad3adf744156fec) +Signed-off-by: Peter Xu +--- + accel/kvm/kvm-all.c | 12 ++++++------ + include/sysemu/kvm_int.h | 4 ++-- + 2 files changed, 8 insertions(+), 8 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index e99aaba486..dc6253895d 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -183,8 +183,8 @@ static bool kvm_slots_grow(KVMMemoryListener *kml, unsigned int nr_slots_new) + unsigned int i, cur = kml->nr_slots_allocated; + KVMSlot *slots; + +- if (nr_slots_new > kvm_state->nr_slots) { +- nr_slots_new = kvm_state->nr_slots; ++ if (nr_slots_new > kvm_state->nr_slots_max) { ++ nr_slots_new = kvm_state->nr_slots_max; + } + + if (cur >= nr_slots_new) { +@@ -225,7 +225,7 @@ unsigned int kvm_get_max_memslots(void) + { + KVMState *s = KVM_STATE(current_accel()); + +- return s->nr_slots; ++ return s->nr_slots_max; + } + + unsigned int kvm_get_free_memslots(void) +@@ -243,7 +243,7 @@ unsigned int kvm_get_free_memslots(void) + } + kvm_slots_unlock(); + +- return s->nr_slots - used_slots; ++ return s->nr_slots_max - used_slots; + } + + /* Called with KVMMemoryListener.slots_lock held */ +@@ -2615,10 +2615,10 @@ static int kvm_init(MachineState *ms) + (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE); + + kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT); +- s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); ++ s->nr_slots_max = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); + + /* If unspecified, use the default value */ +- if (!s->nr_slots) { ++ if (!s->nr_slots_max) { + s->nr_slots_max = KVM_MEMSLOTS_NR_MAX_DEFAULT; + } + +diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h +index b705dfc9b4..2c57194b6b 100644 +--- a/include/sysemu/kvm_int.h ++++ b/include/sysemu/kvm_int.h +@@ -103,8 +103,8 @@ struct KVMDirtyRingReaper { + struct KVMState + { + AccelState parent_obj; +- +- int nr_slots; ++ /* Max number of KVM slots supported */ ++ int nr_slots_max; + int fd; + int vmfd; + int coalesced_mmio; +-- +2.39.3 + diff --git a/kvm-accel-kvm-refactor-dirty-ring-setup.patch b/kvm-accel-kvm-refactor-dirty-ring-setup.patch new file mode 100644 index 0000000..eb6eb23 --- /dev/null +++ b/kvm-accel-kvm-refactor-dirty-ring-setup.patch @@ -0,0 +1,144 @@ +From 00a2dbf483a077bb31b1c9f70cced36319d22628 Mon Sep 17 00:00:00 2001 +From: Ani Sinha +Date: Thu, 12 Sep 2024 11:48:38 +0530 +Subject: [PATCH 4/9] accel/kvm: refactor dirty ring setup + +RH-Author: Peter Xu +RH-MergeRequest: 285: KVM: Dynamic sized kvm memslots array +RH-Jira: RHEL-57685 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [3/7] 94f345d1e7ad6437dd2ce67ca7cad224c67aa48f (peterx/qemu-kvm) + +Refactor setting up of dirty ring code in kvm_init() so that is can be +reused in the future patchsets. + +Signed-off-by: Ani Sinha +Link: https://lore.kernel.org/r/20240912061838.4501-1-anisinha@redhat.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit 28ed7f9761eb273e7dedcfdc0507d158106d0451) +Signed-off-by: Peter Xu +--- + accel/kvm/kvm-all.c | 88 +++++++++++++++++++++++++-------------------- + 1 file changed, 50 insertions(+), 38 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index d86d1b515a..8187ad3964 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -2439,6 +2439,55 @@ static int find_kvm_machine_type(MachineState *ms) + return type; + } + ++static int kvm_setup_dirty_ring(KVMState *s) ++{ ++ uint64_t dirty_log_manual_caps; ++ int ret; ++ ++ /* ++ * Enable KVM dirty ring if supported, otherwise fall back to ++ * dirty logging mode ++ */ ++ ret = kvm_dirty_ring_init(s); ++ if (ret < 0) { ++ return ret; ++ } ++ ++ /* ++ * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is ++ * enabled. More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no ++ * page is wr-protected initially, which is against how kvm dirty ring is ++ * usage - kvm dirty ring requires all pages are wr-protected at the very ++ * beginning. Enabling this feature for dirty ring causes data corruption. ++ * ++ * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log, ++ * we may expect a higher stall time when starting the migration. In the ++ * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too: ++ * instead of clearing dirty bit, it can be a way to explicitly wr-protect ++ * guest pages. ++ */ ++ if (!s->kvm_dirty_ring_size) { ++ dirty_log_manual_caps = ++ kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); ++ dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | ++ KVM_DIRTY_LOG_INITIALLY_SET); ++ s->manual_dirty_log_protect = dirty_log_manual_caps; ++ if (dirty_log_manual_caps) { ++ ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0, ++ dirty_log_manual_caps); ++ if (ret) { ++ warn_report("Trying to enable capability %"PRIu64" of " ++ "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. " ++ "Falling back to the legacy mode. ", ++ dirty_log_manual_caps); ++ s->manual_dirty_log_protect = 0; ++ } ++ } ++ } ++ ++ return 0; ++} ++ + static int kvm_init(MachineState *ms) + { + MachineClass *mc = MACHINE_GET_CLASS(ms); +@@ -2458,7 +2507,6 @@ static int kvm_init(MachineState *ms) + const KVMCapabilityInfo *missing_cap; + int ret; + int type; +- uint64_t dirty_log_manual_caps; + + qemu_mutex_init(&kml_slots_lock); + +@@ -2570,47 +2618,11 @@ static int kvm_init(MachineState *ms) + s->coalesced_pio = s->coalesced_mmio && + kvm_check_extension(s, KVM_CAP_COALESCED_PIO); + +- /* +- * Enable KVM dirty ring if supported, otherwise fall back to +- * dirty logging mode +- */ +- ret = kvm_dirty_ring_init(s); ++ ret = kvm_setup_dirty_ring(s); + if (ret < 0) { + goto err; + } + +- /* +- * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is +- * enabled. More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no +- * page is wr-protected initially, which is against how kvm dirty ring is +- * usage - kvm dirty ring requires all pages are wr-protected at the very +- * beginning. Enabling this feature for dirty ring causes data corruption. +- * +- * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log, +- * we may expect a higher stall time when starting the migration. In the +- * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too: +- * instead of clearing dirty bit, it can be a way to explicitly wr-protect +- * guest pages. +- */ +- if (!s->kvm_dirty_ring_size) { +- dirty_log_manual_caps = +- kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); +- dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | +- KVM_DIRTY_LOG_INITIALLY_SET); +- s->manual_dirty_log_protect = dirty_log_manual_caps; +- if (dirty_log_manual_caps) { +- ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0, +- dirty_log_manual_caps); +- if (ret) { +- warn_report("Trying to enable capability %"PRIu64" of " +- "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. " +- "Falling back to the legacy mode. ", +- dirty_log_manual_caps); +- s->manual_dirty_log_protect = 0; +- } +- } +- } +- + #ifdef KVM_CAP_VCPU_EVENTS + s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS); + #endif +-- +2.39.3 + diff --git a/kvm-kvm-refactor-core-virtual-machine-creation-into-its-.patch b/kvm-kvm-refactor-core-virtual-machine-creation-into-its-.patch new file mode 100644 index 0000000..fa633bb --- /dev/null +++ b/kvm-kvm-refactor-core-virtual-machine-creation-into-its-.patch @@ -0,0 +1,144 @@ +From 67180363bdc1898462f90e16c1909db7331cc5e2 Mon Sep 17 00:00:00 2001 +From: Ani Sinha +Date: Thu, 8 Aug 2024 17:08:38 +0530 +Subject: [PATCH 3/9] kvm: refactor core virtual machine creation into its own + function + +RH-Author: Peter Xu +RH-MergeRequest: 285: KVM: Dynamic sized kvm memslots array +RH-Jira: RHEL-57685 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [2/7] a783111d9a2ef6590103543f1bd103bf90052872 (peterx/qemu-kvm) + +Refactoring the core logic around KVM_CREATE_VM into its own separate function +so that it can be called from other functions in subsequent patches. There is +no functional change in this patch. + +CC: pbonzini@redhat.com +CC: zhao1.liu@intel.com +Signed-off-by: Ani Sinha +Link: https://lore.kernel.org/r/20240808113838.1697366-1-anisinha@redhat.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit 67388078da1cf6dac89e5a7c748cca3444d49690) +Signed-off-by: Peter Xu +--- + accel/kvm/kvm-all.c | 89 ++++++++++++++++++++++++++++----------------- + 1 file changed, 56 insertions(+), 33 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index 7432a54f39..d86d1b515a 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -2385,6 +2385,60 @@ uint32_t kvm_dirty_ring_size(void) + return kvm_state->kvm_dirty_ring_size; + } + ++static int do_kvm_create_vm(MachineState *ms, int type) ++{ ++ KVMState *s; ++ int ret; ++ ++ s = KVM_STATE(ms->accelerator); ++ ++ do { ++ ret = kvm_ioctl(s, KVM_CREATE_VM, type); ++ } while (ret == -EINTR); ++ ++ if (ret < 0) { ++ error_report("ioctl(KVM_CREATE_VM) failed: %s", strerror(-ret)); ++ ++#ifdef TARGET_S390X ++ if (ret == -EINVAL) { ++ error_printf("Host kernel setup problem detected." ++ " Please verify:\n"); ++ error_printf("- for kernels supporting the" ++ " switch_amode or user_mode parameters, whether"); ++ error_printf(" user space is running in primary address space\n"); ++ error_printf("- for kernels supporting the vm.allocate_pgste" ++ " sysctl, whether it is enabled\n"); ++ } ++#elif defined(TARGET_PPC) ++ if (ret == -EINVAL) { ++ error_printf("PPC KVM module is not loaded. Try modprobe kvm_%s.\n", ++ (type == 2) ? "pr" : "hv"); ++ } ++#endif ++ } ++ ++ return ret; ++} ++ ++static int find_kvm_machine_type(MachineState *ms) ++{ ++ MachineClass *mc = MACHINE_GET_CLASS(ms); ++ int type; ++ ++ if (object_property_find(OBJECT(current_machine), "kvm-type")) { ++ g_autofree char *kvm_type; ++ kvm_type = object_property_get_str(OBJECT(current_machine), ++ "kvm-type", ++ &error_abort); ++ type = mc->kvm_type(ms, kvm_type); ++ } else if (mc->kvm_type) { ++ type = mc->kvm_type(ms, NULL); ++ } else { ++ type = kvm_arch_get_default_type(ms); ++ } ++ return type; ++} ++ + static int kvm_init(MachineState *ms) + { + MachineClass *mc = MACHINE_GET_CLASS(ms); +@@ -2467,45 +2521,14 @@ static int kvm_init(MachineState *ms) + } + s->as = g_new0(struct KVMAs, s->nr_as); + +- if (object_property_find(OBJECT(current_machine), "kvm-type")) { +- g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine), +- "kvm-type", +- &error_abort); +- type = mc->kvm_type(ms, kvm_type); +- } else if (mc->kvm_type) { +- type = mc->kvm_type(ms, NULL); +- } else { +- type = kvm_arch_get_default_type(ms); +- } +- ++ type = find_kvm_machine_type(ms); + if (type < 0) { + ret = -EINVAL; + goto err; + } + +- do { +- ret = kvm_ioctl(s, KVM_CREATE_VM, type); +- } while (ret == -EINTR); +- ++ ret = do_kvm_create_vm(ms, type); + if (ret < 0) { +- error_report("ioctl(KVM_CREATE_VM) failed: %s", strerror(-ret)); +- +-#ifdef TARGET_S390X +- if (ret == -EINVAL) { +- error_printf("Host kernel setup problem detected." +- " Please verify:\n"); +- error_printf("- for kernels supporting the" +- " switch_amode or user_mode parameters, whether"); +- error_printf(" user space is running in primary address space\n"); +- error_printf("- for kernels supporting the vm.allocate_pgste" +- " sysctl, whether it is enabled\n"); +- } +-#elif defined(TARGET_PPC) +- if (ret == -EINVAL) { +- error_printf("PPC KVM module is not loaded. Try modprobe kvm_%s.\n", +- (type == 2) ? "pr" : "hv"); +- } +-#endif + goto err; + } + +-- +2.39.3 + diff --git a/kvm-kvm-replace-fprintf-with-error_report-printf-in-kvm_.patch b/kvm-kvm-replace-fprintf-with-error_report-printf-in-kvm_.patch new file mode 100644 index 0000000..7f7a756 --- /dev/null +++ b/kvm-kvm-replace-fprintf-with-error_report-printf-in-kvm_.patch @@ -0,0 +1,132 @@ +From 522e19dd84eb5c4d88b3b70193ee104f67a5b89d Mon Sep 17 00:00:00 2001 +From: Ani Sinha +Date: Wed, 28 Aug 2024 18:15:39 +0530 +Subject: [PATCH 2/9] kvm: replace fprintf with error_report()/printf() in + kvm_init() + +RH-Author: Peter Xu +RH-MergeRequest: 285: KVM: Dynamic sized kvm memslots array +RH-Jira: RHEL-57685 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [1/7] 6c1230a6d5033d928817df9458938a675058e995 (peterx/qemu-kvm) + +error_report() is more appropriate for error situations. Replace fprintf with +error_report() and error_printf() as appropriate. Some improvement in error +reporting also happens as a part of this change. For example: + +From: +$ ./qemu-system-x86_64 --accel kvm +Could not access KVM kernel module: No such file or directory + +To: +$ ./qemu-system-x86_64 --accel kvm +qemu-system-x86_64: --accel kvm: Could not access KVM kernel module: No such file or directory + +CC: qemu-trivial@nongnu.org +CC: zhao1.liu@intel.com +CC: armbru@redhat.com +Reviewed-by: Zhao Liu +Reviewed-by: Markus Armbruster +Signed-off-by: Ani Sinha +Link: https://lore.kernel.org/r/20240828124539.62672-1-anisinha@redhat.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit 804dfbe3ef5e950328b162ae85741be2e228544f) +Signed-off-by: Peter Xu +--- + accel/kvm/kvm-all.c | 40 ++++++++++++++++++---------------------- + 1 file changed, 18 insertions(+), 22 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index c7f1cc64b6..7432a54f39 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -2427,7 +2427,7 @@ static int kvm_init(MachineState *ms) + QLIST_INIT(&s->kvm_parked_vcpus); + s->fd = qemu_open_old(s->device ?: "/dev/kvm", O_RDWR); + if (s->fd == -1) { +- fprintf(stderr, "Could not access KVM kernel module: %m\n"); ++ error_report("Could not access KVM kernel module: %m"); + ret = -errno; + goto err; + } +@@ -2437,13 +2437,13 @@ static int kvm_init(MachineState *ms) + if (ret >= 0) { + ret = -EINVAL; + } +- fprintf(stderr, "kvm version too old\n"); ++ error_report("kvm version too old"); + goto err; + } + + if (ret > KVM_API_VERSION) { + ret = -EINVAL; +- fprintf(stderr, "kvm version not supported\n"); ++ error_report("kvm version not supported"); + goto err; + } + +@@ -2488,26 +2488,22 @@ static int kvm_init(MachineState *ms) + } while (ret == -EINTR); + + if (ret < 0) { +- fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret, +- strerror(-ret)); ++ error_report("ioctl(KVM_CREATE_VM) failed: %s", strerror(-ret)); + + #ifdef TARGET_S390X + if (ret == -EINVAL) { +- fprintf(stderr, +- "Host kernel setup problem detected. Please verify:\n"); +- fprintf(stderr, "- for kernels supporting the switch_amode or" +- " user_mode parameters, whether\n"); +- fprintf(stderr, +- " user space is running in primary address space\n"); +- fprintf(stderr, +- "- for kernels supporting the vm.allocate_pgste sysctl, " +- "whether it is enabled\n"); ++ error_printf("Host kernel setup problem detected." ++ " Please verify:\n"); ++ error_printf("- for kernels supporting the" ++ " switch_amode or user_mode parameters, whether"); ++ error_printf(" user space is running in primary address space\n"); ++ error_printf("- for kernels supporting the vm.allocate_pgste" ++ " sysctl, whether it is enabled\n"); + } + #elif defined(TARGET_PPC) + if (ret == -EINVAL) { +- fprintf(stderr, +- "PPC KVM module is not loaded. Try modprobe kvm_%s.\n", +- (type == 2) ? "pr" : "hv"); ++ error_printf("PPC KVM module is not loaded. Try modprobe kvm_%s.\n", ++ (type == 2) ? "pr" : "hv"); + } + #endif + goto err; +@@ -2526,9 +2522,9 @@ static int kvm_init(MachineState *ms) + nc->name, nc->num, soft_vcpus_limit); + + if (nc->num > hard_vcpus_limit) { +- fprintf(stderr, "Number of %s cpus requested (%d) exceeds " +- "the maximum cpus supported by KVM (%d)\n", +- nc->name, nc->num, hard_vcpus_limit); ++ error_report("Number of %s cpus requested (%d) exceeds " ++ "the maximum cpus supported by KVM (%d)", ++ nc->name, nc->num, hard_vcpus_limit); + exit(1); + } + } +@@ -2542,8 +2538,8 @@ static int kvm_init(MachineState *ms) + } + if (missing_cap) { + ret = -EINVAL; +- fprintf(stderr, "kvm does not support %s\n%s", +- missing_cap->name, upgrade_note); ++ error_report("kvm does not support %s", missing_cap->name); ++ error_printf("%s", upgrade_note); + goto err; + } + +-- +2.39.3 + diff --git a/kvm-migration-Ensure-vmstate_save-sets-errp.patch b/kvm-migration-Ensure-vmstate_save-sets-errp.patch new file mode 100644 index 0000000..47a3c70 --- /dev/null +++ b/kvm-migration-Ensure-vmstate_save-sets-errp.patch @@ -0,0 +1,92 @@ +From 6be2f51c147df1ab1dd7c68c6b554512dfc05e6f Mon Sep 17 00:00:00 2001 +From: Hanna Czenczek +Date: Tue, 15 Oct 2024 19:04:37 +0200 +Subject: [PATCH 1/9] migration: Ensure vmstate_save() sets errp + +RH-Author: Hanna Czenczek +RH-MergeRequest: 288: migration: Ensure vmstate_save() sets errp +RH-Jira: RHEL-63051 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: German Maglione +RH-Commit: [1/1] 4d5a65c294ae83a29db885e42fb3f2ca913c36f0 (hreitz/qemu-kvm-c-9-s) + +migration/savevm.c contains some calls to vmstate_save() that are +followed by migrate_set_error() if the integer return value indicates an +error. migrate_set_error() requires that the `Error *` object passed to +it is set. Therefore, vmstate_save() is assumed to always set *errp on +error. + +Right now, that assumption is not met: vmstate_save_state_v() (called +internally by vmstate_save()) will not set *errp if +vmstate_subsection_save() or vmsd->post_save() fail. Fix that by adding +an *errp parameter to vmstate_subsection_save(), and by generating a +generic error in case post_save() fails (as is already done for +pre_save()). + +Without this patch, qemu will crash after vmstate_subsection_save() or +post_save() have failed inside of a vmstate_save() call (unless +migrate_set_error() then happen to discard the new error because +s->error is already set). This happens e.g. when receiving the state +from a virtio-fs back-end (virtiofsd) fails. + +Signed-off-by: Hanna Czenczek +Link: https://lore.kernel.org/r/20241015170437.310358-1-hreitz@redhat.com +Signed-off-by: Peter Xu +(cherry picked from commit 37dfcba1a04989830c706f9cbc00450e5d3a7447) +Signed-off-by: Hanna Czenczek +--- + migration/vmstate.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +diff --git a/migration/vmstate.c b/migration/vmstate.c +index ff5d589a6d..fa002b24e8 100644 +--- a/migration/vmstate.c ++++ b/migration/vmstate.c +@@ -22,7 +22,8 @@ + #include "trace.h" + + static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd, +- void *opaque, JSONWriter *vmdesc); ++ void *opaque, JSONWriter *vmdesc, ++ Error **errp); + static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd, + void *opaque); + +@@ -441,12 +442,13 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd, + json_writer_end_array(vmdesc); + } + +- ret = vmstate_subsection_save(f, vmsd, opaque, vmdesc); ++ ret = vmstate_subsection_save(f, vmsd, opaque, vmdesc, errp); + + if (vmsd->post_save) { + int ps_ret = vmsd->post_save(opaque); +- if (!ret) { ++ if (!ret && ps_ret) { + ret = ps_ret; ++ error_setg(errp, "post-save failed: %s", vmsd->name); + } + } + return ret; +@@ -518,7 +520,8 @@ static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd, + } + + static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd, +- void *opaque, JSONWriter *vmdesc) ++ void *opaque, JSONWriter *vmdesc, ++ Error **errp) + { + const VMStateDescription * const *sub = vmsd->subsections; + bool vmdesc_has_subsections = false; +@@ -546,7 +549,7 @@ static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd, + qemu_put_byte(f, len); + qemu_put_buffer(f, (uint8_t *)vmsdsub->name, len); + qemu_put_be32(f, vmsdsub->version_id); +- ret = vmstate_save_state(f, vmsdsub, opaque, vmdesc); ++ ret = vmstate_save_state_with_err(f, vmsdsub, opaque, vmdesc, errp); + if (ret) { + return ret; + } +-- +2.39.3 + diff --git a/qemu-kvm.spec b/qemu-kvm.spec index 00910cf..cf6188e 100644 --- a/qemu-kvm.spec +++ b/qemu-kvm.spec @@ -143,7 +143,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}:%{version} \ Summary: QEMU is a machine emulator and virtualizer Name: qemu-kvm Version: 9.1.0 -Release: 4%{?rcrel}%{?dist}%{?cc_suffix} +Release: 5%{?rcrel}%{?dist}%{?cc_suffix} # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped # Epoch 15 used for RHEL 8 # Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5) @@ -320,6 +320,22 @@ Patch72: kvm-target-i386-Introduce-GraniteRapids-v2-model.patch # For RHEL-30315 - [Intel 10.0 FEAT] [GNR] Virt-QEMU: Add AVX10.1 instruction support # For RHEL-45110 - [Intel 10.0 FEAT] [CWF][DMR] Virt-QEMU: Advertise new instructions SHA2-512NI, SM3, and SM4 Patch73: kvm-target-i386-add-sha512-sm3-sm4-feature-bits.patch +# For RHEL-63051 - qemu crashed after killed virtiofsd during migration +Patch74: kvm-migration-Ensure-vmstate_save-sets-errp.patch +# For RHEL-57685 - Bad migration performance when performing vGPU VM live migration +Patch75: kvm-kvm-replace-fprintf-with-error_report-printf-in-kvm_.patch +# For RHEL-57685 - Bad migration performance when performing vGPU VM live migration +Patch76: kvm-kvm-refactor-core-virtual-machine-creation-into-its-.patch +# For RHEL-57685 - Bad migration performance when performing vGPU VM live migration +Patch77: kvm-accel-kvm-refactor-dirty-ring-setup.patch +# For RHEL-57685 - Bad migration performance when performing vGPU VM live migration +Patch78: kvm-KVM-Dynamic-sized-kvm-memslots-array.patch +# For RHEL-57685 - Bad migration performance when performing vGPU VM live migration +Patch79: kvm-KVM-Define-KVM_MEMSLOTS_NUM_MAX_DEFAULT.patch +# For RHEL-57685 - Bad migration performance when performing vGPU VM live migration +Patch80: kvm-KVM-Rename-KVMMemoryListener.nr_used_slots-to-nr_slo.patch +# For RHEL-57685 - Bad migration performance when performing vGPU VM live migration +Patch81: kvm-KVM-Rename-KVMState-nr_slots-to-nr_slots_max.patch %if %{have_clang} BuildRequires: clang @@ -357,6 +373,8 @@ BuildRequires: librbd-devel # We need both because the 'stap' binary is probed for by configure BuildRequires: systemtap BuildRequires: systemtap-sdt-devel +# Required as we use dtrace for trace backend +BuildRequires: /usr/bin/dtrace # For VNC PNG support BuildRequires: libpng-devel # For virtiofs @@ -1384,6 +1402,23 @@ useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \ %endif %changelog +* Tue Nov 19 2024 Miroslav Rezanina - 9.1.0-5 +- kvm-migration-Ensure-vmstate_save-sets-errp.patch [RHEL-63051] +- kvm-kvm-replace-fprintf-with-error_report-printf-in-kvm_.patch [RHEL-57685] +- kvm-kvm-refactor-core-virtual-machine-creation-into-its-.patch [RHEL-57685] +- kvm-accel-kvm-refactor-dirty-ring-setup.patch [RHEL-57685] +- kvm-KVM-Dynamic-sized-kvm-memslots-array.patch [RHEL-57685] +- kvm-KVM-Define-KVM_MEMSLOTS_NUM_MAX_DEFAULT.patch [RHEL-57685] +- kvm-KVM-Rename-KVMMemoryListener.nr_used_slots-to-nr_slo.patch [RHEL-57685] +- kvm-KVM-Rename-KVMState-nr_slots-to-nr_slots_max.patch [RHEL-57685] +- kvm-Require-new-dtrace-package.patch [RHEL-67899] +- Resolves: RHEL-63051 + (qemu crashed after killed virtiofsd during migration) +- Resolves: RHEL-57685 + (Bad migration performance when performing vGPU VM live migration ) +- Resolves: RHEL-67899 + (Failed to build qemu-kvm due to missing dtrace [rhel-10.0]) + * Tue Nov 12 2024 Miroslav Rezanina - 9.1.0-4.el10 - kvm-accel-kvm-check-for-KVM_CAP_READONLY_MEM-on-VM.patch [RHEL-58928] - kvm-hw-s390x-ipl-Provide-more-memory-to-the-s390-ccw.img.patch [RHEL-58153]