diff --git a/kvm-KVM-Define-KVM_MEMSLOTS_NUM_MAX_DEFAULT.patch b/kvm-KVM-Define-KVM_MEMSLOTS_NUM_MAX_DEFAULT.patch new file mode 100644 index 0000000..2670a23 --- /dev/null +++ b/kvm-KVM-Define-KVM_MEMSLOTS_NUM_MAX_DEFAULT.patch @@ -0,0 +1,49 @@ +From f9dfed0e5fd03ee6fa7364801db7d101bf085a79 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 17 Sep 2024 12:38:33 -0400 +Subject: [PATCH 6/9] KVM: Define KVM_MEMSLOTS_NUM_MAX_DEFAULT + +RH-Author: Peter Xu +RH-MergeRequest: 284: KVM: Dynamic sized kvm memslots array +RH-Jira: RHEL-57682 +RH-Acked-by: Juraj Marcin +RH-Commit: [5/7] c95bdaa406e76b943882fd75c4d345ca5fc397d4 (peterx/qemu-kvm) + +Make the default max nr_slots a macro, it's only used when KVM reports +nothing. + +Reviewed-by: David Hildenbrand +Signed-off-by: Peter Xu +Link: https://lore.kernel.org/r/20240917163835.194664-3-peterx@redhat.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit b34a908c8f24eedb0a8e5ff486b059b58fd793f4) +Signed-off-by: Peter Xu +--- + accel/kvm/kvm-all.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index 44bf4180fa..3900de8883 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -71,6 +71,8 @@ + + /* Default num of memslots to be allocated when VM starts */ + #define KVM_MEMSLOTS_NR_ALLOC_DEFAULT 16 ++/* Default max allowed memslots if kernel reported nothing */ ++#define KVM_MEMSLOTS_NR_MAX_DEFAULT 32 + + struct KVMParkedVcpu { + unsigned long vcpu_id; +@@ -2617,7 +2619,7 @@ static int kvm_init(MachineState *ms) + + /* If unspecified, use the default value */ + if (!s->nr_slots) { +- s->nr_slots = 32; ++ s->nr_slots_max = KVM_MEMSLOTS_NR_MAX_DEFAULT; + } + + s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE); +-- +2.39.3 + diff --git a/kvm-KVM-Dynamic-sized-kvm-memslots-array.patch b/kvm-KVM-Dynamic-sized-kvm-memslots-array.patch new file mode 100644 index 0000000..f3f0522 --- /dev/null +++ b/kvm-KVM-Dynamic-sized-kvm-memslots-array.patch @@ -0,0 +1,250 @@ +From 9813dc1d19a6afedbab382b79e72691190e42fcf Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 17 Sep 2024 12:38:32 -0400 +Subject: [PATCH 5/9] KVM: Dynamic sized kvm memslots array + +RH-Author: Peter Xu +RH-MergeRequest: 284: KVM: Dynamic sized kvm memslots array +RH-Jira: RHEL-57682 +RH-Acked-by: Juraj Marcin +RH-Commit: [4/7] 04d74707873b28a50b1e1bc08e4788c79455518c (peterx/qemu-kvm) + +Zhiyi reported an infinite loop issue in VFIO use case. The cause of that +was a separate discussion, however during that I found a regression of +dirty sync slowness when profiling. + +Each KVMMemoryListerner maintains an array of kvm memslots. Currently it's +statically allocated to be the max supported by the kernel. However after +Linux commit 4fc096a99e ("KVM: Raise the maximum number of user memslots"), +the max supported memslots reported now grows to some number large enough +so that it may not be wise to always statically allocate with the max +reported. + +What's worse, QEMU kvm code still walks all the allocated memslots entries +to do any form of lookups. It can drastically slow down all memslot +operations because each of such loop can run over 32K times on the new +kernels. + +Fix this issue by making the memslots to be allocated dynamically. + +Here the initial size was set to 16 because it should cover the basic VM +usages, so that the hope is the majority VM use case may not even need to +grow at all (e.g. if one starts a VM with ./qemu-system-x86_64 by default +it'll consume 9 memslots), however not too large to waste memory. + +There can also be even better way to address this, but so far this is the +simplest and should be already better even than before we grow the max +supported memslots. For example, in the case of above issue when VFIO was +attached on a 32GB system, there are only ~10 memslots used. So it could +be good enough as of now. + +In the above VFIO context, measurement shows that the precopy dirty sync +shrinked from ~86ms to ~3ms after this patch applied. It should also apply +to any KVM enabled VM even without VFIO. + +NOTE: we don't have a FIXES tag for this patch because there's no real +commit that regressed this in QEMU. Such behavior existed for a long time, +but only start to be a problem when the kernel reports very large +nr_slots_max value. However that's pretty common now (the kernel change +was merged in 2021) so we attached cc:stable because we'll want this change +to be backported to stable branches. + +Cc: qemu-stable +Reported-by: Zhiyi Guo +Tested-by: Zhiyi Guo +Signed-off-by: Peter Xu +Acked-by: David Hildenbrand +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20240917163835.194664-2-peterx@redhat.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit 5504a8126115d173687b37e657312a8ffe29fc0c) +Signed-off-by: Peter Xu +--- + accel/kvm/kvm-all.c | 87 +++++++++++++++++++++++++++++++++------- + accel/kvm/trace-events | 1 + + include/sysemu/kvm_int.h | 1 + + 3 files changed, 74 insertions(+), 15 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index de709fbc43..44bf4180fa 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -69,6 +69,9 @@ + #define KVM_GUESTDBG_BLOCKIRQ 0 + #endif + ++/* Default num of memslots to be allocated when VM starts */ ++#define KVM_MEMSLOTS_NR_ALLOC_DEFAULT 16 ++ + struct KVMParkedVcpu { + unsigned long vcpu_id; + int kvm_fd; +@@ -165,6 +168,57 @@ void kvm_resample_fd_notify(int gsi) + } + } + ++/** ++ * kvm_slots_grow(): Grow the slots[] array in the KVMMemoryListener ++ * ++ * @kml: The KVMMemoryListener* to grow the slots[] array ++ * @nr_slots_new: The new size of slots[] array ++ * ++ * Returns: True if the array grows larger, false otherwise. ++ */ ++static bool kvm_slots_grow(KVMMemoryListener *kml, unsigned int nr_slots_new) ++{ ++ unsigned int i, cur = kml->nr_slots_allocated; ++ KVMSlot *slots; ++ ++ if (nr_slots_new > kvm_state->nr_slots) { ++ nr_slots_new = kvm_state->nr_slots; ++ } ++ ++ if (cur >= nr_slots_new) { ++ /* Big enough, no need to grow, or we reached max */ ++ return false; ++ } ++ ++ if (cur == 0) { ++ slots = g_new0(KVMSlot, nr_slots_new); ++ } else { ++ assert(kml->slots); ++ slots = g_renew(KVMSlot, kml->slots, nr_slots_new); ++ /* ++ * g_renew() doesn't initialize extended buffers, however kvm ++ * memslots require fields to be zero-initialized. E.g. pointers, ++ * memory_size field, etc. ++ */ ++ memset(&slots[cur], 0x0, sizeof(slots[0]) * (nr_slots_new - cur)); ++ } ++ ++ for (i = cur; i < nr_slots_new; i++) { ++ slots[i].slot = i; ++ } ++ ++ kml->slots = slots; ++ kml->nr_slots_allocated = nr_slots_new; ++ trace_kvm_slots_grow(cur, nr_slots_new); ++ ++ return true; ++} ++ ++static bool kvm_slots_double(KVMMemoryListener *kml) ++{ ++ return kvm_slots_grow(kml, kml->nr_slots_allocated * 2); ++} ++ + unsigned int kvm_get_max_memslots(void) + { + KVMState *s = KVM_STATE(current_accel()); +@@ -193,15 +247,26 @@ unsigned int kvm_get_free_memslots(void) + /* Called with KVMMemoryListener.slots_lock held */ + static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml) + { +- KVMState *s = kvm_state; ++ unsigned int n; + int i; + +- for (i = 0; i < s->nr_slots; i++) { ++ for (i = 0; i < kml->nr_slots_allocated; i++) { + if (kml->slots[i].memory_size == 0) { + return &kml->slots[i]; + } + } + ++ /* ++ * If no free slots, try to grow first by doubling. Cache the old size ++ * here to avoid another round of search: if the grow succeeded, it ++ * means slots[] now must have the existing "n" slots occupied, ++ * followed by one or more free slots starting from slots[n]. ++ */ ++ n = kml->nr_slots_allocated; ++ if (kvm_slots_double(kml)) { ++ return &kml->slots[n]; ++ } ++ + return NULL; + } + +@@ -222,10 +287,9 @@ static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml, + hwaddr start_addr, + hwaddr size) + { +- KVMState *s = kvm_state; + int i; + +- for (i = 0; i < s->nr_slots; i++) { ++ for (i = 0; i < kml->nr_slots_allocated; i++) { + KVMSlot *mem = &kml->slots[i]; + + if (start_addr == mem->start_addr && size == mem->memory_size) { +@@ -267,7 +331,7 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, + int i, ret = 0; + + kvm_slots_lock(); +- for (i = 0; i < s->nr_slots; i++) { ++ for (i = 0; i < kml->nr_slots_allocated; i++) { + KVMSlot *mem = &kml->slots[i]; + + if (ram >= mem->ram && ram < mem->ram + mem->memory_size) { +@@ -1071,7 +1135,7 @@ static int kvm_physical_log_clear(KVMMemoryListener *kml, + + kvm_slots_lock(); + +- for (i = 0; i < s->nr_slots; i++) { ++ for (i = 0; i < kml->nr_slots_allocated; i++) { + mem = &kml->slots[i]; + /* Discard slots that are empty or do not overlap the section */ + if (!mem->memory_size || +@@ -1719,12 +1783,8 @@ static void kvm_log_sync_global(MemoryListener *l, bool last_stage) + /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */ + kvm_dirty_ring_flush(); + +- /* +- * TODO: make this faster when nr_slots is big while there are +- * only a few used slots (small VMs). +- */ + kvm_slots_lock(); +- for (i = 0; i < s->nr_slots; i++) { ++ for (i = 0; i < kml->nr_slots_allocated; i++) { + mem = &kml->slots[i]; + if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { + kvm_slot_sync_dirty_pages(mem); +@@ -1839,12 +1899,9 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, + { + int i; + +- kml->slots = g_new0(KVMSlot, s->nr_slots); + kml->as_id = as_id; + +- for (i = 0; i < s->nr_slots; i++) { +- kml->slots[i].slot = i; +- } ++ kvm_slots_grow(kml, KVM_MEMSLOTS_NR_ALLOC_DEFAULT); + + QSIMPLEQ_INIT(&kml->transaction_add); + QSIMPLEQ_INIT(&kml->transaction_del); +diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events +index 37626c1ac5..ad2ae6fca5 100644 +--- a/accel/kvm/trace-events ++++ b/accel/kvm/trace-events +@@ -36,3 +36,4 @@ kvm_io_window_exit(void) "" + kvm_run_exit_system_event(int cpu_index, uint32_t event_type) "cpu_index %d, system_even_type %"PRIu32 + kvm_convert_memory(uint64_t start, uint64_t size, const char *msg) "start 0x%" PRIx64 " size 0x%" PRIx64 " %s" + kvm_memory_fault(uint64_t start, uint64_t size, uint64_t flags) "start 0x%" PRIx64 " size 0x%" PRIx64 " flags 0x%" PRIx64 ++kvm_slots_grow(unsigned int old, unsigned int new) "%u -> %u" +diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h +index 1d8fb1473b..48e496b3d4 100644 +--- a/include/sysemu/kvm_int.h ++++ b/include/sysemu/kvm_int.h +@@ -46,6 +46,7 @@ typedef struct KVMMemoryListener { + MemoryListener listener; + KVMSlot *slots; + unsigned int nr_used_slots; ++ unsigned int nr_slots_allocated; + int as_id; + QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_add; + QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_del; +-- +2.39.3 + diff --git a/kvm-KVM-Rename-KVMMemoryListener.nr_used_slots-to-nr_slo.patch b/kvm-KVM-Rename-KVMMemoryListener.nr_used_slots-to-nr_slo.patch new file mode 100644 index 0000000..2998318 --- /dev/null +++ b/kvm-KVM-Rename-KVMMemoryListener.nr_used_slots-to-nr_slo.patch @@ -0,0 +1,72 @@ +From 6c0a0d9734c507af2c84aa33eb1624f35e1f51fb Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 17 Sep 2024 12:38:34 -0400 +Subject: [PATCH 7/9] KVM: Rename KVMMemoryListener.nr_used_slots to + nr_slots_used + +RH-Author: Peter Xu +RH-MergeRequest: 284: KVM: Dynamic sized kvm memslots array +RH-Jira: RHEL-57682 +RH-Acked-by: Juraj Marcin +RH-Commit: [6/7] 74e9576751e0adeb8113a5e8e495b4b1285b0d76 (peterx/qemu-kvm) + +This will make all nr_slots counters to be named in the same manner. + +Reviewed-by: David Hildenbrand +Signed-off-by: Peter Xu +Link: https://lore.kernel.org/r/20240917163835.194664-4-peterx@redhat.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit dbdc00ba5b136bba80d850f61cc79a9cafaae1cd) +Signed-off-by: Peter Xu +--- + accel/kvm/kvm-all.c | 6 +++--- + include/sysemu/kvm_int.h | 2 +- + 2 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index 3900de8883..e414d015c9 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -239,7 +239,7 @@ unsigned int kvm_get_free_memslots(void) + if (!s->as[i].ml) { + continue; + } +- used_slots = MAX(used_slots, s->as[i].ml->nr_used_slots); ++ used_slots = MAX(used_slots, s->as[i].ml->nr_slots_used); + } + kvm_slots_unlock(); + +@@ -1516,7 +1516,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, + } + start_addr += slot_size; + size -= slot_size; +- kml->nr_used_slots--; ++ kml->nr_slots_used--; + } while (size); + return; + } +@@ -1555,7 +1555,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, + ram_start_offset += slot_size; + ram += slot_size; + size -= slot_size; +- kml->nr_used_slots++; ++ kml->nr_slots_used++; + } while (size); + } + +diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h +index 48e496b3d4..b705dfc9b4 100644 +--- a/include/sysemu/kvm_int.h ++++ b/include/sysemu/kvm_int.h +@@ -45,7 +45,7 @@ typedef struct KVMMemoryUpdate { + typedef struct KVMMemoryListener { + MemoryListener listener; + KVMSlot *slots; +- unsigned int nr_used_slots; ++ unsigned int nr_slots_used; + unsigned int nr_slots_allocated; + int as_id; + QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_add; +-- +2.39.3 + diff --git a/kvm-KVM-Rename-KVMState-nr_slots-to-nr_slots_max.patch b/kvm-KVM-Rename-KVMState-nr_slots-to-nr_slots_max.patch new file mode 100644 index 0000000..1787c16 --- /dev/null +++ b/kvm-KVM-Rename-KVMState-nr_slots-to-nr_slots_max.patch @@ -0,0 +1,89 @@ +From 5b731abd3a932fe9a21f83f3849a3b3769906e19 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Tue, 17 Sep 2024 12:38:35 -0400 +Subject: [PATCH 8/9] KVM: Rename KVMState->nr_slots to nr_slots_max + +RH-Author: Peter Xu +RH-MergeRequest: 284: KVM: Dynamic sized kvm memslots array +RH-Jira: RHEL-57682 +RH-Acked-by: Juraj Marcin +RH-Commit: [7/7] 43471483e7380119ba6415bff6d8ee6c69aa9cd7 (peterx/qemu-kvm) + +This value used to reflect the maximum supported memslots from KVM kernel. +Rename it to be clearer. + +Reviewed-by: David Hildenbrand +Signed-off-by: Peter Xu +Link: https://lore.kernel.org/r/20240917163835.194664-5-peterx@redhat.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit 943c742868c739c0b14fd996bad3adf744156fec) +Signed-off-by: Peter Xu +--- + accel/kvm/kvm-all.c | 12 ++++++------ + include/sysemu/kvm_int.h | 4 ++-- + 2 files changed, 8 insertions(+), 8 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index e414d015c9..49dedda47e 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -183,8 +183,8 @@ static bool kvm_slots_grow(KVMMemoryListener *kml, unsigned int nr_slots_new) + unsigned int i, cur = kml->nr_slots_allocated; + KVMSlot *slots; + +- if (nr_slots_new > kvm_state->nr_slots) { +- nr_slots_new = kvm_state->nr_slots; ++ if (nr_slots_new > kvm_state->nr_slots_max) { ++ nr_slots_new = kvm_state->nr_slots_max; + } + + if (cur >= nr_slots_new) { +@@ -225,7 +225,7 @@ unsigned int kvm_get_max_memslots(void) + { + KVMState *s = KVM_STATE(current_accel()); + +- return s->nr_slots; ++ return s->nr_slots_max; + } + + unsigned int kvm_get_free_memslots(void) +@@ -243,7 +243,7 @@ unsigned int kvm_get_free_memslots(void) + } + kvm_slots_unlock(); + +- return s->nr_slots - used_slots; ++ return s->nr_slots_max - used_slots; + } + + /* Called with KVMMemoryListener.slots_lock held */ +@@ -2615,10 +2615,10 @@ static int kvm_init(MachineState *ms) + (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE); + + kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT); +- s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); ++ s->nr_slots_max = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); + + /* If unspecified, use the default value */ +- if (!s->nr_slots) { ++ if (!s->nr_slots_max) { + s->nr_slots_max = KVM_MEMSLOTS_NR_MAX_DEFAULT; + } + +diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h +index b705dfc9b4..2c57194b6b 100644 +--- a/include/sysemu/kvm_int.h ++++ b/include/sysemu/kvm_int.h +@@ -103,8 +103,8 @@ struct KVMDirtyRingReaper { + struct KVMState + { + AccelState parent_obj; +- +- int nr_slots; ++ /* Max number of KVM slots supported */ ++ int nr_slots_max; + int fd; + int vmfd; + int coalesced_mmio; +-- +2.39.3 + diff --git a/kvm-accel-kvm-refactor-dirty-ring-setup.patch b/kvm-accel-kvm-refactor-dirty-ring-setup.patch new file mode 100644 index 0000000..ec064f5 --- /dev/null +++ b/kvm-accel-kvm-refactor-dirty-ring-setup.patch @@ -0,0 +1,143 @@ +From e27a9d1e5194243084efe4405fe50463442f0fe3 Mon Sep 17 00:00:00 2001 +From: Ani Sinha +Date: Thu, 12 Sep 2024 11:48:38 +0530 +Subject: [PATCH 4/9] accel/kvm: refactor dirty ring setup + +RH-Author: Peter Xu +RH-MergeRequest: 284: KVM: Dynamic sized kvm memslots array +RH-Jira: RHEL-57682 +RH-Acked-by: Juraj Marcin +RH-Commit: [3/7] 226ae9826237887fc55f75b9175524f12b4fa4a9 (peterx/qemu-kvm) + +Refactor setting up of dirty ring code in kvm_init() so that is can be +reused in the future patchsets. + +Signed-off-by: Ani Sinha +Link: https://lore.kernel.org/r/20240912061838.4501-1-anisinha@redhat.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit 28ed7f9761eb273e7dedcfdc0507d158106d0451) +Signed-off-by: Peter Xu +--- + accel/kvm/kvm-all.c | 88 +++++++++++++++++++++++++-------------------- + 1 file changed, 50 insertions(+), 38 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index 4f96d8b45e..de709fbc43 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -2439,6 +2439,55 @@ static int find_kvm_machine_type(MachineState *ms) + return type; + } + ++static int kvm_setup_dirty_ring(KVMState *s) ++{ ++ uint64_t dirty_log_manual_caps; ++ int ret; ++ ++ /* ++ * Enable KVM dirty ring if supported, otherwise fall back to ++ * dirty logging mode ++ */ ++ ret = kvm_dirty_ring_init(s); ++ if (ret < 0) { ++ return ret; ++ } ++ ++ /* ++ * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is ++ * enabled. More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no ++ * page is wr-protected initially, which is against how kvm dirty ring is ++ * usage - kvm dirty ring requires all pages are wr-protected at the very ++ * beginning. Enabling this feature for dirty ring causes data corruption. ++ * ++ * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log, ++ * we may expect a higher stall time when starting the migration. In the ++ * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too: ++ * instead of clearing dirty bit, it can be a way to explicitly wr-protect ++ * guest pages. ++ */ ++ if (!s->kvm_dirty_ring_size) { ++ dirty_log_manual_caps = ++ kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); ++ dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | ++ KVM_DIRTY_LOG_INITIALLY_SET); ++ s->manual_dirty_log_protect = dirty_log_manual_caps; ++ if (dirty_log_manual_caps) { ++ ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0, ++ dirty_log_manual_caps); ++ if (ret) { ++ warn_report("Trying to enable capability %"PRIu64" of " ++ "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. " ++ "Falling back to the legacy mode. ", ++ dirty_log_manual_caps); ++ s->manual_dirty_log_protect = 0; ++ } ++ } ++ } ++ ++ return 0; ++} ++ + static int kvm_init(MachineState *ms) + { + MachineClass *mc = MACHINE_GET_CLASS(ms); +@@ -2458,7 +2507,6 @@ static int kvm_init(MachineState *ms) + const KVMCapabilityInfo *missing_cap; + int ret; + int type; +- uint64_t dirty_log_manual_caps; + + qemu_mutex_init(&kml_slots_lock); + +@@ -2570,47 +2618,11 @@ static int kvm_init(MachineState *ms) + s->coalesced_pio = s->coalesced_mmio && + kvm_check_extension(s, KVM_CAP_COALESCED_PIO); + +- /* +- * Enable KVM dirty ring if supported, otherwise fall back to +- * dirty logging mode +- */ +- ret = kvm_dirty_ring_init(s); ++ ret = kvm_setup_dirty_ring(s); + if (ret < 0) { + goto err; + } + +- /* +- * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is +- * enabled. More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no +- * page is wr-protected initially, which is against how kvm dirty ring is +- * usage - kvm dirty ring requires all pages are wr-protected at the very +- * beginning. Enabling this feature for dirty ring causes data corruption. +- * +- * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log, +- * we may expect a higher stall time when starting the migration. In the +- * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too: +- * instead of clearing dirty bit, it can be a way to explicitly wr-protect +- * guest pages. +- */ +- if (!s->kvm_dirty_ring_size) { +- dirty_log_manual_caps = +- kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); +- dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | +- KVM_DIRTY_LOG_INITIALLY_SET); +- s->manual_dirty_log_protect = dirty_log_manual_caps; +- if (dirty_log_manual_caps) { +- ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0, +- dirty_log_manual_caps); +- if (ret) { +- warn_report("Trying to enable capability %"PRIu64" of " +- "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. " +- "Falling back to the legacy mode. ", +- dirty_log_manual_caps); +- s->manual_dirty_log_protect = 0; +- } +- } +- } +- + #ifdef KVM_CAP_VCPU_EVENTS + s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS); + #endif +-- +2.39.3 + diff --git a/kvm-kvm-refactor-core-virtual-machine-creation-into-its-.patch b/kvm-kvm-refactor-core-virtual-machine-creation-into-its-.patch new file mode 100644 index 0000000..b4ed917 --- /dev/null +++ b/kvm-kvm-refactor-core-virtual-machine-creation-into-its-.patch @@ -0,0 +1,143 @@ +From 90a18a9e2e585413eba96ab96df4a878f6c405be Mon Sep 17 00:00:00 2001 +From: Ani Sinha +Date: Thu, 8 Aug 2024 17:08:38 +0530 +Subject: [PATCH 3/9] kvm: refactor core virtual machine creation into its own + function + +RH-Author: Peter Xu +RH-MergeRequest: 284: KVM: Dynamic sized kvm memslots array +RH-Jira: RHEL-57682 +RH-Acked-by: Juraj Marcin +RH-Commit: [2/7] 3db75ee31b109048ef2de5c7f193116ab8c185a7 (peterx/qemu-kvm) + +Refactoring the core logic around KVM_CREATE_VM into its own separate function +so that it can be called from other functions in subsequent patches. There is +no functional change in this patch. + +CC: pbonzini@redhat.com +CC: zhao1.liu@intel.com +Signed-off-by: Ani Sinha +Link: https://lore.kernel.org/r/20240808113838.1697366-1-anisinha@redhat.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit 67388078da1cf6dac89e5a7c748cca3444d49690) +Signed-off-by: Peter Xu +--- + accel/kvm/kvm-all.c | 89 ++++++++++++++++++++++++++++----------------- + 1 file changed, 56 insertions(+), 33 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index b51441523d..4f96d8b45e 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -2385,6 +2385,60 @@ uint32_t kvm_dirty_ring_size(void) + return kvm_state->kvm_dirty_ring_size; + } + ++static int do_kvm_create_vm(MachineState *ms, int type) ++{ ++ KVMState *s; ++ int ret; ++ ++ s = KVM_STATE(ms->accelerator); ++ ++ do { ++ ret = kvm_ioctl(s, KVM_CREATE_VM, type); ++ } while (ret == -EINTR); ++ ++ if (ret < 0) { ++ error_report("ioctl(KVM_CREATE_VM) failed: %s", strerror(-ret)); ++ ++#ifdef TARGET_S390X ++ if (ret == -EINVAL) { ++ error_printf("Host kernel setup problem detected." ++ " Please verify:\n"); ++ error_printf("- for kernels supporting the" ++ " switch_amode or user_mode parameters, whether"); ++ error_printf(" user space is running in primary address space\n"); ++ error_printf("- for kernels supporting the vm.allocate_pgste" ++ " sysctl, whether it is enabled\n"); ++ } ++#elif defined(TARGET_PPC) ++ if (ret == -EINVAL) { ++ error_printf("PPC KVM module is not loaded. Try modprobe kvm_%s.\n", ++ (type == 2) ? "pr" : "hv"); ++ } ++#endif ++ } ++ ++ return ret; ++} ++ ++static int find_kvm_machine_type(MachineState *ms) ++{ ++ MachineClass *mc = MACHINE_GET_CLASS(ms); ++ int type; ++ ++ if (object_property_find(OBJECT(current_machine), "kvm-type")) { ++ g_autofree char *kvm_type; ++ kvm_type = object_property_get_str(OBJECT(current_machine), ++ "kvm-type", ++ &error_abort); ++ type = mc->kvm_type(ms, kvm_type); ++ } else if (mc->kvm_type) { ++ type = mc->kvm_type(ms, NULL); ++ } else { ++ type = kvm_arch_get_default_type(ms); ++ } ++ return type; ++} ++ + static int kvm_init(MachineState *ms) + { + MachineClass *mc = MACHINE_GET_CLASS(ms); +@@ -2467,45 +2521,14 @@ static int kvm_init(MachineState *ms) + } + s->as = g_new0(struct KVMAs, s->nr_as); + +- if (object_property_find(OBJECT(current_machine), "kvm-type")) { +- g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine), +- "kvm-type", +- &error_abort); +- type = mc->kvm_type(ms, kvm_type); +- } else if (mc->kvm_type) { +- type = mc->kvm_type(ms, NULL); +- } else { +- type = kvm_arch_get_default_type(ms); +- } +- ++ type = find_kvm_machine_type(ms); + if (type < 0) { + ret = -EINVAL; + goto err; + } + +- do { +- ret = kvm_ioctl(s, KVM_CREATE_VM, type); +- } while (ret == -EINTR); +- ++ ret = do_kvm_create_vm(ms, type); + if (ret < 0) { +- error_report("ioctl(KVM_CREATE_VM) failed: %s", strerror(-ret)); +- +-#ifdef TARGET_S390X +- if (ret == -EINVAL) { +- error_printf("Host kernel setup problem detected." +- " Please verify:\n"); +- error_printf("- for kernels supporting the" +- " switch_amode or user_mode parameters, whether"); +- error_printf(" user space is running in primary address space\n"); +- error_printf("- for kernels supporting the vm.allocate_pgste" +- " sysctl, whether it is enabled\n"); +- } +-#elif defined(TARGET_PPC) +- if (ret == -EINVAL) { +- error_printf("PPC KVM module is not loaded. Try modprobe kvm_%s.\n", +- (type == 2) ? "pr" : "hv"); +- } +-#endif + goto err; + } + +-- +2.39.3 + diff --git a/kvm-kvm-replace-fprintf-with-error_report-printf-in-kvm_.patch b/kvm-kvm-replace-fprintf-with-error_report-printf-in-kvm_.patch new file mode 100644 index 0000000..3ebe06d --- /dev/null +++ b/kvm-kvm-replace-fprintf-with-error_report-printf-in-kvm_.patch @@ -0,0 +1,131 @@ +From 66c634c4749d58c0c3644ace27a656c507433288 Mon Sep 17 00:00:00 2001 +From: Ani Sinha +Date: Wed, 28 Aug 2024 18:15:39 +0530 +Subject: [PATCH 2/9] kvm: replace fprintf with error_report()/printf() in + kvm_init() + +RH-Author: Peter Xu +RH-MergeRequest: 284: KVM: Dynamic sized kvm memslots array +RH-Jira: RHEL-57682 +RH-Acked-by: Juraj Marcin +RH-Commit: [1/7] 3dd0b67d3b6662001eb35201ca41b15d0dd97994 (peterx/qemu-kvm) + +error_report() is more appropriate for error situations. Replace fprintf with +error_report() and error_printf() as appropriate. Some improvement in error +reporting also happens as a part of this change. For example: + +From: +$ ./qemu-system-x86_64 --accel kvm +Could not access KVM kernel module: No such file or directory + +To: +$ ./qemu-system-x86_64 --accel kvm +qemu-system-x86_64: --accel kvm: Could not access KVM kernel module: No such file or directory + +CC: qemu-trivial@nongnu.org +CC: zhao1.liu@intel.com +CC: armbru@redhat.com +Reviewed-by: Zhao Liu +Reviewed-by: Markus Armbruster +Signed-off-by: Ani Sinha +Link: https://lore.kernel.org/r/20240828124539.62672-1-anisinha@redhat.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit 804dfbe3ef5e950328b162ae85741be2e228544f) +Signed-off-by: Peter Xu +--- + accel/kvm/kvm-all.c | 40 ++++++++++++++++++---------------------- + 1 file changed, 18 insertions(+), 22 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index a220178822..b51441523d 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -2427,7 +2427,7 @@ static int kvm_init(MachineState *ms) + QLIST_INIT(&s->kvm_parked_vcpus); + s->fd = qemu_open_old(s->device ?: "/dev/kvm", O_RDWR); + if (s->fd == -1) { +- fprintf(stderr, "Could not access KVM kernel module: %m\n"); ++ error_report("Could not access KVM kernel module: %m"); + ret = -errno; + goto err; + } +@@ -2437,13 +2437,13 @@ static int kvm_init(MachineState *ms) + if (ret >= 0) { + ret = -EINVAL; + } +- fprintf(stderr, "kvm version too old\n"); ++ error_report("kvm version too old"); + goto err; + } + + if (ret > KVM_API_VERSION) { + ret = -EINVAL; +- fprintf(stderr, "kvm version not supported\n"); ++ error_report("kvm version not supported"); + goto err; + } + +@@ -2488,26 +2488,22 @@ static int kvm_init(MachineState *ms) + } while (ret == -EINTR); + + if (ret < 0) { +- fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret, +- strerror(-ret)); ++ error_report("ioctl(KVM_CREATE_VM) failed: %s", strerror(-ret)); + + #ifdef TARGET_S390X + if (ret == -EINVAL) { +- fprintf(stderr, +- "Host kernel setup problem detected. Please verify:\n"); +- fprintf(stderr, "- for kernels supporting the switch_amode or" +- " user_mode parameters, whether\n"); +- fprintf(stderr, +- " user space is running in primary address space\n"); +- fprintf(stderr, +- "- for kernels supporting the vm.allocate_pgste sysctl, " +- "whether it is enabled\n"); ++ error_printf("Host kernel setup problem detected." ++ " Please verify:\n"); ++ error_printf("- for kernels supporting the" ++ " switch_amode or user_mode parameters, whether"); ++ error_printf(" user space is running in primary address space\n"); ++ error_printf("- for kernels supporting the vm.allocate_pgste" ++ " sysctl, whether it is enabled\n"); + } + #elif defined(TARGET_PPC) + if (ret == -EINVAL) { +- fprintf(stderr, +- "PPC KVM module is not loaded. Try modprobe kvm_%s.\n", +- (type == 2) ? "pr" : "hv"); ++ error_printf("PPC KVM module is not loaded. Try modprobe kvm_%s.\n", ++ (type == 2) ? "pr" : "hv"); + } + #endif + goto err; +@@ -2526,9 +2522,9 @@ static int kvm_init(MachineState *ms) + nc->name, nc->num, soft_vcpus_limit); + + if (nc->num > hard_vcpus_limit) { +- fprintf(stderr, "Number of %s cpus requested (%d) exceeds " +- "the maximum cpus supported by KVM (%d)\n", +- nc->name, nc->num, hard_vcpus_limit); ++ error_report("Number of %s cpus requested (%d) exceeds " ++ "the maximum cpus supported by KVM (%d)", ++ nc->name, nc->num, hard_vcpus_limit); + exit(1); + } + } +@@ -2542,8 +2538,8 @@ static int kvm_init(MachineState *ms) + } + if (missing_cap) { + ret = -EINVAL; +- fprintf(stderr, "kvm does not support %s\n%s", +- missing_cap->name, upgrade_note); ++ error_report("kvm does not support %s", missing_cap->name); ++ error_printf("%s", upgrade_note); + goto err; + } + +-- +2.39.3 + diff --git a/kvm-pc-q35-Bump-max_cpus-to-4096-vcpus.patch b/kvm-pc-q35-Bump-max_cpus-to-4096-vcpus.patch new file mode 100644 index 0000000..32064ea --- /dev/null +++ b/kvm-pc-q35-Bump-max_cpus-to-4096-vcpus.patch @@ -0,0 +1,75 @@ +From d06f8670b9304c66d45e2270a4f5b462ed6cbe09 Mon Sep 17 00:00:00 2001 +From: Ani Sinha +Date: Wed, 16 Oct 2024 17:21:34 +0530 +Subject: [PATCH 1/9] pc: q35: Bump max_cpus to 4096 vcpus +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Ani Sinha +RH-MergeRequest: 273: pc: q35: Bump max_cpus to 4096 vcpus +RH-Jira: RHEL-11043 +RH-Acked-by: Igor Mammedov +RH-Acked-by: Daniel P. Berrangé +RH-Acked-by: MST +RH-Commit: [1/1] 23caa8c9e4f34c3114701b7a5bb25002a9372b2e (anisinha/centos-qemu-kvm) + +This is the downstream change equivalent of the upstream QEMU commit +e4e98c7e ("pc: q35: Bump max_cpus to 4096 vcpus") + +Since upstream Linux kernel commit +f10a570b093e6 ("KVM: x86: Add CONFIG_KVM_MAX_NR_VCPUS to allow up to 4096 vCPUs") +Linux kernel can support upto a maximum number of 4096 vcpus when MAXSMP is +enabled in the kernel. This upstream change has been backported to c9s kernel +already. Please see JIRA https://issues.redhat.com/browse/RHEL-11579 and the +following commit authored by Vitaly Kuznetsov: +a85f846be686b0a ("KVM: x86: Add CONFIG_KVM_MAX_NR_VCPUS to allow up to 4096 vCPUs") + +At present, QEMU has been tested to correctly boot a linux guest with 4096 +vcpus using edk2 that has the fixes corresponding to the following two upstream +edk2 PRs: + +https://github.com/tianocore/edk2/pull/5410 +https://github.com/tianocore/edk2/pull/5418 + +The changes corresponding to the above two upstream edk2 PRs has been included +in the downstream c9s edk2 with the following MR: +https://gitlab.com/redhat/centos-stream/src/edk2/-/merge_requests/59 + +So bump up the value max_cpus to 4096 for RHEL q35 machines versions 9.6 and +newer. Q35 RHEL machines versions 9.4 and older continue to support 710 maximum +vcpus as before for compatibility reasons. + +See also https://gitlab.com/redhat/centos-stream/src/qemu-kvm/-/merge_requests/236 + +Signed-off-by: Ani Sinha +--- + hw/i386/pc_q35.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c +index 7606007bda..578f63524f 100644 +--- a/hw/i386/pc_q35.c ++++ b/hw/i386/pc_q35.c +@@ -344,7 +344,7 @@ static void pc_q35_machine_options(MachineClass *m) + m->default_display = "std"; + m->default_nic = "e1000e"; + m->no_floppy = 1; +- m->max_cpus = 710; ++ m->max_cpus = 4096; + m->no_parallel = 1; + machine_class_allow_dynamic_sysbus_dev(m, TYPE_AMD_IOMMU_DEVICE); + machine_class_allow_dynamic_sysbus_dev(m, TYPE_INTEL_IOMMU_DEVICE); +@@ -687,6 +687,9 @@ static void pc_q35_rhel_machine_9_4_0_options(MachineClass *m) + { + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + pc_q35_rhel_machine_9_6_0_options(m); ++ ++ /* older RHEL machines continue to support 710 vcpus */ ++ m->max_cpus = 710; + m->desc = "RHEL-9.4.0 PC (Q35 + ICH9, 2009)"; + m->alias = NULL; + pcmc->smbios_stream_product = "RHEL"; +-- +2.39.3 + diff --git a/qemu-kvm.spec b/qemu-kvm.spec index 4757186..0d3252b 100644 --- a/qemu-kvm.spec +++ b/qemu-kvm.spec @@ -149,7 +149,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}:%{version} \ Summary: QEMU is a machine emulator and virtualizer Name: qemu-kvm Version: 9.1.0 -Release: 2%{?rcrel}%{?dist}%{?cc_suffix} +Release: 3%{?rcrel}%{?dist}%{?cc_suffix} # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped # Epoch 15 used for RHEL 8 # Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5) @@ -252,6 +252,22 @@ Patch55: kvm-vfio-migration-Change-trace-formats-from-hex-to-deci.patch Patch56: kvm-kvm-Allow-kvm_arch_get-put_registers-to-accept-Error.patch # For RHEL-60914 - Fail migration properly when put cpu register fails Patch57: kvm-target-i386-kvm-Report-which-action-failed-in-kvm_ar.patch +# For RHEL-11043 - [RFE] [HPEMC] [RHEL-9.6] qemu-kvm: support up to 4096 VCPUs +Patch58: kvm-pc-q35-Bump-max_cpus-to-4096-vcpus.patch +# For RHEL-57682 - Bad migration performance when performing vGPU VM live migration +Patch59: kvm-kvm-replace-fprintf-with-error_report-printf-in-kvm_.patch +# For RHEL-57682 - Bad migration performance when performing vGPU VM live migration +Patch60: kvm-kvm-refactor-core-virtual-machine-creation-into-its-.patch +# For RHEL-57682 - Bad migration performance when performing vGPU VM live migration +Patch61: kvm-accel-kvm-refactor-dirty-ring-setup.patch +# For RHEL-57682 - Bad migration performance when performing vGPU VM live migration +Patch62: kvm-KVM-Dynamic-sized-kvm-memslots-array.patch +# For RHEL-57682 - Bad migration performance when performing vGPU VM live migration +Patch63: kvm-KVM-Define-KVM_MEMSLOTS_NUM_MAX_DEFAULT.patch +# For RHEL-57682 - Bad migration performance when performing vGPU VM live migration +Patch64: kvm-KVM-Rename-KVMMemoryListener.nr_used_slots-to-nr_slo.patch +# For RHEL-57682 - Bad migration performance when performing vGPU VM live migration +Patch65: kvm-KVM-Rename-KVMState-nr_slots-to-nr_slots_max.patch %if %{have_clang} BuildRequires: clang @@ -289,6 +305,8 @@ BuildRequires: librbd-devel # We need both because the 'stap' binary is probed for by configure BuildRequires: systemtap BuildRequires: systemtap-sdt-devel +# Required as we use dtrace for trace backend +BuildRequires: /usr/bin/dtrace # For VNC PNG support BuildRequires: libpng-devel # For virtiofs @@ -1316,6 +1334,23 @@ useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \ %endif %changelog +* Tue Nov 19 2024 Miroslav Rezanina - 9.1.0-3 +- kvm-pc-q35-Bump-max_cpus-to-4096-vcpus.patch [RHEL-11043] +- kvm-kvm-replace-fprintf-with-error_report-printf-in-kvm_.patch [RHEL-57682] +- kvm-kvm-refactor-core-virtual-machine-creation-into-its-.patch [RHEL-57682] +- kvm-accel-kvm-refactor-dirty-ring-setup.patch [RHEL-57682] +- kvm-KVM-Dynamic-sized-kvm-memslots-array.patch [RHEL-57682] +- kvm-KVM-Define-KVM_MEMSLOTS_NUM_MAX_DEFAULT.patch [RHEL-57682] +- kvm-KVM-Rename-KVMMemoryListener.nr_used_slots-to-nr_slo.patch [RHEL-57682] +- kvm-KVM-Rename-KVMState-nr_slots-to-nr_slots_max.patch [RHEL-57682] +- kvm-Require-new-dtrace-package.patch [RHEL-67900] +- Resolves: RHEL-11043 + ([RFE] [HPEMC] [RHEL-9.6] qemu-kvm: support up to 4096 VCPUs) +- Resolves: RHEL-57682 + (Bad migration performance when performing vGPU VM live migration ) +- Resolves: RHEL-67900 + (Failed to build qemu-kvm due to missing dtrace [rhel-9.6]) + * Mon Nov 11 2024 Miroslav Rezanina - 9.1.0-2 - kvm-hw-s390x-ipl-Provide-more-memory-to-the-s390-ccw.img.patch [RHEL-11424] - kvm-pc-bios-s390-ccw-Use-the-libc-from-SLOF-and-remove-s.patch [RHEL-11424]