diff --git a/kvm-hw-arm-Validate-cluster-and-NUMA-node-boundary.patch b/kvm-hw-arm-Validate-cluster-and-NUMA-node-boundary.patch new file mode 100644 index 0000000..e96bb10 --- /dev/null +++ b/kvm-hw-arm-Validate-cluster-and-NUMA-node-boundary.patch @@ -0,0 +1,60 @@ +From 7b57aec372fc238cbaafe86557f9fb4b560895b1 Mon Sep 17 00:00:00 2001 +From: Gavin Shan +Date: Tue, 27 Jun 2023 20:20:09 +1000 +Subject: [PATCH 2/6] hw/arm: Validate cluster and NUMA node boundary + +RH-Author: Gavin Shan +RH-MergeRequest: 175: hw/arm: Validate CPU cluster and NUMA node boundary for RHEL machines +RH-Bugzilla: 2171363 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Eric Auger +RH-Commit: [2/3] fcac7ea85d9f73613989903c642fc1bf6c51946b + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2171363 + +There are two NUMA-aware ARM machines: 'virt' and 'sbsa-ref'. +Both of them are required to follow the cluster-NUMA-node boundary. +Enable the validation to warn about irregular configurations where +multiple CPUs in one cluster have been associated with different NUMA +nodes.
+ +Signed-off-by: Gavin Shan +Acked-by: Igor Mammedov +Message-Id: <20230509002739.18388-3-gshan@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit fecff672351ace5e39adf7dbcf7a8ee748b201cb) +Signed-off-by: Gavin Shan +--- + hw/arm/sbsa-ref.c | 2 ++ + hw/arm/virt.c | 2 ++ + 2 files changed, 4 insertions(+) + +diff --git a/hw/arm/sbsa-ref.c b/hw/arm/sbsa-ref.c +index 0b93558dde..efb380e7c8 100644 +--- a/hw/arm/sbsa-ref.c ++++ b/hw/arm/sbsa-ref.c +@@ -864,6 +864,8 @@ static void sbsa_ref_class_init(ObjectClass *oc, void *data) + mc->possible_cpu_arch_ids = sbsa_ref_possible_cpu_arch_ids; + mc->cpu_index_to_instance_props = sbsa_ref_cpu_index_to_props; + mc->get_default_cpu_node_id = sbsa_ref_get_default_cpu_node_id; ++ /* platform instead of architectural choice */ ++ mc->cpu_cluster_has_numa_boundary = true; + } + + static const TypeInfo sbsa_ref_info = { +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index 9be53e9355..df6a0231bc 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -3083,6 +3083,8 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) + mc->smp_props.clusters_supported = true; + mc->auto_enable_numa_with_memhp = true; + mc->auto_enable_numa_with_memdev = true; ++ /* platform instead of architectural choice */ ++ mc->cpu_cluster_has_numa_boundary = true; + mc->default_ram_id = "mach-virt.ram"; + + object_class_property_add(oc, "acpi", "OnOffAuto", +-- +2.39.3 + diff --git a/kvm-hw-arm-virt-Validate-cluster-and-NUMA-node-boundary-.patch b/kvm-hw-arm-virt-Validate-cluster-and-NUMA-node-boundary-.patch new file mode 100644 index 0000000..42ec705 --- /dev/null +++ b/kvm-hw-arm-virt-Validate-cluster-and-NUMA-node-boundary-.patch @@ -0,0 +1,41 @@ +From 022529f6d0ee306da857825c72a98bf7ddf5de22 Mon Sep 17 00:00:00 2001 +From: Gavin Shan +Date: Tue, 27 Jun 2023 20:20:09 +1000 +Subject: [PATCH 3/6] hw/arm/virt: Validate cluster and NUMA node boundary for + RHEL machines + +RH-Author: Gavin Shan +RH-MergeRequest: 175: hw/arm: 
Validate CPU cluster and NUMA node boundary for RHEL machines +RH-Bugzilla: 2171363 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Eric Auger +RH-Commit: [3/3] a396c499259b566861ca007b01f8539bf6113711 + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2171363 +Upstream Status: RHEL only + +Set mc->cpu_cluster_has_numa_boundary to true so that the boundary of +CPU cluster and NUMA node will be validated for 'virt-rhel*' machines. +A warning message will be printed if the boundary is broken. + +Signed-off-by: Gavin Shan +--- + hw/arm/virt.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index df6a0231bc..faf68488d5 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -3530,6 +3530,8 @@ static void rhel_machine_class_init(ObjectClass *oc, void *data) + mc->smp_props.clusters_supported = true; + mc->auto_enable_numa_with_memhp = true; + mc->auto_enable_numa_with_memdev = true; ++ /* platform instead of architectural choice */ ++ mc->cpu_cluster_has_numa_boundary = true; + mc->default_ram_id = "mach-virt.ram"; + + object_class_property_add(oc, "acpi", "OnOffAuto", +-- +2.39.3 + diff --git a/kvm-kvm-reuse-per-vcpu-stats-fd-to-avoid-vcpu-interrupti.patch b/kvm-kvm-reuse-per-vcpu-stats-fd-to-avoid-vcpu-interrupti.patch new file mode 100644 index 0000000..d6a6d73 --- /dev/null +++ b/kvm-kvm-reuse-per-vcpu-stats-fd-to-avoid-vcpu-interrupti.patch @@ -0,0 +1,160 @@ +From a5857fb12fcad46e27c415fe82ce13c0cb5d09c7 Mon Sep 17 00:00:00 2001 +From: Marcelo Tosatti +Date: Thu, 29 Jun 2023 14:48:32 -0300 +Subject: [PATCH 5/6] kvm: reuse per-vcpu stats fd to avoid vcpu interruption +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marcelo Tosatti +RH-MergeRequest: 177: kvm: reuse per-vcpu stats fd to avoid vcpu interruption +RH-Bugzilla: 2218644 +RH-Acked-by: Vitaly Kuznetsov +RH-Acked-by: Paolo Bonzini +RH-Acked-by: Leonardo Brás +RH-Commit: [1/1] 4ec72385a9047888121485f49bacb1aff84f7018 
(mtosatti/qemu-kvm) + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2218644 +Commit: 3b6f485275ae95a81eec589d2773b86ca9ddec4d + +A regression has been detected in latency testing of KVM guests. +More specifically, it was observed that the cyclictest +numbers inside of an isolated vcpu (running on isolated pcpu) are: + +Where a maximum of 50us is acceptable. + +The implementation of KVM_GET_STATS_FD uses run_on_cpu to query +per vcpu statistics, which interrupts the vcpu (and is unnecessary). + +To fix this, open the per vcpu stats fd on vcpu initialization, +and read from that fd from QEMU's main thread. + +Signed-off-by: Marcelo Tosatti +Signed-off-by: Paolo Bonzini +--- + accel/kvm/kvm-all.c | 30 +++++++++++++++--------------- + include/hw/core/cpu.h | 1 + + 2 files changed, 16 insertions(+), 15 deletions(-) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index cf3a88d90e..fa7ca46c66 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -450,6 +450,8 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp) + "kvm_init_vcpu: kvm_arch_init_vcpu failed (%lu)", + kvm_arch_vcpu_id(cpu)); + } ++ cpu->kvm_vcpu_stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL); ++ + err: + return ret; + } +@@ -3959,7 +3961,7 @@ static StatsDescriptors *find_stats_descriptors(StatsTarget target, int stats_fd + + /* Read stats header */ + kvm_stats_header = &descriptors->kvm_stats_header; +- ret = read(stats_fd, kvm_stats_header, sizeof(*kvm_stats_header)); ++ ret = pread(stats_fd, kvm_stats_header, sizeof(*kvm_stats_header), 0); + if (ret != sizeof(*kvm_stats_header)) { + error_setg(errp, "KVM stats: failed to read stats header: " + "expected %zu actual %zu", +@@ -3990,7 +3992,8 @@ static StatsDescriptors *find_stats_descriptors(StatsTarget target, int stats_fd + } + + static void query_stats(StatsResultList **result, StatsTarget target, +- strList *names, int stats_fd, Error **errp) ++ strList *names, int stats_fd, CPUState *cpu, ++ Error **errp) + { + struct 
kvm_stats_desc *kvm_stats_desc; + struct kvm_stats_header *kvm_stats_header; +@@ -4048,7 +4051,7 @@ static void query_stats(StatsResultList **result, StatsTarget target, + break; + case STATS_TARGET_VCPU: + add_stats_entry(result, STATS_PROVIDER_KVM, +- current_cpu->parent_obj.canonical_path, ++ cpu->parent_obj.canonical_path, + stats_list); + break; + default: +@@ -4085,10 +4088,9 @@ static void query_stats_schema(StatsSchemaList **result, StatsTarget target, + add_stats_schema(result, STATS_PROVIDER_KVM, target, stats_list); + } + +-static void query_stats_vcpu(CPUState *cpu, run_on_cpu_data data) ++static void query_stats_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args) + { +- StatsArgs *kvm_stats_args = (StatsArgs *) data.host_ptr; +- int stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL); ++ int stats_fd = cpu->kvm_vcpu_stats_fd; + Error *local_err = NULL; + + if (stats_fd == -1) { +@@ -4097,14 +4099,13 @@ static void query_stats_vcpu(CPUState *cpu, run_on_cpu_data data) + return; + } + query_stats(kvm_stats_args->result.stats, STATS_TARGET_VCPU, +- kvm_stats_args->names, stats_fd, kvm_stats_args->errp); +- close(stats_fd); ++ kvm_stats_args->names, stats_fd, cpu, ++ kvm_stats_args->errp); + } + +-static void query_stats_schema_vcpu(CPUState *cpu, run_on_cpu_data data) ++static void query_stats_schema_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args) + { +- StatsArgs *kvm_stats_args = (StatsArgs *) data.host_ptr; +- int stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL); ++ int stats_fd = cpu->kvm_vcpu_stats_fd; + Error *local_err = NULL; + + if (stats_fd == -1) { +@@ -4114,7 +4115,6 @@ static void query_stats_schema_vcpu(CPUState *cpu, run_on_cpu_data data) + } + query_stats_schema(kvm_stats_args->result.schema, STATS_TARGET_VCPU, stats_fd, + kvm_stats_args->errp); +- close(stats_fd); + } + + static void query_stats_cb(StatsResultList **result, StatsTarget target, +@@ -4132,7 +4132,7 @@ static void query_stats_cb(StatsResultList **result, StatsTarget 
target, + error_setg_errno(errp, errno, "KVM stats: ioctl failed"); + return; + } +- query_stats(result, target, names, stats_fd, errp); ++ query_stats(result, target, names, stats_fd, NULL, errp); + close(stats_fd); + break; + } +@@ -4146,7 +4146,7 @@ static void query_stats_cb(StatsResultList **result, StatsTarget target, + if (!apply_str_list_filter(cpu->parent_obj.canonical_path, targets)) { + continue; + } +- run_on_cpu(cpu, query_stats_vcpu, RUN_ON_CPU_HOST_PTR(&stats_args)); ++ query_stats_vcpu(cpu, &stats_args); + } + break; + } +@@ -4172,6 +4172,6 @@ void query_stats_schemas_cb(StatsSchemaList **result, Error **errp) + if (first_cpu) { + stats_args.result.schema = result; + stats_args.errp = errp; +- run_on_cpu(first_cpu, query_stats_schema_vcpu, RUN_ON_CPU_HOST_PTR(&stats_args)); ++ query_stats_schema_vcpu(first_cpu, &stats_args); + } + } +diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h +index 397fd3ac68..ae96be07e7 100644 +--- a/include/hw/core/cpu.h ++++ b/include/hw/core/cpu.h +@@ -399,6 +399,7 @@ struct CPUState { + struct kvm_dirty_gfn *kvm_dirty_gfns; + uint32_t kvm_fetch_index; + uint64_t dirty_pages; ++ int kvm_vcpu_stats_fd; + + /* Use by accel-block: CPU is executing an ioctl() */ + QemuLockCnt in_ioctl_lock; +-- +2.39.3 + diff --git a/kvm-numa-Validate-cluster-and-NUMA-node-boundary-if-requ.patch b/kvm-numa-Validate-cluster-and-NUMA-node-boundary-if-requ.patch new file mode 100644 index 0000000..66d68f1 --- /dev/null +++ b/kvm-numa-Validate-cluster-and-NUMA-node-boundary-if-requ.patch @@ -0,0 +1,145 @@ +From 760a2f284f6d4cd3cd3b1685411bbca21c4ad233 Mon Sep 17 00:00:00 2001 +From: Gavin Shan +Date: Tue, 27 Jun 2023 20:20:09 +1000 +Subject: [PATCH 1/6] numa: Validate cluster and NUMA node boundary if required +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Gavin Shan +RH-MergeRequest: 175: hw/arm: Validate CPU cluster and NUMA node boundary for RHEL machines +RH-Bugzilla: 
2171363 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Eric Auger +RH-Commit: [1/3] 24580064b9a0076ec4d9a916839d85135ac48cd9 + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2171363 + +For some architectures like ARM64, multiple CPUs in one cluster can be +associated with different NUMA nodes, which is irregular configuration +because we shouldn't have this in baremetal environment. The irregular +configuration causes Linux guest to misbehave, as the following warning +messages indicate. + + -smp 6,maxcpus=6,sockets=2,clusters=1,cores=3,threads=1 \ + -numa node,nodeid=0,cpus=0-1,memdev=ram0 \ + -numa node,nodeid=1,cpus=2-3,memdev=ram1 \ + -numa node,nodeid=2,cpus=4-5,memdev=ram2 \ + + ------------[ cut here ]------------ + WARNING: CPU: 0 PID: 1 at kernel/sched/topology.c:2271 build_sched_domains+0x284/0x910 + Modules linked in: + CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.14.0-268.el9.aarch64 #1 + pstate: 00400005 (nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) + pc : build_sched_domains+0x284/0x910 + lr : build_sched_domains+0x184/0x910 + sp : ffff80000804bd50 + x29: ffff80000804bd50 x28: 0000000000000002 x27: 0000000000000000 + x26: ffff800009cf9a80 x25: 0000000000000000 x24: ffff800009cbf840 + x23: ffff000080325000 x22: ffff0000005df800 x21: ffff80000a4ce508 + x20: 0000000000000000 x19: ffff000080324440 x18: 0000000000000014 + x17: 00000000388925c0 x16: 000000005386a066 x15: 000000009c10cc2e + x14: 00000000000001c0 x13: 0000000000000001 x12: ffff00007fffb1a0 + x11: ffff00007fffb180 x10: ffff80000a4ce508 x9 : 0000000000000041 + x8 : ffff80000a4ce500 x7 : ffff80000a4cf920 x6 : 0000000000000001 + x5 : 0000000000000001 x4 : 0000000000000007 x3 : 0000000000000002 + x2 : 0000000000001000 x1 : ffff80000a4cf928 x0 : 0000000000000001 + Call trace: + build_sched_domains+0x284/0x910 + sched_init_domains+0xac/0xe0 + sched_init_smp+0x48/0xc8 + kernel_init_freeable+0x140/0x1ac + kernel_init+0x28/0x140 + ret_from_fork+0x10/0x20 + +Improve the situation to warn when 
multiple CPUs in one cluster have +been associated with different NUMA nodes. However, one NUMA node is +allowed to be associated with different clusters. + +Signed-off-by: Gavin Shan +Acked-by: Philippe Mathieu-Daudé +Acked-by: Igor Mammedov +Message-Id: <20230509002739.18388-2-gshan@redhat.com> +Signed-off-by: Paolo Bonzini +(cherry picked from commit a494fdb715832000ee9047a549a35aacfea8175e) +Signed-off-by: Gavin Shan +--- + hw/core/machine.c | 42 ++++++++++++++++++++++++++++++++++++++++++ + include/hw/boards.h | 1 + + 2 files changed, 43 insertions(+) + +diff --git a/hw/core/machine.c b/hw/core/machine.c +index c28702b690..5abdc8c39b 100644 +--- a/hw/core/machine.c ++++ b/hw/core/machine.c +@@ -1496,6 +1496,45 @@ static void machine_numa_finish_cpu_init(MachineState *machine) + g_string_free(s, true); + } + ++static void validate_cpu_cluster_to_numa_boundary(MachineState *ms) ++{ ++ MachineClass *mc = MACHINE_GET_CLASS(ms); ++ NumaState *state = ms->numa_state; ++ const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(ms); ++ const CPUArchId *cpus = possible_cpus->cpus; ++ int i, j; ++ ++ if (state->num_nodes <= 1 || possible_cpus->len <= 1) { ++ return; ++ } ++ ++ /* ++ * The Linux scheduling domain can't be parsed when the multiple CPUs ++ * in one cluster have been associated with different NUMA nodes. However, ++ * it's fine to associate one NUMA node with CPUs in different clusters. 
++ */ ++ for (i = 0; i < possible_cpus->len; i++) { ++ for (j = i + 1; j < possible_cpus->len; j++) { ++ if (cpus[i].props.has_socket_id && ++ cpus[i].props.has_cluster_id && ++ cpus[i].props.has_node_id && ++ cpus[j].props.has_socket_id && ++ cpus[j].props.has_cluster_id && ++ cpus[j].props.has_node_id && ++ cpus[i].props.socket_id == cpus[j].props.socket_id && ++ cpus[i].props.cluster_id == cpus[j].props.cluster_id && ++ cpus[i].props.node_id != cpus[j].props.node_id) { ++ warn_report("CPU-%d and CPU-%d in socket-%" PRId64 "-cluster-%" PRId64 ++ " have been associated with node-%" PRId64 " and node-%" PRId64 ++ " respectively. It can cause OSes like Linux to" ++ " misbehave", i, j, cpus[i].props.socket_id, ++ cpus[i].props.cluster_id, cpus[i].props.node_id, ++ cpus[j].props.node_id); ++ } ++ } ++ } ++} ++ + MemoryRegion *machine_consume_memdev(MachineState *machine, + HostMemoryBackend *backend) + { +@@ -1581,6 +1620,9 @@ void machine_run_board_init(MachineState *machine, const char *mem_path, Error * + numa_complete_configuration(machine); + if (machine->numa_state->num_nodes) { + machine_numa_finish_cpu_init(machine); ++ if (machine_class->cpu_cluster_has_numa_boundary) { ++ validate_cpu_cluster_to_numa_boundary(machine); ++ } + } + } + +diff --git a/include/hw/boards.h b/include/hw/boards.h +index 5f08bd7550..3628671228 100644 +--- a/include/hw/boards.h ++++ b/include/hw/boards.h +@@ -275,6 +275,7 @@ struct MachineClass { + bool nvdimm_supported; + bool numa_mem_supported; + bool auto_enable_numa; ++ bool cpu_cluster_has_numa_boundary; + SMPCompatProps smp_props; + const char *default_ram_id; + +-- +2.39.3 + diff --git a/kvm-vhost-fix-vhost_dev_enable_notifiers-error-case.patch b/kvm-vhost-fix-vhost_dev_enable_notifiers-error-case.patch new file mode 100644 index 0000000..3282c24 --- /dev/null +++ b/kvm-vhost-fix-vhost_dev_enable_notifiers-error-case.patch @@ -0,0 +1,138 @@ +From ac54f5f746782da89ab674733af5622e524b58eb Mon Sep 17 00:00:00 2001 +From: Laurent 
Vivier +Date: Fri, 2 Jun 2023 18:27:35 +0200 +Subject: [PATCH 4/6] vhost: fix vhost_dev_enable_notifiers() error case +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Laurent Vivier +RH-MergeRequest: 176: vhost: fix vhost_dev_enable_notifiers() error case +RH-Jira: RHEL-330 +RH-Acked-by: MST +RH-Acked-by: Cindy Lu +RH-Acked-by: Eugenio Pérez +RH-Acked-by: Jason Wang +RH-Commit: [1/1] fd30d7501be59f7e5b9d6fc5ed84efcc4037d08e (lvivier/qemu-kvm-centos) + +JIRA: https://issues.redhat.com/browse/RHEL-330 + +in vhost_dev_enable_notifiers(), if virtio_bus_set_host_notifier(true) +fails, we call vhost_dev_disable_notifiers() that executes +virtio_bus_set_host_notifier(false) on all queues, even on queues that +have failed to be initialized. + +This triggers a core dump in memory_region_del_eventfd(): + + virtio_bus_set_host_notifier: unable to init event notifier: Too many open files (-24) + vhost VQ 1 notifier binding failed: 24 + .../softmmu/memory.c:2611: memory_region_del_eventfd: Assertion `i != mr->ioeventfd_nb' failed. + +Fix the problem by providing to vhost_dev_disable_notifiers() the +number of queues to disable. + +Fixes: 8771589b6f81 ("vhost: simplify vhost_dev_enable_notifiers") +Cc: longpeng2@huawei.com +Signed-off-by: Laurent Vivier +Message-Id: <20230602162735.3670785-1-lvivier@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. 
Tsirkin +Reviewed-by: Philippe Mathieu-Daudé +(cherry picked from commit 92099aa4e9a3bb6856c290afaf41c76f9e3dd9fd) +--- + hw/virtio/vhost.c | 65 ++++++++++++++++++++++++++--------------------- + 1 file changed, 36 insertions(+), 29 deletions(-) + +diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c +index a266396576..ae0a033e60 100644 +--- a/hw/virtio/vhost.c ++++ b/hw/virtio/vhost.c +@@ -1545,6 +1545,40 @@ void vhost_dev_cleanup(struct vhost_dev *hdev) + memset(hdev, 0, sizeof(struct vhost_dev)); + } + ++static void vhost_dev_disable_notifiers_nvqs(struct vhost_dev *hdev, ++ VirtIODevice *vdev, ++ unsigned int nvqs) ++{ ++ BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); ++ int i, r; ++ ++ /* ++ * Batch all the host notifiers in a single transaction to avoid ++ * quadratic time complexity in address_space_update_ioeventfds(). ++ */ ++ memory_region_transaction_begin(); ++ ++ for (i = 0; i < nvqs; ++i) { ++ r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i, ++ false); ++ if (r < 0) { ++ error_report("vhost VQ %d notifier cleanup failed: %d", i, -r); ++ } ++ assert(r >= 0); ++ } ++ ++ /* ++ * The transaction expects the ioeventfds to be open when it ++ * commits. Do it now, before the cleanup loop. ++ */ ++ memory_region_transaction_commit(); ++ ++ for (i = 0; i < nvqs; ++i) { ++ virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i); ++ } ++ virtio_device_release_ioeventfd(vdev); ++} ++ + /* Stop processing guest IO notifications in qemu. + * Start processing them in vhost in kernel. 
+ */ +@@ -1574,7 +1608,7 @@ int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) + if (r < 0) { + error_report("vhost VQ %d notifier binding failed: %d", i, -r); + memory_region_transaction_commit(); +- vhost_dev_disable_notifiers(hdev, vdev); ++ vhost_dev_disable_notifiers_nvqs(hdev, vdev, i); + return r; + } + } +@@ -1591,34 +1625,7 @@ int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) + */ + void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) + { +- BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); +- int i, r; +- +- /* +- * Batch all the host notifiers in a single transaction to avoid +- * quadratic time complexity in address_space_update_ioeventfds(). +- */ +- memory_region_transaction_begin(); +- +- for (i = 0; i < hdev->nvqs; ++i) { +- r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i, +- false); +- if (r < 0) { +- error_report("vhost VQ %d notifier cleanup failed: %d", i, -r); +- } +- assert (r >= 0); +- } +- +- /* +- * The transaction expects the ioeventfds to be open when it +- * commits. Do it now, before the cleanup loop. +- */ +- memory_region_transaction_commit(); +- +- for (i = 0; i < hdev->nvqs; ++i) { +- virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i); +- } +- virtio_device_release_ioeventfd(vdev); ++ vhost_dev_disable_notifiers_nvqs(hdev, vdev, hdev->nvqs); + } + + /* Test and clear event pending status. 
+-- +2.39.3 + diff --git a/kvm-vhost-vdpa-do-not-cleanup-the-vdpa-vhost-net-structu.patch b/kvm-vhost-vdpa-do-not-cleanup-the-vdpa-vhost-net-structu.patch new file mode 100644 index 0000000..fd29eb7 --- /dev/null +++ b/kvm-vhost-vdpa-do-not-cleanup-the-vdpa-vhost-net-structu.patch @@ -0,0 +1,67 @@ +From 4e30ca551fb3740a428017a0debf0a6aab976639 Mon Sep 17 00:00:00 2001 +From: Ani Sinha +Date: Mon, 19 Jun 2023 12:22:09 +0530 +Subject: [PATCH 6/6] vhost-vdpa: do not cleanup the vdpa/vhost-net structures + if peer nic is present + +RH-Author: Ani Sinha +RH-MergeRequest: 174: vhost-vdpa: do not cleanup the vdpa/vhost-net structures if peer nic is present +RH-Bugzilla: 2128929 +RH-Acked-by: Igor Mammedov +RH-Acked-by: Miroslav Rezanina +RH-Commit: [1/1] c70d4e5fd93256326d318e0b507db6b9eb93ad86 (anisinha/centos-qemu-kvm) + +When a peer nic is still attached to the vdpa backend, it is too early to free +up the vhost-net and vdpa structures. If these structures are freed here, then +QEMU crashes when the guest is being shut down. The following call chain +would result in an assertion failure since the pointer returned from +vhost_vdpa_get_vhost_net() would be NULL: + +do_vm_stop() -> vm_state_notify() -> virtio_set_status() -> +virtio_net_vhost_status() -> get_vhost_net(). + +Therefore, we defer freeing up the structures until at guest shutdown +time when qemu_cleanup() calls net_cleanup() which then calls +qemu_del_net_client() which would eventually call vhost_vdpa_cleanup() +again to free up the structures. This time, the loop in net_cleanup() +ensures that vhost_vdpa_cleanup() will be called one last time when +all the peer nics are detached and freed. + +All unit tests pass with this change. + +CC: imammedo@redhat.com +CC: jusual@redhat.com +CC: mst@redhat.com +Fixes: CVE-2023-3301 +Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=2128929 +Signed-off-by: Ani Sinha +Message-Id: <20230619065209.442185-1-anisinha@redhat.com> +Reviewed-by: Michael S. 
Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit a0d7215e339b61c7d7a7b3fcf754954d80d93eb8) +--- + net/vhost-vdpa.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c +index 99904a0da7..8c8900f0f4 100644 +--- a/net/vhost-vdpa.c ++++ b/net/vhost-vdpa.c +@@ -184,6 +184,14 @@ static void vhost_vdpa_cleanup(NetClientState *nc) + { + VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc); + ++ /* ++ * If a peer NIC is attached, do not cleanup anything. ++ * Cleanup will happen as a part of qemu_cleanup() -> net_cleanup() ++ * when the guest is shutting down. ++ */ ++ if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_NIC) { ++ return; ++ } + qemu_vfree(s->cvq_cmd_out_buffer); + qemu_vfree(s->status); + if (s->vhost_net) { +-- +2.39.3 + diff --git a/qemu-kvm.spec b/qemu-kvm.spec index e7cd2af..ad8ea7e 100644 --- a/qemu-kvm.spec +++ b/qemu-kvm.spec @@ -149,7 +149,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}:%{version} \ Summary: QEMU is a machine emulator and virtualizer Name: qemu-kvm Version: 8.0.0 -Release: 6%{?rcrel}%{?dist}%{?cc_suffix} +Release: 7%{?rcrel}%{?dist}%{?cc_suffix} # Epoch because we pushed a qemu-1.0 package. 
AIUI this can't ever be dropped # Epoch 15 used for RHEL 8 # Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5) @@ -360,6 +360,18 @@ Patch102: kvm-target-i386-add-support-for-FB_CLEAR-feature.patch Patch103: kvm-block-blkio-use-qemu_open-to-support-fd-passing-for-.patch # For bz#2180076 - [qemu-kvm] support fd passing for libblkio QEMU BlockDrivers Patch104: kvm-qapi-add-fdset-feature-for-BlockdevOptionsVirtioBlkV.patch +# For bz#2171363 - [aarch64] Kernel hits Call trace with irregular CPU-to-NUMA association +Patch105: kvm-numa-Validate-cluster-and-NUMA-node-boundary-if-requ.patch +# For bz#2171363 - [aarch64] Kernel hits Call trace with irregular CPU-to-NUMA association +Patch106: kvm-hw-arm-Validate-cluster-and-NUMA-node-boundary.patch +# For bz#2171363 - [aarch64] Kernel hits Call trace with irregular CPU-to-NUMA association +Patch107: kvm-hw-arm-virt-Validate-cluster-and-NUMA-node-boundary-.patch +# For RHEL-330 - [virtual network][qemu-kvm-8.0.0-rc1]qemu core dump: qemu-kvm: ../softmmu/memory.c:2592: void memory_region_del_eventfd(MemoryRegion *, hwaddr, unsigned int, _Bool, uint64_t, EventNotifier *): Assertion `i != mr->ioeventfd_nb' failed +Patch108: kvm-vhost-fix-vhost_dev_enable_notifiers-error-case.patch +# For bz#2218644 - query-stats QMP command interrupts vcpus, the Max Latencies could be more than 100us (rhel 9.3.0 clone) +Patch109: kvm-kvm-reuse-per-vcpu-stats-fd-to-avoid-vcpu-interrupti.patch +# For bz#2128929 - [rhel9.2] hotplug/hotunplug mlx vdpa device to the occupied addr port, then qemu core dump occurs after shutdown guest +Patch110: kvm-vhost-vdpa-do-not-cleanup-the-vdpa-vhost-net-structu.patch %if %{have_clang} BuildRequires: clang @@ -1400,6 +1412,22 @@ useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \ %endif %changelog +* Mon Jul 10 2023 Miroslav Rezanina - 8.0.0-7 +- kvm-numa-Validate-cluster-and-NUMA-node-boundary-if-requ.patch [bz#2171363] +- kvm-hw-arm-Validate-cluster-and-NUMA-node-boundary.patch 
[bz#2171363] +- kvm-hw-arm-virt-Validate-cluster-and-NUMA-node-boundary-.patch [bz#2171363] +- kvm-vhost-fix-vhost_dev_enable_notifiers-error-case.patch [RHEL-330] +- kvm-kvm-reuse-per-vcpu-stats-fd-to-avoid-vcpu-interrupti.patch [bz#2218644] +- kvm-vhost-vdpa-do-not-cleanup-the-vdpa-vhost-net-structu.patch [bz#2128929] +- Resolves: bz#2171363 + ([aarch64] Kernel hits Call trace with irregular CPU-to-NUMA association) +- Resolves: RHEL-330 + ([virtual network][qemu-kvm-8.0.0-rc1]qemu core dump: qemu-kvm: ../softmmu/memory.c:2592: void memory_region_del_eventfd(MemoryRegion *, hwaddr, unsigned int, _Bool, uint64_t, EventNotifier *): Assertion `i != mr->ioeventfd_nb' failed) +- Resolves: bz#2218644 + (query-stats QMP command interrupts vcpus, the Max Latencies could be more than 100us (rhel 9.3.0 clone)) +- Resolves: bz#2128929 + ([rhel9.2] hotplug/hotunplug mlx vdpa device to the occupied addr port, then qemu core dump occurs after shutdown guest) + * Mon Jun 26 2023 Miroslav Rezanina - 8.0.0-6 - kvm-target-i386-add-support-for-FLUSH_L1D-feature.patch [bz#2216201] - kvm-target-i386-add-support-for-FB_CLEAR-feature.patch [bz#2216201]