diff --git a/SOURCES/kvm-Enable-amd-iommu-device.patch b/SOURCES/kvm-Enable-amd-iommu-device.patch new file mode 100644 index 0000000..44846a5 --- /dev/null +++ b/SOURCES/kvm-Enable-amd-iommu-device.patch @@ -0,0 +1,38 @@ +From 0608561efc441f234d9aaf45f1867ffb5c43cffe Mon Sep 17 00:00:00 2001 +From: John Allen +Date: Wed, 11 Jun 2025 15:41:14 -0500 +Subject: [PATCH 26/57] Enable amd-iommu device + +RH-Author: John Allen +RH-MergeRequest: 380: Add ability to manually specify the AMDVI-PCI device +RH-Jira: RHEL-70925 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [3/3] 852500a18275e14bcd94d598ccd0ee33b76578dc (johnalle/qemu-kvm-fork) + +Now that the amdvi-pci device that amd-iommu creates can be specified +manually, amd-iommu device can be enabled. + +JIRA: https://issues.redhat.com/browse/RHEL-70925 + +Upstream: RHEL ONLY + +Signed-off-by: John Allen +--- + configs/devices/x86_64-softmmu/x86_64-rh-devices.mak | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak b/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak +index 3e5f693b62..2b15fdc2db 100644 +--- a/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak ++++ b/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak +@@ -97,6 +97,7 @@ CONFIG_VIRTIO_MEM=y + CONFIG_VIRTIO_PCI=y + CONFIG_VIRTIO_VGA=y + CONFIG_VIRTIO_IOMMU=y ++CONFIG_AMD_IOMMU=y + CONFIG_VMMOUSE=y + CONFIG_VMPORT=y + CONFIG_VTD=y +-- +2.39.3 + diff --git a/SOURCES/kvm-amd_iommu-Add-support-for-pass-though-mode.patch b/SOURCES/kvm-amd_iommu-Add-support-for-pass-though-mode.patch new file mode 100644 index 0000000..b0038a7 --- /dev/null +++ b/SOURCES/kvm-amd_iommu-Add-support-for-pass-though-mode.patch @@ -0,0 +1,141 @@ +From 4114553452f7187283aefa001bc8342fc65b6b72 Mon Sep 17 00:00:00 2001 +From: John Allen +Date: Wed, 11 Dec 2024 15:06:48 -0600 +Subject: [PATCH 04/57] amd_iommu: Add support for pass though mode + +RH-Author: John Allen +RH-MergeRequest: 303: Interrupt Remap support for emulated amd 
viommu +RH-Jira: RHEL-66202 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [2/5] 0434fefd554baf27fb9d93026af513c621f8cdb0 (johnalle/qemu-kvm-fork) + +JIRA: https://issues.redhat.com/browse/RHEL-66202 + +commit c1f46999ef506d9854534560a94d02cf3cf9edd1 +Author: Suravee Suthikulpanit +Date: Fri Sep 27 12:29:10 2024 -0500 + + amd_iommu: Add support for pass though mode + + Introduce 'nodma' shared memory region to support PT mode + so that for each device, we only create an alias to shared memory + region when DMA-remapping is disabled. + + Reviewed-by: Alejandro Jimenez + Signed-off-by: Suravee Suthikulpanit + Signed-off-by: Santosh Shukla + Message-Id: <20240927172913.121477-3-santosh.shukla@amd.com> + Reviewed-by: Michael S. Tsirkin + Signed-off-by: Michael S. Tsirkin + +Signed-off-by: John Allen +--- + hw/i386/amd_iommu.c | 49 ++++++++++++++++++++++++++++++++++++--------- + hw/i386/amd_iommu.h | 2 ++ + 2 files changed, 42 insertions(+), 9 deletions(-) + +diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c +index 148b5ee51d..567cb8adc9 100644 +--- a/hw/i386/amd_iommu.c ++++ b/hw/i386/amd_iommu.c +@@ -60,8 +60,9 @@ struct AMDVIAddressSpace { + uint8_t bus_num; /* bus number */ + uint8_t devfn; /* device function */ + AMDVIState *iommu_state; /* AMDVI - one per machine */ +- MemoryRegion root; /* AMDVI Root memory map region */ ++ MemoryRegion root; /* AMDVI Root memory map region */ + IOMMUMemoryRegion iommu; /* Device's address translation region */ ++ MemoryRegion iommu_nodma; /* Alias of shared nodma memory region */ + MemoryRegion iommu_ir; /* Device's interrupt remapping region */ + AddressSpace as; /* device's corresponding address space */ + }; +@@ -1412,6 +1413,7 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) + AMDVIState *s = opaque; + AMDVIAddressSpace **iommu_as, *amdvi_dev_as; + int bus_num = pci_bus_num(bus); ++ X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); + + iommu_as = s->address_spaces[bus_num]; + +@@ -1436,13 
+1438,13 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) + * Memory region relationships looks like (Address range shows + * only lower 32 bits to make it short in length...): + * +- * |-----------------+-------------------+----------| +- * | Name | Address range | Priority | +- * |-----------------+-------------------+----------+ +- * | amdvi_root | 00000000-ffffffff | 0 | +- * | amdvi_iommu | 00000000-ffffffff | 1 | +- * | amdvi_iommu_ir | fee00000-feefffff | 64 | +- * |-----------------+-------------------+----------| ++ * |--------------------+-------------------+----------| ++ * | Name | Address range | Priority | ++ * |--------------------+-------------------+----------+ ++ * | amdvi-root | 00000000-ffffffff | 0 | ++ * | amdvi-iommu_nodma | 00000000-ffffffff | 0 | ++ * | amdvi-iommu_ir | fee00000-feefffff | 64 | ++ * |--------------------+-------------------+----------| + */ + memory_region_init_iommu(&amdvi_dev_as->iommu, + sizeof(amdvi_dev_as->iommu), +@@ -1461,7 +1463,25 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) + 64); + memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0, + MEMORY_REGION(&amdvi_dev_as->iommu), +- 1); ++ 0); ++ ++ /* Build the DMA Disabled alias to shared memory */ ++ memory_region_init_alias(&amdvi_dev_as->iommu_nodma, OBJECT(s), ++ "amdvi-sys", &s->mr_sys, 0, ++ memory_region_size(&s->mr_sys)); ++ memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0, ++ &amdvi_dev_as->iommu_nodma, ++ 0); ++ ++ if (!x86_iommu->pt_supported) { ++ memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, false); ++ memory_region_set_enabled(MEMORY_REGION(&amdvi_dev_as->iommu), ++ true); ++ } else { ++ memory_region_set_enabled(MEMORY_REGION(&amdvi_dev_as->iommu), ++ false); ++ memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, true); ++ } + } + return &iommu_as[devfn]->as; + } +@@ -1602,6 +1622,17 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) + 
"amdvi-mmio", AMDVI_MMIO_SIZE); + memory_region_add_subregion(get_system_memory(), AMDVI_BASE_ADDR, + &s->mr_mmio); ++ ++ /* Create the share memory regions by all devices */ ++ memory_region_init(&s->mr_sys, OBJECT(s), "amdvi-sys", UINT64_MAX); ++ ++ /* set up the DMA disabled memory region */ ++ memory_region_init_alias(&s->mr_nodma, OBJECT(s), ++ "amdvi-nodma", get_system_memory(), 0, ++ memory_region_size(get_system_memory())); ++ memory_region_add_subregion_overlap(&s->mr_sys, 0, ++ &s->mr_nodma, 0); ++ + pci_setup_iommu(bus, &amdvi_iommu_ops, s); + amdvi_init(s); + } +diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h +index e5c2ae94f2..be417e51c4 100644 +--- a/hw/i386/amd_iommu.h ++++ b/hw/i386/amd_iommu.h +@@ -354,6 +354,8 @@ struct AMDVIState { + uint32_t pprlog_tail; /* ppr log tail */ + + MemoryRegion mr_mmio; /* MMIO region */ ++ MemoryRegion mr_sys; ++ MemoryRegion mr_nodma; + uint8_t mmior[AMDVI_MMIO_SIZE]; /* read/write MMIO */ + uint8_t w1cmask[AMDVI_MMIO_SIZE]; /* read/write 1 clear mask */ + uint8_t romask[AMDVI_MMIO_SIZE]; /* MMIO read/only mask */ +-- +2.39.3 + diff --git a/SOURCES/kvm-amd_iommu-Check-APIC-ID-255-for-XTSup.patch b/SOURCES/kvm-amd_iommu-Check-APIC-ID-255-for-XTSup.patch new file mode 100644 index 0000000..a230203 --- /dev/null +++ b/SOURCES/kvm-amd_iommu-Check-APIC-ID-255-for-XTSup.patch @@ -0,0 +1,66 @@ +From 0397ebacdba6539147d9986255c3f81cbfdabf1e Mon Sep 17 00:00:00 2001 +From: John Allen +Date: Wed, 11 Dec 2024 15:07:03 -0600 +Subject: [PATCH 07/57] amd_iommu: Check APIC ID > 255 for XTSup + +RH-Author: John Allen +RH-MergeRequest: 303: Interrupt Remap support for emulated amd viommu +RH-Jira: RHEL-66202 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [5/5] f39b3e3cdefc2b562f1ad2ef939a37bf404f355a (johnalle/qemu-kvm-fork) + +JIRA: https://issues.redhat.com/browse/RHEL-66202 + +commit b12cb3819baf6d9ee8140d4dd6d36fa829e2c6d9 +Author: Suravee Suthikulpanit +Date: Fri Sep 27 12:29:13 2024 -0500 + + amd_iommu: Check APIC ID > 
255 for XTSup + + The XTSup mode enables x2APIC support for AMD IOMMU, which is needed + to support vcpu w/ APIC ID > 255. + + Reviewed-by: Alejandro Jimenez + Signed-off-by: Suravee Suthikulpanit + Signed-off-by: Santosh Shukla + Message-Id: <20240927172913.121477-6-santosh.shukla@amd.com> + Reviewed-by: Michael S. Tsirkin + Signed-off-by: Michael S. Tsirkin + +Signed-off-by: John Allen +--- + hw/i386/amd_iommu.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c +index 82d76dfca9..d804656ea8 100644 +--- a/hw/i386/amd_iommu.c ++++ b/hw/i386/amd_iommu.c +@@ -32,6 +32,7 @@ + #include "trace.h" + #include "hw/i386/apic-msidef.h" + #include "hw/qdev-properties.h" ++#include "kvm/kvm_i386.h" + + /* used AMD-Vi MMIO registers */ + const char *amdvi_mmio_low[] = { +@@ -1651,6 +1652,16 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) + memory_region_add_subregion_overlap(&s->mr_sys, AMDVI_INT_ADDR_FIRST, + &s->mr_ir, 1); + ++ /* AMD IOMMU with x2APIC mode requires xtsup=on */ ++ if (x86ms->apic_id_limit > 255 && !s->xtsup) { ++ error_report("AMD IOMMU with x2APIC confguration requires xtsup=on"); ++ exit(EXIT_FAILURE); ++ } ++ if (s->xtsup && kvm_irqchip_is_split() && !kvm_enable_x2apic()) { ++ error_report("AMD IOMMU xtsup=on requires support on the KVM side"); ++ exit(EXIT_FAILURE); ++ } ++ + pci_setup_iommu(bus, &amdvi_iommu_ops, s); + amdvi_init(s); + } +-- +2.39.3 + diff --git a/SOURCES/kvm-amd_iommu-Rename-variable-mmio-to-mr_mmio.patch b/SOURCES/kvm-amd_iommu-Rename-variable-mmio-to-mr_mmio.patch new file mode 100644 index 0000000..76c9fd6 --- /dev/null +++ b/SOURCES/kvm-amd_iommu-Rename-variable-mmio-to-mr_mmio.patch @@ -0,0 +1,94 @@ +From f733325d3d91576ae9f6e341faabc301542fc6c8 Mon Sep 17 00:00:00 2001 +From: John Allen +Date: Wed, 11 Dec 2024 15:06:44 -0600 +Subject: [PATCH 03/57] amd_iommu: Rename variable mmio to mr_mmio + +RH-Author: John Allen +RH-MergeRequest: 303: Interrupt 
Remap support for emulated amd viommu +RH-Jira: RHEL-66202 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [1/5] 1996a48efb7210d4d1e0b929be2d115d672e1a02 (johnalle/qemu-kvm-fork) + +JIRA: https://issues.redhat.com/browse/RHEL-66202 + +commit 2e6f051cfc58e69dcb392cd245d8f01b0c2e963f +Author: Suravee Suthikulpanit +Date: Fri Sep 27 12:29:09 2024 -0500 + + amd_iommu: Rename variable mmio to mr_mmio + + Rename the MMIO memory region variable 'mmio' to 'mr_mmio' + so to correctly name align with struct AMDVIState::variable type. + + No functional change intended. + + Reviewed-by: Alejandro Jimenez + Signed-off-by: Suravee Suthikulpanit + Signed-off-by: Santosh Shukla + Message-Id: <20240927172913.121477-2-santosh.shukla@amd.com> + Reviewed-by: Michael S. Tsirkin + Signed-off-by: Michael S. Tsirkin + +Signed-off-by: John Allen +--- + hw/i386/acpi-build.c | 4 ++-- + hw/i386/amd_iommu.c | 6 +++--- + hw/i386/amd_iommu.h | 2 +- + 3 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c +index 5d4bd2b710..032fb1f904 100644 +--- a/hw/i386/acpi-build.c ++++ b/hw/i386/acpi-build.c +@@ -2397,7 +2397,7 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id, + /* Capability offset */ + build_append_int_noprefix(table_data, s->pci.capab_offset, 2); + /* IOMMU base address */ +- build_append_int_noprefix(table_data, s->mmio.addr, 8); ++ build_append_int_noprefix(table_data, s->mr_mmio.addr, 8); + /* PCI Segment Group */ + build_append_int_noprefix(table_data, 0, 2); + /* IOMMU info */ +@@ -2432,7 +2432,7 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id, + /* Capability offset */ + build_append_int_noprefix(table_data, s->pci.capab_offset, 2); + /* IOMMU base address */ +- build_append_int_noprefix(table_data, s->mmio.addr, 8); ++ build_append_int_noprefix(table_data, s->mr_mmio.addr, 8); + /* PCI Segment Group */ + build_append_int_noprefix(table_data, 0, 2); + /* IOMMU info */ +diff 
--git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c +index 87643d2891..148b5ee51d 100644 +--- a/hw/i386/amd_iommu.c ++++ b/hw/i386/amd_iommu.c +@@ -1598,10 +1598,10 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) + x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID); + + /* set up MMIO */ +- memory_region_init_io(&s->mmio, OBJECT(s), &mmio_mem_ops, s, "amdvi-mmio", +- AMDVI_MMIO_SIZE); ++ memory_region_init_io(&s->mr_mmio, OBJECT(s), &mmio_mem_ops, s, ++ "amdvi-mmio", AMDVI_MMIO_SIZE); + memory_region_add_subregion(get_system_memory(), AMDVI_BASE_ADDR, +- &s->mmio); ++ &s->mr_mmio); + pci_setup_iommu(bus, &amdvi_iommu_ops, s); + amdvi_init(s); + } +diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h +index 73619fe9ea..e5c2ae94f2 100644 +--- a/hw/i386/amd_iommu.h ++++ b/hw/i386/amd_iommu.h +@@ -353,7 +353,7 @@ struct AMDVIState { + uint32_t pprlog_head; /* ppr log head */ + uint32_t pprlog_tail; /* ppr log tail */ + +- MemoryRegion mmio; /* MMIO region */ ++ MemoryRegion mr_mmio; /* MMIO region */ + uint8_t mmior[AMDVI_MMIO_SIZE]; /* read/write MMIO */ + uint8_t w1cmask[AMDVI_MMIO_SIZE]; /* read/write 1 clear mask */ + uint8_t romask[AMDVI_MMIO_SIZE]; /* MMIO read/only mask */ +-- +2.39.3 + diff --git a/SOURCES/kvm-amd_iommu-Send-notification-when-invalidate-interrup.patch b/SOURCES/kvm-amd_iommu-Send-notification-when-invalidate-interrup.patch new file mode 100644 index 0000000..044f8d1 --- /dev/null +++ b/SOURCES/kvm-amd_iommu-Send-notification-when-invalidate-interrup.patch @@ -0,0 +1,81 @@ +From 17ce6ac0d8edb04ba79bb39d3f695cd0506a9dc2 Mon Sep 17 00:00:00 2001 +From: John Allen +Date: Wed, 11 Dec 2024 15:06:59 -0600 +Subject: [PATCH 06/57] amd_iommu: Send notification when invalidate interrupt + entry cache + +RH-Author: John Allen +RH-MergeRequest: 303: Interrupt Remap support for emulated amd viommu +RH-Jira: RHEL-66202 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [4/5] d57e8fb4e69f3c01d32673bf658aae5067d6b969 
(johnalle/qemu-kvm-fork) + +JIRA: https://issues.redhat.com/browse/RHEL-66202 + +commit f84aad4d718b83d2a4d90485992e5421430032e1 +Author: Suravee Suthikulpanit +Date: Fri Sep 27 12:29:12 2024 -0500 + + amd_iommu: Send notification when invalidate interrupt entry cache + + In order to support AMD IOMMU interrupt remapping emulation with PCI + pass-through devices, QEMU needs to notify VFIO when guest IOMMU driver + updates and invalidate the guest interrupt remapping table (IRT), and + communicate information so that the host IOMMU driver can update + the shadowed interrupt remapping table in the host IOMMU. + + Therefore, send notification when guest IOMMU emulates the IRT + invalidation commands. + + Reviewed-by: Alejandro Jimenez + Signed-off-by: Suravee Suthikulpanit + Signed-off-by: Santosh Shukla + Message-Id: <20240927172913.121477-5-santosh.shukla@amd.com> + Reviewed-by: Michael S. Tsirkin + Signed-off-by: Michael S. Tsirkin + +Signed-off-by: John Allen +--- + hw/i386/amd_iommu.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c +index 8fcf5eacb4..82d76dfca9 100644 +--- a/hw/i386/amd_iommu.c ++++ b/hw/i386/amd_iommu.c +@@ -431,6 +431,12 @@ static void amdvi_complete_ppr(AMDVIState *s, uint64_t *cmd) + trace_amdvi_ppr_exec(); + } + ++static void amdvi_intremap_inval_notify_all(AMDVIState *s, bool global, ++ uint32_t index, uint32_t mask) ++{ ++ x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask); ++} ++ + static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd) + { + if (extract64(cmd[0], 0, 60) || cmd[1]) { +@@ -438,6 +444,9 @@ static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd) + s->cmdbuf + s->cmdbuf_head); + } + ++ /* Notify global invalidation */ ++ amdvi_intremap_inval_notify_all(s, true, 0, 0); ++ + amdvi_iotlb_reset(s); + trace_amdvi_all_inval(); + } +@@ -486,6 +495,9 @@ static void amdvi_inval_inttable(AMDVIState *s, uint64_t *cmd) + return; + } + ++ /* Notify global 
invalidation */ ++ amdvi_intremap_inval_notify_all(s, true, 0, 0); ++ + trace_amdvi_intr_inval(); + } + +-- +2.39.3 + diff --git a/SOURCES/kvm-amd_iommu-Use-shared-memory-region-for-Interrupt-Rem.patch b/SOURCES/kvm-amd_iommu-Use-shared-memory-region-for-Interrupt-Rem.patch new file mode 100644 index 0000000..39ad4ef --- /dev/null +++ b/SOURCES/kvm-amd_iommu-Use-shared-memory-region-for-Interrupt-Rem.patch @@ -0,0 +1,105 @@ +From 4859d41adfaae8933e074dcefdc81edd3832c914 Mon Sep 17 00:00:00 2001 +From: John Allen +Date: Wed, 11 Dec 2024 15:06:55 -0600 +Subject: [PATCH 05/57] amd_iommu: Use shared memory region for Interrupt + Remapping + +RH-Author: John Allen +RH-MergeRequest: 303: Interrupt Remap support for emulated amd viommu +RH-Jira: RHEL-66202 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [3/5] 48c0513c80257bfbd12c2cf3bab2503bd95d0b1c (johnalle/qemu-kvm-fork) + +JIRA: https://issues.redhat.com/browse/RHEL-66202 + +commit 9fc9dbac61ddde7d8df37e84c8e02cec249d3222 +Author: Suravee Suthikulpanit +Date: Fri Sep 27 12:29:11 2024 -0500 + + amd_iommu: Use shared memory region for Interrupt Remapping + + Use shared memory region for interrupt remapping which can be + aliased by all devices. + + Reviewed-by: Alejandro Jimenez + Signed-off-by: Suravee Suthikulpanit + Signed-off-by: Santosh Shukla + Message-Id: <20240927172913.121477-4-santosh.shukla@amd.com> + Reviewed-by: Michael S. Tsirkin + Signed-off-by: Michael S. 
Tsirkin + +Signed-off-by: John Allen +--- + hw/i386/amd_iommu.c | 22 ++++++++++++++-------- + hw/i386/amd_iommu.h | 1 + + 2 files changed, 15 insertions(+), 8 deletions(-) + +diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c +index 567cb8adc9..8fcf5eacb4 100644 +--- a/hw/i386/amd_iommu.c ++++ b/hw/i386/amd_iommu.c +@@ -1443,7 +1443,7 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) + * |--------------------+-------------------+----------+ + * | amdvi-root | 00000000-ffffffff | 0 | + * | amdvi-iommu_nodma | 00000000-ffffffff | 0 | +- * | amdvi-iommu_ir | fee00000-feefffff | 64 | ++ * | amdvi-iommu_ir | fee00000-feefffff | 1 | + * |--------------------+-------------------+----------| + */ + memory_region_init_iommu(&amdvi_dev_as->iommu, +@@ -1454,13 +1454,6 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) + memory_region_init(&amdvi_dev_as->root, OBJECT(s), + "amdvi_root", UINT64_MAX); + address_space_init(&amdvi_dev_as->as, &amdvi_dev_as->root, name); +- memory_region_init_io(&amdvi_dev_as->iommu_ir, OBJECT(s), +- &amdvi_ir_ops, s, "amd_iommu_ir", +- AMDVI_INT_ADDR_SIZE); +- memory_region_add_subregion_overlap(&amdvi_dev_as->root, +- AMDVI_INT_ADDR_FIRST, +- &amdvi_dev_as->iommu_ir, +- 64); + memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0, + MEMORY_REGION(&amdvi_dev_as->iommu), + 0); +@@ -1472,6 +1465,13 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) + memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0, + &amdvi_dev_as->iommu_nodma, + 0); ++ /* Build the Interrupt Remapping alias to shared memory */ ++ memory_region_init_alias(&amdvi_dev_as->iommu_ir, OBJECT(s), ++ "amdvi-ir", &s->mr_ir, 0, ++ memory_region_size(&s->mr_ir)); ++ memory_region_add_subregion_overlap(MEMORY_REGION(&amdvi_dev_as->iommu), ++ AMDVI_INT_ADDR_FIRST, ++ &amdvi_dev_as->iommu_ir, 1); + + if (!x86_iommu->pt_supported) { + 
memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, false); +@@ -1633,6 +1633,12 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) + memory_region_add_subregion_overlap(&s->mr_sys, 0, + &s->mr_nodma, 0); + ++ /* set up the Interrupt Remapping memory region */ ++ memory_region_init_io(&s->mr_ir, OBJECT(s), &amdvi_ir_ops, ++ s, "amdvi-ir", AMDVI_INT_ADDR_SIZE); ++ memory_region_add_subregion_overlap(&s->mr_sys, AMDVI_INT_ADDR_FIRST, ++ &s->mr_ir, 1); ++ + pci_setup_iommu(bus, &amdvi_iommu_ops, s); + amdvi_init(s); + } +diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h +index be417e51c4..e0dac4d9a9 100644 +--- a/hw/i386/amd_iommu.h ++++ b/hw/i386/amd_iommu.h +@@ -356,6 +356,7 @@ struct AMDVIState { + MemoryRegion mr_mmio; /* MMIO region */ + MemoryRegion mr_sys; + MemoryRegion mr_nodma; ++ MemoryRegion mr_ir; + uint8_t mmior[AMDVI_MMIO_SIZE]; /* read/write MMIO */ + uint8_t w1cmask[AMDVI_MMIO_SIZE]; /* read/write 1 clear mask */ + uint8_t romask[AMDVI_MMIO_SIZE]; /* MMIO read/only mask */ +-- +2.39.3 + diff --git a/SOURCES/kvm-arm-Use-arm_virt_compat_set-to-apply-the-compat.patch b/SOURCES/kvm-arm-Use-arm_virt_compat_set-to-apply-the-compat.patch new file mode 100644 index 0000000..e293f6a --- /dev/null +++ b/SOURCES/kvm-arm-Use-arm_virt_compat_set-to-apply-the-compat.patch @@ -0,0 +1,53 @@ +From 173beb6698538dcffefab36772e107ffb0b4fbbd Mon Sep 17 00:00:00 2001 +From: Shaoqin Huang +Date: Mon, 28 Apr 2025 04:34:27 -0400 +Subject: [PATCH 2/5] arm: Use arm_virt_compat_set() to apply the compat + +RH-Author: Shaoqin Huang +RH-MergeRequest: 353: virtio-net: disable USO for virt-rhel9.6 +RH-Jira: RHEL-80313 +RH-Acked-by: Thomas Huth +RH-Acked-by: Eric Auger +RH-Commit: [2/2] 6e7a158e65296928040e70622b3cee59e45c1c36 (shahuang/qemu-kvm) + +JIRA: https://issues.redhat.com/browse/RHEL-80313 +Upstream Status: RHEL only + +Since the pauth and uso both should apply for the latest machine type, +move them to the arm_virt_compat_set() which applies the 
compat to all +machine types automatically. + +Signed-off-by: Shaoqin Huang +--- + hw/arm/virt.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index 896deaa025..2aef94e776 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -127,6 +127,10 @@ static void arm_virt_compat_set(MachineClass *mc) + arm_virt_compat_len); + compat_props_add(mc->compat_props, arm_rhel_compat, + arm_rhel_compat_len); ++ compat_props_add(mc->compat_props, arm_rhel9_compat, ++ arm_rhel9_compat_len); ++ compat_props_add(mc->compat_props, hw_compat_rhel_9, ++ hw_compat_rhel_9_len); + } + + #define DEFINE_VIRT_MACHINE_IMPL(latest, ...) \ +@@ -3599,10 +3603,6 @@ DEFINE_VIRT_MACHINE(2, 6) + + static void virt_rhel_machine_9_6_0_options(MachineClass *mc) + { +- compat_props_add(mc->compat_props, arm_rhel9_compat, arm_rhel9_compat_len); +- +- /* NB: remember to move this line to the *latest* RHEL 9 machine */ +- compat_props_add(mc->compat_props, hw_compat_rhel_9, hw_compat_rhel_9_len); + } + DEFINE_VIRT_MACHINE_AS_LATEST(9, 6, 0) + +-- +2.48.1 + diff --git a/SOURCES/kvm-block-Add-new-bdrv_co_is_all_zeroes-function.patch b/SOURCES/kvm-block-Add-new-bdrv_co_is_all_zeroes-function.patch new file mode 100644 index 0000000..174ab4f --- /dev/null +++ b/SOURCES/kvm-block-Add-new-bdrv_co_is_all_zeroes-function.patch @@ -0,0 +1,145 @@ +From f2cd96a040dd7863484d22a3995a2904605dadde Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Fri, 9 May 2025 15:40:21 -0500 +Subject: [PATCH 06/16] block: Add new bdrv_co_is_all_zeroes() function + +RH-Author: Eric Blake +RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors +RH-Jira: RHEL-82906 RHEL-83015 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Jon Maloy +RH-Commit: [4/14] aabcba8323df698a72842f299e9242a5eee3aea6 (ebblake/centos-qemu-kvm) + +There are some optimizations that require knowing if an image starts +out as reading all zeroes, such as making blockdev-mirror faster by 
+skipping the copying of source zeroes to the destination. The +existing bdrv_co_is_zero_fast() is a good building block for answering +this question, but it tends to give an answer of 0 for a file we just +created via QMP 'blockdev-create' or similar (such as 'qemu-img create +-f raw'). Why? Because file-posix.c insists on allocating a tiny +header to any file rather than leaving it 100% sparse, due to some +filesystems that are unable to answer alignment probes on a hole. But +teaching file-posix.c to read the tiny header doesn't scale - the +problem of a small header is also visible when libvirt sets up an NBD +client to a just-created file on a migration destination host. + +So, we need a wrapper function that handles a bit more complexity in a +common manner for all block devices - when the BDS is mostly a hole, +but has a small non-hole header, it is still worth the time to read +that header and check if it reads as all zeroes before giving up and +returning a pessimistic answer. + +Signed-off-by: Eric Blake +Reviewed-by: Stefan Hajnoczi +Message-ID: <20250509204341.3553601-19-eblake@redhat.com> +(cherry picked from commit 52726096707c5c8b90597c445de897fa64d56e73) +Conflicts: + block/io.c - context with header names +Jira: https://issues.redhat.com/browse/RHEL-82906 +Jira: https://issues.redhat.com/browse/RHEL-83015 +Signed-off-by: Eric Blake +--- + block/io.c | 62 ++++++++++++++++++++++++++++++++++++++++ + include/block/block-io.h | 2 ++ + 2 files changed, 64 insertions(+) + +diff --git a/block/io.c b/block/io.c +index 293c5dd393..1f01337599 100644 +--- a/block/io.c ++++ b/block/io.c +@@ -38,10 +38,14 @@ + #include "qemu/error-report.h" + #include "qemu/main-loop.h" + #include "sysemu/replay.h" ++#include "qemu/units.h" + + /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */ + #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS) + ++/* Maximum read size for checking if data reads as zero, in bytes */ ++#define MAX_ZERO_CHECK_BUFFER (128 
* KiB) ++ + static void coroutine_fn GRAPH_RDLOCK + bdrv_parent_cb_resize(BlockDriverState *bs); + +@@ -2774,6 +2778,64 @@ int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, + return 1; + } + ++/* ++ * Check @bs (and its backing chain) to see if the entire image is known ++ * to read as zeroes. ++ * Return 1 if that is the case, 0 otherwise and -errno on error. ++ * This test is meant to be fast rather than accurate so returning 0 ++ * does not guarantee non-zero data; however, a return of 1 is reliable, ++ * and this function can report 1 in more cases than bdrv_co_is_zero_fast. ++ */ ++int coroutine_fn bdrv_co_is_all_zeroes(BlockDriverState *bs) ++{ ++ int ret; ++ int64_t pnum, bytes; ++ char *buf; ++ QEMUIOVector local_qiov; ++ IO_CODE(); ++ ++ bytes = bdrv_co_getlength(bs); ++ if (bytes < 0) { ++ return bytes; ++ } ++ ++ /* First probe - see if the entire image reads as zero */ ++ ret = bdrv_co_common_block_status_above(bs, NULL, false, BDRV_WANT_ZERO, ++ 0, bytes, &pnum, NULL, NULL, ++ NULL); ++ if (ret < 0) { ++ return ret; ++ } ++ if (ret & BDRV_BLOCK_ZERO) { ++ return bdrv_co_is_zero_fast(bs, pnum, bytes - pnum); ++ } ++ ++ /* ++ * Because of the way 'blockdev-create' works, raw files tend to ++ * be created with a non-sparse region at the front to make ++ * alignment probing easier. If the block starts with only a ++ * small allocated region, it is still worth the effort to see if ++ * the rest of the image is still sparse, coupled with manually ++ * reading the first region to see if it reads zero after all. ++ */ ++ if (pnum > MAX_ZERO_CHECK_BUFFER) { ++ return 0; ++ } ++ ret = bdrv_co_is_zero_fast(bs, pnum, bytes - pnum); ++ if (ret <= 0) { ++ return ret; ++ } ++ /* Only the head of the image is unknown, and it's small. Read it. 
*/ ++ buf = qemu_blockalign(bs, pnum); ++ qemu_iovec_init_buf(&local_qiov, buf, pnum); ++ ret = bdrv_driver_preadv(bs, 0, pnum, &local_qiov, 0, 0); ++ if (ret >= 0) { ++ ret = buffer_is_zero(buf, pnum); ++ } ++ qemu_vfree(buf); ++ return ret; ++} ++ + int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset, + int64_t bytes, int64_t *pnum) + { +diff --git a/include/block/block-io.h b/include/block/block-io.h +index b49e0537dd..b99cc98d26 100644 +--- a/include/block/block-io.h ++++ b/include/block/block-io.h +@@ -161,6 +161,8 @@ bdrv_is_allocated_above(BlockDriverState *bs, BlockDriverState *base, + + int coroutine_fn GRAPH_RDLOCK + bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, int64_t bytes); ++int coroutine_fn GRAPH_RDLOCK ++bdrv_co_is_all_zeroes(BlockDriverState *bs); + + int GRAPH_RDLOCK + bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg, +-- +2.48.1 + diff --git a/SOURCES/kvm-block-Expand-block-status-mode-from-bool-to-flags.patch b/SOURCES/kvm-block-Expand-block-status-mode-from-bool-to-flags.patch new file mode 100644 index 0000000..ce9b9cc --- /dev/null +++ b/SOURCES/kvm-block-Expand-block-status-mode-from-bool-to-flags.patch @@ -0,0 +1,689 @@ +From 26f5d221dd16137bed3527ee120cdf085e2c7e23 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Fri, 9 May 2025 15:40:18 -0500 +Subject: [PATCH 03/16] block: Expand block status mode from bool to flags + +RH-Author: Eric Blake +RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors +RH-Jira: RHEL-82906 RHEL-83015 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Jon Maloy +RH-Commit: [1/14] 9de5245def80e9815ed306e4abce9caec56cef6f (ebblake/centos-qemu-kvm) + +This patch is purely mechanical, changing bool want_zero into an +unsigned int for bitwise-or of flags. 
As of this patch, all +implementations are unchanged (the old want_zero==true is now +mode==BDRV_WANT_PRECISE which is a superset of BDRV_WANT_ZERO); but +the callers in io.c that used to pass want_zero==false are now +prepared for future driver changes that can now distinguish between +BDRV_WANT_ZERO vs. BDRV_WANT_ALLOCATED. The next patch will actually +change the file-posix driver along those lines, now that we have +more-specific hints. + +As for the background why this patch is useful: right now, the +file-posix driver recognizes that if allocation is being queried, the +entire image can be reported as allocated (there is no backing file to +refer to) - but this throws away information on whether the entire +image reads as zero (trivially true if lseek(SEEK_HOLE) at offset 0 +returns -ENXIO, a bit more complicated to prove if the raw file was +created with 'qemu-img create' since we intentionally allocate a small +chunk of all-zero data to help with alignment probing). Later patches +will add a generic algorithm for seeing if an entire file reads as +zeroes. 
+ +Signed-off-by: Eric Blake +Reviewed-by: Stefan Hajnoczi +Message-ID: <20250509204341.3553601-16-eblake@redhat.com> +(cherry picked from commit c33159dec79069514f78faecfe268439226b0f5b) +Jira: https://issues.redhat.com/browse/RHEL-82906 +Jira: https://issues.redhat.com/browse/RHEL-83015 +Signed-off-by: Eric Blake +--- + block/blkdebug.c | 6 ++-- + block/copy-before-write.c | 4 +-- + block/coroutines.h | 4 +-- + block/file-posix.c | 4 +-- + block/gluster.c | 4 +-- + block/io.c | 51 ++++++++++++++++---------------- + block/iscsi.c | 6 ++-- + block/nbd.c | 4 +-- + block/null.c | 6 ++-- + block/parallels.c | 6 ++-- + block/qcow.c | 2 +- + block/qcow2.c | 6 ++-- + block/qed.c | 6 ++-- + block/quorum.c | 4 +-- + block/raw-format.c | 4 +-- + block/rbd.c | 6 ++-- + block/snapshot-access.c | 4 +-- + block/vdi.c | 4 +-- + block/vmdk.c | 2 +- + block/vpc.c | 2 +- + block/vvfat.c | 6 ++-- + include/block/block-common.h | 11 +++++++ + include/block/block_int-common.h | 27 +++++++++-------- + include/block/block_int-io.h | 4 +-- + tests/unit/test-block-iothread.c | 2 +- + 25 files changed, 99 insertions(+), 86 deletions(-) + +diff --git a/block/blkdebug.c b/block/blkdebug.c +index c95c818c38..736ae2b56b 100644 +--- a/block/blkdebug.c ++++ b/block/blkdebug.c +@@ -751,9 +751,9 @@ blkdebug_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) + } + + static int coroutine_fn GRAPH_RDLOCK +-blkdebug_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset, +- int64_t bytes, int64_t *pnum, int64_t *map, +- BlockDriverState **file) ++blkdebug_co_block_status(BlockDriverState *bs, unsigned int mode, ++ int64_t offset, int64_t bytes, int64_t *pnum, ++ int64_t *map, BlockDriverState **file) + { + int err; + +diff --git a/block/copy-before-write.c b/block/copy-before-write.c +index 853e01a1eb..36488cdeca 100644 +--- a/block/copy-before-write.c ++++ b/block/copy-before-write.c +@@ -290,8 +290,8 @@ cbw_co_preadv_snapshot(BlockDriverState *bs, int64_t offset, int64_t 
bytes, + } + + static int coroutine_fn GRAPH_RDLOCK +-cbw_co_snapshot_block_status(BlockDriverState *bs, +- bool want_zero, int64_t offset, int64_t bytes, ++cbw_co_snapshot_block_status(BlockDriverState *bs, unsigned int mode, ++ int64_t offset, int64_t bytes, + int64_t *pnum, int64_t *map, + BlockDriverState **file) + { +diff --git a/block/coroutines.h b/block/coroutines.h +index f3226682d6..811ef12e43 100644 +--- a/block/coroutines.h ++++ b/block/coroutines.h +@@ -47,7 +47,7 @@ int coroutine_fn GRAPH_RDLOCK + bdrv_co_common_block_status_above(BlockDriverState *bs, + BlockDriverState *base, + bool include_base, +- bool want_zero, ++ unsigned int mode, + int64_t offset, + int64_t bytes, + int64_t *pnum, +@@ -78,7 +78,7 @@ int co_wrapper_mixed_bdrv_rdlock + bdrv_common_block_status_above(BlockDriverState *bs, + BlockDriverState *base, + bool include_base, +- bool want_zero, ++ unsigned int mode, + int64_t offset, + int64_t bytes, + int64_t *pnum, +diff --git a/block/file-posix.c b/block/file-posix.c +index f17a3f4d10..9ca55620ca 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -3277,7 +3277,7 @@ static int find_allocation(BlockDriverState *bs, off_t start, + * well exceed it. + */ + static int coroutine_fn raw_co_block_status(BlockDriverState *bs, +- bool want_zero, ++ unsigned int mode, + int64_t offset, + int64_t bytes, int64_t *pnum, + int64_t *map, +@@ -3293,7 +3293,7 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, + return ret; + } + +- if (!want_zero) { ++ if (mode != BDRV_WANT_PRECISE) { + *pnum = bytes; + *map = offset; + *file = bs; +diff --git a/block/gluster.c b/block/gluster.c +index f8b415f381..ae5c45666b 100644 +--- a/block/gluster.c ++++ b/block/gluster.c +@@ -1466,7 +1466,7 @@ exit: + * (Based on raw_co_block_status() from file-posix.c.) 
+ */ + static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs, +- bool want_zero, ++ unsigned int mode, + int64_t offset, + int64_t bytes, + int64_t *pnum, +@@ -1483,7 +1483,7 @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs, + return ret; + } + +- if (!want_zero) { ++ if (mode != BDRV_WANT_PRECISE) { + *pnum = bytes; + *map = offset; + *file = bs; +diff --git a/block/io.c b/block/io.c +index 3e189837a1..daaafe00d7 100644 +--- a/block/io.c ++++ b/block/io.c +@@ -2360,10 +2360,8 @@ int bdrv_flush_all(void) + * Drivers not implementing the functionality are assumed to not support + * backing files, hence all their sectors are reported as allocated. + * +- * If 'want_zero' is true, the caller is querying for mapping +- * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and +- * _ZERO where possible; otherwise, the result favors larger 'pnum', +- * with a focus on accurate BDRV_BLOCK_ALLOCATED. ++ * 'mode' serves as a hint as to which results are favored; see the ++ * BDRV_WANT_* macros for details. + * + * If 'offset' is beyond the end of the disk image the return value is + * BDRV_BLOCK_EOF and 'pnum' is set to 0. +@@ -2383,7 +2381,7 @@ int bdrv_flush_all(void) + * set to the host mapping and BDS corresponding to the guest offset. 
+ */ + static int coroutine_fn GRAPH_RDLOCK +-bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero, ++bdrv_co_do_block_status(BlockDriverState *bs, unsigned int mode, + int64_t offset, int64_t bytes, + int64_t *pnum, int64_t *map, BlockDriverState **file) + { +@@ -2472,7 +2470,7 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero, + local_file = bs; + local_map = aligned_offset; + } else { +- ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, ++ ret = bs->drv->bdrv_co_block_status(bs, mode, aligned_offset, + aligned_bytes, pnum, &local_map, + &local_file); + +@@ -2484,10 +2482,10 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero, + * the cache requires an RCU update, so double check here to avoid + * such an update if possible. + * +- * Check want_zero, because we only want to update the cache when we ++ * Check mode, because we only want to update the cache when we + * have accurate information about what is zero and what is data. + */ +- if (want_zero && ++ if (mode == BDRV_WANT_PRECISE && + ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) && + QLIST_EMPTY(&bs->children)) + { +@@ -2544,7 +2542,7 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero, + + if (ret & BDRV_BLOCK_RAW) { + assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); +- ret = bdrv_co_do_block_status(local_file, want_zero, local_map, ++ ret = bdrv_co_do_block_status(local_file, mode, local_map, + *pnum, pnum, &local_map, &local_file); + goto out; + } +@@ -2556,7 +2554,7 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero, + + if (!cow_bs) { + ret |= BDRV_BLOCK_ZERO; +- } else if (want_zero) { ++ } else if (mode == BDRV_WANT_PRECISE) { + int64_t size2 = bdrv_co_getlength(cow_bs); + + if (size2 >= 0 && offset >= size2) { +@@ -2565,14 +2563,14 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero, + } + } + +- if (want_zero && ret & BDRV_BLOCK_RECURSE && ++ if (mode == BDRV_WANT_PRECISE && ret & BDRV_BLOCK_RECURSE && 
+ local_file && local_file != bs && + (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && + (ret & BDRV_BLOCK_OFFSET_VALID)) { + int64_t file_pnum; + int ret2; + +- ret2 = bdrv_co_do_block_status(local_file, want_zero, local_map, ++ ret2 = bdrv_co_do_block_status(local_file, mode, local_map, + *pnum, &file_pnum, NULL, NULL); + if (ret2 >= 0) { + /* Ignore errors. This is just providing extra information, it +@@ -2623,7 +2621,7 @@ int coroutine_fn + bdrv_co_common_block_status_above(BlockDriverState *bs, + BlockDriverState *base, + bool include_base, +- bool want_zero, ++ unsigned int mode, + int64_t offset, + int64_t bytes, + int64_t *pnum, +@@ -2650,7 +2648,7 @@ bdrv_co_common_block_status_above(BlockDriverState *bs, + return 0; + } + +- ret = bdrv_co_do_block_status(bs, want_zero, offset, bytes, pnum, ++ ret = bdrv_co_do_block_status(bs, mode, offset, bytes, pnum, + map, file); + ++*depth; + if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) { +@@ -2667,7 +2665,7 @@ bdrv_co_common_block_status_above(BlockDriverState *bs, + for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base; + p = bdrv_filter_or_cow_bs(p)) + { +- ret = bdrv_co_do_block_status(p, want_zero, offset, bytes, pnum, ++ ret = bdrv_co_do_block_status(p, mode, offset, bytes, pnum, + map, file); + ++*depth; + if (ret < 0) { +@@ -2730,7 +2728,8 @@ int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, + BlockDriverState **file) + { + IO_CODE(); +- return bdrv_co_common_block_status_above(bs, base, false, true, offset, ++ return bdrv_co_common_block_status_above(bs, base, false, ++ BDRV_WANT_PRECISE, offset, + bytes, pnum, map, file, NULL); + } + +@@ -2761,8 +2760,9 @@ int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, + return 1; + } + +- ret = bdrv_co_common_block_status_above(bs, NULL, false, false, offset, +- bytes, &pnum, NULL, NULL, NULL); ++ ret = bdrv_co_common_block_status_above(bs, NULL, false, BDRV_WANT_ZERO, ++ offset, bytes, 
&pnum, NULL, NULL, ++ NULL); + + if (ret < 0) { + return ret; +@@ -2778,9 +2778,9 @@ int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset, + int64_t dummy; + IO_CODE(); + +- ret = bdrv_co_common_block_status_above(bs, bs, true, false, offset, +- bytes, pnum ? pnum : &dummy, NULL, +- NULL, NULL); ++ ret = bdrv_co_common_block_status_above(bs, bs, true, BDRV_WANT_ALLOCATED, ++ offset, bytes, pnum ? pnum : &dummy, ++ NULL, NULL, NULL); + if (ret < 0) { + return ret; + } +@@ -2813,7 +2813,8 @@ int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *bs, + int ret; + IO_CODE(); + +- ret = bdrv_co_common_block_status_above(bs, base, include_base, false, ++ ret = bdrv_co_common_block_status_above(bs, base, include_base, ++ BDRV_WANT_ALLOCATED, + offset, bytes, pnum, NULL, NULL, + &depth); + if (ret < 0) { +@@ -3710,8 +3711,8 @@ bdrv_co_preadv_snapshot(BdrvChild *child, int64_t offset, int64_t bytes, + } + + int coroutine_fn +-bdrv_co_snapshot_block_status(BlockDriverState *bs, +- bool want_zero, int64_t offset, int64_t bytes, ++bdrv_co_snapshot_block_status(BlockDriverState *bs, unsigned int mode, ++ int64_t offset, int64_t bytes, + int64_t *pnum, int64_t *map, + BlockDriverState **file) + { +@@ -3729,7 +3730,7 @@ bdrv_co_snapshot_block_status(BlockDriverState *bs, + } + + bdrv_inc_in_flight(bs); +- ret = drv->bdrv_co_snapshot_block_status(bs, want_zero, offset, bytes, ++ ret = drv->bdrv_co_snapshot_block_status(bs, mode, offset, bytes, + pnum, map, file); + bdrv_dec_in_flight(bs); + +diff --git a/block/iscsi.c b/block/iscsi.c +index 979bf90cb7..d7caa4b363 100644 +--- a/block/iscsi.c ++++ b/block/iscsi.c +@@ -694,9 +694,9 @@ out_unlock: + + + static int coroutine_fn iscsi_co_block_status(BlockDriverState *bs, +- bool want_zero, int64_t offset, +- int64_t bytes, int64_t *pnum, +- int64_t *map, ++ unsigned int mode, ++ int64_t offset, int64_t bytes, ++ int64_t *pnum, int64_t *map, + BlockDriverState **file) + { + IscsiLun *iscsilun = bs->opaque; 
+diff --git a/block/nbd.c b/block/nbd.c +index d464315766..a359aa236e 100644 +--- a/block/nbd.c ++++ b/block/nbd.c +@@ -1397,8 +1397,8 @@ nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) + } + + static int coroutine_fn GRAPH_RDLOCK nbd_client_co_block_status( +- BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes, +- int64_t *pnum, int64_t *map, BlockDriverState **file) ++ BlockDriverState *bs, unsigned int mode, int64_t offset, ++ int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file) + { + int ret, request_ret; + NBDExtent64 extent = { 0 }; +diff --git a/block/null.c b/block/null.c +index 4730acc1eb..95021230c8 100644 +--- a/block/null.c ++++ b/block/null.c +@@ -227,9 +227,9 @@ static int null_reopen_prepare(BDRVReopenState *reopen_state, + } + + static int coroutine_fn null_co_block_status(BlockDriverState *bs, +- bool want_zero, int64_t offset, +- int64_t bytes, int64_t *pnum, +- int64_t *map, ++ unsigned int mode, ++ int64_t offset, int64_t bytes, ++ int64_t *pnum, int64_t *map, + BlockDriverState **file) + { + BDRVNullState *s = bs->opaque; +diff --git a/block/parallels.c b/block/parallels.c +index 9205a0864f..22ea7834fd 100644 +--- a/block/parallels.c ++++ b/block/parallels.c +@@ -416,9 +416,9 @@ parallels_co_flush_to_os(BlockDriverState *bs) + } + + static int coroutine_fn GRAPH_RDLOCK +-parallels_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset, +- int64_t bytes, int64_t *pnum, int64_t *map, +- BlockDriverState **file) ++parallels_co_block_status(BlockDriverState *bs, unsigned int mode, ++ int64_t offset, int64_t bytes, int64_t *pnum, ++ int64_t *map, BlockDriverState **file) + { + BDRVParallelsState *s = bs->opaque; + int count; +diff --git a/block/qcow.c b/block/qcow.c +index c2f89db055..2e18c42d8f 100644 +--- a/block/qcow.c ++++ b/block/qcow.c +@@ -530,7 +530,7 @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate, + } + + static int coroutine_fn 
GRAPH_RDLOCK +-qcow_co_block_status(BlockDriverState *bs, bool want_zero, ++qcow_co_block_status(BlockDriverState *bs, unsigned int mode, + int64_t offset, int64_t bytes, int64_t *pnum, + int64_t *map, BlockDriverState **file) + { +diff --git a/block/qcow2.c b/block/qcow2.c +index a4cffb628c..788da07fee 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -2147,9 +2147,9 @@ static void qcow2_join_options(QDict *options, QDict *old_options) + } + + static int coroutine_fn GRAPH_RDLOCK +-qcow2_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset, +- int64_t count, int64_t *pnum, int64_t *map, +- BlockDriverState **file) ++qcow2_co_block_status(BlockDriverState *bs, unsigned int mode, ++ int64_t offset, int64_t count, int64_t *pnum, ++ int64_t *map, BlockDriverState **file) + { + BDRVQcow2State *s = bs->opaque; + uint64_t host_offset; +diff --git a/block/qed.c b/block/qed.c +index fa5bc11085..b135e981e5 100644 +--- a/block/qed.c ++++ b/block/qed.c +@@ -832,9 +832,9 @@ fail: + } + + static int coroutine_fn GRAPH_RDLOCK +-bdrv_qed_co_block_status(BlockDriverState *bs, bool want_zero, int64_t pos, +- int64_t bytes, int64_t *pnum, int64_t *map, +- BlockDriverState **file) ++bdrv_qed_co_block_status(BlockDriverState *bs, unsigned int mode, ++ int64_t pos, int64_t bytes, int64_t *pnum, ++ int64_t *map, BlockDriverState **file) + { + BDRVQEDState *s = bs->opaque; + size_t len = MIN(bytes, SIZE_MAX); +diff --git a/block/quorum.c b/block/quorum.c +index db8fe891c4..bb4ed9483e 100644 +--- a/block/quorum.c ++++ b/block/quorum.c +@@ -1226,7 +1226,7 @@ static void quorum_child_perm(BlockDriverState *bs, BdrvChild *c, + * region contains zeroes, and BDRV_BLOCK_DATA otherwise. 
+ */ + static int coroutine_fn GRAPH_RDLOCK +-quorum_co_block_status(BlockDriverState *bs, bool want_zero, ++quorum_co_block_status(BlockDriverState *bs, unsigned int mode, + int64_t offset, int64_t count, + int64_t *pnum, int64_t *map, BlockDriverState **file) + { +@@ -1238,7 +1238,7 @@ quorum_co_block_status(BlockDriverState *bs, bool want_zero, + for (i = 0; i < s->num_children; i++) { + int64_t bytes; + ret = bdrv_co_common_block_status_above(s->children[i]->bs, NULL, false, +- want_zero, offset, count, ++ mode, offset, count, + &bytes, NULL, NULL, NULL); + if (ret < 0) { + quorum_report_bad(QUORUM_OP_TYPE_READ, offset, count, +diff --git a/block/raw-format.c b/block/raw-format.c +index ac7e8495f6..623bca87a6 100644 +--- a/block/raw-format.c ++++ b/block/raw-format.c +@@ -283,8 +283,8 @@ fail: + } + + static int coroutine_fn GRAPH_RDLOCK +-raw_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset, +- int64_t bytes, int64_t *pnum, int64_t *map, ++raw_co_block_status(BlockDriverState *bs, unsigned int mode, ++ int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map, + BlockDriverState **file) + { + BDRVRawState *s = bs->opaque; +diff --git a/block/rbd.c b/block/rbd.c +index 9c0fd0cb3f..627f8eb05a 100644 +--- a/block/rbd.c ++++ b/block/rbd.c +@@ -1504,9 +1504,9 @@ static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len, + } + + static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs, +- bool want_zero, int64_t offset, +- int64_t bytes, int64_t *pnum, +- int64_t *map, ++ unsigned int mode, ++ int64_t offset, int64_t bytes, ++ int64_t *pnum, int64_t *map, + BlockDriverState **file) + { + BDRVRBDState *s = bs->opaque; +diff --git a/block/snapshot-access.c b/block/snapshot-access.c +index 84d0d13f86..972b8f2e68 100644 +--- a/block/snapshot-access.c ++++ b/block/snapshot-access.c +@@ -41,11 +41,11 @@ snapshot_access_co_preadv_part(BlockDriverState *bs, + + static int coroutine_fn GRAPH_RDLOCK + 
snapshot_access_co_block_status(BlockDriverState *bs, +- bool want_zero, int64_t offset, ++ unsigned int mode, int64_t offset, + int64_t bytes, int64_t *pnum, + int64_t *map, BlockDriverState **file) + { +- return bdrv_co_snapshot_block_status(bs->file->bs, want_zero, offset, ++ return bdrv_co_snapshot_block_status(bs->file->bs, mode, offset, + bytes, pnum, map, file); + } + +diff --git a/block/vdi.c b/block/vdi.c +index 6363da08ce..028fe68488 100644 +--- a/block/vdi.c ++++ b/block/vdi.c +@@ -521,8 +521,8 @@ static int vdi_reopen_prepare(BDRVReopenState *state, + } + + static int coroutine_fn GRAPH_RDLOCK +-vdi_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset, +- int64_t bytes, int64_t *pnum, int64_t *map, ++vdi_co_block_status(BlockDriverState *bs, unsigned int mode, ++ int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map, + BlockDriverState **file) + { + BDRVVdiState *s = (BDRVVdiState *)bs->opaque; +diff --git a/block/vmdk.c b/block/vmdk.c +index 78f6433607..6f1af82078 100644 +--- a/block/vmdk.c ++++ b/block/vmdk.c +@@ -1777,7 +1777,7 @@ static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + } + + static int coroutine_fn GRAPH_RDLOCK +-vmdk_co_block_status(BlockDriverState *bs, bool want_zero, ++vmdk_co_block_status(BlockDriverState *bs, unsigned int mode, + int64_t offset, int64_t bytes, int64_t *pnum, + int64_t *map, BlockDriverState **file) + { +diff --git a/block/vpc.c b/block/vpc.c +index d95a204612..0dd641b614 100644 +--- a/block/vpc.c ++++ b/block/vpc.c +@@ -721,7 +721,7 @@ fail: + } + + static int coroutine_fn GRAPH_RDLOCK +-vpc_co_block_status(BlockDriverState *bs, bool want_zero, ++vpc_co_block_status(BlockDriverState *bs, unsigned int mode, + int64_t offset, int64_t bytes, + int64_t *pnum, int64_t *map, + BlockDriverState **file) +diff --git a/block/vvfat.c b/block/vvfat.c +index 8ffe8b3b9b..d59231357e 100644 +--- a/block/vvfat.c ++++ b/block/vvfat.c +@@ -3135,9 +3135,9 @@ vvfat_co_pwritev(BlockDriverState 
*bs, int64_t offset, int64_t bytes, + } + + static int coroutine_fn vvfat_co_block_status(BlockDriverState *bs, +- bool want_zero, int64_t offset, +- int64_t bytes, int64_t *n, +- int64_t *map, ++ unsigned int mode, ++ int64_t offset, int64_t bytes, ++ int64_t *n, int64_t *map, + BlockDriverState **file) + { + *n = bytes; +diff --git a/include/block/block-common.h b/include/block/block-common.h +index 7030669f04..5beee6402b 100644 +--- a/include/block/block-common.h ++++ b/include/block/block-common.h +@@ -333,6 +333,17 @@ typedef enum { + #define BDRV_BLOCK_RECURSE 0x40 + #define BDRV_BLOCK_COMPRESSED 0x80 + ++/* ++ * Block status hints: the bitwise-or of these flags emphasize what ++ * the caller hopes to learn, and some drivers may be able to give ++ * faster answers by doing less work when the hint permits. ++ */ ++#define BDRV_WANT_ZERO BDRV_BLOCK_ZERO ++#define BDRV_WANT_OFFSET_VALID BDRV_BLOCK_OFFSET_VALID ++#define BDRV_WANT_ALLOCATED BDRV_BLOCK_ALLOCATED ++#define BDRV_WANT_PRECISE (BDRV_WANT_ZERO | BDRV_WANT_OFFSET_VALID | \ ++ BDRV_WANT_OFFSET_VALID) ++ + typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue; + + typedef struct BDRVReopenState { +diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h +index ebb4e56a50..a9c0daa2a4 100644 +--- a/include/block/block_int-common.h ++++ b/include/block/block_int-common.h +@@ -608,15 +608,16 @@ struct BlockDriver { + * according to the current layer, and should only need to set + * BDRV_BLOCK_DATA, BDRV_BLOCK_ZERO, BDRV_BLOCK_OFFSET_VALID, + * and/or BDRV_BLOCK_RAW; if the current layer defers to a backing +- * layer, the result should be 0 (and not BDRV_BLOCK_ZERO). See +- * block.h for the overall meaning of the bits. As a hint, the +- * flag want_zero is true if the caller cares more about precise +- * mappings (favor accurate _OFFSET_VALID/_ZERO) or false for +- * overall allocation (favor larger *pnum, perhaps by reporting +- * _DATA instead of _ZERO). 
The block layer guarantees input +- * clamped to bdrv_getlength() and aligned to request_alignment, +- * as well as non-NULL pnum, map, and file; in turn, the driver +- * must return an error or set pnum to an aligned non-zero value. ++ * layer, the result should be 0 (and not BDRV_BLOCK_ZERO). The ++ * caller will synthesize BDRV_BLOCK_ALLOCATED based on the ++ * non-zero results. See block.h for the overall meaning of the ++ * bits. As a hint, the flags in @mode may include a bitwise-or ++ * of BDRV_WANT_ALLOCATED, BDRV_WANT_OFFSET_VALID, or ++ * BDRV_WANT_ZERO based on what the caller is looking for in the ++ * results. The block layer guarantees input clamped to ++ * bdrv_getlength() and aligned to request_alignment, as well as ++ * non-NULL pnum, map, and file; in turn, the driver must return ++ * an error or set pnum to an aligned non-zero value. + * + * Note that @bytes is just a hint on how big of a region the + * caller wants to inspect. It is not a limit on *pnum. +@@ -628,8 +629,8 @@ struct BlockDriver { + * to clamping *pnum for return to its caller. 
+ */ + int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_block_status)( +- BlockDriverState *bs, +- bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum, ++ BlockDriverState *bs, unsigned int mode, ++ int64_t offset, int64_t bytes, int64_t *pnum, + int64_t *map, BlockDriverState **file); + + /* +@@ -653,8 +654,8 @@ struct BlockDriver { + QEMUIOVector *qiov, size_t qiov_offset); + + int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_snapshot_block_status)( +- BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes, +- int64_t *pnum, int64_t *map, BlockDriverState **file); ++ BlockDriverState *bs, unsigned int mode, int64_t offset, ++ int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file); + + int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_pdiscard_snapshot)( + BlockDriverState *bs, int64_t offset, int64_t bytes); +diff --git a/include/block/block_int-io.h b/include/block/block_int-io.h +index 4a7cf2b4fd..4f94eb3c5a 100644 +--- a/include/block/block_int-io.h ++++ b/include/block/block_int-io.h +@@ -38,8 +38,8 @@ + int coroutine_fn GRAPH_RDLOCK bdrv_co_preadv_snapshot(BdrvChild *child, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset); + int coroutine_fn GRAPH_RDLOCK bdrv_co_snapshot_block_status( +- BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes, +- int64_t *pnum, int64_t *map, BlockDriverState **file); ++ BlockDriverState *bs, unsigned int mode, int64_t offset, ++ int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file); + int coroutine_fn GRAPH_RDLOCK bdrv_co_pdiscard_snapshot(BlockDriverState *bs, + int64_t offset, int64_t bytes); + +diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c +index 3766d5de6b..373b72fdd8 100644 +--- a/tests/unit/test-block-iothread.c ++++ b/tests/unit/test-block-iothread.c +@@ -63,7 +63,7 @@ bdrv_test_co_truncate(BlockDriverState *bs, int64_t offset, bool exact, + } + + static int coroutine_fn bdrv_test_co_block_status(BlockDriverState 
*bs, +- bool want_zero, ++ unsigned int mode, + int64_t offset, int64_t count, + int64_t *pnum, int64_t *map, + BlockDriverState **file) +-- +2.48.1 + diff --git a/SOURCES/kvm-block-Let-bdrv_co_is_zero_fast-consolidate-adjacent-.patch b/SOURCES/kvm-block-Let-bdrv_co_is_zero_fast-consolidate-adjacent-.patch new file mode 100644 index 0000000..8b86f66 --- /dev/null +++ b/SOURCES/kvm-block-Let-bdrv_co_is_zero_fast-consolidate-adjacent-.patch @@ -0,0 +1,90 @@ +From 9f8158e56beae4221e91feb5a98cb4db9076cac4 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Fri, 9 May 2025 15:40:20 -0500 +Subject: [PATCH 05/16] block: Let bdrv_co_is_zero_fast consolidate adjacent + extents + +RH-Author: Eric Blake +RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors +RH-Jira: RHEL-82906 RHEL-83015 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Jon Maloy +RH-Commit: [3/14] 98bf9ff773d9a36f8a8e294e38629e3f20c41334 (ebblake/centos-qemu-kvm) + +Some BDS drivers have a cap on how much block status they can supply +in one query (for example, NBD talking to an older server cannot +inspect more than 4G per query; and qcow2 tends to cap its answers +rather than cross a cluster boundary of an L1 table). Although the +existing callers of bdrv_co_is_zero_fast are not passing in that large +of a 'bytes' parameter, an upcoming caller wants to query the entire +image at once, and will thus benefit from being able to treat adjacent +zero regions in a coalesced manner, rather than claiming the region is +non-zero merely because pnum was truncated and didn't match the +incoming bytes. + +While refactoring this into a loop, note that there is no need to +assign pnum prior to calling bdrv_co_common_block_status_above() (it +is guaranteed to be assigned deeper in the callstack). 
+ +Signed-off-by: Eric Blake +Reviewed-by: Stefan Hajnoczi +Message-ID: <20250509204341.3553601-18-eblake@redhat.com> +(cherry picked from commit 31bf15d97dd1d205a3b264675f9a1b3bd1939068) +Jira: https://issues.redhat.com/browse/RHEL-82906 +Jira: https://issues.redhat.com/browse/RHEL-83015 +Signed-off-by: Eric Blake +--- + block/io.c | 27 +++++++++++++++------------ + 1 file changed, 15 insertions(+), 12 deletions(-) + +diff --git a/block/io.c b/block/io.c +index daaafe00d7..293c5dd393 100644 +--- a/block/io.c ++++ b/block/io.c +@@ -2747,28 +2747,31 @@ int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, int64_t offset, + * by @offset and @bytes is known to read as zeroes. + * Return 1 if that is the case, 0 otherwise and -errno on error. + * This test is meant to be fast rather than accurate so returning 0 +- * does not guarantee non-zero data. ++ * does not guarantee non-zero data; but a return of 1 is reliable. + */ + int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, + int64_t bytes) + { + int ret; +- int64_t pnum = bytes; ++ int64_t pnum; + IO_CODE(); + +- if (!bytes) { +- return 1; +- } +- +- ret = bdrv_co_common_block_status_above(bs, NULL, false, BDRV_WANT_ZERO, +- offset, bytes, &pnum, NULL, NULL, +- NULL); ++ while (bytes) { ++ ret = bdrv_co_common_block_status_above(bs, NULL, false, ++ BDRV_WANT_ZERO, offset, bytes, ++ &pnum, NULL, NULL, NULL); + +- if (ret < 0) { +- return ret; ++ if (ret < 0) { ++ return ret; ++ } ++ if (!(ret & BDRV_BLOCK_ZERO)) { ++ return 0; ++ } ++ offset += pnum; ++ bytes -= pnum; + } + +- return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO); ++ return 1; + } + + int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset, +-- +2.48.1 + diff --git a/SOURCES/kvm-block-io-skip-head-tail-requests-on-EINVAL.patch b/SOURCES/kvm-block-io-skip-head-tail-requests-on-EINVAL.patch new file mode 100644 index 0000000..42e6ecf --- /dev/null +++ 
b/SOURCES/kvm-block-io-skip-head-tail-requests-on-EINVAL.patch @@ -0,0 +1,74 @@ +From e629a362860977161e43ed80bb59d1d05a06b2f2 Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Thu, 17 Apr 2025 11:05:28 -0400 +Subject: [PATCH 4/5] block/io: skip head/tail requests on EINVAL + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 355: file-posix: probe discard alignment on Linux block devices +RH-Jira: RHEL-86032 +RH-Acked-by: Kevin Wolf +RH-Acked-by: Eric Blake +RH-Commit: [2/3] 0028fb11f18e16e2aba9506eabb2383c406d17b5 (stefanha/centos-stream-qemu-kvm) + +When guests send misaligned discard requests, the block layer breaks +them up into a misaligned head, an aligned main body, and a misaligned +tail. + +The file-posix block driver on Linux returns -EINVAL on misaligned +discard requests. This causes bdrv_co_pdiscard() to fail and guests +configured with werror=stop will pause. + +Add a special case for misaligned head/tail requests. Simply continue +when EINVAL is encountered so that the aligned main body of the request +can be completed and the guest is not paused. This is the best we can do +when guest discard limits do not match the host discard limits. 
+ +Fixes: https://issues.redhat.com/browse/RHEL-86032 +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Hanna Czenczek +Message-ID: <20250417150528.76470-3-stefanha@redhat.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit 4733cb0833c4b223f92ec0136980eeb5239ecb87) +Signed-off-by: Stefan Hajnoczi +--- + block/io.c | 15 ++++++++++----- + 1 file changed, 10 insertions(+), 5 deletions(-) + +diff --git a/block/io.c b/block/io.c +index 301514c880..3e189837a1 100644 +--- a/block/io.c ++++ b/block/io.c +@@ -3105,11 +3105,12 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, + /* Invalidate the cached block-status data range if this discard overlaps */ + bdrv_bsc_invalidate_range(bs, offset, bytes); + +- /* Discard is advisory, but some devices track and coalesce ++ /* ++ * Discard is advisory, but some devices track and coalesce + * unaligned requests, so we must pass everything down rather than +- * round here. Still, most devices will just silently ignore +- * unaligned requests (by returning -ENOTSUP), so we must fragment +- * the request accordingly. */ ++ * round here. Still, most devices reject unaligned requests with ++ * -EINVAL or -ENOTSUP, so we must fragment the request accordingly. 
++ */ + align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); + assert(align % bs->bl.request_alignment == 0); + head = offset % align; +@@ -3176,7 +3177,11 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, + } + } + if (ret && ret != -ENOTSUP) { +- goto out; ++ if (ret == -EINVAL && (offset % align != 0 || num % align != 0)) { ++ /* Silently skip rejected unaligned head/tail requests */ ++ } else { ++ goto out; /* bail out */ ++ } + } + + offset += num; +-- +2.48.1 + diff --git a/SOURCES/kvm-block-skip-automatic-zero-init-of-large-array-in-ioq.patch b/SOURCES/kvm-block-skip-automatic-zero-init-of-large-array-in-ioq.patch new file mode 100644 index 0000000..72c7a02 --- /dev/null +++ b/SOURCES/kvm-block-skip-automatic-zero-init-of-large-array-in-ioq.patch @@ -0,0 +1,48 @@ +From d38bdce712f572e1920e3344132ff6600d657de2 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:41 +0100 +Subject: [PATCH 29/57] block: skip automatic zero-init of large array in + ioq_submit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [3/30] 301a08b3acdcd95634dec5dab1d96fcfe3abf3be (stefanha/centos-stream-qemu-kvm) + +The 'ioq_submit' method has a struct array that is 8k in size. +Skip the automatic zero-init of this array to eliminate the +performance overhead in the I/O hot path. + +The 'iocbs' array will be selectively initialized when processing +the I/O data. + +Signed-off-by: Daniel P. 
Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-4-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 83750c1da807c973b0b11d977d61df7e41122d03) +Signed-off-by: Stefan Hajnoczi +--- + block/linux-aio.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/block/linux-aio.c b/block/linux-aio.c +index e3b5ec9aba..26d9f086d2 100644 +--- a/block/linux-aio.c ++++ b/block/linux-aio.c +@@ -291,7 +291,7 @@ static void ioq_submit(LinuxAioState *s) + { + int ret, len; + struct qemu_laiocb *aiocb; +- struct iocb *iocbs[MAX_EVENTS]; ++ QEMU_UNINITIALIZED struct iocb *iocbs[MAX_EVENTS]; + QSIMPLEQ_HEAD(, qemu_laiocb) completed; + + do { +-- +2.39.3 + diff --git a/SOURCES/kvm-chardev-char-fd-skip-automatic-zero-init-of-large-ar.patch b/SOURCES/kvm-chardev-char-fd-skip-automatic-zero-init-of-large-ar.patch new file mode 100644 index 0000000..4d56bf1 --- /dev/null +++ b/SOURCES/kvm-chardev-char-fd-skip-automatic-zero-init-of-large-ar.patch @@ -0,0 +1,49 @@ +From 1e8798a3adbbfc42167aaba0ee18175deac37193 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:42 +0100 +Subject: [PATCH 30/57] chardev/char-fd: skip automatic zero-init of large + array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [4/30] b16fe5c9af4756e1856cd330df02a1a09d9f33ea (stefanha/centos-stream-qemu-kvm) + +The 'fd_chr_read' method has a 4k byte array used for copying +data between the socket and device. Skip the automatic zero-init +of this array to eliminate the performance overhead in the I/O +hot path. + +The 'buf' array will be fully initialized when reading data off +the network socket. + +Signed-off-by: Daniel P. 
Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-5-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit a503bdc22b91869e3bf45522e36b122889465306) +Signed-off-by: Stefan Hajnoczi +--- + chardev/char-fd.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/chardev/char-fd.c b/chardev/char-fd.c +index d2c4923359..8dd662c066 100644 +--- a/chardev/char-fd.c ++++ b/chardev/char-fd.c +@@ -50,7 +50,7 @@ static gboolean fd_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque) + Chardev *chr = CHARDEV(opaque); + FDChardev *s = FD_CHARDEV(opaque); + int len; +- uint8_t buf[CHR_READ_BUF_LEN]; ++ QEMU_UNINITIALIZED uint8_t buf[CHR_READ_BUF_LEN]; + ssize_t ret; + + len = sizeof(buf); +-- +2.39.3 + diff --git a/SOURCES/kvm-chardev-char-pty-skip-automatic-zero-init-of-large-a.patch b/SOURCES/kvm-chardev-char-pty-skip-automatic-zero-init-of-large-a.patch new file mode 100644 index 0000000..7edacc8 --- /dev/null +++ b/SOURCES/kvm-chardev-char-pty-skip-automatic-zero-init-of-large-a.patch @@ -0,0 +1,49 @@ +From 74311b0ee8e211fccff211b975e4ae9236c063dc Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:43 +0100 +Subject: [PATCH 31/57] chardev/char-pty: skip automatic zero-init of large + array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [5/30] a3b8458c30f485551093f292c00c20b0e118df77 (stefanha/centos-stream-qemu-kvm) + +The 'pty_chr_read' method has a 4k byte array used for copying +data between the PTY and device. Skip the automatic zero-init +of this array to eliminate the performance overhead in the I/O +hot path. + +The 'buf' array will be fully initialized when reading data off +the PTY. 
+ +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-6-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 45bb7fb21c8d18294a9f92da99d01ab3c67c7df2) +Signed-off-by: Stefan Hajnoczi +--- + chardev/char-pty.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/chardev/char-pty.c b/chardev/char-pty.c +index cc2f7617fe..3319ad215d 100644 +--- a/chardev/char-pty.c ++++ b/chardev/char-pty.c +@@ -152,7 +152,7 @@ static gboolean pty_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque) + Chardev *chr = CHARDEV(opaque); + PtyChardev *s = PTY_CHARDEV(opaque); + gsize len; +- uint8_t buf[CHR_READ_BUF_LEN]; ++ QEMU_UNINITIALIZED uint8_t buf[CHR_READ_BUF_LEN]; + ssize_t ret; + + len = sizeof(buf); +-- +2.39.3 + diff --git a/SOURCES/kvm-chardev-char-socket-skip-automatic-zero-init-of-larg.patch b/SOURCES/kvm-chardev-char-socket-skip-automatic-zero-init-of-larg.patch new file mode 100644 index 0000000..3b6889b --- /dev/null +++ b/SOURCES/kvm-chardev-char-socket-skip-automatic-zero-init-of-larg.patch @@ -0,0 +1,49 @@ +From d56a8ce56f0de70ab2de266a80e25cf309e72fda Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:44 +0100 +Subject: [PATCH 32/57] chardev/char-socket: skip automatic zero-init of large + array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [6/30] 86a2ac03efa1838fb30931c38945ee77de9bbe06 (stefanha/centos-stream-qemu-kvm) + +The 'tcp_chr_read' method has a 4k byte array used for copying +data between the socket and device. Skip the automatic zero-init +of this array to eliminate the performance overhead in the I/O +hot path. 
+ +The 'buf' array will be fully initialized when reading data off +the network socket. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-7-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 9a23075cef1ac6e73a95a489ac72f41c573ceb9b) +Signed-off-by: Stefan Hajnoczi +--- + chardev/char-socket.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/chardev/char-socket.c b/chardev/char-socket.c +index 1ca9441b1b..99d644e89f 100644 +--- a/chardev/char-socket.c ++++ b/chardev/char-socket.c +@@ -497,7 +497,7 @@ static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque) + { + Chardev *chr = CHARDEV(opaque); + SocketChardev *s = SOCKET_CHARDEV(opaque); +- uint8_t buf[CHR_READ_BUF_LEN]; ++ QEMU_UNINITIALIZED uint8_t buf[CHR_READ_BUF_LEN]; + int len, size; + + if ((s->state != TCP_CHARDEV_STATE_CONNECTED) || +-- +2.39.3 + diff --git a/SOURCES/kvm-docs-devel-reset-Document-reset-expectations-for-DMA.patch b/SOURCES/kvm-docs-devel-reset-Document-reset-expectations-for-DMA.patch new file mode 100644 index 0000000..fcd1246 --- /dev/null +++ b/SOURCES/kvm-docs-devel-reset-Document-reset-expectations-for-DMA.patch @@ -0,0 +1,53 @@ +From 389c3c6b4215c9be3fd784c73af0e9795e796380 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 18 Feb 2025 19:25:35 +0100 +Subject: [PATCH 5/9] docs/devel/reset: Document reset expectations for DMA and + IOMMU +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eric Auger +RH-MergeRequest: 341: Fix vIOMMU reset order +RH-Jira: RHEL-7188 +RH-Acked-by: Peter Xu +RH-Acked-by: Donald Dutile +RH-Acked-by: Cédric Le Goater +RH-Commit: [5/5] be8b9d9e34a2b301430dfa229c6785ab17d3fb16 (eauger1/centos-qemu-kvm) + +To avoid any translation faults, the IOMMUs are expected to be +reset after the devices they protect. 
Document that we expect +DMA requests to be stopped during the 'enter' or 'hold' phase +while IOMMUs should be reset during the 'exit' phase. + +Signed-off-by: Eric Auger +Reviewed-by: Zhenzhong Duan +Message-Id: <20250218182737.76722-6-eric.auger@redhat.com> +Reviewed-by: Peter Xu +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit dd6d545e8f2d9a0e8a8c287ec16469f03ef5c198) +Signed-off-by: Eric Auger +--- + docs/devel/reset.rst | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/docs/devel/reset.rst b/docs/devel/reset.rst +index 9746a4e8a0..24ab630465 100644 +--- a/docs/devel/reset.rst ++++ b/docs/devel/reset.rst +@@ -123,6 +123,11 @@ The *exit* phase is executed only when the last reset operation ends. Therefore + the object does not need to care how many of reset controllers it has and how + many of them have started a reset. + ++DMA capable devices are expected to cancel all outstanding DMA operations ++during either 'enter' or 'hold' phases. IOMMUs are expected to reset during ++the 'exit' phase and this sequencing makes sure no outstanding DMA request ++will fault. 
++ + + Handling reset in a resettable object + ------------------------------------- +-- +2.48.1 + diff --git a/SOURCES/kvm-file-posix-Define-DM_MPATH_PROBE_PATHS.patch b/SOURCES/kvm-file-posix-Define-DM_MPATH_PROBE_PATHS.patch new file mode 100644 index 0000000..08287d4 --- /dev/null +++ b/SOURCES/kvm-file-posix-Define-DM_MPATH_PROBE_PATHS.patch @@ -0,0 +1,42 @@ +From d565fe385b3c45a41fa8e25942220aff38a04fc3 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Tue, 29 Apr 2025 17:05:41 +0200 +Subject: [PATCH 2/3] file-posix: Define DM_MPATH_PROBE_PATHS + +RH-Author: Kevin Wolf +RH-MergeRequest: 372: file-posix: Fix multipath failover with SCSI passthrough [9.7] +RH-Jira: RHEL-95408 +RH-Acked-by: Hanna Czenczek +RH-Acked-by: Stefan Hajnoczi +RH-Commit: [1/2] 7615906833a6bb2b4645fa5cd60d78aa9631cb7c (kmwolf/centos-qemu-kvm) + +While the kernel side isn't merged yet and we're still using old kernel +headers, just define DM_MPATH_PROBE_PATHS manually. + +This is a downstream-only patch that can be removed after the next minor +release. 
+ +Signed-off-by: Kevin Wolf +--- + block/file-posix.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 0cb4e922c0..6a5c506549 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -134,6 +134,11 @@ + #define RAW_LOCK_PERM_BASE 100 + #define RAW_LOCK_SHARED_BASE 200 + ++/* TODO Remove this when the kernel side is merged */ ++#if !defined(DM_MPATH_PROBE_PATHS) && defined(DM_GET_TARGET_VERSION) ++#define DM_MPATH_PROBE_PATHS _IO(DM_IOCTL, DM_GET_TARGET_VERSION_CMD + 1) ++#endif ++ + typedef struct BDRVRawState { + int fd; + bool use_lock; +-- +2.48.1 + diff --git a/SOURCES/kvm-file-posix-Fix-crash-on-discard_granularity-0.patch b/SOURCES/kvm-file-posix-Fix-crash-on-discard_granularity-0.patch new file mode 100644 index 0000000..8a45dcc --- /dev/null +++ b/SOURCES/kvm-file-posix-Fix-crash-on-discard_granularity-0.patch @@ -0,0 +1,46 @@ +From 3515c6541f71817727a3a8b18ec5252644b51bc0 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Tue, 29 Apr 2025 17:56:54 +0200 +Subject: [PATCH 5/5] file-posix: Fix crash on discard_granularity == 0 + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 355: file-posix: probe discard alignment on Linux block devices +RH-Jira: RHEL-86032 +RH-Acked-by: Kevin Wolf +RH-Acked-by: Eric Blake +RH-Commit: [3/3] b8139a4c5b19efff1f15c314447a6abb89db0ae7 (stefanha/centos-stream-qemu-kvm) + +Block devices that don't support discard have a discard_granularity of +0. Currently, this results in a division by zero when we try to make +sure that it's a multiple of request_alignment. Only try to update +bs->bl.pdiscard_alignment when we got a non-zero discard_granularity +from sysfs. 
+ +Fixes: f605796aae4 ('file-posix: probe discard alignment on Linux block devices') +Signed-off-by: Kevin Wolf +Reviewed-by: Stefan Hajnoczi +Reviewed-by: Eric Blake +Message-ID: <20250429155654.102735-1-kwolf@redhat.com> +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 71a30d54e6ab1d5c102a8bee2c263414697402ea) +Signed-off-by: Stefan Hajnoczi +--- + block/file-posix.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 3d5b024459..0cb4e922c0 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -1565,7 +1565,7 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) + int ret; + + ret = hdev_get_pdiscard_alignment(&st, &dalign); +- if (ret == 0) { ++ if (ret == 0 && dalign != 0) { + uint32_t ralign = bs->bl.request_alignment; + + /* Probably never happens, but handle it just in case */ +-- +2.48.1 + diff --git a/SOURCES/kvm-file-posix-Probe-paths-and-retry-SG_IO-on-potential-.patch b/SOURCES/kvm-file-posix-Probe-paths-and-retry-SG_IO-on-potential-.patch new file mode 100644 index 0000000..bd716a1 --- /dev/null +++ b/SOURCES/kvm-file-posix-Probe-paths-and-retry-SG_IO-on-potential-.patch @@ -0,0 +1,215 @@ +From 95c651ba1177bd88dbd9b52fe2ec8fedadcdb5c8 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Thu, 22 May 2025 15:08:03 +0200 +Subject: [PATCH 3/3] file-posix: Probe paths and retry SG_IO on potential path + errors + +RH-Author: Kevin Wolf +RH-MergeRequest: 372: file-posix: Fix multipath failover with SCSI passthrough [9.7] +RH-Jira: RHEL-95408 +RH-Acked-by: Hanna Czenczek +RH-Acked-by: Stefan Hajnoczi +RH-Commit: [2/2] 4312e9ec609e511afdfb6634e1d2370032d41543 (kmwolf/centos-qemu-kvm) + +When scsi-block is used on a host multipath device, it runs into the +problem that the kernel dm-mpath doesn't know anything about SCSI or +SG_IO and therefore can't decide if a SG_IO request returned an error +and needs to be retried on a different path. 
Instead of getting working +failover, an error is returned to scsi-block and handled according to +the configured error policy. Obviously, this is not what users want, +they want working failover. + +QEMU can parse the SG_IO result and determine whether this could have +been a path error, but just retrying the same request could just send it +to the same failing path again and result in the same error. + +With a kernel that supports the DM_MPATH_PROBE_PATHS ioctl on dm-mpath +block devices (queued in the device mapper tree for Linux 6.16), we can +tell the kernel to probe all paths and tell us if any usable paths +remained. If so, we can now retry the SG_IO ioctl and expect it to be +sent to a working path. + +Signed-off-by: Kevin Wolf +Message-ID: <20250522130803.34738-1-kwolf@redhat.com> +Reviewed-by: Stefan Hajnoczi +Reviewed-by: Hanna Czenczek +Signed-off-by: Kevin Wolf +(cherry picked from commit bf627788ef17721955bfcfba84209a07ae5f54ea) +Signed-off-by: Kevin Wolf +--- + block/file-posix.c | 115 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 114 insertions(+), 1 deletion(-) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 6a5c506549..f17a3f4d10 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -41,6 +41,7 @@ + + #include "scsi/pr-manager.h" + #include "scsi/constants.h" ++#include "scsi/utils.h" + + #if defined(__APPLE__) && (__MACH__) + #include +@@ -72,6 +73,7 @@ + #include + #endif + #include ++#include + #include + #include + #include +@@ -139,6 +141,22 @@ + #define DM_MPATH_PROBE_PATHS _IO(DM_IOCTL, DM_GET_TARGET_VERSION_CMD + 1) + #endif + ++/* ++ * Multiple retries are mostly meant for two separate scenarios: ++ * ++ * - DM_MPATH_PROBE_PATHS returns success, but before SG_IO completes, another ++ * path goes down. ++ * ++ * - DM_MPATH_PROBE_PATHS failed all paths in the current path group, so we have ++ * to send another SG_IO to switch to another path group to probe the paths in ++ * it. 
++ * ++ * Even if each path is in a separate path group (path_grouping_policy set to ++ * failover), it's rare to have more than eight path groups - and even then ++ * pretty unlikely that only bad path groups would be chosen in eight retries. ++ */ ++#define SG_IO_MAX_RETRIES 8 ++ + typedef struct BDRVRawState { + int fd; + bool use_lock; +@@ -166,6 +184,7 @@ typedef struct BDRVRawState { + bool use_linux_aio:1; + bool has_laio_fdsync:1; + bool use_linux_io_uring:1; ++ bool use_mpath:1; + int page_cache_inconsistent; /* errno from fdatasync failure */ + bool has_fallocate; + bool needs_alignment; +@@ -4248,15 +4267,105 @@ hdev_open_Mac_error: + /* Since this does ioctl the device must be already opened */ + bs->sg = hdev_is_sg(bs); + ++ /* sg devices aren't even block devices and can't use dm-mpath */ ++ s->use_mpath = !bs->sg; ++ + return ret; + } + + #if defined(__linux__) ++#if defined(DM_MPATH_PROBE_PATHS) ++static bool coroutine_fn sgio_path_error(int ret, sg_io_hdr_t *io_hdr) ++{ ++ if (ret < 0) { ++ switch (ret) { ++ case -ENODEV: ++ return true; ++ case -EAGAIN: ++ /* ++ * The device is probably suspended. This happens while the dm table ++ * is reloaded, e.g. because a path is added or removed. This is an ++ * operation that should complete within 1ms, so just wait a bit and ++ * retry. ++ * ++ * If the device was suspended for another reason, we'll wait and ++ * retry SG_IO_MAX_RETRIES times. This is a tolerable delay before ++ * we return an error and potentially stop the VM. 
++ */ ++ qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000); ++ return true; ++ default: ++ return false; ++ } ++ } ++ ++ if (io_hdr->host_status != SCSI_HOST_OK) { ++ return true; ++ } ++ ++ switch (io_hdr->status) { ++ case GOOD: ++ case CONDITION_GOOD: ++ case INTERMEDIATE_GOOD: ++ case INTERMEDIATE_C_GOOD: ++ case RESERVATION_CONFLICT: ++ case COMMAND_TERMINATED: ++ return false; ++ case CHECK_CONDITION: ++ return !scsi_sense_buf_is_guest_recoverable(io_hdr->sbp, ++ io_hdr->mx_sb_len); ++ default: ++ return true; ++ } ++} ++ ++static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret) ++{ ++ BDRVRawState *s = acb->bs->opaque; ++ RawPosixAIOData probe_acb; ++ ++ if (!s->use_mpath) { ++ return false; ++ } ++ ++ if (!sgio_path_error(ret, acb->ioctl.buf)) { ++ return false; ++ } ++ ++ probe_acb = (RawPosixAIOData) { ++ .bs = acb->bs, ++ .aio_type = QEMU_AIO_IOCTL, ++ .aio_fildes = s->fd, ++ .aio_offset = 0, ++ .ioctl = { ++ .buf = NULL, ++ .cmd = DM_MPATH_PROBE_PATHS, ++ }, ++ }; ++ ++ ret = raw_thread_pool_submit(handle_aiocb_ioctl, &probe_acb); ++ if (ret == -ENOTTY) { ++ s->use_mpath = false; ++ } else if (ret == -EAGAIN) { ++ /* The device might be suspended for a table reload, worth retrying */ ++ return true; ++ } ++ ++ return ret == 0; ++} ++#else ++static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret) ++{ ++ return false; ++} ++#endif /* DM_MPATH_PROBE_PATHS */ ++ + static int coroutine_fn + hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) + { + BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; ++ int retries = SG_IO_MAX_RETRIES; + int ret; + + ret = fd_open(bs); +@@ -4284,7 +4393,11 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) + }, + }; + +- return raw_thread_pool_submit(handle_aiocb_ioctl, &acb); ++ do { ++ ret = raw_thread_pool_submit(handle_aiocb_ioctl, &acb); ++ } while (req == SG_IO && retries-- && hdev_co_ioctl_sgio_retry(&acb, ret)); ++ ++ return ret; + 
} + #endif /* linux */ + +-- +2.48.1 + diff --git a/SOURCES/kvm-file-posix-gluster-Handle-zero-block-status-hint-bet.patch b/SOURCES/kvm-file-posix-gluster-Handle-zero-block-status-hint-bet.patch new file mode 100644 index 0000000..4b0d130 --- /dev/null +++ b/SOURCES/kvm-file-posix-gluster-Handle-zero-block-status-hint-bet.patch @@ -0,0 +1,64 @@ +From 39e0c370357a414abacd64fb6a172e7b25eb4d82 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Fri, 9 May 2025 15:40:19 -0500 +Subject: [PATCH 04/16] file-posix, gluster: Handle zero block status hint + better + +RH-Author: Eric Blake +RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors +RH-Jira: RHEL-82906 RHEL-83015 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Jon Maloy +RH-Commit: [2/14] 1f7b47ce5f5fb321aee41a16accf5bce3d1bfe95 (ebblake/centos-qemu-kvm) + +Although the previous patch to change 'bool want_zero' into a bitmask +made no semantic change, it is now time to differentiate. When the +caller specifically wants to know what parts of the file read as zero, +we need to use lseek and actually report holes, rather than +short-circuiting and advertising full allocation. + +This change will be utilized in later patches to let mirroring +optimize for the case when the destination already reads as zeroes.
+ +Signed-off-by: Eric Blake +Reviewed-by: Stefan Hajnoczi +Message-ID: <20250509204341.3553601-17-eblake@redhat.com> +(cherry picked from commit a6a0a7fb0e327d17594c971b4a39de14e025b415) +Jira: https://issues.redhat.com/browse/RHEL-82906 +Jira: https://issues.redhat.com/browse/RHEL-83015 +Signed-off-by: Eric Blake +--- + block/file-posix.c | 3 ++- + block/gluster.c | 2 +- + 2 files changed, 3 insertions(+), 2 deletions(-) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 9ca55620ca..ce5da2b4c2 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -3293,7 +3293,8 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, + return ret; + } + +- if (mode != BDRV_WANT_PRECISE) { ++ if (!(mode & BDRV_WANT_ZERO)) { ++ /* There is no backing file - all bytes are allocated in this file. */ + *pnum = bytes; + *map = offset; + *file = bs; +diff --git a/block/gluster.c b/block/gluster.c +index ae5c45666b..175c70164c 100644 +--- a/block/gluster.c ++++ b/block/gluster.c +@@ -1483,7 +1483,7 @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs, + return ret; + } + +- if (mode != BDRV_WANT_PRECISE) { ++ if (!(mode & BDRV_WANT_ZERO)) { + *pnum = bytes; + *map = offset; + *file = bs; +-- +2.48.1 + diff --git a/SOURCES/kvm-file-posix-probe-discard-alignment-on-Linux-block-de.patch b/SOURCES/kvm-file-posix-probe-discard-alignment-on-Linux-block-de.patch new file mode 100644 index 0000000..7d60479 --- /dev/null +++ b/SOURCES/kvm-file-posix-probe-discard-alignment-on-Linux-block-de.patch @@ -0,0 +1,131 @@ +From 29ae77d77cabc3582267cb8a7c4fe10d279a21e6 Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Thu, 17 Apr 2025 11:05:27 -0400 +Subject: [PATCH 3/5] file-posix: probe discard alignment on Linux block + devices + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 355: file-posix: probe discard alignment on Linux block devices +RH-Jira: RHEL-86032 +RH-Acked-by: Kevin Wolf +RH-Acked-by: Eric Blake +RH-Commit: [1/3] 
bb3c17b0da6edeb209874e97d4e2c3b1762a1749 (stefanha/centos-stream-qemu-kvm) + +Populate the pdiscard_alignment block limit so the block layer is able +align discard requests correctly. + +Signed-off-by: Stefan Hajnoczi +Message-ID: <20250417150528.76470-2-stefanha@redhat.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit f605796aae42885034400c83ed6a9b07cd6d6481) +Signed-off-by: Stefan Hajnoczi +--- + block/file-posix.c | 67 +++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 66 insertions(+), 1 deletion(-) + +diff --git a/block/file-posix.c b/block/file-posix.c +index ff928b5e85..3d5b024459 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -1268,10 +1268,10 @@ static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned) + } + #endif /* defined(CONFIG_BLKZONED) */ + ++#ifdef CONFIG_LINUX + /* + * Get a sysfs attribute value as a long integer. + */ +-#ifdef CONFIG_LINUX + static long get_sysfs_long_val(struct stat *st, const char *attribute) + { + g_autofree char *str = NULL; +@@ -1291,6 +1291,30 @@ static long get_sysfs_long_val(struct stat *st, const char *attribute) + } + return ret; + } ++ ++/* ++ * Get a sysfs attribute value as a uint32_t. ++ */ ++static int get_sysfs_u32_val(struct stat *st, const char *attribute, ++ uint32_t *u32) ++{ ++ g_autofree char *str = NULL; ++ const char *end; ++ unsigned int val; ++ int ret; ++ ++ ret = get_sysfs_str_val(st, attribute, &str); ++ if (ret < 0) { ++ return ret; ++ } ++ ++ /* The file is ended with '\n', pass 'end' to accept that. */ ++ ret = qemu_strtoui(str, &end, 10, &val); ++ if (ret == 0 && end && *end == '\0') { ++ *u32 = val; ++ } ++ return ret; ++} + #endif + + static int hdev_get_max_segments(int fd, struct stat *st) +@@ -1310,6 +1334,23 @@ static int hdev_get_max_segments(int fd, struct stat *st) + #endif + } + ++/* ++ * Fills in *dalign with the discard alignment and returns 0 on success, ++ * -errno otherwise. 
++ */ ++static int hdev_get_pdiscard_alignment(struct stat *st, uint32_t *dalign) ++{ ++#ifdef CONFIG_LINUX ++ /* ++ * Note that Linux "discard_granularity" is QEMU "discard_alignment". Linux ++ * "discard_alignment" is something else. ++ */ ++ return get_sysfs_u32_val(st, "discard_granularity", dalign); ++#else ++ return -ENOTSUP; ++#endif ++} ++ + #if defined(CONFIG_BLKZONED) + /* + * If the reset_all flag is true, then the wps of zone whose state is +@@ -1519,6 +1560,30 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) + } + } + ++ if (S_ISBLK(st.st_mode)) { ++ uint32_t dalign = 0; ++ int ret; ++ ++ ret = hdev_get_pdiscard_alignment(&st, &dalign); ++ if (ret == 0) { ++ uint32_t ralign = bs->bl.request_alignment; ++ ++ /* Probably never happens, but handle it just in case */ ++ if (dalign < ralign && (ralign % dalign == 0)) { ++ dalign = ralign; ++ } ++ ++ /* The block layer requires a multiple of request_alignment */ ++ if (dalign % ralign != 0) { ++ error_setg(errp, "Invalid pdiscard_alignment limit %u is not a " ++ "multiple of request_alignment %u", dalign, ralign); ++ return; ++ } ++ ++ bs->bl.pdiscard_alignment = dalign; ++ } ++ } ++ + raw_refresh_zoned_limits(bs, &st, errp); + } + +-- +2.48.1 + diff --git a/SOURCES/kvm-hw-arm-smmuv3-Move-reset-to-exit-phase.patch b/SOURCES/kvm-hw-arm-smmuv3-Move-reset-to-exit-phase.patch new file mode 100644 index 0000000..689a6f5 --- /dev/null +++ b/SOURCES/kvm-hw-arm-smmuv3-Move-reset-to-exit-phase.patch @@ -0,0 +1,123 @@ +From a3dfbe30e930c8d794057e45fffd91a9b0e6afd0 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 18 Feb 2025 19:25:33 +0100 +Subject: [PATCH 3/9] hw/arm/smmuv3: Move reset to exit phase +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eric Auger +RH-MergeRequest: 341: Fix vIOMMU reset order +RH-Jira: RHEL-7188 +RH-Acked-by: Peter Xu +RH-Acked-by: Donald Dutile +RH-Acked-by: Cédric Le Goater +RH-Commit: [3/5] 
e291cb45c32e0fab49b200c275553bbe76b97264 (eauger1/centos-qemu-kvm) + +Currently the iommu may be reset before the devices +it protects. For example this happens with virtio-scsi-pci. +when system_reset is issued from qmp monitor: spurious +"virtio: zero sized buffers are not allowed" warnings can +be observed. This happens because outstanding DMA requests +are still happening while the SMMU gets reset. + +This can also happen with VFIO devices. In that case +spurious DMA translation faults can be observed on host. + +Make sure the SMMU is reset in the 'exit' phase after +all DMA capable devices have been reset during the 'enter' +or 'hold' phase. + +Signed-off-by: Eric Auger +Reviewed-by: Zhenzhong Duan + +Message-Id: <20250218182737.76722-4-eric.auger@redhat.com> +Reviewed-by: Peter Xu +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit e39e3f8b8dea856f141e9945167d2b18021ef445) +Signed-off-by: Eric Auger +--- + hw/arm/smmu-common.c | 9 +++++++-- + hw/arm/smmuv3.c | 14 ++++++++++---- + hw/arm/trace-events | 1 + + 3 files changed, 18 insertions(+), 6 deletions(-) + +diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c +index 3f82728758..f4210fcbc1 100644 +--- a/hw/arm/smmu-common.c ++++ b/hw/arm/smmu-common.c +@@ -924,7 +924,12 @@ static void smmu_base_realize(DeviceState *dev, Error **errp) + } + } + +-static void smmu_base_reset_hold(Object *obj, ResetType type) ++/* ++ * Make sure the IOMMU is reset in 'exit' phase after ++ * all outstanding DMA requests have been quiesced during ++ * the 'enter' or 'hold' reset phases ++ */ ++static void smmu_base_reset_exit(Object *obj, ResetType type) + { + SMMUState *s = ARM_SMMU(obj); + +@@ -950,7 +955,7 @@ static void smmu_base_class_init(ObjectClass *klass, void *data) + device_class_set_props(dc, smmu_dev_properties); + device_class_set_parent_realize(dc, smmu_base_realize, + &sbc->parent_realize); +- rc->phases.hold = smmu_base_reset_hold; ++ rc->phases.exit = 
smmu_base_reset_exit; + } + + static const TypeInfo smmu_base_info = { +diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c +index 3971976389..2e90570915 100644 +--- a/hw/arm/smmuv3.c ++++ b/hw/arm/smmuv3.c +@@ -1870,13 +1870,19 @@ static void smmu_init_irq(SMMUv3State *s, SysBusDevice *dev) + } + } + +-static void smmu_reset_hold(Object *obj, ResetType type) ++/* ++ * Make sure the IOMMU is reset in 'exit' phase after ++ * all outstanding DMA requests have been quiesced during ++ * the 'enter' or 'hold' reset phases ++ */ ++static void smmu_reset_exit(Object *obj, ResetType type) + { + SMMUv3State *s = ARM_SMMUV3(obj); + SMMUv3Class *c = ARM_SMMUV3_GET_CLASS(s); + +- if (c->parent_phases.hold) { +- c->parent_phases.hold(obj, type); ++ trace_smmu_reset_exit(); ++ if (c->parent_phases.exit) { ++ c->parent_phases.exit(obj, type); + } + + smmuv3_init_regs(s); +@@ -1999,7 +2005,7 @@ static void smmuv3_class_init(ObjectClass *klass, void *data) + SMMUv3Class *c = ARM_SMMUV3_CLASS(klass); + + dc->vmsd = &vmstate_smmuv3; +- resettable_class_set_parent_phases(rc, NULL, smmu_reset_hold, NULL, ++ resettable_class_set_parent_phases(rc, NULL, NULL, smmu_reset_exit, + &c->parent_phases); + device_class_set_parent_realize(dc, smmu_realize, + &c->parent_realize); +diff --git a/hw/arm/trace-events b/hw/arm/trace-events +index be6c8f720b..79ef347e3e 100644 +--- a/hw/arm/trace-events ++++ b/hw/arm/trace-events +@@ -56,6 +56,7 @@ smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid=0x%x" + smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s" + smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s" + smmuv3_inv_notifiers_iova(const char *name, int asid, int vmid, uint64_t iova, uint8_t tg, uint64_t num_pages, int stage) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64" stage=%d" ++smmu_reset_exit(void) "" + + # strongarm.c + strongarm_uart_update_parameters(const char *label, int speed, char parity, 
int data_bits, int stop_bits) "%s speed=%d parity=%c data=%d stop=%d" +-- +2.48.1 + diff --git a/SOURCES/kvm-hw-audio-ac97-skip-automatic-zero-init-of-large-arra.patch b/SOURCES/kvm-hw-audio-ac97-skip-automatic-zero-init-of-large-arra.patch new file mode 100644 index 0000000..ccaf1c4 --- /dev/null +++ b/SOURCES/kvm-hw-audio-ac97-skip-automatic-zero-init-of-large-arra.patch @@ -0,0 +1,57 @@ +From 2018f62f2242d8d4a970d83ebef9b3c2bccf6fda Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:45 +0100 +Subject: [PATCH 33/57] hw/audio/ac97: skip automatic zero-init of large arrays +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [7/30] 4a6b59a9b9122d9f89e99b3e44df19e6d92ed941 (stefanha/centos-stream-qemu-kvm) + +The 'read_audio' & 'write_audio' methods have a 4k byte array used +for copying data between the audio backend and device. Skip the +automatic zero-init of these arrays to eliminate the performance +overhead in the I/O hot path. + +The 'tmpbuf' array will be fully initialized when reading data from +the audio backend and/or device memory. + +Signed-off-by: Daniel P. 
Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-8-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 2553d2d26a9d0f46386bf8c37d184567e5cede6c) +Signed-off-by: Stefan Hajnoczi +--- + hw/audio/ac97.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/hw/audio/ac97.c b/hw/audio/ac97.c +index 3f0053f94d..681b5752a1 100644 +--- a/hw/audio/ac97.c ++++ b/hw/audio/ac97.c +@@ -886,7 +886,7 @@ static void nabm_writel(void *opaque, uint32_t addr, uint32_t val) + static int write_audio(AC97LinkState *s, AC97BusMasterRegs *r, + int max, int *stop) + { +- uint8_t tmpbuf[4096]; ++ QEMU_UNINITIALIZED uint8_t tmpbuf[4096]; + uint32_t addr = r->bd.addr; + uint32_t temp = r->picb << 1; + uint32_t written = 0; +@@ -959,7 +959,7 @@ static void write_bup(AC97LinkState *s, int elapsed) + static int read_audio(AC97LinkState *s, AC97BusMasterRegs *r, + int max, int *stop) + { +- uint8_t tmpbuf[4096]; ++ QEMU_UNINITIALIZED uint8_t tmpbuf[4096]; + uint32_t addr = r->bd.addr; + uint32_t temp = r->picb << 1; + uint32_t nread = 0; +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-audio-cs4231a-skip-automatic-zero-init-of-large-a.patch b/SOURCES/kvm-hw-audio-cs4231a-skip-automatic-zero-init-of-large-a.patch new file mode 100644 index 0000000..95f535f --- /dev/null +++ b/SOURCES/kvm-hw-audio-cs4231a-skip-automatic-zero-init-of-large-a.patch @@ -0,0 +1,59 @@ +From bd32bb22fb324a37b31ed9ac3387524f6f4ea5be Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:46 +0100 +Subject: [PATCH 34/57] hw/audio/cs4231a: skip automatic zero-init of large + arrays +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [8/30] 6c454bcc2927e49896c62718287fb9e4b37b3bb9 
(stefanha/centos-stream-qemu-kvm) + +The 'cs_write_audio' method has a pair of byte arrays, one 4k in size +and one 8k, which are used in converting audio samples. Skip the +automatic zero-init of these arrays to eliminate the performance +overhead in the I/O hot path. + +The 'tmpbuf' array will be fully initialized when reading a block of +data from the guest. The 'linbuf' array will be fully initialized +when converting the audio samples. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-9-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit ca2cc0385d97cea66cd54ee42553f385c403d4a6) +Signed-off-by: Stefan Hajnoczi +--- + hw/audio/cs4231a.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/hw/audio/cs4231a.c b/hw/audio/cs4231a.c +index 9ef57f042d..5c312642cc 100644 +--- a/hw/audio/cs4231a.c ++++ b/hw/audio/cs4231a.c +@@ -528,7 +528,7 @@ static int cs_write_audio (CSState *s, int nchan, int dma_pos, + int dma_len, int len) + { + int temp, net; +- uint8_t tmpbuf[4096]; ++ QEMU_UNINITIALIZED uint8_t tmpbuf[4096]; + IsaDmaClass *k = ISADMA_GET_CLASS(s->isa_dma); + + temp = len; +@@ -547,7 +547,7 @@ static int cs_write_audio (CSState *s, int nchan, int dma_pos, + copied = k->read_memory(s->isa_dma, nchan, tmpbuf, dma_pos, to_copy); + if (s->tab) { + int i; +- int16_t linbuf[4096]; ++ QEMU_UNINITIALIZED int16_t linbuf[4096]; + + for (i = 0; i < copied; ++i) + linbuf[i] = s->tab[tmpbuf[i]]; +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-audio-es1370-skip-automatic-zero-init-of-large-ar.patch b/SOURCES/kvm-hw-audio-es1370-skip-automatic-zero-init-of-large-ar.patch new file mode 100644 index 0000000..76a5d89 --- /dev/null +++ b/SOURCES/kvm-hw-audio-es1370-skip-automatic-zero-init-of-large-ar.patch @@ -0,0 +1,49 @@ +From cb12ddc6ed836091aa7724e2f77ab79cd9089cad Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:47 +0100 
+Subject: [PATCH 35/57] hw/audio/es1370: skip automatic zero-init of large + array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [9/30] b992e4247d8d31dc09f9dc7671e7a532558174ec (stefanha/centos-stream-qemu-kvm) + +The 'es1370_transfer_audio' method has a 4k byte array used for +copying data between the audio backend and device. Skip the automatic +zero-init of this array to eliminate the performance overhead in +the I/O hot path. + +The 'tmpbuf' array will be fully initialized when reading data from +the audio backend and/or device memory. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-10-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 8236e206084b832d1d7ec947a4798b818f4cdf1f) +Signed-off-by: Stefan Hajnoczi +--- + hw/audio/es1370.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/audio/es1370.c b/hw/audio/es1370.c +index 4ab61d3b9d..6aea934f54 100644 +--- a/hw/audio/es1370.c ++++ b/hw/audio/es1370.c +@@ -604,7 +604,7 @@ static uint64_t es1370_read(void *opaque, hwaddr addr, unsigned size) + static void es1370_transfer_audio (ES1370State *s, struct chan *d, int loop_sel, + int max, bool *irq) + { +- uint8_t tmpbuf[4096]; ++ QEMU_UNINITIALIZED uint8_t tmpbuf[4096]; + size_t to_transfer; + uint32_t addr = d->frame_addr; + int sc = d->scount & 0xffff; +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-audio-gus-skip-automatic-zero-init-of-large-array.patch b/SOURCES/kvm-hw-audio-gus-skip-automatic-zero-init-of-large-array.patch new file mode 100644 index 0000000..2ce4fa8 --- /dev/null +++ b/SOURCES/kvm-hw-audio-gus-skip-automatic-zero-init-of-large-array.patch @@ -0,0 +1,48 @@ +From 9ad7091d82fd0577488f27ab54bb7851fe957020 Mon 
Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:48 +0100 +Subject: [PATCH 36/57] hw/audio/gus: skip automatic zero-init of large array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [10/30] 366953d0417ac31e3060fdc327fe8dade3375bf0 (stefanha/centos-stream-qemu-kvm) + +The 'GUS_read_DMA' method has a 4k byte array used for copying +data between the audio backend and device. Skip the automatic +zero-init of this array to eliminate the performance overhead in +the I/O hot path. + +The 'tmpbuf' array will be fully initialized when reading data +from device memory. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-11-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 2e438da4929018c62609381e1156aac0b2fe3de3) +Signed-off-by: Stefan Hajnoczi +--- + hw/audio/gus.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/audio/gus.c b/hw/audio/gus.c +index 4beb3fd74e..e8b0b85d44 100644 +--- a/hw/audio/gus.c ++++ b/hw/audio/gus.c +@@ -183,7 +183,7 @@ static int GUS_read_DMA (void *opaque, int nchan, int dma_pos, int dma_len) + { + GUSState *s = opaque; + IsaDmaClass *k = ISADMA_GET_CLASS(s->isa_dma); +- char tmpbuf[4096]; ++ QEMU_UNINITIALIZED char tmpbuf[4096]; + int pos = dma_pos, mode, left = dma_len - dma_pos; + + ldebug ("read DMA %#x %d\n", dma_pos, dma_len); +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-audio-marvell_88w8618-skip-automatic-zero-init-of.patch b/SOURCES/kvm-hw-audio-marvell_88w8618-skip-automatic-zero-init-of.patch new file mode 100644 index 0000000..3608901 --- /dev/null +++ b/SOURCES/kvm-hw-audio-marvell_88w8618-skip-automatic-zero-init-of.patch @@ -0,0 +1,50 @@ +From 
5cf61823cbe80b1ace2f5bdb9cc1971956425b98 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:49 +0100 +Subject: [PATCH 37/57] hw/audio/marvell_88w8618: skip automatic zero-init of + large array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [11/30] e09cdb76430552081168873dadfef1b5c8f74327 (stefanha/centos-stream-qemu-kvm) + +The 'mv88w8618_audio_callback' method has a 4k byte array used for +copying data between the audio backend and device. Skip the automatic +zero-init of this array to eliminate the performance overhead in +the I/O hot path. + +The 'buf' array will be fully initialized when reading data from +device memory. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-12-berrange@redhat.com +[Fixed hw/audio/gus in commit message --Stefan] +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 5b6cd5c5df4229972d8a0fd9dd9a089a1644d6ba) +Signed-off-by: Stefan Hajnoczi +--- + hw/audio/marvell_88w8618.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/audio/marvell_88w8618.c b/hw/audio/marvell_88w8618.c +index cc285444bc..b7b4b27272 100644 +--- a/hw/audio/marvell_88w8618.c ++++ b/hw/audio/marvell_88w8618.c +@@ -66,7 +66,7 @@ static void mv88w8618_audio_callback(void *opaque, int free_out, int free_in) + { + mv88w8618_audio_state *s = opaque; + int16_t *codec_buffer; +- int8_t buf[4096]; ++ QEMU_UNINITIALIZED int8_t buf[4096]; + int8_t *mem_buffer; + int pos, block_size; + +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-audio-sb16-skip-automatic-zero-init-of-large-arra.patch b/SOURCES/kvm-hw-audio-sb16-skip-automatic-zero-init-of-large-arra.patch new file mode 100644 index 0000000..7e531d6 --- 
/dev/null +++ b/SOURCES/kvm-hw-audio-sb16-skip-automatic-zero-init-of-large-arra.patch @@ -0,0 +1,48 @@ +From 0b4d59d75edd49ef99f0a82fbcbe360c5b48e4f8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:50 +0100 +Subject: [PATCH 38/57] hw/audio/sb16: skip automatic zero-init of large array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [12/30] 6475d67546bf04745636b317e965bcd89b6fb2d2 (stefanha/centos-stream-qemu-kvm) + +The 'write_audio' method has a 4k byte array used for copying data +between the audio backend and device. Skip the automatic zero-init +of this array to eliminate the performance overhead in the I/O hot +path. + +The 'tmpbuf' array will be fully initialized when reading data from +device memory. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-13-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 30c82f6657c1ee9fbb5473924b4d3273f214bd6f) +Signed-off-by: Stefan Hajnoczi +--- + hw/audio/sb16.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/audio/sb16.c b/hw/audio/sb16.c +index fd76e78d18..04c818ed3d 100644 +--- a/hw/audio/sb16.c ++++ b/hw/audio/sb16.c +@@ -1181,7 +1181,7 @@ static int write_audio (SB16State *s, int nchan, int dma_pos, + IsaDma *isa_dma = nchan == s->dma ? 
s->isa_dma : s->isa_hdma; + IsaDmaClass *k = ISADMA_GET_CLASS(isa_dma); + int temp, net; +- uint8_t tmpbuf[4096]; ++ QEMU_UNINITIALIZED uint8_t tmpbuf[4096]; + + temp = len; + net = 0; +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-audio-via-ac97-skip-automatic-zero-init-of-large-.patch b/SOURCES/kvm-hw-audio-via-ac97-skip-automatic-zero-init-of-large-.patch new file mode 100644 index 0000000..c52f0c1 --- /dev/null +++ b/SOURCES/kvm-hw-audio-via-ac97-skip-automatic-zero-init-of-large-.patch @@ -0,0 +1,49 @@ +From 35332282ef8bd06f59206266006eff222ffe6bec Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:51 +0100 +Subject: [PATCH 39/57] hw/audio/via-ac97: skip automatic zero-init of large + array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [13/30] 6391a04b29fcbb8bcdbce2c6b786758fc34f0d71 (stefanha/centos-stream-qemu-kvm) + +The 'out_cb' method has a 4k byte array used for copying data +between the audio backend and device. Skip the automatic zero-init +of this array to eliminate the performance overhead in the I/O hot +path. + +The 'tmpbuf' array will be fully initialized when reading data from +device memory. + +Signed-off-by: Daniel P. 
Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-14-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit bb71d9fe1419f44529c91d1b09464718d157e647) +Signed-off-by: Stefan Hajnoczi +--- + hw/audio/via-ac97.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/audio/via-ac97.c b/hw/audio/via-ac97.c +index 4c127a1def..e8fcf44e5d 100644 +--- a/hw/audio/via-ac97.c ++++ b/hw/audio/via-ac97.c +@@ -175,7 +175,7 @@ static void out_cb(void *opaque, int avail) + ViaAC97SGDChannel *c = &s->aur; + int temp, to_copy, copied; + bool stop = false; +- uint8_t tmpbuf[4096]; ++ QEMU_UNINITIALIZED uint8_t tmpbuf[4096]; + + if (c->stat & STAT_PAUSED) { + return; +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-char-sclpconsole-lm-skip-automatic-zero-init-of-l.patch b/SOURCES/kvm-hw-char-sclpconsole-lm-skip-automatic-zero-init-of-l.patch new file mode 100644 index 0000000..98d11f0 --- /dev/null +++ b/SOURCES/kvm-hw-char-sclpconsole-lm-skip-automatic-zero-init-of-l.patch @@ -0,0 +1,49 @@ +From b0c16a93460c2dfe834a9f439d25dc833dfb7427 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:52 +0100 +Subject: [PATCH 40/57] hw/char/sclpconsole-lm: skip automatic zero-init of + large array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [14/30] 1491e0147a799ec523fa67fd49649722a07299e7 (stefanha/centos-stream-qemu-kvm) + +The 'process_mdb' method has a 4k byte array used for copying data +between the guest and the chardev backend. Skip the automatic zero-init +of this array to eliminate the performance overhead in the I/O hot +path. + +The 'buffer' array will be selectively initialized when data is converted +between EBCDIC and ASCII. 
+ +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-15-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 8b1dac1ad57082611419b0e2f347acd96115d25f) +Signed-off-by: Stefan Hajnoczi +--- + hw/char/sclpconsole-lm.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/char/sclpconsole-lm.c b/hw/char/sclpconsole-lm.c +index 7719f438f6..19e64b92f6 100644 +--- a/hw/char/sclpconsole-lm.c ++++ b/hw/char/sclpconsole-lm.c +@@ -214,7 +214,7 @@ static int process_mdb(SCLPEvent *event, MDBO *mdbo) + { + int rc; + int len; +- uint8_t buffer[SIZE_BUFFER]; ++ QEMU_UNINITIALIZED uint8_t buffer[SIZE_BUFFER]; + + len = be16_to_cpu(mdbo->length); + len -= sizeof(mdbo->length) + sizeof(mdbo->type) +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-display-vmware_vga-skip-automatic-zero-init-of-la.patch b/SOURCES/kvm-hw-display-vmware_vga-skip-automatic-zero-init-of-la.patch new file mode 100644 index 0000000..607fd50 --- /dev/null +++ b/SOURCES/kvm-hw-display-vmware_vga-skip-automatic-zero-init-of-la.patch @@ -0,0 +1,49 @@ +From 7b5624efccf55184278c6f4924efc2141df460f0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:54 +0100 +Subject: [PATCH 42/57] hw/display/vmware_vga: skip automatic zero-init of + large struct +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [16/30] 4aaf459d4356bf28164be742889b9a78d3656703 (stefanha/centos-stream-qemu-kvm) + +The 'vmsvga_fifo_run' method has a struct which is a little over 20k +in size, used for holding image data for cursor changes. Skip the +automatic zero-init of this struct to eliminate the performance +overhead in the I/O hot path. 
+ +The cursor variable will be fully initialized only when processing +a cursor definition message from the guest. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-17-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 7048e70f391df76d009eecca25f8027858f9f304) +Signed-off-by: Stefan Hajnoczi +--- + hw/display/vmware_vga.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/display/vmware_vga.c b/hw/display/vmware_vga.c +index 3db3ff98f7..69afe98a2f 100644 +--- a/hw/display/vmware_vga.c ++++ b/hw/display/vmware_vga.c +@@ -618,7 +618,7 @@ static void vmsvga_fifo_run(struct vmsvga_state_s *s) + uint32_t cmd, colour; + int args, len, maxloop = 1024; + int x, y, dx, dy, width, height; +- struct vmsvga_cursor_definition_s cursor; ++ QEMU_UNINITIALIZED struct vmsvga_cursor_definition_s cursor; + uint32_t cmd_start; + + len = vmsvga_fifo_length(s); +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-dma-xlnx_csu_dma-skip-automatic-zero-init-of-larg.patch b/SOURCES/kvm-hw-dma-xlnx_csu_dma-skip-automatic-zero-init-of-larg.patch new file mode 100644 index 0000000..d38d141 --- /dev/null +++ b/SOURCES/kvm-hw-dma-xlnx_csu_dma-skip-automatic-zero-init-of-larg.patch @@ -0,0 +1,47 @@ +From cd3500c9e248dbefb36273046e6eee44ee0d5cbe Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:53 +0100 +Subject: [PATCH 41/57] hw/dma/xlnx_csu_dma: skip automatic zero-init of large + array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [15/30] 063c88269c7d3bf07ae05aaf2d3d154e2016db81 (stefanha/centos-stream-qemu-kvm) + +The 'xlnx_csu_dma_src_notify' method has a 4k byte array used for +copying DMA data. 
Skip the automatic zero-init of this array to +eliminate the performance overhead in the I/O hot path. + +The 'buf' array will be fully initialized when data is copied. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-16-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit ce14f24611aa0469b464a9512e192b4fd51dca2b) +Signed-off-by: Stefan Hajnoczi +--- + hw/dma/xlnx_csu_dma.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/dma/xlnx_csu_dma.c b/hw/dma/xlnx_csu_dma.c +index ae307482f2..9d1cccc5ca 100644 +--- a/hw/dma/xlnx_csu_dma.c ++++ b/hw/dma/xlnx_csu_dma.c +@@ -287,7 +287,7 @@ static uint32_t xlnx_csu_dma_advance(XlnxCSUDMA *s, uint32_t len) + static void xlnx_csu_dma_src_notify(void *opaque) + { + XlnxCSUDMA *s = XLNX_CSU_DMA(opaque); +- unsigned char buf[4 * 1024]; ++ QEMU_UNINITIALIZED unsigned char buf[4 * 1024]; + size_t rlen = 0; + + ptimer_transaction_begin(s->src_timer); +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-hyperv-syndbg-skip-automatic-zero-init-of-large-a.patch b/SOURCES/kvm-hw-hyperv-syndbg-skip-automatic-zero-init-of-large-a.patch new file mode 100644 index 0000000..26d816e --- /dev/null +++ b/SOURCES/kvm-hw-hyperv-syndbg-skip-automatic-zero-init-of-large-a.patch @@ -0,0 +1,56 @@ +From a4673aab85958c60867b12c65cc3483d734bb6e0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:55 +0100 +Subject: [PATCH 43/57] hw/hyperv/syndbg: skip automatic zero-init of large + array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [17/30] 5f71779c431128601baf46115fe65178532a3836 (stefanha/centos-stream-qemu-kvm) + +The 'handle_recv_msg' method has a 4k byte array used for 
copying +data between the network socket and guest memory. Skip the automatic +zero-init of this array to eliminate the performance overhead in the +I/O hot path. + +The 'data_buf' array will be fully initialized when data is read +off the network socket. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-18-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 5a1f614d0cd0bcc8e84e0b7ab6af63d56bd348a2) +Signed-off-by: Stefan Hajnoczi + +Conflicts: + hw/hyperv/syndbg.c + + Context conflict due to missing commit 3efb9d226221 + ("hw/hyperv/syndbg: common compilation unit") downstream. There is no + need to backport the commit because it's not a bug fix. +--- + hw/hyperv/syndbg.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/hyperv/syndbg.c b/hw/hyperv/syndbg.c +index 065e12fb1e..c7c43c8009 100644 +--- a/hw/hyperv/syndbg.c ++++ b/hw/hyperv/syndbg.c +@@ -188,7 +188,7 @@ static uint16_t handle_recv_msg(HvSynDbg *syndbg, uint64_t outgpa, + uint64_t timeout, uint32_t *retrieved_count) + { + uint16_t ret; +- uint8_t data_buf[TARGET_PAGE_SIZE - UDP_PKT_HEADER_SIZE]; ++ QEMU_UNINITIALIZED uint8_t data_buf[TARGET_PAGE_SIZE - UDP_PKT_HEADER_SIZE]; + hwaddr out_len; + void *out_data; + ssize_t recv_byte_count; +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-i386-Fix-machine-type-compatibility.patch b/SOURCES/kvm-hw-i386-Fix-machine-type-compatibility.patch new file mode 100644 index 0000000..430ba65 --- /dev/null +++ b/SOURCES/kvm-hw-i386-Fix-machine-type-compatibility.patch @@ -0,0 +1,87 @@ +From 2bb5dff02fb393530a12f4f00219cd2f90cd442a Mon Sep 17 00:00:00 2001 +From: Sebastian Ott +Date: Thu, 15 May 2025 18:45:51 +0200 +Subject: [PATCH 3/5] hw/i386: Fix machine type compatibility + +RH-Author: Sebastian Ott +RH-MergeRequest: 364: hw/i386: Fix machine type compatibility +RH-Jira: RHEL-91307 +RH-Acked-by: Cornelia Huck +RH-Acked-by: Jon Maloy +RH-Commit: [1/1] 
44ddbcb3af119c65e99018d7ed90887f3948907e (seott1/cos-qemu-kvm) + +Upstream Status: RHEL only + +Ensure compatibility of rhel specific i440fx and q35 machine types. +Pick up missing bits from pc_compat_9_0 upstream. + +Signed-off-by: Sebastian Ott +--- + hw/i386/pc.c | 8 ++++++++ + hw/i386/pc_piix.c | 2 ++ + hw/i386/pc_q35.c | 2 ++ + include/hw/i386/pc.h | 3 +++ + 4 files changed, 15 insertions(+) + +diff --git a/hw/i386/pc.c b/hw/i386/pc.c +index fa9f16cbaf..5237538640 100644 +--- a/hw/i386/pc.c ++++ b/hw/i386/pc.c +@@ -298,6 +298,14 @@ GlobalProperty pc_rhel_compat[] = { + }; + const size_t pc_rhel_compat_len = G_N_ELEMENTS(pc_rhel_compat); + ++GlobalProperty pc_rhel_9_6_compat[] = { ++ /* pc_rhel_9_6_compat from pc_compat_9_0 */ ++ { TYPE_X86_CPU, "x-amd-topoext-features-only", "false" }, ++ { TYPE_X86_CPU, "x-l1-cache-per-thread", "false" }, ++ { TYPE_X86_CPU, "legacy-multi-node", "on" }, ++}; ++const size_t pc_rhel_9_6_compat_len = G_N_ELEMENTS(pc_rhel_9_6_compat); ++ + GlobalProperty pc_rhel_9_5_compat[] = { + /* pc_rhel_9_5_compat from pc_compat_pc_9_0 (backported from 9.1) */ + { TYPE_X86_CPU, "guest-phys-bits", "0" }, +diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c +index 10764bf596..0687317db5 100644 +--- a/hw/i386/pc_piix.c ++++ b/hw/i386/pc_piix.c +@@ -885,6 +885,8 @@ static void pc_i440fx_rhel_machine_7_6_0_options(MachineClass *m) + + compat_props_add(m->compat_props, hw_compat_rhel_9_6, + hw_compat_rhel_9_6_len); ++ compat_props_add(m->compat_props, pc_rhel_9_6_compat, ++ pc_rhel_9_6_compat_len); + compat_props_add(m->compat_props, pc_rhel_9_5_compat, + pc_rhel_9_5_compat_len); + compat_props_add(m->compat_props, hw_compat_rhel_9_5, +diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c +index 5bf08be0fb..871c760aea 100644 +--- a/hw/i386/pc_q35.c ++++ b/hw/i386/pc_q35.c +@@ -704,6 +704,8 @@ static void pc_q35_rhel_machine_9_4_0_options(MachineClass *m) + + compat_props_add(m->compat_props, hw_compat_rhel_9_6, + hw_compat_rhel_9_6_len); ++ 
compat_props_add(m->compat_props, pc_rhel_9_6_compat, ++ pc_rhel_9_6_compat_len); + compat_props_add(m->compat_props, pc_rhel_9_5_compat, + pc_rhel_9_5_compat_len); + compat_props_add(m->compat_props, hw_compat_rhel_9_5, +diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h +index 75c9271cdd..2b7c18f2b0 100644 +--- a/include/hw/i386/pc.h ++++ b/include/hw/i386/pc.h +@@ -305,6 +305,9 @@ extern const size_t pc_compat_2_3_len; + extern GlobalProperty pc_rhel_compat[]; + extern const size_t pc_rhel_compat_len; + ++extern GlobalProperty pc_rhel_9_6_compat[]; ++extern const size_t pc_rhel_9_6_compat_len; ++ + extern GlobalProperty pc_rhel_9_5_compat[]; + extern const size_t pc_rhel_9_5_compat_len; + +-- +2.48.1 + diff --git a/SOURCES/kvm-hw-i386-amd_iommu-Allow-migration-when-explicitly-cr.patch b/SOURCES/kvm-hw-i386-amd_iommu-Allow-migration-when-explicitly-cr.patch new file mode 100644 index 0000000..d68826a --- /dev/null +++ b/SOURCES/kvm-hw-i386-amd_iommu-Allow-migration-when-explicitly-cr.patch @@ -0,0 +1,117 @@ +From f1ff9d3b379697a2d4627e9529067195841d86a8 Mon Sep 17 00:00:00 2001 +From: Suravee Suthikulpanit +Date: Sun, 4 May 2025 17:04:05 +0000 +Subject: [PATCH 25/57] hw/i386/amd_iommu: Allow migration when explicitly + create the AMDVI-PCI device + +RH-Author: John Allen +RH-MergeRequest: 380: Add ability to manually specify the AMDVI-PCI device +RH-Jira: RHEL-70925 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [2/3] a42b88116e608a79b6fae13ebe3709874f2a853f (johnalle/qemu-kvm-fork) + +Add migration support for AMD IOMMU model by saving necessary AMDVIState +parameters for MMIO registers, device table, command buffer, and event +buffers. + +Also change devtab_len type from size_t to uint64_t to avoid 32-bit build +issue. + +Signed-off-by: Suravee Suthikulpanit +Message-Id: <20250504170405.12623-3-suravee.suthikulpanit@amd.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. 
Tsirkin +(cherry picked from commit 28931c2e1591deb4bfaaf744fdc8813e96c230f1) + +JIRA: https://issues.redhat.com/browse/RHEL-70925 + +Signed-off-by: John Allen +--- + hw/i386/amd_iommu.c | 48 +++++++++++++++++++++++++++++++++++++++++++++ + hw/i386/amd_iommu.h | 2 +- + 2 files changed, 49 insertions(+), 1 deletion(-) + +diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c +index 6a5e76cfef..a34e0c5f59 100644 +--- a/hw/i386/amd_iommu.c ++++ b/hw/i386/amd_iommu.c +@@ -1611,8 +1611,55 @@ static void amdvi_sysbus_reset(DeviceState *dev) + amdvi_init(s); + } + ++static const VMStateDescription vmstate_amdvi_sysbus_migratable = { ++ .name = "amd-iommu", ++ .version_id = 1, ++ .minimum_version_id = 1, ++ .priority = MIG_PRI_IOMMU, ++ .fields = (VMStateField[]) { ++ /* Updated in amdvi_handle_control_write() */ ++ VMSTATE_BOOL(enabled, AMDVIState), ++ VMSTATE_BOOL(ga_enabled, AMDVIState), ++ VMSTATE_BOOL(ats_enabled, AMDVIState), ++ VMSTATE_BOOL(cmdbuf_enabled, AMDVIState), ++ VMSTATE_BOOL(completion_wait_intr, AMDVIState), ++ VMSTATE_BOOL(evtlog_enabled, AMDVIState), ++ VMSTATE_BOOL(evtlog_intr, AMDVIState), ++ /* Updated in amdvi_handle_devtab_write() */ ++ VMSTATE_UINT64(devtab, AMDVIState), ++ VMSTATE_UINT64(devtab_len, AMDVIState), ++ /* Updated in amdvi_handle_cmdbase_write() */ ++ VMSTATE_UINT64(cmdbuf, AMDVIState), ++ VMSTATE_UINT64(cmdbuf_len, AMDVIState), ++ /* Updated in amdvi_handle_cmdhead_write() */ ++ VMSTATE_UINT32(cmdbuf_head, AMDVIState), ++ /* Updated in amdvi_handle_cmdtail_write() */ ++ VMSTATE_UINT32(cmdbuf_tail, AMDVIState), ++ /* Updated in amdvi_handle_evtbase_write() */ ++ VMSTATE_UINT64(evtlog, AMDVIState), ++ VMSTATE_UINT32(evtlog_len, AMDVIState), ++ /* Updated in amdvi_handle_evthead_write() */ ++ VMSTATE_UINT32(evtlog_head, AMDVIState), ++ /* Updated in amdvi_handle_evttail_write() */ ++ VMSTATE_UINT32(evtlog_tail, AMDVIState), ++ /* Updated in amdvi_handle_pprbase_write() */ ++ VMSTATE_UINT64(ppr_log, AMDVIState), ++ 
VMSTATE_UINT32(pprlog_len, AMDVIState), ++ /* Updated in amdvi_handle_pprhead_write() */ ++ VMSTATE_UINT32(pprlog_head, AMDVIState), ++ /* Updated in amdvi_handle_tailhead_write() */ ++ VMSTATE_UINT32(pprlog_tail, AMDVIState), ++ /* MMIO registers */ ++ VMSTATE_UINT8_ARRAY(mmior, AMDVIState, AMDVI_MMIO_SIZE), ++ VMSTATE_UINT8_ARRAY(romask, AMDVIState, AMDVI_MMIO_SIZE), ++ VMSTATE_UINT8_ARRAY(w1cmask, AMDVIState, AMDVI_MMIO_SIZE), ++ VMSTATE_END_OF_LIST() ++ } ++}; ++ + static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) + { ++ DeviceClass *dc = (DeviceClass *) object_get_class(OBJECT(dev)); + AMDVIState *s = AMD_IOMMU_DEVICE(dev); + MachineState *ms = MACHINE(qdev_get_machine()); + PCMachineState *pcms = PC_MACHINE(ms); +@@ -1634,6 +1681,7 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) + } + + s->pci = AMD_IOMMU_PCI(pdev); ++ dc->vmsd = &vmstate_amdvi_sysbus_migratable; + } else { + s->pci = AMD_IOMMU_PCI(object_new(TYPE_AMD_IOMMU_PCI)); + /* This device should take care of IOMMU PCI properties */ +diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h +index ece71ff0b6..741dd9a910 100644 +--- a/hw/i386/amd_iommu.h ++++ b/hw/i386/amd_iommu.h +@@ -329,7 +329,7 @@ struct AMDVIState { + bool excl_enabled; + + hwaddr devtab; /* base address device table */ +- size_t devtab_len; /* device table length */ ++ uint64_t devtab_len; /* device table length */ + + hwaddr cmdbuf; /* command buffer base address */ + uint64_t cmdbuf_len; /* command buffer length */ +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-i386-amd_iommu-Assign-pci-id-0x1419-for-the-AMD-I.patch b/SOURCES/kvm-hw-i386-amd_iommu-Assign-pci-id-0x1419-for-the-AMD-I.patch new file mode 100644 index 0000000..4542745 --- /dev/null +++ b/SOURCES/kvm-hw-i386-amd_iommu-Assign-pci-id-0x1419-for-the-AMD-I.patch @@ -0,0 +1,57 @@ +From e611119b8b4e0712ab103628051d69ea84538719 Mon Sep 17 00:00:00 2001 +From: Suravee Suthikulpanit +Date: Tue, 25 Mar 2025 02:11:40 +0000 +Subject: [PATCH 23/57] 
hw/i386/amd_iommu: Assign pci-id 0x1419 for the AMD + IOMMU device +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: John Allen +RH-MergeRequest: 379: hw/i386/amd_iommu: Assign pci-id 0x1419 for the AMD IOMMU device +RH-Jira: RHEL-70926 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [1/1] 69d847f64543caf328da3e7663e7d2ebe53cd448 (johnalle/qemu-kvm-fork) + +Currently, the QEMU-emulated AMD IOMMU device uses PCI vendor id 0x1022 +(AMD) with device id zero (undefined). Even though this does not cause any +functional issue for AMD IOMMU driver since it normally uses information +in the ACPI IVRS table to probe and initialize the device per +recommendation in the AMD IOMMU specification, the device id zero causes +the Windows Device Manager utility to show the device as an unknown device. + +Since Windows only recognizes AMD IOMMU device with device id 0x1419 as +listed in the machine.inf file, modify the QEMU AMD IOMMU model to use +the id 0x1419 to avoid the issue. This advertises the IOMMU as the AMD +IOMMU device for Family 15h (Models 10h-1fh). + +Signed-off-by: Suravee Suthikulpanit +Message-Id: <20250325021140.5676-1-suravee.suthikulpanit@amd.com> +Reviewed-by: Daniel P. Berrangé +Reviewed-by: Yan Vugenfirer +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S.
Tsirkin +(cherry picked from commit 719255486df2fcbe1b8599786b37f4bb80272f1a) + +JIRA: https://issues.redhat.com/browse/RHEL-70926 + +Signed-off-by: John Allen +--- + hw/i386/amd_iommu.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c +index d804656ea8..59e1a01b7c 100644 +--- a/hw/i386/amd_iommu.c ++++ b/hw/i386/amd_iommu.c +@@ -1714,6 +1714,7 @@ static void amdvi_pci_class_init(ObjectClass *klass, void *data) + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + + k->vendor_id = PCI_VENDOR_ID_AMD; ++ k->device_id = 0x1419; + k->class_id = 0x0806; + k->realize = amdvi_pci_realize; + +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-i386-amd_iommu-Isolate-AMDVI-PCI-from-amd-iommu-d.patch b/SOURCES/kvm-hw-i386-amd_iommu-Isolate-AMDVI-PCI-from-amd-iommu-d.patch new file mode 100644 index 0000000..6da1d5d --- /dev/null +++ b/SOURCES/kvm-hw-i386-amd_iommu-Isolate-AMDVI-PCI-from-amd-iommu-d.patch @@ -0,0 +1,267 @@ +From 5a697d0f66360acca8216f49c06dc9702231d470 Mon Sep 17 00:00:00 2001 +From: Suravee Suthikulpanit +Date: Sun, 4 May 2025 17:04:04 +0000 +Subject: [PATCH 24/57] hw/i386/amd_iommu: Isolate AMDVI-PCI from amd-iommu + device to allow full control over the PCI device creation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: John Allen +RH-MergeRequest: 380: Add ability to manually specify the AMDVI-PCI device +RH-Jira: RHEL-70925 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [1/3] 58254a72ba2d810b57c610462494f76691126521 (johnalle/qemu-kvm-fork) + +Current amd-iommu model internally creates an AMDVI-PCI device. Here is +a snippet from info qtree: + + bus: main-system-bus + type System + dev: amd-iommu, id "" + xtsup = false + pci-id = "" + intremap = "on" + device-iotlb = false + pt = true + ... 
+ dev: q35-pcihost, id "" + MCFG = -1 (0xffffffffffffffff) + pci-hole64-size = 34359738368 (32 GiB) + below-4g-mem-size = 134217728 (128 MiB) + above-4g-mem-size = 0 (0 B) + smm-ranges = true + x-pci-hole64-fix = true + x-config-reg-migration-enabled = true + bypass-iommu = false + bus: pcie.0 + type PCIE + dev: AMDVI-PCI, id "" + addr = 01.0 + romfile = "" + romsize = 4294967295 (0xffffffff) + rombar = -1 (0xffffffffffffffff) + multifunction = false + x-pcie-lnksta-dllla = true + x-pcie-extcap-init = true + failover_pair_id = "" + acpi-index = 0 (0x0) + x-pcie-err-unc-mask = true + x-pcie-ari-nextfn-1 = false + x-max-bounce-buffer-size = 4096 (4 KiB) + x-pcie-ext-tag = true + busnr = 0 (0x0) + class Class 0806, addr 00:01.0, pci id 1022:0000 (sub 1af4:1100) + ... + +This prohibits users from specifying the PCI topology for the amd-iommu device, +which becomes a problem when trying to support VM migration since it does not +guarantee the same enumeration of AMD IOMMU device. + +Therefore, allow the 'AMDVI-PCI' device to optionally be pre-created and +associated with a 'amd-iommu' device via a new 'pci-id' parameter on the +latter. + +For example: + -device AMDVI-PCI,id=iommupci0,bus=pcie.0,addr=0x05 \ + -device amd-iommu,intremap=on,pt=on,xtsup=on,pci-id=iommupci0 \ + +For backward-compatibility, internally create the AMDVI-PCI device if not +specified on the CLI. + +Co-developed-by: Daniel P. Berrangé +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Suravee Suthikulpanit +Message-Id: <20250504170405.12623-2-suravee.suthikulpanit@amd.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. 
Tsirkin +(cherry picked from commit f864a3235ea1d1d714b3cde2d9a810ea6344a7b5) + +JIRA: https://issues.redhat.com/browse/RHEL-70925 + +Signed-off-by: John Allen +--- + hw/i386/acpi-build.c | 8 +++---- + hw/i386/amd_iommu.c | 53 ++++++++++++++++++++++++++------------------ + hw/i386/amd_iommu.h | 3 ++- + 3 files changed, 38 insertions(+), 26 deletions(-) + +diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c +index 032fb1f904..236261f8aa 100644 +--- a/hw/i386/acpi-build.c ++++ b/hw/i386/acpi-build.c +@@ -2392,10 +2392,10 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id, + build_append_int_noprefix(table_data, ivhd_blob->len + 24, 2); + /* DeviceID */ + build_append_int_noprefix(table_data, +- object_property_get_int(OBJECT(&s->pci), "addr", ++ object_property_get_int(OBJECT(s->pci), "addr", + &error_abort), 2); + /* Capability offset */ +- build_append_int_noprefix(table_data, s->pci.capab_offset, 2); ++ build_append_int_noprefix(table_data, s->pci->capab_offset, 2); + /* IOMMU base address */ + build_append_int_noprefix(table_data, s->mr_mmio.addr, 8); + /* PCI Segment Group */ +@@ -2427,10 +2427,10 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id, + build_append_int_noprefix(table_data, ivhd_blob->len + 40, 2); + /* DeviceID */ + build_append_int_noprefix(table_data, +- object_property_get_int(OBJECT(&s->pci), "addr", ++ object_property_get_int(OBJECT(s->pci), "addr", + &error_abort), 2); + /* Capability offset */ +- build_append_int_noprefix(table_data, s->pci.capab_offset, 2); ++ build_append_int_noprefix(table_data, s->pci->capab_offset, 2); + /* IOMMU base address */ + build_append_int_noprefix(table_data, s->mr_mmio.addr, 8); + /* PCI Segment Group */ +diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c +index 59e1a01b7c..6a5e76cfef 100644 +--- a/hw/i386/amd_iommu.c ++++ b/hw/i386/amd_iommu.c +@@ -167,11 +167,11 @@ static void amdvi_generate_msi_interrupt(AMDVIState *s) + { + MSIMessage msg = 
{}; + MemTxAttrs attrs = { +- .requester_id = pci_requester_id(&s->pci.dev) ++ .requester_id = pci_requester_id(&s->pci->dev) + }; + +- if (msi_enabled(&s->pci.dev)) { +- msg = msi_get_message(&s->pci.dev, 0); ++ if (msi_enabled(&s->pci->dev)) { ++ msg = msi_get_message(&s->pci->dev, 0); + address_space_stl_le(&address_space_memory, msg.address, msg.data, + attrs, NULL); + } +@@ -239,7 +239,7 @@ static void amdvi_page_fault(AMDVIState *s, uint16_t devid, + info |= AMDVI_EVENT_IOPF_I | AMDVI_EVENT_IOPF; + amdvi_encode_event(evt, devid, addr, info); + amdvi_log_event(s, evt); +- pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, ++ pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS, + PCI_STATUS_SIG_TARGET_ABORT); + } + /* +@@ -256,7 +256,7 @@ static void amdvi_log_devtab_error(AMDVIState *s, uint16_t devid, + + amdvi_encode_event(evt, devid, devtab, info); + amdvi_log_event(s, evt); +- pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, ++ pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS, + PCI_STATUS_SIG_TARGET_ABORT); + } + /* log an event trying to access command buffer +@@ -269,7 +269,7 @@ static void amdvi_log_command_error(AMDVIState *s, hwaddr addr) + + amdvi_encode_event(evt, 0, addr, info); + amdvi_log_event(s, evt); +- pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, ++ pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS, + PCI_STATUS_SIG_TARGET_ABORT); + } + /* log an illegal command event +@@ -310,7 +310,7 @@ static void amdvi_log_pagetab_error(AMDVIState *s, uint16_t devid, + info |= AMDVI_EVENT_PAGE_TAB_HW_ERROR; + amdvi_encode_event(evt, devid, addr, info); + amdvi_log_event(s, evt); +- pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, ++ pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS, + PCI_STATUS_SIG_TARGET_ABORT); + } + +@@ -1607,7 +1607,7 @@ static void amdvi_sysbus_reset(DeviceState *dev) + { + AMDVIState *s = AMD_IOMMU_DEVICE(dev); + +- msi_reset(&s->pci.dev); ++ 
msi_reset(&s->pci->dev); + amdvi_init(s); + } + +@@ -1619,14 +1619,32 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) + X86MachineState *x86ms = X86_MACHINE(ms); + PCIBus *bus = pcms->pcibus; + +- s->iotlb = g_hash_table_new_full(amdvi_uint64_hash, +- amdvi_uint64_equal, g_free, g_free); ++ if (s->pci_id) { ++ PCIDevice *pdev = NULL; ++ int ret = pci_qdev_find_device(s->pci_id, &pdev); + +- /* This device should take care of IOMMU PCI properties */ +- if (!qdev_realize(DEVICE(&s->pci), &bus->qbus, errp)) { +- return; ++ if (ret) { ++ error_report("Cannot find PCI device '%s'", s->pci_id); ++ return; ++ } ++ ++ if (!object_dynamic_cast(OBJECT(pdev), TYPE_AMD_IOMMU_PCI)) { ++ error_report("Device '%s' must be an AMDVI-PCI device type", s->pci_id); ++ return; ++ } ++ ++ s->pci = AMD_IOMMU_PCI(pdev); ++ } else { ++ s->pci = AMD_IOMMU_PCI(object_new(TYPE_AMD_IOMMU_PCI)); ++ /* This device should take care of IOMMU PCI properties */ ++ if (!qdev_realize(DEVICE(s->pci), &bus->qbus, errp)) { ++ return; ++ } + } + ++ s->iotlb = g_hash_table_new_full(amdvi_uint64_hash, ++ amdvi_uint64_equal, g_free, g_free); ++ + /* Pseudo address space under root PCI bus. 
*/ + x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID); + +@@ -1668,6 +1686,7 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) + + static Property amdvi_properties[] = { + DEFINE_PROP_BOOL("xtsup", AMDVIState, xtsup, false), ++ DEFINE_PROP_STRING("pci-id", AMDVIState, pci_id), + DEFINE_PROP_END_OF_LIST(), + }; + +@@ -1676,13 +1695,6 @@ static const VMStateDescription vmstate_amdvi_sysbus = { + .unmigratable = 1 + }; + +-static void amdvi_sysbus_instance_init(Object *klass) +-{ +- AMDVIState *s = AMD_IOMMU_DEVICE(klass); +- +- object_initialize(&s->pci, sizeof(s->pci), TYPE_AMD_IOMMU_PCI); +-} +- + static void amdvi_sysbus_class_init(ObjectClass *klass, void *data) + { + DeviceClass *dc = DEVICE_CLASS(klass); +@@ -1704,7 +1716,6 @@ static const TypeInfo amdvi_sysbus = { + .name = TYPE_AMD_IOMMU_DEVICE, + .parent = TYPE_X86_IOMMU_DEVICE, + .instance_size = sizeof(AMDVIState), +- .instance_init = amdvi_sysbus_instance_init, + .class_init = amdvi_sysbus_class_init + }; + +diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h +index e0dac4d9a9..ece71ff0b6 100644 +--- a/hw/i386/amd_iommu.h ++++ b/hw/i386/amd_iommu.h +@@ -315,7 +315,8 @@ struct AMDVIPCIState { + + struct AMDVIState { + X86IOMMUState iommu; /* IOMMU bus device */ +- AMDVIPCIState pci; /* IOMMU PCI device */ ++ AMDVIPCIState *pci; /* IOMMU PCI device */ ++ char *pci_id; /* ID of AMDVI-PCI device, if user created */ + + uint32_t version; + +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-i386-intel-iommu-Migrate-to-3-phase-reset.patch b/SOURCES/kvm-hw-i386-intel-iommu-Migrate-to-3-phase-reset.patch new file mode 100644 index 0000000..827c43c --- /dev/null +++ b/SOURCES/kvm-hw-i386-intel-iommu-Migrate-to-3-phase-reset.patch @@ -0,0 +1,96 @@ +From 67b281dc1ccdae05da6c6052c264ecd94723c0b2 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 18 Feb 2025 19:25:32 +0100 +Subject: [PATCH 2/9] hw/i386/intel-iommu: Migrate to 3-phase reset +MIME-Version: 1.0 +Content-Type: 
text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eric Auger +RH-MergeRequest: 341: Fix vIOMMU reset order +RH-Jira: RHEL-7188 +RH-Acked-by: Peter Xu +RH-Acked-by: Donald Dutile +RH-Acked-by: Cédric Le Goater +RH-Commit: [2/5] 5b9b60b2b796529db10b846881e82e7df4626ec1 (eauger1/centos-qemu-kvm) + +Currently the IOMMU may be reset before the devices +it protects. For example this happens with virtio devices +but also with VFIO devices. In this latter case this +produces spurious translation faults on host. + +Let's use 3-phase reset mechanism and reset the IOMMU on +exit phase after all DMA capable devices have been reset +on 'enter' or 'hold' phase. + +Signed-off-by: Eric Auger +Acked-by: Michael S. Tsirkin +Acked-by: Jason Wang +Zhenzhong Duan + +Message-Id: <20250218182737.76722-3-eric.auger@redhat.com> +Reviewed-by: Peter Xu +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit 2aaf48bcf27d8b3da5b30af6c1ced464d3df30f7) +Signed-off-by: Eric Auger + +Conflicts: Code change + hw/i386/intel_iommu.c +We miss e3d0814368d0 ("hw: Use device_class_set_legacy_reset() instead +of opencoding") meaning that instead of removing +device_class_set_legacy_reset(dc, vtd_reset) we remove +dc->reset = vtd_reset; +--- + hw/i386/intel_iommu.c | 12 +++++++++--- + hw/i386/trace-events | 1 + + 2 files changed, 10 insertions(+), 3 deletions(-) + +diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c +index 16d2885fcc..4acefcf5c8 100644 +--- a/hw/i386/intel_iommu.c ++++ b/hw/i386/intel_iommu.c +@@ -4212,10 +4212,11 @@ static void vtd_init(IntelIOMMUState *s) + /* Should not reset address_spaces when reset because devices will still use + * the address space they got at first (won't ask the bus again). 
+ */ +-static void vtd_reset(DeviceState *dev) ++static void vtd_reset_exit(Object *obj, ResetType type) + { +- IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); ++ IntelIOMMUState *s = INTEL_IOMMU_DEVICE(obj); + ++ trace_vtd_reset_exit(); + vtd_init(s); + vtd_address_space_refresh_all(s); + } +@@ -4367,8 +4368,13 @@ static void vtd_class_init(ObjectClass *klass, void *data) + { + DeviceClass *dc = DEVICE_CLASS(klass); + X86IOMMUClass *x86_class = X86_IOMMU_DEVICE_CLASS(klass); ++ ResettableClass *rc = RESETTABLE_CLASS(klass); + +- dc->reset = vtd_reset; ++ /* ++ * Use 'exit' reset phase to make sure all DMA requests ++ * have been quiesced during 'enter' or 'hold' phase ++ */ ++ rc->phases.exit = vtd_reset_exit; + dc->vmsd = &vtd_vmstate; + device_class_set_props(dc, vtd_properties); + dc->hotpluggable = false; +diff --git a/hw/i386/trace-events b/hw/i386/trace-events +index 53c02d7ac8..ac9e1a10aa 100644 +--- a/hw/i386/trace-events ++++ b/hw/i386/trace-events +@@ -68,6 +68,7 @@ vtd_frr_new(int index, uint64_t hi, uint64_t lo) "index %d high 0x%"PRIx64" low + vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16 + vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 0x%"PRIx16" index %d vec %d (should be: %d)" + vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 0x%"PRIx16" index %d trigger %d (should be: %d)" ++vtd_reset_exit(void) "" + + # amd_iommu.c + amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32 +-- +2.48.1 + diff --git a/SOURCES/kvm-hw-misc-aspeed_hace-skip-automatic-zero-init-of-larg.patch b/SOURCES/kvm-hw-misc-aspeed_hace-skip-automatic-zero-init-of-larg.patch new file mode 100644 index 0000000..a1553f8 --- /dev/null +++ b/SOURCES/kvm-hw-misc-aspeed_hace-skip-automatic-zero-init-of-larg.patch @@ -0,0 +1,57 @@ +From 0bfbd2c49c01ee77d3b5a21bf9fe675916cbf0ed Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 
13:36:56 +0100 +Subject: [PATCH 44/57] hw/misc/aspeed_hace: skip automatic zero-init of large + array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [18/30] ec8510be6b23b26b3eecd6767e1deb0c0c50dd58 (stefanha/centos-stream-qemu-kvm) + +The 'do_hash_operation' method has a 256 element iovec array used for +holding pointers to data that is to be hashed. Skip the automatic +zero-init of this array to eliminate the performance overhead in the +I/O hot path. + +The 'iovec' array will be selectively initialized based on data that +needs to be hashed. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-19-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 6992c886838282f36b20deee44b666bbfc573a8f) +Signed-off-by: Stefan Hajnoczi + +Conflicts: + hw/misc/aspeed_hace.c + + Context conflict due to missing commit b9ccbe212e24 + ("hw/misc/aspeed_hace: Extract accumulation-mode hash execution into + helper function") downstream. The commit is not a bug fix, so there is + no need to backport it. 
+--- + hw/misc/aspeed_hace.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/misc/aspeed_hace.c b/hw/misc/aspeed_hace.c +index c06c04ddc6..d2118f1864 100644 +--- a/hw/misc/aspeed_hace.c ++++ b/hw/misc/aspeed_hace.c +@@ -188,7 +188,7 @@ static int gen_acc_mode_iov(AspeedHACEState *s, struct iovec *iov, int id, + static void do_hash_operation(AspeedHACEState *s, int algo, bool sg_mode, + bool acc_mode) + { +- struct iovec iov[ASPEED_HACE_MAX_SG]; ++ QEMU_UNINITIALIZED struct iovec iov[ASPEED_HACE_MAX_SG]; + g_autofree uint8_t *digest_buf = NULL; + size_t digest_len = 0; + int niov = 0; +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-net-rtl8139-skip-automatic-zero-init-of-large-arr.patch b/SOURCES/kvm-hw-net-rtl8139-skip-automatic-zero-init-of-large-arr.patch new file mode 100644 index 0000000..8161972 --- /dev/null +++ b/SOURCES/kvm-hw-net-rtl8139-skip-automatic-zero-init-of-large-arr.patch @@ -0,0 +1,48 @@ +From cc173deaaa4d9dc6ad9188e0b03f46b7e64f26b2 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:57 +0100 +Subject: [PATCH 45/57] hw/net/rtl8139: skip automatic zero-init of large array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [19/30] 344c720aef2feb35f84fd4b21f2b1b31e5572286 (stefanha/centos-stream-qemu-kvm) + +The 'rtl8139_transmit_one' method has a 8k byte array used for +copying data between guest and host. Skip the automatic zero-init +of this array to eliminate the performance overhead in the I/O +hot path. + +The 'txbuffer' will be fully initialized when reading PCI DMA +buffers. + +Signed-off-by: Daniel P. 
Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-20-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 3ccc6489dd4925ddd1f3066bd3751389169cd7aa) +Signed-off-by: Stefan Hajnoczi +--- + hw/net/rtl8139.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c +index f2fe057535..a2732bf1c1 100644 +--- a/hw/net/rtl8139.c ++++ b/hw/net/rtl8139.c +@@ -1818,7 +1818,7 @@ static int rtl8139_transmit_one(RTL8139State *s, int descriptor) + + PCIDevice *d = PCI_DEVICE(s); + int txsize = s->TxStatus[descriptor] & 0x1fff; +- uint8_t txbuffer[0x2000]; ++ QEMU_UNINITIALIZED uint8_t txbuffer[0x2000]; + + DPRINTF("+++ transmit reading %d bytes from host memory at 0x%08x\n", + txsize, s->TxAddr[descriptor]); +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-net-tulip-skip-automatic-zero-init-of-large-array.patch b/SOURCES/kvm-hw-net-tulip-skip-automatic-zero-init-of-large-array.patch new file mode 100644 index 0000000..06ea05e --- /dev/null +++ b/SOURCES/kvm-hw-net-tulip-skip-automatic-zero-init-of-large-array.patch @@ -0,0 +1,47 @@ +From 400b5c8ae7f06a450ef91230343d7ce489142a38 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:58 +0100 +Subject: [PATCH 46/57] hw/net/tulip: skip automatic zero-init of large array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [20/30] b3d29de8495c0ff40c26974673adefe4eb27a417 (stefanha/centos-stream-qemu-kvm) + +The 'tulip_setup_frame' method has a 4k byte array used for copying +DMA data from the device. Skip the automatic zero-init of this array +to eliminate the performance overhead in the I/O hot path.
+ +The 'buf' array will be fully initialized when reading data from the +device. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-21-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit e1afd5ee6eb2954f4baf3c97820e4aaf7de97d2a) +Signed-off-by: Stefan Hajnoczi +--- + hw/net/tulip.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/net/tulip.c b/hw/net/tulip.c +index 1f2ef20977..5cf2b96fbd 100644 +--- a/hw/net/tulip.c ++++ b/hw/net/tulip.c +@@ -629,7 +629,7 @@ static void tulip_setup_filter_addr(TULIPState *s, uint8_t *buf, int n) + static void tulip_setup_frame(TULIPState *s, + struct tulip_descriptor *desc) + { +- uint8_t buf[4096]; ++ QEMU_UNINITIALIZED uint8_t buf[4096]; + int len = (desc->control >> TDES1_BUF1_SIZE_SHIFT) & TDES1_BUF1_SIZE_MASK; + int i; + +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-net-virtio-net-skip-automatic-zero-init-of-large-.patch b/SOURCES/kvm-hw-net-virtio-net-skip-automatic-zero-init-of-large-.patch new file mode 100644 index 0000000..4fbe7a4 --- /dev/null +++ b/SOURCES/kvm-hw-net-virtio-net-skip-automatic-zero-init-of-large-.patch @@ -0,0 +1,54 @@ +From 0925796a4537e20e033a675ebc8899e4580235f3 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:36:59 +0100 +Subject: [PATCH 47/57] hw/net/virtio-net: skip automatic zero-init of large + arrays +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [21/30] 0450189a4c4c779b5a1850e9ea8278a5129c5f7f (stefanha/centos-stream-qemu-kvm) + +The 'virtio_net_receive_rcu' method has three arrays with +VIRTQUEUE_MAX_SIZE elements, which are approximately 32k in +size used for copying data between guest and host.
Skip the +automatic zero-init of these arrays to eliminate the +performance overhead in the I/O hot path. + +The three arrays will be selectively initialized as required +when processing network buffers. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-22-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 21cf31c51a7aeff4270c9b30b37e019c536d54b2) +Signed-off-by: Stefan Hajnoczi +--- + hw/net/virtio-net.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c +index 3d2b2460ad..086ea20ea0 100644 +--- a/hw/net/virtio-net.c ++++ b/hw/net/virtio-net.c +@@ -1895,9 +1895,9 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, + VirtIONet *n = qemu_get_nic_opaque(nc); + VirtIONetQueue *q = virtio_net_get_subqueue(nc); + VirtIODevice *vdev = VIRTIO_DEVICE(n); +- VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE]; +- size_t lens[VIRTQUEUE_MAX_SIZE]; +- struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE]; ++ QEMU_UNINITIALIZED VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE]; ++ QEMU_UNINITIALIZED size_t lens[VIRTQUEUE_MAX_SIZE]; ++ QEMU_UNINITIALIZED struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE]; + struct virtio_net_hdr_v1_hash extra_hdr; + unsigned mhdr_cnt = 0; + size_t offset, i, guest_offset, j; +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-net-xgamc-skip-automatic-zero-init-of-large-array.patch b/SOURCES/kvm-hw-net-xgamc-skip-automatic-zero-init-of-large-array.patch new file mode 100644 index 0000000..027ab99 --- /dev/null +++ b/SOURCES/kvm-hw-net-xgamc-skip-automatic-zero-init-of-large-array.patch @@ -0,0 +1,47 @@ +From 34116b3a243f005938a30e9b38c6f47a62752c3e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:37:00 +0100 +Subject: [PATCH 48/57] hw/net/xgamc: skip automatic zero-init of large array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 
+Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [22/30] 63536d627705775c4bf72a511de3d68ec30ac7de (stefanha/centos-stream-qemu-kvm) + +The 'xgmac_enet_send' method has a 8k byte array used for copying +data between guest and host. Skip the automatic zero-init of this +array to eliminate the performance overhead in the I/O hot path. + +The 'frame' buffer will be fully initialized when reading guest +memory to fetch the data to send. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-23-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 8b723287b84a62bb5d1a7799ef0959ca8e6c293a) +Signed-off-by: Stefan Hajnoczi +--- + hw/net/xgmac.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/net/xgmac.c b/hw/net/xgmac.c +index ffe3fc8dbe..eff8022aca 100644 +--- a/hw/net/xgmac.c ++++ b/hw/net/xgmac.c +@@ -207,7 +207,7 @@ static void xgmac_enet_send(XgmacState *s) + struct desc bd; + int frame_size; + int len; +- uint8_t frame[8192]; ++ QEMU_UNINITIALIZED uint8_t frame[8192]; + uint8_t *ptr; + + ptr = frame; +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-nvme-ctrl-skip-automatic-zero-init-of-large-array.patch b/SOURCES/kvm-hw-nvme-ctrl-skip-automatic-zero-init-of-large-array.patch new file mode 100644 index 0000000..6a84a1c --- /dev/null +++ b/SOURCES/kvm-hw-nvme-ctrl-skip-automatic-zero-init-of-large-array.patch @@ -0,0 +1,72 @@ +From 3e0134b45828bf9a623a26ac41d5fbb3a8d2917b Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:37:01 +0100 +Subject: [PATCH 49/57] hw/nvme/ctrl: skip automatic zero-init of large arrays +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: 
Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [23/30] 57ce4361ffb307be4ea4d3edf9e0dac269d16908 (stefanha/centos-stream-qemu-kvm) + +The 'nvme_map_sgl' method has a 256 element array used for copying +data from the device. Skip the automatic zero-init of this array +to eliminate the performance overhead in the I/O hot path. + +The 'segment' array will be fully initialized when reading data from +the device. + +The 'nvme_changed_nslist' method has a 4k byte array that is manually +initialized with memset(). The compiler ought to be intelligent +enough to turn the memset() into a static initialization operation, +and thus not duplicate the automatic zero-init. Replacing memset() +with '{}' makes it unambiguous that the array is statically initialized. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Reviewed-by: Klaus Jensen +Message-id: 20250610123709.835102-24-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 7eeb1d3acc175813ad3d5e824f26123e0992093a) +Signed-off-by: Stefan Hajnoczi +--- + hw/nvme/ctrl.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c +index d451ee0d00..75d7f20801 100644 +--- a/hw/nvme/ctrl.c ++++ b/hw/nvme/ctrl.c +@@ -1047,7 +1047,8 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl, + */ + #define SEG_CHUNK_SIZE 256 + +- NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld; ++ QEMU_UNINITIALIZED NvmeSglDescriptor segment[SEG_CHUNK_SIZE]; ++ NvmeSglDescriptor *sgld, *last_sgld; + uint64_t nsgld; + uint32_t seg_len; + uint16_t status; +@@ -5029,7 +5030,7 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, + static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, + uint64_t off, NvmeRequest *req) + { +- uint32_t nslist[1024]; ++ uint32_t nslist[1024] = {}; +
uint32_t trans_len; + int i = 0; + uint32_t nsid; +@@ -5039,7 +5040,6 @@ static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, + return NVME_INVALID_FIELD | NVME_DNR; + } + +- memset(nslist, 0x0, sizeof(nslist)); + trans_len = MIN(sizeof(nslist) - off, buf_len); + + while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) != +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-pci-Basic-support-for-PCI-power-management.patch b/SOURCES/kvm-hw-pci-Basic-support-for-PCI-power-management.patch new file mode 100644 index 0000000..6287a46 --- /dev/null +++ b/SOURCES/kvm-hw-pci-Basic-support-for-PCI-power-management.patch @@ -0,0 +1,242 @@ +From 98b0cd83c09d35a3da0ae142c09038174355e87e Mon Sep 17 00:00:00 2001 +From: Alex Williamson +Date: Tue, 25 Feb 2025 14:52:25 -0700 +Subject: [PATCH 2/7] hw/pci: Basic support for PCI power management +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eric Auger +RH-MergeRequest: 348: PCI: Implement basic PCI PM capability backing +RH-Jira: RHEL-7301 +RH-Acked-by: Cédric Le Goater +RH-Acked-by: Alex Williamson +RH-Acked-by: Jon Maloy +RH-Commit: [2/6] 5faff6382c124711887704fff4f857e8f85e7be5 (eauger1/centos-qemu-kvm) + +Conflicts: contextual conflict in include/hw/pci/pci.h +we don't have 449dca6ac93a ("pcie: enable Extended tag field support") +downstream so we don't have x-pcie-ext-tag definition. + +The memory and IO BARs for devices are only accessible in the D0 power +state. In other power states the PCI spec defines that the device +responds to TLPs and messages with an Unsupported Request response. + +To approximate this behavior, consider the BARs as unmapped when the +device is not in the D0 power state. This makes the BARs inaccessible +and has the additional bonus for vfio-pci that we don't attempt to DMA +map BARs for devices in a non-D0 power state. 
+ +To support this, an interface is added for devices to register the PM +capability, which allows central tracking to enforce valid transitions +and unmap BARs in non-D0 states. + +NB. We currently have device models (eepro100 and pcie_pci_bridge) +that register a PM capability but do not set wmask to enable writes to +the power state field. In order to maintain migration compatibility, +this new helper does not manage the wmask to enable guest writes to +initiate a power state change. The contents and write access of the +PM capability are still managed by the caller. + +Cc: Michael S. Tsirkin +Cc: Marcel Apfelbaum +Signed-off-by: Alex Williamson +Reviewed-by: Eric Auger +Reviewed-by: Michael S. Tsirkin +Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-2-alex.williamson@redhat.com +Signed-off-by: Cédric Le Goater +(cherry picked from commit 9461afd2008b0820fc45a6a7bc675df1b6791e4f) +Signed-off-by: Eric Auger +--- + hw/pci/pci.c | 93 ++++++++++++++++++++++++++++++++++++- + hw/pci/trace-events | 2 + + include/hw/pci/pci.h | 3 ++ + include/hw/pci/pci_device.h | 3 ++ + 4 files changed, 99 insertions(+), 2 deletions(-) + +diff --git a/hw/pci/pci.c b/hw/pci/pci.c +index 83c9d5b9ea..d774ae47d2 100644 +--- a/hw/pci/pci.c ++++ b/hw/pci/pci.c +@@ -365,6 +365,84 @@ static void pci_msi_trigger(PCIDevice *dev, MSIMessage msg) + attrs, NULL); + } + ++/* ++ * Register and track a PM capability. If wmask is also enabled for the power ++ * state field of the pmcsr register, guest writes may change the device PM ++ * state. BAR access is only enabled while the device is in the D0 state. ++ * Return the capability offset or negative error code. 
++ */ ++int pci_pm_init(PCIDevice *d, uint8_t offset, Error **errp) ++{ ++ int cap = pci_add_capability(d, PCI_CAP_ID_PM, offset, PCI_PM_SIZEOF, errp); ++ ++ if (cap < 0) { ++ return cap; ++ } ++ ++ d->pm_cap = cap; ++ d->cap_present |= QEMU_PCI_CAP_PM; ++ ++ return cap; ++} ++ ++static uint8_t pci_pm_state(PCIDevice *d) ++{ ++ uint16_t pmcsr; ++ ++ if (!(d->cap_present & QEMU_PCI_CAP_PM)) { ++ return 0; ++ } ++ ++ pmcsr = pci_get_word(d->config + d->pm_cap + PCI_PM_CTRL); ++ ++ return pmcsr & PCI_PM_CTRL_STATE_MASK; ++} ++ ++/* ++ * Update the PM capability state based on the new value stored in config ++ * space respective to the old, pre-write state provided. If the new value ++ * is rejected (unsupported or invalid transition) restore the old value. ++ * Return the resulting PM state. ++ */ ++static uint8_t pci_pm_update(PCIDevice *d, uint32_t addr, int l, uint8_t old) ++{ ++ uint16_t pmc; ++ uint8_t new; ++ ++ if (!(d->cap_present & QEMU_PCI_CAP_PM) || ++ !range_covers_byte(addr, l, d->pm_cap + PCI_PM_CTRL)) { ++ return old; ++ } ++ ++ new = pci_pm_state(d); ++ if (new == old) { ++ return old; ++ } ++ ++ pmc = pci_get_word(d->config + d->pm_cap + PCI_PM_PMC); ++ ++ /* ++ * Transitions to D1 & D2 are only allowed if supported. Devices may ++ * only transition to higher D-states or to D0. 
++ */ ++ if ((!(pmc & PCI_PM_CAP_D1) && new == 1) || ++ (!(pmc & PCI_PM_CAP_D2) && new == 2) || ++ (old && new && new < old)) { ++ pci_word_test_and_clear_mask(d->config + d->pm_cap + PCI_PM_CTRL, ++ PCI_PM_CTRL_STATE_MASK); ++ pci_word_test_and_set_mask(d->config + d->pm_cap + PCI_PM_CTRL, ++ old); ++ trace_pci_pm_bad_transition(d->name, pci_dev_bus_num(d), ++ PCI_SLOT(d->devfn), PCI_FUNC(d->devfn), ++ old, new); ++ return old; ++ } ++ ++ trace_pci_pm_transition(d->name, pci_dev_bus_num(d), PCI_SLOT(d->devfn), ++ PCI_FUNC(d->devfn), old, new); ++ return new; ++} ++ + static void pci_reset_regions(PCIDevice *dev) + { + int r; +@@ -404,6 +482,11 @@ static void pci_do_device_reset(PCIDevice *dev) + pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) | + pci_get_word(dev->w1cmask + PCI_INTERRUPT_LINE)); + dev->config[PCI_CACHE_LINE_SIZE] = 0x0; ++ /* Default PM state is D0 */ ++ if (dev->cap_present & QEMU_PCI_CAP_PM) { ++ pci_word_test_and_clear_mask(dev->config + dev->pm_cap + PCI_PM_CTRL, ++ PCI_PM_CTRL_STATE_MASK); ++ } + pci_reset_regions(dev); + pci_update_mappings(dev); + +@@ -1525,7 +1608,7 @@ static void pci_update_mappings(PCIDevice *d) + continue; + + new_addr = pci_bar_address(d, i, r->type, r->size); +- if (!d->enabled) { ++ if (!d->enabled || pci_pm_state(d)) { + new_addr = PCI_BAR_UNMAPPED; + } + +@@ -1591,6 +1674,7 @@ uint32_t pci_default_read_config(PCIDevice *d, + + void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int l) + { ++ uint8_t new_pm_state, old_pm_state = pci_pm_state(d); + int i, was_irq_disabled = pci_irq_disabled(d); + uint32_t val = val_in; + +@@ -1603,11 +1687,16 @@ void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int + d->config[addr + i] = (d->config[addr + i] & ~wmask) | (val & wmask); + d->config[addr + i] &= ~(val & w1cmask); /* W1C: Write 1 to Clear */ + } ++ ++ new_pm_state = pci_pm_update(d, addr, l, old_pm_state); ++ + if (ranges_overlap(addr, l, PCI_BASE_ADDRESS_0, 24) || + 
ranges_overlap(addr, l, PCI_ROM_ADDRESS, 4) || + ranges_overlap(addr, l, PCI_ROM_ADDRESS1, 4) || +- range_covers_byte(addr, l, PCI_COMMAND)) ++ range_covers_byte(addr, l, PCI_COMMAND) || ++ !!new_pm_state != !!old_pm_state) { + pci_update_mappings(d); ++ } + + if (ranges_overlap(addr, l, PCI_COMMAND, 2)) { + pci_update_irq_disabled(d, was_irq_disabled); +diff --git a/hw/pci/trace-events b/hw/pci/trace-events +index 19643aa8c6..c82a87ffdd 100644 +--- a/hw/pci/trace-events ++++ b/hw/pci/trace-events +@@ -1,6 +1,8 @@ + # See docs/devel/tracing.rst for syntax documentation. + + # pci.c ++pci_pm_bad_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, uint8_t old, uint8_t new) "%s %02x:%02x.%x REJECTED PM transition D%d->D%d" ++pci_pm_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, uint8_t old, uint8_t new) "%s %02x:%02x.%x PM transition D%d->D%d" + pci_update_mappings_del(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x %d,0x%"PRIx64"+0x%"PRIx64 + pci_update_mappings_add(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x %d,0x%"PRIx64"+0x%"PRIx64 + pci_route_irq(int dev_irq, const char *dev_path, int parent_irq, const char *parent_path) "IRQ %d @%s -> IRQ %d @%s" +diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h +index 45365ae085..afeb5a2263 100644 +--- a/include/hw/pci/pci.h ++++ b/include/hw/pci/pci.h +@@ -213,6 +213,8 @@ enum { + QEMU_PCIE_ERR_UNC_MASK = (1 << QEMU_PCIE_ERR_UNC_MASK_BITNR), + #define QEMU_PCIE_ARI_NEXTFN_1_BITNR 12 + QEMU_PCIE_ARI_NEXTFN_1 = (1 << QEMU_PCIE_ARI_NEXTFN_1_BITNR), ++#define QEMU_PCI_CAP_PM_BITNR 14 ++ QEMU_PCI_CAP_PM = (1 << QEMU_PCI_CAP_PM_BITNR), + }; + + typedef struct PCIINTxRoute { +@@ -680,5 +682,6 @@ static inline void pci_irq_pulse(PCIDevice *pci_dev) + MSIMessage pci_get_msi_message(PCIDevice *dev, int vector); + void pci_set_enabled(PCIDevice 
*pci_dev, bool state); + void pci_set_power(PCIDevice *pci_dev, bool state); ++int pci_pm_init(PCIDevice *pci_dev, uint8_t offset, Error **errp); + + #endif +diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h +index f38fb31119..325d7bcaf7 100644 +--- a/include/hw/pci/pci_device.h ++++ b/include/hw/pci/pci_device.h +@@ -105,6 +105,9 @@ struct PCIDevice { + /* Capability bits */ + uint32_t cap_present; + ++ /* Offset of PM capability in config space */ ++ uint8_t pm_cap; ++ + /* Offset of MSI-X capability in config space */ + uint8_t msix_cap; + +-- +2.48.1 + diff --git a/SOURCES/kvm-hw-pci-Rename-has_power-to-enabled.patch b/SOURCES/kvm-hw-pci-Rename-has_power-to-enabled.patch new file mode 100644 index 0000000..4041ddb --- /dev/null +++ b/SOURCES/kvm-hw-pci-Rename-has_power-to-enabled.patch @@ -0,0 +1,130 @@ +From 8711bb1a54d4f5734d44545cd8e7262bc358f51d Mon Sep 17 00:00:00 2001 +From: Akihiko Odaki +Date: Thu, 9 Jan 2025 15:29:46 +0900 +Subject: [PATCH 1/7] hw/pci: Rename has_power to enabled +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eric Auger +RH-MergeRequest: 348: PCI: Implement basic PCI PM capability backing +RH-Jira: RHEL-7301 +RH-Acked-by: Cédric Le Goater +RH-Acked-by: Alex Williamson +RH-Acked-by: Jon Maloy +RH-Commit: [1/6] ac8a7427a1203e33aa323933818a7114c0eb4520 (eauger1/centos-qemu-kvm) + +The renamed state will not only represent powering state of PFs, but +also represent SR-IOV VF enablement in the future. 
+ +Signed-off-by: Akihiko Odaki +Reviewed-by: Philippe Mathieu-Daudé +Message-ID: <20250109-reuse-v19-1-f541e82ca5f7@daynix.com> +Signed-off-by: Philippe Mathieu-Daudé +(cherry picked from commit c407eef162f765dd83d45e048585731be41a66fc) +Signed-off-by: Eric Auger +--- + hw/pci/pci.c | 17 +++++++++++------ + hw/pci/pci_host.c | 4 ++-- + include/hw/pci/pci.h | 1 + + include/hw/pci/pci_device.h | 2 +- + 4 files changed, 15 insertions(+), 9 deletions(-) + +diff --git a/hw/pci/pci.c b/hw/pci/pci.c +index fab86d0567..83c9d5b9ea 100644 +--- a/hw/pci/pci.c ++++ b/hw/pci/pci.c +@@ -1525,7 +1525,7 @@ static void pci_update_mappings(PCIDevice *d) + continue; + + new_addr = pci_bar_address(d, i, r->type, r->size); +- if (!d->has_power) { ++ if (!d->enabled) { + new_addr = PCI_BAR_UNMAPPED; + } + +@@ -1613,7 +1613,7 @@ void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int + pci_update_irq_disabled(d, was_irq_disabled); + memory_region_set_enabled(&d->bus_master_enable_region, + (pci_get_word(d->config + PCI_COMMAND) +- & PCI_COMMAND_MASTER) && d->has_power); ++ & PCI_COMMAND_MASTER) && d->enabled); + } + + msi_write_config(d, addr, val_in, l); +@@ -2886,16 +2886,21 @@ MSIMessage pci_get_msi_message(PCIDevice *dev, int vector) + + void pci_set_power(PCIDevice *d, bool state) + { +- if (d->has_power == state) { ++ pci_set_enabled(d, state); ++} ++ ++void pci_set_enabled(PCIDevice *d, bool state) ++{ ++ if (d->enabled == state) { + return; + } + +- d->has_power = state; ++ d->enabled = state; + pci_update_mappings(d); + memory_region_set_enabled(&d->bus_master_enable_region, + (pci_get_word(d->config + PCI_COMMAND) +- & PCI_COMMAND_MASTER) && d->has_power); +- if (!d->has_power) { ++ & PCI_COMMAND_MASTER) && d->enabled); ++ if (!d->enabled) { + pci_device_reset(d); + } + } +diff --git a/hw/pci/pci_host.c b/hw/pci/pci_host.c +index dfe6fe6184..0d82727cc9 100644 +--- a/hw/pci/pci_host.c ++++ b/hw/pci/pci_host.c +@@ -86,7 +86,7 @@ void 
pci_host_config_write_common(PCIDevice *pci_dev, uint32_t addr, + * allowing direct removal of unexposed functions. + */ + if ((pci_dev->qdev.hotplugged && !pci_get_function_0(pci_dev)) || +- !pci_dev->has_power || is_pci_dev_ejected(pci_dev)) { ++ !pci_dev->enabled || is_pci_dev_ejected(pci_dev)) { + return; + } + +@@ -111,7 +111,7 @@ uint32_t pci_host_config_read_common(PCIDevice *pci_dev, uint32_t addr, + * allowing direct removal of unexposed functions. + */ + if ((pci_dev->qdev.hotplugged && !pci_get_function_0(pci_dev)) || +- !pci_dev->has_power || is_pci_dev_ejected(pci_dev)) { ++ !pci_dev->enabled || is_pci_dev_ejected(pci_dev)) { + return ~0x0; + } + +diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h +index eb26cac810..45365ae085 100644 +--- a/include/hw/pci/pci.h ++++ b/include/hw/pci/pci.h +@@ -678,6 +678,7 @@ static inline void pci_irq_pulse(PCIDevice *pci_dev) + } + + MSIMessage pci_get_msi_message(PCIDevice *dev, int vector); ++void pci_set_enabled(PCIDevice *pci_dev, bool state); + void pci_set_power(PCIDevice *pci_dev, bool state); + + #endif +diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h +index 15694f2489..f38fb31119 100644 +--- a/include/hw/pci/pci_device.h ++++ b/include/hw/pci/pci_device.h +@@ -57,7 +57,7 @@ typedef struct PCIReqIDCache PCIReqIDCache; + struct PCIDevice { + DeviceState qdev; + bool partially_hotplugged; +- bool has_power; ++ bool enabled; + + /* PCI config space */ + uint8_t *config; +-- +2.48.1 + diff --git a/SOURCES/kvm-hw-ppc-spapr_tpm_proxy-skip-automatic-zero-init-of-l.patch b/SOURCES/kvm-hw-ppc-spapr_tpm_proxy-skip-automatic-zero-init-of-l.patch new file mode 100644 index 0000000..4b12664 --- /dev/null +++ b/SOURCES/kvm-hw-ppc-spapr_tpm_proxy-skip-automatic-zero-init-of-l.patch @@ -0,0 +1,52 @@ +From 4c3fe6e7b88c58713c0c499d4bf0658a055ee52e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:37:03 +0100 +Subject: [PATCH 50/57] 
hw/ppc/spapr_tpm_proxy: skip automatic zero-init of + large arrays +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [24/30] 8d963380c64a33a27adc99738b42b52864229111 (stefanha/centos-stream-qemu-kvm) + +The 'tpm_execute' method has a pair of 4k arrays used for copying +data between guest and host. Skip the automatic zero-init of these +arrays to eliminate the performance overhead in the I/O hot path. + +The two arrays will be fully initialized when reading data from +guest memory or reading data from the proxy FD. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Reviewed-by: Klaus Jensen +Reviewed-by: Harsh Prateek Bora +Message-id: 20250610123709.835102-26-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 5dd9087fff74b5672526cad254e76f790fb35c7a) +Signed-off-by: Stefan Hajnoczi +--- + hw/ppc/spapr_tpm_proxy.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/hw/ppc/spapr_tpm_proxy.c b/hw/ppc/spapr_tpm_proxy.c +index e10af35a18..88833d9e2e 100644 +--- a/hw/ppc/spapr_tpm_proxy.c ++++ b/hw/ppc/spapr_tpm_proxy.c +@@ -41,8 +41,8 @@ static ssize_t tpm_execute(SpaprTpmProxy *tpm_proxy, target_ulong *args) + target_ulong data_in_size = args[2]; + uint64_t data_out = ppc64_phys_to_real(args[3]); + target_ulong data_out_size = args[4]; +- uint8_t buf_in[TPM_SPAPR_BUFSIZE]; +- uint8_t buf_out[TPM_SPAPR_BUFSIZE]; ++ QEMU_UNINITIALIZED uint8_t buf_in[TPM_SPAPR_BUFSIZE]; ++ QEMU_UNINITIALIZED uint8_t buf_out[TPM_SPAPR_BUFSIZE]; + ssize_t ret; + + trace_spapr_tpm_execute(data_in, data_in_size, data_out, data_out_size); +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-s390-ccw-device-Convert-to-three-phase-reset.patch 
b/SOURCES/kvm-hw-s390-ccw-device-Convert-to-three-phase-reset.patch new file mode 100644 index 0000000..d5b71e3 --- /dev/null +++ b/SOURCES/kvm-hw-s390-ccw-device-Convert-to-three-phase-reset.patch @@ -0,0 +1,63 @@ +From 5126609c0714c66a0ec41328017e7e8388c78bf4 Mon Sep 17 00:00:00 2001 +From: Peter Maydell +Date: Fri, 13 Sep 2024 15:31:43 +0100 +Subject: [PATCH 02/26] hw/s390/ccw-device: Convert to three-phase reset +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [2/26] 58f6fc2e65a101e069feac399859464d31e43045 (thuth/qemu-kvm-cs) + +Convert the TYPE_CCW_DEVICE to three-phase reset. This is a +device class which is subclassed, so it needs to be three-phase +before we can convert the subclass. + +Signed-off-by: Peter Maydell +Reviewed-by: Nina Schoetterl-Glausch +Reviewed-by: Philippe Mathieu-Daudé +Acked-by: Thomas Huth +Message-id: 20240830145812.1967042-2-peter.maydell@linaro.org +(cherry picked from commit 6a0e10b76b68e2f412746a1d5ed7d6efee804864) +Signed-off-by: Thomas Huth +--- + hw/s390x/ccw-device.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/hw/s390x/ccw-device.c b/hw/s390x/ccw-device.c +index d7bb364579..30f2fb486f 100644 +--- a/hw/s390x/ccw-device.c ++++ b/hw/s390x/ccw-device.c +@@ -88,9 +88,9 @@ static Property ccw_device_properties[] = { + DEFINE_PROP_END_OF_LIST(), + }; + +-static void ccw_device_reset(DeviceState *d) ++static void ccw_device_reset_hold(Object *obj, ResetType type) + { +- CcwDevice *ccw_dev = CCW_DEVICE(d); ++ CcwDevice *ccw_dev = CCW_DEVICE(obj); + + css_reset_sch(ccw_dev->sch); + } +@@ -99,11 +99,12 @@ static void ccw_device_class_init(ObjectClass *klass, void *data) + { + DeviceClass *dc = DEVICE_CLASS(klass); + CCWDeviceClass *k = CCW_DEVICE_CLASS(klass); ++ ResettableClass *rc = 
RESETTABLE_CLASS(klass); + + k->realize = ccw_device_realize; + k->refill_ids = ccw_device_refill_ids; + device_class_set_props(dc, ccw_device_properties); +- dc->reset = ccw_device_reset; ++ rc->phases.hold = ccw_device_reset_hold; + dc->bus_type = TYPE_VIRTUAL_CSS_BUS; + } + +-- +2.48.1 + diff --git a/SOURCES/kvm-hw-s390-virtio-ccw-Convert-to-three-phase-reset.patch b/SOURCES/kvm-hw-s390-virtio-ccw-Convert-to-three-phase-reset.patch new file mode 100644 index 0000000..15ea0b0 --- /dev/null +++ b/SOURCES/kvm-hw-s390-virtio-ccw-Convert-to-three-phase-reset.patch @@ -0,0 +1,92 @@ +From 7cbf9be09907407a64d739a2d0862af2ad08eaf5 Mon Sep 17 00:00:00 2001 +From: Peter Maydell +Date: Fri, 13 Sep 2024 15:31:43 +0100 +Subject: [PATCH 03/26] hw/s390/virtio-ccw: Convert to three-phase reset +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [3/26] e06ee194fa289a387433b905eb0999a048681a92 (thuth/qemu-kvm-cs) + +Convert the virtio-ccw code to three-phase reset. This allows us to +remove a call to device_class_set_parent_reset(), replacing it with +the three-phase equivalent resettable_class_set_parent_phases(). +Removing all the device_class_set_parent_reset() uses will allow us +to remove some of the glue code that interworks between three-phase +and legacy reset. + +This is a simple conversion, with no behavioural changes. 
+ +Signed-off-by: Peter Maydell +Reviewed-by: Philippe Mathieu-Daudé +Reviewed-by: Nina Schoetterl-Glausch +Acked-by: Thomas Huth +Reviewed-by: Richard Henderson +Message-id: 20240830145812.1967042-3-peter.maydell@linaro.org +(cherry picked from commit 6affa00d6ebebf24485667fe146470b0d6feb90d) +Signed-off-by: Thomas Huth +--- + hw/s390x/virtio-ccw.c | 13 ++++++++----- + hw/s390x/virtio-ccw.h | 2 +- + 2 files changed, 9 insertions(+), 6 deletions(-) + +diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c +index b4676909dd..96747318d2 100644 +--- a/hw/s390x/virtio-ccw.c ++++ b/hw/s390x/virtio-ccw.c +@@ -913,14 +913,15 @@ static void virtio_ccw_notify(DeviceState *d, uint16_t vector) + } + } + +-static void virtio_ccw_reset(DeviceState *d) ++static void virtio_ccw_reset_hold(Object *obj, ResetType type) + { +- VirtioCcwDevice *dev = VIRTIO_CCW_DEVICE(d); ++ VirtioCcwDevice *dev = VIRTIO_CCW_DEVICE(obj); + VirtIOCCWDeviceClass *vdc = VIRTIO_CCW_DEVICE_GET_CLASS(dev); + + virtio_ccw_reset_virtio(dev); +- if (vdc->parent_reset) { +- vdc->parent_reset(d); ++ ++ if (vdc->parent_phases.hold) { ++ vdc->parent_phases.hold(obj, type); + } + } + +@@ -1233,11 +1234,13 @@ static void virtio_ccw_device_class_init(ObjectClass *klass, void *data) + DeviceClass *dc = DEVICE_CLASS(klass); + CCWDeviceClass *k = CCW_DEVICE_CLASS(dc); + VirtIOCCWDeviceClass *vdc = VIRTIO_CCW_DEVICE_CLASS(klass); ++ ResettableClass *rc = RESETTABLE_CLASS(klass); + + k->unplug = virtio_ccw_busdev_unplug; + dc->realize = virtio_ccw_busdev_realize; + dc->unrealize = virtio_ccw_busdev_unrealize; +- device_class_set_parent_reset(dc, virtio_ccw_reset, &vdc->parent_reset); ++ resettable_class_set_parent_phases(rc, NULL, virtio_ccw_reset_hold, NULL, ++ &vdc->parent_phases); + } + + static const TypeInfo virtio_ccw_device_info = { +diff --git a/hw/s390x/virtio-ccw.h b/hw/s390x/virtio-ccw.h +index fac186c8f6..c7a830a194 100644 +--- a/hw/s390x/virtio-ccw.h ++++ b/hw/s390x/virtio-ccw.h +@@ -57,7 +57,7 @@ struct 
VirtIOCCWDeviceClass { + CCWDeviceClass parent_class; + void (*realize)(VirtioCcwDevice *dev, Error **errp); + void (*unrealize)(VirtioCcwDevice *dev); +- void (*parent_reset)(DeviceState *dev); ++ ResettablePhases parent_phases; + }; + + /* Performance improves when virtqueue kick processing is decoupled from the +-- +2.48.1 + diff --git a/SOURCES/kvm-hw-s390x-ccw-device-Fix-memory-leak-in-loadparm-sett.patch b/SOURCES/kvm-hw-s390x-ccw-device-Fix-memory-leak-in-loadparm-sett.patch new file mode 100644 index 0000000..5bf4b1a --- /dev/null +++ b/SOURCES/kvm-hw-s390x-ccw-device-Fix-memory-leak-in-loadparm-sett.patch @@ -0,0 +1,47 @@ +From b25bbfcad4a3df94555f6b5f238910314a5d17ea Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 25 Jun 2025 10:27:51 +0200 +Subject: [PATCH 02/57] hw/s390x/ccw-device: Fix memory leak in loadparm setter +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +RH-MergeRequest: 387: s390x: Fix memory leaks related to loadparm [rhel-9] +RH-Jira: RHEL-98554 +RH-Acked-by: Cédric Le Goater +RH-Acked-by: Kevin Wolf +RH-Commit: [2/2] d85cf8b3c93ede47b51c4aa1336dc54f58b8cc3f (thuth/qemu-kvm-cs) + +Commit bdf12f2a fixed the setter for the "loadparm" machine property, +which gets a string from a visitor, passes it to s390_ipl_fmt_loadparm() +and then forgot to free it. It left another instance of the same problem +unfixed in the "loadparm" device property. Fix it. 
+ +Signed-off-by: Kevin Wolf +Message-ID: <20250625082751.24896-1-kwolf@redhat.com> +Reviewed-by: Eric Farman +Reviewed-by: Halil Pasic +Tested-by: Thomas Huth +Signed-off-by: Thomas Huth +(cherry picked from commit 78e3781541209b3dcd6f4bb66adf3a3e504b88a4) +--- + hw/s390x/ccw-device.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/s390x/ccw-device.c b/hw/s390x/ccw-device.c +index 30f2fb486f..63e937401e 100644 +--- a/hw/s390x/ccw-device.c ++++ b/hw/s390x/ccw-device.c +@@ -57,7 +57,7 @@ static void ccw_device_set_loadparm(Object *obj, Visitor *v, + Error **errp) + { + CcwDevice *dev = CCW_DEVICE(obj); +- char *val; ++ g_autofree char *val = NULL; + int index; + + index = object_property_get_int(obj, "bootindex", NULL); +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-scsi-lsi53c895a-skip-automatic-zero-init-of-large.patch b/SOURCES/kvm-hw-scsi-lsi53c895a-skip-automatic-zero-init-of-large.patch new file mode 100644 index 0000000..77a1f92 --- /dev/null +++ b/SOURCES/kvm-hw-scsi-lsi53c895a-skip-automatic-zero-init-of-large.patch @@ -0,0 +1,49 @@ +From 45884bfad1f14585407a04eff9230a75bc5095fa Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:37:05 +0100 +Subject: [PATCH 52/57] hw/scsi/lsi53c895a: skip automatic zero-init of large + array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [26/30] 235884d43fcb3e49b320e36faa631a3656d07de6 (stefanha/centos-stream-qemu-kvm) + +The 'lsi_memcpy' method has a 4k byte array used for copying data +to/from the device. Skip the automatic zero-init of this array to +eliminate the performance overhead in the I/O hot path. + +The 'buf' array will be fully initialized when data is copied. + +Signed-off-by: Daniel P. 
Berrangé +Reviewed-by: Stefan Hajnoczi +Reviewed-by: Klaus Jensen +Reviewed-by: Harsh Prateek Bora +Message-id: 20250610123709.835102-28-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 55243edf42ee87bce9f36ca251f3ab9cda1563e4) +Signed-off-by: Stefan Hajnoczi +--- + hw/scsi/lsi53c895a.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c +index f1935e5328..f165705f8a 100644 +--- a/hw/scsi/lsi53c895a.c ++++ b/hw/scsi/lsi53c895a.c +@@ -1112,7 +1112,7 @@ bad: + static void lsi_memcpy(LSIState *s, uint32_t dest, uint32_t src, int count) + { + int n; +- uint8_t buf[LSI_BUF_SIZE]; ++ QEMU_UNINITIALIZED uint8_t buf[LSI_BUF_SIZE]; + + trace_lsi_memcpy(dest, src, count); + while (count) { +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-scsi-megasas-skip-automatic-zero-init-of-large-ar.patch b/SOURCES/kvm-hw-scsi-megasas-skip-automatic-zero-init-of-large-ar.patch new file mode 100644 index 0000000..140160c --- /dev/null +++ b/SOURCES/kvm-hw-scsi-megasas-skip-automatic-zero-init-of-large-ar.patch @@ -0,0 +1,73 @@ +From 9f76103e90ce8406bc5bbda72a7314b82e56652e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:37:06 +0100 +Subject: [PATCH 53/57] hw/scsi/megasas: skip automatic zero-init of large + arrays +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [27/30] b3a3f466fd03c64c665c52e26079b03def376f48 (stefanha/centos-stream-qemu-kvm) + +The 'megasas_dcmd_pd_get_list' and 'megasas_dcmd_get_properties' +methods have 4k structs used for copying data from the device. +Skip the automatic zero-init of this array to eliminate the +performance overhead in the I/O hot path. 
+ +The 'info' structs are manually initialized with memset(). The +compiler ought to be intelligent enough to turn the memset() +into a static initialization operation, and thus not duplicate +the automatic zero-init. Replacing memset() with '{}' makes it +unambiguous that the arrays are statically initialized. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Reviewed-by: Klaus Jensen +Reviewed-by: Harsh Prateek Bora +Message-id: 20250610123709.835102-29-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit ca0559e2350c618048f7caf80cb79c1259e7cfd2) +Signed-off-by: Stefan Hajnoczi +--- + hw/scsi/megasas.c | 7 ++----- + 1 file changed, 2 insertions(+), 5 deletions(-) + +diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c +index 2d0c607177..91b65accbc 100644 +--- a/hw/scsi/megasas.c ++++ b/hw/scsi/megasas.c +@@ -981,13 +981,11 @@ static int megasas_event_wait(MegasasState *s, MegasasCmd *cmd) + + static int megasas_dcmd_pd_get_list(MegasasState *s, MegasasCmd *cmd) + { +- struct mfi_pd_list info; +- size_t dcmd_size = sizeof(info); ++ struct mfi_pd_list info = {}; + BusChild *kid; + uint32_t offset, dcmd_limit, num_pd_disks = 0, max_pd_disks; + dma_addr_t residual; + +- memset(&info, 0, dcmd_size); + offset = 8; + dcmd_limit = offset + sizeof(struct mfi_pd_address); + if (cmd->iov_size < dcmd_limit) { +@@ -1429,11 +1427,10 @@ static int megasas_dcmd_cfg_read(MegasasState *s, MegasasCmd *cmd) + + static int megasas_dcmd_get_properties(MegasasState *s, MegasasCmd *cmd) + { +- struct mfi_ctrl_props info; ++ struct mfi_ctrl_props info = {}; + size_t dcmd_size = sizeof(info); + dma_addr_t residual; + +- memset(&info, 0x0, dcmd_size); + if (cmd->iov_size < dcmd_size) { + trace_megasas_dcmd_invalid_xfer_len(cmd->index, cmd->iov_size, + dcmd_size); +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-ufs-lu-skip-automatic-zero-init-of-large-array.patch b/SOURCES/kvm-hw-ufs-lu-skip-automatic-zero-init-of-large-array.patch new file mode 100644 
index 0000000..175b89b --- /dev/null +++ b/SOURCES/kvm-hw-ufs-lu-skip-automatic-zero-init-of-large-array.patch @@ -0,0 +1,50 @@ +From 3a0ae5a2f873fc7062262efc24a5403233988f5f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:37:07 +0100 +Subject: [PATCH 54/57] hw/ufs/lu: skip automatic zero-init of large array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [28/30] 62e7c83d15143387f6d6b366c8ec46b312d05577 (stefanha/centos-stream-qemu-kvm) + +The 'ufs_emulate_scsi_cmd' method has a 4k byte array used for +copying data from the device. Skip the automatic zero-init of +this array to eliminate the performance overhead in the I/O hot +path. + +The 'outbuf' array will be fully initialized when data is copied +from the guest. + +Signed-off-by: Daniel P. 
Berrangé +Reviewed-by: Stefan Hajnoczi +Reviewed-by: Klaus Jensen +Reviewed-by: Harsh Prateek Bora +Message-id: 20250610123709.835102-30-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 7708e298180550eac262c1fd742e6e80c711a5d8) +Signed-off-by: Stefan Hajnoczi +--- + hw/ufs/lu.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/ufs/lu.c b/hw/ufs/lu.c +index 81bfff9b4e..caad82dcc4 100644 +--- a/hw/ufs/lu.c ++++ b/hw/ufs/lu.c +@@ -194,7 +194,7 @@ static int ufs_emulate_wlun_inquiry(UfsRequest *req, uint8_t *outbuf, + static UfsReqResult ufs_emulate_scsi_cmd(UfsLu *lu, UfsRequest *req) + { + uint8_t lun = lu->lun; +- uint8_t outbuf[4096]; ++ QEMU_UNINITIALIZED uint8_t outbuf[4096]; + uint8_t sense_buf[UFS_SENSE_SIZE]; + uint8_t scsi_status; + int len = 0; +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-usb-hcd-ohci-skip-automatic-zero-init-of-large-ar.patch b/SOURCES/kvm-hw-usb-hcd-ohci-skip-automatic-zero-init-of-large-ar.patch new file mode 100644 index 0000000..b5daa5b --- /dev/null +++ b/SOURCES/kvm-hw-usb-hcd-ohci-skip-automatic-zero-init-of-large-ar.patch @@ -0,0 +1,50 @@ +From 6d4761010ea4dc218a1623513f410fc2d1cfc832 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 Jun 2025 13:37:04 +0100 +Subject: [PATCH 51/57] hw/usb/hcd-ohci: skip automatic zero-init of large + array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [25/30] 721dd97d384fb755c4a6a00cfc3d867e43f25b0b (stefanha/centos-stream-qemu-kvm) + +The 'ohci_service_iso_td' method has a 8k byte array used for copying +data between guest and host. Skip the automatic zero-init of this +array to eliminate the performance overhead in the I/O hot path. 
+ +The 'buf' array will be fully initialized when reading data from guest +memory. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Reviewed-by: Klaus Jensen +Reviewed-by: Harsh Prateek Bora +Message-id: 20250610123709.835102-27-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 14997d521d1cd0bb36c902ef1032f0d3f2a3c912) +Signed-off-by: Stefan Hajnoczi +--- + hw/usb/hcd-ohci.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/usb/hcd-ohci.c b/hw/usb/hcd-ohci.c +index 71b54914d3..72a9f9f474 100644 +--- a/hw/usb/hcd-ohci.c ++++ b/hw/usb/hcd-ohci.c +@@ -577,7 +577,7 @@ static int ohci_service_iso_td(OHCIState *ohci, struct ohci_ed *ed) + USBDevice *dev; + USBEndpoint *ep; + USBPacket *pkt; +- uint8_t buf[8192]; ++ QEMU_UNINITIALIZED uint8_t buf[8192]; + bool int_req; + struct ohci_iso_td iso_td; + uint32_t addr; +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-vfio-common-Add-a-trace-point-in-vfio_reset_handl.patch b/SOURCES/kvm-hw-vfio-common-Add-a-trace-point-in-vfio_reset_handl.patch new file mode 100644 index 0000000..0c06398 --- /dev/null +++ b/SOURCES/kvm-hw-vfio-common-Add-a-trace-point-in-vfio_reset_handl.patch @@ -0,0 +1,61 @@ +From 04f11749dd21b4df1ea2818785d650dd6eee2cbe Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 18 Feb 2025 19:25:34 +0100 +Subject: [PATCH 4/9] hw/vfio/common: Add a trace point in vfio_reset_handler +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eric Auger +RH-MergeRequest: 341: Fix vIOMMU reset order +RH-Jira: RHEL-7188 +RH-Acked-by: Peter Xu +RH-Acked-by: Donald Dutile +RH-Acked-by: Cédric Le Goater +RH-Commit: [4/5] 46878ffdc96997d1f6d09bde3fce350564e499fd (eauger1/centos-qemu-kvm) + +To ease the debug of reset sequence, let's add a trace point +in vfio_reset_handler() + +Signed-off-by: Eric Auger +Reviewed-by: Cédric Le Goater +Acked-by: Michael S. 
Tsirkin +Reviewed-by: Zhenzhong Duan +Message-Id: <20250218182737.76722-5-eric.auger@redhat.com> +Reviewed-by: Peter Xu +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. Tsirkin +(cherry picked from commit d410e709526d1cd4aa9085c6e254a622594a02a5) +Signed-off-by: Eric Auger +--- + hw/vfio/common.c | 1 + + hw/vfio/trace-events | 1 + + 2 files changed, 2 insertions(+) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 36d0cf6585..6982f88fc8 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -1395,6 +1395,7 @@ void vfio_reset_handler(void *opaque) + { + VFIODevice *vbasedev; + ++ trace_vfio_reset_handler(); + QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) { + if (vbasedev->dev->realized) { + vbasedev->ops->vfio_compute_needs_reset(vbasedev); +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index 3756ff660e..9523a9ccb0 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -120,6 +120,7 @@ vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype + vfio_legacy_dma_unmap_overflow_workaround(void) "" + vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 + vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 ++vfio_reset_handler(void) "" + + # platform.c + vfio_platform_realize(char *name, char *compat) "vfio device %s, compat = %s" +-- +2.48.1 + diff --git a/SOURCES/kvm-hw-vfio-pci-Re-order-pre-reset.patch b/SOURCES/kvm-hw-vfio-pci-Re-order-pre-reset.patch new file mode 100644 index 0000000..7318f84 --- /dev/null +++ b/SOURCES/kvm-hw-vfio-pci-Re-order-pre-reset.patch @@ -0,0 +1,74 @@ +From d6a961077e753b9ad5a670a1529634fe20322ce2 Mon Sep 17 00:00:00 2001 +From: Alex Williamson +Date: Tue, 25 Feb 2025 14:52:29 -0700 +Subject: [PATCH 6/7] hw/vfio/pci: Re-order pre-reset +MIME-Version: 
1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eric Auger +RH-MergeRequest: 348: PCI: Implement basic PCI PM capability backing +RH-Jira: RHEL-7301 +RH-Acked-by: Cédric Le Goater +RH-Acked-by: Alex Williamson +RH-Acked-by: Jon Maloy +RH-Commit: [6/6] c6c386ecbabda93f8a79da926ece95c2195fbc36 (eauger1/centos-qemu-kvm) + +We want the device in the D0 power state going into reset, but the +config write can enable the BARs in the address space, which are +then removed from the address space once we clear the memory enable +bit in the command register. Re-order to clear the command bit +first, so the power state change doesn't enable the BARs. + +Cc: Cédric Le Goater +Reviewed-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Signed-off-by: Alex Williamson +Reviewed-by: Michael S. Tsirkin +Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-6-alex.williamson@redhat.com +Signed-off-by: Cédric Le Goater +(cherry picked from commit 518a69a598916749338de3852d41d961d4503115) +Signed-off-by: Eric Auger +--- + hw/vfio/pci.c | 18 +++++++++--------- + 1 file changed, 9 insertions(+), 9 deletions(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 595b5c9b25..ffe72fd1d0 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2414,6 +2414,15 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev) + + vfio_disable_interrupts(vdev); + ++ /* ++ * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master. ++ * Also put INTx Disable in known state. ++ */ ++ cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2); ++ cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | ++ PCI_COMMAND_INTX_DISABLE); ++ vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); ++ + /* Make sure the device is in D0 */ + if (pdev->pm_cap) { + uint16_t pmcsr; +@@ -2433,15 +2442,6 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev) + } + } + } +- +- /* +- * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master. +- * Also put INTx Disable in known state. 
+- */ +- cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2); +- cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | +- PCI_COMMAND_INTX_DISABLE); +- vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); + } + + void vfio_pci_post_reset(VFIOPCIDevice *vdev) +-- +2.48.1 + diff --git a/SOURCES/kvm-hw-virtio-Also-include-md-stubs-in-case-CONFIG_VIRTI.patch b/SOURCES/kvm-hw-virtio-Also-include-md-stubs-in-case-CONFIG_VIRTI.patch new file mode 100644 index 0000000..c062e65 --- /dev/null +++ b/SOURCES/kvm-hw-virtio-Also-include-md-stubs-in-case-CONFIG_VIRTI.patch @@ -0,0 +1,59 @@ +From afa3a488f3ca52a5455987e4cd643882c4b15d8a Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Thu, 13 Mar 2025 07:35:22 +0100 +Subject: [PATCH 24/26] hw/virtio: Also include md stubs in case + CONFIG_VIRTIO_PCI is not set +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [24/26] ae6307b26d01d2a317f7e5d1d3b3a16b6d5f56de (thuth/qemu-kvm-cs) + +For the s390x target, it's possible to build the QEMU binary without +CONFIG_VIRTIO_PCI and only have the virtio-mem device via the ccw +transport. 
In that case, QEMU currently fails to link correctly: + + /usr/bin/ld: libqemu-s390x-softmmu.a.p/hw_s390x_s390-virtio-ccw.c.o: in function `s390_machine_device_pre_plug': + ../hw/s390x/s390-virtio-ccw.c:579:(.text+0x1e96): undefined reference to `virtio_md_pci_pre_plug' + /usr/bin/ld: libqemu-s390x-softmmu.a.p/hw_s390x_s390-virtio-ccw.c.o: in function `s390_machine_device_plug': + ../hw/s390x/s390-virtio-ccw.c:608:(.text+0x21a4): undefined reference to `virtio_md_pci_plug' + /usr/bin/ld: libqemu-s390x-softmmu.a.p/hw_s390x_s390-virtio-ccw.c.o: in function `s390_machine_device_unplug_request': + ../hw/s390x/s390-virtio-ccw.c:622:(.text+0x2334): undefined reference to `virtio_md_pci_unplug_request' + /usr/bin/ld: libqemu-s390x-softmmu.a.p/hw_s390x_s390-virtio-ccw.c.o: in function `s390_machine_device_unplug': + ../hw/s390x/s390-virtio-ccw.c:633:(.text+0x2436): undefined reference to `virtio_md_pci_unplug' + clang: error: linker command failed with exit code 1 (use -v to see invocation) + +We also need to include the stubs when CONFIG_VIRTIO_PCI is missing. 
+ +Fixes: aa910c20ec5 ("s390x: virtio-mem support") +Message-ID: <20250313063522.1348288-1-thuth@redhat.com> +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Thomas Huth +(cherry picked from commit c1a6bff276ca52ffde472532d92bb5bb122dab3f) +Signed-off-by: Thomas Huth +--- + hw/virtio/meson.build | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build +index c38bdd6fa4..e2f9c75625 100644 +--- a/hw/virtio/meson.build ++++ b/hw/virtio/meson.build +@@ -89,7 +89,8 @@ specific_virtio_ss.add_all(when: 'CONFIG_VIRTIO_PCI', if_true: virtio_pci_ss) + system_ss.add_all(when: 'CONFIG_VIRTIO', if_true: system_virtio_ss) + system_ss.add(when: 'CONFIG_VIRTIO', if_false: files('vhost-stub.c')) + system_ss.add(when: 'CONFIG_VIRTIO', if_false: files('virtio-stub.c')) +-system_ss.add(when: 'CONFIG_VIRTIO_MD', if_false: files('virtio-md-stubs.c')) ++system_ss.add(when: ['CONFIG_VIRTIO_MD', 'CONFIG_VIRTIO_PCI'], ++ if_false: files('virtio-md-stubs.c')) + + system_ss.add(files('virtio-hmp-cmds.c')) + +-- +2.48.1 + diff --git a/SOURCES/kvm-hw-virtio-virtio-avoid-cost-of-ftrivial-auto-var-ini.patch b/SOURCES/kvm-hw-virtio-virtio-avoid-cost-of-ftrivial-auto-var-ini.patch new file mode 100644 index 0000000..e006e88 --- /dev/null +++ b/SOURCES/kvm-hw-virtio-virtio-avoid-cost-of-ftrivial-auto-var-ini.patch @@ -0,0 +1,73 @@ +From 4727c044a09fb8c4fb6d667f26eb55bb6de7554d Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Tue, 10 Jun 2025 13:36:40 +0100 +Subject: [PATCH 28/57] hw/virtio/virtio: avoid cost of -ftrivial-auto-var-init + in hot path +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [2/30] 1c2cc6292deaaac068f4514439703c22c9ccb300 (stefanha/centos-stream-qemu-kvm) + +Since commit 
7ff9ff039380 ("meson: mitigate against use of uninitialize +stack for exploits") the -ftrivial-auto-var-init=zero compiler option is +used to zero local variables. While this reduces security risks +associated with uninitialized stack data, it introduced a measurable +bottleneck in the virtqueue_split_pop() and virtqueue_packed_pop() +functions. + +These virtqueue functions are in the hot path. They are called for each +element (request) that is popped from a VIRTIO device's virtqueue. Using +__attribute__((uninitialized)) on large stack variables in these +functions improves fio randread bs=4k iodepth=64 performance from 304k +to 332k IOPS (+9%). + +This issue was found using perf-top(1). virtqueue_split_pop() was one of +the top CPU consumers and the "annotate" feature showed that the memory +zeroing instructions at the beginning of the functions were hot. + +Fixes: 7ff9ff039380 ("meson: mitigate against use of uninitialize stack for exploits") +Cc: Daniel P. Berrangé +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Stefan Hajnoczi +Message-id: 20250610123709.835102-3-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit ba2868ce091cd4abe4be6de4b7e44b3be303b352) +Signed-off-by: Stefan Hajnoczi +--- + hw/virtio/virtio.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c +index 10f24a58dd..7f7b178a50 100644 +--- a/hw/virtio/virtio.c ++++ b/hw/virtio/virtio.c +@@ -1680,8 +1680,8 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz) + VirtIODevice *vdev = vq->vdev; + VirtQueueElement *elem = NULL; + unsigned out_num, in_num, elem_entries; +- hwaddr addr[VIRTQUEUE_MAX_SIZE]; +- struct iovec iov[VIRTQUEUE_MAX_SIZE]; ++ hwaddr QEMU_UNINITIALIZED addr[VIRTQUEUE_MAX_SIZE]; ++ struct iovec QEMU_UNINITIALIZED iov[VIRTQUEUE_MAX_SIZE]; + VRingDesc desc; + int rc; + +@@ -1826,8 +1826,8 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz) + VirtIODevice *vdev = vq->vdev; 
+ VirtQueueElement *elem = NULL; + unsigned out_num, in_num, elem_entries; +- hwaddr addr[VIRTQUEUE_MAX_SIZE]; +- struct iovec iov[VIRTQUEUE_MAX_SIZE]; ++ hwaddr QEMU_UNINITIALIZED addr[VIRTQUEUE_MAX_SIZE]; ++ struct iovec QEMU_UNINITIALIZED iov[VIRTQUEUE_MAX_SIZE]; + VRingPackedDesc desc; + uint16_t id; + int rc; +-- +2.39.3 + diff --git a/SOURCES/kvm-hw-virtio-virtio-iommu-Migrate-to-3-phase-reset.patch b/SOURCES/kvm-hw-virtio-virtio-iommu-Migrate-to-3-phase-reset.patch new file mode 100644 index 0000000..3922e9b --- /dev/null +++ b/SOURCES/kvm-hw-virtio-virtio-iommu-Migrate-to-3-phase-reset.patch @@ -0,0 +1,96 @@ +From 9ca5d7ac4f0ff5f10bf424df8104fe5abe01e431 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Tue, 18 Feb 2025 19:25:31 +0100 +Subject: [PATCH 1/9] hw/virtio/virtio-iommu: Migrate to 3-phase reset +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eric Auger +RH-MergeRequest: 341: Fix vIOMMU reset order +RH-Jira: RHEL-7188 +RH-Acked-by: Peter Xu +RH-Acked-by: Donald Dutile +RH-Acked-by: Cédric Le Goater +RH-Commit: [1/5] 32bf47497d5d4817a448d07ffa7a844aee82ae3c (eauger1/centos-qemu-kvm) + +Currently the iommu may be reset before the devices +it protects. For example this happens with virtio-net. + +Let's use 3-phase reset mechanism and reset the IOMMU on +exit phase after all DMA capable devices have been +reset during the 'enter' or 'hold' phase. + +Signed-off-by: Eric Auger +Acked-by: Michael S. Tsirkin +Reviewed-by: Zhenzhong Duan +Acked-by: Jason Wang + +Message-Id: <20250218182737.76722-2-eric.auger@redhat.com> +Reviewed-by: Peter Xu +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. 
Tsirkin +(cherry picked from commit d261b84d354a41a38336af813f92f636d3fb3f78) +Signed-off-by: Eric Auger +--- + hw/virtio/trace-events | 2 +- + hw/virtio/virtio-iommu.c | 14 ++++++++++---- + 2 files changed, 11 insertions(+), 5 deletions(-) + +diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events +index 04e36ae047..76f0d458b2 100644 +--- a/hw/virtio/trace-events ++++ b/hw/virtio/trace-events +@@ -108,7 +108,7 @@ virtio_pci_notify_write(uint64_t addr, uint64_t val, unsigned int size) "0x%" PR + virtio_pci_notify_write_pio(uint64_t addr, uint64_t val, unsigned int size) "0x%" PRIx64" = 0x%" PRIx64 " (%d)" + + # hw/virtio/virtio-iommu.c +-virtio_iommu_device_reset(void) "reset!" ++virtio_iommu_device_reset_exit(void) "reset!" + virtio_iommu_system_reset(void) "system reset!" + virtio_iommu_get_features(uint64_t features) "device supports features=0x%"PRIx64 + virtio_iommu_device_status(uint8_t status) "driver status = %d" +diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c +index 59ef4fb217..496200ebc5 100644 +--- a/hw/virtio/virtio-iommu.c ++++ b/hw/virtio/virtio-iommu.c +@@ -1504,11 +1504,11 @@ static void virtio_iommu_device_unrealize(DeviceState *dev) + virtio_cleanup(vdev); + } + +-static void virtio_iommu_device_reset(VirtIODevice *vdev) ++static void virtio_iommu_device_reset_exit(Object *obj, ResetType type) + { +- VirtIOIOMMU *s = VIRTIO_IOMMU(vdev); ++ VirtIOIOMMU *s = VIRTIO_IOMMU(obj); + +- trace_virtio_iommu_device_reset(); ++ trace_virtio_iommu_device_reset_exit(); + + if (s->domains) { + g_tree_destroy(s->domains); +@@ -1669,6 +1669,7 @@ static void virtio_iommu_class_init(ObjectClass *klass, void *data) + { + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); ++ ResettableClass *rc = RESETTABLE_CLASS(klass); + + device_class_set_props(dc, virtio_iommu_properties); + dc->vmsd = &vmstate_virtio_iommu; +@@ -1676,7 +1677,12 @@ static void virtio_iommu_class_init(ObjectClass *klass, void *data) 
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories); + vdc->realize = virtio_iommu_device_realize; + vdc->unrealize = virtio_iommu_device_unrealize; +- vdc->reset = virtio_iommu_device_reset; ++ ++ /* ++ * Use 'exit' reset phase to make sure all DMA requests ++ * have been quiesced during 'enter' or 'hold' phase ++ */ ++ rc->phases.exit = virtio_iommu_device_reset_exit; + vdc->get_config = virtio_iommu_get_config; + vdc->set_config = virtio_iommu_set_config; + vdc->get_features = virtio_iommu_get_features; +-- +2.48.1 + diff --git a/SOURCES/kvm-include-qemu-compiler-add-QEMU_UNINITIALIZED-attribu.patch b/SOURCES/kvm-include-qemu-compiler-add-QEMU_UNINITIALIZED-attribu.patch new file mode 100644 index 0000000..a196764 --- /dev/null +++ b/SOURCES/kvm-include-qemu-compiler-add-QEMU_UNINITIALIZED-attribu.patch @@ -0,0 +1,80 @@ +From cf92fd8487195ac45bfbdad15168eaec70f3aaa9 Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Tue, 10 Jun 2025 13:36:39 +0100 +Subject: [PATCH 27/57] include/qemu/compiler: add QEMU_UNINITIALIZED attribute + macro +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [1/30] 43c2412d318b6d8e0dcb0b37340640a9d90c3188 (stefanha/centos-stream-qemu-kvm) + +The QEMU_UNINITIALIZED macro is to be used to skip the default compiler +variable initialization done by -ftrivial-auto-var-init=zero. + +Use this in cases where there a method in the device I/O path (or other +important hot paths), that has large variables on the stack. A rule of +thumb is that "large" means a method with 4kb data in the local stack +frame. Any variables which are KB in size, should be annotated with this +attribute, to pre-emptively eliminate any potential overhead from the +compiler zero'ing memory. 
+ +Given that this turns off a security hardening feature, when using this +to flag variables, it is important that the code is double-checked to +ensure there is no possible use of uninitialized data in the method. + +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Daniel P. Berrangé +Message-id: 20250610123709.835102-2-berrange@redhat.com +[DB: split off patch & rewrite guidance on when to use the annotation] +Signed-off-by: Daniel P. Berrangé +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit c653b67d1863b7ebfa67f7c9f4aec209d7b5ced5) +Signed-off-by: Stefan Hajnoczi + +Conflicts: + include/qemu/compiler.h + Context conflict due to clang Thread Safety Analysis macros. +--- + include/qemu/compiler.h | 20 ++++++++++++++++++++ + 1 file changed, 20 insertions(+) + +diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h +index c06954ccb4..cc193d5b82 100644 +--- a/include/qemu/compiler.h ++++ b/include/qemu/compiler.h +@@ -212,6 +212,26 @@ + # define QEMU_USED + #endif + ++/* ++ * Disable -ftrivial-auto-var-init on a local variable. ++ * ++ * Use this in cases where there a method in the device I/O path (or other ++ * important hot paths), that has large variables on the stack. A rule of ++ * thumb is that "large" means a method with 4kb data in the local stack ++ * frame. Any variables which are KB in size, should be annotated with this ++ * attribute, to pre-emptively eliminate any potential overhead from the ++ * compiler's implicit zero'ing of memory. ++ * ++ * Given that this turns off a security hardening feature, when using this ++ * to flag variables, it is important that the code is double-checked to ++ * ensure there is no possible use of uninitialized data in the method. ++ */ ++#if __has_attribute(uninitialized) ++# define QEMU_UNINITIALIZED __attribute__((uninitialized)) ++#else ++# define QEMU_UNINITIALIZED ++#endif ++ + /* + * Ugly CPP trick that is like "defined FOO", but also works in C + * code. 
Useful to replace #ifdef with "if" statements; assumes +-- +2.39.3 + diff --git a/SOURCES/kvm-io-Fix-partial-struct-copy-in-qio_dns_resolver_looku.patch b/SOURCES/kvm-io-Fix-partial-struct-copy-in-qio_dns_resolver_looku.patch new file mode 100644 index 0000000..23d927a --- /dev/null +++ b/SOURCES/kvm-io-Fix-partial-struct-copy-in-qio_dns_resolver_looku.patch @@ -0,0 +1,73 @@ +From 4545870823aea92b18a7e747b686b666d08006a4 Mon Sep 17 00:00:00 2001 +From: Juraj Marcin +Date: Wed, 21 May 2025 15:52:30 +0200 +Subject: [PATCH 08/57] io: Fix partial struct copy in + qio_dns_resolver_lookup_sync_inet() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Juraj Marcin +RH-MergeRequest: 369: util/qemu-sockets: Introduce inet socket options controlling TCP keep-alive +RH-Jira: RHEL-67104 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [1/7] 92c8b3e63c22a3ca6e5adc76cac1a9f812034912 (JurajMarcin/centos-src-qemu-kvm) + +Commit aec21d3175 (qapi: Add InetSocketAddress member keep-alive) +introduces the keep-alive flag, but this flag is not copied together +with other options in qio_dns_resolver_lookup_sync_inet(). + +This patch fixes this issue and also prevents future ones by copying the +entire structure first and only then overriding a few attributes that +need to be different. + +Fixes: aec21d31756c (qapi: Add InetSocketAddress member keep-alive) +Signed-off-by: Juraj Marcin +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Daniel P. 
Berrangé + +(cherry picked from commit 0dc051aa85e1bd68d5c5110fa8af69204e6dbd3d) + +JIRA: https://issues.redhat.com/browse/RHEL-67104 + +Signed-off-by: Juraj Marcin +--- + io/dns-resolver.c | 21 +++++---------------- + 1 file changed, 5 insertions(+), 16 deletions(-) + +diff --git a/io/dns-resolver.c b/io/dns-resolver.c +index 53b0e8407a..3712438f82 100644 +--- a/io/dns-resolver.c ++++ b/io/dns-resolver.c +@@ -111,22 +111,11 @@ static int qio_dns_resolver_lookup_sync_inet(QIODNSResolver *resolver, + uaddr, INET6_ADDRSTRLEN, uport, 32, + NI_NUMERICHOST | NI_NUMERICSERV); + +- newaddr->u.inet = (InetSocketAddress){ +- .host = g_strdup(uaddr), +- .port = g_strdup(uport), +- .has_numeric = true, +- .numeric = true, +- .has_to = iaddr->has_to, +- .to = iaddr->to, +- .has_ipv4 = iaddr->has_ipv4, +- .ipv4 = iaddr->ipv4, +- .has_ipv6 = iaddr->has_ipv6, +- .ipv6 = iaddr->ipv6, +-#ifdef HAVE_IPPROTO_MPTCP +- .has_mptcp = iaddr->has_mptcp, +- .mptcp = iaddr->mptcp, +-#endif +- }; ++ newaddr->u.inet = *iaddr; ++ newaddr->u.inet.host = g_strdup(uaddr), ++ newaddr->u.inet.port = g_strdup(uport), ++ newaddr->u.inet.has_numeric = true, ++ newaddr->u.inet.numeric = true, + + (*addrs)[i] = newaddr; + } +-- +2.39.3 + diff --git a/SOURCES/kvm-iotests-Improve-iotest-194-to-mirror-data.patch b/SOURCES/kvm-iotests-Improve-iotest-194-to-mirror-data.patch new file mode 100644 index 0000000..f696184 --- /dev/null +++ b/SOURCES/kvm-iotests-Improve-iotest-194-to-mirror-data.patch @@ -0,0 +1,42 @@ +From 8832268a98104ba3065a57dedcd3db43231512ba Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Fri, 9 May 2025 15:40:22 -0500 +Subject: [PATCH 07/16] iotests: Improve iotest 194 to mirror data + +RH-Author: Eric Blake +RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors +RH-Jira: RHEL-82906 RHEL-83015 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Jon Maloy +RH-Commit: [5/14] bfbe8eab1035480cef9d69d1974ba66b755b1b60 (ebblake/centos-qemu-kvm) + +Mirroring a completely 
sparse image to a sparse destination should be +practically instantaneous. It isn't yet, but the test will be more +realistic if it has some non-zero to mirror as well as the holes. + +Signed-off-by: Eric Blake +Reviewed-by: Stefan Hajnoczi +Message-ID: <20250509204341.3553601-20-eblake@redhat.com> +(cherry picked from commit eb89627899bb84148d272394e885725eff456ae9) +Jira: https://issues.redhat.com/browse/RHEL-82906 +Jira: https://issues.redhat.com/browse/RHEL-83015 +Signed-off-by: Eric Blake +--- + tests/qemu-iotests/194 | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/tests/qemu-iotests/194 b/tests/qemu-iotests/194 +index c0ce82dd25..d0b9c084f5 100755 +--- a/tests/qemu-iotests/194 ++++ b/tests/qemu-iotests/194 +@@ -34,6 +34,7 @@ with iotests.FilePath('source.img') as source_img_path, \ + + img_size = '1G' + iotests.qemu_img_create('-f', iotests.imgfmt, source_img_path, img_size) ++ iotests.qemu_io('-f', iotests.imgfmt, '-c', 'write 512M 1M', source_img_path) + iotests.qemu_img_create('-f', iotests.imgfmt, dest_img_path, img_size) + + iotests.log('Launching VMs...') +-- +2.48.1 + diff --git a/SOURCES/kvm-iotests-common.rc-add-disk_usage-function.patch b/SOURCES/kvm-iotests-common.rc-add-disk_usage-function.patch new file mode 100644 index 0000000..14ffba7 --- /dev/null +++ b/SOURCES/kvm-iotests-common.rc-add-disk_usage-function.patch @@ -0,0 +1,68 @@ +From 644f39de9e2466a9570833b1070acf47a53863ea Mon Sep 17 00:00:00 2001 +From: Andrey Drobyshev +Date: Fri, 9 May 2025 15:40:29 -0500 +Subject: [PATCH 14/16] iotests/common.rc: add disk_usage function + +RH-Author: Eric Blake +RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors +RH-Jira: RHEL-82906 RHEL-83015 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Jon Maloy +RH-Commit: [12/14] 0e5d4217f97fe6e952de23eedbc2b8d9c7600665 (ebblake/centos-qemu-kvm) + +Move the definition from iotests/250 to common.rc. This is used to +detect real disk usage of sparse files. 
In particular, we want to use +it for checking subclusters-based discards. + +Signed-off-by: Andrey Drobyshev +Reviewed-by: Alexander Ivanov +Reviewed-by: Alberto Garcia +Message-ID: <20240913163942.423050-6-andrey.drobyshev@virtuozzo.com> +Signed-off-by: Eric Blake +Reviewed-by: Stefan Hajnoczi +Message-ID: <20250509204341.3553601-27-eblake@redhat.com> +(cherry picked from commit be9bac072ede6e6aa27079f59efcf17b56bd7b26) +Jira: https://issues.redhat.com/browse/RHEL-82906 +Jira: https://issues.redhat.com/browse/RHEL-83015 +Signed-off-by: Eric Blake +--- + tests/qemu-iotests/250 | 5 ----- + tests/qemu-iotests/common.rc | 6 ++++++ + 2 files changed, 6 insertions(+), 5 deletions(-) + +diff --git a/tests/qemu-iotests/250 b/tests/qemu-iotests/250 +index af48f83aba..c0a0dbc0ff 100755 +--- a/tests/qemu-iotests/250 ++++ b/tests/qemu-iotests/250 +@@ -52,11 +52,6 @@ _unsupported_imgopts data_file + # bdrv_co_truncate(bs->file) call in qcow2_co_truncate(), which might succeed + # anyway. + +-disk_usage() +-{ +- du --block-size=1 $1 | awk '{print $1}' +-} +- + size=2100M + + _make_test_img -o "cluster_size=1M,preallocation=metadata" $size +diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc +index 95c12577dd..237f746af8 100644 +--- a/tests/qemu-iotests/common.rc ++++ b/tests/qemu-iotests/common.rc +@@ -140,6 +140,12 @@ _optstr_add() + fi + } + ++# report real disk usage for sparse files ++disk_usage() ++{ ++ du --block-size=1 "$1" | awk '{print $1}' ++} ++ + # Set the variables to the empty string to turn Valgrind off + # for specific processes, e.g. 
+ # $ VALGRIND_QEMU_IO= ./check -qcow2 -valgrind 015 +-- +2.48.1 + diff --git a/SOURCES/kvm-meson-configure-add-valgrind-option-en-dis-able-valg.patch b/SOURCES/kvm-meson-configure-add-valgrind-option-en-dis-able-valg.patch new file mode 100644 index 0000000..93c87f9 --- /dev/null +++ b/SOURCES/kvm-meson-configure-add-valgrind-option-en-dis-able-valg.patch @@ -0,0 +1,110 @@ +From 0277328b5a2d1df5d9843423ab5f5fa9481bad79 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Fri, 25 Apr 2025 13:17:12 +0100 +Subject: [PATCH 1/5] meson/configure: add 'valgrind' option & --{en, + dis}able-valgrind flag +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Daniel P. Berrangé +RH-MergeRequest: 359: distro: add an explicit valgrind-devel build dep +RH-Jira: RHEL-88153 +RH-Acked-by: Eric Blake +RH-Acked-by: Jon Maloy +RH-Commit: [1/2] ba9bc44ef9cef6fa76e2092500608575f223f1f7 (berrange/centos-src-qemu) + +Currently valgrind debugging support for coroutine stacks is enabled +unconditionally when valgrind/valgrind.h is found. There is no way +to disable valgrind support if valgrind.h is present in the build env. + +This is bad for distros, as a dependency far down the chain may cause +valgrind.h to become installed, inadvertently enabling QEMU's valgrind +debugging support. It also means if a distro wants valgrind support +there is no way to mandate this. + +The solution is to add a 'valgrind' build feature to meson and thus +configure script. + +Signed-off-by: Daniel P. 
Berrangé +Reviewed-by: Thomas Huth +Message-ID: <20250425121713.1913424-1-berrange@redhat.com> +Signed-off-by: Thomas Huth +(cherry picked from commit 6b1c744ec0d66d6d568f9a156282153fc11a21cf) + +Conflicts: + meson.build - context from upstream is not present in older tree +--- + meson.build | 13 ++++++++++++- + meson_options.txt | 2 ++ + scripts/meson-buildoptions.sh | 3 +++ + 3 files changed, 17 insertions(+), 1 deletion(-) + +diff --git a/meson.build b/meson.build +index 1dd97c6f49..5bb2b757c3 100644 +--- a/meson.build ++++ b/meson.build +@@ -2463,7 +2463,17 @@ config_host_data.set('CONFIG_FSTRIM', qga_fstrim) + # has_header + config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h')) + config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h')) +-config_host_data.set('CONFIG_VALGRIND_H', cc.has_header('valgrind/valgrind.h')) ++valgrind = false ++if get_option('valgrind').allowed() ++ if cc.has_header('valgrind/valgrind.h') ++ valgrind = true ++ else ++ if get_option('valgrind').enabled() ++ error('valgrind requested but valgrind.h not found') ++ endif ++ endif ++endif ++config_host_data.set('CONFIG_VALGRIND_H', valgrind) + config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h')) + config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h')) + config_host_data.set('HAVE_PTY_H', cc.has_header('pty.h')) +@@ -4549,6 +4559,7 @@ summary_info += {'libdw': libdw} + if host_os == 'freebsd' + summary_info += {'libinotify-kqueue': inotify} + endif ++summary_info += {'valgrind': valgrind} + summary(summary_info, bool_yn: true, section: 'Dependencies') + + if host_arch == 'unknown' +diff --git a/meson_options.txt b/meson_options.txt +index aa2ba0baef..da06441fdf 100644 +--- a/meson_options.txt ++++ b/meson_options.txt +@@ -113,6 +113,8 @@ option('dbus_display', type: 'feature', value: 'auto', + description: '-display dbus support') + option('tpm', type : 'feature', value : 'auto', + description: 'TPM support') ++option('valgrind', 
type : 'feature', value: 'auto', ++ description: 'valgrind debug support for coroutine stacks') + + # Do not enable it by default even for Mingw32, because it doesn't + # work on Wine. +diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh +index 5f0cbfc725..251470ea6d 100644 +--- a/scripts/meson-buildoptions.sh ++++ b/scripts/meson-buildoptions.sh +@@ -191,6 +191,7 @@ meson_options_help() { + printf "%s\n" ' u2f U2F emulation support' + printf "%s\n" ' uadk UADK Library support' + printf "%s\n" ' usb-redir libusbredir support' ++ printf "%s\n" ' valgrind valgrind debug support for coroutine stacks' + printf "%s\n" ' vde vde network backend support' + printf "%s\n" ' vdi vdi image format support' + printf "%s\n" ' vduse-blk-export' +@@ -509,6 +510,8 @@ _meson_option_parse() { + --disable-uadk) printf "%s" -Duadk=disabled ;; + --enable-usb-redir) printf "%s" -Dusb_redir=enabled ;; + --disable-usb-redir) printf "%s" -Dusb_redir=disabled ;; ++ --enable-valgrind) printf "%s" -Dvalgrind=enabled ;; ++ --disable-valgrind) printf "%s" -Dvalgrind=disabled ;; + --enable-vde) printf "%s" -Dvde=enabled ;; + --disable-vde) printf "%s" -Dvde=disabled ;; + --enable-vdi) printf "%s" -Dvdi=enabled ;; +-- +2.48.1 + diff --git a/SOURCES/kvm-migration-Fix-UAF-for-incoming-migration-on-Migratio.patch b/SOURCES/kvm-migration-Fix-UAF-for-incoming-migration-on-Migratio.patch new file mode 100644 index 0000000..d9d12bf --- /dev/null +++ b/SOURCES/kvm-migration-Fix-UAF-for-incoming-migration-on-Migratio.patch @@ -0,0 +1,180 @@ +From 5d7d7a2ec6301f4d0b0dbea4fbdcab4e41a9cf07 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Thu, 20 Feb 2025 08:24:59 -0500 +Subject: [PATCH 7/9] migration: Fix UAF for incoming migration on + MigrationState + +RH-Author: Peter Xu +RH-MergeRequest: 344: migration: Fix UAF for incoming migration on MigrationState +RH-Jira: RHEL-69775 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Jon Maloy +RH-Commit: [1/1] 106e2b4c1c461202c912b5e3ea7e586c4ab05d8c 
(peterx/qemu-kvm) + +On the incoming migration side, QEMU uses a coroutine to load all the VM +states. Inside, it may reference MigrationState on global states like +migration capabilities, parameters, error state, shared mutexes and more. + +However there's nothing yet to make sure MigrationState won't get +destroyed (e.g. after migration_shutdown()). Meanwhile there's also no API +available to remove the incoming coroutine in migration_shutdown(), +to prevent it from accessing the freed elements. + +There's a bug report showing this can happen and crash dest QEMU when +migration is cancelled on source. + +When it happens, the dest main thread is trying to cleanup everything: + + #0 qemu_aio_coroutine_enter + #1 aio_dispatch_handler + #2 aio_poll + #3 monitor_cleanup + #4 qemu_cleanup + #5 qemu_default_main + +Then it found the migration incoming coroutine, schedule it (even after +migration_shutdown()), causing crash: + + #0 __pthread_kill_implementation + #1 __pthread_kill_internal + #2 __GI_raise + #3 __GI_abort + #4 __assert_fail_base + #5 __assert_fail + #6 qemu_mutex_lock_impl + #7 qemu_lockable_mutex_lock + #8 qemu_lockable_lock + #9 qemu_lockable_auto_lock + #10 migrate_set_error + #11 process_incoming_migration_co + #12 coroutine_trampoline + +To fix it, take a refcount after an incoming setup is properly done when +qmp_migrate_incoming() succeeded the 1st time. As it's during a QMP +handler which needs BQL, it means the main loop is still alive (without +going into cleanups, which also needs BQL). + +The refcount is now released only once the incoming migration coroutine +has finished or failed. Hence the refcount is valid for both (1) setup phase +of incoming ports, mostly IO watches (e.g. qio_channel_add_watch_full()), +and (2) the incoming coroutine itself (process_incoming_migration_co()). + +Note that we can't unref in migration_incoming_state_destroy(), because +both qmp_xen_load_devices_state() and load_snapshot() will use it without +an incoming migration. 
Those hold BQL so they're not prone to this issue. + +PS: I suspect nobody uses Xen's command at all, as it didn't register yank, +hence AFAIU the command should crash on master when trying to unregister +yank in migration_incoming_state_destroy().. but that's another story. + +Also note that in some incoming failure cases we may not always unref the +MigrationState refcount, which is a trade-off to keep things simple. We +could make it accurate, but it can be an overkill. Some examples: + + - Unlike most of the other protocols, socket_start_incoming_migration() + may create net listener after incoming port setup successfully. + It means we can't unref in migration_channel_process_incoming() as a + generic path because socket protocol might keep using MigrationState. + + - For either socket or file, multiple IO watches might be created, it + means logically each IO watch needs to take one refcount for + MigrationState so as to be 100% accurate on ownership of refcount taken. + +In general, we at least need per-protocol handling to make it accurate, +which can be an overkill if we know incoming failed after all. Add a short +comment to explain that when taking the refcount in qmp_migrate_incoming(). 
+ +Bugzilla: https://issues.redhat.com/browse/RHEL-69775 +Tested-by: Yan Fu +Signed-off-by: Peter Xu +Reviewed-by: Fabiano Rosas +Message-ID: <20250220132459.512610-1-peterx@redhat.com> +Signed-off-by: Fabiano Rosas +(cherry picked from commit d657a14de5d597bbfe7b54e4c4f0646f440e98ad) +Signed-off-by: Peter Xu +--- + migration/migration.c | 40 ++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 38 insertions(+), 2 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index 999d4cac54..aabdc45c16 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -115,6 +115,27 @@ static void migration_downtime_start(MigrationState *s) + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + } + ++/* ++ * This is unfortunate: incoming migration actually needs the outgoing ++ * migration state (MigrationState) to be there too, e.g. to query ++ * capabilities, parameters, using locks, setup errors, etc. ++ * ++ * NOTE: when calling this, making sure current_migration exists and not ++ * been freed yet! Otherwise trying to access the refcount is already ++ * an use-after-free itself.. ++ * ++ * TODO: Move shared part of incoming / outgoing out into separate object. ++ * Then this is not needed. ++ */ ++static void migrate_incoming_ref_outgoing_state(void) ++{ ++ object_ref(migrate_get_current()); ++} ++static void migrate_incoming_unref_outgoing_state(void) ++{ ++ object_unref(migrate_get_current()); ++} ++ + static void migration_downtime_end(MigrationState *s) + { + int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); +@@ -821,7 +842,7 @@ process_incoming_migration_co(void *opaque) + * postcopy thread. 
+ */ + trace_process_incoming_migration_co_postcopy_end_main(); +- return; ++ goto out; + } + /* Else if something went wrong then just fall out of the normal exit */ + } +@@ -837,7 +858,8 @@ process_incoming_migration_co(void *opaque) + } + + migration_bh_schedule(process_incoming_migration_bh, mis); +- return; ++ goto out; ++ + fail: + migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, + MIGRATION_STATUS_FAILED); +@@ -854,6 +876,9 @@ fail: + + exit(EXIT_FAILURE); + } ++out: ++ /* Pairs with the refcount taken in qmp_migrate_incoming() */ ++ migrate_incoming_unref_outgoing_state(); + } + + /** +@@ -1875,6 +1900,17 @@ void qmp_migrate_incoming(const char *uri, bool has_channels, + return; + } + ++ /* ++ * Making sure MigrationState is available until incoming migration ++ * completes. ++ * ++ * NOTE: QEMU _might_ leak this refcount in some failure paths, but ++ * that's OK. This is the minimum change we need to at least making ++ * sure success case is clean on the refcount. We can try harder to ++ * make it accurate for any kind of failures, but it might be an ++ * overkill and doesn't bring us much benefit. 
++ */ ++ migrate_incoming_ref_outgoing_state(); + once = false; + } + +-- +2.48.1 + diff --git a/SOURCES/kvm-migration-postcopy-Spatial-locality-page-hint-for-pr.patch b/SOURCES/kvm-migration-postcopy-Spatial-locality-page-hint-for-pr.patch new file mode 100644 index 0000000..bacd275 --- /dev/null +++ b/SOURCES/kvm-migration-postcopy-Spatial-locality-page-hint-for-pr.patch @@ -0,0 +1,237 @@ +From 1fa31324da8ebba64a44c1e9b64f7e59c29f3d75 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Thu, 24 Apr 2025 18:07:05 -0400 +Subject: [PATCH 1/2] migration/postcopy: Spatial locality page hint for + preempt mode +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Peter Xu +RH-MergeRequest: 358: migration/postcopy: Spatial locality page hint for preempt mode +RH-Jira: RHEL-85159 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Daniel P. Berrangé +RH-Commit: [1/1] f5bce349c80f98428c73a3898f87d4d10ec2f4bd (peterx/qemu-kvm) + +The preempt mode postcopy has been introduced for a while. From latency +POV, it should always win the vanilla postcopy. + +However there's one thing missing when preempt mode is enabled right now, +which is the spatial locality hint when there're page requests from the +destination side. + +In vanilla postcopy, as long as a page request was unqueued, it will update +the PSS of the precopy background stream, so that after a page request the +background thread will move the pages after whatever was requested. It's +pretty much a natural behavior when there's only one channel anyway, and +one scanner to send the pages. + +Preempt mode didn't follow that, because preempt mode has its own channel +and its own PSS (which doesn't linearly scan the guest memory, but +dedicated to resolve page requested from destination). So the page request +process and the background migration process are completely separate. + +This patch adds the hint explicitly for preempt mode. 
With that, whenever +the preempt mode receives a page request on the source, it will service the +remote page fault in the return path, then it'll provide a hint to the +background thread so that we'll start sending the pages right after the +requested ones in the background, assuming the follow up pages have a +higher chance to be accessed later. + +NOTE: since the background migration thread and return path thread run +completely concurrently, it doesn't always mean the hint will be applied +every single time. For example, it's possible that the return path thread +receives multiple page requests in a row without the background thread +getting the chance to consume one. In such a case, the preempt thread only +provides the hint if the previous hint has been consumed. After all, +there's no point queuing hints when we only have one linear scanner. + +This could measurably improve the simple sequential memory access pattern +during postcopy (when preempt is on). For random accesses, I can measure a +slight increase of remote page fault latency from ~500us -> ~600us, that +could be a trade-off to have such hint mechanism, and after all that's +still greatly improved compared to vanilla postcopy on random (~10ms). + +The patch is verified by our QE team in a video streaming test case, to +reduce the pause of the video from ~1min to a few seconds when switching +over to postcopy with preempt mode. 
+ +Reported-by: Xiaohui Li +Tested-by: Xiaohui Li +Reviewed-by: Juraj Marcin +Link: https://lore.kernel.org/r/20250424220705.195544-1-peterx@redhat.com +Signed-off-by: Peter Xu +(cherry picked from commit 20d82622812d888478d04a2d0d8575d70eb5d749) +Signed-off-by: Peter Xu +--- + migration/ram.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 96 insertions(+), 1 deletion(-) + +diff --git a/migration/ram.c b/migration/ram.c +index edec1a2d07..0803f85b8a 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -112,6 +112,36 @@ + + XBZRLECacheStats xbzrle_counters; + ++/* ++ * This structure locates a specific location of a guest page. In QEMU, ++ * it's described in a tuple of (ramblock, offset). ++ */ ++struct PageLocation { ++ RAMBlock *block; ++ unsigned long offset; ++}; ++typedef struct PageLocation PageLocation; ++ ++/** ++ * PageLocationHint: describes a hint to a page location ++ * ++ * @valid set if the hint is vaild and to be consumed ++ * @location: the hint content ++ * ++ * In postcopy preempt mode, the urgent channel may provide hints to the ++ * background channel, so that QEMU source can try to migrate whatever is ++ * right after the requested urgent pages. ++ * ++ * This is based on the assumption that the VM (already running on the ++ * destination side) tends to access the memory with spatial locality. ++ * This is also the default behavior of vanilla postcopy (preempt off). ++ */ ++struct PageLocationHint { ++ bool valid; ++ PageLocation location; ++}; ++typedef struct PageLocationHint PageLocationHint; ++ + /* used by the search for pages to send */ + struct PageSearchStatus { + /* The migration channel used for a specific host page */ +@@ -414,6 +444,13 @@ struct RAMState { + * RAM migration. + */ + unsigned int postcopy_bmap_sync_requested; ++ /* ++ * Page hint during postcopy when preempt mode is on. Return path ++ * thread sets it, while background migration thread consumes it. ++ * ++ * Protected by @bitmap_mutex. 
++ */ ++ PageLocationHint page_hint; + }; + typedef struct RAMState RAMState; + +@@ -2091,6 +2128,21 @@ static void pss_host_page_finish(PageSearchStatus *pss) + pss->host_page_start = pss->host_page_end = 0; + } + ++static void ram_page_hint_update(RAMState *rs, PageSearchStatus *pss) ++{ ++ PageLocationHint *hint = &rs->page_hint; ++ ++ /* If there's a pending hint not consumed, don't bother */ ++ if (hint->valid) { ++ return; ++ } ++ ++ /* Provide a hint to the background stream otherwise */ ++ hint->location.block = pss->block; ++ hint->location.offset = pss->page; ++ hint->valid = true; ++} ++ + /* + * Send an urgent host page specified by `pss'. Need to be called with + * bitmap_mutex held. +@@ -2136,6 +2188,7 @@ out: + /* For urgent requests, flush immediately if sent */ + if (sent) { + qemu_fflush(pss->pss_channel); ++ ram_page_hint_update(rs, pss); + } + return ret; + } +@@ -2223,6 +2276,30 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) + return (res < 0 ? 
res : pages); + } + ++static bool ram_page_hint_valid(RAMState *rs) ++{ ++ /* There's only page hint during postcopy preempt mode */ ++ if (!postcopy_preempt_active()) { ++ return false; ++ } ++ ++ return rs->page_hint.valid; ++} ++ ++static void ram_page_hint_collect(RAMState *rs, RAMBlock **block, ++ unsigned long *page) ++{ ++ PageLocationHint *hint = &rs->page_hint; ++ ++ assert(hint->valid); ++ ++ *block = hint->location.block; ++ *page = hint->location.offset; ++ ++ /* Mark the hint consumed */ ++ hint->valid = false; ++} ++ + /** + * ram_find_and_save_block: finds a dirty page and sends it to f + * +@@ -2239,6 +2316,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) + static int ram_find_and_save_block(RAMState *rs) + { + PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; ++ unsigned long next_page; ++ RAMBlock *next_block; + int pages = 0; + + /* No dirty page as there is zero RAM */ +@@ -2258,7 +2337,14 @@ static int ram_find_and_save_block(RAMState *rs) + rs->last_page = 0; + } + +- pss_init(pss, rs->last_seen_block, rs->last_page); ++ if (ram_page_hint_valid(rs)) { ++ ram_page_hint_collect(rs, &next_block, &next_page); ++ } else { ++ next_block = rs->last_seen_block; ++ next_page = rs->last_page; ++ } ++ ++ pss_init(pss, next_block, next_page); + + while (true){ + if (!get_queued_page(rs, pss)) { +@@ -2392,6 +2478,13 @@ static void ram_save_cleanup(void *opaque) + migration_ops = NULL; + } + ++static void ram_page_hint_reset(PageLocationHint *hint) ++{ ++ hint->location.block = NULL; ++ hint->location.offset = 0; ++ hint->valid = false; ++} ++ + static void ram_state_reset(RAMState *rs) + { + int i; +@@ -2404,6 +2497,8 @@ static void ram_state_reset(RAMState *rs) + rs->last_page = 0; + rs->last_version = ram_list.version; + rs->xbzrle_started = false; ++ ++ ram_page_hint_reset(&rs->page_hint); + } + + #define MAX_WAIT 50 /* ms, half buffered_file limit */ +-- +2.48.1 + diff --git 
a/SOURCES/kvm-mirror-Allow-QMP-override-to-declare-target-already-.patch b/SOURCES/kvm-mirror-Allow-QMP-override-to-declare-target-already-.patch new file mode 100644 index 0000000..c00be71 --- /dev/null +++ b/SOURCES/kvm-mirror-Allow-QMP-override-to-declare-target-already-.patch @@ -0,0 +1,295 @@ +From a5f6042a0c80daf3672fa071b724cb05e6f6e928 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Fri, 9 May 2025 15:40:25 -0500 +Subject: [PATCH 10/16] mirror: Allow QMP override to declare target already + zero + +RH-Author: Eric Blake +RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors +RH-Jira: RHEL-82906 RHEL-83015 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Jon Maloy +RH-Commit: [8/14] fb054864175d83e9d232464295b170808bee0e6c (ebblake/centos-qemu-kvm) + +QEMU has an optimization for a just-created drive-mirror destination +that is not possible for blockdev-mirror (which can't create the +destination) - any time we know the destination starts life as all +zeroes, we can skip a pre-zeroing pass on the destination. Recent +patches have added an improved heuristic for detecting if a file +contains all zeroes, and we plan to use that heuristic in upcoming +patches. But since a heuristic cannot quickly detect all scenarios, +and there may be cases where the caller is aware of information that +QEMU cannot learn quickly, it makes sense to have a way to tell QEMU +to assume facts about the destination that can make the mirror +operation faster. Given our existing example of "qemu-img convert +--target-is-zero", it is time to expose this override in QMP for +blockdev-mirror as well. + +This patch results in some slight redundancy between the older +s->zero_target (set any time mode==FULL and the destination image was +not just created - i.e.
clear if drive-mirror is asking to skip the +pre-zero pass) and the newly-introduced s->target_is_zero (in addition +to the QMP override, it is set when drive-mirror creates the +destination image); this will be cleaned up in the next patch. + +There is also a subtlety that we must consider. When drive-mirror is +passing target_is_zero on behalf of a just-created image, we know the +image is sparse (skipping the pre-zeroing keeps it that way), so it +doesn't matter whether the destination also has "discard":"unmap" and +"detect-zeroes":"unmap". But now that we are letting the user set the +knob for target-is-zero, if the user passes a pre-existing file that +is fully allocated, it is fine to leave the file fully allocated under +"detect-zeroes":"on", but if the file is open with +"detect-zeroes":"unmap", we should really be trying harder to punch +holes in the destination for every region of zeroes copied from the +source. The easiest way to do this is to still run the pre-zeroing +pass (turning the entire destination file sparse before populating +just the allocated portions of the source), even though that currently +results in double I/O to the portions of the file that are allocated. +A later patch will add further optimizations to reduce redundant +zeroing I/O during the mirror operation. + +Since "target-is-zero":true is designed for optimizations, it is okay +to silently ignore the parameter rather than erroring if the user ever +sets the parameter in a scenario where the mirror job can't exploit it +(for example, when doing "sync":"top" instead of "sync":"full", we +can't pre-zero, so setting the parameter won't make a speed +difference). 
+ +Signed-off-by: Eric Blake +Acked-by: Markus Armbruster +Message-ID: <20250509204341.3553601-23-eblake@redhat.com> +Reviewed-by: Sunny Zhu +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit d17a34bfb94bda3a89d7320ae67255ded1d8c939) +Jira: https://issues.redhat.com/browse/RHEL-82906 +Jira: https://issues.redhat.com/browse/RHEL-83015 +Signed-off-by: Eric Blake +--- + block/mirror.c | 27 ++++++++++++++++++++++---- + blockdev.c | 18 ++++++++++------- + include/block/block_int-global-state.h | 3 ++- + qapi/block-core.json | 8 +++++++- + tests/unit/test-block-iothread.c | 2 +- + 5 files changed, 44 insertions(+), 14 deletions(-) + +diff --git a/block/mirror.c b/block/mirror.c +index c8bbaa0b35..bba3e3b05c 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -55,6 +55,8 @@ typedef struct MirrorBlockJob { + BlockMirrorBackingMode backing_mode; + /* Whether the target image requires explicit zero-initialization */ + bool zero_target; ++ /* Whether the target should be assumed to be already zero initialized */ ++ bool target_is_zero; + /* + * To be accesssed with atomics. Written only under the BQL (required by the + * current implementation of mirror_change()). +@@ -844,12 +846,26 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s) + BlockDriverState *target_bs = blk_bs(s->target); + int ret = -EIO; + int64_t count; ++ bool punch_holes = ++ target_bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP && ++ bdrv_can_write_zeroes_with_unmap(target_bs); + + bdrv_graph_co_rdlock(); + bs = s->mirror_top_bs->backing->bs; + bdrv_graph_co_rdunlock(); + +- if (s->zero_target) { ++ if (s->zero_target && (!s->target_is_zero || punch_holes)) { ++ /* ++ * Here, we are in FULL mode; our goal is to avoid writing ++ * zeroes if the destination already reads as zero, except ++ * when we are trying to punch holes. 
This is possible if ++ * zeroing happened externally (s->target_is_zero) or if we ++ * have a fast way to pre-zero the image (the dirty bitmap ++ * will be populated later by the non-zero portions, the same ++ * as for TOP mode). If pre-zeroing is not fast, or we need ++ * to punch holes, then our only recourse is to write the ++ * entire image. ++ */ + if (!bdrv_can_write_zeroes_with_unmap(target_bs)) { + bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, s->bdev_length); + return 0; +@@ -1714,7 +1730,7 @@ static BlockJob *mirror_start_job( + uint32_t granularity, int64_t buf_size, + MirrorSyncMode sync_mode, + BlockMirrorBackingMode backing_mode, +- bool zero_target, ++ bool zero_target, bool target_is_zero, + BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + bool unmap, +@@ -1883,6 +1899,7 @@ static BlockJob *mirror_start_job( + s->sync_mode = sync_mode; + s->backing_mode = backing_mode; + s->zero_target = zero_target; ++ s->target_is_zero = target_is_zero; + qatomic_set(&s->copy_mode, copy_mode); + s->base = base; + s->base_overlay = bdrv_find_overlay(bs, base); +@@ -2011,7 +2028,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs, + int creation_flags, int64_t speed, + uint32_t granularity, int64_t buf_size, + MirrorSyncMode mode, BlockMirrorBackingMode backing_mode, +- bool zero_target, ++ bool zero_target, bool target_is_zero, + BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + bool unmap, const char *filter_node_name, +@@ -2034,7 +2051,8 @@ void mirror_start(const char *job_id, BlockDriverState *bs, + + mirror_start_job(job_id, bs, creation_flags, target, replaces, + speed, granularity, buf_size, mode, backing_mode, +- zero_target, on_source_error, on_target_error, unmap, ++ zero_target, ++ target_is_zero, on_source_error, on_target_error, unmap, + NULL, NULL, &mirror_job_driver, base, false, + filter_node_name, true, copy_mode, false, errp); + } +@@ -2062,6 +2080,7 @@ BlockJob *commit_active_start(const char 
*job_id, BlockDriverState *bs, + job = mirror_start_job( + job_id, bs, creation_flags, base, NULL, speed, 0, 0, + MIRROR_SYNC_MODE_TOP, MIRROR_LEAVE_BACKING_CHAIN, false, ++ false, + on_error, on_error, true, cb, opaque, + &commit_active_job_driver, base, auto_complete, + filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND, +diff --git a/blockdev.c b/blockdev.c +index 70046b6690..db11a99312 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -2795,7 +2795,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs, + const char *replaces, + enum MirrorSyncMode sync, + BlockMirrorBackingMode backing_mode, +- bool zero_target, ++ bool zero_target, bool target_is_zero, + bool has_speed, int64_t speed, + bool has_granularity, uint32_t granularity, + bool has_buf_size, int64_t buf_size, +@@ -2906,11 +2906,10 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs, + /* pass the node name to replace to mirror start since it's loose coupling + * and will allow to check whether the node still exist at mirror completion + */ +- mirror_start(job_id, bs, target, +- replaces, job_flags, ++ mirror_start(job_id, bs, target, replaces, job_flags, + speed, granularity, buf_size, sync, backing_mode, zero_target, +- on_source_error, on_target_error, unmap, filter_node_name, +- copy_mode, errp); ++ target_is_zero, on_source_error, on_target_error, unmap, ++ filter_node_name, copy_mode, errp); + } + + void qmp_drive_mirror(DriveMirror *arg, Error **errp) +@@ -2925,6 +2924,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp) + int64_t size; + const char *format = arg->format; + bool zero_target; ++ bool target_is_zero; + int ret; + + bs = qmp_get_root_bs(arg->device, errp); +@@ -3041,6 +3041,8 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp) + zero_target = (arg->sync == MIRROR_SYNC_MODE_FULL && + (arg->mode == NEW_IMAGE_MODE_EXISTING || + !bdrv_has_zero_init(target_bs))); ++ target_is_zero = (arg->mode != NEW_IMAGE_MODE_EXISTING && 
++ bdrv_has_zero_init(target_bs)); + bdrv_graph_rdunlock_main_loop(); + + +@@ -3052,7 +3054,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp) + + blockdev_mirror_common(arg->job_id, bs, target_bs, + arg->replaces, arg->sync, +- backing_mode, zero_target, ++ backing_mode, zero_target, target_is_zero, + arg->has_speed, arg->speed, + arg->has_granularity, arg->granularity, + arg->has_buf_size, arg->buf_size, +@@ -3082,6 +3084,7 @@ void qmp_blockdev_mirror(const char *job_id, + bool has_copy_mode, MirrorCopyMode copy_mode, + bool has_auto_finalize, bool auto_finalize, + bool has_auto_dismiss, bool auto_dismiss, ++ bool has_target_is_zero, bool target_is_zero, + Error **errp) + { + BlockDriverState *bs; +@@ -3112,7 +3115,8 @@ void qmp_blockdev_mirror(const char *job_id, + + blockdev_mirror_common(job_id, bs, target_bs, + replaces, sync, backing_mode, +- zero_target, has_speed, speed, ++ zero_target, has_target_is_zero && target_is_zero, ++ has_speed, speed, + has_granularity, granularity, + has_buf_size, buf_size, + has_on_source_error, on_source_error, +diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h +index eb2d92a226..8cf0003ce7 100644 +--- a/include/block/block_int-global-state.h ++++ b/include/block/block_int-global-state.h +@@ -140,6 +140,7 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs, + * @mode: Whether to collapse all images in the chain to the target. + * @backing_mode: How to establish the target's backing chain after completion. + * @zero_target: Whether the target should be explicitly zero-initialized ++ * @target_is_zero: Whether the target already is zero-initialized. + * @on_source_error: The action to take upon error reading from the source. + * @on_target_error: The action to take upon error writing to the target. + * @unmap: Whether to unmap target where source sectors only contain zeroes. 
+@@ -159,7 +160,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs, + int creation_flags, int64_t speed, + uint32_t granularity, int64_t buf_size, + MirrorSyncMode mode, BlockMirrorBackingMode backing_mode, +- bool zero_target, ++ bool zero_target, bool target_is_zero, + BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + bool unmap, const char *filter_node_name, +diff --git a/qapi/block-core.json b/qapi/block-core.json +index c1af3d1f7d..3969c60b93 100644 +--- a/qapi/block-core.json ++++ b/qapi/block-core.json +@@ -2535,6 +2535,11 @@ + # disappear from the query list without user intervention. + # Defaults to true. (Since 3.1) + # ++# @target-is-zero: Assume the destination reads as all zeroes before ++# the mirror started. Setting this to true can speed up the ++# mirror. Setting this to true when the destination is not ++# actually all zero can corrupt the destination. (Since 10.1) ++# + # Since: 2.6 + # + # .. qmp-example:: +@@ -2554,7 +2559,8 @@ + '*on-target-error': 'BlockdevOnError', + '*filter-node-name': 'str', + '*copy-mode': 'MirrorCopyMode', +- '*auto-finalize': 'bool', '*auto-dismiss': 'bool' }, ++ '*auto-finalize': 'bool', '*auto-dismiss': 'bool', ++ '*target-is-zero': 'bool'}, + 'allow-preconfig': true } + + ## +diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c +index 373b72fdd8..033711d8d7 100644 +--- a/tests/unit/test-block-iothread.c ++++ b/tests/unit/test-block-iothread.c +@@ -755,7 +755,7 @@ static void test_propagate_mirror(void) + + /* Start a mirror job */ + mirror_start("job0", src, target, NULL, JOB_DEFAULT, 0, 0, 0, +- MIRROR_SYNC_MODE_NONE, MIRROR_OPEN_BACKING_CHAIN, false, ++ MIRROR_SYNC_MODE_NONE, MIRROR_OPEN_BACKING_CHAIN, false, false, + BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT, + false, "filter_node", MIRROR_COPY_MODE_BACKGROUND, + &error_abort); +-- +2.48.1 + diff --git a/SOURCES/kvm-mirror-Drop-redundant-zero_target-parameter.patch 
b/SOURCES/kvm-mirror-Drop-redundant-zero_target-parameter.patch new file mode 100644 index 0000000..ee01f81 --- /dev/null +++ b/SOURCES/kvm-mirror-Drop-redundant-zero_target-parameter.patch @@ -0,0 +1,241 @@ +From 5040f835f07f3355ae80b3da2ae83ce35de022e0 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Fri, 9 May 2025 15:40:26 -0500 +Subject: [PATCH 11/16] mirror: Drop redundant zero_target parameter + +RH-Author: Eric Blake +RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors +RH-Jira: RHEL-82906 RHEL-83015 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Jon Maloy +RH-Commit: [9/14] b84a938c69e3761211b9fee4c59b465d55f61855 (ebblake/centos-qemu-kvm) + +The two callers to a mirror job (drive-mirror and blockdev-mirror) set +zero_target precisely when sync mode == FULL, with the one exception +that drive-mirror skips zeroing the target if it was newly created and +reads as zero. But given the previous patch, that exception is +equally captured by target_is_zero. + +Meanwhile, there is another slight wrinkle, fortunately caught by +iotest 185: if the caller uses "sync":"top" but the source has no +backing file, the code in blockdev.c was changing sync to be FULL, but +only after it had set zero_target=false. In mirror.c, prior to recent +patches, this didn't matter: the only places that inspected sync were +setting is_none_mode (both TOP and FULL had set that to false), and +mirror_start() setting base = mode == MIRROR_SYNC_MODE_TOP ? +bdrv_backing_chain_next(bs) : NULL. But now that we are passing sync +around, the slammed sync mode would result in a new pre-zeroing pass +even when the user had passed "sync":"top" in an effort to skip +pre-zeroing. Fortunately, the assignment of base when bs has no +backing chain still works out to NULL if we don't slam things. 
So +with the forced change of sync ripped out of blockdev.c, the sync mode +is passed through the full callstack unmolested, and we can now +reliably reconstruct the same settings as what used to be passed in by +zero_target=false, without the redundant parameter. + +Signed-off-by: Eric Blake +Message-ID: <20250509204341.3553601-24-eblake@redhat.com> +Reviewed-by: Sunny Zhu +Reviewed-by: Stefan Hajnoczi +[eblake: Fix regression in iotest 185] +Signed-off-by: Eric Blake +(cherry picked from commit 253b43a29077de9266351e120c600a73b82e9c49) +Jira: https://issues.redhat.com/browse/RHEL-82906 +Jira: https://issues.redhat.com/browse/RHEL-83015 +Signed-off-by: Eric Blake +--- + block/mirror.c | 13 +++++-------- + blockdev.c | 19 ++++--------------- + include/block/block_int-global-state.h | 3 +-- + tests/unit/test-block-iothread.c | 2 +- + 4 files changed, 11 insertions(+), 26 deletions(-) + +diff --git a/block/mirror.c b/block/mirror.c +index bba3e3b05c..b35d12adaa 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -53,8 +53,6 @@ typedef struct MirrorBlockJob { + Error *replace_blocker; + MirrorSyncMode sync_mode; + BlockMirrorBackingMode backing_mode; +- /* Whether the target image requires explicit zero-initialization */ +- bool zero_target; + /* Whether the target should be assumed to be already zero initialized */ + bool target_is_zero; + /* +@@ -854,7 +852,9 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s) + bs = s->mirror_top_bs->backing->bs; + bdrv_graph_co_rdunlock(); + +- if (s->zero_target && (!s->target_is_zero || punch_holes)) { ++ if (s->sync_mode == MIRROR_SYNC_MODE_TOP) { ++ /* In TOP mode, there is no benefit to a pre-zeroing pass. 
*/ ++ } else if (!s->target_is_zero || punch_holes) { + /* + * Here, we are in FULL mode; our goal is to avoid writing + * zeroes if the destination already reads as zero, except +@@ -1730,7 +1730,7 @@ static BlockJob *mirror_start_job( + uint32_t granularity, int64_t buf_size, + MirrorSyncMode sync_mode, + BlockMirrorBackingMode backing_mode, +- bool zero_target, bool target_is_zero, ++ bool target_is_zero, + BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + bool unmap, +@@ -1898,7 +1898,6 @@ static BlockJob *mirror_start_job( + s->on_target_error = on_target_error; + s->sync_mode = sync_mode; + s->backing_mode = backing_mode; +- s->zero_target = zero_target; + s->target_is_zero = target_is_zero; + qatomic_set(&s->copy_mode, copy_mode); + s->base = base; +@@ -2028,7 +2027,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs, + int creation_flags, int64_t speed, + uint32_t granularity, int64_t buf_size, + MirrorSyncMode mode, BlockMirrorBackingMode backing_mode, +- bool zero_target, bool target_is_zero, ++ bool target_is_zero, + BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + bool unmap, const char *filter_node_name, +@@ -2051,7 +2050,6 @@ void mirror_start(const char *job_id, BlockDriverState *bs, + + mirror_start_job(job_id, bs, creation_flags, target, replaces, + speed, granularity, buf_size, mode, backing_mode, +- zero_target, + target_is_zero, on_source_error, on_target_error, unmap, + NULL, NULL, &mirror_job_driver, base, false, + filter_node_name, true, copy_mode, false, errp); +@@ -2080,7 +2078,6 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs, + job = mirror_start_job( + job_id, bs, creation_flags, base, NULL, speed, 0, 0, + MIRROR_SYNC_MODE_TOP, MIRROR_LEAVE_BACKING_CHAIN, false, +- false, + on_error, on_error, true, cb, opaque, + &commit_active_job_driver, base, auto_complete, + filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND, +diff --git a/blockdev.c b/blockdev.c +index 
db11a99312..04fa759e30 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -2795,7 +2795,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs, + const char *replaces, + enum MirrorSyncMode sync, + BlockMirrorBackingMode backing_mode, +- bool zero_target, bool target_is_zero, ++ bool target_is_zero, + bool has_speed, int64_t speed, + bool has_granularity, uint32_t granularity, + bool has_buf_size, int64_t buf_size, +@@ -2862,10 +2862,6 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs, + return; + } + +- if (!bdrv_backing_chain_next(bs) && sync == MIRROR_SYNC_MODE_TOP) { +- sync = MIRROR_SYNC_MODE_FULL; +- } +- + if (!replaces) { + /* We want to mirror from @bs, but keep implicit filters on top */ + unfiltered_bs = bdrv_skip_implicit_filters(bs); +@@ -2907,7 +2903,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs, + * and will allow to check whether the node still exist at mirror completion + */ + mirror_start(job_id, bs, target, replaces, job_flags, +- speed, granularity, buf_size, sync, backing_mode, zero_target, ++ speed, granularity, buf_size, sync, backing_mode, + target_is_zero, on_source_error, on_target_error, unmap, + filter_node_name, copy_mode, errp); + } +@@ -2923,7 +2919,6 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp) + int flags; + int64_t size; + const char *format = arg->format; +- bool zero_target; + bool target_is_zero; + int ret; + +@@ -3038,9 +3033,6 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp) + } + + bdrv_graph_rdlock_main_loop(); +- zero_target = (arg->sync == MIRROR_SYNC_MODE_FULL && +- (arg->mode == NEW_IMAGE_MODE_EXISTING || +- !bdrv_has_zero_init(target_bs))); + target_is_zero = (arg->mode != NEW_IMAGE_MODE_EXISTING && + bdrv_has_zero_init(target_bs)); + bdrv_graph_rdunlock_main_loop(); +@@ -3054,7 +3046,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp) + + blockdev_mirror_common(arg->job_id, bs, target_bs, + arg->replaces, 
arg->sync, +- backing_mode, zero_target, target_is_zero, ++ backing_mode, target_is_zero, + arg->has_speed, arg->speed, + arg->has_granularity, arg->granularity, + arg->has_buf_size, arg->buf_size, +@@ -3091,7 +3083,6 @@ void qmp_blockdev_mirror(const char *job_id, + BlockDriverState *target_bs; + AioContext *aio_context; + BlockMirrorBackingMode backing_mode = MIRROR_LEAVE_BACKING_CHAIN; +- bool zero_target; + int ret; + + bs = qmp_get_root_bs(device, errp); +@@ -3104,8 +3095,6 @@ void qmp_blockdev_mirror(const char *job_id, + return; + } + +- zero_target = (sync == MIRROR_SYNC_MODE_FULL); +- + aio_context = bdrv_get_aio_context(bs); + + ret = bdrv_try_change_aio_context(target_bs, aio_context, NULL, errp); +@@ -3115,7 +3104,7 @@ void qmp_blockdev_mirror(const char *job_id, + + blockdev_mirror_common(job_id, bs, target_bs, + replaces, sync, backing_mode, +- zero_target, has_target_is_zero && target_is_zero, ++ has_target_is_zero && target_is_zero, + has_speed, speed, + has_granularity, granularity, + has_buf_size, buf_size, +diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h +index 8cf0003ce7..d21bd7fd2f 100644 +--- a/include/block/block_int-global-state.h ++++ b/include/block/block_int-global-state.h +@@ -139,7 +139,6 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs, + * @buf_size: The amount of data that can be in flight at one time. + * @mode: Whether to collapse all images in the chain to the target. + * @backing_mode: How to establish the target's backing chain after completion. +- * @zero_target: Whether the target should be explicitly zero-initialized + * @target_is_zero: Whether the target already is zero-initialized. + * @on_source_error: The action to take upon error reading from the source. + * @on_target_error: The action to take upon error writing to the target. 
+@@ -160,7 +159,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs, + int creation_flags, int64_t speed, + uint32_t granularity, int64_t buf_size, + MirrorSyncMode mode, BlockMirrorBackingMode backing_mode, +- bool zero_target, bool target_is_zero, ++ bool target_is_zero, + BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + bool unmap, const char *filter_node_name, +diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c +index 033711d8d7..373b72fdd8 100644 +--- a/tests/unit/test-block-iothread.c ++++ b/tests/unit/test-block-iothread.c +@@ -755,7 +755,7 @@ static void test_propagate_mirror(void) + + /* Start a mirror job */ + mirror_start("job0", src, target, NULL, JOB_DEFAULT, 0, 0, 0, +- MIRROR_SYNC_MODE_NONE, MIRROR_OPEN_BACKING_CHAIN, false, false, ++ MIRROR_SYNC_MODE_NONE, MIRROR_OPEN_BACKING_CHAIN, false, + BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT, + false, "filter_node", MIRROR_COPY_MODE_BACKGROUND, + &error_abort); +-- +2.48.1 + diff --git a/SOURCES/kvm-mirror-Minor-refactoring.patch b/SOURCES/kvm-mirror-Minor-refactoring.patch new file mode 100644 index 0000000..eda26ee --- /dev/null +++ b/SOURCES/kvm-mirror-Minor-refactoring.patch @@ -0,0 +1,92 @@ +From 0102da22fe5aefde9d398d539fc290ab062346f1 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Fri, 9 May 2025 15:40:23 -0500 +Subject: [PATCH 08/16] mirror: Minor refactoring + +RH-Author: Eric Blake +RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors +RH-Jira: RHEL-82906 RHEL-83015 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Jon Maloy +RH-Commit: [6/14] 886fa2e3249f48f89d3e04ba619d370031851d89 (ebblake/centos-qemu-kvm) + +Commit 5791ba52 (v9.2) pre-initialized ret in mirror_dirty_init to +silence a false positive compiler warning, even though in all code +paths where ret is used, it was guaranteed to be reassigned +beforehand. 
But since the function returns -errno, and -1 is not +always the right errno, it's better to initialize to -EIO. + +An upcoming patch wants to track two bitmaps in +do_sync_target_write(); this will be easier if the current variables +related to the dirty bitmap are renamed. + +Signed-off-by: Eric Blake +Reviewed-by: Stefan Hajnoczi +Message-ID: <20250509204341.3553601-21-eblake@redhat.com> +(cherry picked from commit 870f8963cf1a84f8ec929b05a6d68906974a76c5) +Conflicts: + block/mirror.c - commit 5791ba52 not present +Jira: https://issues.redhat.com/browse/RHEL-82906 +Jira: https://issues.redhat.com/browse/RHEL-83015 +Signed-off-by: Eric Blake +--- + block/mirror.c | 22 +++++++++++----------- + 1 file changed, 11 insertions(+), 11 deletions(-) + +diff --git a/block/mirror.c b/block/mirror.c +index 61f0a717b7..22f8bd98c4 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -841,7 +841,7 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s) + int64_t offset; + BlockDriverState *bs; + BlockDriverState *target_bs = blk_bs(s->target); +- int ret; ++ int ret = -EIO; + int64_t count; + + bdrv_graph_co_rdlock(); +@@ -1341,7 +1341,7 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method, + { + int ret; + size_t qiov_offset = 0; +- int64_t bitmap_offset, bitmap_end; ++ int64_t dirty_bitmap_offset, dirty_bitmap_end; + + if (!QEMU_IS_ALIGNED(offset, job->granularity) && + bdrv_dirty_bitmap_get(job->dirty_bitmap, offset)) +@@ -1388,11 +1388,11 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method, + * Tails are either clean or shrunk, so for bitmap resetting + * we safely align the range down. 
+ */ +- bitmap_offset = QEMU_ALIGN_UP(offset, job->granularity); +- bitmap_end = QEMU_ALIGN_DOWN(offset + bytes, job->granularity); +- if (bitmap_offset < bitmap_end) { +- bdrv_reset_dirty_bitmap(job->dirty_bitmap, bitmap_offset, +- bitmap_end - bitmap_offset); ++ dirty_bitmap_offset = QEMU_ALIGN_UP(offset, job->granularity); ++ dirty_bitmap_end = QEMU_ALIGN_DOWN(offset + bytes, job->granularity); ++ if (dirty_bitmap_offset < dirty_bitmap_end) { ++ bdrv_reset_dirty_bitmap(job->dirty_bitmap, dirty_bitmap_offset, ++ dirty_bitmap_end - dirty_bitmap_offset); + } + + job_progress_increase_remaining(&job->common.job, bytes); +@@ -1430,10 +1430,10 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method, + * at function start, and they must be still dirty, as we've locked + * the region for in-flight op. + */ +- bitmap_offset = QEMU_ALIGN_DOWN(offset, job->granularity); +- bitmap_end = QEMU_ALIGN_UP(offset + bytes, job->granularity); +- bdrv_set_dirty_bitmap(job->dirty_bitmap, bitmap_offset, +- bitmap_end - bitmap_offset); ++ dirty_bitmap_offset = QEMU_ALIGN_DOWN(offset, job->granularity); ++ dirty_bitmap_end = QEMU_ALIGN_UP(offset + bytes, job->granularity); ++ bdrv_set_dirty_bitmap(job->dirty_bitmap, dirty_bitmap_offset, ++ dirty_bitmap_end - dirty_bitmap_offset); + qatomic_set(&job->actively_synced, false); + + action = mirror_error_action(job, false, -ret); +-- +2.48.1 + diff --git a/SOURCES/kvm-mirror-Pass-full-sync-mode-rather-than-bool-to-inter.patch b/SOURCES/kvm-mirror-Pass-full-sync-mode-rather-than-bool-to-inter.patch new file mode 100644 index 0000000..653bb20 --- /dev/null +++ b/SOURCES/kvm-mirror-Pass-full-sync-mode-rather-than-bool-to-inter.patch @@ -0,0 +1,139 @@ +From 482db3e637a16d5877e523e87c53ddb2579b4b66 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Fri, 9 May 2025 15:40:24 -0500 +Subject: [PATCH 09/16] mirror: Pass full sync mode rather than bool to + internals + +RH-Author: Eric Blake +RH-MergeRequest: 365: blockdev-mirror: More 
efficient handling of sparse mirrors +RH-Jira: RHEL-82906 RHEL-83015 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Jon Maloy +RH-Commit: [7/14] f45a83a14b0eea07517176d44ab0c49db8233ea0 (ebblake/centos-qemu-kvm) + +Out of the five possible values for MirrorSyncMode, INCREMENTAL and +BITMAP are already rejected up front in mirror_start, leaving NONE, +TOP, and FULL as the remaining values that the code was collapsing +into a single bool is_none_mode. Furthermore, mirror_dirty_init() is +only reachable for modes TOP and FULL, as further guided by +s->zero_target. However, upcoming patches want to further optimize +the pre-zeroing pass of a sync=full mirror in mirror_dirty_init(), +while avoiding that pass on a sync=top action. Instead of throwing +away context by collapsing these two values into +s->is_none_mode=false, it is better to pass s->sync_mode throughout +the entire operation. For active commit, the desired semantics match +sync mode TOP. + +Signed-off-by: Eric Blake +Message-ID: <20250509204341.3553601-22-eblake@redhat.com> +Reviewed-by: Sunny Zhu +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit 9474d97bd7421b4fe7c806ab0949697514d11e88) +Jira: https://issues.redhat.com/browse/RHEL-82906 +Jira: https://issues.redhat.com/browse/RHEL-83015 +Signed-off-by: Eric Blake +--- + block/mirror.c | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +diff --git a/block/mirror.c b/block/mirror.c +index 22f8bd98c4..c8bbaa0b35 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -51,7 +51,7 @@ typedef struct MirrorBlockJob { + BlockDriverState *to_replace; + /* Used to block operations on the drive-mirror-replace target */ + Error *replace_blocker; +- bool is_none_mode; ++ MirrorSyncMode sync_mode; + BlockMirrorBackingMode backing_mode; + /* Whether the target image requires explicit zero-initialization */ + bool zero_target; +@@ -723,9 +723,10 @@ static int mirror_exit_common(Job *job) + &error_abort); + + if (!abort && 
s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) { +- BlockDriverState *backing = s->is_none_mode ? src : s->base; ++ BlockDriverState *backing; + BlockDriverState *unfiltered_target = bdrv_skip_filters(target_bs); + ++ backing = s->sync_mode == MIRROR_SYNC_MODE_NONE ? src : s->base; + if (bdrv_cow_bs(unfiltered_target) != backing) { + bdrv_set_backing_hd(unfiltered_target, backing, &local_err); + if (local_err) { +@@ -1020,7 +1021,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) + mirror_free_init(s); + + s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); +- if (!s->is_none_mode) { ++ if (s->sync_mode != MIRROR_SYNC_MODE_NONE) { + ret = mirror_dirty_init(s); + if (ret < 0 || job_is_cancelled(&s->common.job)) { + goto immediate_exit; +@@ -1711,6 +1712,7 @@ static BlockJob *mirror_start_job( + int creation_flags, BlockDriverState *target, + const char *replaces, int64_t speed, + uint32_t granularity, int64_t buf_size, ++ MirrorSyncMode sync_mode, + BlockMirrorBackingMode backing_mode, + bool zero_target, + BlockdevOnError on_source_error, +@@ -1719,7 +1721,7 @@ static BlockJob *mirror_start_job( + BlockCompletionFunc *cb, + void *opaque, + const BlockJobDriver *driver, +- bool is_none_mode, BlockDriverState *base, ++ BlockDriverState *base, + bool auto_complete, const char *filter_node_name, + bool is_mirror, MirrorCopyMode copy_mode, + bool base_ro, +@@ -1878,7 +1880,7 @@ static BlockJob *mirror_start_job( + s->replaces = g_strdup(replaces); + s->on_source_error = on_source_error; + s->on_target_error = on_target_error; +- s->is_none_mode = is_none_mode; ++ s->sync_mode = sync_mode; + s->backing_mode = backing_mode; + s->zero_target = zero_target; + qatomic_set(&s->copy_mode, copy_mode); +@@ -2015,7 +2017,6 @@ void mirror_start(const char *job_id, BlockDriverState *bs, + bool unmap, const char *filter_node_name, + MirrorCopyMode copy_mode, Error **errp) + { +- bool is_none_mode; + BlockDriverState *base; + + GLOBAL_STATE_CODE(); +@@ -2028,14 
+2029,13 @@ void mirror_start(const char *job_id, BlockDriverState *bs, + } + + bdrv_graph_rdlock_main_loop(); +- is_none_mode = mode == MIRROR_SYNC_MODE_NONE; + base = mode == MIRROR_SYNC_MODE_TOP ? bdrv_backing_chain_next(bs) : NULL; + bdrv_graph_rdunlock_main_loop(); + + mirror_start_job(job_id, bs, creation_flags, target, replaces, +- speed, granularity, buf_size, backing_mode, zero_target, +- on_source_error, on_target_error, unmap, NULL, NULL, +- &mirror_job_driver, is_none_mode, base, false, ++ speed, granularity, buf_size, mode, backing_mode, ++ zero_target, on_source_error, on_target_error, unmap, ++ NULL, NULL, &mirror_job_driver, base, false, + filter_node_name, true, copy_mode, false, errp); + } + +@@ -2061,9 +2061,9 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs, + + job = mirror_start_job( + job_id, bs, creation_flags, base, NULL, speed, 0, 0, +- MIRROR_LEAVE_BACKING_CHAIN, false, ++ MIRROR_SYNC_MODE_TOP, MIRROR_LEAVE_BACKING_CHAIN, false, + on_error, on_error, true, cb, opaque, +- &commit_active_job_driver, false, base, auto_complete, ++ &commit_active_job_driver, base, auto_complete, + filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND, + base_read_only, errp); + if (!job) { +-- +2.48.1 + diff --git a/SOURCES/kvm-mirror-Reduce-I-O-when-destination-is-detect-zeroes-.patch b/SOURCES/kvm-mirror-Reduce-I-O-when-destination-is-detect-zeroes-.patch new file mode 100644 index 0000000..3864b3d --- /dev/null +++ b/SOURCES/kvm-mirror-Reduce-I-O-when-destination-is-detect-zeroes-.patch @@ -0,0 +1,58 @@ +From be6ce2c91fe949d1c264de974ab4f6c4efc6976e Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 13 May 2025 17:00:45 -0500 +Subject: [PATCH 16/16] mirror: Reduce I/O when destination is + detect-zeroes:unmap + +RH-Author: Eric Blake +RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors +RH-Jira: RHEL-82906 RHEL-83015 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Jon Maloy +RH-Commit: [14/14] 
66f3de2ba9f977c9bc1c54f67d76b366df132e62 (ebblake/centos-qemu-kvm) + +If we are going to punch holes in the mirror destination even for the +portions where the source image is unallocated, it is nicer to treat +the entire image as dirty and punch as we go, rather than pre-zeroing +the entire image just to re-do I/O to the allocated portions of the +image. + +Signed-off-by: Eric Blake +Message-ID: <20250513220142.535200-2-eblake@redhat.com> +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit 9abfc81246c9cc1845080eec5920779961187c07) +Jira: https://issues.redhat.com/browse/RHEL-82906 +Jira: https://issues.redhat.com/browse/RHEL-83015 +Signed-off-by: Eric Blake +--- + block/mirror.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +diff --git a/block/mirror.c b/block/mirror.c +index 7f3b5477ce..87c19ddf0d 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -920,11 +920,16 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s) + * zeroing happened externally (ret > 0) or if we have a fast + * way to pre-zero the image (the dirty bitmap will be + * populated later by the non-zero portions, the same as for +- * TOP mode). If pre-zeroing is not fast, then our only +- * recourse is to mark the entire image dirty. The act of +- * pre-zeroing will populate the zero bitmap. ++ * TOP mode). If pre-zeroing is not fast, or we need to visit ++ * the entire image in order to punch holes even in the ++ * non-allocated regions of the source, then just mark the ++ * entire image dirty and leave the zero bitmap clear at this ++ * point in time. Otherwise, it can be faster to pre-zero the ++ * image now, even if we re-write the allocated portions of ++ * the disk later, and the pre-zero pass will populate the ++ * zero bitmap. 
+ */ +- if (!bdrv_can_write_zeroes_with_unmap(target_bs)) { ++ if (!bdrv_can_write_zeroes_with_unmap(target_bs) || punch_holes) { + bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, s->bdev_length); + return 0; + } +-- +2.48.1 + diff --git a/SOURCES/kvm-mirror-Skip-pre-zeroing-destination-if-it-is-already.patch b/SOURCES/kvm-mirror-Skip-pre-zeroing-destination-if-it-is-already.patch new file mode 100644 index 0000000..ea9ad0b --- /dev/null +++ b/SOURCES/kvm-mirror-Skip-pre-zeroing-destination-if-it-is-already.patch @@ -0,0 +1,180 @@ +From 423ce7727eecae647330287e1264ac0d938fa7f9 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Fri, 9 May 2025 15:40:27 -0500 +Subject: [PATCH 12/16] mirror: Skip pre-zeroing destination if it is already + zero + +RH-Author: Eric Blake +RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors +RH-Jira: RHEL-82906 RHEL-83015 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Jon Maloy +RH-Commit: [10/14] e754ae559123099f4aed322f6a4287cf3323f54d (ebblake/centos-qemu-kvm) + +When doing a sync=full mirroring, we can skip pre-zeroing the +destination if it already reads as zeroes and we are not also trying +to punch holes due to detect-zeroes. With this patch, there are fewer +scenarios that have to pass in an explicit target-is-zero, while still +resulting in a sparse destination remaining sparse. + +A later patch will then further improve things to skip writing to the +destination for parts of the image where the source is zero; but even +with just this patch, it is possible to see a difference for any +source that does not report itself as fully allocated, coupled with a +destination BDS that can quickly report that it already reads as zero. +(For a source that reports as fully allocated, such as a file, the +rest of mirror_dirty_init() still sets the entire dirty bitmap to +true, so even though we avoided the pre-zeroing, we are not yet +avoiding all redundant I/O). 
+ +Iotest 194 detects the difference made by this patch: for a file +source (where block status reports the entire image as allocated, and +therefore we end up writing zeroes everywhere in the destination +anyways), the job length remains the same. But for a qcow2 source and +a destination that reads as all zeroes, the dirty bitmap changes to +just tracking the allocated portions of the source, which results in +faster completion and smaller job statistics. For the test to pass +with both ./check -file and -qcow2, a new python filter is needed to +mask out the now-varying job amounts (this matches the shell filters +_filter_block_job_{offset,len} in common.filter). A later test will +also be added which further validates expected sparseness, so it does +not matter that 194 is no longer explicitly looking at how many bytes +were copied. + +Signed-off-by: Eric Blake +Message-ID: <20250509204341.3553601-25-eblake@redhat.com> +Reviewed-by: Sunny Zhu +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit 181a63667adf16c35b57e446def3e41c70f1fea6) +Jira: https://issues.redhat.com/browse/RHEL-82906 +Jira: https://issues.redhat.com/browse/RHEL-83015 +Signed-off-by: Eric Blake +--- + block/mirror.c | 24 ++++++++++++++++-------- + tests/qemu-iotests/194 | 6 ++++-- + tests/qemu-iotests/194.out | 4 ++-- + tests/qemu-iotests/iotests.py | 12 +++++++++++- + 4 files changed, 33 insertions(+), 13 deletions(-) + +diff --git a/block/mirror.c b/block/mirror.c +index b35d12adaa..29cac1777c 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -848,23 +848,31 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s) + target_bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP && + bdrv_can_write_zeroes_with_unmap(target_bs); + ++ /* Determine if the image is already zero, regardless of sync mode. 
*/ + bdrv_graph_co_rdlock(); + bs = s->mirror_top_bs->backing->bs; ++ if (s->target_is_zero) { ++ ret = 1; ++ } else { ++ ret = bdrv_co_is_all_zeroes(target_bs); ++ } + bdrv_graph_co_rdunlock(); + +- if (s->sync_mode == MIRROR_SYNC_MODE_TOP) { ++ /* Determine if a pre-zeroing pass is necessary. */ ++ if (ret < 0) { ++ return ret; ++ } else if (s->sync_mode == MIRROR_SYNC_MODE_TOP) { + /* In TOP mode, there is no benefit to a pre-zeroing pass. */ +- } else if (!s->target_is_zero || punch_holes) { ++ } else if (ret == 0 || punch_holes) { + /* + * Here, we are in FULL mode; our goal is to avoid writing + * zeroes if the destination already reads as zero, except + * when we are trying to punch holes. This is possible if +- * zeroing happened externally (s->target_is_zero) or if we +- * have a fast way to pre-zero the image (the dirty bitmap +- * will be populated later by the non-zero portions, the same +- * as for TOP mode). If pre-zeroing is not fast, or we need +- * to punch holes, then our only recourse is to write the +- * entire image. ++ * zeroing happened externally (ret > 0) or if we have a fast ++ * way to pre-zero the image (the dirty bitmap will be ++ * populated later by the non-zero portions, the same as for ++ * TOP mode). If pre-zeroing is not fast, or we need to punch ++ * holes, then our only recourse is to write the entire image. 
+ */ + if (!bdrv_can_write_zeroes_with_unmap(target_bs)) { + bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, s->bdev_length); +diff --git a/tests/qemu-iotests/194 b/tests/qemu-iotests/194 +index d0b9c084f5..e114c0b269 100755 +--- a/tests/qemu-iotests/194 ++++ b/tests/qemu-iotests/194 +@@ -62,7 +62,8 @@ with iotests.FilePath('source.img') as source_img_path, \ + + iotests.log('Waiting for `drive-mirror` to complete...') + iotests.log(source_vm.event_wait('BLOCK_JOB_READY'), +- filters=[iotests.filter_qmp_event]) ++ filters=[iotests.filter_qmp_event, ++ iotests.filter_block_job]) + + iotests.log('Starting migration...') + capabilities = [{'capability': 'events', 'state': True}, +@@ -88,7 +89,8 @@ with iotests.FilePath('source.img') as source_img_path, \ + + while True: + event2 = source_vm.event_wait('BLOCK_JOB_COMPLETED') +- iotests.log(event2, filters=[iotests.filter_qmp_event]) ++ iotests.log(event2, filters=[iotests.filter_qmp_event, ++ iotests.filter_block_job]) + if event2['event'] == 'BLOCK_JOB_COMPLETED': + iotests.log('Stopping the NBD server on destination...') + iotests.log(dest_vm.qmp('nbd-server-stop')) +diff --git a/tests/qemu-iotests/194.out b/tests/qemu-iotests/194.out +index 376ed1d2e6..84e0fc34be 100644 +--- a/tests/qemu-iotests/194.out ++++ b/tests/qemu-iotests/194.out +@@ -7,7 +7,7 @@ Launching NBD server on destination... + Starting `drive-mirror` on source... + {"return": {}} + Waiting for `drive-mirror` to complete... +-{"data": {"device": "mirror-job0", "len": 1073741824, "offset": 1073741824, "speed": 0, "type": "mirror"}, "event": "BLOCK_JOB_READY", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"device": "mirror-job0", "len": "LEN", "offset": "OFFSET", "speed": 0, "type": "mirror"}, "event": "BLOCK_JOB_READY", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + Starting migration... + {"return": {}} + {"execute": "migrate-start-postcopy", "arguments": {}} +@@ -17,7 +17,7 @@ Starting migration... 
+ {"data": {"status": "completed"}, "event": "MIGRATION", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + Gracefully ending the `drive-mirror` job on source... + {"return": {}} +-{"data": {"device": "mirror-job0", "len": 1073741824, "offset": 1073741824, "speed": 0, "type": "mirror"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} ++{"data": {"device": "mirror-job0", "len": "LEN", "offset": "OFFSET", "speed": 0, "type": "mirror"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}} + Stopping the NBD server on destination... + {"return": {}} + Wait for migration completion on target... +diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py +index c8cb028c2d..978bef1499 100644 +--- a/tests/qemu-iotests/iotests.py ++++ b/tests/qemu-iotests/iotests.py +@@ -601,13 +601,23 @@ def filter_chown(msg): + return chown_re.sub("chown UID:GID", msg) + + def filter_qmp_event(event): +- '''Filter a QMP event dict''' ++ '''Filter the timestamp of a QMP event dict''' + event = dict(event) + if 'timestamp' in event: + event['timestamp']['seconds'] = 'SECS' + event['timestamp']['microseconds'] = 'USECS' + return event + ++def filter_block_job(event): ++ '''Filter the offset and length of a QMP block job event dict''' ++ event = dict(event) ++ if 'data' in event: ++ if 'offset' in event['data']: ++ event['data']['offset'] = 'OFFSET' ++ if 'len' in event['data']: ++ event['data']['len'] = 'LEN' ++ return event ++ + def filter_qmp(qmsg, filter_fn): + '''Given a string filter, filter a QMP object's values. 
+ filter_fn takes a (key, value) pair.''' +-- +2.48.1 + diff --git a/SOURCES/kvm-mirror-Skip-writing-zeroes-when-target-is-already-ze.patch b/SOURCES/kvm-mirror-Skip-writing-zeroes-when-target-is-already-ze.patch new file mode 100644 index 0000000..4da52ce --- /dev/null +++ b/SOURCES/kvm-mirror-Skip-writing-zeroes-when-target-is-already-ze.patch @@ -0,0 +1,355 @@ +From 8a2e660ff3ec7f7506fbd4197d4dc8f53db7859a Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Fri, 9 May 2025 15:40:28 -0500 +Subject: [PATCH 13/16] mirror: Skip writing zeroes when target is already zero + +RH-Author: Eric Blake +RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors +RH-Jira: RHEL-82906 RHEL-83015 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Jon Maloy +RH-Commit: [11/14] f6bb5e0cecee07af0389aa18c3bddb47d6c5cf54 (ebblake/centos-qemu-kvm) + +When mirroring, the goal is to ensure that the destination reads the +same as the source; this goal is met whether the destination is sparse +or fully-allocated (except when explicitly punching holes, then merely +reading zero is not enough to know if it is sparse, so we still want +to punch the hole). Avoiding a redundant write to zero (whether in +the background because the zero cluster was marked in the dirty +bitmap, or in the foreground because the guest is writing zeroes) when +the destination already reads as zero makes mirroring faster, and +avoids allocating the destination merely because the source reports as +allocated. + +The effect is especially pronounced when the source is a raw file. +That's because when the source is a qcow2 file, the dirty bitmap only +visits the portions of the source that are allocated, which tend to be +non-zero. 
But when the source is a raw file, +bdrv_co_is_allocated_above() reports the entire file as allocated so +mirror_dirty_init sets the entire dirty bitmap, and it is only later +during mirror_iteration that we change to consulting the more precise +bdrv_co_block_status_above() to learn where the source reads as zero. + +Remember that since a mirror operation can write a cluster more than +once (every time the guest changes the source, the destination is also +changed to keep up), and the guest can change whether a given cluster +reads as zero, is discarded, or has non-zero data over the course of +the mirror operation, we can't take the shortcut of relying on +s->target_is_zero (which is static for the life of the job) in +mirror_co_zero() to see if the destination is already zero, because +that information may be stale. Any solution we use must be dynamic in +the face of the guest writing or discarding a cluster while the mirror +has been ongoing. + +We could just teach mirror_co_zero() to do a block_status() probe of +the destination, and skip the zeroes if the destination already reads +as zero, but we know from past experience that extra block_status() +calls are not always cheap (tmpfs, anyone?), especially when they are +random access rather than linear. Use of block_status() of the source +by the background task in a linear fashion is not our bottleneck (it's +a background task, after all); but since mirroring can be done while +the source is actively being changed, we don't want a slow +block_status() of the destination to occur on the hot path of the +guest trying to do random-access writes to the source. + +So this patch takes a slightly different approach: any time we have to +track dirty clusters, we can also track which clusters are known to +read as zero. 
For sync=TOP or when we are punching holes from +"detect-zeroes":"unmap", the zero bitmap starts out empty, but +prevents a second write zero to a cluster that was already zero by an +earlier pass; for sync=FULL when we are not punching holes, the zero +bitmap starts out full if the destination reads as zero during +initialization. Either way, I/O to the destination can now avoid +redundant write zero to a cluster that already reads as zero, all +without having to do a block_status() per write on the destination. + +With this patch, if I create a raw sparse destination file, connect it +with QMP 'blockdev-add' while leaving it at the default "discard": +"ignore", then run QMP 'blockdev-mirror' with "sync": "full", the +destination remains sparse rather than fully allocated. Meanwhile, a +destination image that is already fully allocated remains so unless it +was opened with "detect-zeroes": "unmap". And any time writing zeroes +is skipped, the job counters are not incremented. + +Signed-off-by: Eric Blake +Message-ID: <20250509204341.3553601-26-eblake@redhat.com> +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit 7e277545b90874171128804e256a538fb0e8dd7e) +Jira: https://issues.redhat.com/browse/RHEL-82906 +Jira: https://issues.redhat.com/browse/RHEL-83015 +Signed-off-by: Eric Blake +--- + block/mirror.c | 107 ++++++++++++++++++++++++++++++++++++++++++------- + 1 file changed, 93 insertions(+), 14 deletions(-) + +diff --git a/block/mirror.c b/block/mirror.c +index 29cac1777c..7f3b5477ce 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -73,6 +73,7 @@ typedef struct MirrorBlockJob { + size_t buf_size; + int64_t bdev_length; + unsigned long *cow_bitmap; ++ unsigned long *zero_bitmap; + BdrvDirtyBitmap *dirty_bitmap; + BdrvDirtyBitmapIter *dbi; + uint8_t *buf; +@@ -108,9 +109,12 @@ struct MirrorOp { + int64_t offset; + uint64_t bytes; + +- /* The pointee is set by mirror_co_read(), mirror_co_zero(), and +- * mirror_co_discard() before yielding for the first 
time */ ++ /* ++ * These pointers are set by mirror_co_read(), mirror_co_zero(), and ++ * mirror_co_discard() before yielding for the first time ++ */ + int64_t *bytes_handled; ++ bool *io_skipped; + + bool is_pseudo_op; + bool is_active_write; +@@ -408,15 +412,34 @@ static void coroutine_fn mirror_co_read(void *opaque) + static void coroutine_fn mirror_co_zero(void *opaque) + { + MirrorOp *op = opaque; +- int ret; ++ bool write_needed = true; ++ int ret = 0; + + op->s->in_flight++; + op->s->bytes_in_flight += op->bytes; + *op->bytes_handled = op->bytes; + op->is_in_flight = true; + +- ret = blk_co_pwrite_zeroes(op->s->target, op->offset, op->bytes, +- op->s->unmap ? BDRV_REQ_MAY_UNMAP : 0); ++ if (op->s->zero_bitmap) { ++ unsigned long end = DIV_ROUND_UP(op->offset + op->bytes, ++ op->s->granularity); ++ assert(QEMU_IS_ALIGNED(op->offset, op->s->granularity)); ++ assert(QEMU_IS_ALIGNED(op->bytes, op->s->granularity) || ++ op->offset + op->bytes == op->s->bdev_length); ++ if (find_next_zero_bit(op->s->zero_bitmap, end, ++ op->offset / op->s->granularity) == end) { ++ write_needed = false; ++ *op->io_skipped = true; ++ } ++ } ++ if (write_needed) { ++ ret = blk_co_pwrite_zeroes(op->s->target, op->offset, op->bytes, ++ op->s->unmap ? 
BDRV_REQ_MAY_UNMAP : 0); ++ } ++ if (ret >= 0 && op->s->zero_bitmap) { ++ bitmap_set(op->s->zero_bitmap, op->offset / op->s->granularity, ++ DIV_ROUND_UP(op->bytes, op->s->granularity)); ++ } + mirror_write_complete(op, ret); + } + +@@ -435,29 +458,43 @@ static void coroutine_fn mirror_co_discard(void *opaque) + } + + static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset, +- unsigned bytes, MirrorMethod mirror_method) ++ unsigned bytes, MirrorMethod mirror_method, ++ bool *io_skipped) + { + MirrorOp *op; + Coroutine *co; + int64_t bytes_handled = -1; + ++ assert(QEMU_IS_ALIGNED(offset, s->granularity)); ++ assert(QEMU_IS_ALIGNED(bytes, s->granularity) || ++ offset + bytes == s->bdev_length); + op = g_new(MirrorOp, 1); + *op = (MirrorOp){ + .s = s, + .offset = offset, + .bytes = bytes, + .bytes_handled = &bytes_handled, ++ .io_skipped = io_skipped, + }; + qemu_co_queue_init(&op->waiting_requests); + + switch (mirror_method) { + case MIRROR_METHOD_COPY: ++ if (s->zero_bitmap) { ++ bitmap_clear(s->zero_bitmap, offset / s->granularity, ++ DIV_ROUND_UP(bytes, s->granularity)); ++ } + co = qemu_coroutine_create(mirror_co_read, op); + break; + case MIRROR_METHOD_ZERO: ++ /* s->zero_bitmap handled in mirror_co_zero */ + co = qemu_coroutine_create(mirror_co_zero, op); + break; + case MIRROR_METHOD_DISCARD: ++ if (s->zero_bitmap) { ++ bitmap_clear(s->zero_bitmap, offset / s->granularity, ++ DIV_ROUND_UP(bytes, s->granularity)); ++ } + co = qemu_coroutine_create(mirror_co_discard, op); + break; + default: +@@ -568,6 +605,7 @@ static void coroutine_fn GRAPH_UNLOCKED mirror_iteration(MirrorBlockJob *s) + int ret; + int64_t io_bytes; + int64_t io_bytes_acct; ++ bool io_skipped = false; + MirrorMethod mirror_method = MIRROR_METHOD_COPY; + + assert(!(offset % s->granularity)); +@@ -611,8 +649,10 @@ static void coroutine_fn GRAPH_UNLOCKED mirror_iteration(MirrorBlockJob *s) + } + + io_bytes = mirror_clip_bytes(s, offset, io_bytes); +- io_bytes = mirror_perform(s, offset, 
io_bytes, mirror_method); +- if (mirror_method != MIRROR_METHOD_COPY && write_zeroes_ok) { ++ io_bytes = mirror_perform(s, offset, io_bytes, mirror_method, ++ &io_skipped); ++ if (io_skipped || ++ (mirror_method != MIRROR_METHOD_COPY && write_zeroes_ok)) { + io_bytes_acct = 0; + } else { + io_bytes_acct = io_bytes; +@@ -847,8 +887,10 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s) + bool punch_holes = + target_bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP && + bdrv_can_write_zeroes_with_unmap(target_bs); ++ int64_t bitmap_length = DIV_ROUND_UP(s->bdev_length, s->granularity); + + /* Determine if the image is already zero, regardless of sync mode. */ ++ s->zero_bitmap = bitmap_new(bitmap_length); + bdrv_graph_co_rdlock(); + bs = s->mirror_top_bs->backing->bs; + if (s->target_is_zero) { +@@ -862,7 +904,14 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s) + if (ret < 0) { + return ret; + } else if (s->sync_mode == MIRROR_SYNC_MODE_TOP) { +- /* In TOP mode, there is no benefit to a pre-zeroing pass. */ ++ /* ++ * In TOP mode, there is no benefit to a pre-zeroing pass, but ++ * the zero bitmap can be set if the destination already reads ++ * as zero and we are not punching holes. ++ */ ++ if (ret > 0 && !punch_holes) { ++ bitmap_set(s->zero_bitmap, 0, bitmap_length); ++ } + } else if (ret == 0 || punch_holes) { + /* + * Here, we are in FULL mode; our goal is to avoid writing +@@ -871,8 +920,9 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s) + * zeroing happened externally (ret > 0) or if we have a fast + * way to pre-zero the image (the dirty bitmap will be + * populated later by the non-zero portions, the same as for +- * TOP mode). If pre-zeroing is not fast, or we need to punch +- * holes, then our only recourse is to write the entire image. ++ * TOP mode). If pre-zeroing is not fast, then our only ++ * recourse is to mark the entire image dirty. 
The act of ++ * pre-zeroing will populate the zero bitmap. + */ + if (!bdrv_can_write_zeroes_with_unmap(target_bs)) { + bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, s->bdev_length); +@@ -883,6 +933,7 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s) + for (offset = 0; offset < s->bdev_length; ) { + int bytes = MIN(s->bdev_length - offset, + QEMU_ALIGN_DOWN(INT_MAX, s->granularity)); ++ bool ignored; + + mirror_throttle(s); + +@@ -898,12 +949,15 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s) + continue; + } + +- mirror_perform(s, offset, bytes, MIRROR_METHOD_ZERO); ++ mirror_perform(s, offset, bytes, MIRROR_METHOD_ZERO, &ignored); + offset += bytes; + } + + mirror_wait_for_all_io(s); + s->initial_zeroing_ongoing = false; ++ } else { ++ /* In FULL mode, and image already reads as zero. */ ++ bitmap_set(s->zero_bitmap, 0, bitmap_length); + } + + /* First part, loop on the sectors and initialize the dirty bitmap. */ +@@ -1188,6 +1242,7 @@ immediate_exit: + assert(s->in_flight == 0); + qemu_vfree(s->buf); + g_free(s->cow_bitmap); ++ g_free(s->zero_bitmap); + g_free(s->in_flight_bitmap); + bdrv_dirty_iter_free(s->dbi); + +@@ -1367,6 +1422,7 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method, + int ret; + size_t qiov_offset = 0; + int64_t dirty_bitmap_offset, dirty_bitmap_end; ++ int64_t zero_bitmap_offset, zero_bitmap_end; + + if (!QEMU_IS_ALIGNED(offset, job->granularity) && + bdrv_dirty_bitmap_get(job->dirty_bitmap, offset)) +@@ -1410,8 +1466,9 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method, + } + + /* +- * Tails are either clean or shrunk, so for bitmap resetting +- * we safely align the range down. ++ * Tails are either clean or shrunk, so for dirty bitmap resetting ++ * we safely align the range narrower. But for zero bitmap, round ++ * range wider for checking or clearing, and narrower for setting. 
+ */ + dirty_bitmap_offset = QEMU_ALIGN_UP(offset, job->granularity); + dirty_bitmap_end = QEMU_ALIGN_DOWN(offset + bytes, job->granularity); +@@ -1419,22 +1476,44 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method, + bdrv_reset_dirty_bitmap(job->dirty_bitmap, dirty_bitmap_offset, + dirty_bitmap_end - dirty_bitmap_offset); + } ++ zero_bitmap_offset = offset / job->granularity; ++ zero_bitmap_end = DIV_ROUND_UP(offset + bytes, job->granularity); + + job_progress_increase_remaining(&job->common.job, bytes); + job->active_write_bytes_in_flight += bytes; + + switch (method) { + case MIRROR_METHOD_COPY: ++ if (job->zero_bitmap) { ++ bitmap_clear(job->zero_bitmap, zero_bitmap_offset, ++ zero_bitmap_end - zero_bitmap_offset); ++ } + ret = blk_co_pwritev_part(job->target, offset, bytes, + qiov, qiov_offset, flags); + break; + + case MIRROR_METHOD_ZERO: ++ if (job->zero_bitmap) { ++ if (find_next_zero_bit(job->zero_bitmap, zero_bitmap_end, ++ zero_bitmap_offset) == zero_bitmap_end) { ++ ret = 0; ++ break; ++ } ++ } + assert(!qiov); + ret = blk_co_pwrite_zeroes(job->target, offset, bytes, flags); ++ if (job->zero_bitmap && ret >= 0) { ++ bitmap_set(job->zero_bitmap, dirty_bitmap_offset / job->granularity, ++ (dirty_bitmap_end - dirty_bitmap_offset) / ++ job->granularity); ++ } + break; + + case MIRROR_METHOD_DISCARD: ++ if (job->zero_bitmap) { ++ bitmap_clear(job->zero_bitmap, zero_bitmap_offset, ++ zero_bitmap_end - zero_bitmap_offset); ++ } + assert(!qiov); + ret = blk_co_pdiscard(job->target, offset, bytes); + break; +-- +2.48.1 + diff --git a/SOURCES/kvm-net-socket-skip-automatic-zero-init-of-large-array.patch b/SOURCES/kvm-net-socket-skip-automatic-zero-init-of-large-array.patch new file mode 100644 index 0000000..f5361cf --- /dev/null +++ b/SOURCES/kvm-net-socket-skip-automatic-zero-init-of-large-array.patch @@ -0,0 +1,49 @@ +From 4b9a1a9154467fd65ac2a0a26959d3342d8fcd49 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 
10 Jun 2025 13:37:08 +0100 +Subject: [PATCH 55/57] net/socket: skip automatic zero-init of large array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [29/30] 645ad4d138d1222ea9bd1b2ac3b84d9ff83e2fa2 (stefanha/centos-stream-qemu-kvm) + +The 'net_socket_send' method has a 68k byte array used for copying +data between guest and host. Skip the automatic zero-init of this +array to eliminate the performance overhead in the I/O hot path. + +The 'buf1' array will be fully initialized when reading data off +the network socket. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Reviewed-by: Klaus Jensen +Reviewed-by: Harsh Prateek Bora +Message-id: 20250610123709.835102-31-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 751b0e79f1e0e7f88fad2fe2f22595ad03d78859) +Signed-off-by: Stefan Hajnoczi +--- + net/socket.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/socket.c b/net/socket.c +index 8e3702e1f3..784dda686f 100644 +--- a/net/socket.c ++++ b/net/socket.c +@@ -157,7 +157,7 @@ static void net_socket_send(void *opaque) + NetSocketState *s = opaque; + int size; + int ret; +- uint8_t buf1[NET_BUFSIZE]; ++ QEMU_UNINITIALIZED uint8_t buf1[NET_BUFSIZE]; + const uint8_t *buf; + + size = recv(s->fd, buf1, sizeof(buf1), 0); +-- +2.39.3 + diff --git a/SOURCES/kvm-net-stream-skip-automatic-zero-init-of-large-array.patch b/SOURCES/kvm-net-stream-skip-automatic-zero-init-of-large-array.patch new file mode 100644 index 0000000..e9abf3f --- /dev/null +++ b/SOURCES/kvm-net-stream-skip-automatic-zero-init-of-large-array.patch @@ -0,0 +1,49 @@ +From 94310a4168257297e52058d5d6aea4a2d06630c6 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Tue, 10 
Jun 2025 13:37:09 +0100 +Subject: [PATCH 56/57] net/stream: skip automatic zero-init of large array +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED +RH-Jira: RHEL-99888 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [30/30] 9dfec5c0e6358e3557bf58d66eee8e4ba6e93621 (stefanha/centos-stream-qemu-kvm) + +The 'net_stream_send' method has a 68k byte array used for copying +data between guest and host. Skip the automatic zero-init of this +array to eliminate the performance overhead in the I/O hot path. + +The 'buf1' array will be fully initialized when reading data off +the network socket. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Stefan Hajnoczi +Reviewed-by: Klaus Jensen +Reviewed-by: Harsh Prateek Bora +Message-id: 20250610123709.835102-32-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 837b87c4c5ba9ac7a255133c6642b8d578272a70) +Signed-off-by: Stefan Hajnoczi +--- + net/stream.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/stream.c b/net/stream.c +index 97e6ec6679..12384ffee5 100644 +--- a/net/stream.c ++++ b/net/stream.c +@@ -148,7 +148,7 @@ static gboolean net_stream_send(QIOChannel *ioc, + NetStreamState *s = data; + int size; + int ret; +- char buf1[NET_BUFSIZE]; ++ QEMU_UNINITIALIZED char buf1[NET_BUFSIZE]; + const char *buf; + + size = qio_channel_read(s->ioc, buf1, sizeof(buf1), NULL); +-- +2.39.3 + diff --git a/SOURCES/kvm-net-vhost-user-add-QAPI-events-to-report-connection-.patch b/SOURCES/kvm-net-vhost-user-add-QAPI-events-to-report-connection-.patch new file mode 100644 index 0000000..a792e7c --- /dev/null +++ b/SOURCES/kvm-net-vhost-user-add-QAPI-events-to-report-connection-.patch @@ -0,0 +1,133 @@ +From b6de1e19ba778547e92997c6cad77d7cf755c78b Mon Sep 17 00:00:00 2001 +From: Laurent Vivier +Date: Mon, 17 Feb 2025 
10:25:50 +0100 +Subject: [PATCH 1/3] net: vhost-user: add QAPI events to report connection + state +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Laurent Vivier +RH-MergeRequest: 371: net: vhost-user: add QAPI events to report connection state +RH-Jira: RHEL-95120 +RH-Acked-by: Eugenio Pérez +RH-Acked-by: Cindy Lu +RH-Commit: [1/1] c8f65026e3548891fe713a1622438388e285dbf3 (lvivier/qemu-kvm-centos) + +The netdev reports NETDEV_VHOST_USER_CONNECTED event when +the chardev is connected, and NETDEV_VHOST_USER_DISCONNECTED +when it is disconnected. + +The NETDEV_VHOST_USER_CONNECTED event includes the chardev id. + +This allows a system manager like libvirt to detect when the server +fails. + +For instance with passt: + +{ 'execute': 'qmp_capabilities' } +{ "return": { } } + +[killing passt here] + +{ "timestamp": { "seconds": 1739538634, "microseconds": 920450 }, + "event": "NETDEV_VHOST_USER_DISCONNECTED", + "data": { "netdev-id": "netdev0" } } + +[automatic reconnection with reconnect-ms] + +{ "timestamp": { "seconds": 1739538638, "microseconds": 354181 }, + "event": "NETDEV_VHOST_USER_CONNECTED", + "data": { "netdev-id": "netdev0", "chardev-id": "chr0" } } + +Tested-by: Stefano Brivio +Signed-off-by: Laurent Vivier +Message-Id: <20250217092550.1172055-1-lvivier@redhat.com> +Acked-by: Markus Armbruster +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. 
Tsirkin +(cherry picked from commit 02fd9f8aeeb184276b283ae2f404bc3acf1e7b7a) +--- + net/vhost-user.c | 3 +++ + qapi/net.json | 40 ++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 43 insertions(+) + +diff --git a/net/vhost-user.c b/net/vhost-user.c +index 12555518e8..0b235e50c6 100644 +--- a/net/vhost-user.c ++++ b/net/vhost-user.c +@@ -16,6 +16,7 @@ + #include "chardev/char-fe.h" + #include "qapi/error.h" + #include "qapi/qapi-commands-net.h" ++#include "qapi/qapi-events-net.h" + #include "qemu/config-file.h" + #include "qemu/error-report.h" + #include "qemu/option.h" +@@ -271,6 +272,7 @@ static void chr_closed_bh(void *opaque) + if (err) { + error_report_err(err); + } ++ qapi_event_send_netdev_vhost_user_disconnected(name); + } + + static void net_vhost_user_event(void *opaque, QEMUChrEvent event) +@@ -300,6 +302,7 @@ static void net_vhost_user_event(void *opaque, QEMUChrEvent event) + net_vhost_user_watch, s); + qmp_set_link(name, true, &err); + s->started = true; ++ qapi_event_send_netdev_vhost_user_connected(name, chr->label); + break; + case CHR_EVENT_CLOSED: + /* a close event may happen during a read/write, but vhost +diff --git a/qapi/net.json b/qapi/net.json +index 87fc0d0b28..7bd1eaa1ba 100644 +--- a/qapi/net.json ++++ b/qapi/net.json +@@ -1020,3 +1020,43 @@ + ## + { 'event': 'NETDEV_STREAM_DISCONNECTED', + 'data': { 'netdev-id': 'str' } } ++ ++## ++# @NETDEV_VHOST_USER_CONNECTED: ++# ++# Emitted when the vhost-user chardev is connected ++# ++# @netdev-id: QEMU netdev id that is connected ++# ++# @chardev-id: The character device id used by the QEMU netdev ++# ++# Since: 10.0 ++# ++# .. 
qmp-example:: ++# ++# <- { "timestamp": {"seconds": 1739538638, "microseconds": 354181 }, ++# "event": "NETDEV_VHOST_USER_CONNECTED", ++# "data": { "netdev-id": "netdev0", "chardev-id": "chr0" } } ++# ++## ++{ 'event': 'NETDEV_VHOST_USER_CONNECTED', ++ 'data': { 'netdev-id': 'str', 'chardev-id': 'str' } } ++ ++## ++# @NETDEV_VHOST_USER_DISCONNECTED: ++# ++# Emitted when the vhost-user chardev is disconnected ++# ++# @netdev-id: QEMU netdev id that is disconnected ++# ++# Since: 10.0 ++# ++# .. qmp-example:: ++# ++# <- { "timestamp": { "seconds": 1739538634, "microseconds": 920450 }, ++# "event": "NETDEV_VHOST_USER_DISCONNECTED", ++# "data": { "netdev-id": "netdev0" } } ++# ++## ++{ 'event': 'NETDEV_VHOST_USER_DISCONNECTED', ++ 'data': { 'netdev-id': 'str' } } +-- +2.48.1 + diff --git a/SOURCES/kvm-pci-Use-PCI-PM-capability-initializer.patch b/SOURCES/kvm-pci-Use-PCI-PM-capability-initializer.patch new file mode 100644 index 0000000..e2470de --- /dev/null +++ b/SOURCES/kvm-pci-Use-PCI-PM-capability-initializer.patch @@ -0,0 +1,153 @@ +From 978951b390bb7073293c792c4714516ad40cba73 Mon Sep 17 00:00:00 2001 +From: Alex Williamson +Date: Tue, 25 Feb 2025 14:52:26 -0700 +Subject: [PATCH 3/7] pci: Use PCI PM capability initializer +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eric Auger +RH-MergeRequest: 348: PCI: Implement basic PCI PM capability backing +RH-Jira: RHEL-7301 +RH-Acked-by: Cédric Le Goater +RH-Acked-by: Alex Williamson +RH-Acked-by: Jon Maloy +RH-Commit: [3/6] fd862caa094490a9b8a04b00ad39ba58e0b46a7a (eauger1/centos-qemu-kvm) + +Switch callers directly initializing the PCI PM capability with +pci_add_capability() to use pci_pm_init(). + +Cc: Dmitry Fleytman +Cc: Akihiko Odaki +Cc: Jason Wang +Cc: Stefan Weil +Cc: Sriram Yagnaraman +Cc: Keith Busch +Cc: Klaus Jensen +Cc: Jesper Devantier +Cc: Michael S. 
Tsirkin +Cc: Marcel Apfelbaum +Cc: Cédric Le Goater +Signed-off-by: Alex Williamson +Reviewed-by: Eric Auger +Reviewed-by: Akihiko Odaki +Reviewed-by: Michael S. Tsirkin +Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-3-alex.williamson@redhat.com +Signed-off-by: Cédric Le Goater +(cherry picked from commit 0681ec253141d838210b3c5e6bc0d2d71f2e111e) +Signed-off-by: Eric Auger +--- + hw/net/e1000e.c | 3 +-- + hw/net/eepro100.c | 4 +--- + hw/net/igb.c | 3 +-- + hw/nvme/ctrl.c | 3 +-- + hw/pci-bridge/pcie_pci_bridge.c | 2 +- + hw/vfio/pci.c | 7 ++++++- + hw/virtio/virtio-pci.c | 3 +-- + 7 files changed, 12 insertions(+), 13 deletions(-) + +diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c +index 843892ce09..9eb93d049d 100644 +--- a/hw/net/e1000e.c ++++ b/hw/net/e1000e.c +@@ -372,8 +372,7 @@ static int + e1000e_add_pm_capability(PCIDevice *pdev, uint8_t offset, uint16_t pmc) + { + Error *local_err = NULL; +- int ret = pci_add_capability(pdev, PCI_CAP_ID_PM, offset, +- PCI_PM_SIZEOF, &local_err); ++ int ret = pci_pm_init(pdev, offset, &local_err); + + if (local_err) { + error_report_err(local_err); +diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c +index d9a70c4544..668a410055 100644 +--- a/hw/net/eepro100.c ++++ b/hw/net/eepro100.c +@@ -549,9 +549,7 @@ static void e100_pci_reset(EEPRO100State *s, Error **errp) + if (info->power_management) { + /* Power Management Capabilities */ + int cfg_offset = 0xdc; +- int r = pci_add_capability(&s->dev, PCI_CAP_ID_PM, +- cfg_offset, PCI_PM_SIZEOF, +- errp); ++ int r = pci_pm_init(&s->dev, cfg_offset, errp); + if (r < 0) { + return; + } +diff --git a/hw/net/igb.c b/hw/net/igb.c +index b92bba402e..a3c22e2391 100644 +--- a/hw/net/igb.c ++++ b/hw/net/igb.c +@@ -356,8 +356,7 @@ static int + igb_add_pm_capability(PCIDevice *pdev, uint8_t offset, uint16_t pmc) + { + Error *local_err = NULL; +- int ret = pci_add_capability(pdev, PCI_CAP_ID_PM, offset, +- PCI_PM_SIZEOF, &local_err); ++ int ret = pci_pm_init(pdev, offset, 
&local_err); + + if (local_err) { + error_report_err(local_err); +diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c +index 9f277b81d8..d451ee0d00 100644 +--- a/hw/nvme/ctrl.c ++++ b/hw/nvme/ctrl.c +@@ -8293,8 +8293,7 @@ static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset) + Error *err = NULL; + int ret; + +- ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset, +- PCI_PM_SIZEOF, &err); ++ ret = pci_pm_init(pci_dev, offset, &err); + if (err) { + error_report_err(err); + return ret; +diff --git a/hw/pci-bridge/pcie_pci_bridge.c b/hw/pci-bridge/pcie_pci_bridge.c +index 7646ac2397..2f098e3a13 100644 +--- a/hw/pci-bridge/pcie_pci_bridge.c ++++ b/hw/pci-bridge/pcie_pci_bridge.c +@@ -52,7 +52,7 @@ static void pcie_pci_bridge_realize(PCIDevice *d, Error **errp) + goto cap_error; + } + +- pos = pci_add_capability(d, PCI_CAP_ID_PM, 0, PCI_PM_SIZEOF, errp); ++ pos = pci_pm_init(d, 0, errp); + if (pos < 0) { + goto pm_error; + } +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 82a47edc89..e18b57d864 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2220,7 +2220,12 @@ static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp) + case PCI_CAP_ID_PM: + vfio_check_pm_reset(vdev, pos); + vdev->pm_cap = pos; +- ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0; ++ ret = pci_pm_init(pdev, pos, errp) >= 0; ++ /* ++ * PCI-core config space emulation needs write access to the power ++ * state enabled for tracking BAR mapping relative to PM state. 
++ */ ++ pci_set_word(pdev->wmask + pos + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK); + break; + case PCI_CAP_ID_AF: + vfio_check_af_flr(vdev, pos); +diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c +index 524b63e5c7..4b2aeaad8d 100644 +--- a/hw/virtio/virtio-pci.c ++++ b/hw/virtio/virtio-pci.c +@@ -2195,8 +2195,7 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) + pos = pcie_endpoint_cap_init(pci_dev, 0); + assert(pos > 0); + +- pos = pci_add_capability(pci_dev, PCI_CAP_ID_PM, 0, +- PCI_PM_SIZEOF, errp); ++ pos = pci_pm_init(pci_dev, 0, errp); + if (pos < 0) { + return; + } +-- +2.48.1 + diff --git a/SOURCES/kvm-pcie-virtio-Remove-redundant-pm_cap.patch b/SOURCES/kvm-pcie-virtio-Remove-redundant-pm_cap.patch new file mode 100644 index 0000000..15d82a2 --- /dev/null +++ b/SOURCES/kvm-pcie-virtio-Remove-redundant-pm_cap.patch @@ -0,0 +1,99 @@ +From 274e81bcf091c981d1e27e49fbe98e63d5308472 Mon Sep 17 00:00:00 2001 +From: Alex Williamson +Date: Tue, 25 Feb 2025 14:52:28 -0700 +Subject: [PATCH 5/7] pcie, virtio: Remove redundant pm_cap +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eric Auger +RH-MergeRequest: 348: PCI: Implement basic PCI PM capability backing +RH-Jira: RHEL-7301 +RH-Acked-by: Cédric Le Goater +RH-Acked-by: Alex Williamson +RH-Acked-by: Jon Maloy +RH-Commit: [5/6] 81c6e3c9c52a0b3f0b9269b4ac7f56e8e4b5d68b (eauger1/centos-qemu-kvm) + +The pm_cap on the PCIExpressDevice object can be distilled down +to the new instance on the PCIDevice object. + +Cc: Michael S. Tsirkin +Cc: Marcel Apfelbaum +Reviewed-by: Michael S. 
Tsirkin +Reviewed-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Signed-off-by: Alex Williamson +Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-5-alex.williamson@redhat.com +Signed-off-by: Cédric Le Goater +(cherry picked from commit 8b8d08cf293b930d0f55b2d5385d8dd27e0c6b41) +Signed-off-by: Eric Auger +--- + hw/pci-bridge/pcie_pci_bridge.c | 1 - + hw/virtio/virtio-pci.c | 8 +++----- + include/hw/pci/pcie.h | 2 -- + 3 files changed, 3 insertions(+), 8 deletions(-) + +diff --git a/hw/pci-bridge/pcie_pci_bridge.c b/hw/pci-bridge/pcie_pci_bridge.c +index 2f098e3a13..c0ba6d7928 100644 +--- a/hw/pci-bridge/pcie_pci_bridge.c ++++ b/hw/pci-bridge/pcie_pci_bridge.c +@@ -56,7 +56,6 @@ static void pcie_pci_bridge_realize(PCIDevice *d, Error **errp) + if (pos < 0) { + goto pm_error; + } +- d->exp.pm_cap = pos; + pci_set_word(d->config + pos + PCI_PM_PMC, 0x3); + + pcie_cap_arifwd_init(d); +diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c +index 4b2aeaad8d..a85787b837 100644 +--- a/hw/virtio/virtio-pci.c ++++ b/hw/virtio/virtio-pci.c +@@ -2200,8 +2200,6 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) + return; + } + +- pci_dev->exp.pm_cap = pos; +- + /* + * Indicates that this function complies with revision 1.2 of the + * PCI Power Management Interface Specification. 
+@@ -2295,11 +2293,11 @@ static bool virtio_pci_no_soft_reset(PCIDevice *dev) + { + uint16_t pmcsr; + +- if (!pci_is_express(dev) || !dev->exp.pm_cap) { ++ if (!pci_is_express(dev) || !(dev->cap_present & QEMU_PCI_CAP_PM)) { + return false; + } + +- pmcsr = pci_get_word(dev->config + dev->exp.pm_cap + PCI_PM_CTRL); ++ pmcsr = pci_get_word(dev->config + dev->pm_cap + PCI_PM_CTRL); + + /* + * When No_Soft_Reset bit is set and the device +@@ -2328,7 +2326,7 @@ static void virtio_pci_bus_reset_hold(Object *obj, ResetType type) + + if (proxy->flags & VIRTIO_PCI_FLAG_INIT_PM) { + pci_word_test_and_clear_mask( +- dev->config + dev->exp.pm_cap + PCI_PM_CTRL, ++ dev->config + dev->pm_cap + PCI_PM_CTRL, + PCI_PM_CTRL_STATE_MASK); + } + } +diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h +index 5eddb90976..8a30d07fd0 100644 +--- a/include/hw/pci/pcie.h ++++ b/include/hw/pci/pcie.h +@@ -58,8 +58,6 @@ typedef enum { + struct PCIExpressDevice { + /* Offset of express capability in config space */ + uint8_t exp_cap; +- /* Offset of Power Management capability in config space */ +- uint8_t pm_cap; + + /* SLOT */ + bool hpev_notified; /* Logical AND of conditions for hot plug event. +-- +2.48.1 + diff --git a/SOURCES/kvm-qga-implement-a-guest-get-load-command.patch b/SOURCES/kvm-qga-implement-a-guest-get-load-command.patch new file mode 100644 index 0000000..d4622ff --- /dev/null +++ b/SOURCES/kvm-qga-implement-a-guest-get-load-command.patch @@ -0,0 +1,139 @@ +From 22f26a93ab94bf87c0724891a5886797a38c23b4 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Mon, 2 Dec 2024 12:19:27 +0000 +Subject: [PATCH 6/9] qga: implement a 'guest-get-load' command +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Konstantin Kostiuk +RH-MergeRequest: 343: RHEL-69622: qga: implement a 'guest-get-load' command +RH-Jira: RHEL-69622 +RH-Acked-by: Daniel P. 
Berrangé +RH-Acked-by: Jon Maloy +RH-Commit: [1/1] 9284c70737ad9f700d37f8c3833f855f2354acb7 (kkostiuk/redhat-centos-stream-src-qemu-kvm) + +Provide a way to report the process load average, via a new +'guest-get-load' command. + +This is only implemented for POSIX platforms providing 'getloadavg'. + +Example illustrated with qmp-shell: + +(QEMU) guest-get-load +{ + "return": { + "load15m": 1.546875, + "load1m": 1.669921875, + "load5m": 1.9306640625 + } +} + +Windows has no native equivalent API, but it would be possible to +simulate it as illustrated here (BSD-3-Clause): + + https://github.com/giampaolo/psutil/pull/1485 + +This is left as an exercise for future contributors. + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Konstantin Kostiuk +Message-ID: <20241202121927.864335-1-berrange@redhat.com> +Signed-off-by: Konstantin Kostiuk +--- + meson.build | 1 + + qga/commands-posix.c | 20 ++++++++++++++++++++ + qga/qapi-schema.json | 37 +++++++++++++++++++++++++++++++++++++ + 3 files changed, 58 insertions(+) + +diff --git a/meson.build b/meson.build +index b3529aa0e1..1dd97c6f49 100644 +--- a/meson.build ++++ b/meson.build +@@ -2497,6 +2497,7 @@ config_host_data.set('CONFIG_SETNS', cc.has_function('setns') and cc.has_functio + config_host_data.set('CONFIG_SYNCFS', cc.has_function('syncfs')) + config_host_data.set('CONFIG_SYNC_FILE_RANGE', cc.has_function('sync_file_range')) + config_host_data.set('CONFIG_TIMERFD', cc.has_function('timerfd_create')) ++config_host_data.set('CONFIG_GETLOADAVG', cc.has_function('getloadavg')) + config_host_data.set('HAVE_COPY_FILE_RANGE', cc.has_function('copy_file_range')) + config_host_data.set('HAVE_GETIFADDRS', cc.has_function('getifaddrs')) + config_host_data.set('HAVE_GLIB_WITH_SLICE_ALLOCATOR', glib_has_gslice) +diff --git a/qga/commands-posix.c b/qga/commands-posix.c +index 49e40f9127..abfa53d6e9 100644 +--- a/qga/commands-posix.c ++++ b/qga/commands-posix.c +@@ -1371,3 +1371,23 @@ char *qga_get_host_name(Error **errp) + + 
return g_steal_pointer(&hostname); + } ++ ++#ifdef CONFIG_GETLOADAVG ++GuestLoadAverage *qmp_guest_get_load(Error **errp) ++{ ++ double loadavg[3]; ++ GuestLoadAverage *ret = NULL; ++ ++ if (getloadavg(loadavg, G_N_ELEMENTS(loadavg)) < 0) { ++ error_setg_errno(errp, errno, ++ "cannot query load average"); ++ return NULL; ++ } ++ ++ ret = g_new0(GuestLoadAverage, 1); ++ ret->load1m = loadavg[0]; ++ ret->load5m = loadavg[1]; ++ ret->load15m = loadavg[2]; ++ return ret; ++} ++#endif +diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json +index 495706cf73..739f008ff2 100644 +--- a/qga/qapi-schema.json ++++ b/qga/qapi-schema.json +@@ -1852,6 +1852,43 @@ + 'if': 'CONFIG_LINUX' + } + ++ ++## ++# @GuestLoadAverage: ++# ++# Statistics about process load information ++# ++# @load1m: 1-minute load avage ++# ++# @load5m: 5-minute load avage ++# ++# @load15m: 15-minute load avage ++# ++# Since: 10.0 ++## ++{ 'struct': 'GuestLoadAverage', ++ 'data': { ++ 'load1m': 'number', ++ 'load5m': 'number', ++ 'load15m': 'number' ++ }, ++ 'if': 'CONFIG_GETLOADAVG' ++} ++ ++## ++# @guest-get-load: ++# ++# Retrieve CPU process load information ++# ++# Returns: load information ++# ++# Since: 10.0 ++## ++{ 'command': 'guest-get-load', ++ 'returns': 'GuestLoadAverage', ++ 'if': 'CONFIG_GETLOADAVG' ++} ++ + ## + # @GuestNetworkRoute: + # +-- +2.48.1 + diff --git a/SOURCES/kvm-rbd-Fix-.bdrv_get_specific_info-implementation.patch b/SOURCES/kvm-rbd-Fix-.bdrv_get_specific_info-implementation.patch new file mode 100644 index 0000000..e071b64 --- /dev/null +++ b/SOURCES/kvm-rbd-Fix-.bdrv_get_specific_info-implementation.patch @@ -0,0 +1,273 @@ +From 181b9ca805f3ae09c24a925eea0460525f30c90e Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Mon, 11 Aug 2025 15:40:10 +0200 +Subject: [PATCH] rbd: Fix .bdrv_get_specific_info implementation + +RH-Author: Kevin Wolf +RH-MergeRequest: 400: rbd: Fix .bdrv_get_specific_info implementation +RH-Jira: RHEL-108726 +RH-Acked-by: Hanna Czenczek +RH-Acked-by: 
Stefan Hajnoczi +RH-Commit: [1/1] 5a488d6e2355adcec7fc4fd686c6be001808a146 (kmwolf/centos-qemu-kvm) + +qemu_rbd_get_specific_info() has at least two problems: + +The first is that it issues a blocking rbd_read() call in order to probe +the encryption format for the image while querying the node. This means +that if the connection to the server goes down, not only I/O is stuck +(which is unavoidable), but query-named-block-nodes will actually make +the whole QEMU instance unresponsive. .bdrv_get_specific_info +implementations shouldn't perform blocking operations, but only return +what is already known. + +The second is that the information returned isn't even correct. If the +image is already opened with encryption enabled at the RBD level, we'll +probe for "double encryption", i.e. if the encrypted data contains +another encryption header. If it doesn't (which is the normal case), we +won't return the encryption format. If it does, we return misleading +information because it looks like we're talking about the outer level +(the encryption format of the image itself) while the information is +about an encryption header in the guest data. + +Fix this by storing the encryption format in BDRVRBDState when the image +is opened (and we do blocking operations anyway) and returning only the +stored information in qemu_rbd_get_specific_info(). + +The information we'll store is either the actual encryption format that +we enabled on the RBD level, or if the image is unencrypted, the result +of the same probing as we previously did when querying the node. Probing +image formats based on content that can be modified by the guest has +long been known as problematic, but as long as we only output it to the +user instead of making decisions based on it, it should be okay. It is +undoubtedly useful in the context of 'qemu-img info' when you're trying +to figure out which encryption options you have to use to open the +image successfully. 
+ +Fixes: 42e4ac9ef5a6 ("block/rbd: Add support for rbd image encryption") +Buglink: https://issues.redhat.com/browse/RHEL-105440 +Signed-off-by: Kevin Wolf +Message-ID: <20250811134010.81787-1-kwolf@redhat.com> +Reviewed-by: Hanna Czenczek +Signed-off-by: Kevin Wolf +(cherry picked from commit 4af976ef398e4e823addc00bf1c58787ba4952fe) +Signed-off-by: Kevin Wolf +--- + block/rbd.c | 104 ++++++++++++++++++++++++++++--------------- + qapi/block-core.json | 9 +++- + 2 files changed, 76 insertions(+), 37 deletions(-) + +diff --git a/block/rbd.c b/block/rbd.c +index 627f8eb05a..d5546da71b 100644 +--- a/block/rbd.c ++++ b/block/rbd.c +@@ -99,6 +99,14 @@ typedef struct BDRVRBDState { + char *namespace; + uint64_t image_size; + uint64_t object_size; ++ ++ /* ++ * If @bs->encrypted is true, this is the encryption format actually loaded ++ * at the librbd level. If it is false, it is the result of probing. ++ * RBD_IMAGE_ENCRYPTION_FORMAT__MAX means that encryption is not enabled and ++ * probing didn't find any known encryption header either. 
++ */ ++ RbdImageEncryptionFormat encryption_format; + } BDRVRBDState; + + typedef struct RBDTask { +@@ -471,10 +479,12 @@ static int qemu_rbd_encryption_format(rbd_image_t image, + return 0; + } + +-static int qemu_rbd_encryption_load(rbd_image_t image, ++static int qemu_rbd_encryption_load(BlockDriverState *bs, ++ rbd_image_t image, + RbdEncryptionOptions *encrypt, + Error **errp) + { ++ BDRVRBDState *s = bs->opaque; + int r = 0; + g_autofree char *passphrase = NULL; + rbd_encryption_luks1_format_options_t luks_opts; +@@ -545,15 +555,19 @@ static int qemu_rbd_encryption_load(rbd_image_t image, + error_setg_errno(errp, -r, "encryption load fail"); + return r; + } ++ bs->encrypted = true; ++ s->encryption_format = encrypt->format; + + return 0; + } + + #ifdef LIBRBD_SUPPORTS_ENCRYPTION_LOAD2 +-static int qemu_rbd_encryption_load2(rbd_image_t image, ++static int qemu_rbd_encryption_load2(BlockDriverState *bs, ++ rbd_image_t image, + RbdEncryptionOptions *encrypt, + Error **errp) + { ++ BDRVRBDState *s = bs->opaque; + int r = 0; + int encrypt_count = 1; + int i; +@@ -639,6 +653,8 @@ static int qemu_rbd_encryption_load2(rbd_image_t image, + error_setg_errno(errp, -r, "layered encryption load fail"); + goto exit; + } ++ bs->encrypted = true; ++ s->encryption_format = encrypt->format; + + exit: + for (i = 0; i < encrypt_count; ++i) { +@@ -672,6 +688,45 @@ exit: + #endif + #endif + ++/* ++ * For an image without encryption enabled on the rbd layer, probe the start of ++ * the image if it could be opened as an encrypted image so that we can display ++ * it when the user queries the node (most importantly in qemu-img). ++ * ++ * If the guest writes an encryption header to its disk after this probing, this ++ * won't be reflected when queried, but that's okay. There is no reason why the ++ * user should want to apply encryption at the rbd level while the image is ++ * still in use. This is just guest data. 
++ */ ++static void qemu_rbd_encryption_probe(BlockDriverState *bs) ++{ ++ BDRVRBDState *s = bs->opaque; ++ char buf[RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN] = {0}; ++ int r; ++ ++ assert(s->encryption_format == RBD_IMAGE_ENCRYPTION_FORMAT__MAX); ++ ++ r = rbd_read(s->image, 0, ++ RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN, buf); ++ if (r < RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) { ++ return; ++ } ++ ++ if (memcmp(buf, rbd_luks_header_verification, ++ RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) { ++ s->encryption_format = RBD_IMAGE_ENCRYPTION_FORMAT_LUKS; ++ } else if (memcmp(buf, rbd_luks2_header_verification, ++ RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) { ++ s->encryption_format = RBD_IMAGE_ENCRYPTION_FORMAT_LUKS2; ++ } else if (memcmp(buf, rbd_layered_luks_header_verification, ++ RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) { ++ s->encryption_format = RBD_IMAGE_ENCRYPTION_FORMAT_LUKS; ++ } else if (memcmp(buf, rbd_layered_luks2_header_verification, ++ RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) { ++ s->encryption_format = RBD_IMAGE_ENCRYPTION_FORMAT_LUKS2; ++ } ++} ++ + /* FIXME Deprecate and remove keypairs or make it available in QMP. 
*/ + static int qemu_rbd_do_create(BlockdevCreateOptions *options, + const char *keypairs, const char *password_secret, +@@ -1134,17 +1189,18 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, + goto failed_open; + } + ++ s->encryption_format = RBD_IMAGE_ENCRYPTION_FORMAT__MAX; + if (opts->encrypt) { + #ifdef LIBRBD_SUPPORTS_ENCRYPTION + if (opts->encrypt->parent) { + #ifdef LIBRBD_SUPPORTS_ENCRYPTION_LOAD2 +- r = qemu_rbd_encryption_load2(s->image, opts->encrypt, errp); ++ r = qemu_rbd_encryption_load2(bs, s->image, opts->encrypt, errp); + #else + r = -ENOTSUP; + error_setg(errp, "RBD library does not support layered encryption"); + #endif + } else { +- r = qemu_rbd_encryption_load(s->image, opts->encrypt, errp); ++ r = qemu_rbd_encryption_load(bs, s->image, opts->encrypt, errp); + } + if (r < 0) { + goto failed_post_open; +@@ -1154,6 +1210,8 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, + error_setg(errp, "RBD library does not support image encryption"); + goto failed_post_open; + #endif ++ } else { ++ qemu_rbd_encryption_probe(bs); + } + + r = rbd_stat(s->image, &info, sizeof(info)); +@@ -1413,17 +1471,6 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs, + { + BDRVRBDState *s = bs->opaque; + ImageInfoSpecific *spec_info; +- char buf[RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN] = {0}; +- int r; +- +- if (s->image_size >= RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) { +- r = rbd_read(s->image, 0, +- RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN, buf); +- if (r < 0) { +- error_setg_errno(errp, -r, "cannot read image start for probe"); +- return NULL; +- } +- } + + spec_info = g_new(ImageInfoSpecific, 1); + *spec_info = (ImageInfoSpecific){ +@@ -1431,28 +1478,13 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs, + .u.rbd.data = g_new0(ImageInfoSpecificRbd, 1), + }; + +- if (memcmp(buf, rbd_luks_header_verification, +- 
RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) { +- spec_info->u.rbd.data->encryption_format = +- RBD_IMAGE_ENCRYPTION_FORMAT_LUKS; +- spec_info->u.rbd.data->has_encryption_format = true; +- } else if (memcmp(buf, rbd_luks2_header_verification, +- RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) { +- spec_info->u.rbd.data->encryption_format = +- RBD_IMAGE_ENCRYPTION_FORMAT_LUKS2; +- spec_info->u.rbd.data->has_encryption_format = true; +- } else if (memcmp(buf, rbd_layered_luks_header_verification, +- RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) { +- spec_info->u.rbd.data->encryption_format = +- RBD_IMAGE_ENCRYPTION_FORMAT_LUKS; +- spec_info->u.rbd.data->has_encryption_format = true; +- } else if (memcmp(buf, rbd_layered_luks2_header_verification, +- RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) { +- spec_info->u.rbd.data->encryption_format = +- RBD_IMAGE_ENCRYPTION_FORMAT_LUKS2; +- spec_info->u.rbd.data->has_encryption_format = true; ++ if (s->encryption_format == RBD_IMAGE_ENCRYPTION_FORMAT__MAX) { ++ assert(!bs->encrypted); + } else { +- spec_info->u.rbd.data->has_encryption_format = false; ++ ImageInfoSpecificRbd *rbd_info = spec_info->u.rbd.data; ++ ++ rbd_info->has_encryption_format = true; ++ rbd_info->encryption_format = s->encryption_format; + } + + return spec_info; +diff --git a/qapi/block-core.json b/qapi/block-core.json +index 3969c60b93..15b91e2d4a 100644 +--- a/qapi/block-core.json ++++ b/qapi/block-core.json +@@ -158,7 +158,14 @@ + ## + # @ImageInfoSpecificRbd: + # +-# @encryption-format: Image encryption format ++# @encryption-format: Image encryption format. If encryption is enabled for the ++# image (see encrypted in BlockNodeInfo), this is the actual format in which the ++# image is accessed. If encryption is not enabled, this is the result of ++# probing when the image was opened, to give a suggestion which encryption ++# format could be enabled. 
Note that probing results can be changed by the ++# guest by writing a (possibly partial) encryption format header to the ++# image, so don't treat this information as trusted if the guest is not ++# trusted. + # + # Since: 6.1 + ## +-- +2.50.1 + diff --git a/SOURCES/kvm-redhat-Enable-virtio-mem-on-s390x.patch b/SOURCES/kvm-redhat-Enable-virtio-mem-on-s390x.patch new file mode 100644 index 0000000..ca56520 --- /dev/null +++ b/SOURCES/kvm-redhat-Enable-virtio-mem-on-s390x.patch @@ -0,0 +1,36 @@ +From 7300a435547b7e999227648fd1451db00e9c4867 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Mon, 24 Mar 2025 18:09:26 +0100 +Subject: [PATCH 26/26] redhat: Enable virtio-mem on s390x + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [26/26] 076b44c8f0262e903c5e17eda676614aec6f5c98 (thuth/qemu-kvm-cs) + +JIRA: https://issues.redhat.com/browse/RHEL-72977 + +Enable virtio-mem on s390x now, too. 
+ +Signed-off-by: Thomas Huth +--- + configs/devices/s390x-softmmu/s390x-rh-devices.mak | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/configs/devices/s390x-softmmu/s390x-rh-devices.mak b/configs/devices/s390x-softmmu/s390x-rh-devices.mak +index 24cf6dbd03..834281d872 100644 +--- a/configs/devices/s390x-softmmu/s390x-rh-devices.mak ++++ b/configs/devices/s390x-softmmu/s390x-rh-devices.mak +@@ -12,6 +12,7 @@ CONFIG_VFIO_CCW=y + CONFIG_VFIO_PCI=y + CONFIG_VHOST_USER=y + CONFIG_VIRTIO_CCW=y ++CONFIG_VIRTIO_MEM=y + CONFIG_WDT_DIAG288=y + CONFIG_VHOST_VSOCK=y + CONFIG_VHOST_USER_VSOCK=y +-- +2.48.1 + diff --git a/SOURCES/kvm-reset-Add-RESET_TYPE_WAKEUP.patch b/SOURCES/kvm-reset-Add-RESET_TYPE_WAKEUP.patch new file mode 100644 index 0000000..bcdc6ac --- /dev/null +++ b/SOURCES/kvm-reset-Add-RESET_TYPE_WAKEUP.patch @@ -0,0 +1,94 @@ +From 2de79d978c2cd29ad686dd91e74a86dbf2121f1f Mon Sep 17 00:00:00 2001 +From: Juraj Marcin +Date: Wed, 4 Sep 2024 12:37:13 +0200 +Subject: [PATCH 06/26] reset: Add RESET_TYPE_WAKEUP + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [6/26] 6169fe25bfa5715340c180ee8711d0ad61832106 (thuth/qemu-kvm-cs) + +Some devices need to distinguish cold start reset from waking up from a +suspended state. This patch adds a new value to the enum, and updates the +i386 wakeup method to use this new reset type. 
+ +Message-ID: <20240904103722.946194-3-jmarcin@redhat.com> +Reviewed-by: David Hildenbrand +Signed-off-by: Juraj Marcin +Signed-off-by: David Hildenbrand +(cherry picked from commit 759cbb4ee971da13ddfa8ad73befc2351d542044) +Signed-off-by: Thomas Huth +--- + docs/devel/reset.rst | 12 +++++++++++- + hw/i386/pc.c | 2 +- + include/hw/resettable.h | 2 ++ + 3 files changed, 14 insertions(+), 2 deletions(-) + +diff --git a/docs/devel/reset.rst b/docs/devel/reset.rst +index d2799eba7a..44bd51b42e 100644 +--- a/docs/devel/reset.rst ++++ b/docs/devel/reset.rst +@@ -44,6 +44,17 @@ The Resettable interface handles reset types with an enum ``ResetType``: + value on each cold reset, such as RNG seed information, and which they + must not reinitialize on a snapshot-load reset. + ++``RESET_TYPE_WAKEUP`` ++ If the machine supports waking up from a suspended state and needs to reset ++ its devices during wake-up (from the ``MachineClass::wakeup()`` method), this ++ reset type should be used for such a request. Devices can utilize this reset ++ type to differentiate the reset requested during machine wake-up from other ++ reset requests. For example, RAM content must not be lost during wake-up, and ++ memory devices like virtio-mem that provide additional RAM must not reset ++ such state during wake-ups, but might do so during cold resets. However, this ++ reset type should not be used for wake-up detection, as not every machine ++ type issues a device reset request during wake-up. ++ + ``RESET_TYPE_S390_CPU_NORMAL`` + This is only used for S390 CPU objects; it clears interrupts, stops + processing, and clears the TLB, but does not touch register contents. +@@ -53,7 +64,6 @@ The Resettable interface handles reset types with an enum ``ResetType``: + ``RESET_TYPE_S390_CPU_NORMAL`` does and also clears the PSW, prefix, + FPC, timer and control registers. It does not touch gprs, fprs or acrs. 
+ +- + Devices which implement reset methods must treat any unknown ``ResetType`` + as equivalent to ``RESET_TYPE_COLD``; this will reduce the amount of + existing code we need to change if we add more types in future. +diff --git a/hw/i386/pc.c b/hw/i386/pc.c +index fedcf2a65f..fa9f16cbaf 100644 +--- a/hw/i386/pc.c ++++ b/hw/i386/pc.c +@@ -1889,7 +1889,7 @@ static void pc_machine_reset(MachineState *machine, ResetType type) + static void pc_machine_wakeup(MachineState *machine) + { + cpu_synchronize_all_states(); +- pc_machine_reset(machine, RESET_TYPE_COLD); ++ pc_machine_reset(machine, RESET_TYPE_WAKEUP); + cpu_synchronize_all_post_reset(); + } + +diff --git a/include/hw/resettable.h b/include/hw/resettable.h +index 83b561fc83..cf37cd5ead 100644 +--- a/include/hw/resettable.h ++++ b/include/hw/resettable.h +@@ -29,6 +29,7 @@ typedef struct ResettableState ResettableState; + * Types of reset. + * + * + Cold: reset resulting from a power cycle of the object. ++ * + Wakeup: reset resulting from a wake-up from a suspended state. + * + * TODO: Support has to be added to handle more types. In particular, + * ResettableState structure needs to be expanded. 
+@@ -36,6 +37,7 @@ typedef struct ResettableState ResettableState; + typedef enum ResetType { + RESET_TYPE_COLD, + RESET_TYPE_SNAPSHOT_LOAD, ++ RESET_TYPE_WAKEUP, + RESET_TYPE_S390_CPU_INITIAL, + RESET_TYPE_S390_CPU_NORMAL, + } ResetType; +-- +2.48.1 + diff --git a/SOURCES/kvm-reset-Use-ResetType-for-qemu_devices_reset-and-Machi.patch b/SOURCES/kvm-reset-Use-ResetType-for-qemu_devices_reset-and-Machi.patch new file mode 100644 index 0000000..2b0b933 --- /dev/null +++ b/SOURCES/kvm-reset-Use-ResetType-for-qemu_devices_reset-and-Machi.patch @@ -0,0 +1,360 @@ +From 8d48193b5a661f31c1c1db068d241b31ae379339 Mon Sep 17 00:00:00 2001 +From: Juraj Marcin +Date: Wed, 4 Sep 2024 12:37:12 +0200 +Subject: [PATCH 05/26] reset: Use ResetType for qemu_devices_reset() and + MachineClass::reset() + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [5/26] ea1324b27885d979bcc54cc355dbdf940686776c (thuth/qemu-kvm-cs) + +Currently, both qemu_devices_reset() and MachineClass::reset() use +ShutdownCause for the reason of the reset. However, the Resettable +interface uses ResetType, so ShutdownCause needs to be translated to +ResetType somewhere. Translating it in qemu_devices_reset() makes adding +new reset types harder, as they cannot always be matched to a single +ShutdownCause here, and devices may need to check the ResetType to +determine what to reset and if to reset at all. + +This patch moves this translation up in the call stack to +qemu_system_reset() and updates all MachineClass children to use the +ResetType instead.
+ +Message-ID: <20240904103722.946194-2-jmarcin@redhat.com> +Reviewed-by: David Hildenbrand +Reviewed-by: Peter Maydell +Signed-off-by: Juraj Marcin +Signed-off-by: David Hildenbrand +(cherry picked from commit 1b063fe2df002052cc2d10799764979b8c583480) +Signed-off-by: Thomas Huth +--- + hw/arm/aspeed.c | 4 ++-- + hw/arm/mps2-tz.c | 4 ++-- + hw/core/reset.c | 5 +---- + hw/hppa/machine.c | 4 ++-- + hw/i386/microvm.c | 4 ++-- + hw/i386/pc.c | 6 +++--- + hw/ppc/pegasos2.c | 4 ++-- + hw/ppc/pnv.c | 4 ++-- + hw/ppc/spapr.c | 6 +++--- + hw/s390x/s390-virtio-ccw.c | 4 ++-- + include/hw/boards.h | 3 ++- + include/sysemu/reset.h | 5 +++-- + system/runstate.c | 13 +++++++++++-- + 13 files changed, 37 insertions(+), 29 deletions(-) + +diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c +index fd5603f7aa..cbca7685da 100644 +--- a/hw/arm/aspeed.c ++++ b/hw/arm/aspeed.c +@@ -1529,12 +1529,12 @@ static void aspeed_machine_bletchley_class_init(ObjectClass *oc, void *data) + aspeed_machine_class_init_cpus_defaults(mc); + } + +-static void fby35_reset(MachineState *state, ShutdownCause reason) ++static void fby35_reset(MachineState *state, ResetType type) + { + AspeedMachineState *bmc = ASPEED_MACHINE(state); + AspeedGPIOState *gpio = &bmc->soc->gpio; + +- qemu_devices_reset(reason); ++ qemu_devices_reset(type); + + /* Board ID: 7 (Class-1, 4 slots) */ + object_property_set_bool(OBJECT(gpio), "gpioV4", true, &error_fatal); +diff --git a/hw/arm/mps2-tz.c b/hw/arm/mps2-tz.c +index aec57c0d68..8edf57a66d 100644 +--- a/hw/arm/mps2-tz.c ++++ b/hw/arm/mps2-tz.c +@@ -1254,7 +1254,7 @@ static void mps2_set_remap(Object *obj, const char *value, Error **errp) + } + } + +-static void mps2_machine_reset(MachineState *machine, ShutdownCause reason) ++static void mps2_machine_reset(MachineState *machine, ResetType type) + { + MPS2TZMachineState *mms = MPS2TZ_MACHINE(machine); + +@@ -1264,7 +1264,7 @@ static void mps2_machine_reset(MachineState *machine, ShutdownCause reason) + * reset see the correct 
mapping. + */ + remap_memory(mms, mms->remap); +- qemu_devices_reset(reason); ++ qemu_devices_reset(type); + } + + static void mps2tz_class_init(ObjectClass *oc, void *data) +diff --git a/hw/core/reset.c b/hw/core/reset.c +index 58dfc8db3d..14a2639fbf 100644 +--- a/hw/core/reset.c ++++ b/hw/core/reset.c +@@ -170,11 +170,8 @@ void qemu_unregister_resettable(Object *obj) + resettable_container_remove(get_root_reset_container(), obj); + } + +-void qemu_devices_reset(ShutdownCause reason) ++void qemu_devices_reset(ResetType type) + { +- ResetType type = (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD) ? +- RESET_TYPE_SNAPSHOT_LOAD : RESET_TYPE_COLD; +- + /* Reset the simulation */ + resettable_reset(OBJECT(get_root_reset_container()), type); + } +diff --git a/hw/hppa/machine.c b/hw/hppa/machine.c +index 5d0a8739de..8259fe2e38 100644 +--- a/hw/hppa/machine.c ++++ b/hw/hppa/machine.c +@@ -642,12 +642,12 @@ static void machine_HP_C3700_init(MachineState *machine) + machine_HP_common_init_tail(machine, pci_bus, translate); + } + +-static void hppa_machine_reset(MachineState *ms, ShutdownCause reason) ++static void hppa_machine_reset(MachineState *ms, ResetType type) + { + unsigned int smp_cpus = ms->smp.cpus; + int i; + +- qemu_devices_reset(reason); ++ qemu_devices_reset(type); + + /* Start all CPUs at the firmware entry point. 
+ * Monarch CPU will initialize firmware, secondary CPUs +diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c +index 40edcee7af..8ae4dff7f2 100644 +--- a/hw/i386/microvm.c ++++ b/hw/i386/microvm.c +@@ -462,7 +462,7 @@ static void microvm_machine_state_init(MachineState *machine) + microvm_devices_init(mms); + } + +-static void microvm_machine_reset(MachineState *machine, ShutdownCause reason) ++static void microvm_machine_reset(MachineState *machine, ResetType type) + { + MicrovmMachineState *mms = MICROVM_MACHINE(machine); + CPUState *cs; +@@ -475,7 +475,7 @@ static void microvm_machine_reset(MachineState *machine, ShutdownCause reason) + mms->kernel_cmdline_fixed = true; + } + +- qemu_devices_reset(reason); ++ qemu_devices_reset(type); + + CPU_FOREACH(cs) { + cpu = X86_CPU(cs); +diff --git a/hw/i386/pc.c b/hw/i386/pc.c +index fa0e42d072..fedcf2a65f 100644 +--- a/hw/i386/pc.c ++++ b/hw/i386/pc.c +@@ -1869,12 +1869,12 @@ static void pc_machine_initfn(Object *obj) + qemu_add_machine_init_done_notifier(&pcms->machine_done); + } + +-static void pc_machine_reset(MachineState *machine, ShutdownCause reason) ++static void pc_machine_reset(MachineState *machine, ResetType type) + { + CPUState *cs; + X86CPU *cpu; + +- qemu_devices_reset(reason); ++ qemu_devices_reset(type); + + /* Reset APIC after devices have been reset to cancel + * any changes that qemu_devices_reset() might have done. 
+@@ -1889,7 +1889,7 @@ static void pc_machine_reset(MachineState *machine, ShutdownCause reason) + static void pc_machine_wakeup(MachineState *machine) + { + cpu_synchronize_all_states(); +- pc_machine_reset(machine, SHUTDOWN_CAUSE_NONE); ++ pc_machine_reset(machine, RESET_TYPE_COLD); + cpu_synchronize_all_post_reset(); + } + +diff --git a/hw/ppc/pegasos2.c b/hw/ppc/pegasos2.c +index 9b0a6b70ab..8ff4a00c34 100644 +--- a/hw/ppc/pegasos2.c ++++ b/hw/ppc/pegasos2.c +@@ -291,14 +291,14 @@ static void pegasos2_superio_write(uint8_t addr, uint8_t val) + cpu_physical_memory_write(PCI1_IO_BASE + 0x3f1, &val, 1); + } + +-static void pegasos2_machine_reset(MachineState *machine, ShutdownCause reason) ++static void pegasos2_machine_reset(MachineState *machine, ResetType type) + { + Pegasos2MachineState *pm = PEGASOS2_MACHINE(machine); + void *fdt; + uint64_t d[2]; + int sz; + +- qemu_devices_reset(reason); ++ qemu_devices_reset(type); + if (!pm->vof) { + return; /* Firmware should set up machine so nothing to do */ + } +diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c +index 3526852685..988fd55d88 100644 +--- a/hw/ppc/pnv.c ++++ b/hw/ppc/pnv.c +@@ -709,13 +709,13 @@ static void pnv_powerdown_notify(Notifier *n, void *opaque) + } + } + +-static void pnv_reset(MachineState *machine, ShutdownCause reason) ++static void pnv_reset(MachineState *machine, ResetType type) + { + PnvMachineState *pnv = PNV_MACHINE(machine); + IPMIBmc *bmc; + void *fdt; + +- qemu_devices_reset(reason); ++ qemu_devices_reset(type); + + /* + * The machine should provide by default an internal BMC simulator. 
+diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c +index 29e66f1b3f..11c953669a 100644 +--- a/hw/ppc/spapr.c ++++ b/hw/ppc/spapr.c +@@ -1725,7 +1725,7 @@ void spapr_check_mmu_mode(bool guest_radix) + } + } + +-static void spapr_machine_reset(MachineState *machine, ShutdownCause reason) ++static void spapr_machine_reset(MachineState *machine, ResetType type) + { + SpaprMachineState *spapr = SPAPR_MACHINE(machine); + PowerPCCPU *first_ppc_cpu; +@@ -1733,7 +1733,7 @@ static void spapr_machine_reset(MachineState *machine, ShutdownCause reason) + void *fdt; + int rc; + +- if (reason != SHUTDOWN_CAUSE_SNAPSHOT_LOAD) { ++ if (type != RESET_TYPE_SNAPSHOT_LOAD) { + /* + * Record-replay snapshot load must not consume random, this was + * already replayed from initial machine reset. +@@ -1769,7 +1769,7 @@ static void spapr_machine_reset(MachineState *machine, ShutdownCause reason) + spapr_setup_hpt(spapr); + } + +- qemu_devices_reset(reason); ++ qemu_devices_reset(type); + + spapr_ovec_cleanup(spapr->ov5_cas); + spapr->ov5_cas = spapr_ovec_new(); +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index ef2a9687c7..94cad1705b 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -434,7 +434,7 @@ static void s390_pv_prepare_reset(S390CcwMachineState *ms) + s390_pv_prep_reset(); + } + +-static void s390_machine_reset(MachineState *machine, ShutdownCause reason) ++static void s390_machine_reset(MachineState *machine, ResetType type) + { + S390CcwMachineState *ms = S390_CCW_MACHINE(machine); + enum s390_reset reset_type; +@@ -466,7 +466,7 @@ static void s390_machine_reset(MachineState *machine, ShutdownCause reason) + * Device reset includes CPU clear resets so this has to be + * done AFTER the unprotect call above. 
+ */ +- qemu_devices_reset(reason); ++ qemu_devices_reset(type); + s390_crypto_reset(); + + /* configure and start the ipl CPU only */ +diff --git a/include/hw/boards.h b/include/hw/boards.h +index ffefc0a625..fe011b1e86 100644 +--- a/include/hw/boards.h ++++ b/include/hw/boards.h +@@ -10,6 +10,7 @@ + #include "qemu/module.h" + #include "qom/object.h" + #include "hw/core/cpu.h" ++#include "hw/resettable.h" + + #define TYPE_MACHINE_SUFFIX "-machine" + +@@ -253,7 +254,7 @@ struct MachineClass { + const char *deprecation_reason; + + void (*init)(MachineState *state); +- void (*reset)(MachineState *state, ShutdownCause reason); ++ void (*reset)(MachineState *state, ResetType type); + void (*wakeup)(MachineState *state); + int (*kvm_type)(MachineState *machine, const char *arg); + +diff --git a/include/sysemu/reset.h b/include/sysemu/reset.h +index ae436044a9..0e297c0e02 100644 +--- a/include/sysemu/reset.h ++++ b/include/sysemu/reset.h +@@ -27,6 +27,7 @@ + #ifndef QEMU_SYSEMU_RESET_H + #define QEMU_SYSEMU_RESET_H + ++#include "hw/resettable.h" + #include "qapi/qapi-events-run-state.h" + + typedef void QEMUResetHandler(void *opaque); +@@ -110,7 +111,7 @@ void qemu_unregister_reset(QEMUResetHandler *func, void *opaque); + + /** + * qemu_devices_reset: Perform a complete system reset +- * @reason: reason for the reset ++ * @reason: type of the reset + * + * This function performs the low-level work needed to do a complete reset + * of the system (calling all the callbacks registered with +@@ -121,6 +122,6 @@ void qemu_unregister_reset(QEMUResetHandler *func, void *opaque); + * If you want to trigger a system reset from, for instance, a device + * model, don't use this function. Use qemu_system_reset_request(). 
+ */ +-void qemu_devices_reset(ShutdownCause reason); ++void qemu_devices_reset(ResetType type); + + #endif +diff --git a/system/runstate.c b/system/runstate.c +index a0e2a5fd22..c2c9afa905 100644 +--- a/system/runstate.c ++++ b/system/runstate.c +@@ -32,6 +32,7 @@ + #include "exec/cpu-common.h" + #include "gdbstub/syscalls.h" + #include "hw/boards.h" ++#include "hw/resettable.h" + #include "migration/misc.h" + #include "migration/postcopy-ram.h" + #include "monitor/monitor.h" +@@ -507,15 +508,23 @@ static int qemu_debug_requested(void) + void qemu_system_reset(ShutdownCause reason) + { + MachineClass *mc; ++ ResetType type; + + mc = current_machine ? MACHINE_GET_CLASS(current_machine) : NULL; + + cpu_synchronize_all_states(); + ++ switch (reason) { ++ case SHUTDOWN_CAUSE_SNAPSHOT_LOAD: ++ type = RESET_TYPE_SNAPSHOT_LOAD; ++ break; ++ default: ++ type = RESET_TYPE_COLD; ++ } + if (mc && mc->reset) { +- mc->reset(current_machine, reason); ++ mc->reset(current_machine, type); + } else { +- qemu_devices_reset(reason); ++ qemu_devices_reset(type); + } + switch (reason) { + case SHUTDOWN_CAUSE_NONE: +-- +2.48.1 + diff --git a/SOURCES/kvm-s390x-Fix-leak-in-machine_set_loadparm.patch b/SOURCES/kvm-s390x-Fix-leak-in-machine_set_loadparm.patch new file mode 100644 index 0000000..8b2660d --- /dev/null +++ b/SOURCES/kvm-s390x-Fix-leak-in-machine_set_loadparm.patch @@ -0,0 +1,60 @@ +From 4f627e0ae8efb96380070b6a8d50e88c71f40477 Mon Sep 17 00:00:00 2001 +From: Fabiano Rosas +Date: Fri, 9 May 2025 14:49:38 -0300 +Subject: [PATCH 01/57] s390x: Fix leak in machine_set_loadparm +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +RH-MergeRequest: 387: s390x: Fix memory leaks related to loadparm [rhel-9] +RH-Jira: RHEL-98554 +RH-Acked-by: Cédric Le Goater +RH-Acked-by: Kevin Wolf +RH-Commit: [1/2] dadf5b9e187a644e0a8a8c565b1b913ef7f4dcc8 (thuth/qemu-kvm-cs) + +ASAN spotted a leaking string in machine_set_loadparm(): 
+ +Direct leak of 9 byte(s) in 1 object(s) allocated from: + #0 0x560ffb5bb379 in malloc ../projects/compiler-rt/lib/asan/asan_malloc_linux.cpp:69:3 + #1 0x7f1aca926518 in g_malloc ../glib/gmem.c:106 + #2 0x7f1aca94113e in g_strdup ../glib/gstrfuncs.c:364 + #3 0x560ffc8afbf9 in qobject_input_type_str ../qapi/qobject-input-visitor.c:542:12 + #4 0x560ffc8a80ff in visit_type_str ../qapi/qapi-visit-core.c:349:10 + #5 0x560ffbe6053a in machine_set_loadparm ../hw/s390x/s390-virtio-ccw.c:802:10 + #6 0x560ffc0c5e52 in object_property_set ../qom/object.c:1450:5 + #7 0x560ffc0d4175 in object_property_set_qobject ../qom/qom-qobject.c:28:10 + #8 0x560ffc0c6004 in object_property_set_str ../qom/object.c:1458:15 + #9 0x560ffbe2ae60 in update_machine_ipl_properties ../hw/s390x/ipl.c:569:9 + #10 0x560ffbe2aa65 in s390_ipl_update_diag308 ../hw/s390x/ipl.c:594:5 + #11 0x560ffbdee132 in handle_diag_308 ../target/s390x/diag.c:147:9 + #12 0x560ffbebb956 in helper_diag ../target/s390x/tcg/misc_helper.c:137:9 + #13 0x7f1a3c51c730 (/memfd:tcg-jit (deleted)+0x39730) + +Cc: qemu-stable@nongnu.org +Signed-off-by: Fabiano Rosas +Message-ID: <20250509174938.25935-1-farosas@suse.de> +Fixes: 1fd396e3228 ("s390x: Register TYPE_S390_CCW_MACHINE properties as class properties") +Reviewed-by: Thomas Huth +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Thomas Huth +(cherry picked from commit bdf12f2a56bf3f13c52eb51f0a994bbfe40706b2) +--- + hw/s390x/s390-virtio-ccw.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index 77a1bde71e..fc18ab575f 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -782,6 +782,7 @@ static void machine_set_loadparm(Object *obj, Visitor *v, + } + + s390_ipl_fmt_loadparm(ms->loadparm, val, errp); ++ g_free(val); + } + + static void ccw_machine_class_init(ObjectClass *oc, void *data) +-- +2.39.3 + diff --git a/SOURCES/kvm-s390x-introduce-s390_get_memory_limit.patch 
b/SOURCES/kvm-s390x-introduce-s390_get_memory_limit.patch new file mode 100644 index 0000000..8647d8e --- /dev/null +++ b/SOURCES/kvm-s390x-introduce-s390_get_memory_limit.patch @@ -0,0 +1,144 @@ +From 1dd38383832fc27f2980f33bb5e10ec1af7e3fc3 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 19 Dec 2024 15:41:07 +0100 +Subject: [PATCH 15/26] s390x: introduce s390_get_memory_limit() + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [15/26] 5ae6a624a6541283cb15e90ebeb8fef3940c823b (thuth/qemu-kvm-cs) + +Let's add s390_get_memory_limit(), to query what has been successfully +set via s390_set_memory_limit(). Allow setting the limit only once. + +We'll remember the limit in the machine state. Move +s390_set_memory_limit() to machine code, merging it into +set_memory_limit(), because this really is a machine property. + +Message-ID: <20241219144115.2820241-7-david@redhat.com> +Acked-by: Michael S. 
Tsirkin +Reviewed-by: Thomas Huth +Signed-off-by: David Hildenbrand +(cherry picked from commit 27221b69a3ea49339a1f82b9622126f3928e0915) +Signed-off-by: Thomas Huth +--- + hw/s390x/s390-virtio-ccw.c | 17 ++++++++++++----- + include/hw/s390x/s390-virtio-ccw.h | 8 ++++++++ + target/s390x/cpu-sysemu.c | 8 -------- + target/s390x/cpu.h | 1 - + 4 files changed, 20 insertions(+), 14 deletions(-) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index 248ac28d20..f5f147eb92 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -45,6 +45,7 @@ + #include "migration/blocker.h" + #include "qapi/visitor.h" + #include "hw/s390x/cpu-topology.h" ++#include "kvm/kvm_s390x.h" + #include CONFIG_DEVICES + + static Error *pv_mig_blocker; +@@ -121,12 +122,16 @@ static void subsystem_reset(void) + } + } + +-static void set_memory_limit(uint64_t new_limit) ++static void s390_set_memory_limit(S390CcwMachineState *s390ms, ++ uint64_t new_limit) + { +- uint64_t hw_limit; +- int ret; ++ uint64_t hw_limit = 0; ++ int ret = 0; + +- ret = s390_set_memory_limit(new_limit, &hw_limit); ++ assert(!s390ms->memory_limit && new_limit); ++ if (kvm_enabled()) { ++ ret = kvm_s390_set_mem_limit(new_limit, &hw_limit); ++ } + if (ret == -E2BIG) { + error_report("host supports a maximum of %" PRIu64 " GB", + hw_limit / GiB); +@@ -135,10 +140,12 @@ static void set_memory_limit(uint64_t new_limit) + error_report("setting the guest size failed"); + exit(EXIT_FAILURE); + } ++ s390ms->memory_limit = new_limit; + } + + static void s390_memory_init(MachineState *machine) + { ++ S390CcwMachineState *s390ms = S390_CCW_MACHINE(machine); + MemoryRegion *sysmem = get_system_memory(); + MemoryRegion *ram = machine->ram; + uint64_t ram_size = memory_region_size(ram); +@@ -154,7 +161,7 @@ static void s390_memory_init(MachineState *machine) + exit(EXIT_FAILURE); + } + +- set_memory_limit(ram_size); ++ s390_set_memory_limit(s390ms, ram_size); + + /* Map the initial memory. 
Must happen after setting the memory limit. */ + memory_region_add_subregion(sysmem, 0, ram); +diff --git a/include/hw/s390x/s390-virtio-ccw.h b/include/hw/s390x/s390-virtio-ccw.h +index 996864a34e..de04336c5a 100644 +--- a/include/hw/s390x/s390-virtio-ccw.h ++++ b/include/hw/s390x/s390-virtio-ccw.h +@@ -29,10 +29,18 @@ struct S390CcwMachineState { + bool dea_key_wrap; + bool pv; + uint8_t loadparm[8]; ++ uint64_t memory_limit; + + SCLPDevice *sclp; + }; + ++static inline uint64_t s390_get_memory_limit(S390CcwMachineState *s390ms) ++{ ++ /* We expect to be called only after the limit was set. */ ++ assert(s390ms->memory_limit); ++ return s390ms->memory_limit; ++} ++ + #define S390_PTF_REASON_NONE (0x00 << 8) + #define S390_PTF_REASON_DONE (0x01 << 8) + #define S390_PTF_REASON_BUSY (0x02 << 8) +diff --git a/target/s390x/cpu-sysemu.c b/target/s390x/cpu-sysemu.c +index 1cd30c1d84..3118a25fee 100644 +--- a/target/s390x/cpu-sysemu.c ++++ b/target/s390x/cpu-sysemu.c +@@ -255,14 +255,6 @@ unsigned int s390_cpu_set_state(uint8_t cpu_state, S390CPU *cpu) + return s390_count_running_cpus(); + } + +-int s390_set_memory_limit(uint64_t new_limit, uint64_t *hw_limit) +-{ +- if (kvm_enabled()) { +- return kvm_s390_set_mem_limit(new_limit, hw_limit); +- } +- return 0; +-} +- + void s390_set_max_pagesize(uint64_t pagesize, Error **errp) + { + if (kvm_enabled()) { +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index 6a64472403..ecaf3191d2 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -881,7 +881,6 @@ static inline void s390_do_cpu_load_normal(CPUState *cs, run_on_cpu_data arg) + + /* cpu.c */ + void s390_crypto_reset(void); +-int s390_set_memory_limit(uint64_t new_limit, uint64_t *hw_limit); + void s390_set_max_pagesize(uint64_t pagesize, Error **errp); + void s390_cmma_reset(void); + void s390_enable_css_support(S390CPU *cpu); +-- +2.48.1 + diff --git a/SOURCES/kvm-s390x-pci-add-support-for-guests-that-request-direct.patch 
b/SOURCES/kvm-s390x-pci-add-support-for-guests-that-request-direct.patch new file mode 100644 index 0000000..da8c553 --- /dev/null +++ b/SOURCES/kvm-s390x-pci-add-support-for-guests-that-request-direct.patch @@ -0,0 +1,256 @@ +From c60d0770ff3f9124e6e9d7beb03e1ef8067e8e26 Mon Sep 17 00:00:00 2001 +From: Christoph Schlameuss +Date: Thu, 12 Jun 2025 13:25:32 +0200 +Subject: [PATCH 01/16] s390x/pci: add support for guests that request direct + mapping +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Christoph Schlameuss +RH-MergeRequest: 376: Draft: KVM: Performance Enhanced Refresh PCI Translation +RH-Jira: RHEL-11430 +RH-Acked-by: Thomas Huth +RH-Acked-by: Cédric Le Goater +RH-Commit: [1/2] 11d1dd9a5add55ae43d5d922588a33945ecbfe27 (cschlame/qemu-kvm) + +JIRA: https://issues.redhat.com/browse/RHEL-11430 +Conflicts: hw/s390x/s390-pci-bus.c old s390_pci_device_properties[] still has DEFINE_PROP_END_OF_LIST() + hw/s390x/s390-pci-inst.c hw_accel.h is still in sysemu + hw/s390x/s390-virtio-ccw.c changes from ccw_machine_9_2_class_options() moved to ccw_rhel_machine_9_6_0_class_options() + +commit dfcee1ea4c52ac60e0a06221eafb7b6253eb10c3 +Author: Matthew Rosato +Date: Wed Feb 26 16:00:12 2025 -0500 + + s390x/pci: add support for guests that request direct mapping + + When receiving a guest mpcifc(4) or mpcifc(6) instruction without the T + bit set, treat this as a request to perform direct mapping instead of + address translation. In order to facilitate this, pin the entirety of + guest memory into the host iommu. + + Pinning for the direct mapping case is handled via vfio and its memory + listener. Additionally, ram discard settings are inherited from vfio: + coordinated discards (e.g. virtio-mem) are allowed while uncoordinated + discards (e.g. virtio-balloon) are disabled. 
+ + Subsequent guest DMA operations are all expected to be of the format + guest_phys+sdma, allowing them to be used as lookup into the host + iommu table. + + Signed-off-by: Matthew Rosato + Reviewed-by: David Hildenbrand + Message-ID: <20250226210013.238349-2-mjrosato@linux.ibm.com> + Signed-off-by: Thomas Huth + +Signed-off-by: Christoph Schlameuss +--- + hw/s390x/s390-pci-bus.c | 39 +++++++++++++++++++++++++++++++-- + hw/s390x/s390-pci-inst.c | 13 +++++++++-- + hw/s390x/s390-pci-vfio.c | 23 +++++++++++++++---- + hw/s390x/s390-virtio-ccw.c | 5 +++++ + include/hw/s390x/s390-pci-bus.h | 3 +++ + 5 files changed, 75 insertions(+), 8 deletions(-) + +diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c +index 3e57d5faca..13bc02d837 100644 +--- a/hw/s390x/s390-pci-bus.c ++++ b/hw/s390x/s390-pci-bus.c +@@ -18,6 +18,8 @@ + #include "hw/s390x/s390-pci-inst.h" + #include "hw/s390x/s390-pci-kvm.h" + #include "hw/s390x/s390-pci-vfio.h" ++#include "hw/s390x/s390-virtio-ccw.h" ++#include "hw/boards.h" + #include "hw/pci/pci_bus.h" + #include "hw/qdev-properties.h" + #include "hw/pci/pci_bridge.h" +@@ -724,12 +726,42 @@ void s390_pci_iommu_enable(S390PCIIOMMU *iommu) + g_free(name); + } + ++void s390_pci_iommu_direct_map_enable(S390PCIIOMMU *iommu) ++{ ++ MachineState *ms = MACHINE(qdev_get_machine()); ++ S390CcwMachineState *s390ms = S390_CCW_MACHINE(ms); ++ ++ /* ++ * For direct-mapping we must map the entire guest address space. Rather ++ * than using an iommu, create a memory region alias that maps GPA X to ++ * IOVA X + SDMA. VFIO will handle pinning via its memory listener. 
++ */ ++ g_autofree char *name = g_strdup_printf("iommu-dm-s390-%04x", ++ iommu->pbdev->uid); ++ ++ iommu->dm_mr = g_malloc0(sizeof(*iommu->dm_mr)); ++ memory_region_init_alias(iommu->dm_mr, OBJECT(&iommu->mr), name, ++ get_system_memory(), 0, ++ s390_get_memory_limit(s390ms)); ++ iommu->enabled = true; ++ memory_region_add_subregion(&iommu->mr, iommu->pbdev->zpci_fn.sdma, ++ iommu->dm_mr); ++} ++ + void s390_pci_iommu_disable(S390PCIIOMMU *iommu) + { + iommu->enabled = false; + g_hash_table_remove_all(iommu->iotlb); +- memory_region_del_subregion(&iommu->mr, MEMORY_REGION(&iommu->iommu_mr)); +- object_unparent(OBJECT(&iommu->iommu_mr)); ++ if (iommu->dm_mr) { ++ memory_region_del_subregion(&iommu->mr, iommu->dm_mr); ++ object_unparent(OBJECT(iommu->dm_mr)); ++ g_free(iommu->dm_mr); ++ iommu->dm_mr = NULL; ++ } else { ++ memory_region_del_subregion(&iommu->mr, ++ MEMORY_REGION(&iommu->iommu_mr)); ++ object_unparent(OBJECT(&iommu->iommu_mr)); ++ } + } + + static void s390_pci_iommu_free(S390pciState *s, PCIBus *bus, int32_t devfn) +@@ -1130,6 +1162,7 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, + /* Always intercept emulated devices */ + pbdev->interp = false; + pbdev->forwarding_assist = false; ++ pbdev->rtr_avail = false; + } + + if (s390_pci_msix_init(pbdev) && !pbdev->interp) { +@@ -1488,6 +1521,8 @@ static Property s390_pci_device_properties[] = { + DEFINE_PROP_BOOL("interpret", S390PCIBusDevice, interp, true), + DEFINE_PROP_BOOL("forwarding-assist", S390PCIBusDevice, forwarding_assist, + true), ++ DEFINE_PROP_BOOL("relaxed-translation", S390PCIBusDevice, rtr_avail, ++ true), + DEFINE_PROP_END_OF_LIST(), + }; + +diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c +index 30149546c0..803ebcd9b3 100644 +--- a/hw/s390x/s390-pci-inst.c ++++ b/hw/s390x/s390-pci-inst.c +@@ -16,6 +16,7 @@ + #include "exec/memory.h" + #include "qemu/error-report.h" + #include "sysemu/hw_accel.h" ++#include "hw/boards.h" + #include 
"hw/pci/pci_device.h" + #include "hw/s390x/s390-pci-inst.h" + #include "hw/s390x/s390-pci-bus.h" +@@ -1008,17 +1009,25 @@ static int reg_ioat(CPUS390XState *env, S390PCIBusDevice *pbdev, ZpciFib fib, + } + + /* currently we only support designation type 1 with translation */ +- if (!(dt == ZPCI_IOTA_RTTO && t)) { ++ if (t && dt != ZPCI_IOTA_RTTO) { + error_report("unsupported ioat dt %d t %d", dt, t); + s390_program_interrupt(env, PGM_OPERAND, ra); + return -EINVAL; ++ } else if (!t && !pbdev->rtr_avail) { ++ error_report("relaxed translation not allowed"); ++ s390_program_interrupt(env, PGM_OPERAND, ra); ++ return -EINVAL; + } + + iommu->pba = pba; + iommu->pal = pal; + iommu->g_iota = g_iota; + +- s390_pci_iommu_enable(iommu); ++ if (t) { ++ s390_pci_iommu_enable(iommu); ++ } else { ++ s390_pci_iommu_direct_map_enable(iommu); ++ } + + return 0; + } +diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c +index 7dbbc76823..443e222912 100644 +--- a/hw/s390x/s390-pci-vfio.c ++++ b/hw/s390x/s390-pci-vfio.c +@@ -131,13 +131,28 @@ static void s390_pci_read_base(S390PCIBusDevice *pbdev, + /* Store function type separately for type-specific behavior */ + pbdev->pft = cap->pft; + ++ /* ++ * If the device is a passthrough ISM device, disallow relaxed ++ * translation. ++ */ ++ if (pbdev->pft == ZPCI_PFT_ISM) { ++ pbdev->rtr_avail = false; ++ } ++ + /* + * If appropriate, reduce the size of the supported DMA aperture reported +- * to the guest based upon the vfio DMA limit. ++ * to the guest based upon the vfio DMA limit. This is applicable for ++ * devices that are guaranteed to not use relaxed translation. If the ++ * device is capable of relaxed translation then we must advertise the ++ * full aperture. In this case, if translation is used then we will ++ * rely on the vfio DMA limit counting and use RPCIT CC1 / status 16 ++ * to request that the guest free DMA mappings as necessary. 
+ */ +- vfio_size = pbdev->iommu->max_dma_limit << TARGET_PAGE_BITS; +- if (vfio_size > 0 && vfio_size < cap->end_dma - cap->start_dma + 1) { +- pbdev->zpci_fn.edma = cap->start_dma + vfio_size - 1; ++ if (!pbdev->rtr_avail) { ++ vfio_size = pbdev->iommu->max_dma_limit << TARGET_PAGE_BITS; ++ if (vfio_size > 0 && vfio_size < cap->end_dma - cap->start_dma + 1) { ++ pbdev->zpci_fn.edma = cap->start_dma + vfio_size - 1; ++ } + } + } + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index 312e8f18aa..77a1bde71e 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -1348,8 +1348,13 @@ static void ccw_rhel_machine_9_6_0_instance_options(MachineState *machine) + + static void ccw_rhel_machine_9_6_0_class_options(MachineClass *mc) + { ++ static GlobalProperty compat[] = { ++ { TYPE_S390_PCI_DEVICE, "relaxed-translation", "off", }, ++ }; ++ + /* NB: remember to move this line to the *latest* RHEL 9 machine */ + compat_props_add(mc->compat_props, hw_compat_rhel_9, hw_compat_rhel_9_len); ++ compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat)); + } + DEFINE_CCW_MACHINE_AS_LATEST(9, 6, 0); + +diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h +index 2c43ea123f..04944d4fed 100644 +--- a/include/hw/s390x/s390-pci-bus.h ++++ b/include/hw/s390x/s390-pci-bus.h +@@ -277,6 +277,7 @@ struct S390PCIIOMMU { + AddressSpace as; + MemoryRegion mr; + IOMMUMemoryRegion iommu_mr; ++ MemoryRegion *dm_mr; + bool enabled; + uint64_t g_iota; + uint64_t pba; +@@ -362,6 +363,7 @@ struct S390PCIBusDevice { + bool interp; + bool forwarding_assist; + bool aif; ++ bool rtr_avail; + QTAILQ_ENTRY(S390PCIBusDevice) link; + }; + +@@ -389,6 +391,7 @@ int pci_chsc_sei_nt2_have_event(void); + void s390_pci_sclp_configure(SCCB *sccb); + void s390_pci_sclp_deconfigure(SCCB *sccb); + void s390_pci_iommu_enable(S390PCIIOMMU *iommu); ++void s390_pci_iommu_direct_map_enable(S390PCIIOMMU *iommu); + void 
s390_pci_iommu_disable(S390PCIIOMMU *iommu); + void s390_pci_generate_error_event(uint16_t pec, uint32_t fh, uint32_t fid, + uint64_t faddr, uint32_t e); +-- +2.48.1 + diff --git a/SOURCES/kvm-s390x-pci-indicate-QEMU-supports-relaxed-translation.patch b/SOURCES/kvm-s390x-pci-indicate-QEMU-supports-relaxed-translation.patch new file mode 100644 index 0000000..918fb63 --- /dev/null +++ b/SOURCES/kvm-s390x-pci-indicate-QEMU-supports-relaxed-translation.patch @@ -0,0 +1,72 @@ +From 13e8ddbd282da692c8199a6cb9ca847334089e29 Mon Sep 17 00:00:00 2001 +From: Christoph Schlameuss +Date: Thu, 12 Jun 2025 11:48:41 +0200 +Subject: [PATCH 02/16] s390x/pci: indicate QEMU supports relaxed translation + for passthrough +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Christoph Schlameuss +RH-MergeRequest: 376: Draft: KVM: Performance Enhanced Refresh PCI Translation +RH-Jira: RHEL-11430 +RH-Acked-by: Thomas Huth +RH-Acked-by: Cédric Le Goater +RH-Commit: [2/2] afd514268347d0b434a60d7c6c09d20b84e5d902 (cschlame/qemu-kvm) + +JIRA: https://issues.redhat.com/browse/RHEL-11430 + +commit d9b5dfc7122559e5b5959ecf534788b90c3dd102 +Author: Matthew Rosato +Date: Wed Feb 26 16:00:13 2025 -0500 + + s390x/pci: indicate QEMU supports relaxed translation for passthrough + + Specifying this bit in the guest CLP response indicates that the guest + can optionally choose to skip translation and instead use + identity-mapped operations. 
+ + Tested-by: Niklas Schnelle + Reviewed-by: Niklas Schnelle + Signed-off-by: Matthew Rosato + Message-ID: <20250226210013.238349-3-mjrosato@linux.ibm.com> + Signed-off-by: Thomas Huth + +Signed-off-by: Christoph Schlameuss +--- + hw/s390x/s390-pci-vfio.c | 5 ++++- + include/hw/s390x/s390-pci-clp.h | 1 + + 2 files changed, 5 insertions(+), 1 deletion(-) + +diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c +index 443e222912..6236ac7f1e 100644 +--- a/hw/s390x/s390-pci-vfio.c ++++ b/hw/s390x/s390-pci-vfio.c +@@ -238,8 +238,11 @@ static void s390_pci_read_group(S390PCIBusDevice *pbdev, + pbdev->pci_group = s390_group_create(pbdev->zpci_fn.pfgid, start_gid); + + resgrp = &pbdev->pci_group->zpci_group; ++ if (pbdev->rtr_avail) { ++ resgrp->fr |= CLP_RSP_QPCIG_MASK_RTR; ++ } + if (cap->flags & VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH) { +- resgrp->fr = 1; ++ resgrp->fr |= CLP_RSP_QPCIG_MASK_REFRESH; + } + resgrp->dasm = cap->dasm; + resgrp->msia = cap->msi_addr; +diff --git a/include/hw/s390x/s390-pci-clp.h b/include/hw/s390x/s390-pci-clp.h +index 03b7f9ba5f..6a635d693b 100644 +--- a/include/hw/s390x/s390-pci-clp.h ++++ b/include/hw/s390x/s390-pci-clp.h +@@ -158,6 +158,7 @@ typedef struct ClpRspQueryPciGrp { + #define CLP_RSP_QPCIG_MASK_NOI 0xfff + uint16_t i; + uint8_t version; ++#define CLP_RSP_QPCIG_MASK_RTR 0x20 + #define CLP_RSP_QPCIG_MASK_FRAME 0x2 + #define CLP_RSP_QPCIG_MASK_REFRESH 0x1 + uint8_t fr; +-- +2.48.1 + diff --git a/SOURCES/kvm-s390x-pv-prepare-for-memory-devices.patch b/SOURCES/kvm-s390x-pv-prepare-for-memory-devices.patch new file mode 100644 index 0000000..16ff2c8 --- /dev/null +++ b/SOURCES/kvm-s390x-pv-prepare-for-memory-devices.patch @@ -0,0 +1,46 @@ +From 9d5420c4370b74d60f082f2aa1225b19150ee629 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 19 Dec 2024 15:41:12 +0100 +Subject: [PATCH 20/26] s390x/pv: prepare for memory devices + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: 
RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [20/26] cdbe71168b9afa9657b94f1e7500568314c707a8 (thuth/qemu-kvm-cs) + +Let's avoid checking for the maxram_size, and instead rely on the memory +limit determined in s390_memory_init(), that might be larger than +maxram_size, for example due to alignment purposes. + +This check now correctly mimics what the kernel will check in +kvm_s390_pv_set_aside(), whereby a VM <= 2 GiB VM would end up using +a segment type ASCE. + +Message-ID: <20241219144115.2820241-12-david@redhat.com> +Acked-by: Michael S. Tsirkin +Reviewed-by: Nina Schoetterl-Glausch +Signed-off-by: David Hildenbrand +(cherry picked from commit a056332e732110c8ef0d40ffd49bd03afc2f04ca) +Signed-off-by: Thomas Huth +--- + target/s390x/kvm/pv.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/target/s390x/kvm/pv.c b/target/s390x/kvm/pv.c +index 424cce75ca..fa66607e7b 100644 +--- a/target/s390x/kvm/pv.c ++++ b/target/s390x/kvm/pv.c +@@ -133,7 +133,7 @@ bool s390_pv_vm_try_disable_async(S390CcwMachineState *ms) + * If the feature is not present or if the VM is not larger than 2 GiB, + * KVM_PV_ASYNC_CLEANUP_PREPARE fill fail; no point in attempting it. 
+ */ +- if ((MACHINE(ms)->ram_size <= 2 * GiB) || ++ if (s390_get_memory_limit(ms) <= 2 * GiB || + !kvm_check_extension(kvm_state, KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) { + return false; + } +-- +2.48.1 + diff --git a/SOURCES/kvm-s390x-remember-the-maximum-page-size.patch b/SOURCES/kvm-s390x-remember-the-maximum-page-size.patch new file mode 100644 index 0000000..b2fd41e --- /dev/null +++ b/SOURCES/kvm-s390x-remember-the-maximum-page-size.patch @@ -0,0 +1,107 @@ +From 5a311d410bca4a5530a51c0b789ce8525d2d0653 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 19 Dec 2024 15:41:13 +0100 +Subject: [PATCH 21/26] s390x: remember the maximum page size + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [21/26] 3b97c555b153d42e4fcb27dbb65fbf3edac622a4 (thuth/qemu-kvm-cs) + +Let's remember the value (successfully) set via s390_set_max_pagesize(). +This will be helpful to reject hotplugged memory devices that would exceed +this initially set page size. + +Handle it just like how we handle s390_get_memory_limit(), storing it in +the machine, and moving the handling to machine code. + +Message-ID: <20241219144115.2820241-13-david@redhat.com> +Acked-by: Michael S. 
Tsirkin +Reviewed-by: Thomas Huth +Signed-off-by: David Hildenbrand +(cherry picked from commit df2ac211a62e6ced7f1495b634fa6f78962f2321) +Signed-off-by: Thomas Huth +--- + hw/s390x/s390-virtio-ccw.c | 12 +++++++++++- + include/hw/s390x/s390-virtio-ccw.h | 1 + + target/s390x/cpu-sysemu.c | 7 ------- + target/s390x/cpu.h | 1 - + 4 files changed, 12 insertions(+), 9 deletions(-) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index 824c73536a..bd05a22b4e 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -143,6 +143,16 @@ static void s390_set_memory_limit(S390CcwMachineState *s390ms, + s390ms->memory_limit = new_limit; + } + ++static void s390_set_max_pagesize(S390CcwMachineState *s390ms, ++ uint64_t pagesize) ++{ ++ assert(!s390ms->max_pagesize && pagesize); ++ if (kvm_enabled()) { ++ kvm_s390_set_max_pagesize(pagesize, &error_fatal); ++ } ++ s390ms->max_pagesize = pagesize; ++} ++ + static void s390_memory_init(MachineState *machine) + { + S390CcwMachineState *s390ms = S390_CCW_MACHINE(machine); +@@ -191,7 +201,7 @@ static void s390_memory_init(MachineState *machine) + * Configure the maximum page size. As no memory devices were created + * yet, this is the page size of initial memory only. 
+ */ +- s390_set_max_pagesize(qemu_maxrampagesize(), &error_fatal); ++ s390_set_max_pagesize(s390ms, qemu_maxrampagesize()); + /* Initialize storage key device */ + s390_skeys_init(); + /* Initialize storage attributes device */ +diff --git a/include/hw/s390x/s390-virtio-ccw.h b/include/hw/s390x/s390-virtio-ccw.h +index de04336c5a..599740a998 100644 +--- a/include/hw/s390x/s390-virtio-ccw.h ++++ b/include/hw/s390x/s390-virtio-ccw.h +@@ -30,6 +30,7 @@ struct S390CcwMachineState { + bool pv; + uint8_t loadparm[8]; + uint64_t memory_limit; ++ uint64_t max_pagesize; + + SCLPDevice *sclp; + }; +diff --git a/target/s390x/cpu-sysemu.c b/target/s390x/cpu-sysemu.c +index 3118a25fee..706a5c53e2 100644 +--- a/target/s390x/cpu-sysemu.c ++++ b/target/s390x/cpu-sysemu.c +@@ -255,13 +255,6 @@ unsigned int s390_cpu_set_state(uint8_t cpu_state, S390CPU *cpu) + return s390_count_running_cpus(); + } + +-void s390_set_max_pagesize(uint64_t pagesize, Error **errp) +-{ +- if (kvm_enabled()) { +- kvm_s390_set_max_pagesize(pagesize, errp); +- } +-} +- + void s390_cmma_reset(void) + { + if (kvm_enabled()) { +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index ecaf3191d2..9770a62ac9 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -881,7 +881,6 @@ static inline void s390_do_cpu_load_normal(CPUState *cs, run_on_cpu_data arg) + + /* cpu.c */ + void s390_crypto_reset(void); +-void s390_set_max_pagesize(uint64_t pagesize, Error **errp); + void s390_cmma_reset(void); + void s390_enable_css_support(S390CPU *cpu); + void s390_do_cpu_set_diag318(CPUState *cs, run_on_cpu_data arg); +-- +2.48.1 + diff --git a/SOURCES/kvm-s390x-rename-s390-virtio-hcall-to-s390-hypercall.patch b/SOURCES/kvm-s390x-rename-s390-virtio-hcall-to-s390-hypercall.patch new file mode 100644 index 0000000..90bb4c3 --- /dev/null +++ b/SOURCES/kvm-s390x-rename-s390-virtio-hcall-to-s390-hypercall.patch @@ -0,0 +1,113 @@ +From 2fbdf7e3cf23daea470aaa4a29e16641feb76f3c Mon Sep 17 00:00:00 2001 +From: David 
Hildenbrand +Date: Thu, 19 Dec 2024 15:41:05 +0100 +Subject: [PATCH 13/26] s390x: rename s390-virtio-hcall* to s390-hypercall* + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [13/26] 3c1ef3cbb137517b306871f0a88a61a59740af5a (thuth/qemu-kvm-cs) + +Let's make it clearer that we are talking about general +QEMU/KVM-specific hypercalls. + +Message-ID: <20241219144115.2820241-5-david@redhat.com> +Acked-by: Michael S. Tsirkin +Reviewed-by: Thomas Huth +Signed-off-by: David Hildenbrand +(cherry picked from commit 85489fc3652d0c4433c940f1a80a952e8cb5d3cb) +Signed-off-by: Thomas Huth +--- + hw/s390x/meson.build | 2 +- + hw/s390x/{s390-virtio-hcall.c => s390-hypercall.c} | 2 +- + hw/s390x/{s390-virtio-hcall.h => s390-hypercall.h} | 6 +++--- + target/s390x/kvm/kvm.c | 2 +- + target/s390x/tcg/misc_helper.c | 2 +- + 5 files changed, 7 insertions(+), 7 deletions(-) + rename hw/s390x/{s390-virtio-hcall.c => s390-hypercall.c} (97%) + rename hw/s390x/{s390-virtio-hcall.h => s390-hypercall.h} (86%) + +diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build +index d6c8c33915..e344a3bd8c 100644 +--- a/hw/s390x/meson.build ++++ b/hw/s390x/meson.build +@@ -29,7 +29,7 @@ s390x_ss.add(when: 'CONFIG_TCG', if_true: files( + )) + s390x_ss.add(when: 'CONFIG_S390_CCW_VIRTIO', if_true: files( + 's390-virtio-ccw.c', +- 's390-virtio-hcall.c', ++ 's390-hypercall.c', + )) + s390x_ss.add(when: 'CONFIG_TERMINAL3270', if_true: files('3270-ccw.c')) + s390x_ss.add(when: 'CONFIG_VFIO', if_true: files('s390-pci-vfio.c')) +diff --git a/hw/s390x/s390-virtio-hcall.c b/hw/s390x/s390-hypercall.c +similarity index 97% +rename from hw/s390x/s390-virtio-hcall.c +rename to hw/s390x/s390-hypercall.c +index 5fb78a719e..f816c2b1ef 100644 +--- a/hw/s390x/s390-virtio-hcall.c ++++ b/hw/s390x/s390-hypercall.c +@@ -12,7 +12,7 @@ + #include "qemu/osdep.h" + #include "cpu.h" + #include 
"hw/boards.h" +-#include "hw/s390x/s390-virtio-hcall.h" ++#include "hw/s390x/s390-hypercall.h" + #include "hw/s390x/ioinst.h" + #include "hw/s390x/css.h" + #include "virtio-ccw.h" +diff --git a/hw/s390x/s390-virtio-hcall.h b/hw/s390x/s390-hypercall.h +similarity index 86% +rename from hw/s390x/s390-virtio-hcall.h +rename to hw/s390x/s390-hypercall.h +index dca456b926..2fa81dbfdd 100644 +--- a/hw/s390x/s390-virtio-hcall.h ++++ b/hw/s390x/s390-hypercall.h +@@ -9,8 +9,8 @@ + * directory. + */ + +-#ifndef HW_S390_VIRTIO_HCALL_H +-#define HW_S390_VIRTIO_HCALL_H ++#ifndef HW_S390_HYPERCALL_H ++#define HW_S390_HYPERCALL_H + + #include "cpu.h" + +@@ -21,4 +21,4 @@ + + void handle_diag_500(S390CPU *cpu, uintptr_t ra); + +-#endif /* HW_S390_VIRTIO_HCALL_H */ ++#endif /* HW_S390_HYPERCALL_H */ +diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c +index 42d6a54126..afc8d570c9 100644 +--- a/target/s390x/kvm/kvm.c ++++ b/target/s390x/kvm/kvm.c +@@ -49,7 +49,7 @@ + #include "hw/s390x/ebcdic.h" + #include "exec/memattrs.h" + #include "hw/s390x/s390-virtio-ccw.h" +-#include "hw/s390x/s390-virtio-hcall.h" ++#include "hw/s390x/s390-hypercall.h" + #include "target/s390x/kvm/pv.h" + #include CONFIG_DEVICES + +diff --git a/target/s390x/tcg/misc_helper.c b/target/s390x/tcg/misc_helper.c +index 2b4310003b..b726a95352 100644 +--- a/target/s390x/tcg/misc_helper.c ++++ b/target/s390x/tcg/misc_helper.c +@@ -36,7 +36,7 @@ + #include "sysemu/cpus.h" + #include "sysemu/sysemu.h" + #include "hw/s390x/ebcdic.h" +-#include "hw/s390x/s390-virtio-hcall.h" ++#include "hw/s390x/s390-hypercall.h" + #include "hw/s390x/sclp.h" + #include "hw/s390x/s390_flic.h" + #include "hw/s390x/ioinst.h" +-- +2.48.1 + diff --git a/SOURCES/kvm-s390x-s390-hypercall-introduce-DIAG500-STORAGE_LIMIT.patch b/SOURCES/kvm-s390x-s390-hypercall-introduce-DIAG500-STORAGE_LIMIT.patch new file mode 100644 index 0000000..af613d5 --- /dev/null +++ b/SOURCES/kvm-s390x-s390-hypercall-introduce-DIAG500-STORAGE_LIMIT.patch @@ 
-0,0 +1,99 @@ +From 86417a068f24964422d4fd5ea301d70a0f8142d2 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 19 Dec 2024 15:41:08 +0100 +Subject: [PATCH 16/26] s390x/s390-hypercall: introduce DIAG500 STORAGE_LIMIT + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [16/26] c1c341227388735450ddbba0201e7523e0658c07 (thuth/qemu-kvm-cs) + +A guest OS that supports memory hotplug / memory devices must during +boot be aware of the maximum possible physical memory address that it might +have to handle at a later stage during its runtime. + +For example, the maximum possible memory address might be required to +prepare the kernel virtual address space accordingly (e.g., select page +table hierarchy depth). + +On s390x there is currently no such mechanism that is compatible with +paravirtualized memory devices, because the whole SCLP interface was +designed around the idea of "storage increments" and "standby memory". +Paravirtualized memory devices we want to support, such as virtio-mem, have +no intersection with any of that, but could co-exist with them in the +future if ever needed. + +In particular, a guest OS must never detect and use device memory +without the help of a proper device driver. Device memory must not be +exposed in any firmware-provided memory map (SCLP or diag260 on s390x). +For this reason, these memory devices will be placed in memory *above* +the "maximum storage increment" exposed via SCLP. + +Let's provide a new diag500 subcode to query the memory limit determined in +s390_memory_init(). + +Message-ID: <20241219144115.2820241-8-david@redhat.com> +Acked-by: Michael S. 
Tsirkin +Reviewed-by: Thomas Huth +Signed-off-by: David Hildenbrand +(cherry picked from commit f7c168657816486527727d860b73747d41f0c5f6) +Signed-off-by: Thomas Huth +--- + hw/s390x/s390-hypercall.c | 12 +++++++++++- + hw/s390x/s390-hypercall.h | 1 + + 2 files changed, 12 insertions(+), 1 deletion(-) + +diff --git a/hw/s390x/s390-hypercall.c b/hw/s390x/s390-hypercall.c +index f816c2b1ef..ac1b08b2cd 100644 +--- a/hw/s390x/s390-hypercall.c ++++ b/hw/s390x/s390-hypercall.c +@@ -11,7 +11,7 @@ + + #include "qemu/osdep.h" + #include "cpu.h" +-#include "hw/boards.h" ++#include "hw/s390x/s390-virtio-ccw.h" + #include "hw/s390x/s390-hypercall.h" + #include "hw/s390x/ioinst.h" + #include "hw/s390x/css.h" +@@ -57,6 +57,13 @@ static int handle_virtio_ccw_notify(uint64_t subch_id, uint64_t data) + return 0; + } + ++static uint64_t handle_storage_limit(void) ++{ ++ S390CcwMachineState *s390ms = S390_CCW_MACHINE(qdev_get_machine()); ++ ++ return s390_get_memory_limit(s390ms) - 1; ++} ++ + void handle_diag_500(S390CPU *cpu, uintptr_t ra) + { + CPUS390XState *env = &cpu->env; +@@ -69,6 +76,9 @@ void handle_diag_500(S390CPU *cpu, uintptr_t ra) + case DIAG500_VIRTIO_CCW_NOTIFY: + env->regs[2] = handle_virtio_ccw_notify(env->regs[2], env->regs[3]); + break; ++ case DIAG500_STORAGE_LIMIT: ++ env->regs[2] = handle_storage_limit(); ++ break; + default: + s390_program_interrupt(env, PGM_SPECIFICATION, ra); + } +diff --git a/hw/s390x/s390-hypercall.h b/hw/s390x/s390-hypercall.h +index 2fa81dbfdd..4f07209128 100644 +--- a/hw/s390x/s390-hypercall.h ++++ b/hw/s390x/s390-hypercall.h +@@ -18,6 +18,7 @@ + #define DIAG500_VIRTIO_RESET 1 /* legacy */ + #define DIAG500_VIRTIO_SET_STATUS 2 /* legacy */ + #define DIAG500_VIRTIO_CCW_NOTIFY 3 /* KVM_S390_VIRTIO_CCW_NOTIFY */ ++#define DIAG500_STORAGE_LIMIT 4 + + void handle_diag_500(S390CPU *cpu, uintptr_t ra); + +-- +2.48.1 + diff --git a/SOURCES/kvm-s390x-s390-skeys-prepare-for-memory-devices.patch 
b/SOURCES/kvm-s390x-s390-skeys-prepare-for-memory-devices.patch new file mode 100644 index 0000000..885ebf1 --- /dev/null +++ b/SOURCES/kvm-s390x-s390-skeys-prepare-for-memory-devices.patch @@ -0,0 +1,56 @@ +From 53d1b43699c6b30583f41a18a33c28893718aeac Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 19 Dec 2024 15:41:10 +0100 +Subject: [PATCH 18/26] s390x/s390-skeys: prepare for memory devices + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [18/26] 47edda0eeb6d5932f81633f2d9d294b1ca5f413c (thuth/qemu-kvm-cs) + +With memory devices, we will have storage keys for memory that +exceeds the initial ram size. + +The TODO already states that current handling is suboptimal, +but we won't worry about improving that (TCG-only) thing for now. + +Message-ID: <20241219144115.2820241-10-david@redhat.com> +Acked-by: Michael S. Tsirkin +Reviewed-by: Thomas Huth +Signed-off-by: David Hildenbrand +(cherry picked from commit d1e3c2ac41b3f73708682e4e8212c32ad35013b9) +Signed-off-by: Thomas Huth +--- + hw/s390x/s390-skeys.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/hw/s390x/s390-skeys.c b/hw/s390x/s390-skeys.c +index bf22d6863e..e4297b3b8a 100644 +--- a/hw/s390x/s390-skeys.c ++++ b/hw/s390x/s390-skeys.c +@@ -11,7 +11,7 @@ + + #include "qemu/osdep.h" + #include "qemu/units.h" +-#include "hw/boards.h" ++#include "hw/s390x/s390-virtio-ccw.h" + #include "hw/qdev-properties.h" + #include "hw/s390x/storage-keys.h" + #include "qapi/error.h" +@@ -251,9 +251,9 @@ static bool qemu_s390_enable_skeys(S390SKeysState *ss) + * g_once_init_enter() is good enough. 
+ */ + if (g_once_init_enter(&initialized)) { +- MachineState *machine = MACHINE(qdev_get_machine()); ++ S390CcwMachineState *s390ms = S390_CCW_MACHINE(qdev_get_machine()); + +- skeys->key_count = machine->ram_size / TARGET_PAGE_SIZE; ++ skeys->key_count = s390_get_memory_limit(s390ms) / TARGET_PAGE_SIZE; + skeys->keydata = g_malloc0(skeys->key_count); + g_once_init_leave(&initialized, 1); + } +-- +2.48.1 + diff --git a/SOURCES/kvm-s390x-s390-stattrib-kvm-prepare-for-memory-devices-a.patch b/SOURCES/kvm-s390x-s390-stattrib-kvm-prepare-for-memory-devices-a.patch new file mode 100644 index 0000000..e8102a4 --- /dev/null +++ b/SOURCES/kvm-s390x-s390-stattrib-kvm-prepare-for-memory-devices-a.patch @@ -0,0 +1,155 @@ +From 1195c91d10892a888870248fd881612955b9e1eb Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 19 Dec 2024 15:41:09 +0100 +Subject: [PATCH 17/26] s390x/s390-stattrib-kvm: prepare for memory devices and + sparse memory layouts + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [17/26] 799aa7b2b9cc2a948e9f391bc0ecf739254c78b1 (thuth/qemu-kvm-cs) + +With memory devices, we will have storage attributes for memory that +exceeds the initial ram size. Further, we can easily have memory holes, +for which there (currently) are no storage attributes. + +In particular, with memory holes, KVM_S390_SET_CMMA_BITS will fail to set +some storage attributes. + +So let's do it like we handle storage keys migration, relying on +guest_phys_blocks_append(). However, in contrast to storage key +migration, we will handle it on the migration destination. + +This is a preparation for virtio-mem support. 
Note that ever since the +"early migration" feature was added (x-early-migration), the state +of device blocks (plugged/unplugged) is migrated early such that +guest_phys_blocks_append() will properly consider all currently plugged +memory blocks and skip any unplugged ones. + +In the future, we should try getting rid of the large temporary buffer +and also not send any attributes for any memory holes, just so they +get ignored on the destination. + +Message-ID: <20241219144115.2820241-9-david@redhat.com> +Acked-by: Michael S. Tsirkin +Reviewed-by: Thomas Huth +Signed-off-by: David Hildenbrand +(cherry picked from commit 241e6b2d27b090b17cda5b011b2064544b0c458b) +Signed-off-by: Thomas Huth +--- + hw/s390x/s390-stattrib-kvm.c | 67 +++++++++++++++++++++++------------- + 1 file changed, 43 insertions(+), 24 deletions(-) + +diff --git a/hw/s390x/s390-stattrib-kvm.c b/hw/s390x/s390-stattrib-kvm.c +index eeaa811098..33ec91422a 100644 +--- a/hw/s390x/s390-stattrib-kvm.c ++++ b/hw/s390x/s390-stattrib-kvm.c +@@ -10,11 +10,12 @@ + */ + + #include "qemu/osdep.h" +-#include "hw/boards.h" ++#include "hw/s390x/s390-virtio-ccw.h" + #include "migration/qemu-file.h" + #include "hw/s390x/storage-attributes.h" + #include "qemu/error-report.h" + #include "sysemu/kvm.h" ++#include "sysemu/memory_mapping.h" + #include "exec/ram_addr.h" + #include "kvm/kvm_s390x.h" + #include "qapi/error.h" +@@ -84,8 +85,8 @@ static int kvm_s390_stattrib_set_stattr(S390StAttribState *sa, + uint8_t *values) + { + KVMS390StAttribState *sas = KVM_S390_STATTRIB(sa); +- MachineState *machine = MACHINE(qdev_get_machine()); +- unsigned long max = machine->ram_size / TARGET_PAGE_SIZE; ++ S390CcwMachineState *s390ms = S390_CCW_MACHINE(qdev_get_machine()); ++ unsigned long max = s390_get_memory_limit(s390ms) / TARGET_PAGE_SIZE; + + if (start_gfn + count > max) { + error_report("Out of memory bounds when setting storage attributes"); +@@ -103,39 +104,57 @@ static int kvm_s390_stattrib_set_stattr(S390StAttribState 
*sa, + static void kvm_s390_stattrib_synchronize(S390StAttribState *sa) + { + KVMS390StAttribState *sas = KVM_S390_STATTRIB(sa); +- MachineState *machine = MACHINE(qdev_get_machine()); +- unsigned long max = machine->ram_size / TARGET_PAGE_SIZE; +- /* We do not need to reach the maximum buffer size allowed */ +- unsigned long cx, len = KVM_S390_SKEYS_MAX / 2; ++ S390CcwMachineState *s390ms = S390_CCW_MACHINE(qdev_get_machine()); ++ unsigned long max = s390_get_memory_limit(s390ms) / TARGET_PAGE_SIZE; ++ unsigned long start_gfn, end_gfn, pages; ++ GuestPhysBlockList guest_phys_blocks; ++ GuestPhysBlock *block; + int r; + struct kvm_s390_cmma_log clog = { + .flags = 0, + .mask = ~0ULL, + }; + +- if (sas->incoming_buffer) { +- for (cx = 0; cx + len <= max; cx += len) { +- clog.start_gfn = cx; +- clog.count = len; +- clog.values = (uint64_t)(sas->incoming_buffer + cx); +- r = kvm_vm_ioctl(kvm_state, KVM_S390_SET_CMMA_BITS, &clog); +- if (r) { +- error_report("KVM_S390_SET_CMMA_BITS failed: %s", strerror(-r)); +- return; +- } +- } +- if (cx < max) { +- clog.start_gfn = cx; +- clog.count = max - cx; +- clog.values = (uint64_t)(sas->incoming_buffer + cx); ++ if (!sas->incoming_buffer) { ++ return; ++ } ++ guest_phys_blocks_init(&guest_phys_blocks); ++ guest_phys_blocks_append(&guest_phys_blocks); ++ ++ QTAILQ_FOREACH(block, &guest_phys_blocks.head, next) { ++ assert(QEMU_IS_ALIGNED(block->target_start, TARGET_PAGE_SIZE)); ++ assert(QEMU_IS_ALIGNED(block->target_end, TARGET_PAGE_SIZE)); ++ ++ start_gfn = block->target_start / TARGET_PAGE_SIZE; ++ end_gfn = block->target_end / TARGET_PAGE_SIZE; ++ ++ while (start_gfn < end_gfn) { ++ /* Don't exceed the maximum buffer size. */ ++ pages = MIN(end_gfn - start_gfn, KVM_S390_SKEYS_MAX / 2); ++ ++ /* ++ * If we ever get guest physical memory beyond the configured ++ * memory limit, something went very wrong. 
++ */ ++ assert(start_gfn + pages <= max); ++ ++ clog.start_gfn = start_gfn; ++ clog.count = pages; ++ clog.values = (uint64_t)(sas->incoming_buffer + start_gfn); + r = kvm_vm_ioctl(kvm_state, KVM_S390_SET_CMMA_BITS, &clog); + if (r) { + error_report("KVM_S390_SET_CMMA_BITS failed: %s", strerror(-r)); ++ goto out; + } ++ ++ start_gfn += pages; + } +- g_free(sas->incoming_buffer); +- sas->incoming_buffer = NULL; + } ++ ++out: ++ guest_phys_blocks_free(&guest_phys_blocks); ++ g_free(sas->incoming_buffer); ++ sas->incoming_buffer = NULL; + } + + static int kvm_s390_stattrib_set_migrationmode(S390StAttribState *sa, bool val, +-- +2.48.1 + diff --git a/SOURCES/kvm-s390x-s390-virtio-ccw-don-t-crash-on-weird-RAM-sizes.patch b/SOURCES/kvm-s390x-s390-virtio-ccw-don-t-crash-on-weird-RAM-sizes.patch new file mode 100644 index 0000000..fa99566 --- /dev/null +++ b/SOURCES/kvm-s390x-s390-virtio-ccw-don-t-crash-on-weird-RAM-sizes.patch @@ -0,0 +1,63 @@ +From 4ee3076ac566622929f9410636483c4f0b2da967 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 19 Dec 2024 15:41:02 +0100 +Subject: [PATCH 10/26] s390x/s390-virtio-ccw: don't crash on weird RAM sizes + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [10/26] 55738da52f3cf4746bee2b17780a10720fa05863 (thuth/qemu-kvm-cs) + +KVM is not happy when starting a VM with weird RAM sizes: + + # qemu-system-s390x --enable-kvm --nographic -m 1234K + qemu-system-s390x: kvm_set_user_memory_region: KVM_SET_USER_MEMORY_REGION + failed, slot=0, start=0x0, size=0x244000: Invalid argument + kvm_set_phys_mem: error registering slot: Invalid argument + Aborted (core dumped) + +Let's handle that in a better way by rejecting such weird RAM sizes +right from the start: + + # qemu-system-s390x --enable-kvm --nographic -m 1234K + qemu-system-s390x: ram size must be multiples of 1 MiB + +Message-ID: 
<20241219144115.2820241-2-david@redhat.com> +Acked-by: Michael S. Tsirkin +Reviewed-by: Eric Farman +Reviewed-by: Thomas Huth +Acked-by: Janosch Frank +Signed-off-by: David Hildenbrand +(cherry picked from commit 14e568ab4836347481af2e334009c385f456a734) +Signed-off-by: Thomas Huth +--- + hw/s390x/s390-virtio-ccw.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index 94cad1705b..82ded9666c 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -180,6 +180,17 @@ static void s390_memory_init(MemoryRegion *ram) + { + MemoryRegion *sysmem = get_system_memory(); + ++ if (!QEMU_IS_ALIGNED(memory_region_size(ram), 1 * MiB)) { ++ /* ++ * SCLP cannot possibly expose smaller granularity right now and KVM ++ * cannot handle smaller granularity. As we don't support NUMA, the ++ * region size directly corresponds to machine->ram_size, and the region ++ * is a single RAM memory region. ++ */ ++ error_report("ram size must be multiples of 1 MiB"); ++ exit(EXIT_FAILURE); ++ } ++ + /* allocate RAM for core */ + memory_region_add_subregion(sysmem, 0, ram); + +-- +2.48.1 + diff --git a/SOURCES/kvm-s390x-s390-virtio-ccw-move-setting-the-maximum-guest.patch b/SOURCES/kvm-s390x-s390-virtio-ccw-move-setting-the-maximum-guest.patch new file mode 100644 index 0000000..5a498d6 --- /dev/null +++ b/SOURCES/kvm-s390x-s390-virtio-ccw-move-setting-the-maximum-guest.patch @@ -0,0 +1,140 @@ +From 9ec2d356210f1e66f50519cc4d58633a13db9004 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 19 Dec 2024 15:41:06 +0100 +Subject: [PATCH 14/26] s390x/s390-virtio-ccw: move setting the maximum guest + size from sclp to machine code + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [14/26] a5970c1c6d8d09a473a25a7eee533ec3a6711ec8 (thuth/qemu-kvm-cs) + +Nowadays, it feels 
more natural to have that code located in +s390_memory_init(), where we also have direct access to the machine +object. + +While at it, use the actual RAM size, not the maximum RAM size which +cannot currently be reached without support for any memory devices. +Consequently update s390_pv_vm_try_disable_async() to rely on the RAM size +as well, to avoid temporary issues while we further rework that +handling. + +set_memory_limit() is temporary, we'll merge it with +s390_set_memory_limit() next. + +Message-ID: <20241219144115.2820241-6-david@redhat.com> +Acked-by: Michael S. Tsirkin +Reviewed-by: Thomas Huth +Signed-off-by: David Hildenbrand +(cherry picked from commit 3c6fb557d295949bea291c3bf88ee9c83392e78c) +Signed-off-by: Thomas Huth +--- + hw/s390x/s390-virtio-ccw.c | 28 ++++++++++++++++++++++++---- + hw/s390x/sclp.c | 11 ----------- + target/s390x/kvm/pv.c | 2 +- + 3 files changed, 25 insertions(+), 16 deletions(-) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index d47e99028e..248ac28d20 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -121,11 +121,29 @@ static void subsystem_reset(void) + } + } + +-static void s390_memory_init(MemoryRegion *ram) ++static void set_memory_limit(uint64_t new_limit) ++{ ++ uint64_t hw_limit; ++ int ret; ++ ++ ret = s390_set_memory_limit(new_limit, &hw_limit); ++ if (ret == -E2BIG) { ++ error_report("host supports a maximum of %" PRIu64 " GB", ++ hw_limit / GiB); ++ exit(EXIT_FAILURE); ++ } else if (ret) { ++ error_report("setting the guest size failed"); ++ exit(EXIT_FAILURE); ++ } ++} ++ ++static void s390_memory_init(MachineState *machine) + { + MemoryRegion *sysmem = get_system_memory(); ++ MemoryRegion *ram = machine->ram; ++ uint64_t ram_size = memory_region_size(ram); + +- if (!QEMU_IS_ALIGNED(memory_region_size(ram), 1 * MiB)) { ++ if (!QEMU_IS_ALIGNED(ram_size, 1 * MiB)) { + /* + * SCLP cannot possibly expose smaller granularity right now and KVM + * cannot handle 
smaller granularity. As we don't support NUMA, the +@@ -136,7 +154,9 @@ static void s390_memory_init(MemoryRegion *ram) + exit(EXIT_FAILURE); + } + +- /* allocate RAM for core */ ++ set_memory_limit(ram_size); ++ ++ /* Map the initial memory. Must happen after setting the memory limit. */ + memory_region_add_subregion(sysmem, 0, ram); + + /* +@@ -211,7 +231,7 @@ static void ccw_init(MachineState *machine) + qdev_realize_and_unref(DEVICE(ms->sclp), NULL, &error_fatal); + + /* init memory + setup max page size. Required for the CPU model */ +- s390_memory_init(machine->ram); ++ s390_memory_init(machine); + + /* init CPUs (incl. CPU model) early so s390_has_feature() works */ + s390_init_cpus(machine); +diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index 8757626b5c..73e88ab4eb 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -376,10 +376,7 @@ void sclp_service_interrupt(uint32_t sccb) + /* qemu object creation and initialization functions */ + static void sclp_realize(DeviceState *dev, Error **errp) + { +- MachineState *machine = MACHINE(qdev_get_machine()); + SCLPDevice *sclp = SCLP(dev); +- uint64_t hw_limit; +- int ret; + + /* + * qdev_device_add searches the sysbus for TYPE_SCLP_EVENTS_BUS. 
As long +@@ -389,14 +386,6 @@ static void sclp_realize(DeviceState *dev, Error **errp) + if (!sysbus_realize(SYS_BUS_DEVICE(sclp->event_facility), errp)) { + return; + } +- +- ret = s390_set_memory_limit(machine->maxram_size, &hw_limit); +- if (ret == -E2BIG) { +- error_setg(errp, "host supports a maximum of %" PRIu64 " GB", +- hw_limit / GiB); +- } else if (ret) { +- error_setg(errp, "setting the guest size failed"); +- } + } + + static void sclp_memory_init(SCLPDevice *sclp) +diff --git a/target/s390x/kvm/pv.c b/target/s390x/kvm/pv.c +index dde836d21a..424cce75ca 100644 +--- a/target/s390x/kvm/pv.c ++++ b/target/s390x/kvm/pv.c +@@ -133,7 +133,7 @@ bool s390_pv_vm_try_disable_async(S390CcwMachineState *ms) + * If the feature is not present or if the VM is not larger than 2 GiB, + * KVM_PV_ASYNC_CLEANUP_PREPARE fill fail; no point in attempting it. + */ +- if ((MACHINE(ms)->maxram_size <= 2 * GiB) || ++ if ((MACHINE(ms)->ram_size <= 2 * GiB) || + !kvm_check_extension(kvm_state, KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) { + return false; + } +-- +2.48.1 + diff --git a/SOURCES/kvm-s390x-s390-virtio-ccw-prepare-for-memory-devices.patch b/SOURCES/kvm-s390x-s390-virtio-ccw-prepare-for-memory-devices.patch new file mode 100644 index 0000000..17e7e6a --- /dev/null +++ b/SOURCES/kvm-s390x-s390-virtio-ccw-prepare-for-memory-devices.patch @@ -0,0 +1,117 @@ +From 0e7d7bf86fb242c1ea90bf9648fb061626790eda Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 19 Dec 2024 15:41:11 +0100 +Subject: [PATCH 19/26] s390x/s390-virtio-ccw: prepare for memory devices + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [19/26] 2441c8c5f5a06d5ca93188dd44e8a08f06d1722b (thuth/qemu-kvm-cs) + +Let's prepare our address space for memory devices if enabled via +"maxmem" and if we have CONFIG_MEM_DEVICE enabled at all. 
Note that +CONFIG_MEM_DEVICE will be selected automatically once we add support +for devices. + +Just like on other architectures, the region container for memory devices +is placed directly above our initial memory. For now, we only align the +start address of the region up to 1 GiB, but we won't add any additional +space to the region for internal alignment purposes; this can be done in +the future if really required. + +The RAM size returned via SCLP is not modified, as this only +covers initial RAM (and standby memory we don't implement) and not memory +devices; clarify that in the docs of read_SCP_info(). Existing OSes without +support for memory devices will keep working as is, even when memory +devices would be attached to the VM. + +Guest OSs which support memory devices, such as virtio-mem, will +consult diag500(), to find out the maximum possible pfn. Guest OSes that +don't support memory devices, don't have to be changed and will continue +relying on information provided by SCLP. + +There are no remaining maxram_size users in s390x code, and the remaining +ram_size users only care about initial RAM: +* hw/s390x/ipl.c +* hw/s390x/s390-hypercall.c +* hw/s390x/sclp.c +* target/s390x/kvm/pv.c + +Message-ID: <20241219144115.2820241-11-david@redhat.com> +Acked-by: Michael S. 
Tsirkin +Reviewed-by: Thomas Huth +Signed-off-by: David Hildenbrand +(cherry picked from commit 1e86400298cf0fed5f7d49427db477775b859093) +Signed-off-by: Thomas Huth +--- + hw/s390x/s390-virtio-ccw.c | 23 ++++++++++++++++++++++- + hw/s390x/sclp.c | 6 +++++- + 2 files changed, 27 insertions(+), 2 deletions(-) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index f5f147eb92..824c73536a 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -149,6 +149,7 @@ static void s390_memory_init(MachineState *machine) + MemoryRegion *sysmem = get_system_memory(); + MemoryRegion *ram = machine->ram; + uint64_t ram_size = memory_region_size(ram); ++ uint64_t devmem_base, devmem_size; + + if (!QEMU_IS_ALIGNED(ram_size, 1 * MiB)) { + /* +@@ -161,11 +162,31 @@ static void s390_memory_init(MachineState *machine) + exit(EXIT_FAILURE); + } + +- s390_set_memory_limit(s390ms, ram_size); ++ devmem_size = 0; ++ devmem_base = ram_size; ++#ifdef CONFIG_MEM_DEVICE ++ if (machine->ram_size < machine->maxram_size) { ++ ++ /* ++ * Make sure memory devices have a sane default alignment, even ++ * when weird initial memory sizes are specified. ++ */ ++ devmem_base = QEMU_ALIGN_UP(devmem_base, 1 * GiB); ++ devmem_size = machine->maxram_size - machine->ram_size; ++ } ++#endif ++ s390_set_memory_limit(s390ms, devmem_base + devmem_size); + + /* Map the initial memory. Must happen after setting the memory limit. */ + memory_region_add_subregion(sysmem, 0, ram); + ++ /* Initialize address space for memory devices. */ ++#ifdef CONFIG_MEM_DEVICE ++ if (devmem_size) { ++ machine_memory_devices_init(machine, devmem_base, devmem_size); ++ } ++#endif /* CONFIG_MEM_DEVICE */ ++ + /* + * Configure the maximum page size. As no memory devices were created + * yet, this is the page size of initial memory only. 
+diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c +index 73e88ab4eb..5945c9b1d8 100644 +--- a/hw/s390x/sclp.c ++++ b/hw/s390x/sclp.c +@@ -161,7 +161,11 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb) + read_info->rnsize2 = cpu_to_be32(rnsize); + } + +- /* we don't support standby memory, maxram_size is never exposed */ ++ /* ++ * We don't support standby memory. maxram_size is used for sizing the ++ * memory device region, which is not exposed through SCLP but through ++ * diag500. ++ */ + rnmax = machine->ram_size >> sclp->increment_size; + if (rnmax < 0x10000) { + read_info->rnmax = cpu_to_be16(rnmax); +-- +2.48.1 + diff --git a/SOURCES/kvm-s390x-s390-virtio-hcall-prepare-for-more-diag500-hyp.patch b/SOURCES/kvm-s390x-s390-virtio-hcall-prepare-for-more-diag500-hyp.patch new file mode 100644 index 0000000..4bc283e --- /dev/null +++ b/SOURCES/kvm-s390x-s390-virtio-hcall-prepare-for-more-diag500-hyp.patch @@ -0,0 +1,163 @@ +From d2764db41fc6edcead9ad27b8d31e7bff524c0c0 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 19 Dec 2024 15:41:04 +0100 +Subject: [PATCH 12/26] s390x/s390-virtio-hcall: prepare for more diag500 + hypercalls + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [12/26] 6573602d71b9e70679a48315f913309be29d6239 (thuth/qemu-kvm-cs) + +Let's generalize, abstracting the virtio bits. diag500 is now a generic +hypercall to handle QEMU/KVM specific things. Explicitly specify all +already defined subcodes, including legacy ones (so we know what we can +use for new hypercalls). + +Move the PGM_SPECIFICATION injection into the renamed function +handle_diag_500(), so we can turn it into a void function. + +We'll rename the files separately, so git properly detects the rename. + +Message-ID: <20241219144115.2820241-4-david@redhat.com> +Acked-by: Michael S. 
Tsirkin +Reviewed-by: Thomas Huth +Signed-off-by: David Hildenbrand +(cherry picked from commit 6e9cc2da4e8b997fd6ff3249034f436b84fc7974) +Signed-off-by: Thomas Huth +--- + hw/s390x/s390-virtio-hcall.c | 15 ++++++++------- + hw/s390x/s390-virtio-hcall.h | 11 ++++++----- + target/s390x/kvm/kvm.c | 20 +++----------------- + target/s390x/tcg/misc_helper.c | 5 +++-- + 4 files changed, 20 insertions(+), 31 deletions(-) + +diff --git a/hw/s390x/s390-virtio-hcall.c b/hw/s390x/s390-virtio-hcall.c +index ca49e3cd22..5fb78a719e 100644 +--- a/hw/s390x/s390-virtio-hcall.c ++++ b/hw/s390x/s390-virtio-hcall.c +@@ -1,5 +1,5 @@ + /* +- * Support for virtio hypercalls on s390 ++ * Support for QEMU/KVM hypercalls on s390 + * + * Copyright 2012 IBM Corp. + * Author(s): Cornelia Huck +@@ -57,18 +57,19 @@ static int handle_virtio_ccw_notify(uint64_t subch_id, uint64_t data) + return 0; + } + +-int s390_virtio_hypercall(CPUS390XState *env) ++void handle_diag_500(S390CPU *cpu, uintptr_t ra) + { ++ CPUS390XState *env = &cpu->env; + const uint64_t subcode = env->regs[1]; + + switch (subcode) { +- case KVM_S390_VIRTIO_NOTIFY: ++ case DIAG500_VIRTIO_NOTIFY: + env->regs[2] = handle_virtio_notify(env->regs[2]); +- return 0; +- case KVM_S390_VIRTIO_CCW_NOTIFY: ++ break; ++ case DIAG500_VIRTIO_CCW_NOTIFY: + env->regs[2] = handle_virtio_ccw_notify(env->regs[2], env->regs[3]); +- return 0; ++ break; + default: +- return -EINVAL; ++ s390_program_interrupt(env, PGM_SPECIFICATION, ra); + } + } +diff --git a/hw/s390x/s390-virtio-hcall.h b/hw/s390x/s390-virtio-hcall.h +index 3d9fe147d2..dca456b926 100644 +--- a/hw/s390x/s390-virtio-hcall.h ++++ b/hw/s390x/s390-virtio-hcall.h +@@ -1,5 +1,5 @@ + /* +- * Support for virtio hypercalls on s390x ++ * Support for QEMU/KVM hypercalls on s390x + * + * Copyright IBM Corp. 
2012, 2017 + * Author(s): Cornelia Huck +@@ -12,12 +12,13 @@ + #ifndef HW_S390_VIRTIO_HCALL_H + #define HW_S390_VIRTIO_HCALL_H + +-#include "standard-headers/asm-s390/virtio-ccw.h" + #include "cpu.h" + +-/* The only thing that we need from the old kvm_virtio.h file */ +-#define KVM_S390_VIRTIO_NOTIFY 0 ++#define DIAG500_VIRTIO_NOTIFY 0 /* legacy, implemented as a NOP */ ++#define DIAG500_VIRTIO_RESET 1 /* legacy */ ++#define DIAG500_VIRTIO_SET_STATUS 2 /* legacy */ ++#define DIAG500_VIRTIO_CCW_NOTIFY 3 /* KVM_S390_VIRTIO_CCW_NOTIFY */ + +-int s390_virtio_hypercall(CPUS390XState *env); ++void handle_diag_500(S390CPU *cpu, uintptr_t ra); + + #endif /* HW_S390_VIRTIO_HCALL_H */ +diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c +index 5947dda829..42d6a54126 100644 +--- a/target/s390x/kvm/kvm.c ++++ b/target/s390x/kvm/kvm.c +@@ -1492,22 +1492,6 @@ static int handle_e3(S390CPU *cpu, struct kvm_run *run, uint8_t ipbl) + return r; + } + +-static int handle_hypercall(S390CPU *cpu, struct kvm_run *run) +-{ +- CPUS390XState *env = &cpu->env; +- int ret = -EINVAL; +- +-#ifdef CONFIG_S390_CCW_VIRTIO +- ret = s390_virtio_hypercall(env); +-#endif /* CONFIG_S390_CCW_VIRTIO */ +- if (ret == -EINVAL) { +- kvm_s390_program_interrupt(cpu, PGM_SPECIFICATION); +- return 0; +- } +- +- return ret; +-} +- + static void kvm_handle_diag_288(S390CPU *cpu, struct kvm_run *run) + { + uint64_t r1, r3; +@@ -1603,9 +1587,11 @@ static int handle_diag(S390CPU *cpu, struct kvm_run *run, uint32_t ipb) + case DIAG_SET_CONTROL_PROGRAM_CODES: + handle_diag_318(cpu, run); + break; ++#ifdef CONFIG_S390_CCW_VIRTIO + case DIAG_KVM_HYPERCALL: +- r = handle_hypercall(cpu, run); ++ handle_diag_500(cpu, RA_IGNORED); + break; ++#endif /* CONFIG_S390_CCW_VIRTIO */ + case DIAG_KVM_BREAKPOINT: + r = handle_sw_breakpoint(cpu, run); + break; +diff --git a/target/s390x/tcg/misc_helper.c b/target/s390x/tcg/misc_helper.c +index f44136a568..2b4310003b 100644 +--- a/target/s390x/tcg/misc_helper.c ++++ 
b/target/s390x/tcg/misc_helper.c +@@ -119,10 +119,11 @@ void HELPER(diag)(CPUS390XState *env, uint32_t r1, uint32_t r3, uint32_t num) + switch (num) { + #ifdef CONFIG_S390_CCW_VIRTIO + case 0x500: +- /* KVM hypercall */ ++ /* QEMU/KVM hypercall */ + bql_lock(); +- r = s390_virtio_hypercall(env); ++ handle_diag_500(env_archcpu(env), GETPC()); + bql_unlock(); ++ r = 0; + break; + #endif /* CONFIG_S390_CCW_VIRTIO */ + case 0x44: +-- +2.48.1 + diff --git a/SOURCES/kvm-s390x-s390-virtio-hcall-remove-hypercall-registratio.patch b/SOURCES/kvm-s390x-s390-virtio-hcall-remove-hypercall-registratio.patch new file mode 100644 index 0000000..14d03de --- /dev/null +++ b/SOURCES/kvm-s390x-s390-virtio-hcall-remove-hypercall-registratio.patch @@ -0,0 +1,296 @@ +From 16ccb16d393a3e63936dc993c30c67fdecb1f120 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 19 Dec 2024 15:41:03 +0100 +Subject: [PATCH 11/26] s390x/s390-virtio-hcall: remove hypercall registration + mechanism + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [11/26] 5e8d2720fe9fd6e6e24487d71988821f1cf27f17 (thuth/qemu-kvm-cs) + +Nowadays, we only have a single machine type in QEMU, everything is based +on virtio-ccw and the traditional virtio machine does no longer exist. No +need to dynamically register diag500 handlers. Move the two existing +handlers into s390-virtio-hcall.c. + +Message-ID: <20241219144115.2820241-3-david@redhat.com> +Acked-by: Michael S. 
Tsirkin +Reviewed-by: Thomas Huth +Acked-by: Christian Borntraeger +Signed-off-by: David Hildenbrand +(cherry picked from commit 4be0fce498d0a08f18b3a9accdb9ded79484d30a) +Signed-off-by: Thomas Huth +--- + hw/s390x/meson.build | 6 ++-- + hw/s390x/s390-virtio-ccw.c | 58 ------------------------------ + hw/s390x/s390-virtio-hcall.c | 65 +++++++++++++++++++++++++--------- + hw/s390x/s390-virtio-hcall.h | 2 -- + target/s390x/kvm/kvm.c | 5 ++- + target/s390x/tcg/misc_helper.c | 3 ++ + 6 files changed, 60 insertions(+), 79 deletions(-) + +diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build +index 482fd13420..d6c8c33915 100644 +--- a/hw/s390x/meson.build ++++ b/hw/s390x/meson.build +@@ -12,7 +12,6 @@ s390x_ss.add(files( + 's390-pci-inst.c', + 's390-skeys.c', + 's390-stattrib.c', +- 's390-virtio-hcall.c', + 'sclp.c', + 'sclpcpu.c', + 'sclpquiesce.c', +@@ -28,7 +27,10 @@ s390x_ss.add(when: 'CONFIG_KVM', if_true: files( + s390x_ss.add(when: 'CONFIG_TCG', if_true: files( + 'tod-tcg.c', + )) +-s390x_ss.add(when: 'CONFIG_S390_CCW_VIRTIO', if_true: files('s390-virtio-ccw.c')) ++s390x_ss.add(when: 'CONFIG_S390_CCW_VIRTIO', if_true: files( ++ 's390-virtio-ccw.c', ++ 's390-virtio-hcall.c', ++)) + s390x_ss.add(when: 'CONFIG_TERMINAL3270', if_true: files('3270-ccw.c')) + s390x_ss.add(when: 'CONFIG_VFIO', if_true: files('s390-pci-vfio.c')) + +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index 82ded9666c..d47e99028e 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -16,11 +16,8 @@ + #include "exec/ram_addr.h" + #include "exec/confidential-guest-support.h" + #include "hw/boards.h" +-#include "hw/s390x/s390-virtio-hcall.h" + #include "hw/s390x/sclp.h" + #include "hw/s390x/s390_flic.h" +-#include "hw/s390x/ioinst.h" +-#include "hw/s390x/css.h" + #include "virtio-ccw.h" + #include "qemu/config-file.h" + #include "qemu/ctype.h" +@@ -124,58 +121,6 @@ static void subsystem_reset(void) + } + } + +-static int 
virtio_ccw_hcall_notify(const uint64_t *args) +-{ +- uint64_t subch_id = args[0]; +- uint64_t data = args[1]; +- SubchDev *sch; +- VirtIODevice *vdev; +- int cssid, ssid, schid, m; +- uint16_t vq_idx = data; +- +- if (ioinst_disassemble_sch_ident(subch_id, &m, &cssid, &ssid, &schid)) { +- return -EINVAL; +- } +- sch = css_find_subch(m, cssid, ssid, schid); +- if (!sch || !css_subch_visible(sch)) { +- return -EINVAL; +- } +- +- vdev = virtio_ccw_get_vdev(sch); +- if (vq_idx >= VIRTIO_QUEUE_MAX || !virtio_queue_get_num(vdev, vq_idx)) { +- return -EINVAL; +- } +- +- if (virtio_vdev_has_feature(vdev, VIRTIO_F_NOTIFICATION_DATA)) { +- virtio_queue_set_shadow_avail_idx(virtio_get_queue(vdev, vq_idx), +- (data >> 16) & 0xFFFF); +- } +- +- virtio_queue_notify(vdev, vq_idx); +- return 0; +-} +- +-static int virtio_ccw_hcall_early_printk(const uint64_t *args) +-{ +- uint64_t mem = args[0]; +- MachineState *ms = MACHINE(qdev_get_machine()); +- +- if (mem < ms->ram_size) { +- /* Early printk */ +- return 0; +- } +- return -EINVAL; +-} +- +-static void virtio_ccw_register_hcalls(void) +-{ +- s390_register_virtio_hypercall(KVM_S390_VIRTIO_CCW_NOTIFY, +- virtio_ccw_hcall_notify); +- /* Tolerate early printk. 
*/ +- s390_register_virtio_hypercall(KVM_S390_VIRTIO_NOTIFY, +- virtio_ccw_hcall_early_printk); +-} +- + static void s390_memory_init(MemoryRegion *ram) + { + MemoryRegion *sysmem = get_system_memory(); +@@ -296,9 +241,6 @@ static void ccw_init(MachineState *machine) + OBJECT(dev)); + sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); + +- /* register hypercalls */ +- virtio_ccw_register_hcalls(); +- + s390_enable_css_support(s390_cpu_addr2state(0)); + + ret = css_create_css_image(VIRTUAL_CSSID, true); +diff --git a/hw/s390x/s390-virtio-hcall.c b/hw/s390x/s390-virtio-hcall.c +index ec7cf8beb3..ca49e3cd22 100644 +--- a/hw/s390x/s390-virtio-hcall.c ++++ b/hw/s390x/s390-virtio-hcall.c +@@ -11,31 +11,64 @@ + + #include "qemu/osdep.h" + #include "cpu.h" ++#include "hw/boards.h" + #include "hw/s390x/s390-virtio-hcall.h" ++#include "hw/s390x/ioinst.h" ++#include "hw/s390x/css.h" ++#include "virtio-ccw.h" + +-#define MAX_DIAG_SUBCODES 255 ++static int handle_virtio_notify(uint64_t mem) ++{ ++ MachineState *ms = MACHINE(qdev_get_machine()); + +-static s390_virtio_fn s390_diag500_table[MAX_DIAG_SUBCODES]; ++ if (mem < ms->ram_size) { ++ /* Early printk */ ++ return 0; ++ } ++ return -EINVAL; ++} + +-void s390_register_virtio_hypercall(uint64_t code, s390_virtio_fn fn) ++static int handle_virtio_ccw_notify(uint64_t subch_id, uint64_t data) + { +- assert(code < MAX_DIAG_SUBCODES); +- assert(!s390_diag500_table[code]); ++ SubchDev *sch; ++ VirtIODevice *vdev; ++ int cssid, ssid, schid, m; ++ uint16_t vq_idx = data; ++ ++ if (ioinst_disassemble_sch_ident(subch_id, &m, &cssid, &ssid, &schid)) { ++ return -EINVAL; ++ } ++ sch = css_find_subch(m, cssid, ssid, schid); ++ if (!sch || !css_subch_visible(sch)) { ++ return -EINVAL; ++ } + +- s390_diag500_table[code] = fn; ++ vdev = virtio_ccw_get_vdev(sch); ++ if (vq_idx >= VIRTIO_QUEUE_MAX || !virtio_queue_get_num(vdev, vq_idx)) { ++ return -EINVAL; ++ } ++ ++ if (virtio_vdev_has_feature(vdev, VIRTIO_F_NOTIFICATION_DATA)) { 
++ virtio_queue_set_shadow_avail_idx(virtio_get_queue(vdev, vq_idx), ++ (data >> 16) & 0xFFFF); ++ } ++ ++ virtio_queue_notify(vdev, vq_idx); ++ return 0; + } + + int s390_virtio_hypercall(CPUS390XState *env) + { +- s390_virtio_fn fn; +- +- if (env->regs[1] < MAX_DIAG_SUBCODES) { +- fn = s390_diag500_table[env->regs[1]]; +- if (fn) { +- env->regs[2] = fn(&env->regs[2]); +- return 0; +- } +- } ++ const uint64_t subcode = env->regs[1]; + +- return -EINVAL; ++ switch (subcode) { ++ case KVM_S390_VIRTIO_NOTIFY: ++ env->regs[2] = handle_virtio_notify(env->regs[2]); ++ return 0; ++ case KVM_S390_VIRTIO_CCW_NOTIFY: ++ env->regs[2] = handle_virtio_ccw_notify(env->regs[2], env->regs[3]); ++ return 0; ++ default: ++ return -EINVAL; ++ } + } +diff --git a/hw/s390x/s390-virtio-hcall.h b/hw/s390x/s390-virtio-hcall.h +index 3ae6d6ae3a..3d9fe147d2 100644 +--- a/hw/s390x/s390-virtio-hcall.h ++++ b/hw/s390x/s390-virtio-hcall.h +@@ -18,8 +18,6 @@ + /* The only thing that we need from the old kvm_virtio.h file */ + #define KVM_S390_VIRTIO_NOTIFY 0 + +-typedef int (*s390_virtio_fn)(const uint64_t *args); +-void s390_register_virtio_hypercall(uint64_t code, s390_virtio_fn fn); + int s390_virtio_hypercall(CPUS390XState *env); + + #endif /* HW_S390_VIRTIO_HCALL_H */ +diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c +index 7a0ca5570f..5947dda829 100644 +--- a/target/s390x/kvm/kvm.c ++++ b/target/s390x/kvm/kvm.c +@@ -51,6 +51,7 @@ + #include "hw/s390x/s390-virtio-ccw.h" + #include "hw/s390x/s390-virtio-hcall.h" + #include "target/s390x/kvm/pv.h" ++#include CONFIG_DEVICES + + #define kvm_vm_check_mem_attr(s, attr) \ + kvm_vm_check_attr(s, KVM_S390_VM_MEM_CTRL, attr) +@@ -1494,9 +1495,11 @@ static int handle_e3(S390CPU *cpu, struct kvm_run *run, uint8_t ipbl) + static int handle_hypercall(S390CPU *cpu, struct kvm_run *run) + { + CPUS390XState *env = &cpu->env; +- int ret; ++ int ret = -EINVAL; + ++#ifdef CONFIG_S390_CCW_VIRTIO + ret = s390_virtio_hypercall(env); ++#endif /* 
CONFIG_S390_CCW_VIRTIO */ + if (ret == -EINVAL) { + kvm_s390_program_interrupt(cpu, PGM_SPECIFICATION); + return 0; +diff --git a/target/s390x/tcg/misc_helper.c b/target/s390x/tcg/misc_helper.c +index 303f86d363..f44136a568 100644 +--- a/target/s390x/tcg/misc_helper.c ++++ b/target/s390x/tcg/misc_helper.c +@@ -43,6 +43,7 @@ + #include "hw/s390x/s390-pci-inst.h" + #include "hw/boards.h" + #include "hw/s390x/tod.h" ++#include CONFIG_DEVICES + #endif + + /* #define DEBUG_HELPER */ +@@ -116,12 +117,14 @@ void HELPER(diag)(CPUS390XState *env, uint32_t r1, uint32_t r3, uint32_t num) + uint64_t r; + + switch (num) { ++#ifdef CONFIG_S390_CCW_VIRTIO + case 0x500: + /* KVM hypercall */ + bql_lock(); + r = s390_virtio_hypercall(env); + bql_unlock(); + break; ++#endif /* CONFIG_S390_CCW_VIRTIO */ + case 0x44: + /* yield */ + r = 0; +-- +2.48.1 + diff --git a/SOURCES/kvm-s390x-virtio-ccw-add-support-for-virtio-based-memory.patch b/SOURCES/kvm-s390x-virtio-ccw-add-support-for-virtio-based-memory.patch new file mode 100644 index 0000000..808f95b --- /dev/null +++ b/SOURCES/kvm-s390x-virtio-ccw-add-support-for-virtio-based-memory.patch @@ -0,0 +1,423 @@ +From 6b82fca2ecac0c7b30780ebb71ce5bad0421b9b4 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 19 Dec 2024 15:41:14 +0100 +Subject: [PATCH 22/26] s390x/virtio-ccw: add support for virtio based memory + devices + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [22/26] 270a9fbe7e5bacfa6c9377815a01da26c4d26097 (thuth/qemu-kvm-cs) + +Let's implement support for abstract virtio based memory devices, using +the virtio-pci implementation as an orientation. Wire them up in the +machine hotplug handler, taking care of s390x page size limitations. + +As we neither support virtio-mem or virtio-pmem yet, the code is +effectively unused. We'll implement support for virtio-mem based on this +next. 
+ +Note that we won't wire up the virtio-pci variant (should currently be +impossible due to lack of support for MSI-X), but we'll add a safety net +to reject plugging them in the pre-plug handler. + +Message-ID: <20241219144115.2820241-14-david@redhat.com> +Acked-by: Michael S. Tsirkin +Signed-off-by: David Hildenbrand +(cherry picked from commit 88d86f6f1e36741ba9e1625da19a7ccf1a343d39) +Signed-off-by: Thomas Huth +--- + MAINTAINERS | 3 + + hw/s390x/meson.build | 3 + + hw/s390x/s390-virtio-ccw.c | 47 +++++++++- + hw/s390x/virtio-ccw-md-stubs.c | 24 ++++++ + hw/s390x/virtio-ccw-md.c | 153 +++++++++++++++++++++++++++++++++ + hw/s390x/virtio-ccw-md.h | 44 ++++++++++ + hw/virtio/Kconfig | 1 + + 7 files changed, 274 insertions(+), 1 deletion(-) + create mode 100644 hw/s390x/virtio-ccw-md-stubs.c + create mode 100644 hw/s390x/virtio-ccw-md.c + create mode 100644 hw/s390x/virtio-ccw-md.h + +diff --git a/MAINTAINERS b/MAINTAINERS +index 3584d6a6c6..f21dc3fa75 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -2387,6 +2387,9 @@ F: include/hw/virtio/virtio-crypto.h + virtio based memory device + M: David Hildenbrand + S: Supported ++F: hw/s390x/virtio-ccw-md.c ++F: hw/s390x/virtio-ccw-md.h ++F: hw/s390x/virtio-ccw-md-stubs.c + F: hw/virtio/virtio-md-pci.c + F: include/hw/virtio/virtio-md-pci.h + F: stubs/virtio-md-pci.c +diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build +index e344a3bd8c..4431868408 100644 +--- a/hw/s390x/meson.build ++++ b/hw/s390x/meson.build +@@ -50,8 +50,11 @@ endif + virtio_ss.add(when: 'CONFIG_VHOST_SCSI', if_true: files('vhost-scsi-ccw.c')) + virtio_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: files('vhost-vsock-ccw.c')) + virtio_ss.add(when: 'CONFIG_VHOST_USER_FS', if_true: files('vhost-user-fs-ccw.c')) ++virtio_ss.add(when: 'CONFIG_VIRTIO_MD', if_true: files('virtio-ccw-md.c')) + s390x_ss.add_all(when: 'CONFIG_VIRTIO_CCW', if_true: virtio_ss) + ++s390x_ss.add(when: 'CONFIG_VIRTIO_MD', if_false: files('virtio-ccw-md-stubs.c')) ++ + hw_arch += 
{'s390x': s390x_ss} + + hw_s390x_modules = {} +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index bd05a22b4e..9f4ad01789 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -46,6 +46,8 @@ + #include "qapi/visitor.h" + #include "hw/s390x/cpu-topology.h" + #include "kvm/kvm_s390x.h" ++#include "hw/virtio/virtio-md-pci.h" ++#include "hw/s390x/virtio-ccw-md.h" + #include CONFIG_DEVICES + + static Error *pv_mig_blocker; +@@ -546,11 +548,39 @@ static void s390_machine_reset(MachineState *machine, ResetType type) + s390_ipl_clear_reset_request(); + } + ++static void s390_machine_device_pre_plug(HotplugHandler *hotplug_dev, ++ DeviceState *dev, Error **errp) ++{ ++ if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_CCW)) { ++ virtio_ccw_md_pre_plug(VIRTIO_MD_CCW(dev), MACHINE(hotplug_dev), errp); ++ } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) { ++ error_setg(errp, ++ "PCI-attached virtio based memory devices not supported"); ++ } ++} ++ + static void s390_machine_device_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) + { ++ S390CcwMachineState *s390ms = S390_CCW_MACHINE(hotplug_dev); ++ + if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + s390_cpu_plug(hotplug_dev, dev, errp); ++ } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_CCW)) { ++ /* ++ * At this point, the device is realized and set all memdevs mapped, so ++ * qemu_maxrampagesize() will pick up the page sizes of these memdevs ++ * as well. Before we plug the device and expose any RAM memory regions ++ * to the system, make sure we don't exceed the previously set max page ++ * size. While only relevant for KVM, there is not really any use case ++ * for this with TCG, so we'll unconditionally reject it. 
++ */ ++ if (qemu_maxrampagesize() != s390ms->max_pagesize) { ++ error_setg(errp, "Memory device uses a bigger page size than" ++ " initial memory"); ++ return; ++ } ++ virtio_ccw_md_plug(VIRTIO_MD_CCW(dev), MACHINE(hotplug_dev), errp); + } + } + +@@ -560,9 +590,20 @@ static void s390_machine_device_unplug_request(HotplugHandler *hotplug_dev, + if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + error_setg(errp, "CPU hot unplug not supported on this machine"); + return; ++ } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_CCW)) { ++ virtio_ccw_md_unplug_request(VIRTIO_MD_CCW(dev), MACHINE(hotplug_dev), ++ errp); + } + } + ++static void s390_machine_device_unplug(HotplugHandler *hotplug_dev, ++ DeviceState *dev, Error **errp) ++{ ++ if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_CCW)) { ++ virtio_ccw_md_unplug(VIRTIO_MD_CCW(dev), MACHINE(hotplug_dev), errp); ++ } ++ } ++ + static CpuInstanceProperties s390_cpu_index_to_props(MachineState *ms, + unsigned cpu_index) + { +@@ -609,7 +650,9 @@ static const CPUArchIdList *s390_possible_cpu_arch_ids(MachineState *ms) + static HotplugHandler *s390_get_hotplug_handler(MachineState *machine, + DeviceState *dev) + { +- if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { ++ if (object_dynamic_cast(OBJECT(dev), TYPE_CPU) || ++ object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_CCW) || ++ object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) { + return HOTPLUG_HANDLER(machine); + } + return NULL; +@@ -769,8 +812,10 @@ static void ccw_machine_class_init(ObjectClass *oc, void *data) + mc->possible_cpu_arch_ids = s390_possible_cpu_arch_ids; + /* it is overridden with 'host' cpu *in kvm_arch_init* */ + mc->default_cpu_type = S390_CPU_TYPE_NAME("qemu"); ++ hc->pre_plug = s390_machine_device_pre_plug; + hc->plug = s390_machine_device_plug; + hc->unplug_request = s390_machine_device_unplug_request; ++ hc->unplug = s390_machine_device_unplug; + nc->nmi_monitor_handler = s390_nmi; + mc->default_ram_id = "s390.ram"; + 
mc->default_nic = "virtio-net-ccw"; +diff --git a/hw/s390x/virtio-ccw-md-stubs.c b/hw/s390x/virtio-ccw-md-stubs.c +new file mode 100644 +index 0000000000..e937865550 +--- /dev/null ++++ b/hw/s390x/virtio-ccw-md-stubs.c +@@ -0,0 +1,24 @@ ++#include "qemu/osdep.h" ++#include "qapi/error.h" ++#include "hw/s390x/virtio-ccw-md.h" ++ ++void virtio_ccw_md_pre_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp) ++{ ++ error_setg(errp, "virtio based memory devices not supported"); ++} ++ ++void virtio_ccw_md_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp) ++{ ++ error_setg(errp, "virtio based memory devices not supported"); ++} ++ ++void virtio_ccw_md_unplug_request(VirtIOMDCcw *vmd, MachineState *ms, ++ Error **errp) ++{ ++ error_setg(errp, "virtio based memory devices not supported"); ++} ++ ++void virtio_ccw_md_unplug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp) ++{ ++ error_setg(errp, "virtio based memory devices not supported"); ++} +diff --git a/hw/s390x/virtio-ccw-md.c b/hw/s390x/virtio-ccw-md.c +new file mode 100644 +index 0000000000..de333282df +--- /dev/null ++++ b/hw/s390x/virtio-ccw-md.c +@@ -0,0 +1,153 @@ ++/* ++ * Virtio CCW support for abstract virtio based memory device ++ * ++ * Copyright (C) 2024 Red Hat, Inc. ++ * ++ * Authors: ++ * David Hildenbrand ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2. ++ * See the COPYING file in the top-level directory. ++ */ ++ ++#include "qemu/osdep.h" ++#include "hw/s390x/virtio-ccw-md.h" ++#include "hw/mem/memory-device.h" ++#include "qapi/error.h" ++#include "qemu/error-report.h" ++ ++void virtio_ccw_md_pre_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp) ++{ ++ DeviceState *dev = DEVICE(vmd); ++ HotplugHandler *bus_handler = qdev_get_bus_hotplug_handler(dev); ++ MemoryDeviceState *md = MEMORY_DEVICE(vmd); ++ Error *local_err = NULL; ++ ++ if (!bus_handler && dev->hotplugged) { ++ /* ++ * Without a bus hotplug handler, we cannot control the plug/unplug ++ * order. 
We should never reach this point when hotplugging, but ++ * better add a safety net. ++ */ ++ error_setg(errp, "hotplug of virtio based memory devices not supported" ++ " on this bus."); ++ return; ++ } ++ ++ /* ++ * First, see if we can plug this memory device at all. If that ++ * succeeds, branch of to the actual hotplug handler. ++ */ ++ memory_device_pre_plug(md, ms, &local_err); ++ if (!local_err && bus_handler) { ++ hotplug_handler_pre_plug(bus_handler, dev, &local_err); ++ } ++ error_propagate(errp, local_err); ++} ++ ++void virtio_ccw_md_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp) ++{ ++ DeviceState *dev = DEVICE(vmd); ++ HotplugHandler *bus_handler = qdev_get_bus_hotplug_handler(dev); ++ MemoryDeviceState *md = MEMORY_DEVICE(vmd); ++ Error *local_err = NULL; ++ ++ /* ++ * Plug the memory device first and then branch off to the actual ++ * hotplug handler. If that one fails, we can easily undo the memory ++ * device bits. ++ */ ++ memory_device_plug(md, ms); ++ if (bus_handler) { ++ hotplug_handler_plug(bus_handler, dev, &local_err); ++ if (local_err) { ++ memory_device_unplug(md, ms); ++ } ++ } ++ error_propagate(errp, local_err); ++} ++ ++void virtio_ccw_md_unplug_request(VirtIOMDCcw *vmd, MachineState *ms, ++ Error **errp) ++{ ++ VirtIOMDCcwClass *vmdc = VIRTIO_MD_CCW_GET_CLASS(vmd); ++ DeviceState *dev = DEVICE(vmd); ++ HotplugHandler *bus_handler = qdev_get_bus_hotplug_handler(dev); ++ HotplugHandlerClass *hdc; ++ Error *local_err = NULL; ++ ++ if (!vmdc->unplug_request_check) { ++ error_setg(errp, ++ "this virtio based memory devices cannot be unplugged"); ++ return; ++ } ++ ++ if (!bus_handler) { ++ error_setg(errp, "hotunplug of virtio based memory devices not" ++ "supported on this bus"); ++ return; ++ } ++ ++ vmdc->unplug_request_check(vmd, &local_err); ++ if (local_err) { ++ error_propagate(errp, local_err); ++ return; ++ } ++ ++ /* ++ * Forward the async request or turn it into a sync request (handling it ++ * like qdev_unplug()). 
++ */ ++ hdc = HOTPLUG_HANDLER_GET_CLASS(bus_handler); ++ if (hdc->unplug_request) { ++ hotplug_handler_unplug_request(bus_handler, dev, &local_err); ++ } else { ++ virtio_ccw_md_unplug(vmd, ms, &local_err); ++ if (!local_err) { ++ object_unparent(OBJECT(dev)); ++ } ++ } ++} ++ ++void virtio_ccw_md_unplug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp) ++{ ++ DeviceState *dev = DEVICE(vmd); ++ HotplugHandler *bus_handler = qdev_get_bus_hotplug_handler(dev); ++ MemoryDeviceState *md = MEMORY_DEVICE(vmd); ++ Error *local_err = NULL; ++ ++ /* Unplug the memory device while it is still realized. */ ++ memory_device_unplug(md, ms); ++ ++ if (bus_handler) { ++ hotplug_handler_unplug(bus_handler, dev, &local_err); ++ if (local_err) { ++ /* Not expected to fail ... but still try to recover. */ ++ memory_device_plug(md, ms); ++ error_propagate(errp, local_err); ++ return; ++ } ++ } else { ++ /* Very unexpected, but let's just try to do the right thing. */ ++ warn_report("Unexpected unplug of virtio based memory device"); ++ qdev_unrealize(dev); ++ } ++} ++ ++static const TypeInfo virtio_ccw_md_info = { ++ .name = TYPE_VIRTIO_MD_CCW, ++ .parent = TYPE_VIRTIO_CCW_DEVICE, ++ .instance_size = sizeof(VirtIOMDCcw), ++ .class_size = sizeof(VirtIOMDCcwClass), ++ .abstract = true, ++ .interfaces = (InterfaceInfo[]) { ++ { TYPE_MEMORY_DEVICE }, ++ { } ++ }, ++}; ++ ++static void virtio_ccw_md_register(void) ++{ ++ type_register_static(&virtio_ccw_md_info); ++} ++type_init(virtio_ccw_md_register) +diff --git a/hw/s390x/virtio-ccw-md.h b/hw/s390x/virtio-ccw-md.h +new file mode 100644 +index 0000000000..39ba864c92 +--- /dev/null ++++ b/hw/s390x/virtio-ccw-md.h +@@ -0,0 +1,44 @@ ++/* ++ * Virtio CCW support for abstract virtio based memory device ++ * ++ * Copyright (C) 2024 Red Hat, Inc. ++ * ++ * Authors: ++ * David Hildenbrand ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2. ++ * See the COPYING file in the top-level directory. 
++ */ ++ ++#ifndef HW_S390X_VIRTIO_CCW_MD_H ++#define HW_S390X_VIRTIO_CCW_MD_H ++ ++#include "virtio-ccw.h" ++#include "qom/object.h" ++ ++/* ++ * virtio-md-ccw: This extends VirtioCcwDevice. ++ */ ++#define TYPE_VIRTIO_MD_CCW "virtio-md-ccw" ++ ++OBJECT_DECLARE_TYPE(VirtIOMDCcw, VirtIOMDCcwClass, VIRTIO_MD_CCW) ++ ++struct VirtIOMDCcwClass { ++ /* private */ ++ VirtIOCCWDeviceClass parent; ++ ++ /* public */ ++ void (*unplug_request_check)(VirtIOMDCcw *vmd, Error **errp); ++}; ++ ++struct VirtIOMDCcw { ++ VirtioCcwDevice parent_obj; ++}; ++ ++void virtio_ccw_md_pre_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp); ++void virtio_ccw_md_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp); ++void virtio_ccw_md_unplug_request(VirtIOMDCcw *vmd, MachineState *ms, ++ Error **errp); ++void virtio_ccw_md_unplug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp); ++ ++#endif /* HW_S390X_VIRTIO_CCW_MD_H */ +diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig +index 0afec2ae92..f4b14e1a44 100644 +--- a/hw/virtio/Kconfig ++++ b/hw/virtio/Kconfig +@@ -25,6 +25,7 @@ config VIRTIO_MMIO + config VIRTIO_CCW + bool + select VIRTIO ++ select VIRTIO_MD_SUPPORTED + + config VIRTIO_BALLOON + bool +-- +2.48.1 + diff --git a/SOURCES/kvm-s390x-virtio-mem-support.patch b/SOURCES/kvm-s390x-virtio-mem-support.patch new file mode 100644 index 0000000..3c7313f --- /dev/null +++ b/SOURCES/kvm-s390x-virtio-mem-support.patch @@ -0,0 +1,459 @@ +From fa68427f55bee8d18d846e03ebf9f1eeb80f274d Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 19 Dec 2024 15:41:15 +0100 +Subject: [PATCH 23/26] s390x: virtio-mem support + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [23/26] 4c59ba9025ce5ba7686a7f3e01bb70e8c580709f (thuth/qemu-kvm-cs) + +Let's add our virtio-mem-ccw proxy device and wire it up. 
We should +be supporting everything (e.g., device unplug, "dynamic-memslots") that +we already support for the virtio-pci variant. + +With a Linux guest that supports virtio-mem (and has automatic memory +onlining properly configured) the following example will work: + +1. Start a VM with 4G initial memory and a virtio-mem device with a maximum + capacity of 16GB: + + qemu/build/qemu-system-s390x \ + --enable-kvm \ + -m 4G,maxmem=20G \ + -nographic \ + -smp 8 \ + -hda Fedora-Server-KVM-40-1.14.s390x.qcow2 \ + -chardev socket,id=monitor,path=/var/tmp/monitor,server,nowait \ + -mon chardev=monitor,mode=readline \ + -object memory-backend-ram,id=mem0,size=16G,reserve=off \ + -device virtio-mem-ccw,id=vmem0,memdev=mem0,dynamic-memslots=on + +2. Query the current size of virtio-mem device: + + (qemu) info memory-devices + Memory device [virtio-mem]: "vmem0" + memaddr: 0x100000000 + node: 0 + requested-size: 0 + size: 0 + max-size: 17179869184 + block-size: 1048576 + memdev: /objects/mem0 + +3. Request to grow it to 8GB (hotplug 8GB): + + (qemu) qom-set vmem0 requested-size 8G + (qemu) info memory-devices + Memory device [virtio-mem]: "vmem0" + memaddr: 0x100000000 + node: 0 + requested-size: 8589934592 + size: 8589934592 + max-size: 17179869184 + block-size: 1048576 + memdev: /objects/mem0 + +4. Request to grow to 16GB (hotplug another 8GB): + + (qemu) qom-set vmem0 requested-size 16G + (qemu) info memory-devices + Memory device [virtio-mem]: "vmem0" + memaddr: 0x100000000 + node: 0 + requested-size: 17179869184 + size: 17179869184 + max-size: 17179869184 + block-size: 1048576 + memdev: /objects/mem0 + +5. Try to hotunplug all memory again, shrinking to 0GB: + + (qemu) qom-set vmem0 requested-size 0G + (qemu) info memory-devices + Memory device [virtio-mem]: "vmem0" + memaddr: 0x100000000 + node: 0 + requested-size: 0 + size: 0 + max-size: 17179869184 + block-size: 1048576 + memdev: /objects/mem0 + +6. 
If it worked, unplug the device + + (qemu) device_del vmem0 + (qemu) info memory-devices + (qemu) object_del mem0 + +7. Hotplug a new device with a smaller capacity and directly size it to 1GB + + (qemu) object_add memory-backend-ram,id=mem0,size=8G,reserve=off + (qemu) device_add virtio-mem-ccw,id=vmem0,memdev=mem0,\ + dynamic-memslots=on,requested-size=1G + (qemu) info memory-devices + Memory device [virtio-mem]: "vmem0" + memaddr: 0x100000000 + node: 0 + requested-size: 1073741824 + size: 1073741824 + max-size: 8589934592 + block-size: 1048576 + memdev: /objects/mem0 + +Trying to use a virtio-mem device backed by hugetlb in a !hugetlb VM +correctly results in the error: + ... Memory device uses a bigger page size than initial memory + +Note that the virtio-mem driver in Linux supports 1 MiB (pageblock) +granularity. + +Message-ID: <20241219144115.2820241-15-david@redhat.com> +Acked-by: Michael S. Tsirkin +Signed-off-by: David Hildenbrand +(cherry picked from commit aa910c20ec5f3b10551da19e441b3e2b54406e25) +Signed-off-by: Thomas Huth +--- + MAINTAINERS | 2 + + hw/s390x/Kconfig | 1 + + hw/s390x/meson.build | 1 + + hw/s390x/virtio-ccw-mem.c | 226 ++++++++++++++++++++++++++++++++++++++ + hw/s390x/virtio-ccw-mem.h | 34 ++++++ + hw/virtio/virtio-mem.c | 4 +- + 6 files changed, 267 insertions(+), 1 deletion(-) + create mode 100644 hw/s390x/virtio-ccw-mem.c + create mode 100644 hw/s390x/virtio-ccw-mem.h + +diff --git a/MAINTAINERS b/MAINTAINERS +index f21dc3fa75..f7b7ceffc4 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -2401,6 +2401,8 @@ W: https://virtio-mem.gitlab.io/ + F: hw/virtio/virtio-mem.c + F: hw/virtio/virtio-mem-pci.h + F: hw/virtio/virtio-mem-pci.c ++F: hw/s390x/virtio-ccw-mem.c ++F: hw/s390x/virtio-ccw-mem.h + F: include/hw/virtio/virtio-mem.h + + virtio-snd +diff --git a/hw/s390x/Kconfig b/hw/s390x/Kconfig +index 3bbf4ae56e..5d57daff77 100644 +--- a/hw/s390x/Kconfig ++++ b/hw/s390x/Kconfig +@@ -15,3 +15,4 @@ config S390_CCW_VIRTIO + select
SCLPCONSOLE + select VIRTIO_CCW + select MSI_NONBROKEN ++ select VIRTIO_MEM_SUPPORTED +diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build +index 4431868408..3bbebfd817 100644 +--- a/hw/s390x/meson.build ++++ b/hw/s390x/meson.build +@@ -51,6 +51,7 @@ virtio_ss.add(when: 'CONFIG_VHOST_SCSI', if_true: files('vhost-scsi-ccw.c')) + virtio_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: files('vhost-vsock-ccw.c')) + virtio_ss.add(when: 'CONFIG_VHOST_USER_FS', if_true: files('vhost-user-fs-ccw.c')) + virtio_ss.add(when: 'CONFIG_VIRTIO_MD', if_true: files('virtio-ccw-md.c')) ++virtio_ss.add(when: 'CONFIG_VIRTIO_MEM', if_true: files('virtio-ccw-mem.c')) + s390x_ss.add_all(when: 'CONFIG_VIRTIO_CCW', if_true: virtio_ss) + + s390x_ss.add(when: 'CONFIG_VIRTIO_MD', if_false: files('virtio-ccw-md-stubs.c')) +diff --git a/hw/s390x/virtio-ccw-mem.c b/hw/s390x/virtio-ccw-mem.c +new file mode 100644 +index 0000000000..bee0d560cb +--- /dev/null ++++ b/hw/s390x/virtio-ccw-mem.c +@@ -0,0 +1,226 @@ ++/* ++ * virtio-mem CCW implementation ++ * ++ * Copyright (C) 2024 Red Hat, Inc. ++ * ++ * Authors: ++ * David Hildenbrand ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2. ++ * See the COPYING file in the top-level directory. 
++ */ ++ ++#include "qemu/osdep.h" ++#include "hw/qdev-properties.h" ++#include "qapi/error.h" ++#include "qemu/module.h" ++#include "virtio-ccw-mem.h" ++#include "hw/mem/memory-device.h" ++#include "qapi/qapi-events-machine.h" ++#include "qapi/qapi-events-misc.h" ++ ++static void virtio_ccw_mem_realize(VirtioCcwDevice *ccw_dev, Error **errp) ++{ ++ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(ccw_dev); ++ DeviceState *vdev = DEVICE(&dev->vdev); ++ ++ qdev_realize(vdev, BUS(&ccw_dev->bus), errp); ++} ++ ++static void virtio_ccw_mem_set_addr(MemoryDeviceState *md, uint64_t addr, ++ Error **errp) ++{ ++ object_property_set_uint(OBJECT(md), VIRTIO_MEM_ADDR_PROP, addr, errp); ++} ++ ++static uint64_t virtio_ccw_mem_get_addr(const MemoryDeviceState *md) ++{ ++ return object_property_get_uint(OBJECT(md), VIRTIO_MEM_ADDR_PROP, ++ &error_abort); ++} ++ ++static MemoryRegion *virtio_ccw_mem_get_memory_region(MemoryDeviceState *md, ++ Error **errp) ++{ ++ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(md); ++ VirtIOMEM *vmem = &dev->vdev; ++ VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem); ++ ++ return vmc->get_memory_region(vmem, errp); ++} ++ ++static void virtio_ccw_mem_decide_memslots(MemoryDeviceState *md, ++ unsigned int limit) ++{ ++ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(md); ++ VirtIOMEM *vmem = VIRTIO_MEM(&dev->vdev); ++ VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem); ++ ++ vmc->decide_memslots(vmem, limit); ++} ++ ++static unsigned int virtio_ccw_mem_get_memslots(MemoryDeviceState *md) ++{ ++ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(md); ++ VirtIOMEM *vmem = VIRTIO_MEM(&dev->vdev); ++ VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem); ++ ++ return vmc->get_memslots(vmem); ++} ++ ++static uint64_t virtio_ccw_mem_get_plugged_size(const MemoryDeviceState *md, ++ Error **errp) ++{ ++ return object_property_get_uint(OBJECT(md), VIRTIO_MEM_SIZE_PROP, ++ errp); ++} ++ ++static void virtio_ccw_mem_fill_device_info(const MemoryDeviceState *md, ++ MemoryDeviceInfo *info) ++{ ++ VirtioMEMDeviceInfo *vi = 
g_new0(VirtioMEMDeviceInfo, 1); ++ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(md); ++ VirtIOMEM *vmem = &dev->vdev; ++ VirtIOMEMClass *vpc = VIRTIO_MEM_GET_CLASS(vmem); ++ DeviceState *vdev = DEVICE(md); ++ ++ if (vdev->id) { ++ vi->id = g_strdup(vdev->id); ++ } ++ ++ /* let the real device handle everything else */ ++ vpc->fill_device_info(vmem, vi); ++ ++ info->u.virtio_mem.data = vi; ++ info->type = MEMORY_DEVICE_INFO_KIND_VIRTIO_MEM; ++} ++ ++static uint64_t virtio_ccw_mem_get_min_alignment(const MemoryDeviceState *md) ++{ ++ return object_property_get_uint(OBJECT(md), VIRTIO_MEM_BLOCK_SIZE_PROP, ++ &error_abort); ++} ++ ++static void virtio_ccw_mem_size_change_notify(Notifier *notifier, void *data) ++{ ++ VirtIOMEMCcw *dev = container_of(notifier, VirtIOMEMCcw, ++ size_change_notifier); ++ DeviceState *vdev = DEVICE(dev); ++ char *qom_path = object_get_canonical_path(OBJECT(dev)); ++ const uint64_t * const size_p = data; ++ ++ qapi_event_send_memory_device_size_change(vdev->id, *size_p, qom_path); ++ g_free(qom_path); ++} ++ ++static void virtio_ccw_mem_unplug_request_check(VirtIOMDCcw *vmd, Error **errp) ++{ ++ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(vmd); ++ VirtIOMEM *vmem = &dev->vdev; ++ VirtIOMEMClass *vpc = VIRTIO_MEM_GET_CLASS(vmem); ++ ++ vpc->unplug_request_check(vmem, errp); ++} ++ ++static void virtio_ccw_mem_get_requested_size(Object *obj, Visitor *v, ++ const char *name, void *opaque, ++ Error **errp) ++{ ++ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(obj); ++ ++ object_property_get(OBJECT(&dev->vdev), name, v, errp); ++} ++ ++static void virtio_ccw_mem_set_requested_size(Object *obj, Visitor *v, ++ const char *name, void *opaque, ++ Error **errp) ++{ ++ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(obj); ++ DeviceState *vdev = DEVICE(obj); ++ ++ /* ++ * If we passed virtio_ccw_mem_unplug_request_check(), making sure that ++ * the requested size is 0, don't allow modifying the requested size ++ * anymore, otherwise the VM might end up hotplugging memory before ++ * handling the 
unplug request. ++ */ ++ if (vdev->pending_deleted_event) { ++ error_setg(errp, "'%s' cannot be changed if the device is in the" ++ " process of unplug", name); ++ return; ++ } ++ ++ object_property_set(OBJECT(&dev->vdev), name, v, errp); ++} ++ ++static Property virtio_ccw_mem_properties[] = { ++ DEFINE_PROP_BIT("ioeventfd", VirtioCcwDevice, flags, ++ VIRTIO_CCW_FLAG_USE_IOEVENTFD_BIT, true), ++ DEFINE_PROP_UINT32("max_revision", VirtioCcwDevice, max_rev, ++ VIRTIO_CCW_MAX_REV), ++ DEFINE_PROP_END_OF_LIST(), ++}; ++ ++static void virtio_ccw_mem_class_init(ObjectClass *klass, void *data) ++{ ++ DeviceClass *dc = DEVICE_CLASS(klass); ++ VirtIOCCWDeviceClass *k = VIRTIO_CCW_DEVICE_CLASS(klass); ++ MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass); ++ VirtIOMDCcwClass *vmdc = VIRTIO_MD_CCW_CLASS(klass); ++ ++ k->realize = virtio_ccw_mem_realize; ++ set_bit(DEVICE_CATEGORY_MISC, dc->categories); ++ device_class_set_props(dc, virtio_ccw_mem_properties); ++ ++ mdc->get_addr = virtio_ccw_mem_get_addr; ++ mdc->set_addr = virtio_ccw_mem_set_addr; ++ mdc->get_plugged_size = virtio_ccw_mem_get_plugged_size; ++ mdc->get_memory_region = virtio_ccw_mem_get_memory_region; ++ mdc->decide_memslots = virtio_ccw_mem_decide_memslots; ++ mdc->get_memslots = virtio_ccw_mem_get_memslots; ++ mdc->fill_device_info = virtio_ccw_mem_fill_device_info; ++ mdc->get_min_alignment = virtio_ccw_mem_get_min_alignment; ++ ++ vmdc->unplug_request_check = virtio_ccw_mem_unplug_request_check; ++} ++ ++static void virtio_ccw_mem_instance_init(Object *obj) ++{ ++ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(obj); ++ VirtIOMEMClass *vmc; ++ VirtIOMEM *vmem; ++ ++ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), ++ TYPE_VIRTIO_MEM); ++ ++ dev->size_change_notifier.notify = virtio_ccw_mem_size_change_notify; ++ vmem = &dev->vdev; ++ vmc = VIRTIO_MEM_GET_CLASS(vmem); ++ /* ++ * We never remove the notifier again, as we expect both devices to ++ * disappear at the same time. 
++ */ ++ vmc->add_size_change_notifier(vmem, &dev->size_change_notifier); ++ ++ object_property_add_alias(obj, VIRTIO_MEM_BLOCK_SIZE_PROP, ++ OBJECT(&dev->vdev), VIRTIO_MEM_BLOCK_SIZE_PROP); ++ object_property_add_alias(obj, VIRTIO_MEM_SIZE_PROP, OBJECT(&dev->vdev), ++ VIRTIO_MEM_SIZE_PROP); ++ object_property_add(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP, "size", ++ virtio_ccw_mem_get_requested_size, ++ virtio_ccw_mem_set_requested_size, NULL, NULL); ++} ++ ++static const TypeInfo virtio_ccw_mem = { ++ .name = TYPE_VIRTIO_MEM_CCW, ++ .parent = TYPE_VIRTIO_MD_CCW, ++ .instance_size = sizeof(VirtIOMEMCcw), ++ .instance_init = virtio_ccw_mem_instance_init, ++ .class_init = virtio_ccw_mem_class_init, ++}; ++ ++static void virtio_ccw_mem_register_types(void) ++{ ++ type_register_static(&virtio_ccw_mem); ++} ++type_init(virtio_ccw_mem_register_types) +diff --git a/hw/s390x/virtio-ccw-mem.h b/hw/s390x/virtio-ccw-mem.h +new file mode 100644 +index 0000000000..738ab2c744 +--- /dev/null ++++ b/hw/s390x/virtio-ccw-mem.h +@@ -0,0 +1,34 @@ ++/* ++ * Virtio MEM CCW device ++ * ++ * Copyright (C) 2024 Red Hat, Inc. ++ * ++ * Authors: ++ * David Hildenbrand ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2. ++ * See the COPYING file in the top-level directory. 
++ */ ++ ++#ifndef HW_S390X_VIRTIO_CCW_MEM_H ++#define HW_S390X_VIRTIO_CCW_MEM_H ++ ++#include "virtio-ccw-md.h" ++#include "hw/virtio/virtio-mem.h" ++#include "qom/object.h" ++ ++typedef struct VirtIOMEMCcw VirtIOMEMCcw; ++ ++/* ++ * virtio-mem-ccw: This extends VirtIOMDCcw ++ */ ++#define TYPE_VIRTIO_MEM_CCW "virtio-mem-ccw" ++DECLARE_INSTANCE_CHECKER(VirtIOMEMCcw, VIRTIO_MEM_CCW, TYPE_VIRTIO_MEM_CCW) ++ ++struct VirtIOMEMCcw { ++ VirtIOMDCcw parent_obj; ++ VirtIOMEM vdev; ++ Notifier size_change_notifier; ++}; ++ ++#endif /* HW_S390X_VIRTIO_CCW_MEM_H */ +diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c +index 00da98b6e1..c9f8a23bbc 100644 +--- a/hw/virtio/virtio-mem.c ++++ b/hw/virtio/virtio-mem.c +@@ -61,6 +61,8 @@ static uint32_t virtio_mem_default_thp_size(void) + } else if (qemu_real_host_page_size() == 64 * KiB) { + default_thp_size = 512 * MiB; + } ++#elif defined(__s390x__) ++ default_thp_size = 1 * MiB; + #endif + + return default_thp_size; +@@ -161,7 +163,7 @@ static bool virtio_mem_has_shared_zeropage(RAMBlock *rb) + * necessary (as the section size can change). But it's more likely that the + * section size will rather get smaller and not bigger over time. 
+ */ +-#if defined(TARGET_X86_64) || defined(TARGET_I386) ++#if defined(TARGET_X86_64) || defined(TARGET_I386) || defined(TARGET_S390X) + #define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB)) + #elif defined(TARGET_ARM) + #define VIRTIO_MEM_USABLE_EXTENT (2 * (512 * MiB)) +-- +2.48.1 + diff --git a/SOURCES/kvm-scripts-improve-error-from-qemu-trace-stap-on-missin.patch b/SOURCES/kvm-scripts-improve-error-from-qemu-trace-stap-on-missin.patch new file mode 100644 index 0000000..a6c8257 --- /dev/null +++ b/SOURCES/kvm-scripts-improve-error-from-qemu-trace-stap-on-missin.patch @@ -0,0 +1,90 @@ +From 314804fa4be6d653a7809b64076d4f3133a0ff59 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= +Date: Fri, 6 Dec 2024 11:45:24 +0000 +Subject: [PATCH 8/9] scripts: improve error from qemu-trace-stap on missing + 'stap' +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Daniel P. Berrangé +RH-MergeRequest: 345: scripts: improve error from qemu-trace-stap on missing 'stap' +RH-Jira: RHEL-47340 +RH-Acked-by: Gerd Hoffmann +RH-Acked-by: Stefan Hajnoczi +RH-Commit: [1/2] c90635123f40e683488d83b59c71a5236c6d4659 (berrange/centos-src-qemu) + +If the 'stap' binary is missing in $PATH, a huge trace is thrown + + $ qemu-trace-stap list /usr/bin/qemu-system-x86_64 + Traceback (most recent call last): + File "/usr/bin/qemu-trace-stap", line 169, in <module> + main() + File "/usr/bin/qemu-trace-stap", line 165, in main + args.func(args) + File "/usr/bin/qemu-trace-stap", line 83, in cmd_run + subprocess.call(stapargs) + File "/usr/lib64/python3.12/subprocess.py", line 389, in call + with Popen(*popenargs, **kwargs) as p: + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib64/python3.12/subprocess.py", line 1026, in __init__ + self._execute_child(args, executable, preexec_fn, close_fds, + File "/usr/lib64/python3.12/subprocess.py", line 1955, in _execute_child + raise child_exception_type(errno_num, err_msg, err_filename) +
FileNotFoundError: [Errno 2] No such file or directory: 'stap' + +With this change the user now gets + + $ qemu-trace-stap list /usr/bin/qemu-system-x86_64 + Unable to find 'stap' in $PATH + +Signed-off-by: Daniel P. Berrangé +Reviewed-by: Philippe Mathieu-Daudé +Message-id: 20241206114524.1666664-1-berrange@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 9976be3911a2d0503f026ae37c17077273bf30ee) +--- + scripts/qemu-trace-stap | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/scripts/qemu-trace-stap b/scripts/qemu-trace-stap +index eb6e951ff2..e983460ee7 100755 +--- a/scripts/qemu-trace-stap ++++ b/scripts/qemu-trace-stap +@@ -56,6 +56,7 @@ def tapset_dir(binary): + + + def cmd_run(args): ++ stap = which("stap") + prefix = probe_prefix(args.binary) + tapsets = tapset_dir(args.binary) + +@@ -76,7 +77,7 @@ def cmd_run(args): + + # We request an 8MB buffer, since the stap default 1MB buffer + # can be easily overflowed by frequently firing QEMU traces +- stapargs = ["stap", "-s", "8", "-I", tapsets ] ++ stapargs = [stap, "-s", "8", "-I", tapsets ] + if args.pid is not None: + stapargs.extend(["-x", args.pid]) + stapargs.extend(["-e", script]) +@@ -84,6 +85,7 @@ def cmd_run(args): + + + def cmd_list(args): ++ stap = which("stap") + tapsets = tapset_dir(args.binary) + + if args.verbose: +@@ -96,7 +98,7 @@ def cmd_list(args): + + if verbose: + print("Listing probes with name '%s'" % script) +- proc = subprocess.Popen(["stap", "-I", tapsets, "-l", script], ++ proc = subprocess.Popen([stap, "-I", tapsets, "-l", script], + stdout=subprocess.PIPE, + universal_newlines=True) + out, err = proc.communicate() +-- +2.48.1 + diff --git a/SOURCES/kvm-target-i386-Add-PerfMonV2-feature-bit.patch b/SOURCES/kvm-target-i386-Add-PerfMonV2-feature-bit.patch new file mode 100644 index 0000000..217987d --- /dev/null +++ b/SOURCES/kvm-target-i386-Add-PerfMonV2-feature-bit.patch @@ -0,0 +1,105 @@ +From 1587da0703e72cca8325a20b709280b8df85d066 
Mon Sep 17 00:00:00 2001 +From: Sandipan Das +Date: Thu, 24 Oct 2024 17:18:21 -0500 +Subject: [PATCH 16/57] target/i386: Add PerfMonV2 feature bit + +RH-Author: John Allen +RH-MergeRequest: 378: Update EPYC Models and Feature Bits +RH-Jira: RHEL-52649 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [2/8] ec365cf4ac558c6c83f7a957e8df937cb6fbfa27 (johnalle/qemu-kvm-fork) + +CPUID leaf 0x80000022, i.e. ExtPerfMonAndDbg, advertises new performance +monitoring features for AMD processors. Bit 0 of EAX indicates support +for Performance Monitoring Version 2 (PerfMonV2) features. If found to +be set during PMU initialization, the EBX bits can be used to determine +the number of available counters for different PMUs. It also denotes the +availability of global control and status registers. + +Add the required CPUID feature word and feature bit to allow guests to +make use of the PerfMonV2 features. + +Signed-off-by: Sandipan Das +Signed-off-by: Babu Moger +Reviewed-by: Zhao Liu +Link: https://lore.kernel.org/r/a96f00ee2637674c63c61e9fc4dee343ea818053.1729807947.git.babu.moger@amd.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit 209b0ac12074341d0093985eb9ad3e7edb252ce5) + +JIRA: https://issues.redhat.com/browse/RHEL-52649 + +Signed-off-by: John Allen +--- + target/i386/cpu.c | 26 ++++++++++++++++++++++++++ + target/i386/cpu.h | 4 ++++ + 2 files changed, 30 insertions(+) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 53069a460c..4546369836 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1246,6 +1246,22 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + .tcg_features = 0, + .unmigratable_flags = 0, + }, ++ [FEAT_8000_0022_EAX] = { ++ .type = CPUID_FEATURE_WORD, ++ .feat_names = { ++ "perfmon-v2", NULL, NULL, NULL, ++ NULL, NULL, NULL, NULL, ++ NULL, NULL, NULL, NULL, ++ NULL, NULL, NULL, NULL, ++ NULL, NULL, NULL, NULL, ++ NULL, NULL, NULL, NULL, ++ NULL, NULL, NULL, NULL, ++ NULL, NULL, NULL, NULL, ++ }, ++ .cpuid = { .eax = 
0x80000022, .reg = R_EAX, }, ++ .tcg_features = 0, ++ .unmigratable_flags = 0, ++ }, + [FEAT_XSAVE] = { + .type = CPUID_FEATURE_WORD, + .feat_names = { +@@ -7096,6 +7112,16 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, + *edx = 0; + } + break; ++ case 0x80000022: ++ *eax = *ebx = *ecx = *edx = 0; ++ /* AMD Extended Performance Monitoring and Debug */ ++ if (kvm_enabled() && cpu->enable_pmu && ++ (env->features[FEAT_8000_0022_EAX] & CPUID_8000_0022_EAX_PERFMON_V2)) { ++ *eax |= CPUID_8000_0022_EAX_PERFMON_V2; ++ *ebx |= kvm_arch_get_supported_cpuid(cs->kvm_state, index, count, ++ R_EBX) & 0xf; ++ } ++ break; + case 0xC0000000: + *eax = env->cpuid_xlevel2; + *ebx = 0; +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 9a16239b8e..cf92a4972c 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -638,6 +638,7 @@ typedef enum FeatureWord { + FEAT_8000_0007_EDX, /* CPUID[8000_0007].EDX */ + FEAT_8000_0008_EBX, /* CPUID[8000_0008].EBX */ + FEAT_8000_0021_EAX, /* CPUID[8000_0021].EAX */ ++ FEAT_8000_0022_EAX, /* CPUID[8000_0022].EAX */ + FEAT_C000_0001_EDX, /* CPUID[C000_0001].EDX */ + FEAT_KVM, /* CPUID[4000_0001].EAX (KVM_CPUID_FEATURES) */ + FEAT_KVM_HINTS, /* CPUID[4000_0001].EDX */ +@@ -1044,6 +1045,9 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); + /* Not vulnerable to SRSO at the user-kernel boundary */ + #define CPUID_8000_0021_EAX_SRSO_USER_KERNEL_NO (1U << 30) + ++/* Performance Monitoring Version 2 */ ++#define CPUID_8000_0022_EAX_PERFMON_V2 (1U << 0) ++ + #define CPUID_XSAVE_XSAVEOPT (1U << 0) + #define CPUID_XSAVE_XSAVEC (1U << 1) + #define CPUID_XSAVE_XGETBV1 (1U << 2) +-- +2.39.3 + diff --git a/SOURCES/kvm-target-i386-Add-couple-of-feature-bits-in-CPUID_Fn80.patch b/SOURCES/kvm-target-i386-Add-couple-of-feature-bits-in-CPUID_Fn80.patch new file mode 100644 index 0000000..6214232 --- /dev/null +++ b/SOURCES/kvm-target-i386-Add-couple-of-feature-bits-in-CPUID_Fn80.patch @@ -0,0 +1,83 @@ 
+From 79ac76edecdbbe253ad42385730aac18cdc40bd7 Mon Sep 17 00:00:00 2001 +From: Babu Moger +Date: Fri, 20 Jun 2025 14:54:53 -0500 +Subject: [PATCH 20/57] target/i386: Add couple of feature bits in + CPUID_Fn80000021_EAX + +RH-Author: John Allen +RH-MergeRequest: 378: Update EPYC Models and Feature Bits +RH-Jira: RHEL-52649 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [6/8] c6507eb24fcef271fdd6a234d2c255ef38c4e691 (johnalle/qemu-kvm-fork) + +Add a CPUID bit indicating that a WRMSR to MSR_FS_BASE, MSR_GS_BASE, or +MSR_KERNEL_GS_BASE is non-serializing, and a PREFETCHI bit that +indicates +support for IC prefetch. + +CPUID_Fn80000021_EAX +Bit Feature description +20 Indicates support for IC prefetch. +1 FsGsKernelGsBaseNonSerializing. + WRMSR to FS_BASE, GS_BASE and KernelGSbase are +non-serializing. + +Link: https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/programmer-references/57238.zip +Signed-off-by: Babu Moger +Reviewed-by: Maksim Davydov +Reviewed-by: Zhao Liu +Link: https://lore.kernel.org/r/a5f6283a59579b09ac345b3f21ecb3b3b2d92451.1746734284.git.babu.moger@amd.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit dfd5b456108a75588ab094358ba5754787146d3d) + +JIRA: https://issues.redhat.com/browse/RHEL-52649 + +Signed-off-by: John Allen +--- + target/i386/cpu.c | 4 ++-- + target/i386/cpu.h | 4 ++++ + 2 files changed, 6 insertions(+), 2 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 7d48c51767..2218071fca 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1233,12 +1233,12 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + [FEAT_8000_0021_EAX] = { + .type = CPUID_FEATURE_WORD, + .feat_names = { +- "no-nested-data-bp", NULL, "lfence-always-serializing", NULL, ++ "no-nested-data-bp", "fs-gs-base-ns", "lfence-always-serializing", NULL, + NULL, NULL, "null-sel-clr-base", NULL, + "auto-ibrs", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, +- NULL, NULL, NULL, NULL, ++ "prefetchi", NULL,
NULL, NULL, + NULL, NULL, NULL, NULL, + "ibpb-brtype", "srso-no", "srso-user-kernel-no", NULL, + }, +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index cf92a4972c..e513e5f62d 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -1030,12 +1030,16 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); + + /* Processor ignores nested data breakpoints */ + #define CPUID_8000_0021_EAX_NO_NESTED_DATA_BP (1U << 0) ++/* WRMSR to FS_BASE, GS_BASE, or KERNEL_GS_BASE is non-serializing */ ++#define CPUID_8000_0021_EAX_FS_GS_BASE_NS (1U << 1) + /* LFENCE is always serializing */ + #define CPUID_8000_0021_EAX_LFENCE_ALWAYS_SERIALIZING (1U << 2) + /* Null Selector Clears Base */ + #define CPUID_8000_0021_EAX_NULL_SEL_CLR_BASE (1U << 6) + /* Automatic IBRS */ + #define CPUID_8000_0021_EAX_AUTO_IBRS (1U << 8) ++/* Indicates support for IC prefetch */ ++#define CPUID_8000_0021_EAX_PREFETCHI (1U << 20) + /* Selective Branch Predictor Barrier */ + #define CPUID_8000_0021_EAX_SBPB (1U << 27) + /* IBPB includes branch type prediction flushing */ +-- +2.39.3 + diff --git a/SOURCES/kvm-target-i386-Add-support-for-EPYC-Turin-model.patch b/SOURCES/kvm-target-i386-Add-support-for-EPYC-Turin-model.patch new file mode 100644 index 0000000..6293b4c --- /dev/null +++ b/SOURCES/kvm-target-i386-Add-support-for-EPYC-Turin-model.patch @@ -0,0 +1,200 @@ +From e0b59a57883faac254cd75cc243fed784ad4975b Mon Sep 17 00:00:00 2001 +From: Babu Moger +Date: Thu, 8 May 2025 14:58:04 -0500 +Subject: [PATCH 22/57] target/i386: Add support for EPYC-Turin model + +RH-Author: John Allen +RH-MergeRequest: 378: Update EPYC Models and Feature Bits +RH-Jira: RHEL-52649 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [8/8] 42e90c7fc6bf858f98ae3a6be35d70b824a7d6bf (johnalle/qemu-kvm-fork) + +Add the support for AMD EPYC zen 5 processors (EPYC-Turin). + +Add the following new feature bits on top of the feature bits from +the previous generation EPYC models. 
+ +movdiri : Move Doubleword as Direct Store Instruction +movdir64b : Move 64 Bytes as Direct Store Instruction +avx512-vp2intersect : AVX512 Vector Pair Intersection to a Pair + of Mask Register +avx-vnni : AVX VNNI Instruction +prefetchi : Indicates support for IC prefetch +sbpb : Selective Branch Predictor Barrier +ibpb-brtype : IBPB includes branch type prediction flushing +srso-user-kernel-no : Not vulnerable to SRSO at the user-kernel boundary + +Link: https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/programmer-references/57238.zip +Link: https://www.amd.com/content/dam/amd/en/documents/corporate/cr/speculative-return-stack-overflow-whitepaper.pdf +Signed-off-by: Babu Moger +Reviewed-by: Zhao Liu +Link: https://lore.kernel.org/r/b4fa7708a0e1453d2e9b8ec3dc881feb92eeca0b.1746734284.git.babu.moger@amd.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit 3771a4daa273ba17cb27309984413790d1df5651) + +JIRA: https://issues.redhat.com/browse/RHEL-52649 + +Signed-off-by: John Allen +--- + target/i386/cpu.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 138 insertions(+) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 2bc2d41259..fdfa183f4d 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -2651,6 +2651,61 @@ static const CPUCaches epyc_genoa_v2_cache_info = { + .share_level = CPU_TOPO_LEVEL_DIE, + }, + }; ++ ++static const CPUCaches epyc_turin_cache_info = { ++ .l1d_cache = &(CPUCacheInfo) { ++ .type = DATA_CACHE, ++ .level = 1, ++ .size = 48 * KiB, ++ .line_size = 64, ++ .associativity = 12, ++ .partitions = 1, ++ .sets = 64, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .share_level = CPU_TOPO_LEVEL_CORE, ++ }, ++ .l1i_cache = &(CPUCacheInfo) { ++ .type = INSTRUCTION_CACHE, ++ .level = 1, ++ .size = 32 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 64, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .share_level = CPU_TOPO_LEVEL_CORE, ++ }, ++ .l2_cache = 
&(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 2, ++ .size = 1 * MiB, ++ .line_size = 64, ++ .associativity = 16, ++ .partitions = 1, ++ .sets = 1024, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .inclusive = true, ++ .share_level = CPU_TOPO_LEVEL_CORE, ++ }, ++ .l3_cache = &(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 3, ++ .size = 32 * MiB, ++ .line_size = 64, ++ .associativity = 16, ++ .partitions = 1, ++ .sets = 32768, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .no_invd_sharing = true, ++ .complex_indexing = false, ++ .share_level = CPU_TOPO_LEVEL_DIE, ++ }, ++}; ++ + /* The following VMX features are not supported by KVM and are left out in the + * CPU definitions: + * +@@ -5644,6 +5699,89 @@ static const X86CPUDefinition builtin_x86_defs[] = { + { /* end of list */ } + } + }, ++ { ++ .name = "EPYC-Turin", ++ .level = 0xd, ++ .vendor = CPUID_VENDOR_AMD, ++ .family = 26, ++ .model = 0, ++ .stepping = 0, ++ .features[FEAT_1_ECX] = ++ CPUID_EXT_RDRAND | CPUID_EXT_F16C | CPUID_EXT_AVX | ++ CPUID_EXT_XSAVE | CPUID_EXT_AES | CPUID_EXT_POPCNT | ++ CPUID_EXT_MOVBE | CPUID_EXT_SSE42 | CPUID_EXT_SSE41 | ++ CPUID_EXT_PCID | CPUID_EXT_CX16 | CPUID_EXT_FMA | ++ CPUID_EXT_SSSE3 | CPUID_EXT_MONITOR | CPUID_EXT_PCLMULQDQ | ++ CPUID_EXT_SSE3, ++ .features[FEAT_1_EDX] = ++ CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | CPUID_CLFLUSH | ++ CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | CPUID_PGE | ++ CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | CPUID_MCE | ++ CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | CPUID_DE | ++ CPUID_VME | CPUID_FP87, ++ .features[FEAT_6_EAX] = ++ CPUID_6_EAX_ARAT, ++ .features[FEAT_7_0_EBX] = ++ CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_AVX2 | ++ CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ERMS | ++ CPUID_7_0_EBX_INVPCID | CPUID_7_0_EBX_AVX512F | ++ CPUID_7_0_EBX_AVX512DQ | CPUID_7_0_EBX_RDSEED | CPUID_7_0_EBX_ADX | ++ CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_AVX512IFMA | ++ 
CPUID_7_0_EBX_CLFLUSHOPT | CPUID_7_0_EBX_CLWB | ++ CPUID_7_0_EBX_AVX512CD | CPUID_7_0_EBX_SHA_NI | ++ CPUID_7_0_EBX_AVX512BW | CPUID_7_0_EBX_AVX512VL, ++ .features[FEAT_7_0_ECX] = ++ CPUID_7_0_ECX_AVX512_VBMI | CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_PKU | ++ CPUID_7_0_ECX_AVX512_VBMI2 | CPUID_7_0_ECX_GFNI | ++ CPUID_7_0_ECX_VAES | CPUID_7_0_ECX_VPCLMULQDQ | ++ CPUID_7_0_ECX_AVX512VNNI | CPUID_7_0_ECX_AVX512BITALG | ++ CPUID_7_0_ECX_AVX512_VPOPCNTDQ | CPUID_7_0_ECX_LA57 | ++ CPUID_7_0_ECX_RDPID | CPUID_7_0_ECX_MOVDIRI | ++ CPUID_7_0_ECX_MOVDIR64B, ++ .features[FEAT_7_0_EDX] = ++ CPUID_7_0_EDX_FSRM | CPUID_7_0_EDX_AVX512_VP2INTERSECT, ++ .features[FEAT_7_1_EAX] = ++ CPUID_7_1_EAX_AVX_VNNI | CPUID_7_1_EAX_AVX512_BF16, ++ .features[FEAT_8000_0001_ECX] = ++ CPUID_EXT3_OSVW | CPUID_EXT3_3DNOWPREFETCH | ++ CPUID_EXT3_MISALIGNSSE | CPUID_EXT3_SSE4A | CPUID_EXT3_ABM | ++ CPUID_EXT3_CR8LEG | CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM | ++ CPUID_EXT3_TOPOEXT | CPUID_EXT3_PERFCORE, ++ .features[FEAT_8000_0001_EDX] = ++ CPUID_EXT2_LM | CPUID_EXT2_RDTSCP | CPUID_EXT2_PDPE1GB | ++ CPUID_EXT2_FFXSR | CPUID_EXT2_MMXEXT | CPUID_EXT2_NX | ++ CPUID_EXT2_SYSCALL, ++ .features[FEAT_8000_0007_EBX] = ++ CPUID_8000_0007_EBX_OVERFLOW_RECOV | CPUID_8000_0007_EBX_SUCCOR, ++ .features[FEAT_8000_0008_EBX] = ++ CPUID_8000_0008_EBX_CLZERO | CPUID_8000_0008_EBX_XSAVEERPTR | ++ CPUID_8000_0008_EBX_WBNOINVD | CPUID_8000_0008_EBX_IBPB | ++ CPUID_8000_0008_EBX_IBRS | CPUID_8000_0008_EBX_STIBP | ++ CPUID_8000_0008_EBX_STIBP_ALWAYS_ON | ++ CPUID_8000_0008_EBX_AMD_SSBD | CPUID_8000_0008_EBX_AMD_PSFD, ++ .features[FEAT_8000_0021_EAX] = ++ CPUID_8000_0021_EAX_NO_NESTED_DATA_BP | ++ CPUID_8000_0021_EAX_FS_GS_BASE_NS | ++ CPUID_8000_0021_EAX_LFENCE_ALWAYS_SERIALIZING | ++ CPUID_8000_0021_EAX_NULL_SEL_CLR_BASE | ++ CPUID_8000_0021_EAX_AUTO_IBRS | CPUID_8000_0021_EAX_PREFETCHI | ++ CPUID_8000_0021_EAX_SBPB | CPUID_8000_0021_EAX_IBPB_BRTYPE | ++ CPUID_8000_0021_EAX_SRSO_USER_KERNEL_NO, ++ .features[FEAT_8000_0022_EAX] = 
++ CPUID_8000_0022_EAX_PERFMON_V2, ++ .features[FEAT_XSAVE] = ++ CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC | ++ CPUID_XSAVE_XGETBV1 | CPUID_XSAVE_XSAVES, ++ .features[FEAT_SVM] = ++ CPUID_SVM_NPT | CPUID_SVM_LBRV | CPUID_SVM_NRIPSAVE | ++ CPUID_SVM_TSCSCALE | CPUID_SVM_VMCBCLEAN | CPUID_SVM_FLUSHASID | ++ CPUID_SVM_PAUSEFILTER | CPUID_SVM_PFTHRESHOLD | ++ CPUID_SVM_V_VMSAVE_VMLOAD | CPUID_SVM_VGIF | ++ CPUID_SVM_VNMI | CPUID_SVM_SVME_ADDR_CHK, ++ .xlevel = 0x80000022, ++ .model_id = "AMD EPYC-Turin Processor", ++ .cache_info = &epyc_turin_cache_info, ++ }, + }; + + /* +-- +2.39.3 + diff --git a/SOURCES/kvm-target-i386-Exclude-hv-syndbg-from-hv-passthrough.patch b/SOURCES/kvm-target-i386-Exclude-hv-syndbg-from-hv-passthrough.patch new file mode 100644 index 0000000..df4e5e3 --- /dev/null +++ b/SOURCES/kvm-target-i386-Exclude-hv-syndbg-from-hv-passthrough.patch @@ -0,0 +1,102 @@ +From 0288537593cd4452a2523b686b297dad3735f7f8 Mon Sep 17 00:00:00 2001 +From: Vitaly Kuznetsov +Date: Thu, 17 Apr 2025 15:30:50 +0200 +Subject: [PATCH 2/2] target/i386: Exclude 'hv-syndbg' from 'hv-passthrough' + +RH-Author: Vitaly Kuznetsov +RH-MergeRequest: 352: hyper-v: exclude 'hv-syndbg' from 'hv-passthrough' set +RH-Jira: RHEL-7130 +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Ani Sinha +RH-Acked-by: Emanuele Giuseppe Esposito +RH-Commit: [2/2] bf276ad5b340139f71b92e656a0c7756a55dec0b (vkuznets/qemu-kvm) + +Windows with Hyper-V role enabled doesn't boot with 'hv-passthrough' when +no debugger is configured, this significantly limits the usefulness of the +feature as there's no support for subtracting Hyper-V features from CPU +flags at this moment (e.g. "-cpu host,hv-passthrough,-hv-syndbg" does not +work). While this is also theoretically fixable, 'hv-syndbg' is likely +very special and unneeded in the default set. Genuine Hyper-V doesn't seem +to enable it either. 
+ +Introduce 'skip_passthrough' flag to 'kvm_hyperv_properties' and use it as +one-off to skip 'hv-syndbg' when enabling features in 'hv-passthrough' +mode. Note, "-cpu host,hv-passthrough,hv-syndbg" can still be used if +needed. + +As both 'hv-passthrough' and 'hv-syndbg' are debug features, the change +should not have any effect on production environments. + +Signed-off-by: Vitaly Kuznetsov +Link: https://lore.kernel.org/r/20240917160051.2637594-3-vkuznets@redhat.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit 7d7b9c7655a26e09c800ef40373078a80e90d9f3) +Signed-off-by: Vitaly Kuznetsov +--- + docs/system/i386/hyperv.rst | 13 +++++++++---- + target/i386/kvm/kvm.c | 7 +++++-- + 2 files changed, 14 insertions(+), 6 deletions(-) + +diff --git a/docs/system/i386/hyperv.rst b/docs/system/i386/hyperv.rst +index 2505dc4c86..009947e391 100644 +--- a/docs/system/i386/hyperv.rst ++++ b/docs/system/i386/hyperv.rst +@@ -262,14 +262,19 @@ Supplementary features + ``hv-passthrough`` + In some cases (e.g. during development) it may make sense to use QEMU in + 'pass-through' mode and give Windows guests all enlightenments currently +- supported by KVM. This pass-through mode is enabled by "hv-passthrough" CPU +- flag. ++ supported by KVM. + + Note: ``hv-passthrough`` flag only enables enlightenments which are known to QEMU + (have corresponding 'hv-' flag) and copies ``hv-spinlocks`` and ``hv-vendor-id`` + values from KVM to QEMU. ``hv-passthrough`` overrides all other 'hv-' settings on +- the command line. Also, enabling this flag effectively prevents migration as the +- list of enabled enlightenments may differ between target and destination hosts. ++ the command line. ++ ++ Note: ``hv-passthrough`` does not enable ``hv-syndbg`` which can prevent certain ++ Windows guests from booting when used without proper configuration. If needed, ++ ``hv-syndbg`` can be enabled additionally. 
++ ++ Note: ``hv-passthrough`` effectively prevents migration as the list of enabled ++ enlightenments may differ between source and destination hosts. + + ``hv-enforce-cpuid`` + By default, KVM allows the guest to use all currently supported Hyper-V +diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c +index 5bf77d761f..94b678e9e3 100644 +--- a/target/i386/kvm/kvm.c ++++ b/target/i386/kvm/kvm.c +@@ -913,6 +913,7 @@ static struct { + uint32_t bits; + } flags[2]; + uint64_t dependencies; ++ bool skip_passthrough; + } kvm_hyperv_properties[] = { + [HYPERV_FEAT_RELAXED] = { + .desc = "relaxed timing (hv-relaxed)", +@@ -1041,7 +1042,8 @@ static struct { + {.func = HV_CPUID_FEATURES, .reg = R_EDX, + .bits = HV_FEATURE_DEBUG_MSRS_AVAILABLE} + }, +- .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_RELAXED) ++ .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_RELAXED), ++ .skip_passthrough = true, + }, + [HYPERV_FEAT_MSR_BITMAP] = { + .desc = "enlightened MSR-Bitmap (hv-emsr-bitmap)", +@@ -1450,7 +1452,8 @@ bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp) + * hv_build_cpuid_leaf() uses this info to build guest CPUIDs. 
+ */ + for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) { +- if (hyperv_feature_supported(cs, feat)) { ++ if (hyperv_feature_supported(cs, feat) && ++ !kvm_hyperv_properties[feat].skip_passthrough) { + cpu->hyperv_features |= BIT(feat); + } + } +-- +2.48.1 + diff --git a/SOURCES/kvm-target-i386-Expose-bits-related-to-SRSO-vulnerabilit.patch b/SOURCES/kvm-target-i386-Expose-bits-related-to-SRSO-vulnerabilit.patch new file mode 100644 index 0000000..7c666cf --- /dev/null +++ b/SOURCES/kvm-target-i386-Expose-bits-related-to-SRSO-vulnerabilit.patch @@ -0,0 +1,84 @@ +From 1d667a354613385b1552fdbae91799882776f908 Mon Sep 17 00:00:00 2001 +From: Babu Moger +Date: Thu, 24 Oct 2024 17:18:23 -0500 +Subject: [PATCH 15/57] target/i386: Expose bits related to SRSO vulnerability + +RH-Author: John Allen +RH-MergeRequest: 378: Update EPYC Models and Feature Bits +RH-Jira: RHEL-52649 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [1/8] 9a6f4126ab023269e8afb3537aaa94ae60228382 (johnalle/qemu-kvm-fork) + +Add the following bits related to Speculative Return Stack Overflow (SRSO). +Guests can make use of these bits if supported. + +These bits are reported via CPUID Fn8000_0021_EAX. +=================================================================== +Bit Feature Description +=================================================================== +27 SBPB Indicates support for the Selective Branch Predictor Barrier. +28 IBPB_BRTYPE MSR_PRED_CMD[IBPB] flushes all branch type predictions. +29 SRSO_NO Not vulnerable to SRSO. +30 SRSO_USER_KERNEL_NO Not vulnerable to SRSO at the user-kernel boundary. 
+=================================================================== + +Link: https://www.amd.com/content/dam/amd/en/documents/corporate/cr/speculative-return-stack-overflow-whitepaper.pdf +Link: https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/programmer-references/57238.zip +Signed-off-by: Babu Moger +Link: https://lore.kernel.org/r/dadbd70c38f4e165418d193918a3747bd715c5f4.1729807947.git.babu.moger@amd.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit 2ec282b8eaaddf5c136f7566b5f61d80288a2065) + +JIRA: https://issues.redhat.com/browse/RHEL-52649 + +Signed-off-by: John Allen +--- + target/i386/cpu.c | 2 +- + target/i386/cpu.h | 14 +++++++++++--- + 2 files changed, 12 insertions(+), 4 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 0a955b1c45..53069a460c 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1240,7 +1240,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, +- NULL, NULL, NULL, NULL, ++ "ibpb-brtype", "srso-no", "srso-user-kernel-no", NULL, + }, + .cpuid = { .eax = 0x80000021, .reg = R_EAX, }, + .tcg_features = 0, +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index 4da9ed5930..9a16239b8e 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -1028,13 +1028,21 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); + #define CPUID_8000_0008_EBX_AMD_PSFD (1U << 28) + + /* Processor ignores nested data breakpoints */ +-#define CPUID_8000_0021_EAX_No_NESTED_DATA_BP (1U << 0) ++#define CPUID_8000_0021_EAX_NO_NESTED_DATA_BP (1U << 0) + /* LFENCE is always serializing */ + #define CPUID_8000_0021_EAX_LFENCE_ALWAYS_SERIALIZING (1U << 2) + /* Null Selector Clears Base */ +-#define CPUID_8000_0021_EAX_NULL_SEL_CLR_BASE (1U << 6) ++#define CPUID_8000_0021_EAX_NULL_SEL_CLR_BASE (1U << 6) + /* Automatic IBRS */ +-#define CPUID_8000_0021_EAX_AUTO_IBRS (1U << 8) ++#define 
CPUID_8000_0021_EAX_AUTO_IBRS (1U << 8) ++/* Selective Branch Predictor Barrier */ ++#define CPUID_8000_0021_EAX_SBPB (1U << 27) ++/* IBPB includes branch type prediction flushing */ ++#define CPUID_8000_0021_EAX_IBPB_BRTYPE (1U << 28) ++/* Not vulnerable to Speculative Return Stack Overflow */ ++#define CPUID_8000_0021_EAX_SRSO_NO (1U << 29) ++/* Not vulnerable to SRSO at the user-kernel boundary */ ++#define CPUID_8000_0021_EAX_SRSO_USER_KERNEL_NO (1U << 30) + + #define CPUID_XSAVE_XSAVEOPT (1U << 0) + #define CPUID_XSAVE_XSAVEC (1U << 1) +-- +2.39.3 + diff --git a/SOURCES/kvm-target-i386-Fix-conditional-CONFIG_SYNDBG-enablement.patch b/SOURCES/kvm-target-i386-Fix-conditional-CONFIG_SYNDBG-enablement.patch new file mode 100644 index 0000000..049f3fe --- /dev/null +++ b/SOURCES/kvm-target-i386-Fix-conditional-CONFIG_SYNDBG-enablement.patch @@ -0,0 +1,108 @@ +From 26d5561f7a07c9bc6f8ea9a602c53bfa5daddd13 Mon Sep 17 00:00:00 2001 +From: Vitaly Kuznetsov +Date: Thu, 17 Apr 2025 15:30:42 +0200 +Subject: [PATCH 1/2] target/i386: Fix conditional CONFIG_SYNDBG enablement + +RH-Author: Vitaly Kuznetsov +RH-MergeRequest: 352: hyper-v: exclude 'hv-syndbg' from 'hv-passthrough' set +RH-Jira: RHEL-7130 +RH-Acked-by: Maxim Levitsky +RH-Acked-by: Ani Sinha +RH-Acked-by: Emanuele Giuseppe Esposito +RH-Commit: [1/2] 0446b6202fb3dbae865da0dc7e08092399661f7a (vkuznets/qemu-kvm) + +Putting HYPERV_FEAT_SYNDBG entry under "#ifdef CONFIG_SYNDBG" in +'kvm_hyperv_properties' array is wrong: as HYPERV_FEAT_SYNDBG is not +the highest feature number, the result is an empty (zeroed) entry in +the array (and not a skipped entry!). hyperv_feature_supported() is +designed to check that all CPUID bits are set but for a zeroed +feature in 'kvm_hyperv_properties' it returns 'true' so QEMU considers +HYPERV_FEAT_SYNDBG as always supported, regardless of whether KVM host +actually supports it. 
+ +To fix the issue, leave HYPERV_FEAT_SYNDBG's definition in +'kvm_hyperv_properties' array, there's nothing wrong in having it defined +even when 'CONFIG_SYNDBG' is not set. Instead, put "hv-syndbg" CPU property +under '#ifdef CONFIG_SYNDBG' to alter the existing behavior when the flag +is silently skipped in !CONFIG_SYNDBG builds. + +Leave an 'assert' sentinel in hyperv_feature_supported() making sure there +are no 'holes' or improperly defined features in 'kvm_hyperv_properties'. + +Fixes: d8701185f40c ("hw: hyperv: Initial commit for Synthetic Debugging device") +Signed-off-by: Vitaly Kuznetsov +Link: https://lore.kernel.org/r/20240917160051.2637594-2-vkuznets@redhat.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit bbf3810f2c4f97bd7a1982d3e0ff0f00295b8169) +Signed-off-by: Vitaly Kuznetsov +--- + target/i386/cpu.c | 2 ++ + target/i386/kvm/kvm.c | 11 +++++++---- + 2 files changed, 9 insertions(+), 4 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index a70a3aa670..0a955b1c45 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -8450,8 +8450,10 @@ static Property x86_cpu_properties[] = { + HYPERV_FEAT_TLBFLUSH_DIRECT, 0), + DEFINE_PROP_ON_OFF_AUTO("hv-no-nonarch-coresharing", X86CPU, + hyperv_no_nonarch_cs, ON_OFF_AUTO_OFF), ++#ifdef CONFIG_SYNDBG + DEFINE_PROP_BIT64("hv-syndbg", X86CPU, hyperv_features, + HYPERV_FEAT_SYNDBG, 0), ++#endif + DEFINE_PROP_BOOL("hv-passthrough", X86CPU, hyperv_passthrough, false), + DEFINE_PROP_BOOL("hv-enforce-cpuid", X86CPU, hyperv_enforce_cpuid, false), + +diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c +index d0329a4ed7..5bf77d761f 100644 +--- a/target/i386/kvm/kvm.c ++++ b/target/i386/kvm/kvm.c +@@ -1035,7 +1035,6 @@ static struct { + .bits = HV_DEPRECATING_AEOI_RECOMMENDED} + } + }, +-#ifdef CONFIG_SYNDBG + [HYPERV_FEAT_SYNDBG] = { + .desc = "Enable synthetic kernel debugger channel (hv-syndbg)", + .flags = { +@@ -1044,7 +1043,6 @@ static struct { + }, + .dependencies = 
BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_RELAXED) + }, +-#endif + [HYPERV_FEAT_MSR_BITMAP] = { + .desc = "enlightened MSR-Bitmap (hv-emsr-bitmap)", + .flags = { +@@ -1296,6 +1294,13 @@ static bool hyperv_feature_supported(CPUState *cs, int feature) + uint32_t func, bits; + int i, reg; + ++ /* ++ * kvm_hyperv_properties needs to define at least one CPUID flag which ++ * must be used to detect the feature, it's hard to say whether it is ++ * supported or not otherwise. ++ */ ++ assert(kvm_hyperv_properties[feature].flags[0].func); ++ + for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) { + + func = kvm_hyperv_properties[feature].flags[i].func; +@@ -3925,13 +3930,11 @@ static int kvm_put_msrs(X86CPU *cpu, int level) + kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, + env->msr_hv_tsc_emulation_status); + } +-#ifdef CONFIG_SYNDBG + if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG) && + has_msr_hv_syndbg_options) { + kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS, + hyperv_syndbg_query_options()); + } +-#endif + } + if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) { + kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, +-- +2.48.1 + diff --git a/SOURCES/kvm-target-i386-Update-EPYC-CPU-model-for-Cache-property.patch b/SOURCES/kvm-target-i386-Update-EPYC-CPU-model-for-Cache-property.patch new file mode 100644 index 0000000..f5cfccc --- /dev/null +++ b/SOURCES/kvm-target-i386-Update-EPYC-CPU-model-for-Cache-property.patch @@ -0,0 +1,147 @@ +From 4091d13096918dfff5f3a292b43e613ea888ddc1 Mon Sep 17 00:00:00 2001 +From: Babu Moger +Date: Thu, 8 May 2025 14:57:59 -0500 +Subject: [PATCH 17/57] target/i386: Update EPYC CPU model for Cache property, + RAS, SVM feature bits + +RH-Author: John Allen +RH-MergeRequest: 378: Update EPYC Models and Feature Bits +RH-Jira: RHEL-52649 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [3/8] afc52d066ad5f66732b0bb04142e210c52896708 (johnalle/qemu-kvm-fork) + +Found that some of the cache properties are not set 
correctly for EPYC models. + +l1d_cache.no_invd_sharing should not be true. +l1i_cache.no_invd_sharing should not be true. + +L2.self_init should be true. +L2.inclusive should be true. + +L3.inclusive should not be true. +L3.no_invd_sharing should be true. + +Fix the cache properties. + +Also add the missing RAS and SVM features bits on AMD +EPYC CPU models. The SVM feature bits are used in nested guests. + +succor : Software uncorrectable error containment and recovery capability. +overflow-recov : MCA overflow recovery support. +lbrv : LBR virtualization +tsc-scale : MSR based TSC rate control +vmcb-clean : VMCB clean bits +flushbyasid : Flush by ASID +pause-filter : Pause intercept filter +pfthreshold : PAUSE filter threshold +v-vmsave-vmload : Virtualized VMLOAD and VMSAVE +vgif : Virtualized GIF + +Signed-off-by: Babu Moger +Reviewed-by: Maksim Davydov +Reviewed-by: Zhao Liu +Link: https://lore.kernel.org/r/515941861700d7066186c9600bc5d96a1741ef0c.1746734284.git.babu.moger@amd.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit 397db937e85d7b9f5a6f0b30764786cef09d1ff3) + +JIRA: https://issues.redhat.com/browse/RHEL-52649 + +Signed-off-by: John Allen +--- + target/i386/cpu.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 73 insertions(+) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 4546369836..32c575f63b 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -2166,6 +2166,60 @@ static CPUCaches epyc_v4_cache_info = { + }, + }; + ++static CPUCaches epyc_v5_cache_info = { ++ .l1d_cache = &(CPUCacheInfo) { ++ .type = DATA_CACHE, ++ .level = 1, ++ .size = 32 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 64, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .share_level = CPU_TOPO_LEVEL_CORE, ++ }, ++ .l1i_cache = &(CPUCacheInfo) { ++ .type = INSTRUCTION_CACHE, ++ .level = 1, ++ .size = 64 * KiB, ++ .line_size = 64, ++ .associativity = 4, ++ .partitions = 1, ++ .sets = 256, ++ 
.lines_per_tag = 1, ++ .self_init = true, ++ .share_level = CPU_TOPO_LEVEL_CORE, ++ }, ++ .l2_cache = &(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 2, ++ .size = 512 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 1024, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .inclusive = true, ++ .share_level = CPU_TOPO_LEVEL_CORE, ++ }, ++ .l3_cache = &(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 3, ++ .size = 8 * MiB, ++ .line_size = 64, ++ .associativity = 16, ++ .partitions = 1, ++ .sets = 8192, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .no_invd_sharing = true, ++ .complex_indexing = false, ++ .share_level = CPU_TOPO_LEVEL_DIE, ++ }, ++}; ++ + static const CPUCaches epyc_rome_cache_info = { + .l1d_cache = &(CPUCacheInfo) { + .type = DATA_CACHE, +@@ -5059,6 +5113,25 @@ static const X86CPUDefinition builtin_x86_defs[] = { + }, + .cache_info = &epyc_v4_cache_info + }, ++ { ++ .version = 5, ++ .props = (PropValue[]) { ++ { "overflow-recov", "on" }, ++ { "succor", "on" }, ++ { "lbrv", "on" }, ++ { "tsc-scale", "on" }, ++ { "vmcb-clean", "on" }, ++ { "flushbyasid", "on" }, ++ { "pause-filter", "on" }, ++ { "pfthreshold", "on" }, ++ { "v-vmsave-vmload", "on" }, ++ { "vgif", "on" }, ++ { "model-id", ++ "AMD EPYC-v5 Processor" }, ++ { /* end of list */ } ++ }, ++ .cache_info = &epyc_v5_cache_info ++ }, + { /* end of list */ } + } + }, +-- +2.39.3 + diff --git a/SOURCES/kvm-target-i386-Update-EPYC-Genoa-for-Cache-property-per.patch b/SOURCES/kvm-target-i386-Update-EPYC-Genoa-for-Cache-property-per.patch new file mode 100644 index 0000000..59260d8 --- /dev/null +++ b/SOURCES/kvm-target-i386-Update-EPYC-Genoa-for-Cache-property-per.patch @@ -0,0 +1,167 @@ +From 768e39f40b394eb4524a83857b86e8f7497f4414 Mon Sep 17 00:00:00 2001 +From: Babu Moger +Date: Thu, 8 May 2025 14:58:03 -0500 +Subject: [PATCH 21/57] target/i386: Update EPYC-Genoa for Cache property, + perfmon-v2, RAS and SVM feature bits + +RH-Author: John Allen 
+RH-MergeRequest: 378: Update EPYC Models and Feature Bits +RH-Jira: RHEL-52649 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [7/8] b144233a1115385a1f792c4454f4511173f753d8 (johnalle/qemu-kvm-fork) + +Found that some of the cache properties are not set correctly for EPYC models. +l1d_cache.no_invd_sharing should not be true. +l1i_cache.no_invd_sharing should not be true. + +L2.self_init should be true. +L2.inclusive should be true. + +L3.inclusive should not be true. +L3.no_invd_sharing should be true. + +Fix these cache properties. + +Also add the missing RAS and SVM feature bits on the AMD EPYC-Genoa model. +The SVM feature bits are used in nested guests. + +perfmon-v2 : Allow guests to make use of the PerfMonV2 features. +succor : Software uncorrectable error containment and recovery capability. +overflow-recov : MCA overflow recovery support. +lbrv : LBR virtualization +tsc-scale : MSR based TSC rate control +vmcb-clean : VMCB clean bits +flushbyasid : Flush by ASID +pause-filter : Pause intercept filter +pfthreshold : PAUSE filter threshold +v-vmsave-vmload: Virtualized VMLOAD and VMSAVE +vgif : Virtualized GIF +fs-gs-base-ns : WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing + +The feature details are available in the APM listed below [1]. +[1] AMD64 Architecture Programmer's Manual Volume 2: System Programming +Publication # 24593 Revision 3.41. 
+ +Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 +Signed-off-by: Babu Moger +Reviewed-by: Maksim Davydov +Reviewed-by: Zhao Liu +Link: https://lore.kernel.org/r/afe3f05d4116124fd5795f28fc23d7b396140313.1746734284.git.babu.moger@amd.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit abc92cc8488b5dbcc403b5be24d8092180605101) + +JIRA: https://issues.redhat.com/browse/RHEL-52649 + +Signed-off-by: John Allen +--- + target/i386/cpu.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 79 insertions(+), 1 deletion(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 2218071fca..2bc2d41259 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -2598,6 +2598,59 @@ static const CPUCaches epyc_genoa_cache_info = { + }, + }; + ++static const CPUCaches epyc_genoa_v2_cache_info = { ++ .l1d_cache = &(CPUCacheInfo) { ++ .type = DATA_CACHE, ++ .level = 1, ++ .size = 32 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 64, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .share_level = CPU_TOPO_LEVEL_CORE, ++ }, ++ .l1i_cache = &(CPUCacheInfo) { ++ .type = INSTRUCTION_CACHE, ++ .level = 1, ++ .size = 32 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 64, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .share_level = CPU_TOPO_LEVEL_CORE, ++ }, ++ .l2_cache = &(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 2, ++ .size = 1 * MiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 2048, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .inclusive = true, ++ .share_level = CPU_TOPO_LEVEL_CORE, ++ }, ++ .l3_cache = &(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 3, ++ .size = 32 * MiB, ++ .line_size = 64, ++ .associativity = 16, ++ .partitions = 1, ++ .sets = 32768, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .no_invd_sharing = true, ++ .complex_indexing = false, ++ .share_level = CPU_TOPO_LEVEL_DIE, ++ }, ++}; + /* The following 
VMX features are not supported by KVM and are left out in the + * CPU definitions: + * +@@ -5530,7 +5583,7 @@ static const X86CPUDefinition builtin_x86_defs[] = { + CPUID_8000_0008_EBX_STIBP_ALWAYS_ON | + CPUID_8000_0008_EBX_AMD_SSBD | CPUID_8000_0008_EBX_AMD_PSFD, + .features[FEAT_8000_0021_EAX] = +- CPUID_8000_0021_EAX_No_NESTED_DATA_BP | ++ CPUID_8000_0021_EAX_NO_NESTED_DATA_BP | + CPUID_8000_0021_EAX_LFENCE_ALWAYS_SERIALIZING | + CPUID_8000_0021_EAX_NULL_SEL_CLR_BASE | + CPUID_8000_0021_EAX_AUTO_IBRS, +@@ -5565,6 +5618,31 @@ static const X86CPUDefinition builtin_x86_defs[] = { + .xlevel = 0x80000022, + .model_id = "AMD EPYC-Genoa Processor", + .cache_info = &epyc_genoa_cache_info, ++ .versions = (X86CPUVersionDefinition[]) { ++ { .version = 1 }, ++ { ++ .version = 2, ++ .props = (PropValue[]) { ++ { "overflow-recov", "on" }, ++ { "succor", "on" }, ++ { "lbrv", "on" }, ++ { "tsc-scale", "on" }, ++ { "vmcb-clean", "on" }, ++ { "flushbyasid", "on" }, ++ { "pause-filter", "on" }, ++ { "pfthreshold", "on" }, ++ { "v-vmsave-vmload", "on" }, ++ { "vgif", "on" }, ++ { "fs-gs-base-ns", "on" }, ++ { "perfmon-v2", "on" }, ++ { "model-id", ++ "AMD EPYC-Genoa-v2 Processor" }, ++ { /* end of list */ } ++ }, ++ .cache_info = &epyc_genoa_v2_cache_info ++ }, ++ { /* end of list */ } ++ } + }, + }; + +-- +2.39.3 + diff --git a/SOURCES/kvm-target-i386-Update-EPYC-Milan-CPU-model-for-Cache-pr.patch b/SOURCES/kvm-target-i386-Update-EPYC-Milan-CPU-model-for-Cache-pr.patch new file mode 100644 index 0000000..ba143ba --- /dev/null +++ b/SOURCES/kvm-target-i386-Update-EPYC-Milan-CPU-model-for-Cache-pr.patch @@ -0,0 +1,146 @@ +From a2cd6a5aac0ba2bbb50d2ff22b83c8b9d7761028 Mon Sep 17 00:00:00 2001 +From: Babu Moger +Date: Thu, 8 May 2025 14:58:01 -0500 +Subject: [PATCH 19/57] target/i386: Update EPYC-Milan CPU model for Cache + property, RAS, SVM feature bits + +RH-Author: John Allen +RH-MergeRequest: 378: Update EPYC Models and Feature Bits +RH-Jira: RHEL-52649 +RH-Acked-by: Miroslav 
Rezanina +RH-Commit: [5/8] e9e34ade25cb7be05d40745e1d074c0356d1923f (johnalle/qemu-kvm-fork) + +Found that some of the cache properties are not set correctly for EPYC models. +l1d_cache.no_invd_sharing should not be true. +l1i_cache.no_invd_sharing should not be true. + +L2.self_init should be true. +L2.inclusive should be true. + +L3.inclusive should not be true. +L3.no_invd_sharing should be true. + +Fix these cache properties. + +Also add the missing RAS and SVM features bits on AMD EPYC-Milan model. +The SVM feature bits are used in nested guests. + +succor : Software uncorrectable error containment and recovery capability. +overflow-recov : MCA overflow recovery support. +lbrv : LBR virtualization +tsc-scale : MSR based TSC rate control +vmcb-clean : VMCB clean bits +flushbyasid : Flush by ASID +pause-filter : Pause intercept filter +pfthreshold : PAUSE filter threshold +v-vmsave-vmload : Virtualized VMLOAD and VMSAVE +vgif : Virtualized GIF + +Signed-off-by: Babu Moger +Reviewed-by: Maksim Davydov +Reviewed-by: Zhao Liu +Link: https://lore.kernel.org/r/c619c0e09a9d5d496819ed48d69181d65f416891.1746734284.git.babu.moger@amd.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit fc014d9ba5b26b27401e0e88a4e1ef827c68fe64) + +JIRA: https://issues.redhat.com/browse/RHEL-52649 + +Signed-off-by: John Allen +--- + target/i386/cpu.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 73 insertions(+) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index a73b5bfca4..7d48c51767 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -2490,6 +2490,60 @@ static const CPUCaches epyc_milan_v2_cache_info = { + }, + }; + ++static const CPUCaches epyc_milan_v3_cache_info = { ++ .l1d_cache = &(CPUCacheInfo) { ++ .type = DATA_CACHE, ++ .level = 1, ++ .size = 32 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 64, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .share_level = CPU_TOPO_LEVEL_CORE, ++ }, ++ 
.l1i_cache = &(CPUCacheInfo) { ++ .type = INSTRUCTION_CACHE, ++ .level = 1, ++ .size = 32 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 64, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .share_level = CPU_TOPO_LEVEL_CORE, ++ }, ++ .l2_cache = &(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 2, ++ .size = 512 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 1024, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .inclusive = true, ++ .share_level = CPU_TOPO_LEVEL_CORE, ++ }, ++ .l3_cache = &(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 3, ++ .size = 32 * MiB, ++ .line_size = 64, ++ .associativity = 16, ++ .partitions = 1, ++ .sets = 32768, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .no_invd_sharing = true, ++ .complex_indexing = false, ++ .share_level = CPU_TOPO_LEVEL_DIE, ++ }, ++}; ++ + static const CPUCaches epyc_genoa_cache_info = { + .l1d_cache = &(CPUCacheInfo) { + .type = DATA_CACHE, +@@ -5418,6 +5472,25 @@ static const X86CPUDefinition builtin_x86_defs[] = { + }, + .cache_info = &epyc_milan_v2_cache_info + }, ++ { ++ .version = 3, ++ .props = (PropValue[]) { ++ { "overflow-recov", "on" }, ++ { "succor", "on" }, ++ { "lbrv", "on" }, ++ { "tsc-scale", "on" }, ++ { "vmcb-clean", "on" }, ++ { "flushbyasid", "on" }, ++ { "pause-filter", "on" }, ++ { "pfthreshold", "on" }, ++ { "v-vmsave-vmload", "on" }, ++ { "vgif", "on" }, ++ { "model-id", ++ "AMD EPYC-Milan-v3 Processor" }, ++ { /* end of list */ } ++ }, ++ .cache_info = &epyc_milan_v3_cache_info ++ }, + { /* end of list */ } + } + }, +-- +2.39.3 + diff --git a/SOURCES/kvm-target-i386-Update-EPYC-Rome-CPU-model-for-Cache-pro.patch b/SOURCES/kvm-target-i386-Update-EPYC-Rome-CPU-model-for-Cache-pro.patch new file mode 100644 index 0000000..82c2cb4 --- /dev/null +++ b/SOURCES/kvm-target-i386-Update-EPYC-Rome-CPU-model-for-Cache-pro.patch @@ -0,0 +1,147 @@ +From dc86ee01fb27b174871ff8be9095ed1a20513772 Mon Sep 17 00:00:00 2001 
+From: Babu Moger +Date: Thu, 8 May 2025 14:58:00 -0500 +Subject: [PATCH 18/57] target/i386: Update EPYC-Rome CPU model for Cache + property, RAS, SVM feature bits + +RH-Author: John Allen +RH-MergeRequest: 378: Update EPYC Models and Feature Bits +RH-Jira: RHEL-52649 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [4/8] f23618215eee3c54d9ba52c5b74f5d574c522649 (johnalle/qemu-kvm-fork) + +Found that some of the cache properties are not set correctly for EPYC models. + +l1d_cache.no_invd_sharing should not be true. +l1i_cache.no_invd_sharing should not be true. + +L2.self_init should be true. +L2.inclusive should be true. + +L3.inclusive should not be true. +L3.no_invd_sharing should be true. + +Fix these cache properties. + +Also add the missing RAS and SVM features bits on AMD EPYC-Rome. The SVM +feature bits are used in nested guests. + +succor : Software uncorrectable error containment and recovery capability. +overflow-recov : MCA overflow recovery support. +lbrv : LBR virtualization +tsc-scale : MSR based TSC rate control +vmcb-clean : VMCB clean bits +flushbyasid : Flush by ASID +pause-filter : Pause intercept filter +pfthreshold : PAUSE filter threshold +v-vmsave-vmload : Virtualized VMLOAD and VMSAVE +vgif : Virtualized GIF + +Signed-off-by: Babu Moger +Reviewed-by: Maksim Davydov +Reviewed-by: Zhao Liu +Link: https://lore.kernel.org/r/8265af72057b84c99ac3a02a5487e32759cc69b1.1746734284.git.babu.moger@amd.com +Signed-off-by: Paolo Bonzini +(cherry picked from commit 83d940e9700527ff080416ce2fa52ee1f4771d72) + +JIRA: https://issues.redhat.com/browse/RHEL-52649 + +Signed-off-by: John Allen +--- + target/i386/cpu.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 73 insertions(+) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 32c575f63b..a73b5bfca4 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -2328,6 +2328,60 @@ static const CPUCaches epyc_rome_v3_cache_info = { + }, + }; + ++static const CPUCaches 
epyc_rome_v5_cache_info = { ++ .l1d_cache = &(CPUCacheInfo) { ++ .type = DATA_CACHE, ++ .level = 1, ++ .size = 32 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 64, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .share_level = CPU_TOPO_LEVEL_CORE, ++ }, ++ .l1i_cache = &(CPUCacheInfo) { ++ .type = INSTRUCTION_CACHE, ++ .level = 1, ++ .size = 32 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 64, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .share_level = CPU_TOPO_LEVEL_CORE, ++ }, ++ .l2_cache = &(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 2, ++ .size = 512 * KiB, ++ .line_size = 64, ++ .associativity = 8, ++ .partitions = 1, ++ .sets = 1024, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .inclusive = true, ++ .share_level = CPU_TOPO_LEVEL_CORE, ++ }, ++ .l3_cache = &(CPUCacheInfo) { ++ .type = UNIFIED_CACHE, ++ .level = 3, ++ .size = 16 * MiB, ++ .line_size = 64, ++ .associativity = 16, ++ .partitions = 1, ++ .sets = 16384, ++ .lines_per_tag = 1, ++ .self_init = true, ++ .no_invd_sharing = true, ++ .complex_indexing = false, ++ .share_level = CPU_TOPO_LEVEL_DIE, ++ }, ++}; ++ + static const CPUCaches epyc_milan_cache_info = { + .l1d_cache = &(CPUCacheInfo) { + .type = DATA_CACHE, +@@ -5270,6 +5324,25 @@ static const X86CPUDefinition builtin_x86_defs[] = { + { /* end of list */ } + }, + }, ++ { ++ .version = 5, ++ .props = (PropValue[]) { ++ { "overflow-recov", "on" }, ++ { "succor", "on" }, ++ { "lbrv", "on" }, ++ { "tsc-scale", "on" }, ++ { "vmcb-clean", "on" }, ++ { "flushbyasid", "on" }, ++ { "pause-filter", "on" }, ++ { "pfthreshold", "on" }, ++ { "v-vmsave-vmload", "on" }, ++ { "vgif", "on" }, ++ { "model-id", ++ "AMD EPYC-Rome-v5 Processor" }, ++ { /* end of list */ } ++ }, ++ .cache_info = &epyc_rome_v5_cache_info ++ }, + { /* end of list */ } + } + }, +-- +2.39.3 + diff --git a/SOURCES/kvm-target-s390-Convert-CPU-to-Resettable-interface.patch 
b/SOURCES/kvm-target-s390-Convert-CPU-to-Resettable-interface.patch new file mode 100644 index 0000000..6b93825 --- /dev/null +++ b/SOURCES/kvm-target-s390-Convert-CPU-to-Resettable-interface.patch @@ -0,0 +1,284 @@ +From 50c4cbbe0a8849dd0c720c6e706498cb0d46f5b3 Mon Sep 17 00:00:00 2001 +From: Peter Maydell +Date: Fri, 13 Sep 2024 15:31:43 +0100 +Subject: [PATCH 04/26] target/s390: Convert CPU to Resettable interface + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [4/26] 157b29ced6b92ecec5e69f8bc60d0183a0c88fa0 (thuth/qemu-kvm-cs) + +Convert the s390 CPU to the Resettable interface. This is slightly +more involved than the other CPU types were (see commits +9130cade5fc22..d66e64dd006df) because S390 has its own set of +different kinds of reset with different behaviours that it needs to +trigger. + +We handle this by adding these reset types to the Resettable +ResetType enum. Now instead of having an underlying implementation +of reset that is s390-specific and which might be called either +directly or via the DeviceClass::reset method, we can implement only +the Resettable hold phase method, and have the places that need to +trigger an s390-specific reset type do so by calling +resettable_reset(). + +The other option would have been to smuggle in the s390 reset +type via, for instance, a field in the CPU state that we set +in s390_do_cpu_initial_reset() etc and then examined in the +reset method, but doing it this way seems cleaner. + +The motivation for this change is that this is the last caller +of the legacy device_class_set_parent_reset() function, and +removing that will let us clean up some glue code that we added +for the transition to three-phase reset. 
+ +Signed-off-by: Peter Maydell +Reviewed-by: Nina Schoetterl-Glausch +Reviewed-by: Richard Henderson +Acked-by: Thomas Huth +Message-id: 20240830145812.1967042-4-peter.maydell@linaro.org +(cherry picked from commit cf7f61d13f28f32d0b14abb70ce1bd9e41623b2e) +Signed-off-by: Thomas Huth +--- + docs/devel/reset.rst | 10 ++++++++++ + include/hw/resettable.h | 2 ++ + target/s390x/cpu.c | 38 +++++++++++++++++--------------------- + target/s390x/cpu.h | 21 ++++----------------- + target/s390x/sigp.c | 8 ++------ + 5 files changed, 35 insertions(+), 44 deletions(-) + +diff --git a/docs/devel/reset.rst b/docs/devel/reset.rst +index 24ab630465..d2799eba7a 100644 +--- a/docs/devel/reset.rst ++++ b/docs/devel/reset.rst +@@ -44,6 +44,16 @@ The Resettable interface handles reset types with an enum ``ResetType``: + value on each cold reset, such as RNG seed information, and which they + must not reinitialize on a snapshot-load reset. + ++``RESET_TYPE_S390_CPU_NORMAL`` ++ This is only used for S390 CPU objects; it clears interrupts, stops ++ processing, and clears the TLB, but does not touch register contents. ++ ++``RESET_TYPE_S390_CPU_INITIAL`` ++ This is only used for S390 CPU objects; it does everything ++ ``RESET_TYPE_S390_CPU_NORMAL`` does and also clears the PSW, prefix, ++ FPC, timer and control registers. It does not touch gprs, fprs or acrs. ++ ++ + Devices which implement reset methods must treat any unknown ``ResetType`` + as equivalent to ``RESET_TYPE_COLD``; this will reduce the amount of + existing code we need to change if we add more types in future. 
+diff --git a/include/hw/resettable.h b/include/hw/resettable.h +index 7e249deb8b..83b561fc83 100644 +--- a/include/hw/resettable.h ++++ b/include/hw/resettable.h +@@ -36,6 +36,8 @@ typedef struct ResettableState ResettableState; + typedef enum ResetType { + RESET_TYPE_COLD, + RESET_TYPE_SNAPSHOT_LOAD, ++ RESET_TYPE_S390_CPU_INITIAL, ++ RESET_TYPE_S390_CPU_NORMAL, + } ResetType; + + /* +diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c +index 0fbfcd35d8..4e41a3dff5 100644 +--- a/target/s390x/cpu.c ++++ b/target/s390x/cpu.c +@@ -32,6 +32,7 @@ + #include "sysemu/hw_accel.h" + #include "hw/qdev-properties.h" + #include "hw/qdev-properties-system.h" ++#include "hw/resettable.h" + #include "fpu/softfloat-helpers.h" + #include "disas/capstone.h" + #include "sysemu/tcg.h" +@@ -162,23 +163,25 @@ static void s390_query_cpu_fast(CPUState *cpu, CpuInfoFast *value) + #endif + } + +-/* S390CPUClass::reset() */ +-static void s390_cpu_reset(CPUState *s, cpu_reset_type type) ++/* S390CPUClass Resettable reset_hold phase method */ ++static void s390_cpu_reset_hold(Object *obj, ResetType type) + { +- S390CPU *cpu = S390_CPU(s); ++ S390CPU *cpu = S390_CPU(obj); + S390CPUClass *scc = S390_CPU_GET_CLASS(cpu); + CPUS390XState *env = &cpu->env; +- DeviceState *dev = DEVICE(s); + +- scc->parent_reset(dev); ++ if (scc->parent_phases.hold) { ++ scc->parent_phases.hold(obj, type); ++ } + cpu->env.sigp_order = 0; + s390_cpu_set_state(S390_CPU_STATE_STOPPED, cpu); + + switch (type) { +- case S390_CPU_RESET_CLEAR: ++ default: ++ /* RESET_TYPE_COLD: power on or "clear" reset */ + memset(env, 0, offsetof(CPUS390XState, start_initial_reset_fields)); + /* fall through */ +- case S390_CPU_RESET_INITIAL: ++ case RESET_TYPE_S390_CPU_INITIAL: + /* initial reset does not clear everything! 
*/ + memset(&env->start_initial_reset_fields, 0, + offsetof(CPUS390XState, start_normal_reset_fields) - +@@ -203,7 +206,7 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + set_float_detect_tininess(float_tininess_before_rounding, + &env->fpu_status); + /* fall through */ +- case S390_CPU_RESET_NORMAL: ++ case RESET_TYPE_S390_CPU_NORMAL: + env->psw.mask &= ~PSW_MASK_RI; + memset(&env->start_normal_reset_fields, 0, + offsetof(CPUS390XState, end_reset_fields) - +@@ -212,20 +215,18 @@ static void s390_cpu_reset(CPUState *s, cpu_reset_type type) + env->pfault_token = -1UL; + env->bpbc = false; + break; +- default: +- g_assert_not_reached(); + } + + /* Reset state inside the kernel that we cannot access yet from QEMU. */ + if (kvm_enabled()) { + switch (type) { +- case S390_CPU_RESET_CLEAR: ++ default: + kvm_s390_reset_vcpu_clear(cpu); + break; +- case S390_CPU_RESET_INITIAL: ++ case RESET_TYPE_S390_CPU_INITIAL: + kvm_s390_reset_vcpu_initial(cpu); + break; +- case S390_CPU_RESET_NORMAL: ++ case RESET_TYPE_S390_CPU_NORMAL: + kvm_s390_reset_vcpu_normal(cpu); + break; + } +@@ -315,12 +316,6 @@ static Property s390x_cpu_properties[] = { + DEFINE_PROP_END_OF_LIST() + }; + +-static void s390_cpu_reset_full(DeviceState *dev) +-{ +- CPUState *s = CPU(dev); +- return s390_cpu_reset(s, S390_CPU_RESET_CLEAR); +-} +- + #ifdef CONFIG_TCG + #include "hw/core/tcg-cpu-ops.h" + +@@ -383,15 +378,16 @@ static void s390_cpu_class_init(ObjectClass *oc, void *data) + S390CPUClass *scc = S390_CPU_CLASS(oc); + CPUClass *cc = CPU_CLASS(scc); + DeviceClass *dc = DEVICE_CLASS(oc); ++ ResettableClass *rc = RESETTABLE_CLASS(oc); + + device_class_set_parent_realize(dc, s390_cpu_realizefn, + &scc->parent_realize); + device_class_set_props(dc, s390x_cpu_properties); + dc->user_creatable = true; + +- device_class_set_parent_reset(dc, s390_cpu_reset_full, &scc->parent_reset); ++ resettable_class_set_parent_phases(rc, NULL, s390_cpu_reset_hold, NULL, ++ &scc->parent_phases); + +- 
scc->reset = s390_cpu_reset; + cc->class_by_name = s390_cpu_class_by_name, + cc->has_work = s390_cpu_has_work; + cc->mmu_index = s390x_cpu_mmu_index; +diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h +index d6b75ad0e0..6a64472403 100644 +--- a/target/s390x/cpu.h ++++ b/target/s390x/cpu.h +@@ -177,19 +177,11 @@ struct ArchCPU { + uint32_t irqstate_saved_size; + }; + +-typedef enum cpu_reset_type { +- S390_CPU_RESET_NORMAL, +- S390_CPU_RESET_INITIAL, +- S390_CPU_RESET_CLEAR, +-} cpu_reset_type; +- + /** + * S390CPUClass: + * @parent_realize: The parent class' realize handler. +- * @parent_reset: The parent class' reset handler. ++ * @parent_phases: The parent class' reset phase handlers. + * @load_normal: Performs a load normal. +- * @cpu_reset: Performs a CPU reset. +- * @initial_cpu_reset: Performs an initial CPU reset. + * + * An S/390 CPU model. + */ +@@ -203,9 +195,8 @@ struct S390CPUClass { + const char *desc; + + DeviceRealize parent_realize; +- DeviceReset parent_reset; ++ ResettablePhases parent_phases; + void (*load_normal)(CPUState *cpu); +- void (*reset)(CPUState *cpu, cpu_reset_type type); + }; + + #ifndef CONFIG_USER_ONLY +@@ -872,16 +863,12 @@ static inline void s390_do_cpu_full_reset(CPUState *cs, run_on_cpu_data arg) + + static inline void s390_do_cpu_reset(CPUState *cs, run_on_cpu_data arg) + { +- S390CPUClass *scc = S390_CPU_GET_CLASS(cs); +- +- scc->reset(cs, S390_CPU_RESET_NORMAL); ++ resettable_reset(OBJECT(cs), RESET_TYPE_S390_CPU_NORMAL); + } + + static inline void s390_do_cpu_initial_reset(CPUState *cs, run_on_cpu_data arg) + { +- S390CPUClass *scc = S390_CPU_GET_CLASS(cs); +- +- scc->reset(cs, S390_CPU_RESET_INITIAL); ++ resettable_reset(OBJECT(cs), RESET_TYPE_S390_CPU_INITIAL); + } + + static inline void s390_do_cpu_load_normal(CPUState *cs, run_on_cpu_data arg) +diff --git a/target/s390x/sigp.c b/target/s390x/sigp.c +index ad0ad61177..08aaecf12b 100644 +--- a/target/s390x/sigp.c ++++ b/target/s390x/sigp.c +@@ -251,24 +251,20 @@ static 
void sigp_restart(CPUState *cs, run_on_cpu_data arg) + + static void sigp_initial_cpu_reset(CPUState *cs, run_on_cpu_data arg) + { +- S390CPU *cpu = S390_CPU(cs); +- S390CPUClass *scc = S390_CPU_GET_CLASS(cpu); + SigpInfo *si = arg.host_ptr; + + cpu_synchronize_state(cs); +- scc->reset(cs, S390_CPU_RESET_INITIAL); ++ resettable_reset(OBJECT(cs), RESET_TYPE_S390_CPU_INITIAL); + cpu_synchronize_post_reset(cs); + si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; + } + + static void sigp_cpu_reset(CPUState *cs, run_on_cpu_data arg) + { +- S390CPU *cpu = S390_CPU(cs); +- S390CPUClass *scc = S390_CPU_GET_CLASS(cpu); + SigpInfo *si = arg.host_ptr; + + cpu_synchronize_state(cs); +- scc->reset(cs, S390_CPU_RESET_NORMAL); ++ resettable_reset(OBJECT(cs), RESET_TYPE_S390_CPU_NORMAL); + cpu_synchronize_post_reset(cs); + si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; + } +-- +2.48.1 + diff --git a/SOURCES/kvm-tests-Add-iotest-mirror-sparse-for-recent-patches.patch b/SOURCES/kvm-tests-Add-iotest-mirror-sparse-for-recent-patches.patch new file mode 100644 index 0000000..2becf51 --- /dev/null +++ b/SOURCES/kvm-tests-Add-iotest-mirror-sparse-for-recent-patches.patch @@ -0,0 +1,545 @@ +From e72aaba2efda48e083d92e6dacfe58667bdfa958 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Fri, 9 May 2025 15:40:30 -0500 +Subject: [PATCH 15/16] tests: Add iotest mirror-sparse for recent patches + +RH-Author: Eric Blake +RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors +RH-Jira: RHEL-82906 RHEL-83015 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Jon Maloy +RH-Commit: [13/14] 6b7792e85b81e45d11c2664349db75d905e72adf (ebblake/centos-qemu-kvm) + +Prove that blockdev-mirror can now result in sparse raw destination +files, regardless of whether the source is raw or qcow2. By making +this a separate test, it was possible to test effects of individual +patches for the various pieces that all have to work together for a +sparse mirror to be successful. 
+ +Note that ./check -file produces different job lengths than ./check +-qcow2 (the test uses a filter to normalize); that's because when +deciding how much of the image to be mirrored, the code looks at how +much of the source image was allocated (for qcow2, this is only the +written clusters; for raw, it is the entire file). But the important +part is that the destination file ends up smaller than 3M, rather than +the 20M it used to be before this patch series. + +Signed-off-by: Eric Blake +Message-ID: <20250509204341.3553601-28-eblake@redhat.com> +Reviewed-by: Stefan Hajnoczi +(cherry picked from commit c0ddcb2cbc146e64f666eaae4edc7b5db7e5814d) +Jira: https://issues.redhat.com/browse/RHEL-82906 +Jira: https://issues.redhat.com/browse/RHEL-83015 +Signed-off-by: Eric Blake +--- + tests/qemu-iotests/tests/mirror-sparse | 125 +++++++ + tests/qemu-iotests/tests/mirror-sparse.out | 365 +++++++++++++++++++++ + 2 files changed, 490 insertions(+) + create mode 100755 tests/qemu-iotests/tests/mirror-sparse + create mode 100644 tests/qemu-iotests/tests/mirror-sparse.out + +diff --git a/tests/qemu-iotests/tests/mirror-sparse b/tests/qemu-iotests/tests/mirror-sparse +new file mode 100755 +index 0000000000..8c52a4e244 +--- /dev/null ++++ b/tests/qemu-iotests/tests/mirror-sparse +@@ -0,0 +1,125 @@ ++#!/usr/bin/env bash ++# group: rw auto quick ++# ++# Test blockdev-mirror with raw sparse destination ++# ++# Copyright (C) 2025 Red Hat, Inc. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. 
++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++# ++ ++seq="$(basename $0)" ++echo "QA output created by $seq" ++ ++status=1 # failure is the default! ++ ++_cleanup() ++{ ++ _cleanup_test_img ++ _cleanup_qemu ++} ++trap "_cleanup; exit \$status" 0 1 2 3 15 ++ ++# get standard environment, filters and checks ++cd .. ++. ./common.rc ++. ./common.filter ++. ./common.qemu ++ ++_supported_fmt qcow2 raw # Format of the source. dst is always raw file ++_supported_proto file ++_supported_os Linux ++ ++echo ++echo "=== Initial image setup ===" ++echo ++ ++TEST_IMG="$TEST_IMG.base" _make_test_img 20M ++$QEMU_IO -c 'w 8M 2M' -f $IMGFMT "$TEST_IMG.base" | _filter_qemu_io ++ ++_launch_qemu \ ++ -blockdev '{"driver":"file", "cache":{"direct":true, "no-flush":false}, ++ "filename":"'"$TEST_IMG.base"'", "node-name":"src-file"}' \ ++ -blockdev '{"driver":"'$IMGFMT'", "node-name":"src", "file":"src-file"}' ++h1=$QEMU_HANDLE ++_send_qemu_cmd $h1 '{"execute": "qmp_capabilities"}' 'return' ++ ++# Check several combinations; most should result in a sparse destination; ++# the destination should only be fully allocated if pre-allocated ++# and not punching holes due to detect-zeroes ++# do_test creation discard zeroes result ++do_test() { ++ creation=$1 ++ discard=$2 ++ zeroes=$3 ++ expected=$4 ++ ++echo ++echo "=== Testing creation=$creation discard=$discard zeroes=$zeroes ===" ++echo ++ ++rm -f $TEST_IMG ++if test $creation = external; then ++ truncate --size=20M $TEST_IMG ++else ++ _send_qemu_cmd $h1 '{"execute": "blockdev-create", "arguments": ++ {"options": {"driver":"file", "filename":"'$TEST_IMG'", ++ "size":'$((20*1024*1024))', "preallocation":"'$creation'"}, ++ "job-id":"job1"}}' 'concluded' ++ _send_qemu_cmd $h1 '{"execute": "job-dismiss", "arguments": ++ {"id": "job1"}}' 'return' ++fi ++_send_qemu_cmd $h1 '{"execute": "blockdev-add", "arguments": ++ {"node-name": "dst", "driver":"file", ++ 
"filename":"'$TEST_IMG'", "aio":"threads", ++ "auto-read-only":true, "discard":"'$discard'", ++ "detect-zeroes":"'$zeroes'"}}' 'return' ++_send_qemu_cmd $h1 '{"execute":"blockdev-mirror", "arguments": ++ {"sync":"full", "device":"src", "target":"dst", ++ "job-id":"job2"}}' 'return' ++_timed_wait_for $h1 '"ready"' ++_send_qemu_cmd $h1 '{"execute": "job-complete", "arguments": ++ {"id":"job2"}}' 'return' \ ++ | _filter_block_job_offset | _filter_block_job_len ++_send_qemu_cmd $h1 '{"execute": "blockdev-del", "arguments": ++ {"node-name": "dst"}}' 'return' \ ++ | _filter_block_job_offset | _filter_block_job_len ++$QEMU_IMG compare -U -f $IMGFMT -F raw $TEST_IMG.base $TEST_IMG ++result=$(disk_usage $TEST_IMG) ++if test $result -lt $((3*1024*1024)); then ++ actual=sparse ++elif test $result = $((20*1024*1024)); then ++ actual=full ++else ++ actual=unknown ++fi ++echo "Destination is $actual; expected $expected" ++} ++ ++do_test external ignore off sparse ++do_test external unmap off sparse ++do_test external unmap unmap sparse ++do_test off ignore off sparse ++do_test off unmap off sparse ++do_test off unmap unmap sparse ++do_test full ignore off full ++do_test full unmap off sparse ++do_test full unmap unmap sparse ++ ++_send_qemu_cmd $h1 '{"execute":"quit"}' '' ++ ++# success, all done ++echo '*** done' ++rm -f $seq.full ++status=0 +diff --git a/tests/qemu-iotests/tests/mirror-sparse.out b/tests/qemu-iotests/tests/mirror-sparse.out +new file mode 100644 +index 0000000000..2103b891c3 +--- /dev/null ++++ b/tests/qemu-iotests/tests/mirror-sparse.out +@@ -0,0 +1,365 @@ ++QA output created by mirror-sparse ++ ++=== Initial image setup === ++ ++Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=20971520 ++wrote 2097152/2097152 bytes at offset 8388608 ++2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++{"execute": "qmp_capabilities"} ++{"return": {}} ++ ++=== Testing creation=external discard=ignore zeroes=off === ++ ++{"execute": "blockdev-add", "arguments": ++ 
{"node-name": "dst", "driver":"file", ++ "filename":"TEST_DIR/t.IMGFMT", "aio":"threads", ++ "auto-read-only":true, "discard":"ignore", ++ "detect-zeroes":"off"}} ++{"return": {}} ++{"execute":"blockdev-mirror", "arguments": ++ {"sync":"full", "device":"src", "target":"dst", ++ "job-id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job2"}} ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "job2"}} ++{"execute": "job-complete", "arguments": ++ {"id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"return": {}} ++{"execute": "blockdev-del", "arguments": ++ {"node-name": "dst"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job2"}} ++{"return": {}} ++Images are identical. 
++Destination is sparse; expected sparse ++ ++=== Testing creation=external discard=unmap zeroes=off === ++ ++{"execute": "blockdev-add", "arguments": ++ {"node-name": "dst", "driver":"file", ++ "filename":"TEST_DIR/t.IMGFMT", "aio":"threads", ++ "auto-read-only":true, "discard":"unmap", ++ "detect-zeroes":"off"}} ++{"return": {}} ++{"execute":"blockdev-mirror", "arguments": ++ {"sync":"full", "device":"src", "target":"dst", ++ "job-id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job2"}} ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "job2"}} ++{"execute": "job-complete", "arguments": ++ {"id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"return": {}} ++{"execute": "blockdev-del", "arguments": ++ {"node-name": "dst"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job2"}} ++{"return": {}} ++Images 
are identical. ++Destination is sparse; expected sparse ++ ++=== Testing creation=external discard=unmap zeroes=unmap === ++ ++{"execute": "blockdev-add", "arguments": ++ {"node-name": "dst", "driver":"file", ++ "filename":"TEST_DIR/t.IMGFMT", "aio":"threads", ++ "auto-read-only":true, "discard":"unmap", ++ "detect-zeroes":"unmap"}} ++{"return": {}} ++{"execute":"blockdev-mirror", "arguments": ++ {"sync":"full", "device":"src", "target":"dst", ++ "job-id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job2"}} ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "job2"}} ++{"execute": "job-complete", "arguments": ++ {"id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"return": {}} ++{"execute": "blockdev-del", "arguments": ++ {"node-name": "dst"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job2"}} 
++{"return": {}} ++Images are identical. ++Destination is sparse; expected sparse ++ ++=== Testing creation=off discard=ignore zeroes=off === ++ ++{"execute": "blockdev-create", "arguments": ++ {"options": {"driver":"file", "filename":"TEST_DIR/t.IMGFMT", ++ "size":20971520, "preallocation":"off"}, ++ "job-id":"job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job1"}} ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job1"}} ++{"execute": "job-dismiss", "arguments": ++ {"id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job1"}} ++{"return": {}} ++{"execute": "blockdev-add", "arguments": ++ {"node-name": "dst", "driver":"file", ++ "filename":"TEST_DIR/t.IMGFMT", "aio":"threads", ++ "auto-read-only":true, "discard":"ignore", ++ "detect-zeroes":"off"}} ++{"return": {}} ++{"execute":"blockdev-mirror", "arguments": ++ {"sync":"full", "device":"src", "target":"dst", ++ "job-id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job2"}} ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "job2"}} ++{"execute": "job-complete", "arguments": ++ {"id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"return": {}} ++{"execute": "blockdev-del", "arguments": ++ {"node-name": "dst"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job2"}} ++{"return": {}} ++Images are identical. 
++Destination is sparse; expected sparse ++ ++=== Testing creation=off discard=unmap zeroes=off === ++ ++{"execute": "blockdev-create", "arguments": ++ {"options": {"driver":"file", "filename":"TEST_DIR/t.IMGFMT", ++ "size":20971520, "preallocation":"off"}, ++ "job-id":"job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job1"}} ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job1"}} ++{"execute": "job-dismiss", "arguments": ++ {"id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job1"}} ++{"return": {}} ++{"execute": "blockdev-add", "arguments": ++ {"node-name": "dst", "driver":"file", ++ "filename":"TEST_DIR/t.IMGFMT", "aio":"threads", ++ "auto-read-only":true, "discard":"unmap", ++ "detect-zeroes":"off"}} ++{"return": {}} ++{"execute":"blockdev-mirror", "arguments": ++ {"sync":"full", "device":"src", "target":"dst", ++ "job-id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job2"}} ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": 
"job2"}} ++{"execute": "job-complete", "arguments": ++ {"id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"return": {}} ++{"execute": "blockdev-del", "arguments": ++ {"node-name": "dst"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job2"}} ++{"return": {}} ++Images are identical. 
++Destination is sparse; expected sparse ++ ++=== Testing creation=off discard=unmap zeroes=unmap === ++ ++{"execute": "blockdev-create", "arguments": ++ {"options": {"driver":"file", "filename":"TEST_DIR/t.IMGFMT", ++ "size":20971520, "preallocation":"off"}, ++ "job-id":"job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job1"}} ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job1"}} ++{"execute": "job-dismiss", "arguments": ++ {"id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job1"}} ++{"return": {}} ++{"execute": "blockdev-add", "arguments": ++ {"node-name": "dst", "driver":"file", ++ "filename":"TEST_DIR/t.IMGFMT", "aio":"threads", ++ "auto-read-only":true, "discard":"unmap", ++ "detect-zeroes":"unmap"}} ++{"return": {}} ++{"execute":"blockdev-mirror", "arguments": ++ {"sync":"full", "device":"src", "target":"dst", ++ "job-id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job2"}} ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", 
"id": "job2"}} ++{"execute": "job-complete", "arguments": ++ {"id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"return": {}} ++{"execute": "blockdev-del", "arguments": ++ {"node-name": "dst"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job2"}} ++{"return": {}} ++Images are identical. 
++Destination is sparse; expected sparse ++ ++=== Testing creation=full discard=ignore zeroes=off === ++ ++{"execute": "blockdev-create", "arguments": ++ {"options": {"driver":"file", "filename":"TEST_DIR/t.IMGFMT", ++ "size":20971520, "preallocation":"full"}, ++ "job-id":"job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job1"}} ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job1"}} ++{"execute": "job-dismiss", "arguments": ++ {"id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job1"}} ++{"return": {}} ++{"execute": "blockdev-add", "arguments": ++ {"node-name": "dst", "driver":"file", ++ "filename":"TEST_DIR/t.IMGFMT", "aio":"threads", ++ "auto-read-only":true, "discard":"ignore", ++ "detect-zeroes":"off"}} ++{"return": {}} ++{"execute":"blockdev-mirror", "arguments": ++ {"sync":"full", "device":"src", "target":"dst", ++ "job-id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job2"}} ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", 
"id": "job2"}} ++{"execute": "job-complete", "arguments": ++ {"id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"return": {}} ++{"execute": "blockdev-del", "arguments": ++ {"node-name": "dst"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job2"}} ++{"return": {}} ++Images are identical. 
++Destination is full; expected full ++ ++=== Testing creation=full discard=unmap zeroes=off === ++ ++{"execute": "blockdev-create", "arguments": ++ {"options": {"driver":"file", "filename":"TEST_DIR/t.IMGFMT", ++ "size":20971520, "preallocation":"full"}, ++ "job-id":"job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job1"}} ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job1"}} ++{"execute": "job-dismiss", "arguments": ++ {"id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job1"}} ++{"return": {}} ++{"execute": "blockdev-add", "arguments": ++ {"node-name": "dst", "driver":"file", ++ "filename":"TEST_DIR/t.IMGFMT", "aio":"threads", ++ "auto-read-only":true, "discard":"unmap", ++ "detect-zeroes":"off"}} ++{"return": {}} ++{"execute":"blockdev-mirror", "arguments": ++ {"sync":"full", "device":"src", "target":"dst", ++ "job-id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job2"}} ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": 
"job2"}} ++{"execute": "job-complete", "arguments": ++ {"id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"return": {}} ++{"execute": "blockdev-del", "arguments": ++ {"node-name": "dst"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job2"}} ++{"return": {}} ++Images are identical. 
++Destination is sparse; expected sparse ++ ++=== Testing creation=full discard=unmap zeroes=unmap === ++ ++{"execute": "blockdev-create", "arguments": ++ {"options": {"driver":"file", "filename":"TEST_DIR/t.IMGFMT", ++ "size":20971520, "preallocation":"full"}, ++ "job-id":"job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job1"}} ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job1"}} ++{"execute": "job-dismiss", "arguments": ++ {"id": "job1"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job1"}} ++{"return": {}} ++{"execute": "blockdev-add", "arguments": ++ {"node-name": "dst", "driver":"file", ++ "filename":"TEST_DIR/t.IMGFMT", "aio":"threads", ++ "auto-read-only":true, "discard":"unmap", ++ "detect-zeroes":"unmap"}} ++{"return": {}} ++{"execute":"blockdev-mirror", "arguments": ++ {"sync":"full", "device":"src", "target":"dst", ++ "job-id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job2"}} ++{"return": {}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", 
"id": "job2"}} ++{"execute": "job-complete", "arguments": ++ {"id":"job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"return": {}} ++{"execute": "blockdev-del", "arguments": ++ {"node-name": "dst"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "job2", "len": LEN, "offset": OFFSET, "speed": 0, "type": "mirror"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job2"}} ++{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job2"}} ++{"return": {}} ++Images are identical. 
++Destination is sparse; expected sparse ++{"execute":"quit"} ++*** done +-- +2.48.1 + diff --git a/SOURCES/kvm-tests-unit-test-util-sockets-fix-mem-leak-on-error-o.patch b/SOURCES/kvm-tests-unit-test-util-sockets-fix-mem-leak-on-error-o.patch new file mode 100644 index 0000000..d0787f4 --- /dev/null +++ b/SOURCES/kvm-tests-unit-test-util-sockets-fix-mem-leak-on-error-o.patch @@ -0,0 +1,53 @@ +From 83f09a8c65e1fef416e39d9b0a4ead14ed00601e Mon Sep 17 00:00:00 2001 +From: Matheus Tavares Bernardino +Date: Mon, 26 May 2025 10:20:55 -0700 +Subject: [PATCH 14/57] tests/unit/test-util-sockets: fix mem-leak on error + object + +RH-Author: Juraj Marcin +RH-MergeRequest: 369: util/qemu-sockets: Introduce inet socket options controlling TCP keep-alive +RH-Jira: RHEL-67104 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [7/7] 31c74f784a26812e5c5898efacda9b4069874ed7 (JurajMarcin/centos-src-qemu-kvm) + +The test fails with --enable-asan as the error struct is never freed. +In the case where the test expects a success but it fails, let's also +report the error for debugging (it will be freed internally). 
+ +Fixes 316e8ee8d6 ("util/qemu-sockets: Refactor inet_parse() to use QemuOpts") + +Signed-off-by: Matheus Tavares Bernardino +Reviewed-by: Juraj Marcin +Message-ID: <518d94c7db20060b2a086cf55ee9bffab992a907.1748280011.git.matheus.bernardino@oss.qualcomm.com> +Signed-off-by: Thomas Huth + +(cherry picked from commit 5c54a367265ec19ed94a535cd15d178c16b8cae0) + +JIRA: https://issues.redhat.com/browse/RHEL-67104 + +Signed-off-by: Juraj Marcin +--- + tests/unit/test-util-sockets.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/tests/unit/test-util-sockets.c b/tests/unit/test-util-sockets.c +index 8492f4d68f..ee66d727c3 100644 +--- a/tests/unit/test-util-sockets.c ++++ b/tests/unit/test-util-sockets.c +@@ -341,8 +341,12 @@ static void inet_parse_test_helper(const char *str, + int rc = inet_parse(&addr, str, &error); + + if (success) { ++ if (error) { ++ error_report_err(error); ++ } + g_assert_cmpint(rc, ==, 0); + } else { ++ error_free(error); + g_assert_cmpint(rc, <, 0); + } + if (exp_addr != NULL) { +-- +2.39.3 + diff --git a/SOURCES/kvm-ui-vnc-Update-display-update-interval-when-VM-state-.patch b/SOURCES/kvm-ui-vnc-Update-display-update-interval-when-VM-state-.patch new file mode 100644 index 0000000..9d30908 --- /dev/null +++ b/SOURCES/kvm-ui-vnc-Update-display-update-interval-when-VM-state-.patch @@ -0,0 +1,97 @@ +From a69b8d66fb515cd55cef2fcaa626c350d761f1a9 Mon Sep 17 00:00:00 2001 +From: Juraj Marcin +Date: Wed, 21 May 2025 17:16:13 +0200 +Subject: [PATCH 57/57] ui/vnc: Update display update interval when VM state + changes to RUNNING +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Juraj Marcin +RH-MergeRequest: 385: ui/vnc: Update display update interval when VM state changes to RUNNING +RH-Jira: RHEL-100741 +RH-Acked-by: Peter Xu +RH-Acked-by: Marc-André Lureau +RH-Commit: [1/1] 30dd4790a607d646465c18d621073df997e8850b (JurajMarcin/centos-src-qemu-kvm) + +If a virtual machine is paused for 
an extended period of time, for example, +due to an incoming migration, there are also no changes on the screen. +VNC in such case increases the display update interval by +VNC_REFRESH_INTERVAL_INC (50 ms). The update interval can then grow up +to VNC_REFRESH_INTERVAL_MAX (3000 ms). + +When the machine resumes, it can then take up to 3 seconds for the first +display update. Furthermore, the update interval is then halved with +each display update with changes on the screen. If there are moving +elements on the screen, such as a video, this can be perceived as +freezing and stuttering for a few seconds before the movement is smooth +again. + +This patch resolves this issue, by adding a listener to VM state changes +and changing the update interval when the VM state changes to RUNNING. +The update_displaychangelistener() function updates the internal timer, +and the display is refreshed immediately if the timer is expired. + +Signed-off-by: Juraj Marcin +Reviewed-by: Marc-André Lureau +Reviewed-by: Peter Xu +Reviewed-by: Daniel P. 
Berrangé +Link: https://lore.kernel.org/r/20250521151616.3951178-1-jmarcin@redhat.com +Signed-off-by: Peter Xu + +(cherry picked from commit 0310d594d98b39f9dde79b87fd8b0ad16e7c5459) + +JIRA: https://issues.redhat.com/browse/RHEL-100741 + +Signed-off-by: Juraj Marcin +--- + ui/vnc.c | 12 ++++++++++++ + ui/vnc.h | 2 ++ + 2 files changed, 14 insertions(+) + +diff --git a/ui/vnc.c b/ui/vnc.c +index 5057ec8680..4afc925a18 100644 +--- a/ui/vnc.c ++++ b/ui/vnc.c +@@ -3386,6 +3386,16 @@ static const DisplayChangeListenerOps dcl_ops = { + .dpy_cursor_define = vnc_dpy_cursor_define, + }; + ++static void vmstate_change_handler(void *opaque, bool running, RunState state) ++{ ++ VncDisplay *vd = opaque; ++ ++ if (state != RUN_STATE_RUNNING) { ++ return; ++ } ++ update_displaychangelistener(&vd->dcl, VNC_REFRESH_INTERVAL_BASE); ++} ++ + void vnc_display_init(const char *id, Error **errp) + { + VncDisplay *vd; +@@ -3422,6 +3432,8 @@ void vnc_display_init(const char *id, Error **errp) + vd->dcl.ops = &dcl_ops; + register_displaychangelistener(&vd->dcl); + vd->kbd = qkbd_state_init(vd->dcl.con); ++ vd->vmstate_handler_entry = qemu_add_vm_change_state_handler( ++ &vmstate_change_handler, vd); + } + + +diff --git a/ui/vnc.h b/ui/vnc.h +index e5fa2efa3e..e9da707dbc 100644 +--- a/ui/vnc.h ++++ b/ui/vnc.h +@@ -186,6 +186,8 @@ struct VncDisplay + #endif + + AudioState *audio_state; ++ ++ VMChangeStateEntry *vmstate_handler_entry; + }; + + typedef struct VncTight { +-- +2.39.3 + diff --git a/SOURCES/kvm-util-qemu-sockets-Add-support-for-keep-alive-flag-to.patch b/SOURCES/kvm-util-qemu-sockets-Add-support-for-keep-alive-flag-to.patch new file mode 100644 index 0000000..9c53a87 --- /dev/null +++ b/SOURCES/kvm-util-qemu-sockets-Add-support-for-keep-alive-flag-to.patch @@ -0,0 +1,86 @@ +From 644b9e34d8c764598e663eb983e2d6eca4ed2510 Mon Sep 17 00:00:00 2001 +From: Juraj Marcin +Date: Wed, 21 May 2025 15:52:33 +0200 +Subject: [PATCH 11/57] util/qemu-sockets: Add support for keep-alive flag to 
+ passive sockets +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Juraj Marcin +RH-MergeRequest: 369: util/qemu-sockets: Introduce inet socket options controlling TCP keep-alive +RH-Jira: RHEL-67104 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [4/7] a8ec1996262b2bb657b8fe2e72c9045faee0a64c (JurajMarcin/centos-src-qemu-kvm) + +Commit aec21d3175 (qapi: Add InetSocketAddress member keep-alive) +introduces the keep-alive flag, which enables the SO_KEEPALIVE socket +option, but only on client-side sockets. However, this option is also +useful for server-side sockets, so they can check if a client is still +reachable or drop the connection otherwise. + +This patch enables the SO_KEEPALIVE socket option on passive server-side +sockets if the keep-alive flag is enabled. This socket option is then +inherited by active server-side sockets communicating with connected +clients. + +Signed-off-by: Juraj Marcin +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Daniel P. Berrangé + +(cherry picked from commit 00064705ed1f3943d3634be25da434466c87e7d5) + +JIRA: https://issues.redhat.com/browse/RHEL-67104 + +Signed-off-by: Juraj Marcin +--- + qapi/sockets.json | 4 ++-- + util/qemu-sockets.c | 9 +++------ + 2 files changed, 5 insertions(+), 8 deletions(-) + +diff --git a/qapi/sockets.json b/qapi/sockets.json +index 6a95023315..62797cd027 100644 +--- a/qapi/sockets.json ++++ b/qapi/sockets.json +@@ -56,8 +56,8 @@ + # @ipv6: whether to accept IPv6 addresses, default try both IPv4 and + # IPv6 + # +-# @keep-alive: enable keep-alive when connecting to this socket. Not +-# supported for passive sockets. (Since 4.2) ++# @keep-alive: enable keep-alive when connecting to/listening on this socket. ++# (Since 4.2, not supported for listening sockets until 10.1) + # + # @mptcp: enable multi-path TCP. 
(Since 6.1) + # +diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c +index 631d0c4023..8fc1f86145 100644 +--- a/util/qemu-sockets.c ++++ b/util/qemu-sockets.c +@@ -236,12 +236,6 @@ static int inet_listen_saddr(InetSocketAddress *saddr, + int saved_errno = 0; + bool socket_created = false; + +- if (saddr->keep_alive) { +- error_setg(errp, "keep-alive option is not supported for passive " +- "sockets"); +- return -1; +- } +- + memset(&ai,0, sizeof(ai)); + ai.ai_flags = AI_PASSIVE; + if (saddr->has_numeric && saddr->numeric) { +@@ -349,6 +343,9 @@ static int inet_listen_saddr(InetSocketAddress *saddr, + goto fail; + } + /* We have a listening socket */ ++ if (inet_set_sockopts(slisten, saddr, errp) < 0) { ++ goto fail; ++ } + freeaddrinfo(res); + return slisten; + } +-- +2.39.3 + diff --git a/SOURCES/kvm-util-qemu-sockets-Introduce-inet-socket-options-cont.patch b/SOURCES/kvm-util-qemu-sockets-Introduce-inet-socket-options-cont.patch new file mode 100644 index 0000000..8839085 --- /dev/null +++ b/SOURCES/kvm-util-qemu-sockets-Introduce-inet-socket-options-cont.patch @@ -0,0 +1,314 @@ +From 3e9458cd71f909474c1dd051f43fd3fbef8d53fd Mon Sep 17 00:00:00 2001 +From: Juraj Marcin +Date: Wed, 21 May 2025 15:52:35 +0200 +Subject: [PATCH 13/57] util/qemu-sockets: Introduce inet socket options + controlling TCP keep-alive +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Juraj Marcin +RH-MergeRequest: 369: util/qemu-sockets: Introduce inet socket options controlling TCP keep-alive +RH-Jira: RHEL-67104 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [6/7] 3861e7874d5952c53a5020c123b3b2e632149008 (JurajMarcin/centos-src-qemu-kvm) + +With the default TCP stack configuration, it could be even 2 hours +before the connection times out due to the other side not being +reachable. However, in some cases, the application needs to be aware of +a connection issue much sooner. 
+ +This is the case, for example, for postcopy live migration. If there is +no traffic from the migration destination guest (server-side) to the +migration source guest (client-side), the destination keeps waiting for +pages indefinitely and does not switch to the postcopy-paused state. +This can happen, for example, if the destination QEMU instance is +started with the '-S' command line option and the machine is not started +yet, or if the machine is idle and produces no new page faults for +not-yet-migrated pages. + +This patch introduces new inet socket parameters that control count, +idle period, and interval of TCP keep-alive packets before the +connection is considered broken. These parameters are available on +systems where the respective TCP socket options are defined, that +includes Linux, Windows, macOS, but not OpenBSD. Additionally, macOS +defines TCP_KEEPIDLE as TCP_KEEPALIVE instead, so the patch supplies its +own definition. + +The default value for all is 0, which means the system configuration is +used. + +Signed-off-by: Juraj Marcin +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Daniel P. 
Berrangé + +(cherry picked from commit 1bd4237cb1095d71c16afad3ce93b4a1e453173e) + +JIRA: https://issues.redhat.com/browse/RHEL-67104 + +Signed-off-by: Juraj Marcin +--- + meson.build | 30 +++++++++++++ + qapi/sockets.json | 19 ++++++++ + tests/unit/test-util-sockets.c | 39 +++++++++++++++++ + util/qemu-sockets.c | 80 ++++++++++++++++++++++++++++++++++ + 4 files changed, 168 insertions(+) + +diff --git a/meson.build b/meson.build +index 5bb2b757c3..c4539b66c5 100644 +--- a/meson.build ++++ b/meson.build +@@ -2581,6 +2581,36 @@ config_host_data.set('HAVE_OPTRESET', + cc.has_header_symbol('getopt.h', 'optreset')) + config_host_data.set('HAVE_IPPROTO_MPTCP', + cc.has_header_symbol('netinet/in.h', 'IPPROTO_MPTCP')) ++config_host_data.set('HAVE_TCP_KEEPCNT', ++ cc.has_header_symbol('netinet/tcp.h', 'TCP_KEEPCNT') or ++ cc.compiles(''' ++ #include ++ #ifndef TCP_KEEPCNT ++ #error ++ #endif ++ int main(void) { return 0; }''', ++ name: 'Win32 TCP_KEEPCNT')) ++# On Darwin TCP_KEEPIDLE is available under different name, TCP_KEEPALIVE. ++# https://github.com/apple/darwin-xnu/blob/xnu-4570.1.46/bsd/man/man4/tcp.4#L172 ++config_host_data.set('HAVE_TCP_KEEPIDLE', ++ cc.has_header_symbol('netinet/tcp.h', 'TCP_KEEPIDLE') or ++ cc.has_header_symbol('netinet/tcp.h', 'TCP_KEEPALIVE') or ++ cc.compiles(''' ++ #include ++ #ifndef TCP_KEEPIDLE ++ #error ++ #endif ++ int main(void) { return 0; }''', ++ name: 'Win32 TCP_KEEPIDLE')) ++config_host_data.set('HAVE_TCP_KEEPINTVL', ++ cc.has_header_symbol('netinet/tcp.h', 'TCP_KEEPINTVL') or ++ cc.compiles(''' ++ #include ++ #ifndef TCP_KEEPINTVL ++ #error ++ #endif ++ int main(void) { return 0; }''', ++ name: 'Win32 TCP_KEEPINTVL')) + + # has_member + config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID', +diff --git a/qapi/sockets.json b/qapi/sockets.json +index 62797cd027..f9f559daba 100644 +--- a/qapi/sockets.json ++++ b/qapi/sockets.json +@@ -59,6 +59,22 @@ + # @keep-alive: enable keep-alive when connecting to/listening on this socket. 
+ # (Since 4.2, not supported for listening sockets until 10.1) + # ++# @keep-alive-count: number of keep-alive packets sent before the connection is ++# closed. Only supported for TCP sockets on systems where TCP_KEEPCNT ++# socket option is defined (this includes Linux, Windows, macOS, FreeBSD, ++# but not OpenBSD). When set to 0, system setting is used. (Since 10.1) ++# ++# @keep-alive-idle: time in seconds the connection needs to be idle before ++# sending a keepalive packet. Only supported for TCP sockets on systems ++# where TCP_KEEPIDLE socket option is defined (this includes Linux, ++# Windows, macOS, FreeBSD, but not OpenBSD). When set to 0, system setting ++# is used. (Since 10.1) ++# ++# @keep-alive-interval: time in seconds between keep-alive packets. Only ++# supported for TCP sockets on systems where TCP_KEEPINTVL is defined (this ++# includes Linux, Windows, macOS, FreeBSD, but not OpenBSD). When set to ++# 0, system setting is used. (Since 10.1) ++# + # @mptcp: enable multi-path TCP. 
(Since 6.1) + # + # Since: 1.3 +@@ -71,6 +87,9 @@ + '*ipv4': 'bool', + '*ipv6': 'bool', + '*keep-alive': 'bool', ++ '*keep-alive-count': { 'type': 'uint32', 'if': 'HAVE_TCP_KEEPCNT' }, ++ '*keep-alive-idle': { 'type': 'uint32', 'if': 'HAVE_TCP_KEEPIDLE' }, ++ '*keep-alive-interval': { 'type': 'uint32', 'if': 'HAVE_TCP_KEEPINTVL' }, + '*mptcp': { 'type': 'bool', 'if': 'HAVE_IPPROTO_MPTCP' } } } + + ## +diff --git a/tests/unit/test-util-sockets.c b/tests/unit/test-util-sockets.c +index 9e39b92e7c..8492f4d68f 100644 +--- a/tests/unit/test-util-sockets.c ++++ b/tests/unit/test-util-sockets.c +@@ -359,6 +359,24 @@ static void inet_parse_test_helper(const char *str, + g_assert_cmpint(addr.ipv6, ==, exp_addr->ipv6); + g_assert_cmpint(addr.has_keep_alive, ==, exp_addr->has_keep_alive); + g_assert_cmpint(addr.keep_alive, ==, exp_addr->keep_alive); ++#ifdef HAVE_TCP_KEEPCNT ++ g_assert_cmpint(addr.has_keep_alive_count, ==, ++ exp_addr->has_keep_alive_count); ++ g_assert_cmpint(addr.keep_alive_count, ==, ++ exp_addr->keep_alive_count); ++#endif ++#ifdef HAVE_TCP_KEEPIDLE ++ g_assert_cmpint(addr.has_keep_alive_idle, ==, ++ exp_addr->has_keep_alive_idle); ++ g_assert_cmpint(addr.keep_alive_idle, ==, ++ exp_addr->keep_alive_idle); ++#endif ++#ifdef HAVE_TCP_KEEPINTVL ++ g_assert_cmpint(addr.has_keep_alive_interval, ==, ++ exp_addr->has_keep_alive_interval); ++ g_assert_cmpint(addr.keep_alive_interval, ==, ++ exp_addr->keep_alive_interval); ++#endif + #ifdef HAVE_IPPROTO_MPTCP + g_assert_cmpint(addr.has_mptcp, ==, exp_addr->has_mptcp); + g_assert_cmpint(addr.mptcp, ==, exp_addr->mptcp); +@@ -460,6 +478,18 @@ static void test_inet_parse_all_options_good(void) + .ipv6 = true, + .has_keep_alive = true, + .keep_alive = true, ++#ifdef HAVE_TCP_KEEPCNT ++ .has_keep_alive_count = true, ++ .keep_alive_count = 10, ++#endif ++#ifdef HAVE_TCP_KEEPIDLE ++ .has_keep_alive_idle = true, ++ .keep_alive_idle = 60, ++#endif ++#ifdef HAVE_TCP_KEEPINTVL ++ .has_keep_alive_interval = true, ++ 
.keep_alive_interval = 30, ++#endif + #ifdef HAVE_IPPROTO_MPTCP + .has_mptcp = true, + .mptcp = false, +@@ -467,6 +497,15 @@ static void test_inet_parse_all_options_good(void) + }; + inet_parse_test_helper( + "[::1]:5000,numeric=on,to=5006,ipv4=off,ipv6=on,keep-alive=on" ++#ifdef HAVE_TCP_KEEPCNT ++ ",keep-alive-count=10" ++#endif ++#ifdef HAVE_TCP_KEEPIDLE ++ ",keep-alive-idle=60" ++#endif ++#ifdef HAVE_TCP_KEEPINTVL ++ ",keep-alive-interval=30" ++#endif + #ifdef HAVE_IPPROTO_MPTCP + ",mptcp=off" + #endif +diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c +index 8017124c74..ef0a137bd2 100644 +--- a/util/qemu-sockets.c ++++ b/util/qemu-sockets.c +@@ -45,6 +45,14 @@ + # define AI_NUMERICSERV 0 + #endif + ++/* ++ * On macOS TCP_KEEPIDLE is available under a different name, TCP_KEEPALIVE. ++ * https://github.com/apple/darwin-xnu/blob/xnu-4570.1.46/bsd/man/man4/tcp.4#L172 ++ */ ++#if defined(TCP_KEEPALIVE) && !defined(TCP_KEEPIDLE) ++# define TCP_KEEPIDLE TCP_KEEPALIVE ++#endif ++ + + static int inet_getport(struct addrinfo *e) + { +@@ -218,6 +226,42 @@ static int inet_set_sockopts(int sock, InetSocketAddress *saddr, Error **errp) + "Unable to set keep-alive option on socket"); + return -1; + } ++#ifdef HAVE_TCP_KEEPCNT ++ if (saddr->has_keep_alive_count && saddr->keep_alive_count) { ++ int keep_count = saddr->keep_alive_count; ++ ret = setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, &keep_count, ++ sizeof(keep_count)); ++ if (ret < 0) { ++ error_setg_errno(errp, errno, ++ "Unable to set TCP keep-alive count option on socket"); ++ return -1; ++ } ++ } ++#endif ++#ifdef HAVE_TCP_KEEPIDLE ++ if (saddr->has_keep_alive_idle && saddr->keep_alive_idle) { ++ int keep_idle = saddr->keep_alive_idle; ++ ret = setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, &keep_idle, ++ sizeof(keep_idle)); ++ if (ret < 0) { ++ error_setg_errno(errp, errno, ++ "Unable to set TCP keep-alive idle option on socket"); ++ return -1; ++ } ++ } ++#endif ++#ifdef HAVE_TCP_KEEPINTVL ++ if 
(saddr->has_keep_alive_interval && saddr->keep_alive_interval) { ++ int keep_interval = saddr->keep_alive_interval; ++ ret = setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, &keep_interval, ++ sizeof(keep_interval)); ++ if (ret < 0) { ++ error_setg_errno(errp, errno, ++ "Unable to set TCP keep-alive interval option on socket"); ++ return -1; ++ } ++ } ++#endif + } + return 0; + } +@@ -631,6 +675,24 @@ static QemuOptsList inet_opts = { + .name = "keep-alive", + .type = QEMU_OPT_BOOL, + }, ++#ifdef HAVE_TCP_KEEPCNT ++ { ++ .name = "keep-alive-count", ++ .type = QEMU_OPT_NUMBER, ++ }, ++#endif ++#ifdef HAVE_TCP_KEEPIDLE ++ { ++ .name = "keep-alive-idle", ++ .type = QEMU_OPT_NUMBER, ++ }, ++#endif ++#ifdef HAVE_TCP_KEEPINTVL ++ { ++ .name = "keep-alive-interval", ++ .type = QEMU_OPT_NUMBER, ++ }, ++#endif + #ifdef HAVE_IPPROTO_MPTCP + { + .name = "mptcp", +@@ -696,6 +758,24 @@ int inet_parse(InetSocketAddress *addr, const char *str, Error **errp) + addr->has_keep_alive = true; + addr->keep_alive = qemu_opt_get_bool(opts, "keep-alive", false); + } ++#ifdef HAVE_TCP_KEEPCNT ++ if (qemu_opt_find(opts, "keep-alive-count")) { ++ addr->has_keep_alive_count = true; ++ addr->keep_alive_count = qemu_opt_get_number(opts, "keep-alive-count", 0); ++ } ++#endif ++#ifdef HAVE_TCP_KEEPIDLE ++ if (qemu_opt_find(opts, "keep-alive-idle")) { ++ addr->has_keep_alive_idle = true; ++ addr->keep_alive_idle = qemu_opt_get_number(opts, "keep-alive-idle", 0); ++ } ++#endif ++#ifdef HAVE_TCP_KEEPINTVL ++ if (qemu_opt_find(opts, "keep-alive-interval")) { ++ addr->has_keep_alive_interval = true; ++ addr->keep_alive_interval = qemu_opt_get_number(opts, "keep-alive-interval", 0); ++ } ++#endif + #ifdef HAVE_IPPROTO_MPTCP + if (qemu_opt_find(opts, "mptcp")) { + addr->has_mptcp = true; +-- +2.39.3 + diff --git a/SOURCES/kvm-util-qemu-sockets-Refactor-inet_parse-to-use-QemuOpt.patch b/SOURCES/kvm-util-qemu-sockets-Refactor-inet_parse-to-use-QemuOpt.patch new file mode 100644 index 0000000..e533fa2 --- 
/dev/null +++ b/SOURCES/kvm-util-qemu-sockets-Refactor-inet_parse-to-use-QemuOpt.patch @@ -0,0 +1,461 @@ +From d2155b6fe1f200a588dd5e95d7587e96109da989 Mon Sep 17 00:00:00 2001 +From: Juraj Marcin +Date: Wed, 21 May 2025 15:52:34 +0200 +Subject: [PATCH 12/57] util/qemu-sockets: Refactor inet_parse() to use + QemuOpts +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Juraj Marcin +RH-MergeRequest: 369: util/qemu-sockets: Introduce inet socket options controlling TCP keep-alive +RH-Jira: RHEL-67104 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [5/7] 7fd97ed112c6259928d94595c2ae23fe1208621e (JurajMarcin/centos-src-qemu-kvm) + +Currently, the inet address parser cannot handle multiple options where +one is prefixed with the name of the other. For example, with the +'keep-alive-idle' option added, the current parser cannot parse +'127.0.0.1:5000,keep-alive-idle=60,keep-alive' correctly. Instead, it +fails with "error parsing 'keep-alive' flag '-idle=60,keep-alive'". + +To resolve these issues, this patch rewrites the inet address parsing +using the QemuOpts parser, which the inet_parse_flag() function tries to +mimic. This new parser supports all previously supported options and on +top of that the 'numeric' flag is now also supported. The only +difference is, the new parser produces an error if an unknown option is +passed, instead of silently ignoring it. + +Signed-off-by: Juraj Marcin +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Daniel P. 
Berrangé + +(cherry picked from commit 316e8ee8d614f049bfae697570a5e62af450491c) + +JIRA: https://issues.redhat.com/browse/RHEL-67104 + +Signed-off-by: Juraj Marcin +--- + tests/unit/test-util-sockets.c | 196 +++++++++++++++++++++++++++++++++ + util/qemu-sockets.c | 158 +++++++++++++------------- + 2 files changed, 270 insertions(+), 84 deletions(-) + +diff --git a/tests/unit/test-util-sockets.c b/tests/unit/test-util-sockets.c +index 4c9dd0b271..9e39b92e7c 100644 +--- a/tests/unit/test-util-sockets.c ++++ b/tests/unit/test-util-sockets.c +@@ -332,6 +332,177 @@ static void test_socket_unix_abstract(void) + + #endif /* CONFIG_LINUX */ + ++static void inet_parse_test_helper(const char *str, ++ InetSocketAddress *exp_addr, bool success) ++{ ++ InetSocketAddress addr; ++ Error *error = NULL; ++ ++ int rc = inet_parse(&addr, str, &error); ++ ++ if (success) { ++ g_assert_cmpint(rc, ==, 0); ++ } else { ++ g_assert_cmpint(rc, <, 0); ++ } ++ if (exp_addr != NULL) { ++ g_assert_cmpstr(addr.host, ==, exp_addr->host); ++ g_assert_cmpstr(addr.port, ==, exp_addr->port); ++ /* Own members: */ ++ g_assert_cmpint(addr.has_numeric, ==, exp_addr->has_numeric); ++ g_assert_cmpint(addr.numeric, ==, exp_addr->numeric); ++ g_assert_cmpint(addr.has_to, ==, exp_addr->has_to); ++ g_assert_cmpint(addr.to, ==, exp_addr->to); ++ g_assert_cmpint(addr.has_ipv4, ==, exp_addr->has_ipv4); ++ g_assert_cmpint(addr.ipv4, ==, exp_addr->ipv4); ++ g_assert_cmpint(addr.has_ipv6, ==, exp_addr->has_ipv6); ++ g_assert_cmpint(addr.ipv6, ==, exp_addr->ipv6); ++ g_assert_cmpint(addr.has_keep_alive, ==, exp_addr->has_keep_alive); ++ g_assert_cmpint(addr.keep_alive, ==, exp_addr->keep_alive); ++#ifdef HAVE_IPPROTO_MPTCP ++ g_assert_cmpint(addr.has_mptcp, ==, exp_addr->has_mptcp); ++ g_assert_cmpint(addr.mptcp, ==, exp_addr->mptcp); ++#endif ++ } ++ ++ g_free(addr.host); ++ g_free(addr.port); ++} ++ ++static void test_inet_parse_nohost_good(void) ++{ ++ char host[] = ""; ++ char port[] = "5000"; ++ 
InetSocketAddress exp_addr = { ++ .host = host, ++ .port = port, ++ }; ++ inet_parse_test_helper(":5000", &exp_addr, true); ++} ++ ++static void test_inet_parse_empty_bad(void) ++{ ++ inet_parse_test_helper("", NULL, false); ++} ++ ++static void test_inet_parse_only_colon_bad(void) ++{ ++ inet_parse_test_helper(":", NULL, false); ++} ++ ++static void test_inet_parse_ipv4_good(void) ++{ ++ char host[] = "127.0.0.1"; ++ char port[] = "5000"; ++ InetSocketAddress exp_addr = { ++ .host = host, ++ .port = port, ++ }; ++ inet_parse_test_helper("127.0.0.1:5000", &exp_addr, true); ++} ++ ++static void test_inet_parse_ipv4_noport_bad(void) ++{ ++ inet_parse_test_helper("127.0.0.1", NULL, false); ++} ++ ++static void test_inet_parse_ipv6_good(void) ++{ ++ char host[] = "::1"; ++ char port[] = "5000"; ++ InetSocketAddress exp_addr = { ++ .host = host, ++ .port = port, ++ }; ++ inet_parse_test_helper("[::1]:5000", &exp_addr, true); ++} ++ ++static void test_inet_parse_ipv6_noend_bad(void) ++{ ++ inet_parse_test_helper("[::1", NULL, false); ++} ++ ++static void test_inet_parse_ipv6_noport_bad(void) ++{ ++ inet_parse_test_helper("[::1]:", NULL, false); ++} ++ ++static void test_inet_parse_ipv6_empty_bad(void) ++{ ++ inet_parse_test_helper("[]:5000", NULL, false); ++} ++ ++static void test_inet_parse_hostname_good(void) ++{ ++ char host[] = "localhost"; ++ char port[] = "5000"; ++ InetSocketAddress exp_addr = { ++ .host = host, ++ .port = port, ++ }; ++ inet_parse_test_helper("localhost:5000", &exp_addr, true); ++} ++ ++static void test_inet_parse_all_options_good(void) ++{ ++ char host[] = "::1"; ++ char port[] = "5000"; ++ InetSocketAddress exp_addr = { ++ .host = host, ++ .port = port, ++ .has_numeric = true, ++ .numeric = true, ++ .has_to = true, ++ .to = 5006, ++ .has_ipv4 = true, ++ .ipv4 = false, ++ .has_ipv6 = true, ++ .ipv6 = true, ++ .has_keep_alive = true, ++ .keep_alive = true, ++#ifdef HAVE_IPPROTO_MPTCP ++ .has_mptcp = true, ++ .mptcp = false, ++#endif ++ }; ++ 
inet_parse_test_helper( ++ "[::1]:5000,numeric=on,to=5006,ipv4=off,ipv6=on,keep-alive=on" ++#ifdef HAVE_IPPROTO_MPTCP ++ ",mptcp=off" ++#endif ++ , &exp_addr, true); ++} ++ ++static void test_inet_parse_all_implicit_bool_good(void) ++{ ++ char host[] = "::1"; ++ char port[] = "5000"; ++ InetSocketAddress exp_addr = { ++ .host = host, ++ .port = port, ++ .has_numeric = true, ++ .numeric = true, ++ .has_to = true, ++ .to = 5006, ++ .has_ipv4 = true, ++ .ipv4 = true, ++ .has_ipv6 = true, ++ .ipv6 = true, ++ .has_keep_alive = true, ++ .keep_alive = true, ++#ifdef HAVE_IPPROTO_MPTCP ++ .has_mptcp = true, ++ .mptcp = true, ++#endif ++ }; ++ inet_parse_test_helper( ++ "[::1]:5000,numeric,to=5006,ipv4,ipv6,keep-alive" ++#ifdef HAVE_IPPROTO_MPTCP ++ ",mptcp" ++#endif ++ , &exp_addr, true); ++} ++ + int main(int argc, char **argv) + { + bool has_ipv4, has_ipv6; +@@ -377,6 +548,31 @@ int main(int argc, char **argv) + test_socket_unix_abstract); + #endif + ++ g_test_add_func("/util/socket/inet-parse/nohost-good", ++ test_inet_parse_nohost_good); ++ g_test_add_func("/util/socket/inet-parse/empty-bad", ++ test_inet_parse_empty_bad); ++ g_test_add_func("/util/socket/inet-parse/only-colon-bad", ++ test_inet_parse_only_colon_bad); ++ g_test_add_func("/util/socket/inet-parse/ipv4-good", ++ test_inet_parse_ipv4_good); ++ g_test_add_func("/util/socket/inet-parse/ipv4-noport-bad", ++ test_inet_parse_ipv4_noport_bad); ++ g_test_add_func("/util/socket/inet-parse/ipv6-good", ++ test_inet_parse_ipv6_good); ++ g_test_add_func("/util/socket/inet-parse/ipv6-noend-bad", ++ test_inet_parse_ipv6_noend_bad); ++ g_test_add_func("/util/socket/inet-parse/ipv6-noport-bad", ++ test_inet_parse_ipv6_noport_bad); ++ g_test_add_func("/util/socket/inet-parse/ipv6-empty-bad", ++ test_inet_parse_ipv6_empty_bad); ++ g_test_add_func("/util/socket/inet-parse/hostname-good", ++ test_inet_parse_hostname_good); ++ g_test_add_func("/util/socket/inet-parse/all-options-good", ++ test_inet_parse_all_options_good); ++ 
g_test_add_func("/util/socket/inet-parse/all-bare-bool-good", ++ test_inet_parse_all_implicit_bool_good); ++ + end: + return g_test_run(); + } +diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c +index 8fc1f86145..8017124c74 100644 +--- a/util/qemu-sockets.c ++++ b/util/qemu-sockets.c +@@ -30,6 +30,7 @@ + #include "qapi/qobject-input-visitor.h" + #include "qapi/qobject-output-visitor.h" + #include "qemu/cutils.h" ++#include "qemu/option.h" + #include "trace.h" + + #ifndef AI_ADDRCONFIG +@@ -601,115 +602,104 @@ err: + return -1; + } + +-/* compatibility wrapper */ +-static int inet_parse_flag(const char *flagname, const char *optstr, bool *val, +- Error **errp) +-{ +- char *end; +- size_t len; +- +- end = strstr(optstr, ","); +- if (end) { +- if (end[1] == ',') { /* Reject 'ipv6=on,,foo' */ +- error_setg(errp, "error parsing '%s' flag '%s'", flagname, optstr); +- return -1; +- } +- len = end - optstr; +- } else { +- len = strlen(optstr); +- } +- if (len == 0 || (len == 3 && strncmp(optstr, "=on", len) == 0)) { +- *val = true; +- } else if (len == 4 && strncmp(optstr, "=off", len) == 0) { +- *val = false; +- } else { +- error_setg(errp, "error parsing '%s' flag '%s'", flagname, optstr); +- return -1; +- } +- return 0; +-} ++static QemuOptsList inet_opts = { ++ .name = "InetSocketAddress", ++ .head = QTAILQ_HEAD_INITIALIZER(inet_opts.head), ++ .implied_opt_name = "addr", ++ .desc = { ++ { ++ .name = "addr", ++ .type = QEMU_OPT_STRING, ++ }, ++ { ++ .name = "numeric", ++ .type = QEMU_OPT_BOOL, ++ }, ++ { ++ .name = "to", ++ .type = QEMU_OPT_NUMBER, ++ }, ++ { ++ .name = "ipv4", ++ .type = QEMU_OPT_BOOL, ++ }, ++ { ++ .name = "ipv6", ++ .type = QEMU_OPT_BOOL, ++ }, ++ { ++ .name = "keep-alive", ++ .type = QEMU_OPT_BOOL, ++ }, ++#ifdef HAVE_IPPROTO_MPTCP ++ { ++ .name = "mptcp", ++ .type = QEMU_OPT_BOOL, ++ }, ++#endif ++ { /* end of list */ } ++ }, ++}; + + int inet_parse(InetSocketAddress *addr, const char *str, Error **errp) + { +- const char *optstr, *h; +- char 
host[65]; +- char port[33]; +- int to; +- int pos; +- char *begin; +- ++ QemuOpts *opts = qemu_opts_parse(&inet_opts, str, true, errp); ++ if (!opts) { ++ return -1; ++ } + memset(addr, 0, sizeof(*addr)); + + /* parse address */ +- if (str[0] == ':') { +- /* no host given */ +- host[0] = '\0'; +- if (sscanf(str, ":%32[^,]%n", port, &pos) != 1) { +- error_setg(errp, "error parsing port in address '%s'", str); +- return -1; +- } +- } else if (str[0] == '[') { ++ const char *addr_str = qemu_opt_get(opts, "addr"); ++ if (!addr_str) { ++ error_setg(errp, "error parsing address ''"); ++ return -1; ++ } ++ if (str[0] == '[') { + /* IPv6 addr */ +- if (sscanf(str, "[%64[^]]]:%32[^,]%n", host, port, &pos) != 2) { +- error_setg(errp, "error parsing IPv6 address '%s'", str); ++ const char *ip_end = strstr(addr_str, "]:"); ++ if (!ip_end || ip_end - addr_str < 2 || strlen(ip_end) < 3) { ++ error_setg(errp, "error parsing IPv6 address '%s'", addr_str); + return -1; + } ++ addr->host = g_strndup(addr_str + 1, ip_end - addr_str - 1); ++ addr->port = g_strdup(ip_end + 2); + } else { +- /* hostname or IPv4 addr */ +- if (sscanf(str, "%64[^:]:%32[^,]%n", host, port, &pos) != 2) { +- error_setg(errp, "error parsing address '%s'", str); ++ /* no host, hostname or IPv4 addr */ ++ const char *port = strchr(addr_str, ':'); ++ if (!port || strlen(port) < 2) { ++ error_setg(errp, "error parsing address '%s'", addr_str); + return -1; + } ++ addr->host = g_strndup(addr_str, port - addr_str); ++ addr->port = g_strdup(port + 1); + } + +- addr->host = g_strdup(host); +- addr->port = g_strdup(port); +- + /* parse options */ +- optstr = str + pos; +- h = strstr(optstr, ",to="); +- if (h) { +- h += 4; +- if (sscanf(h, "%d%n", &to, &pos) != 1 || +- (h[pos] != '\0' && h[pos] != ',')) { +- error_setg(errp, "error parsing to= argument"); +- return -1; +- } ++ if (qemu_opt_find(opts, "numeric")) { ++ addr->has_numeric = true, ++ addr->numeric = qemu_opt_get_bool(opts, "numeric", false); ++ } ++ if 
(qemu_opt_find(opts, "to")) { + addr->has_to = true; +- addr->to = to; ++ addr->to = qemu_opt_get_number(opts, "to", 0); + } +- begin = strstr(optstr, ",ipv4"); +- if (begin) { +- if (inet_parse_flag("ipv4", begin + 5, &addr->ipv4, errp) < 0) { +- return -1; +- } ++ if (qemu_opt_find(opts, "ipv4")) { + addr->has_ipv4 = true; ++ addr->ipv4 = qemu_opt_get_bool(opts, "ipv4", false); + } +- begin = strstr(optstr, ",ipv6"); +- if (begin) { +- if (inet_parse_flag("ipv6", begin + 5, &addr->ipv6, errp) < 0) { +- return -1; +- } ++ if (qemu_opt_find(opts, "ipv6")) { + addr->has_ipv6 = true; ++ addr->ipv6 = qemu_opt_get_bool(opts, "ipv6", false); + } +- begin = strstr(optstr, ",keep-alive"); +- if (begin) { +- if (inet_parse_flag("keep-alive", begin + strlen(",keep-alive"), +- &addr->keep_alive, errp) < 0) +- { +- return -1; +- } ++ if (qemu_opt_find(opts, "keep-alive")) { + addr->has_keep_alive = true; ++ addr->keep_alive = qemu_opt_get_bool(opts, "keep-alive", false); + } + #ifdef HAVE_IPPROTO_MPTCP +- begin = strstr(optstr, ",mptcp"); +- if (begin) { +- if (inet_parse_flag("mptcp", begin + strlen(",mptcp"), +- &addr->mptcp, errp) < 0) +- { +- return -1; +- } ++ if (qemu_opt_find(opts, "mptcp")) { + addr->has_mptcp = true; ++ addr->mptcp = qemu_opt_get_bool(opts, "mptcp", 0); + } + #endif + return 0; +-- +2.39.3 + diff --git a/SOURCES/kvm-util-qemu-sockets-Refactor-setting-client-sockopts-i.patch b/SOURCES/kvm-util-qemu-sockets-Refactor-setting-client-sockopts-i.patch new file mode 100644 index 0000000..6b95ffb --- /dev/null +++ b/SOURCES/kvm-util-qemu-sockets-Refactor-setting-client-sockopts-i.patch @@ -0,0 +1,83 @@ +From cc7fbd3aabe3be1e2966472a151dc618be02ac4c Mon Sep 17 00:00:00 2001 +From: Juraj Marcin +Date: Wed, 21 May 2025 15:52:31 +0200 +Subject: [PATCH 09/57] util/qemu-sockets: Refactor setting client sockopts + into a separate function +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Juraj Marcin 
+RH-MergeRequest: 369: util/qemu-sockets: Introduce inet socket options controlling TCP keep-alive +RH-Jira: RHEL-67104 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [2/7] b9b258d9a21a31ead1dd4488f1f55f1728fa8b41 (JurajMarcin/centos-src-qemu-kvm) + +This is done in preparation for enabling the SO_KEEPALIVE support for +server sockets and adding settings for more TCP keep-alive socket +options. + +Signed-off-by: Juraj Marcin +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Daniel P. Berrangé + +(cherry picked from commit b8b5278aca78be4a1c2e7cbb11c6be176f63706d) + +JIRA: https://issues.redhat.com/browse/RHEL-67104 + +Signed-off-by: Juraj Marcin +--- + util/qemu-sockets.c | 29 +++++++++++++++++++---------- + 1 file changed, 19 insertions(+), 10 deletions(-) + +diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c +index 60c44b2b56..2c0e4883ce 100644 +--- a/util/qemu-sockets.c ++++ b/util/qemu-sockets.c +@@ -205,6 +205,22 @@ static int try_bind(int socket, InetSocketAddress *saddr, struct addrinfo *e) + #endif + } + ++static int inet_set_sockopts(int sock, InetSocketAddress *saddr, Error **errp) ++{ ++ if (saddr->keep_alive) { ++ int keep_alive = 1; ++ int ret = setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, ++ &keep_alive, sizeof(keep_alive)); ++ ++ if (ret < 0) { ++ error_setg_errno(errp, errno, ++ "Unable to set keep-alive option on socket"); ++ return -1; ++ } ++ } ++ return 0; ++} ++ + static int inet_listen_saddr(InetSocketAddress *saddr, + int port_offset, + int num, +@@ -476,16 +492,9 @@ int inet_connect_saddr(InetSocketAddress *saddr, Error **errp) + return sock; + } + +- if (saddr->keep_alive) { +- int val = 1; +- int ret = setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, +- &val, sizeof(val)); +- +- if (ret < 0) { +- error_setg_errno(errp, errno, "Unable to set KEEPALIVE"); +- close(sock); +- return -1; +- } ++ if (inet_set_sockopts(sock, saddr, errp) < 0) { ++ close(sock); ++ return -1; + } + + return sock; +-- +2.39.3 + diff --git 
a/SOURCES/kvm-util-qemu-sockets-Refactor-success-and-failure-paths.patch b/SOURCES/kvm-util-qemu-sockets-Refactor-success-and-failure-paths.patch new file mode 100644 index 0000000..de5c984 --- /dev/null +++ b/SOURCES/kvm-util-qemu-sockets-Refactor-success-and-failure-paths.patch @@ -0,0 +1,141 @@ +From 17af9176a1fff5af0d4c1bf1beb52228f7fc324d Mon Sep 17 00:00:00 2001 +From: Juraj Marcin +Date: Wed, 21 May 2025 15:52:32 +0200 +Subject: [PATCH 10/57] util/qemu-sockets: Refactor success and failure paths + in inet_listen_saddr() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Juraj Marcin +RH-MergeRequest: 369: util/qemu-sockets: Introduce inet socket options controlling TCP keep-alive +RH-Jira: RHEL-67104 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [3/7] 5d5f0e704cdf3454a3c402ea7b130aa42705eb9f (JurajMarcin/centos-src-qemu-kvm) + +To get a listening socket, we need to first create a socket, try binding +it to a certain port, and lastly start listening on it. Each of these +operations can fail due to various reasons, one of them being that the +requested address/port is already in use. In such a case, the function +tries the same process with a new port number. + +This patch refactors the port number loop, so the success path is no +longer buried inside the 'if' statements in the middle of the loop. Now, +the success path is not nested and ends at the end of the iteration +after successful socket creation, binding, and listening. In case any of +the operations fails, it either continues to the next iteration (and the +next port) or jumps out of the loop to handle the error and exits the +function. + +Signed-off-by: Juraj Marcin +Reviewed-by: Daniel P. Berrangé +Signed-off-by: Daniel P.
Berrangé + +(cherry picked from commit 911e0f2c6e2d00c985affa75ec188c8edcf480f2) + +JIRA: https://issues.redhat.com/browse/RHEL-67104 + +Signed-off-by: Juraj Marcin +--- + util/qemu-sockets.c | 51 ++++++++++++++++++++++++--------------------- + 1 file changed, 27 insertions(+), 24 deletions(-) + +diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c +index 2c0e4883ce..631d0c4023 100644 +--- a/util/qemu-sockets.c ++++ b/util/qemu-sockets.c +@@ -303,11 +303,20 @@ static int inet_listen_saddr(InetSocketAddress *saddr, + port_min = inet_getport(e); + port_max = saddr->has_to ? saddr->to + port_offset : port_min; + for (p = port_min; p <= port_max; p++) { ++ if (slisten >= 0) { ++ /* ++ * We have a socket we tried with the previous port. It cannot ++ * be rebound, we need to close it and create a new one. ++ */ ++ close(slisten); ++ slisten = -1; ++ } + inet_setport(e, p); + + slisten = create_fast_reuse_socket(e); + if (slisten < 0) { +- /* First time we expect we might fail to create the socket ++ /* ++ * First time we expect we might fail to create the socket + * eg if 'e' has AF_INET6 but ipv6 kmod is not loaded. + * Later iterations should always succeed if first iteration + * worked though, so treat that as fatal. 
+@@ -317,40 +326,38 @@ static int inet_listen_saddr(InetSocketAddress *saddr, + } else { + error_setg_errno(errp, errno, + "Failed to recreate failed listening socket"); +- goto listen_failed; ++ goto fail; + } + } + socket_created = true; + + rc = try_bind(slisten, saddr, e); + if (rc < 0) { +- if (errno != EADDRINUSE) { +- error_setg_errno(errp, errno, "Failed to bind socket"); +- goto listen_failed; +- } +- } else { +- if (!listen(slisten, num)) { +- goto listen_ok; ++ if (errno == EADDRINUSE) { ++ /* This port is already used, try the next one */ ++ continue; + } +- if (errno != EADDRINUSE) { +- error_setg_errno(errp, errno, "Failed to listen on socket"); +- goto listen_failed; ++ error_setg_errno(errp, errno, "Failed to bind socket"); ++ goto fail; ++ } ++ if (listen(slisten, num)) { ++ if (errno == EADDRINUSE) { ++ /* This port is already used, try the next one */ ++ continue; + } ++ error_setg_errno(errp, errno, "Failed to listen on socket"); ++ goto fail; + } +- /* Someone else managed to bind to the same port and beat us +- * to listen on it! Socket semantics does not allow us to +- * recover from this situation, so we need to recreate the +- * socket to allow bind attempts for subsequent ports: +- */ +- close(slisten); +- slisten = -1; ++ /* We have a listening socket */ ++ freeaddrinfo(res); ++ return slisten; + } + } + error_setg_errno(errp, errno, + socket_created ? 
+ "Failed to find an available port" : + "Failed to create a socket"); +-listen_failed: ++fail: + saved_errno = errno; + if (slisten >= 0) { + close(slisten); +@@ -358,10 +365,6 @@ listen_failed: + freeaddrinfo(res); + errno = saved_errno; + return -1; +- +-listen_ok: +- freeaddrinfo(res); +- return slisten; + } + + #ifdef _WIN32 +-- +2.39.3 + diff --git a/SOURCES/kvm-vfio-helpers-Align-mmaps.patch b/SOURCES/kvm-vfio-helpers-Align-mmaps.patch new file mode 100644 index 0000000..784e024 --- /dev/null +++ b/SOURCES/kvm-vfio-helpers-Align-mmaps.patch @@ -0,0 +1,100 @@ +From 0e733c43122688a40b0bad9cf9af43ac3655fa30 Mon Sep 17 00:00:00 2001 +From: Alex Williamson +Date: Tue, 22 Oct 2024 14:08:29 -0600 +Subject: [PATCH 5/5] vfio/helpers: Align mmaps +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Donald Dutile +RH-MergeRequest: 366: Improve VFIO mmapping performance with huge pfnmaps +RH-Jira: RHEL-88533 +RH-Acked-by: Cédric Le Goater +RH-Acked-by: Alex Williamson +RH-Commit: [2/2] f0e99cf993f82796352376bc7280342729ea5624 (ddutile/qemu-kvm) + +Thanks to work by Peter Xu, support is introduced in Linux v6.12 to +allow pfnmap insertions at PMD and PUD levels of the page table. This +means that provided a properly aligned mmap, the vfio driver is able +to map MMIO at significantly larger intervals than PAGE_SIZE. For +example on x86_64 (the only architecture currently supporting huge +pfnmaps for PUD), rather than 4KiB mappings, we can map device MMIO +using 2MiB and even 1GiB page table entries. + +Typically mmap will already provide PMD aligned mappings, so devices +with moderately sized MMIO ranges, even GPUs with standard 256MiB BARs, +will already take advantage of this support. However in order to better +support devices exposing multi-GiB MMIO, such as 3D accelerators or GPUs +with resizable BARs enabled, we need to manually align the mmap. 
+ +There doesn't seem to be a way for userspace to easily learn about PMD +and PUD mapping level sizes, therefore this takes the simple approach +to align the mapping to the power-of-two size of the region, up to 1GiB, +which is currently the maximum alignment we care about. + +Cc: Peter Xu +Signed-off-by: Alex Williamson +Reviewed-by: Peter Xu +Reviewed-by: Cédric Le Goater +(cherry picked from commit 00b519c0bca0e933ed22e2e6f8bca6b23f41f950) + +Jira: https://issues.redhat.com/browse/RHEL-88533 + +Signed-off-by: Donald Dutile +--- + hw/vfio/helpers.c | 32 ++++++++++++++++++++++++++++++-- + 1 file changed, 30 insertions(+), 2 deletions(-) + +diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c +index b9e606e364..913796f437 100644 +--- a/hw/vfio/helpers.c ++++ b/hw/vfio/helpers.c +@@ -27,6 +27,7 @@ + #include "trace.h" + #include "qapi/error.h" + #include "qemu/error-report.h" ++#include "qemu/units.h" + #include "monitor/monitor.h" + + /* +@@ -406,8 +407,35 @@ int vfio_region_mmap(VFIORegion *region) + prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0; + + for (i = 0; i < region->nr_mmaps; i++) { +- region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot, +- MAP_SHARED, region->vbasedev->fd, ++ size_t align = MIN(1ULL << ctz64(region->mmaps[i].size), 1 * GiB); ++ void *map_base, *map_align; ++ ++ /* ++ * Align the mmap for more efficient mapping in the kernel. Ideally ++ * we'd know the PMD and PUD mapping sizes to use as discrete alignment ++ * intervals, but we don't. As of Linux v6.12, the largest PUD size ++ * supporting huge pfnmap is 1GiB (ARCH_SUPPORTS_PUD_PFNMAP is only set ++ * on x86_64). Align by power-of-two size, capped at 1GiB. ++ * ++ * NB. qemu_memalign() and friends actually allocate memory, whereas ++ * the region size here can exceed host memory, therefore we manually ++ * create an oversized anonymous mapping and clean it up for alignment. 
++ */ ++ map_base = mmap(0, region->mmaps[i].size + align, PROT_NONE, ++ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ++ if (map_base == MAP_FAILED) { ++ ret = -errno; ++ goto no_mmap; ++ } ++ ++ map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align); ++ munmap(map_base, map_align - map_base); ++ munmap(map_align + region->mmaps[i].size, ++ align - (map_align - map_base)); ++ ++ region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot, ++ MAP_SHARED | MAP_FIXED, ++ region->vbasedev->fd, + region->fd_offset + + region->mmaps[i].offset); + if (region->mmaps[i].mmap == MAP_FAILED) { +-- +2.48.1 + diff --git a/SOURCES/kvm-vfio-helpers-Refactor-vfio_region_mmap-error-handlin.patch b/SOURCES/kvm-vfio-helpers-Refactor-vfio_region_mmap-error-handlin.patch new file mode 100644 index 0000000..6b06e31 --- /dev/null +++ b/SOURCES/kvm-vfio-helpers-Refactor-vfio_region_mmap-error-handlin.patch @@ -0,0 +1,93 @@ +From f3af9e4476546c0bc814f78d5dd1047ec60768e8 Mon Sep 17 00:00:00 2001 +From: Alex Williamson +Date: Tue, 22 Oct 2024 14:08:28 -0600 +Subject: [PATCH 4/5] vfio/helpers: Refactor vfio_region_mmap() error handling +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Donald Dutile +RH-MergeRequest: 366: Improve VFIO mmapping performance with huge pfnmaps +RH-Jira: RHEL-88533 +RH-Acked-by: Cédric Le Goater +RH-Acked-by: Alex Williamson +RH-Commit: [1/2] b83c7dbc6a6037b465a141961ae810e5551fad30 (ddutile/qemu-kvm) + +Move error handling code to the end of the function so that it can more +easily be shared by new mmap failure conditions. No functional change +intended. 
+ +Signed-off-by: Alex Williamson +Reviewed-by: Peter Xu +Reviewed-by: Cédric Le Goater +(cherry picked from commit 49915c0d2c9868e6f25e52e4d839943611b69e98) + +Jira: https://issues.redhat.com/browse/RHEL-88533 + +Signed-off-by: Donald Dutile +--- + hw/vfio/helpers.c | 34 +++++++++++++++++----------------- + 1 file changed, 17 insertions(+), 17 deletions(-) + +diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c +index ea15c79db0..b9e606e364 100644 +--- a/hw/vfio/helpers.c ++++ b/hw/vfio/helpers.c +@@ -395,7 +395,7 @@ static void vfio_subregion_unmap(VFIORegion *region, int index) + + int vfio_region_mmap(VFIORegion *region) + { +- int i, prot = 0; ++ int i, ret, prot = 0; + char *name; + + if (!region->mem) { +@@ -411,22 +411,8 @@ int vfio_region_mmap(VFIORegion *region) + region->fd_offset + + region->mmaps[i].offset); + if (region->mmaps[i].mmap == MAP_FAILED) { +- int ret = -errno; +- +- trace_vfio_region_mmap_fault(memory_region_name(region->mem), i, +- region->fd_offset + +- region->mmaps[i].offset, +- region->fd_offset + +- region->mmaps[i].offset + +- region->mmaps[i].size - 1, ret); +- +- region->mmaps[i].mmap = NULL; +- +- for (i--; i >= 0; i--) { +- vfio_subregion_unmap(region, i); +- } +- +- return ret; ++ ret = -errno; ++ goto no_mmap; + } + + name = g_strdup_printf("%s mmaps[%d]", +@@ -446,6 +432,20 @@ int vfio_region_mmap(VFIORegion *region) + } + + return 0; ++ ++no_mmap: ++ trace_vfio_region_mmap_fault(memory_region_name(region->mem), i, ++ region->fd_offset + region->mmaps[i].offset, ++ region->fd_offset + region->mmaps[i].offset + ++ region->mmaps[i].size - 1, ret); ++ ++ region->mmaps[i].mmap = NULL; ++ ++ for (i--; i >= 0; i--) { ++ vfio_subregion_unmap(region, i); ++ } ++ ++ return ret; + } + + void vfio_region_unmap(VFIORegion *region) +-- +2.48.1 + diff --git a/SOURCES/kvm-vfio-pci-Delete-local-pm_cap.patch b/SOURCES/kvm-vfio-pci-Delete-local-pm_cap.patch new file mode 100644 index 0000000..0d18886 --- /dev/null +++ 
b/SOURCES/kvm-vfio-pci-Delete-local-pm_cap.patch @@ -0,0 +1,81 @@ +From 80be4b7d44d4721bacaa6205a47f2d898a090c6b Mon Sep 17 00:00:00 2001 +From: Alex Williamson +Date: Tue, 25 Feb 2025 14:52:27 -0700 +Subject: [PATCH 4/7] vfio/pci: Delete local pm_cap +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Eric Auger +RH-MergeRequest: 348: PCI: Implement basic PCI PM capability backing +RH-Jira: RHEL-7301 +RH-Acked-by: Cédric Le Goater +RH-Acked-by: Alex Williamson +RH-Acked-by: Jon Maloy +RH-Commit: [4/6] 85bd6b15af7c483e36e265c12b7b1689a4872f4c (eauger1/centos-qemu-kvm) + +This is now redundant to PCIDevice.pm_cap. + +Cc: Cédric Le Goater +Reviewed-by: Zhenzhong Duan +Reviewed-by: Eric Auger +Signed-off-by: Alex Williamson +Reviewed-by: Michael S. Tsirkin +Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-4-alex.williamson@redhat.com +Signed-off-by: Cédric Le Goater +(cherry picked from commit 05c6a8eff6298675080aa2692ee05a310b3483b4) +Signed-off-by: Eric Auger +--- + hw/vfio/pci.c | 9 ++++----- + hw/vfio/pci.h | 1 - + 2 files changed, 4 insertions(+), 6 deletions(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index e18b57d864..595b5c9b25 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2219,7 +2219,6 @@ static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp) + break; + case PCI_CAP_ID_PM: + vfio_check_pm_reset(vdev, pos); +- vdev->pm_cap = pos; + ret = pci_pm_init(pdev, pos, errp) >= 0; + /* + * PCI-core config space emulation needs write access to the power +@@ -2416,17 +2415,17 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev) + vfio_disable_interrupts(vdev); + + /* Make sure the device is in D0 */ +- if (vdev->pm_cap) { ++ if (pdev->pm_cap) { + uint16_t pmcsr; + uint8_t state; + +- pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); ++ pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2); + state = pmcsr & PCI_PM_CTRL_STATE_MASK; + if 
(state) { + pmcsr &= ~PCI_PM_CTRL_STATE_MASK; +- vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2); ++ vfio_pci_write_config(pdev, pdev->pm_cap + PCI_PM_CTRL, pmcsr, 2); + /* vfio handles the necessary delay here */ +- pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); ++ pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2); + state = pmcsr & PCI_PM_CTRL_STATE_MASK; + if (state) { + error_report("vfio: Unable to power on device, stuck in D%d", +diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h +index 0d3c93fb2e..ca8d55f8b2 100644 +--- a/hw/vfio/pci.h ++++ b/hw/vfio/pci.h +@@ -161,7 +161,6 @@ struct VFIOPCIDevice { + int32_t bootindex; + uint32_t igd_gms; + OffAutoPCIBAR msix_relo; +- uint8_t pm_cap; + uint8_t nv_gpudirect_clique; + bool pci_aer; + bool req_enabled; +-- +2.48.1 + diff --git a/SOURCES/kvm-virtio-kconfig-memory-devices-are-PCI-only.patch b/SOURCES/kvm-virtio-kconfig-memory-devices-are-PCI-only.patch new file mode 100644 index 0000000..0ddc3e9 --- /dev/null +++ b/SOURCES/kvm-virtio-kconfig-memory-devices-are-PCI-only.patch @@ -0,0 +1,87 @@ +From a582cf6f68febba05e20548f643c8be637eab7b8 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Fri, 6 Sep 2024 12:16:58 +0200 +Subject: [PATCH 01/26] virtio: kconfig: memory devices are PCI only + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [1/26] 0f0eab06b6f79f84c2e8d4fee28309b3c7c57414 (thuth/qemu-kvm-cs) + +Virtio memory devices rely on PCI BARs to expose the contents of memory. +Because of this they cannot be used (yet) with virtio-mmio or virtio-ccw. +In fact the code that is common to virtio-mem and virtio-pmem, which +is in hw/virtio/virtio-md-pci.c, is only included if CONFIG_VIRTIO_PCI +is set. Reproduce the same condition in the Kconfig file, only allowing +VIRTIO_MEM and VIRTIO_PMEM to be defined if the transport supports it. 
+ +Without this patch it is possible to create a configuration with +CONFIG_VIRTIO_PCI=n and CONFIG_VIRTIO_MEM=y, but that causes a +linking failure. + +Message-ID: <20240906101658.514470-1-pbonzini@redhat.com> +Reported-by: Michael Tokarev +Reviewed-by: David Hildenbrand +Signed-off-by: Paolo Bonzini +Signed-off-by: David Hildenbrand +(cherry picked from commit 8d018fe59a0beff580ac6b3399d642c4277d9dd0) +Signed-off-by: Thomas Huth +--- + hw/virtio/Kconfig | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig +index aa63ff7fd4..0afec2ae92 100644 +--- a/hw/virtio/Kconfig ++++ b/hw/virtio/Kconfig +@@ -16,6 +16,7 @@ config VIRTIO_PCI + default y if PCI_DEVICES + depends on PCI + select VIRTIO ++ select VIRTIO_MD_SUPPORTED + + config VIRTIO_MMIO + bool +@@ -35,10 +36,17 @@ config VIRTIO_CRYPTO + default y + depends on VIRTIO + ++# not all virtio transports support memory devices; if none does, ++# no need to include the code ++config VIRTIO_MD_SUPPORTED ++ bool ++ + config VIRTIO_MD + bool ++ depends on VIRTIO_MD_SUPPORTED + select MEM_DEVICE + ++# selected by the board if it has the required support code + config VIRTIO_PMEM_SUPPORTED + bool + +@@ -46,9 +54,11 @@ config VIRTIO_PMEM + bool + default y + depends on VIRTIO ++ depends on VIRTIO_MD_SUPPORTED + depends on VIRTIO_PMEM_SUPPORTED + select VIRTIO_MD + ++# selected by the board if it has the required support code + config VIRTIO_MEM_SUPPORTED + bool + +@@ -57,6 +67,7 @@ config VIRTIO_MEM + default y + depends on VIRTIO + depends on LINUX ++ depends on VIRTIO_MD_SUPPORTED + depends on VIRTIO_MEM_SUPPORTED + select VIRTIO_MD + +-- +2.48.1 + diff --git a/SOURCES/kvm-virtio-mem-Add-support-for-suspend-wake-up-with-plug.patch b/SOURCES/kvm-virtio-mem-Add-support-for-suspend-wake-up-with-plug.patch new file mode 100644 index 0000000..ba58998 --- /dev/null +++ b/SOURCES/kvm-virtio-mem-Add-support-for-suspend-wake-up-with-plug.patch @@ -0,0 +1,79 @@ +From 
001200670ce9076a34419828e7e7ba92f19a80b7 Mon Sep 17 00:00:00 2001 +From: Juraj Marcin +Date: Wed, 4 Sep 2024 12:37:15 +0200 +Subject: [PATCH 08/26] virtio-mem: Add support for suspend+wake-up with + plugged memory + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [8/26] 08e25d41e32b3ac2bf5e0266f9c7e91739eda4d4 (thuth/qemu-kvm-cs) + +Before, the virtio-mem device would unplug all the memory with any reset +of the device, including during the wake-up of the guest from a +suspended state. Due to this, the virtio-mem driver in the Linux kernel +disallowed suspend-to-ram requests in the guest when the +VIRTIO_MEM_F_PERSISTENT_SUSPEND feature is not exposed by QEMU. + +This patch adds the code to skip the reset on wake-up and exposes +the VIRTIO_MEM_F_PERSISTENT_SUSPEND feature to the guest kernel driver +when suspending is possible in QEMU (currently only x86). + +Message-ID: <20240904103722.946194-5-jmarcin@redhat.com> +Reviewed-by: David Hildenbrand +Signed-off-by: Juraj Marcin +Signed-off-by: David Hildenbrand +(cherry picked from commit 1f5f49056d0f140568805d66f33396ed5cd90369) +Signed-off-by: Thomas Huth +--- + hw/virtio/virtio-mem.c | 10 ++++++++++ + hw/virtio/virtio-qmp.c | 3 +++ + 2 files changed, 13 insertions(+) + +diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c +index 025ae4abac..51642a15ef 100644 +--- a/hw/virtio/virtio-mem.c ++++ b/hw/virtio/virtio-mem.c +@@ -883,6 +883,9 @@ static uint64_t virtio_mem_get_features(VirtIODevice *vdev, uint64_t features, + if (vmem->unplugged_inaccessible == ON_OFF_AUTO_ON) { + virtio_add_feature(&features, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE); + } ++ if (qemu_wakeup_suspend_enabled()) { ++ virtio_add_feature(&features, VIRTIO_MEM_F_PERSISTENT_SUSPEND); ++ } + return features; + } + +@@ -1842,6 +1845,13 @@ static void virtio_mem_system_reset_hold(Object *obj, ResetType type) + { + VirtIOMEM
*vmem = VIRTIO_MEM(obj); + ++ /* ++ * When waking up from standby/suspend-to-ram, do not unplug any memory. ++ */ ++ if (type == RESET_TYPE_WAKEUP) { ++ return; ++ } ++ + /* + * During usual resets, we will unplug all memory and shrink the usable + * region size. This is, however, not possible in all scenarios. Then, +diff --git a/hw/virtio/virtio-qmp.c b/hw/virtio/virtio-qmp.c +index 1dd96ed20f..cccc6fe761 100644 +--- a/hw/virtio/virtio-qmp.c ++++ b/hw/virtio/virtio-qmp.c +@@ -450,6 +450,9 @@ static const qmp_virtio_feature_map_t virtio_mem_feature_map[] = { + FEATURE_ENTRY(VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE, \ + "VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE: Unplugged memory cannot be " + "accessed"), ++ FEATURE_ENTRY(VIRTIO_MEM_F_PERSISTENT_SUSPEND, \ ++ "VIRTIO_MEM_F_PERSISTENT_SUSPND: Plugged memory will remain " ++ "plugged when suspending+resuming"), + { -1, "" } + }; + #endif +-- +2.48.1 + diff --git a/SOURCES/kvm-virtio-mem-Use-new-Resettable-framework-instead-of-L.patch b/SOURCES/kvm-virtio-mem-Use-new-Resettable-framework-instead-of-L.patch new file mode 100644 index 0000000..5bb3c9e --- /dev/null +++ b/SOURCES/kvm-virtio-mem-Use-new-Resettable-framework-instead-of-L.patch @@ -0,0 +1,141 @@ +From 6bc0cdecdc736d642bb6c040e07d79a0a3e591ea Mon Sep 17 00:00:00 2001 +From: Juraj Marcin +Date: Wed, 4 Sep 2024 12:37:14 +0200 +Subject: [PATCH 07/26] virtio-mem: Use new Resettable framework instead of + LegacyReset + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [7/26] 31fddbeb4aaf6794b83399a7e2996f01d918d748 (thuth/qemu-kvm-cs) + +LegacyReset does not pass ResetType to the reset callback method, which +the new Resettable framework uses. Due to this, virtio-mem cannot use +the new RESET_TYPE_WAKEUP to skip the reset during wake-up from a +suspended state. 
+ +This patch overrides Resettable interface methods in VirtIOMEMClass +to use the new Resettable framework and replaces +qemu_[un]register_reset() calls with qemu_[un]register_resettable(). + +Message-ID: <20240904103722.946194-4-jmarcin@redhat.com> +Reviewed-by: David Hildenbrand +Signed-off-by: Juraj Marcin +Signed-off-by: David Hildenbrand +(cherry picked from commit c009a311e93963860cfba917605a4bf903a06bce) +Signed-off-by: Thomas Huth +--- + hw/virtio/virtio-mem.c | 38 +++++++++++++++++++++------------- + include/hw/virtio/virtio-mem.h | 4 ++++ + 2 files changed, 28 insertions(+), 14 deletions(-) + +diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c +index ba11aa4646..025ae4abac 100644 +--- a/hw/virtio/virtio-mem.c ++++ b/hw/virtio/virtio-mem.c +@@ -895,18 +895,6 @@ static int virtio_mem_validate_features(VirtIODevice *vdev) + return 0; + } + +-static void virtio_mem_system_reset(void *opaque) +-{ +- VirtIOMEM *vmem = VIRTIO_MEM(opaque); +- +- /* +- * During usual resets, we will unplug all memory and shrink the usable +- * region size. This is, however, not possible in all scenarios. Then, +- * the guest has to deal with this manually (VIRTIO_MEM_REQ_UNPLUG_ALL). +- */ +- virtio_mem_unplug_all(vmem); +-} +- + static void virtio_mem_prepare_mr(VirtIOMEM *vmem) + { + const uint64_t region_size = memory_region_size(&vmem->memdev->mr); +@@ -1123,7 +1111,7 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) + vmstate_register_any(VMSTATE_IF(vmem), + &vmstate_virtio_mem_device_early, vmem); + } +- qemu_register_reset(virtio_mem_system_reset, vmem); ++ qemu_register_resettable(OBJECT(vmem)); + + /* + * Set ourselves as RamDiscardManager before the plug handler maps the +@@ -1143,7 +1131,7 @@ static void virtio_mem_device_unrealize(DeviceState *dev) + * found via an address space anymore. Unset ourselves.
+ */ + memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL); +- qemu_unregister_reset(virtio_mem_system_reset, vmem); ++ qemu_unregister_resettable(OBJECT(vmem)); + if (vmem->early_migration) { + vmstate_unregister(VMSTATE_IF(vmem), &vmstate_virtio_mem_device_early, + vmem); +@@ -1844,12 +1832,31 @@ static void virtio_mem_unplug_request_check(VirtIOMEM *vmem, Error **errp) + } + } + ++static ResettableState *virtio_mem_get_reset_state(Object *obj) ++{ ++ VirtIOMEM *vmem = VIRTIO_MEM(obj); ++ return &vmem->reset_state; ++} ++ ++static void virtio_mem_system_reset_hold(Object *obj, ResetType type) ++{ ++ VirtIOMEM *vmem = VIRTIO_MEM(obj); ++ ++ /* ++ * During usual resets, we will unplug all memory and shrink the usable ++ * region size. This is, however, not possible in all scenarios. Then, ++ * the guest has to deal with this manually (VIRTIO_MEM_REQ_UNPLUG_ALL). ++ */ ++ virtio_mem_unplug_all(vmem); ++} ++ + static void virtio_mem_class_init(ObjectClass *klass, void *data) + { + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + VirtIOMEMClass *vmc = VIRTIO_MEM_CLASS(klass); + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(klass); ++ ResettableClass *rc = RESETTABLE_CLASS(klass); + + device_class_set_props(dc, virtio_mem_properties); + dc->vmsd = &vmstate_virtio_mem; +@@ -1876,6 +1883,9 @@ static void virtio_mem_class_init(ObjectClass *klass, void *data) + rdmc->replay_discarded = virtio_mem_rdm_replay_discarded; + rdmc->register_listener = virtio_mem_rdm_register_listener; + rdmc->unregister_listener = virtio_mem_rdm_unregister_listener; ++ ++ rc->get_state = virtio_mem_get_reset_state; ++ rc->phases.hold = virtio_mem_system_reset_hold; + } + + static const TypeInfo virtio_mem_info = { +diff --git a/include/hw/virtio/virtio-mem.h b/include/hw/virtio/virtio-mem.h +index 5f5b02b8f9..a1af144c28 100644 +--- a/include/hw/virtio/virtio-mem.h ++++ b/include/hw/virtio/virtio-mem.h +@@ -14,6 +14,7 @@ + 
#define HW_VIRTIO_MEM_H + + #include "standard-headers/linux/virtio_mem.h" ++#include "hw/resettable.h" + #include "hw/virtio/virtio.h" + #include "qapi/qapi-types-misc.h" + #include "sysemu/hostmem.h" +@@ -115,6 +116,9 @@ struct VirtIOMEM { + + /* listeners to notify on plug/unplug activity. */ + QLIST_HEAD(, RamDiscardListener) rdl_list; ++ ++ /* State of the resettable container */ ++ ResettableState reset_state; + }; + + struct VirtIOMEMClass { +-- +2.48.1 + diff --git a/SOURCES/kvm-virtio-mem-don-t-warn-about-THP-sizes-on-a-kernel-wi.patch b/SOURCES/kvm-virtio-mem-don-t-warn-about-THP-sizes-on-a-kernel-wi.patch new file mode 100644 index 0000000..f527a70 --- /dev/null +++ b/SOURCES/kvm-virtio-mem-don-t-warn-about-THP-sizes-on-a-kernel-wi.patch @@ -0,0 +1,59 @@ +From f4052d25199bfce8ce29a173934a805fe1cf7e3e Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Tue, 10 Sep 2024 18:34:33 +0200 +Subject: [PATCH 25/26] virtio-mem: don't warn about THP sizes on a kernel + without THP support + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [25/26] 5dff17ef818722db8f1fa87cff5b7777afc3c814 (thuth/qemu-kvm-cs) + +If the config directory in sysfs does not exist at all, we are dealing +with a system that does not support THPs. Simply use 1 MiB block size +then, instead of warning "Could not detect THP size, falling back to +..." and falling back to the default THP size. + +Cc: "Michael S. Tsirkin" +Cc: Gavin Shan +Cc: Juraj Marcin +Signed-off-by: David Hildenbrand +Message-Id: <20240910163433.2100295-1-david@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. 
Tsirkin +(cherry picked from commit 95b717a8154b955de2782305f305b63f357b0576) +Signed-off-by: Thomas Huth +--- + hw/virtio/virtio-mem.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c +index c9f8a23bbc..4977658312 100644 +--- a/hw/virtio/virtio-mem.c ++++ b/hw/virtio/virtio-mem.c +@@ -90,6 +90,7 @@ static uint32_t virtio_mem_default_thp_size(void) + static uint32_t thp_size; + + #define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" ++#define HPAGE_PATH "/sys/kernel/mm/transparent_hugepage/" + static uint32_t virtio_mem_thp_size(void) + { + gchar *content = NULL; +@@ -100,6 +101,12 @@ static uint32_t virtio_mem_thp_size(void) + return thp_size; + } + ++ /* No THP -> no restrictions. */ ++ if (!g_file_test(HPAGE_PATH, G_FILE_TEST_EXISTS)) { ++ thp_size = VIRTIO_MEM_MIN_BLOCK_SIZE; ++ return thp_size; ++ } ++ + /* + * Try to probe the actual THP size, fallback to (sane but eventually + * incorrect) default sizes. 
+-- +2.48.1 + diff --git a/SOURCES/kvm-virtio-mem-unplug-memory-only-during-system-resets-n.patch b/SOURCES/kvm-virtio-mem-unplug-memory-only-during-system-resets-n.patch new file mode 100644 index 0000000..cfb49f4 --- /dev/null +++ b/SOURCES/kvm-virtio-mem-unplug-memory-only-during-system-resets-n.patch @@ -0,0 +1,258 @@ +From e5f2bb584154eef665211228f1ac3113e2acc269 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Fri, 25 Oct 2024 12:41:03 +0200 +Subject: [PATCH 09/26] virtio-mem: unplug memory only during system resets, + not device resets + +RH-Author: Thomas Huth +RH-MergeRequest: 351: Enable virtio-mem support on s390x +RH-Jira: RHEL-72977 +RH-Acked-by: David Hildenbrand +RH-Acked-by: Juraj Marcin +RH-Commit: [9/26] 7c5ddd4d3fd0d19caa946bcbe98cb5732404978b (thuth/qemu-kvm-cs) + +We recently converted from the LegacyReset to the new reset framework +in commit c009a311e939 ("virtio-mem: Use new Resettable framework instead +of LegacyReset") to be able to use the ResetType to filter out wakeup +resets. + +However, this change had undesired implications: as we override the +Resettable interface methods in VirtIOMEMClass, the reset handler will +not only get called during system resets (i.e., qemu_devices_reset()) +but also during any direct or indirect device resets (e.g., +device_cold_reset()). + +Further, we might now receive two reset callbacks during +qemu_devices_reset(), first when reset by a parent and later when reset +directly. 
+ +The memory state of virtio-mem devices is rather special: it's supposed to +be persistent/unchanged during most resets (similar to resetting a hard +disk will not destroy the data), unless actually cold-resetting the whole +system (different to a hard disk where a reboot will not destroy the data): +ripping out system RAM is something guest OSes don't particularly enjoy, +but we want to detect when rebooting to an OS that does not support +virtio-mem and wouldn't be able to detect+use the memory -- and we want +to force-defragment hotplugged memory to also shrink the usable device +memory region. So we really want to catch system resets to do that. + +On supported targets (e.g., x86), getting a cold reset on the +device/parent triggers is not that easy (but looks like PCI code +might trigger it), so this implication went unnoticed. + +However, with upcoming s390x support it is problematic: during +kdump, s390x triggers a subsystem reset, ending up in +s390_machine_reset() and calling only subsystem_reset() instead of +qemu_devices_reset() -- because it's not a full system reset. + +In subsystem_reset(), s390x performs a device_cold_reset() of any +TYPE_VIRTUAL_CSS_BRIDGE device, which ends up resetting all children, +including the virtio-mem device. Consequently, we wrongly detect a system +reset and unplug all device memory, resulting in hotplugged memory not +getting included in the crash dump -- undesired. + +We really must not mess with hotplugged memory state during simple +device resets. To fix, create+register a new reset object that will only +get triggered during qemu_devices_reset() calls, but not during any other +resets as it is logically not the child of any other object. + +Message-ID: <20241025104103.342188-1-david@redhat.com> +Acked-by: Michael S. Tsirkin +Cc: "Michael S. 
Tsirkin" +Cc: Juraj Marcin +Cc: Peter Maydell +Signed-off-by: David Hildenbrand +(cherry picked from commit 713484d0389c9d1cbb87eca060361281248b69f5) +Signed-off-by: Thomas Huth +--- + hw/virtio/virtio-mem.c | 103 +++++++++++++++++++++++---------- + include/hw/virtio/virtio-mem.h | 13 ++++- + 2 files changed, 84 insertions(+), 32 deletions(-) + +diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c +index 51642a15ef..00da98b6e1 100644 +--- a/hw/virtio/virtio-mem.c ++++ b/hw/virtio/virtio-mem.c +@@ -949,6 +949,7 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) + VirtIOMEM *vmem = VIRTIO_MEM(dev); + uint64_t page_size; + RAMBlock *rb; ++ Object *obj; + int ret; + + if (!vmem->memdev) { +@@ -1114,7 +1115,28 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) + vmstate_register_any(VMSTATE_IF(vmem), + &vmstate_virtio_mem_device_early, vmem); + } +- qemu_register_resettable(OBJECT(vmem)); ++ ++ /* ++ * We only want to unplug all memory to start with a clean slate when ++ * it is safe for the guest -- during system resets that call ++ * qemu_devices_reset(). ++ * ++ * We'll filter out selected qemu_devices_reset() calls used for other ++ * purposes, like resetting all devices during wakeup from suspend on ++ * x86 based on the reset type passed to qemu_devices_reset(). ++ * ++ * Unplugging all memory during simple device resets can result in the VM ++ * unexpectedly losing RAM, corrupting VM state. ++ * ++ * Simple device resets (or resets triggered by getting a parent device ++ * reset) must not change the state of plugged memory blocks. Therefore, ++ * we need a dedicated reset object that only gets called during ++ * qemu_devices_reset(). 
++ */ ++ obj = object_new(TYPE_VIRTIO_MEM_SYSTEM_RESET); ++ vmem->system_reset = VIRTIO_MEM_SYSTEM_RESET(obj); ++ vmem->system_reset->vmem = vmem; ++ qemu_register_resettable(obj); + + /* + * Set ourselves as RamDiscardManager before the plug handler maps the +@@ -1134,7 +1156,10 @@ static void virtio_mem_device_unrealize(DeviceState *dev) + * found via an address space anymore. Unset ourselves. + */ + memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL); +- qemu_unregister_resettable(OBJECT(vmem)); ++ ++ qemu_unregister_resettable(OBJECT(vmem->system_reset)); ++ object_unref(OBJECT(vmem->system_reset)); ++ + if (vmem->early_migration) { + vmstate_unregister(VMSTATE_IF(vmem), &vmstate_virtio_mem_device_early, + vmem); +@@ -1835,38 +1860,12 @@ static void virtio_mem_unplug_request_check(VirtIOMEM *vmem, Error **errp) + } + } + +-static ResettableState *virtio_mem_get_reset_state(Object *obj) +-{ +- VirtIOMEM *vmem = VIRTIO_MEM(obj); +- return &vmem->reset_state; +-} +- +-static void virtio_mem_system_reset_hold(Object *obj, ResetType type) +-{ +- VirtIOMEM *vmem = VIRTIO_MEM(obj); +- +- /* +- * When waking up from standby/suspend-to-ram, do not unplug any memory. +- */ +- if (type == RESET_TYPE_WAKEUP) { +- return; +- } +- +- /* +- * During usual resets, we will unplug all memory and shrink the usable +- * region size. This is, however, not possible in all scenarios. Then, +- * the guest has to deal with this manually (VIRTIO_MEM_REQ_UNPLUG_ALL). 
+- */ +- virtio_mem_unplug_all(vmem); +-} +- + static void virtio_mem_class_init(ObjectClass *klass, void *data) + { + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + VirtIOMEMClass *vmc = VIRTIO_MEM_CLASS(klass); + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(klass); +- ResettableClass *rc = RESETTABLE_CLASS(klass); + + device_class_set_props(dc, virtio_mem_properties); + dc->vmsd = &vmstate_virtio_mem; +@@ -1893,9 +1892,6 @@ static void virtio_mem_class_init(ObjectClass *klass, void *data) + rdmc->replay_discarded = virtio_mem_rdm_replay_discarded; + rdmc->register_listener = virtio_mem_rdm_register_listener; + rdmc->unregister_listener = virtio_mem_rdm_unregister_listener; +- +- rc->get_state = virtio_mem_get_reset_state; +- rc->phases.hold = virtio_mem_system_reset_hold; + } + + static const TypeInfo virtio_mem_info = { +@@ -1918,3 +1914,48 @@ static void virtio_register_types(void) + } + + type_init(virtio_register_types) ++ ++OBJECT_DEFINE_SIMPLE_TYPE_WITH_INTERFACES(VirtioMemSystemReset, virtio_mem_system_reset, VIRTIO_MEM_SYSTEM_RESET, OBJECT, { TYPE_RESETTABLE_INTERFACE }, { }) ++ ++static void virtio_mem_system_reset_init(Object *obj) ++{ ++} ++ ++static void virtio_mem_system_reset_finalize(Object *obj) ++{ ++} ++ ++static ResettableState *virtio_mem_system_reset_get_state(Object *obj) ++{ ++ VirtioMemSystemReset *vmem_reset = VIRTIO_MEM_SYSTEM_RESET(obj); ++ ++ return &vmem_reset->reset_state; ++} ++ ++static void virtio_mem_system_reset_hold(Object *obj, ResetType type) ++{ ++ VirtioMemSystemReset *vmem_reset = VIRTIO_MEM_SYSTEM_RESET(obj); ++ VirtIOMEM *vmem = vmem_reset->vmem; ++ ++ /* ++ * When waking up from standby/suspend-to-ram, do not unplug any memory. ++ */ ++ if (type == RESET_TYPE_WAKEUP) { ++ return; ++ } ++ ++ /* ++ * During usual resets, we will unplug all memory and shrink the usable ++ * region size. This is, however, not possible in all scenarios. 
Then, ++ * the guest has to deal with this manually (VIRTIO_MEM_REQ_UNPLUG_ALL). ++ */ ++ virtio_mem_unplug_all(vmem); ++} ++ ++static void virtio_mem_system_reset_class_init(ObjectClass *klass, void *data) ++{ ++ ResettableClass *rc = RESETTABLE_CLASS(klass); ++ ++ rc->get_state = virtio_mem_system_reset_get_state; ++ rc->phases.hold = virtio_mem_system_reset_hold; ++} +diff --git a/include/hw/virtio/virtio-mem.h b/include/hw/virtio/virtio-mem.h +index a1af144c28..550ce585b2 100644 +--- a/include/hw/virtio/virtio-mem.h ++++ b/include/hw/virtio/virtio-mem.h +@@ -25,6 +25,10 @@ + OBJECT_DECLARE_TYPE(VirtIOMEM, VirtIOMEMClass, + VIRTIO_MEM) + ++#define TYPE_VIRTIO_MEM_SYSTEM_RESET "virtio-mem-system-reset" ++ ++OBJECT_DECLARE_SIMPLE_TYPE(VirtioMemSystemReset, VIRTIO_MEM_SYSTEM_RESET) ++ + #define VIRTIO_MEM_MEMDEV_PROP "memdev" + #define VIRTIO_MEM_NODE_PROP "node" + #define VIRTIO_MEM_SIZE_PROP "size" +@@ -117,8 +121,15 @@ struct VirtIOMEM { + /* listeners to notify on plug/unplug activity. */ + QLIST_HEAD(, RamDiscardListener) rdl_list; + +- /* State of the resettable container */ ++ /* Catch system resets -> qemu_devices_reset() only. 
*/ ++ VirtioMemSystemReset *system_reset; ++}; ++ ++struct VirtioMemSystemReset { ++ Object parent; ++ + ResettableState reset_state; ++ VirtIOMEM *vmem; + }; + + struct VirtIOMEMClass { +-- +2.48.1 + diff --git a/SOURCES/kvm-virtio-net-disable-USO-for-virt-rhel9.6.patch b/SOURCES/kvm-virtio-net-disable-USO-for-virt-rhel9.6.patch new file mode 100644 index 0000000..632c5aa --- /dev/null +++ b/SOURCES/kvm-virtio-net-disable-USO-for-virt-rhel9.6.patch @@ -0,0 +1,135 @@ +From a7cd7f5b3bd6df30e75532fb19b645c5349f6183 Mon Sep 17 00:00:00 2001 +From: Shaoqin Huang +Date: Thu, 24 Apr 2025 04:48:29 -0400 +Subject: [PATCH 1/5] virtio-net: disable USO for virt-rhel9.6 + +RH-Author: Shaoqin Huang +RH-MergeRequest: 353: virtio-net: disable USO for virt-rhel9.6 +RH-Jira: RHEL-80313 +RH-Acked-by: Thomas Huth +RH-Acked-by: Eric Auger +RH-Commit: [1/2] c7099480e656106219040d45ce7b76b19376227a (shahuang/qemu-kvm) + +JIRA: https://issues.redhat.com/browse/RHEL-80313 +Upstream Status: RHEL only + +RHEL9 kernels have USO* disabled while RHEL10 has it enabled, this can +cause the migration to fail when running a RHEL9 qemu on a RHEL10 kernel +and then migrate to a RHEL9 kernel. + +Make sure the virt-rhel9.6 machine type in RHEL9 stay the same +independent of the kernel. 
+ +Signed-off-by: Shaoqin Huang +--- + hw/arm/virt.c | 3 +++ + hw/core/machine.c | 15 +++++++++------ + hw/i386/pc_piix.c | 1 + + hw/i386/pc_q35.c | 3 +++ + hw/s390x/s390-virtio-ccw.c | 2 ++ + include/hw/boards.h | 3 +++ + 6 files changed, 21 insertions(+), 6 deletions(-) + +diff --git a/hw/arm/virt.c b/hw/arm/virt.c +index c5270a5abc..896deaa025 100644 +--- a/hw/arm/virt.c ++++ b/hw/arm/virt.c +@@ -3600,6 +3600,9 @@ DEFINE_VIRT_MACHINE(2, 6) + static void virt_rhel_machine_9_6_0_options(MachineClass *mc) + { + compat_props_add(mc->compat_props, arm_rhel9_compat, arm_rhel9_compat_len); ++ ++ /* NB: remember to move this line to the *latest* RHEL 9 machine */ ++ compat_props_add(mc->compat_props, hw_compat_rhel_9, hw_compat_rhel_9_len); + } + DEFINE_VIRT_MACHINE_AS_LATEST(9, 6, 0) + +diff --git a/hw/core/machine.c b/hw/core/machine.c +index add42660f8..37751f6b9b 100644 +--- a/hw/core/machine.c ++++ b/hw/core/machine.c +@@ -305,6 +305,15 @@ GlobalProperty hw_compat_2_1[] = { + }; + const size_t hw_compat_2_1_len = G_N_ELEMENTS(hw_compat_2_1); + ++/* Apply this to all RHEL9 boards going backward and forward */ ++GlobalProperty hw_compat_rhel_9[] = { ++ /* supported by userspace, but RHEL 9 *kernels* do not support USO. 
*/ ++ { TYPE_VIRTIO_NET, "host_uso", "off"}, ++ { TYPE_VIRTIO_NET, "guest_uso4", "off"}, ++ { TYPE_VIRTIO_NET, "guest_uso6", "off"}, ++}; ++const size_t hw_compat_rhel_9_len = G_N_ELEMENTS(hw_compat_rhel_9); ++ + /* + * RHEL only: machine types for previous major releases are deprecated + */ +@@ -341,12 +350,6 @@ GlobalProperty hw_compat_rhel_9_5[] = { + const size_t hw_compat_rhel_9_5_len = G_N_ELEMENTS(hw_compat_rhel_9_5); + + GlobalProperty hw_compat_rhel_9_4[] = { +- /* hw_compat_rhel_9_4 from hw_compat_8_0 */ +- { TYPE_VIRTIO_NET, "host_uso", "off"}, +- /* hw_compat_rhel_9_4 from hw_compat_8_0 */ +- { TYPE_VIRTIO_NET, "guest_uso4", "off"}, +- /* hw_compat_rhel_9_4 from hw_compat_8_0 */ +- { TYPE_VIRTIO_NET, "guest_uso6", "off"}, + /* hw_compat_rhel_9_4 from hw_compat_8_1 */ + { TYPE_PCI_BRIDGE, "x-pci-express-writeable-slt-bug", "true" }, + /* hw_compat_rhel_9_4 from hw_compat_8_1 */ +diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c +index 656abb5d39..10764bf596 100644 +--- a/hw/i386/pc_piix.c ++++ b/hw/i386/pc_piix.c +@@ -929,6 +929,7 @@ static void pc_i440fx_rhel_machine_7_6_0_options(MachineClass *m) + compat_props_add(m->compat_props, pc_rhel_8_0_compat, pc_rhel_8_0_compat_len); + compat_props_add(m->compat_props, hw_compat_rhel_7_6, hw_compat_rhel_7_6_len); + compat_props_add(m->compat_props, pc_rhel_7_6_compat, pc_rhel_7_6_compat_len); ++ compat_props_add(m->compat_props, hw_compat_rhel_9, hw_compat_rhel_9_len); + } + + DEFINE_I440FX_MACHINE(7, 6, 0); +diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c +index 578f63524f..5bf08be0fb 100644 +--- a/hw/i386/pc_q35.c ++++ b/hw/i386/pc_q35.c +@@ -679,6 +679,9 @@ static void pc_q35_rhel_machine_9_6_0_options(MachineClass *m) + m->desc = "RHEL-9.6.0 PC (Q35 + ICH9, 2009)"; + pcmc->smbios_stream_product = "RHEL"; + pcmc->smbios_stream_version = "9.6.0"; ++ ++ /* NB: remember to move this line to the *latest* RHEL 9 machine */ ++ compat_props_add(m->compat_props, hw_compat_rhel_9, hw_compat_rhel_9_len); + } + + 
DEFINE_Q35_MACHINE_BUGFIX(9, 6, 0); +diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c +index 9f4ad01789..312e8f18aa 100644 +--- a/hw/s390x/s390-virtio-ccw.c ++++ b/hw/s390x/s390-virtio-ccw.c +@@ -1348,6 +1348,8 @@ static void ccw_rhel_machine_9_6_0_instance_options(MachineState *machine) + + static void ccw_rhel_machine_9_6_0_class_options(MachineClass *mc) + { ++ /* NB: remember to move this line to the *latest* RHEL 9 machine */ ++ compat_props_add(mc->compat_props, hw_compat_rhel_9, hw_compat_rhel_9_len); + } + DEFINE_CCW_MACHINE_AS_LATEST(9, 6, 0); + +diff --git a/include/hw/boards.h b/include/hw/boards.h +index fe011b1e86..8f3fa40cf9 100644 +--- a/include/hw/boards.h ++++ b/include/hw/boards.h +@@ -803,6 +803,9 @@ extern const size_t hw_compat_2_2_len; + extern GlobalProperty hw_compat_2_1[]; + extern const size_t hw_compat_2_1_len; + ++extern GlobalProperty hw_compat_rhel_9[]; ++extern const size_t hw_compat_rhel_9_len; ++ + extern GlobalProperty hw_compat_rhel_9_6[]; + extern const size_t hw_compat_rhel_9_6_len; + +-- +2.48.1 + diff --git a/SOURCES/qemu-ga.sysconfig b/SOURCES/qemu-ga.sysconfig index 736b471..b574514 100644 --- a/SOURCES/qemu-ga.sysconfig +++ b/SOURCES/qemu-ga.sysconfig @@ -13,7 +13,7 @@ # # You can get the list of RPC commands using "qemu-ga --allow-rpcs='?'". # There should be no spaces between commas and commands in the allow list. 
-FILTER_RPC_ARGS="--allow-rpcs=guest-sync-delimited,guest-sync,guest-ping,guest-get-time,guest-set-time,guest-info,guest-shutdown,guest-fsfreeze-status,guest-fsfreeze-freeze,guest-fsfreeze-freeze-list,guest-fsfreeze-thaw,guest-fstrim,guest-suspend-disk,guest-suspend-ram,guest-suspend-hybrid,guest-network-get-interfaces,guest-get-vcpus,guest-set-vcpus,guest-get-disks,guest-get-fsinfo,guest-set-user-password,guest-get-memory-blocks,guest-set-memory-blocks,guest-get-memory-block-info,guest-get-host-name,guest-get-users,guest-get-timezone,guest-get-osinfo,guest-get-devices,guest-ssh-get-authorized-keys,guest-ssh-add-authorized-keys,guest-ssh-remove-authorized-keys,guest-get-diskstats,guest-get-cpustats" +FILTER_RPC_ARGS="--allow-rpcs=guest-sync-delimited,guest-sync,guest-ping,guest-get-time,guest-set-time,guest-info,guest-shutdown,guest-fsfreeze-status,guest-fsfreeze-freeze,guest-fsfreeze-freeze-list,guest-fsfreeze-thaw,guest-fstrim,guest-suspend-disk,guest-suspend-ram,guest-suspend-hybrid,guest-network-get-interfaces,guest-get-vcpus,guest-set-vcpus,guest-get-disks,guest-get-fsinfo,guest-set-user-password,guest-get-memory-blocks,guest-set-memory-blocks,guest-get-memory-block-info,guest-get-host-name,guest-get-users,guest-get-timezone,guest-get-osinfo,guest-get-devices,guest-ssh-get-authorized-keys,guest-ssh-add-authorized-keys,guest-ssh-remove-authorized-keys,guest-get-diskstats,guest-get-cpustats,guest-network-get-route,guest-get-load" # Fsfreeze hook script specification. # diff --git a/SPECS/qemu-kvm.spec b/SPECS/qemu-kvm.spec index 5422e1f..0b8b9ce 100644 --- a/SPECS/qemu-kvm.spec +++ b/SPECS/qemu-kvm.spec @@ -149,7 +149,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}:%{version} \ Summary: QEMU is a machine emulator and virtualizer Name: qemu-kvm Version: 9.1.0 -Release: 15%{?rcrel}%{?dist}%{?cc_suffix} +Release: 26%{?rcrel}%{?dist}%{?cc_suffix} # Epoch because we pushed a qemu-1.0 package. 
AIUI this can't ever be dropped # Epoch 15 used for RHEL 8 # Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5) @@ -465,6 +465,278 @@ Patch147: kvm-iotests-Add-qsd-migrate-case.patch # For RHEL-54296 - Provide QMP command for block device reactivation after migration [rhel-9.5] # For RHEL-78397 - backport fix for double migration of a paused VM (disk activation rewrite) Patch148: kvm-iotests-Add-NBD-based-tests-for-inactive-nodes.patch +# For RHEL-7188 - [intel iommu][PF] DMAR: DRHD: handling fault status reg +Patch149: kvm-hw-virtio-virtio-iommu-Migrate-to-3-phase-reset.patch +# For RHEL-7188 - [intel iommu][PF] DMAR: DRHD: handling fault status reg +Patch150: kvm-hw-i386-intel-iommu-Migrate-to-3-phase-reset.patch +# For RHEL-7188 - [intel iommu][PF] DMAR: DRHD: handling fault status reg +Patch151: kvm-hw-arm-smmuv3-Move-reset-to-exit-phase.patch +# For RHEL-7188 - [intel iommu][PF] DMAR: DRHD: handling fault status reg +Patch152: kvm-hw-vfio-common-Add-a-trace-point-in-vfio_reset_handl.patch +# For RHEL-7188 - [intel iommu][PF] DMAR: DRHD: handling fault status reg +Patch153: kvm-docs-devel-reset-Document-reset-expectations-for-DMA.patch +# For RHEL-69622 - [qemu-guest-agent][RFE] Report CPU load average +Patch154: kvm-qga-implement-a-guest-get-load-command.patch +# For RHEL-69775 - Guest crashed on the target host when the migration was canceled +Patch155: kvm-migration-Fix-UAF-for-incoming-migration-on-Migratio.patch +# For RHEL-47340 - [Qemu RHEL-9] qemu-trace-stap should handle lack of stap more gracefully +Patch156: kvm-scripts-improve-error-from-qemu-trace-stap-on-missin.patch +# For RHEL-7301 - [intel iommu] VFIO_MAP_DMA failed: Bad address on system_powerdown +Patch157: kvm-hw-pci-Rename-has_power-to-enabled.patch +# For RHEL-7301 - [intel iommu] VFIO_MAP_DMA failed: Bad address on system_powerdown +Patch158: kvm-hw-pci-Basic-support-for-PCI-power-management.patch +# For RHEL-7301 - [intel iommu] VFIO_MAP_DMA failed: Bad address on 
system_powerdown +Patch159: kvm-pci-Use-PCI-PM-capability-initializer.patch +# For RHEL-7301 - [intel iommu] VFIO_MAP_DMA failed: Bad address on system_powerdown +Patch160: kvm-vfio-pci-Delete-local-pm_cap.patch +# For RHEL-7301 - [intel iommu] VFIO_MAP_DMA failed: Bad address on system_powerdown +Patch161: kvm-pcie-virtio-Remove-redundant-pm_cap.patch +# For RHEL-7301 - [intel iommu] VFIO_MAP_DMA failed: Bad address on system_powerdown +Patch162: kvm-hw-vfio-pci-Re-order-pre-reset.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch163: kvm-virtio-kconfig-memory-devices-are-PCI-only.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch164: kvm-hw-s390-ccw-device-Convert-to-three-phase-reset.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch165: kvm-hw-s390-virtio-ccw-Convert-to-three-phase-reset.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch166: kvm-target-s390-Convert-CPU-to-Resettable-interface.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch167: kvm-reset-Use-ResetType-for-qemu_devices_reset-and-Machi.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch168: kvm-reset-Add-RESET_TYPE_WAKEUP.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch169: kvm-virtio-mem-Use-new-Resettable-framework-instead-of-L.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch170: kvm-virtio-mem-Add-support-for-suspend-wake-up-with-plug.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch171: kvm-virtio-mem-unplug-memory-only-during-system-resets-n.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch172: kvm-s390x-s390-virtio-ccw-don-t-crash-on-weird-RAM-sizes.patch +# For RHEL-72977 - [IBM 9.7 FEAT] 
KVM: Enable virtio-mem support - qemu part +Patch173: kvm-s390x-s390-virtio-hcall-remove-hypercall-registratio.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch174: kvm-s390x-s390-virtio-hcall-prepare-for-more-diag500-hyp.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch175: kvm-s390x-rename-s390-virtio-hcall-to-s390-hypercall.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch176: kvm-s390x-s390-virtio-ccw-move-setting-the-maximum-guest.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch177: kvm-s390x-introduce-s390_get_memory_limit.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch178: kvm-s390x-s390-hypercall-introduce-DIAG500-STORAGE_LIMIT.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch179: kvm-s390x-s390-stattrib-kvm-prepare-for-memory-devices-a.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch180: kvm-s390x-s390-skeys-prepare-for-memory-devices.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch181: kvm-s390x-s390-virtio-ccw-prepare-for-memory-devices.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch182: kvm-s390x-pv-prepare-for-memory-devices.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch183: kvm-s390x-remember-the-maximum-page-size.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch184: kvm-s390x-virtio-ccw-add-support-for-virtio-based-memory.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch185: kvm-s390x-virtio-mem-support.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch186: kvm-hw-virtio-Also-include-md-stubs-in-case-CONFIG_VIRTI.patch 
+# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch187: kvm-virtio-mem-don-t-warn-about-THP-sizes-on-a-kernel-wi.patch +# For RHEL-72977 - [IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part +Patch188: kvm-redhat-Enable-virtio-mem-on-s390x.patch +# For RHEL-7130 - [Hyper-V][RHEL9.2] Nested Hyper-V on KVM: L1 Windows VM with BIOS mode fails to boot up when using '-cpu host,hv_passthrough’ flag +Patch189: kvm-target-i386-Fix-conditional-CONFIG_SYNDBG-enablement.patch +# For RHEL-7130 - [Hyper-V][RHEL9.2] Nested Hyper-V on KVM: L1 Windows VM with BIOS mode fails to boot up when using '-cpu host,hv_passthrough’ flag +Patch190: kvm-target-i386-Exclude-hv-syndbg-from-hv-passthrough.patch +# For RHEL-80313 - Unable to migrate VM from RHEL10.0/qemu-kvm-9.6 to RHEL9.6/qemu-kvm-9.6 +Patch191: kvm-virtio-net-disable-USO-for-virt-rhel9.6.patch +# For RHEL-80313 - Unable to migrate VM from RHEL10.0/qemu-kvm-9.6 to RHEL9.6/qemu-kvm-9.6 +Patch192: kvm-arm-Use-arm_virt_compat_set-to-apply-the-compat.patch +# For RHEL-86032 - QEMU sends unaligned discards on 4K devices [RHEL-9.7] +Patch193: kvm-file-posix-probe-discard-alignment-on-Linux-block-de.patch +# For RHEL-86032 - QEMU sends unaligned discards on 4K devices [RHEL-9.7] +Patch194: kvm-block-io-skip-head-tail-requests-on-EINVAL.patch +# For RHEL-86032 - QEMU sends unaligned discards on 4K devices [RHEL-9.7] +Patch195: kvm-file-posix-Fix-crash-on-discard_granularity-0.patch +# For RHEL-88153 - [s390x] valgrind not working with qemu-kvm for non-x86 builds +Patch196: kvm-meson-configure-add-valgrind-option-en-dis-able-valg.patch +# For RHEL-88153 - [s390x] valgrind not working with qemu-kvm for non-x86 builds +Patch197: kvm-hw-i386-Fix-machine-type-compatibility.patch +# For RHEL-88533 - Improve VFIO mmapping performance with huge pfnmaps +Patch198: kvm-vfio-helpers-Refactor-vfio_region_mmap-error-handlin.patch +# For RHEL-88533 - Improve VFIO mmapping performance with huge pfnmaps 
+Patch199: kvm-vfio-helpers-Align-mmaps.patch +# For RHEL-85159 - Video stuck about 1 min after switchover phase when play one video during postcopy-preempt migration +Patch200: kvm-migration-postcopy-Spatial-locality-page-hint-for-pr.patch +# For RHEL-95120 - Allow libvirt to restart passt/vhost-user when the process is killed [rhel-9.7] +Patch201: kvm-net-vhost-user-add-QAPI-events-to-report-connection-.patch +# For RHEL-95408 - Support multipath failover with scsi-block [rhel-9] +Patch202: kvm-file-posix-Define-DM_MPATH_PROBE_PATHS.patch +# For RHEL-95408 - Support multipath failover with scsi-block [rhel-9] +Patch203: kvm-file-posix-Probe-paths-and-retry-SG_IO-on-potential-.patch +# For RHEL-11430 - [IBM 9.7 FEAT] KVM: Performance Enhanced Refresh PCI Translation - qemu part +Patch204: kvm-s390x-pci-add-support-for-guests-that-request-direct.patch +# For RHEL-11430 - [IBM 9.7 FEAT] KVM: Performance Enhanced Refresh PCI Translation - qemu part +Patch205: kvm-s390x-pci-indicate-QEMU-supports-relaxed-translation.patch +# For RHEL-82906 - --migrate-disks-detect-zeroes doesn't take effect for disk migration [rhel-9.7] +# For RHEL-83015 - Disk size of target raw image is full allocated when doing mirror with default discard value [rhel-9.7] +Patch206: kvm-block-Expand-block-status-mode-from-bool-to-flags.patch +# For RHEL-82906 - --migrate-disks-detect-zeroes doesn't take effect for disk migration [rhel-9.7] +# For RHEL-83015 - Disk size of target raw image is full allocated when doing mirror with default discard value [rhel-9.7] +Patch207: kvm-file-posix-gluster-Handle-zero-block-status-hint-bet.patch +# For RHEL-82906 - --migrate-disks-detect-zeroes doesn't take effect for disk migration [rhel-9.7] +# For RHEL-83015 - Disk size of target raw image is full allocated when doing mirror with default discard value [rhel-9.7] +Patch208: kvm-block-Let-bdrv_co_is_zero_fast-consolidate-adjacent-.patch +# For RHEL-82906 - --migrate-disks-detect-zeroes doesn't take effect for 
disk migration [rhel-9.7] +# For RHEL-83015 - Disk size of target raw image is full allocated when doing mirror with default discard value [rhel-9.7] +Patch209: kvm-block-Add-new-bdrv_co_is_all_zeroes-function.patch +# For RHEL-82906 - --migrate-disks-detect-zeroes doesn't take effect for disk migration [rhel-9.7] +# For RHEL-83015 - Disk size of target raw image is full allocated when doing mirror with default discard value [rhel-9.7] +Patch210: kvm-iotests-Improve-iotest-194-to-mirror-data.patch +# For RHEL-82906 - --migrate-disks-detect-zeroes doesn't take effect for disk migration [rhel-9.7] +# For RHEL-83015 - Disk size of target raw image is full allocated when doing mirror with default discard value [rhel-9.7] +Patch211: kvm-mirror-Minor-refactoring.patch +# For RHEL-82906 - --migrate-disks-detect-zeroes doesn't take effect for disk migration [rhel-9.7] +# For RHEL-83015 - Disk size of target raw image is full allocated when doing mirror with default discard value [rhel-9.7] +Patch212: kvm-mirror-Pass-full-sync-mode-rather-than-bool-to-inter.patch +# For RHEL-82906 - --migrate-disks-detect-zeroes doesn't take effect for disk migration [rhel-9.7] +# For RHEL-83015 - Disk size of target raw image is full allocated when doing mirror with default discard value [rhel-9.7] +Patch213: kvm-mirror-Allow-QMP-override-to-declare-target-already-.patch +# For RHEL-82906 - --migrate-disks-detect-zeroes doesn't take effect for disk migration [rhel-9.7] +# For RHEL-83015 - Disk size of target raw image is full allocated when doing mirror with default discard value [rhel-9.7] +Patch214: kvm-mirror-Drop-redundant-zero_target-parameter.patch +# For RHEL-82906 - --migrate-disks-detect-zeroes doesn't take effect for disk migration [rhel-9.7] +# For RHEL-83015 - Disk size of target raw image is full allocated when doing mirror with default discard value [rhel-9.7] +Patch215: kvm-mirror-Skip-pre-zeroing-destination-if-it-is-already.patch +# For RHEL-82906 - 
--migrate-disks-detect-zeroes doesn't take effect for disk migration [rhel-9.7] +# For RHEL-83015 - Disk size of target raw image is full allocated when doing mirror with default discard value [rhel-9.7] +Patch216: kvm-mirror-Skip-writing-zeroes-when-target-is-already-ze.patch +# For RHEL-82906 - --migrate-disks-detect-zeroes doesn't take effect for disk migration [rhel-9.7] +# For RHEL-83015 - Disk size of target raw image is full allocated when doing mirror with default discard value [rhel-9.7] +Patch217: kvm-iotests-common.rc-add-disk_usage-function.patch +# For RHEL-82906 - --migrate-disks-detect-zeroes doesn't take effect for disk migration [rhel-9.7] +# For RHEL-83015 - Disk size of target raw image is full allocated when doing mirror with default discard value [rhel-9.7] +Patch218: kvm-tests-Add-iotest-mirror-sparse-for-recent-patches.patch +# For RHEL-82906 - --migrate-disks-detect-zeroes doesn't take effect for disk migration [rhel-9.7] +# For RHEL-83015 - Disk size of target raw image is full allocated when doing mirror with default discard value [rhel-9.7] +Patch219: kvm-mirror-Reduce-I-O-when-destination-is-detect-zeroes-.patch +# For RHEL-98554 - [s390x][RHEL9.7.0][virtio_block] there would be memory leak with virtio_blk disks +Patch220: kvm-s390x-Fix-leak-in-machine_set_loadparm.patch +# For RHEL-98554 - [s390x][RHEL9.7.0][virtio_block] there would be memory leak with virtio_blk disks +Patch221: kvm-hw-s390x-ccw-device-Fix-memory-leak-in-loadparm-sett.patch +# For RHEL-66202 - [AMDSERVER 9.6 Feature] qemu: Interrupt Remap support for emulated amd viommu +Patch222: kvm-amd_iommu-Rename-variable-mmio-to-mr_mmio.patch +# For RHEL-66202 - [AMDSERVER 9.6 Feature] qemu: Interrupt Remap support for emulated amd viommu +Patch223: kvm-amd_iommu-Add-support-for-pass-though-mode.patch +# For RHEL-66202 - [AMDSERVER 9.6 Feature] qemu: Interrupt Remap support for emulated amd viommu +Patch224: kvm-amd_iommu-Use-shared-memory-region-for-Interrupt-Rem.patch +# For 
RHEL-66202 - [AMDSERVER 9.6 Feature] qemu: Interrupt Remap support for emulated amd viommu +Patch225: kvm-amd_iommu-Send-notification-when-invalidate-interrup.patch +# For RHEL-66202 - [AMDSERVER 9.6 Feature] qemu: Interrupt Remap support for emulated amd viommu +Patch226: kvm-amd_iommu-Check-APIC-ID-255-for-XTSup.patch +# For RHEL-67104 - postcopy on the destination host can't switch into pause status under the network issue if boot VM with '-S' +Patch227: kvm-io-Fix-partial-struct-copy-in-qio_dns_resolver_looku.patch +# For RHEL-67104 - postcopy on the destination host can't switch into pause status under the network issue if boot VM with '-S' +Patch228: kvm-util-qemu-sockets-Refactor-setting-client-sockopts-i.patch +# For RHEL-67104 - postcopy on the destination host can't switch into pause status under the network issue if boot VM with '-S' +Patch229: kvm-util-qemu-sockets-Refactor-success-and-failure-paths.patch +# For RHEL-67104 - postcopy on the destination host can't switch into pause status under the network issue if boot VM with '-S' +Patch230: kvm-util-qemu-sockets-Add-support-for-keep-alive-flag-to.patch +# For RHEL-67104 - postcopy on the destination host can't switch into pause status under the network issue if boot VM with '-S' +Patch231: kvm-util-qemu-sockets-Refactor-inet_parse-to-use-QemuOpt.patch +# For RHEL-67104 - postcopy on the destination host can't switch into pause status under the network issue if boot VM with '-S' +Patch232: kvm-util-qemu-sockets-Introduce-inet-socket-options-cont.patch +# For RHEL-67104 - postcopy on the destination host can't switch into pause status under the network issue if boot VM with '-S' +Patch233: kvm-tests-unit-test-util-sockets-fix-mem-leak-on-error-o.patch +# For RHEL-52649 - [AMDSERVER 9.6 Feature] Turin: Qemu EPYC-Turin Model +Patch234: kvm-target-i386-Expose-bits-related-to-SRSO-vulnerabilit.patch +# For RHEL-52649 - [AMDSERVER 9.6 Feature] Turin: Qemu EPYC-Turin Model +Patch235: 
kvm-target-i386-Add-PerfMonV2-feature-bit.patch +# For RHEL-52649 - [AMDSERVER 9.6 Feature] Turin: Qemu EPYC-Turin Model +Patch236: kvm-target-i386-Update-EPYC-CPU-model-for-Cache-property.patch +# For RHEL-52649 - [AMDSERVER 9.6 Feature] Turin: Qemu EPYC-Turin Model +Patch237: kvm-target-i386-Update-EPYC-Rome-CPU-model-for-Cache-pro.patch +# For RHEL-52649 - [AMDSERVER 9.6 Feature] Turin: Qemu EPYC-Turin Model +Patch238: kvm-target-i386-Update-EPYC-Milan-CPU-model-for-Cache-pr.patch +# For RHEL-52649 - [AMDSERVER 9.6 Feature] Turin: Qemu EPYC-Turin Model +Patch239: kvm-target-i386-Add-couple-of-feature-bits-in-CPUID_Fn80.patch +# For RHEL-52649 - [AMDSERVER 9.6 Feature] Turin: Qemu EPYC-Turin Model +Patch240: kvm-target-i386-Update-EPYC-Genoa-for-Cache-property-per.patch +# For RHEL-52649 - [AMDSERVER 9.6 Feature] Turin: Qemu EPYC-Turin Model +Patch241: kvm-target-i386-Add-support-for-EPYC-Turin-model.patch +# For RHEL-70926 - Qemu/amd-iommu: Advertise a suitable device id +Patch242: kvm-hw-i386-amd_iommu-Assign-pci-id-0x1419-for-the-AMD-I.patch +# For RHEL-70925 - Qemu/amd-iommu: Add ability to manually specify the AMDVI-PCI device +Patch243: kvm-hw-i386-amd_iommu-Isolate-AMDVI-PCI-from-amd-iommu-d.patch +# For RHEL-70925 - Qemu/amd-iommu: Add ability to manually specify the AMDVI-PCI device +Patch244: kvm-hw-i386-amd_iommu-Allow-migration-when-explicitly-cr.patch +# For RHEL-70925 - Qemu/amd-iommu: Add ability to manually specify the AMDVI-PCI device +Patch245: kvm-Enable-amd-iommu-device.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch246: kvm-include-qemu-compiler-add-QEMU_UNINITIALIZED-attribu.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch247: kvm-hw-virtio-virtio-avoid-cost-of-ftrivial-auto-var-ini.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch248: kvm-block-skip-automatic-zero-init-of-large-array-in-ioq.patch +# For RHEL-99888 - 
-ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch249: kvm-chardev-char-fd-skip-automatic-zero-init-of-large-ar.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch250: kvm-chardev-char-pty-skip-automatic-zero-init-of-large-a.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch251: kvm-chardev-char-socket-skip-automatic-zero-init-of-larg.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch252: kvm-hw-audio-ac97-skip-automatic-zero-init-of-large-arra.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch253: kvm-hw-audio-cs4231a-skip-automatic-zero-init-of-large-a.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch254: kvm-hw-audio-es1370-skip-automatic-zero-init-of-large-ar.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch255: kvm-hw-audio-gus-skip-automatic-zero-init-of-large-array.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch256: kvm-hw-audio-marvell_88w8618-skip-automatic-zero-init-of.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch257: kvm-hw-audio-sb16-skip-automatic-zero-init-of-large-arra.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch258: kvm-hw-audio-via-ac97-skip-automatic-zero-init-of-large-.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch259: kvm-hw-char-sclpconsole-lm-skip-automatic-zero-init-of-l.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch260: kvm-hw-dma-xlnx_csu_dma-skip-automatic-zero-init-of-larg.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch261: kvm-hw-display-vmware_vga-skip-automatic-zero-init-of-la.patch +# For RHEL-99888 - 
-ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch262: kvm-hw-hyperv-syndbg-skip-automatic-zero-init-of-large-a.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch263: kvm-hw-misc-aspeed_hace-skip-automatic-zero-init-of-larg.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch264: kvm-hw-net-rtl8139-skip-automatic-zero-init-of-large-arr.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch265: kvm-hw-net-tulip-skip-automatic-zero-init-of-large-array.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch266: kvm-hw-net-virtio-net-skip-automatic-zero-init-of-large-.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch267: kvm-hw-net-xgamc-skip-automatic-zero-init-of-large-array.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch268: kvm-hw-nvme-ctrl-skip-automatic-zero-init-of-large-array.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch269: kvm-hw-ppc-spapr_tpm_proxy-skip-automatic-zero-init-of-l.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch270: kvm-hw-usb-hcd-ohci-skip-automatic-zero-init-of-large-ar.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch271: kvm-hw-scsi-lsi53c895a-skip-automatic-zero-init-of-large.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch272: kvm-hw-scsi-megasas-skip-automatic-zero-init-of-large-ar.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch273: kvm-hw-ufs-lu-skip-automatic-zero-init-of-large-array.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero reduced performance [rhel-9] +Patch274: kvm-net-socket-skip-automatic-zero-init-of-large-array.patch +# For RHEL-99888 - -ftrivial-auto-var-init=zero 
reduced performance [rhel-9] +Patch275: kvm-net-stream-skip-automatic-zero-init-of-large-array.patch +# For RHEL-100741 - Video stuck after switchover phase when play one video during migration [rhel-9] +Patch276: kvm-ui-vnc-Update-display-update-interval-when-VM-state-.patch +# For RHEL-108726 - Openstack guest becomes inaccessible via network when storage network on the hypervisor is disabled/lost [rhel-9] +Patch277: kvm-rbd-Fix-.bdrv_get_specific_info-implementation.patch %if %{have_clang} BuildRequires: clang @@ -542,6 +814,9 @@ BuildRequires: pulseaudio-libs-devel BuildRequires: spice-protocol BuildRequires: capstone-devel BuildRequires: python3-tomli +%ifarch %{valgrind_arches} +BuildRequires: valgrind-devel +%endif # Requires for qemu-kvm package Requires: %{name}-core = %{epoch}:%{version}-%{release} @@ -621,6 +896,8 @@ This package provides documentation and auxiliary programs used with %{name}. %package tools Summary: %{name} support tools +Recommends: systemtap-client +Recommends: systemtap-devel %description tools %{name}-tools provides various tools related to %{name} usage. 
@@ -916,6 +1193,7 @@ ulimit -n 10240 --disable-u2f \\\ --disable-usb-redir \\\ --disable-user \\\ + --disable-valgrind \\\ --disable-vde \\\ --disable-vdi \\\ --disable-vduse-blk-export \\\ @@ -1038,6 +1316,9 @@ run_configure \ --enable-tpm \ %if %{have_usbredir} --enable-usb-redir \ +%endif +%ifarch %{valgrind_arches} + --enable-valgrind \ %endif --enable-vdi \ --enable-vhost-kernel \ @@ -1531,6 +1812,221 @@ useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \ %endif %changelog +* Wed Aug 20 2025 Jon Maloy - 9.1.0-26 +- kvm-rbd-Fix-.bdrv_get_specific_info-implementation.patch [RHEL-108726] +- Resolves: RHEL-108726 + (Openstack guest becomes inaccessible via network when storage network on the hypervisor is disabled/lost [rhel-9]) + +* Tue Jul 08 2025 Miroslav Rezanina - 9.1.0-25 +- kvm-s390x-Fix-leak-in-machine_set_loadparm.patch [RHEL-98554] +- kvm-hw-s390x-ccw-device-Fix-memory-leak-in-loadparm-sett.patch [RHEL-98554] +- kvm-amd_iommu-Rename-variable-mmio-to-mr_mmio.patch [RHEL-66202] +- kvm-amd_iommu-Add-support-for-pass-though-mode.patch [RHEL-66202] +- kvm-amd_iommu-Use-shared-memory-region-for-Interrupt-Rem.patch [RHEL-66202] +- kvm-amd_iommu-Send-notification-when-invalidate-interrup.patch [RHEL-66202] +- kvm-amd_iommu-Check-APIC-ID-255-for-XTSup.patch [RHEL-66202] +- kvm-io-Fix-partial-struct-copy-in-qio_dns_resolver_looku.patch [RHEL-67104] +- kvm-util-qemu-sockets-Refactor-setting-client-sockopts-i.patch [RHEL-67104] +- kvm-util-qemu-sockets-Refactor-success-and-failure-paths.patch [RHEL-67104] +- kvm-util-qemu-sockets-Add-support-for-keep-alive-flag-to.patch [RHEL-67104] +- kvm-util-qemu-sockets-Refactor-inet_parse-to-use-QemuOpt.patch [RHEL-67104] +- kvm-util-qemu-sockets-Introduce-inet-socket-options-cont.patch [RHEL-67104] +- kvm-tests-unit-test-util-sockets-fix-mem-leak-on-error-o.patch [RHEL-67104] +- kvm-target-i386-Expose-bits-related-to-SRSO-vulnerabilit.patch [RHEL-52649] +- kvm-target-i386-Add-PerfMonV2-feature-bit.patch [RHEL-52649] +- 
kvm-target-i386-Update-EPYC-CPU-model-for-Cache-property.patch [RHEL-52649] +- kvm-target-i386-Update-EPYC-Rome-CPU-model-for-Cache-pro.patch [RHEL-52649] +- kvm-target-i386-Update-EPYC-Milan-CPU-model-for-Cache-pr.patch [RHEL-52649] +- kvm-target-i386-Add-couple-of-feature-bits-in-CPUID_Fn80.patch [RHEL-52649] +- kvm-target-i386-Update-EPYC-Genoa-for-Cache-property-per.patch [RHEL-52649] +- kvm-target-i386-Add-support-for-EPYC-Turin-model.patch [RHEL-52649] +- kvm-hw-i386-amd_iommu-Assign-pci-id-0x1419-for-the-AMD-I.patch [RHEL-70926] +- kvm-hw-i386-amd_iommu-Isolate-AMDVI-PCI-from-amd-iommu-d.patch [RHEL-70925] +- kvm-hw-i386-amd_iommu-Allow-migration-when-explicitly-cr.patch [RHEL-70925] +- kvm-Enable-amd-iommu-device.patch [RHEL-70925] +- kvm-include-qemu-compiler-add-QEMU_UNINITIALIZED-attribu.patch [RHEL-99888] +- kvm-hw-virtio-virtio-avoid-cost-of-ftrivial-auto-var-ini.patch [RHEL-99888] +- kvm-block-skip-automatic-zero-init-of-large-array-in-ioq.patch [RHEL-99888] +- kvm-chardev-char-fd-skip-automatic-zero-init-of-large-ar.patch [RHEL-99888] +- kvm-chardev-char-pty-skip-automatic-zero-init-of-large-a.patch [RHEL-99888] +- kvm-chardev-char-socket-skip-automatic-zero-init-of-larg.patch [RHEL-99888] +- kvm-hw-audio-ac97-skip-automatic-zero-init-of-large-arra.patch [RHEL-99888] +- kvm-hw-audio-cs4231a-skip-automatic-zero-init-of-large-a.patch [RHEL-99888] +- kvm-hw-audio-es1370-skip-automatic-zero-init-of-large-ar.patch [RHEL-99888] +- kvm-hw-audio-gus-skip-automatic-zero-init-of-large-array.patch [RHEL-99888] +- kvm-hw-audio-marvell_88w8618-skip-automatic-zero-init-of.patch [RHEL-99888] +- kvm-hw-audio-sb16-skip-automatic-zero-init-of-large-arra.patch [RHEL-99888] +- kvm-hw-audio-via-ac97-skip-automatic-zero-init-of-large-.patch [RHEL-99888] +- kvm-hw-char-sclpconsole-lm-skip-automatic-zero-init-of-l.patch [RHEL-99888] +- kvm-hw-dma-xlnx_csu_dma-skip-automatic-zero-init-of-larg.patch [RHEL-99888] +- 
kvm-hw-display-vmware_vga-skip-automatic-zero-init-of-la.patch [RHEL-99888] +- kvm-hw-hyperv-syndbg-skip-automatic-zero-init-of-large-a.patch [RHEL-99888] +- kvm-hw-misc-aspeed_hace-skip-automatic-zero-init-of-larg.patch [RHEL-99888] +- kvm-hw-net-rtl8139-skip-automatic-zero-init-of-large-arr.patch [RHEL-99888] +- kvm-hw-net-tulip-skip-automatic-zero-init-of-large-array.patch [RHEL-99888] +- kvm-hw-net-virtio-net-skip-automatic-zero-init-of-large-.patch [RHEL-99888] +- kvm-hw-net-xgamc-skip-automatic-zero-init-of-large-array.patch [RHEL-99888] +- kvm-hw-nvme-ctrl-skip-automatic-zero-init-of-large-array.patch [RHEL-99888] +- kvm-hw-ppc-spapr_tpm_proxy-skip-automatic-zero-init-of-l.patch [RHEL-99888] +- kvm-hw-usb-hcd-ohci-skip-automatic-zero-init-of-large-ar.patch [RHEL-99888] +- kvm-hw-scsi-lsi53c895a-skip-automatic-zero-init-of-large.patch [RHEL-99888] +- kvm-hw-scsi-megasas-skip-automatic-zero-init-of-large-ar.patch [RHEL-99888] +- kvm-hw-ufs-lu-skip-automatic-zero-init-of-large-array.patch [RHEL-99888] +- kvm-net-socket-skip-automatic-zero-init-of-large-array.patch [RHEL-99888] +- kvm-net-stream-skip-automatic-zero-init-of-large-array.patch [RHEL-99888] +- kvm-ui-vnc-Update-display-update-interval-when-VM-state-.patch [RHEL-100741] +- Resolves: RHEL-98554 + ([s390x][RHEL9.7.0][virtio_block] there would be memory leak with virtio_blk disks) +- Resolves: RHEL-66202 + ([AMDSERVER 9.6 Feature] qemu: Interrupt Remap support for emulated amd viommu) +- Resolves: RHEL-67104 + (postcopy on the destination host can't switch into pause status under the network issue if boot VM with '-S') +- Resolves: RHEL-52649 + ([AMDSERVER 9.6 Feature] Turin: Qemu EPYC-Turin Model) +- Resolves: RHEL-70926 + (Qemu/amd-iommu: Advertise a suitable device id) +- Resolves: RHEL-70925 + (Qemu/amd-iommu: Add ability to manually specify the AMDVI-PCI device) +- Resolves: RHEL-99888 + (-ftrivial-auto-var-init=zero reduced performance [rhel-9]) +- Resolves: RHEL-100741 + (Video stuck after 
switchover phase when play one video during migration [rhel-9]) + +* Mon Jun 16 2025 Jon Maloy - 9.1.0-24 +- kvm-s390x-pci-add-support-for-guests-that-request-direct.patch [RHEL-11430] +- kvm-s390x-pci-indicate-QEMU-supports-relaxed-translation.patch [RHEL-11430] +- kvm-block-Expand-block-status-mode-from-bool-to-flags.patch [RHEL-82906 RHEL-83015] +- kvm-file-posix-gluster-Handle-zero-block-status-hint-bet.patch [RHEL-82906 RHEL-83015] +- kvm-block-Let-bdrv_co_is_zero_fast-consolidate-adjacent-.patch [RHEL-82906 RHEL-83015] +- kvm-block-Add-new-bdrv_co_is_all_zeroes-function.patch [RHEL-82906 RHEL-83015] +- kvm-iotests-Improve-iotest-194-to-mirror-data.patch [RHEL-82906 RHEL-83015] +- kvm-mirror-Minor-refactoring.patch [RHEL-82906 RHEL-83015] +- kvm-mirror-Pass-full-sync-mode-rather-than-bool-to-inter.patch [RHEL-82906 RHEL-83015] +- kvm-mirror-Allow-QMP-override-to-declare-target-already-.patch [RHEL-82906 RHEL-83015] +- kvm-mirror-Drop-redundant-zero_target-parameter.patch [RHEL-82906 RHEL-83015] +- kvm-mirror-Skip-pre-zeroing-destination-if-it-is-already.patch [RHEL-82906 RHEL-83015] +- kvm-mirror-Skip-writing-zeroes-when-target-is-already-ze.patch [RHEL-82906 RHEL-83015] +- kvm-iotests-common.rc-add-disk_usage-function.patch [RHEL-82906 RHEL-83015] +- kvm-tests-Add-iotest-mirror-sparse-for-recent-patches.patch [RHEL-82906 RHEL-83015] +- kvm-mirror-Reduce-I-O-when-destination-is-detect-zeroes-.patch [RHEL-82906 RHEL-83015] +- Resolves: RHEL-11430 + ([IBM 9.7 FEAT] KVM: Performance Enhanced Refresh PCI Translation - qemu part) +- Resolves: RHEL-82906 + (--migrate-disks-detect-zeroes doesn't take effect for disk migration [rhel-9.7]) +- Resolves: RHEL-83015 + (Disk size of target raw image is full allocated when doing mirror with default discard value [rhel-9.7]) + +* Mon Jun 09 2025 Jon Maloy - 9.1.0-23 +- kvm-net-vhost-user-add-QAPI-events-to-report-connection-.patch [RHEL-95120] +- kvm-file-posix-Define-DM_MPATH_PROBE_PATHS.patch [RHEL-95408] +- 
kvm-file-posix-Probe-paths-and-retry-SG_IO-on-potential-.patch [RHEL-95408] +- Resolves: RHEL-95120 + (Allow libvirt to restart passt/vhost-user when the process is killed [rhel-9.7]) +- Resolves: RHEL-95408 + (Support multipath failover with scsi-block [rhel-9]) + +* Mon Jun 02 2025 Jon Maloy - 9.1.0-22 +- kvm-migration-postcopy-Spatial-locality-page-hint-for-pr.patch [RHEL-85159] +- kvm-Allow-guest-network-get-route-guest-get-load-QGA-com.patch [RHEL-91605 RHEL-91606] +- Resolves: RHEL-85159 + (Video stuck about 1 min after switchover phase when play one video during postcopy-preempt migration) +- Resolves: RHEL-91605 + ([qemu-guest-agent] Add new api 'guest-network-get-route' to allow-rpc [RHEL-9]) +- Resolves: RHEL-91606 + ([qemu-guest-agent] Enable 'guest-get-load' by default [RHEL-9]) + +* Mon May 26 2025 Jon Maloy - 9.1.0-21 +- kvm-meson-configure-add-valgrind-option-en-dis-able-valg.patch [RHEL-88153] +- kvm-distro-add-an-explicit-valgrind-devel-build-dep.patch [RHEL-88153] +- kvm-hw-i386-Fix-machine-type-compatibility.patch [RHEL-91307] +- kvm-vfio-helpers-Refactor-vfio_region_mmap-error-handlin.patch [RHEL-88533] +- kvm-vfio-helpers-Align-mmaps.patch [RHEL-88533] +- Resolves: RHEL-88153 + ([s390x] valgrind not working with qemu-kvm for non-x86 builds) +- Resolves: RHEL-91307 + (Fix x86 M-type compats) +- Resolves: RHEL-88533 + (Improve VFIO mmapping performance with huge pfnmaps) + +* Tue May 13 2025 Jon Maloy - 9.1.0-20 +- kvm-virtio-net-disable-USO-for-virt-rhel9.6.patch [RHEL-80313] +- kvm-arm-Use-arm_virt_compat_set-to-apply-the-compat.patch [RHEL-80313] +- kvm-file-posix-probe-discard-alignment-on-Linux-block-de.patch [RHEL-86032] +- kvm-block-io-skip-head-tail-requests-on-EINVAL.patch [RHEL-86032] +- kvm-file-posix-Fix-crash-on-discard_granularity-0.patch [RHEL-86032] +- Resolves: RHEL-80313 + (Unable to migrate VM from RHEL10.0/qemu-kvm-9.6 to RHEL9.6/qemu-kvm-9.6) +- Resolves: RHEL-86032 + (QEMU sends unaligned discards on 4K devices [RHEL-9.7]) + 
+* Mon Apr 28 2025 Jon Maloy - 9.1.0-19 +- kvm-target-i386-Fix-conditional-CONFIG_SYNDBG-enablement.patch [RHEL-7130] +- kvm-target-i386-Exclude-hv-syndbg-from-hv-passthrough.patch [RHEL-7130] +- Resolves: RHEL-7130 + ([Hyper-V][RHEL9.2] Nested Hyper-V on KVM: L1 Windows VM with BIOS mode fails to boot up when using '-cpu host,hv_passthrough’ flag) + +* Mon Apr 14 2025 Jon Maloy - 9.1.0-18 +- kvm-virtio-kconfig-memory-devices-are-PCI-only.patch [RHEL-72977] +- kvm-hw-s390-ccw-device-Convert-to-three-phase-reset.patch [RHEL-72977] +- kvm-hw-s390-virtio-ccw-Convert-to-three-phase-reset.patch [RHEL-72977] +- kvm-target-s390-Convert-CPU-to-Resettable-interface.patch [RHEL-72977] +- kvm-reset-Use-ResetType-for-qemu_devices_reset-and-Machi.patch [RHEL-72977] +- kvm-reset-Add-RESET_TYPE_WAKEUP.patch [RHEL-72977] +- kvm-virtio-mem-Use-new-Resettable-framework-instead-of-L.patch [RHEL-72977] +- kvm-virtio-mem-Add-support-for-suspend-wake-up-with-plug.patch [RHEL-72977] +- kvm-virtio-mem-unplug-memory-only-during-system-resets-n.patch [RHEL-72977] +- kvm-s390x-s390-virtio-ccw-don-t-crash-on-weird-RAM-sizes.patch [RHEL-72977] +- kvm-s390x-s390-virtio-hcall-remove-hypercall-registratio.patch [RHEL-72977] +- kvm-s390x-s390-virtio-hcall-prepare-for-more-diag500-hyp.patch [RHEL-72977] +- kvm-s390x-rename-s390-virtio-hcall-to-s390-hypercall.patch [RHEL-72977] +- kvm-s390x-s390-virtio-ccw-move-setting-the-maximum-guest.patch [RHEL-72977] +- kvm-s390x-introduce-s390_get_memory_limit.patch [RHEL-72977] +- kvm-s390x-s390-hypercall-introduce-DIAG500-STORAGE_LIMIT.patch [RHEL-72977] +- kvm-s390x-s390-stattrib-kvm-prepare-for-memory-devices-a.patch [RHEL-72977] +- kvm-s390x-s390-skeys-prepare-for-memory-devices.patch [RHEL-72977] +- kvm-s390x-s390-virtio-ccw-prepare-for-memory-devices.patch [RHEL-72977] +- kvm-s390x-pv-prepare-for-memory-devices.patch [RHEL-72977] +- kvm-s390x-remember-the-maximum-page-size.patch [RHEL-72977] +- 
kvm-s390x-virtio-ccw-add-support-for-virtio-based-memory.patch [RHEL-72977] +- kvm-s390x-virtio-mem-support.patch [RHEL-72977] +- kvm-hw-virtio-Also-include-md-stubs-in-case-CONFIG_VIRTI.patch [RHEL-72977] +- kvm-virtio-mem-don-t-warn-about-THP-sizes-on-a-kernel-wi.patch [RHEL-72977] +- kvm-redhat-Enable-virtio-mem-on-s390x.patch [RHEL-72977] +- Resolves: RHEL-72977 + ([IBM 9.7 FEAT] KVM: Enable virtio-mem support - qemu part) + +* Mon Mar 31 2025 Jon Maloy - 9.1.0-17 +- kvm-hw-pci-Rename-has_power-to-enabled.patch [RHEL-7301] +- kvm-hw-pci-Basic-support-for-PCI-power-management.patch [RHEL-7301] +- kvm-pci-Use-PCI-PM-capability-initializer.patch [RHEL-7301] +- kvm-vfio-pci-Delete-local-pm_cap.patch [RHEL-7301] +- kvm-pcie-virtio-Remove-redundant-pm_cap.patch [RHEL-7301] +- kvm-hw-vfio-pci-Re-order-pre-reset.patch [RHEL-7301] +- kvm-Also-recommend-systemtap-devel-from-qemu-tools.patch [RHEL-47340] +- Resolves: RHEL-7301 + ([intel iommu] VFIO_MAP_DMA failed: Bad address on system_powerdown) +- Resolves: RHEL-47340 + ([Qemu RHEL-9] qemu-trace-stap should handle lack of stap more gracefully) + +* Thu Mar 20 2025 Jon Maloy - 9.1.0-16 +- kvm-hw-virtio-virtio-iommu-Migrate-to-3-phase-reset.patch [RHEL-7188] +- kvm-hw-i386-intel-iommu-Migrate-to-3-phase-reset.patch [RHEL-7188] +- kvm-hw-arm-smmuv3-Move-reset-to-exit-phase.patch [RHEL-7188] +- kvm-hw-vfio-common-Add-a-trace-point-in-vfio_reset_handl.patch [RHEL-7188] +- kvm-docs-devel-reset-Document-reset-expectations-for-DMA.patch [RHEL-7188] +- kvm-qga-implement-a-guest-get-load-command.patch [RHEL-69622] +- kvm-migration-Fix-UAF-for-incoming-migration-on-Migratio.patch [RHEL-69775] +- kvm-scripts-improve-error-from-qemu-trace-stap-on-missin.patch [RHEL-47340] +- kvm-Recommend-systemtap-client-from-qemu-tools.patch [RHEL-47340] +- Resolves: RHEL-7188 + ([intel iommu][PF] DMAR: DRHD: handling fault status reg) +- Resolves: RHEL-69622 + ([qemu-guest-agent][RFE] Report CPU load average) +- Resolves: RHEL-69775 + (Guest 
crashed on the target host when the migration was canceled) +- Resolves: RHEL-47340 + ([Qemu RHEL-9] qemu-trace-stap should handle lack of stap more gracefully) + * Mon Feb 17 2025 Jon Maloy - 9.1.0-15 - kvm-net-Fix-announce_self.patch [RHEL-73891] - kvm-migration-Add-helper-to-get-target-runstate.patch [RHEL-54296 RHEL-78397]