import CS qemu-kvm-9.1.0-26.el9

This commit is contained in:
eabdullin 2025-09-15 12:37:45 +00:00
parent 8a741c4808
commit ab9af75c93
131 changed files with 16444 additions and 2 deletions

View File

@ -0,0 +1,38 @@
From 0608561efc441f234d9aaf45f1867ffb5c43cffe Mon Sep 17 00:00:00 2001
From: John Allen <john.allen@amd.com>
Date: Wed, 11 Jun 2025 15:41:14 -0500
Subject: [PATCH 26/57] Enable amd-iommu device
RH-Author: John Allen <None>
RH-MergeRequest: 380: Add ability to manually specify the AMDVI-PCI device
RH-Jira: RHEL-70925
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [3/3] 852500a18275e14bcd94d598ccd0ee33b76578dc (johnalle/qemu-kvm-fork)
Now that the amdvi-pci device that amd-iommu creates can be specified
manually, amd-iommu device can be enabled.
JIRA: https://issues.redhat.com/browse/RHEL-70925
Upstream: RHEL ONLY
Signed-off-by: John Allen <johnalle@redhat.com>
---
configs/devices/x86_64-softmmu/x86_64-rh-devices.mak | 1 +
1 file changed, 1 insertion(+)
diff --git a/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak b/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak
index 3e5f693b62..2b15fdc2db 100644
--- a/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak
+++ b/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak
@@ -97,6 +97,7 @@ CONFIG_VIRTIO_MEM=y
CONFIG_VIRTIO_PCI=y
CONFIG_VIRTIO_VGA=y
CONFIG_VIRTIO_IOMMU=y
+CONFIG_AMD_IOMMU=y
CONFIG_VMMOUSE=y
CONFIG_VMPORT=y
CONFIG_VTD=y
--
2.39.3

View File

@ -0,0 +1,141 @@
From 4114553452f7187283aefa001bc8342fc65b6b72 Mon Sep 17 00:00:00 2001
From: John Allen <john.allen@amd.com>
Date: Wed, 11 Dec 2024 15:06:48 -0600
Subject: [PATCH 04/57] amd_iommu: Add support for pass though mode
RH-Author: John Allen <None>
RH-MergeRequest: 303: Interrupt Remap support for emulated amd viommu
RH-Jira: RHEL-66202
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [2/5] 0434fefd554baf27fb9d93026af513c621f8cdb0 (johnalle/qemu-kvm-fork)
JIRA: https://issues.redhat.com/browse/RHEL-66202
commit c1f46999ef506d9854534560a94d02cf3cf9edd1
Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Fri Sep 27 12:29:10 2024 -0500
amd_iommu: Add support for pass though mode
Introduce 'nodma' shared memory region to support PT mode
so that for each device, we only create an alias to shared memory
region when DMA-remapping is disabled.
Reviewed-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
Message-Id: <20240927172913.121477-3-santosh.shukla@amd.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: John Allen <john.allen@amd.com>
---
hw/i386/amd_iommu.c | 49 ++++++++++++++++++++++++++++++++++++---------
hw/i386/amd_iommu.h | 2 ++
2 files changed, 42 insertions(+), 9 deletions(-)
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 148b5ee51d..567cb8adc9 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -60,8 +60,9 @@ struct AMDVIAddressSpace {
uint8_t bus_num; /* bus number */
uint8_t devfn; /* device function */
AMDVIState *iommu_state; /* AMDVI - one per machine */
- MemoryRegion root; /* AMDVI Root memory map region */
+ MemoryRegion root; /* AMDVI Root memory map region */
IOMMUMemoryRegion iommu; /* Device's address translation region */
+ MemoryRegion iommu_nodma; /* Alias of shared nodma memory region */
MemoryRegion iommu_ir; /* Device's interrupt remapping region */
AddressSpace as; /* device's corresponding address space */
};
@@ -1412,6 +1413,7 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
AMDVIState *s = opaque;
AMDVIAddressSpace **iommu_as, *amdvi_dev_as;
int bus_num = pci_bus_num(bus);
+ X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
iommu_as = s->address_spaces[bus_num];
@@ -1436,13 +1438,13 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
* Memory region relationships looks like (Address range shows
* only lower 32 bits to make it short in length...):
*
- * |-----------------+-------------------+----------|
- * | Name | Address range | Priority |
- * |-----------------+-------------------+----------+
- * | amdvi_root | 00000000-ffffffff | 0 |
- * | amdvi_iommu | 00000000-ffffffff | 1 |
- * | amdvi_iommu_ir | fee00000-feefffff | 64 |
- * |-----------------+-------------------+----------|
+ * |--------------------+-------------------+----------|
+ * | Name | Address range | Priority |
+ * |--------------------+-------------------+----------+
+ * | amdvi-root | 00000000-ffffffff | 0 |
+ * | amdvi-iommu_nodma | 00000000-ffffffff | 0 |
+ * | amdvi-iommu_ir | fee00000-feefffff | 64 |
+ * |--------------------+-------------------+----------|
*/
memory_region_init_iommu(&amdvi_dev_as->iommu,
sizeof(amdvi_dev_as->iommu),
@@ -1461,7 +1463,25 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
64);
memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0,
MEMORY_REGION(&amdvi_dev_as->iommu),
- 1);
+ 0);
+
+ /* Build the DMA Disabled alias to shared memory */
+ memory_region_init_alias(&amdvi_dev_as->iommu_nodma, OBJECT(s),
+ "amdvi-sys", &s->mr_sys, 0,
+ memory_region_size(&s->mr_sys));
+ memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0,
+ &amdvi_dev_as->iommu_nodma,
+ 0);
+
+ if (!x86_iommu->pt_supported) {
+ memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, false);
+ memory_region_set_enabled(MEMORY_REGION(&amdvi_dev_as->iommu),
+ true);
+ } else {
+ memory_region_set_enabled(MEMORY_REGION(&amdvi_dev_as->iommu),
+ false);
+ memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, true);
+ }
}
return &iommu_as[devfn]->as;
}
@@ -1602,6 +1622,17 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
"amdvi-mmio", AMDVI_MMIO_SIZE);
memory_region_add_subregion(get_system_memory(), AMDVI_BASE_ADDR,
&s->mr_mmio);
+
+ /* Create the share memory regions by all devices */
+ memory_region_init(&s->mr_sys, OBJECT(s), "amdvi-sys", UINT64_MAX);
+
+ /* set up the DMA disabled memory region */
+ memory_region_init_alias(&s->mr_nodma, OBJECT(s),
+ "amdvi-nodma", get_system_memory(), 0,
+ memory_region_size(get_system_memory()));
+ memory_region_add_subregion_overlap(&s->mr_sys, 0,
+ &s->mr_nodma, 0);
+
pci_setup_iommu(bus, &amdvi_iommu_ops, s);
amdvi_init(s);
}
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
index e5c2ae94f2..be417e51c4 100644
--- a/hw/i386/amd_iommu.h
+++ b/hw/i386/amd_iommu.h
@@ -354,6 +354,8 @@ struct AMDVIState {
uint32_t pprlog_tail; /* ppr log tail */
MemoryRegion mr_mmio; /* MMIO region */
+ MemoryRegion mr_sys;
+ MemoryRegion mr_nodma;
uint8_t mmior[AMDVI_MMIO_SIZE]; /* read/write MMIO */
uint8_t w1cmask[AMDVI_MMIO_SIZE]; /* read/write 1 clear mask */
uint8_t romask[AMDVI_MMIO_SIZE]; /* MMIO read/only mask */
--
2.39.3

View File

@ -0,0 +1,66 @@
From 0397ebacdba6539147d9986255c3f81cbfdabf1e Mon Sep 17 00:00:00 2001
From: John Allen <john.allen@amd.com>
Date: Wed, 11 Dec 2024 15:07:03 -0600
Subject: [PATCH 07/57] amd_iommu: Check APIC ID > 255 for XTSup
RH-Author: John Allen <None>
RH-MergeRequest: 303: Interrupt Remap support for emulated amd viommu
RH-Jira: RHEL-66202
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [5/5] f39b3e3cdefc2b562f1ad2ef939a37bf404f355a (johnalle/qemu-kvm-fork)
JIRA: https://issues.redhat.com/browse/RHEL-66202
commit b12cb3819baf6d9ee8140d4dd6d36fa829e2c6d9
Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Fri Sep 27 12:29:13 2024 -0500
amd_iommu: Check APIC ID > 255 for XTSup
The XTSup mode enables x2APIC support for AMD IOMMU, which is needed
to support vcpu w/ APIC ID > 255.
Reviewed-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
Message-Id: <20240927172913.121477-6-santosh.shukla@amd.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: John Allen <john.allen@amd.com>
---
hw/i386/amd_iommu.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 82d76dfca9..d804656ea8 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -32,6 +32,7 @@
#include "trace.h"
#include "hw/i386/apic-msidef.h"
#include "hw/qdev-properties.h"
+#include "kvm/kvm_i386.h"
/* used AMD-Vi MMIO registers */
const char *amdvi_mmio_low[] = {
@@ -1651,6 +1652,16 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
memory_region_add_subregion_overlap(&s->mr_sys, AMDVI_INT_ADDR_FIRST,
&s->mr_ir, 1);
+ /* AMD IOMMU with x2APIC mode requires xtsup=on */
+ if (x86ms->apic_id_limit > 255 && !s->xtsup) {
+ error_report("AMD IOMMU with x2APIC confguration requires xtsup=on");
+ exit(EXIT_FAILURE);
+ }
+ if (s->xtsup && kvm_irqchip_is_split() && !kvm_enable_x2apic()) {
+ error_report("AMD IOMMU xtsup=on requires support on the KVM side");
+ exit(EXIT_FAILURE);
+ }
+
pci_setup_iommu(bus, &amdvi_iommu_ops, s);
amdvi_init(s);
}
--
2.39.3

View File

@ -0,0 +1,94 @@
From f733325d3d91576ae9f6e341faabc301542fc6c8 Mon Sep 17 00:00:00 2001
From: John Allen <john.allen@amd.com>
Date: Wed, 11 Dec 2024 15:06:44 -0600
Subject: [PATCH 03/57] amd_iommu: Rename variable mmio to mr_mmio
RH-Author: John Allen <None>
RH-MergeRequest: 303: Interrupt Remap support for emulated amd viommu
RH-Jira: RHEL-66202
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [1/5] 1996a48efb7210d4d1e0b929be2d115d672e1a02 (johnalle/qemu-kvm-fork)
JIRA: https://issues.redhat.com/browse/RHEL-66202
commit 2e6f051cfc58e69dcb392cd245d8f01b0c2e963f
Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Fri Sep 27 12:29:09 2024 -0500
amd_iommu: Rename variable mmio to mr_mmio
Rename the MMIO memory region variable 'mmio' to 'mr_mmio'
so to correctly name align with struct AMDVIState::variable type.
No functional change intended.
Reviewed-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
Message-Id: <20240927172913.121477-2-santosh.shukla@amd.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: John Allen <john.allen@amd.com>
---
hw/i386/acpi-build.c | 4 ++--
hw/i386/amd_iommu.c | 6 +++---
hw/i386/amd_iommu.h | 2 +-
3 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 5d4bd2b710..032fb1f904 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2397,7 +2397,7 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id,
/* Capability offset */
build_append_int_noprefix(table_data, s->pci.capab_offset, 2);
/* IOMMU base address */
- build_append_int_noprefix(table_data, s->mmio.addr, 8);
+ build_append_int_noprefix(table_data, s->mr_mmio.addr, 8);
/* PCI Segment Group */
build_append_int_noprefix(table_data, 0, 2);
/* IOMMU info */
@@ -2432,7 +2432,7 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id,
/* Capability offset */
build_append_int_noprefix(table_data, s->pci.capab_offset, 2);
/* IOMMU base address */
- build_append_int_noprefix(table_data, s->mmio.addr, 8);
+ build_append_int_noprefix(table_data, s->mr_mmio.addr, 8);
/* PCI Segment Group */
build_append_int_noprefix(table_data, 0, 2);
/* IOMMU info */
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 87643d2891..148b5ee51d 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -1598,10 +1598,10 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID);
/* set up MMIO */
- memory_region_init_io(&s->mmio, OBJECT(s), &mmio_mem_ops, s, "amdvi-mmio",
- AMDVI_MMIO_SIZE);
+ memory_region_init_io(&s->mr_mmio, OBJECT(s), &mmio_mem_ops, s,
+ "amdvi-mmio", AMDVI_MMIO_SIZE);
memory_region_add_subregion(get_system_memory(), AMDVI_BASE_ADDR,
- &s->mmio);
+ &s->mr_mmio);
pci_setup_iommu(bus, &amdvi_iommu_ops, s);
amdvi_init(s);
}
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
index 73619fe9ea..e5c2ae94f2 100644
--- a/hw/i386/amd_iommu.h
+++ b/hw/i386/amd_iommu.h
@@ -353,7 +353,7 @@ struct AMDVIState {
uint32_t pprlog_head; /* ppr log head */
uint32_t pprlog_tail; /* ppr log tail */
- MemoryRegion mmio; /* MMIO region */
+ MemoryRegion mr_mmio; /* MMIO region */
uint8_t mmior[AMDVI_MMIO_SIZE]; /* read/write MMIO */
uint8_t w1cmask[AMDVI_MMIO_SIZE]; /* read/write 1 clear mask */
uint8_t romask[AMDVI_MMIO_SIZE]; /* MMIO read/only mask */
--
2.39.3

View File

@ -0,0 +1,81 @@
From 17ce6ac0d8edb04ba79bb39d3f695cd0506a9dc2 Mon Sep 17 00:00:00 2001
From: John Allen <john.allen@amd.com>
Date: Wed, 11 Dec 2024 15:06:59 -0600
Subject: [PATCH 06/57] amd_iommu: Send notification when invalidate interrupt
entry cache
RH-Author: John Allen <None>
RH-MergeRequest: 303: Interrupt Remap support for emulated amd viommu
RH-Jira: RHEL-66202
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [4/5] d57e8fb4e69f3c01d32673bf658aae5067d6b969 (johnalle/qemu-kvm-fork)
JIRA: https://issues.redhat.com/browse/RHEL-66202
commit f84aad4d718b83d2a4d90485992e5421430032e1
Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Fri Sep 27 12:29:12 2024 -0500
amd_iommu: Send notification when invalidate interrupt entry cache
In order to support AMD IOMMU interrupt remapping emulation with PCI
pass-through devices, QEMU needs to notify VFIO when guest IOMMU driver
updates and invalidate the guest interrupt remapping table (IRT), and
communicate information so that the host IOMMU driver can update
the shadowed interrupt remapping table in the host IOMMU.
Therefore, send notification when guest IOMMU emulates the IRT
invalidation commands.
Reviewed-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
Message-Id: <20240927172913.121477-5-santosh.shukla@amd.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: John Allen <john.allen@amd.com>
---
hw/i386/amd_iommu.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 8fcf5eacb4..82d76dfca9 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -431,6 +431,12 @@ static void amdvi_complete_ppr(AMDVIState *s, uint64_t *cmd)
trace_amdvi_ppr_exec();
}
+static void amdvi_intremap_inval_notify_all(AMDVIState *s, bool global,
+ uint32_t index, uint32_t mask)
+{
+ x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask);
+}
+
static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd)
{
if (extract64(cmd[0], 0, 60) || cmd[1]) {
@@ -438,6 +444,9 @@ static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd)
s->cmdbuf + s->cmdbuf_head);
}
+ /* Notify global invalidation */
+ amdvi_intremap_inval_notify_all(s, true, 0, 0);
+
amdvi_iotlb_reset(s);
trace_amdvi_all_inval();
}
@@ -486,6 +495,9 @@ static void amdvi_inval_inttable(AMDVIState *s, uint64_t *cmd)
return;
}
+ /* Notify global invalidation */
+ amdvi_intremap_inval_notify_all(s, true, 0, 0);
+
trace_amdvi_intr_inval();
}
--
2.39.3

View File

@ -0,0 +1,105 @@
From 4859d41adfaae8933e074dcefdc81edd3832c914 Mon Sep 17 00:00:00 2001
From: John Allen <john.allen@amd.com>
Date: Wed, 11 Dec 2024 15:06:55 -0600
Subject: [PATCH 05/57] amd_iommu: Use shared memory region for Interrupt
Remapping
RH-Author: John Allen <None>
RH-MergeRequest: 303: Interrupt Remap support for emulated amd viommu
RH-Jira: RHEL-66202
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [3/5] 48c0513c80257bfbd12c2cf3bab2503bd95d0b1c (johnalle/qemu-kvm-fork)
JIRA: https://issues.redhat.com/browse/RHEL-66202
commit 9fc9dbac61ddde7d8df37e84c8e02cec249d3222
Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Fri Sep 27 12:29:11 2024 -0500
amd_iommu: Use shared memory region for Interrupt Remapping
Use shared memory region for interrupt remapping which can be
aliased by all devices.
Reviewed-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
Message-Id: <20240927172913.121477-4-santosh.shukla@amd.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: John Allen <john.allen@amd.com>
---
hw/i386/amd_iommu.c | 22 ++++++++++++++--------
hw/i386/amd_iommu.h | 1 +
2 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 567cb8adc9..8fcf5eacb4 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -1443,7 +1443,7 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
* |--------------------+-------------------+----------+
* | amdvi-root | 00000000-ffffffff | 0 |
* | amdvi-iommu_nodma | 00000000-ffffffff | 0 |
- * | amdvi-iommu_ir | fee00000-feefffff | 64 |
+ * | amdvi-iommu_ir | fee00000-feefffff | 1 |
* |--------------------+-------------------+----------|
*/
memory_region_init_iommu(&amdvi_dev_as->iommu,
@@ -1454,13 +1454,6 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
memory_region_init(&amdvi_dev_as->root, OBJECT(s),
"amdvi_root", UINT64_MAX);
address_space_init(&amdvi_dev_as->as, &amdvi_dev_as->root, name);
- memory_region_init_io(&amdvi_dev_as->iommu_ir, OBJECT(s),
- &amdvi_ir_ops, s, "amd_iommu_ir",
- AMDVI_INT_ADDR_SIZE);
- memory_region_add_subregion_overlap(&amdvi_dev_as->root,
- AMDVI_INT_ADDR_FIRST,
- &amdvi_dev_as->iommu_ir,
- 64);
memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0,
MEMORY_REGION(&amdvi_dev_as->iommu),
0);
@@ -1472,6 +1465,13 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0,
&amdvi_dev_as->iommu_nodma,
0);
+ /* Build the Interrupt Remapping alias to shared memory */
+ memory_region_init_alias(&amdvi_dev_as->iommu_ir, OBJECT(s),
+ "amdvi-ir", &s->mr_ir, 0,
+ memory_region_size(&s->mr_ir));
+ memory_region_add_subregion_overlap(MEMORY_REGION(&amdvi_dev_as->iommu),
+ AMDVI_INT_ADDR_FIRST,
+ &amdvi_dev_as->iommu_ir, 1);
if (!x86_iommu->pt_supported) {
memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, false);
@@ -1633,6 +1633,12 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
memory_region_add_subregion_overlap(&s->mr_sys, 0,
&s->mr_nodma, 0);
+ /* set up the Interrupt Remapping memory region */
+ memory_region_init_io(&s->mr_ir, OBJECT(s), &amdvi_ir_ops,
+ s, "amdvi-ir", AMDVI_INT_ADDR_SIZE);
+ memory_region_add_subregion_overlap(&s->mr_sys, AMDVI_INT_ADDR_FIRST,
+ &s->mr_ir, 1);
+
pci_setup_iommu(bus, &amdvi_iommu_ops, s);
amdvi_init(s);
}
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
index be417e51c4..e0dac4d9a9 100644
--- a/hw/i386/amd_iommu.h
+++ b/hw/i386/amd_iommu.h
@@ -356,6 +356,7 @@ struct AMDVIState {
MemoryRegion mr_mmio; /* MMIO region */
MemoryRegion mr_sys;
MemoryRegion mr_nodma;
+ MemoryRegion mr_ir;
uint8_t mmior[AMDVI_MMIO_SIZE]; /* read/write MMIO */
uint8_t w1cmask[AMDVI_MMIO_SIZE]; /* read/write 1 clear mask */
uint8_t romask[AMDVI_MMIO_SIZE]; /* MMIO read/only mask */
--
2.39.3

View File

@ -0,0 +1,53 @@
From 173beb6698538dcffefab36772e107ffb0b4fbbd Mon Sep 17 00:00:00 2001
From: Shaoqin Huang <shahuang@redhat.com>
Date: Mon, 28 Apr 2025 04:34:27 -0400
Subject: [PATCH 2/5] arm: Use arm_virt_compat_set() to apply the compat
RH-Author: Shaoqin Huang <shahuang@redhat.com>
RH-MergeRequest: 353: virtio-net: disable USO for virt-rhel9.6
RH-Jira: RHEL-80313
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Eric Auger <eric.auger@redhat.com>
RH-Commit: [2/2] 6e7a158e65296928040e70622b3cee59e45c1c36 (shahuang/qemu-kvm)
JIRA: https://issues.redhat.com/browse/RHEL-80313
Upstream Status: RHEL only
Since the pauth and uso both should apply for the latest machine type,
move them to the arm_virt_compat_set() which applies the compat to all
machine types automatically.
Signed-off-by: Shaoqin Huang <shahuang@redhat.com>
---
hw/arm/virt.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 896deaa025..2aef94e776 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -127,6 +127,10 @@ static void arm_virt_compat_set(MachineClass *mc)
arm_virt_compat_len);
compat_props_add(mc->compat_props, arm_rhel_compat,
arm_rhel_compat_len);
+ compat_props_add(mc->compat_props, arm_rhel9_compat,
+ arm_rhel9_compat_len);
+ compat_props_add(mc->compat_props, hw_compat_rhel_9,
+ hw_compat_rhel_9_len);
}
#define DEFINE_VIRT_MACHINE_IMPL(latest, ...) \
@@ -3599,10 +3603,6 @@ DEFINE_VIRT_MACHINE(2, 6)
static void virt_rhel_machine_9_6_0_options(MachineClass *mc)
{
- compat_props_add(mc->compat_props, arm_rhel9_compat, arm_rhel9_compat_len);
-
- /* NB: remember to move this line to the *latest* RHEL 9 machine */
- compat_props_add(mc->compat_props, hw_compat_rhel_9, hw_compat_rhel_9_len);
}
DEFINE_VIRT_MACHINE_AS_LATEST(9, 6, 0)
--
2.48.1

View File

@ -0,0 +1,145 @@
From f2cd96a040dd7863484d22a3995a2904605dadde Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Fri, 9 May 2025 15:40:21 -0500
Subject: [PATCH 06/16] block: Add new bdrv_co_is_all_zeroes() function
RH-Author: Eric Blake <eblake@redhat.com>
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
RH-Jira: RHEL-82906 RHEL-83015
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [4/14] aabcba8323df698a72842f299e9242a5eee3aea6 (ebblake/centos-qemu-kvm)
There are some optimizations that require knowing if an image starts
out as reading all zeroes, such as making blockdev-mirror faster by
skipping the copying of source zeroes to the destination. The
existing bdrv_co_is_zero_fast() is a good building block for answering
this question, but it tends to give an answer of 0 for a file we just
created via QMP 'blockdev-create' or similar (such as 'qemu-img create
-f raw'). Why? Because file-posix.c insists on allocating a tiny
header to any file rather than leaving it 100% sparse, due to some
filesystems that are unable to answer alignment probes on a hole. But
teaching file-posix.c to read the tiny header doesn't scale - the
problem of a small header is also visible when libvirt sets up an NBD
client to a just-created file on a migration destination host.
So, we need a wrapper function that handles a bit more complexity in a
common manner for all block devices - when the BDS is mostly a hole,
but has a small non-hole header, it is still worth the time to read
that header and check if it reads as all zeroes before giving up and
returning a pessimistic answer.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20250509204341.3553601-19-eblake@redhat.com>
(cherry picked from commit 52726096707c5c8b90597c445de897fa64d56e73)
Conflicts:
block/io.c - context with header names
Jira: https://issues.redhat.com/browse/RHEL-82906
Jira: https://issues.redhat.com/browse/RHEL-83015
Signed-off-by: Eric Blake <eblake@redhat.com>
---
block/io.c | 62 ++++++++++++++++++++++++++++++++++++++++
include/block/block-io.h | 2 ++
2 files changed, 64 insertions(+)
diff --git a/block/io.c b/block/io.c
index 293c5dd393..1f01337599 100644
--- a/block/io.c
+++ b/block/io.c
@@ -38,10 +38,14 @@
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "sysemu/replay.h"
+#include "qemu/units.h"
/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
+/* Maximum read size for checking if data reads as zero, in bytes */
+#define MAX_ZERO_CHECK_BUFFER (128 * KiB)
+
static void coroutine_fn GRAPH_RDLOCK
bdrv_parent_cb_resize(BlockDriverState *bs);
@@ -2774,6 +2778,64 @@ int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
return 1;
}
+/*
+ * Check @bs (and its backing chain) to see if the entire image is known
+ * to read as zeroes.
+ * Return 1 if that is the case, 0 otherwise and -errno on error.
+ * This test is meant to be fast rather than accurate so returning 0
+ * does not guarantee non-zero data; however, a return of 1 is reliable,
+ * and this function can report 1 in more cases than bdrv_co_is_zero_fast.
+ */
+int coroutine_fn bdrv_co_is_all_zeroes(BlockDriverState *bs)
+{
+ int ret;
+ int64_t pnum, bytes;
+ char *buf;
+ QEMUIOVector local_qiov;
+ IO_CODE();
+
+ bytes = bdrv_co_getlength(bs);
+ if (bytes < 0) {
+ return bytes;
+ }
+
+ /* First probe - see if the entire image reads as zero */
+ ret = bdrv_co_common_block_status_above(bs, NULL, false, BDRV_WANT_ZERO,
+ 0, bytes, &pnum, NULL, NULL,
+ NULL);
+ if (ret < 0) {
+ return ret;
+ }
+ if (ret & BDRV_BLOCK_ZERO) {
+ return bdrv_co_is_zero_fast(bs, pnum, bytes - pnum);
+ }
+
+ /*
+ * Because of the way 'blockdev-create' works, raw files tend to
+ * be created with a non-sparse region at the front to make
+ * alignment probing easier. If the block starts with only a
+ * small allocated region, it is still worth the effort to see if
+ * the rest of the image is still sparse, coupled with manually
+ * reading the first region to see if it reads zero after all.
+ */
+ if (pnum > MAX_ZERO_CHECK_BUFFER) {
+ return 0;
+ }
+ ret = bdrv_co_is_zero_fast(bs, pnum, bytes - pnum);
+ if (ret <= 0) {
+ return ret;
+ }
+ /* Only the head of the image is unknown, and it's small. Read it. */
+ buf = qemu_blockalign(bs, pnum);
+ qemu_iovec_init_buf(&local_qiov, buf, pnum);
+ ret = bdrv_driver_preadv(bs, 0, pnum, &local_qiov, 0, 0);
+ if (ret >= 0) {
+ ret = buffer_is_zero(buf, pnum);
+ }
+ qemu_vfree(buf);
+ return ret;
+}
+
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset,
int64_t bytes, int64_t *pnum)
{
diff --git a/include/block/block-io.h b/include/block/block-io.h
index b49e0537dd..b99cc98d26 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -161,6 +161,8 @@ bdrv_is_allocated_above(BlockDriverState *bs, BlockDriverState *base,
int coroutine_fn GRAPH_RDLOCK
bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, int64_t bytes);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_is_all_zeroes(BlockDriverState *bs);
int GRAPH_RDLOCK
bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg,
--
2.48.1

View File

@ -0,0 +1,689 @@
From 26f5d221dd16137bed3527ee120cdf085e2c7e23 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Fri, 9 May 2025 15:40:18 -0500
Subject: [PATCH 03/16] block: Expand block status mode from bool to flags
RH-Author: Eric Blake <eblake@redhat.com>
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
RH-Jira: RHEL-82906 RHEL-83015
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [1/14] 9de5245def80e9815ed306e4abce9caec56cef6f (ebblake/centos-qemu-kvm)
This patch is purely mechanical, changing bool want_zero into an
unsigned int for bitwise-or of flags. As of this patch, all
implementations are unchanged (the old want_zero==true is now
mode==BDRV_WANT_PRECISE which is a superset of BDRV_WANT_ZERO); but
the callers in io.c that used to pass want_zero==false are now
prepared for future driver changes that can now distinguish bewteen
BDRV_WANT_ZERO vs. BDRV_WANT_ALLOCATED. The next patch will actually
change the file-posix driver along those lines, now that we have
more-specific hints.
As for the background why this patch is useful: right now, the
file-posix driver recognizes that if allocation is being queried, the
entire image can be reported as allocated (there is no backing file to
refer to) - but this throws away information on whether the entire
image reads as zero (trivially true if lseek(SEEK_HOLE) at offset 0
returns -ENXIO, a bit more complicated to prove if the raw file was
created with 'qemu-img create' since we intentionally allocate a small
chunk of all-zero data to help with alignment probing). Later patches
will add a generic algorithm for seeing if an entire file reads as
zeroes.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20250509204341.3553601-16-eblake@redhat.com>
(cherry picked from commit c33159dec79069514f78faecfe268439226b0f5b)
Jira: https://issues.redhat.com/browse/RHEL-82906
Jira: https://issues.redhat.com/browse/RHEL-83015
Signed-off-by: Eric Blake <eblake@redhat.com>
---
block/blkdebug.c | 6 ++--
block/copy-before-write.c | 4 +--
block/coroutines.h | 4 +--
block/file-posix.c | 4 +--
block/gluster.c | 4 +--
block/io.c | 51 ++++++++++++++++----------------
block/iscsi.c | 6 ++--
block/nbd.c | 4 +--
block/null.c | 6 ++--
block/parallels.c | 6 ++--
block/qcow.c | 2 +-
block/qcow2.c | 6 ++--
block/qed.c | 6 ++--
block/quorum.c | 4 +--
block/raw-format.c | 4 +--
block/rbd.c | 6 ++--
block/snapshot-access.c | 4 +--
block/vdi.c | 4 +--
block/vmdk.c | 2 +-
block/vpc.c | 2 +-
block/vvfat.c | 6 ++--
include/block/block-common.h | 11 +++++++
include/block/block_int-common.h | 27 +++++++++--------
include/block/block_int-io.h | 4 +--
tests/unit/test-block-iothread.c | 2 +-
25 files changed, 99 insertions(+), 86 deletions(-)
diff --git a/block/blkdebug.c b/block/blkdebug.c
index c95c818c38..736ae2b56b 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -751,9 +751,9 @@ blkdebug_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
}
static int coroutine_fn GRAPH_RDLOCK
-blkdebug_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
- int64_t bytes, int64_t *pnum, int64_t *map,
- BlockDriverState **file)
+blkdebug_co_block_status(BlockDriverState *bs, unsigned int mode,
+ int64_t offset, int64_t bytes, int64_t *pnum,
+ int64_t *map, BlockDriverState **file)
{
int err;
diff --git a/block/copy-before-write.c b/block/copy-before-write.c
index 853e01a1eb..36488cdeca 100644
--- a/block/copy-before-write.c
+++ b/block/copy-before-write.c
@@ -290,8 +290,8 @@ cbw_co_preadv_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes,
}
static int coroutine_fn GRAPH_RDLOCK
-cbw_co_snapshot_block_status(BlockDriverState *bs,
- bool want_zero, int64_t offset, int64_t bytes,
+cbw_co_snapshot_block_status(BlockDriverState *bs, unsigned int mode,
+ int64_t offset, int64_t bytes,
int64_t *pnum, int64_t *map,
BlockDriverState **file)
{
diff --git a/block/coroutines.h b/block/coroutines.h
index f3226682d6..811ef12e43 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -47,7 +47,7 @@ int coroutine_fn GRAPH_RDLOCK
bdrv_co_common_block_status_above(BlockDriverState *bs,
BlockDriverState *base,
bool include_base,
- bool want_zero,
+ unsigned int mode,
int64_t offset,
int64_t bytes,
int64_t *pnum,
@@ -78,7 +78,7 @@ int co_wrapper_mixed_bdrv_rdlock
bdrv_common_block_status_above(BlockDriverState *bs,
BlockDriverState *base,
bool include_base,
- bool want_zero,
+ unsigned int mode,
int64_t offset,
int64_t bytes,
int64_t *pnum,
diff --git a/block/file-posix.c b/block/file-posix.c
index f17a3f4d10..9ca55620ca 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -3277,7 +3277,7 @@ static int find_allocation(BlockDriverState *bs, off_t start,
* well exceed it.
*/
static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
- bool want_zero,
+ unsigned int mode,
int64_t offset,
int64_t bytes, int64_t *pnum,
int64_t *map,
@@ -3293,7 +3293,7 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
return ret;
}
- if (!want_zero) {
+ if (mode != BDRV_WANT_PRECISE) {
*pnum = bytes;
*map = offset;
*file = bs;
diff --git a/block/gluster.c b/block/gluster.c
index f8b415f381..ae5c45666b 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -1466,7 +1466,7 @@ exit:
* (Based on raw_co_block_status() from file-posix.c.)
*/
static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs,
- bool want_zero,
+ unsigned int mode,
int64_t offset,
int64_t bytes,
int64_t *pnum,
@@ -1483,7 +1483,7 @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs,
return ret;
}
- if (!want_zero) {
+ if (mode != BDRV_WANT_PRECISE) {
*pnum = bytes;
*map = offset;
*file = bs;
diff --git a/block/io.c b/block/io.c
index 3e189837a1..daaafe00d7 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2360,10 +2360,8 @@ int bdrv_flush_all(void)
* Drivers not implementing the functionality are assumed to not support
* backing files, hence all their sectors are reported as allocated.
*
- * If 'want_zero' is true, the caller is querying for mapping
- * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
- * _ZERO where possible; otherwise, the result favors larger 'pnum',
- * with a focus on accurate BDRV_BLOCK_ALLOCATED.
+ * 'mode' serves as a hint as to which results are favored; see the
+ * BDRV_WANT_* macros for details.
*
* If 'offset' is beyond the end of the disk image the return value is
* BDRV_BLOCK_EOF and 'pnum' is set to 0.
@@ -2383,7 +2381,7 @@ int bdrv_flush_all(void)
* set to the host mapping and BDS corresponding to the guest offset.
*/
static int coroutine_fn GRAPH_RDLOCK
-bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero,
+bdrv_co_do_block_status(BlockDriverState *bs, unsigned int mode,
int64_t offset, int64_t bytes,
int64_t *pnum, int64_t *map, BlockDriverState **file)
{
@@ -2472,7 +2470,7 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero,
local_file = bs;
local_map = aligned_offset;
} else {
- ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
+ ret = bs->drv->bdrv_co_block_status(bs, mode, aligned_offset,
aligned_bytes, pnum, &local_map,
&local_file);
@@ -2484,10 +2482,10 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero,
* the cache requires an RCU update, so double check here to avoid
* such an update if possible.
*
- * Check want_zero, because we only want to update the cache when we
+ * Check mode, because we only want to update the cache when we
* have accurate information about what is zero and what is data.
*/
- if (want_zero &&
+ if (mode == BDRV_WANT_PRECISE &&
ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) &&
QLIST_EMPTY(&bs->children))
{
@@ -2544,7 +2542,7 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero,
if (ret & BDRV_BLOCK_RAW) {
assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
- ret = bdrv_co_do_block_status(local_file, want_zero, local_map,
+ ret = bdrv_co_do_block_status(local_file, mode, local_map,
*pnum, pnum, &local_map, &local_file);
goto out;
}
@@ -2556,7 +2554,7 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero,
if (!cow_bs) {
ret |= BDRV_BLOCK_ZERO;
- } else if (want_zero) {
+ } else if (mode == BDRV_WANT_PRECISE) {
int64_t size2 = bdrv_co_getlength(cow_bs);
if (size2 >= 0 && offset >= size2) {
@@ -2565,14 +2563,14 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero,
}
}
- if (want_zero && ret & BDRV_BLOCK_RECURSE &&
+ if (mode == BDRV_WANT_PRECISE && ret & BDRV_BLOCK_RECURSE &&
local_file && local_file != bs &&
(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
(ret & BDRV_BLOCK_OFFSET_VALID)) {
int64_t file_pnum;
int ret2;
- ret2 = bdrv_co_do_block_status(local_file, want_zero, local_map,
+ ret2 = bdrv_co_do_block_status(local_file, mode, local_map,
*pnum, &file_pnum, NULL, NULL);
if (ret2 >= 0) {
/* Ignore errors. This is just providing extra information, it
@@ -2623,7 +2621,7 @@ int coroutine_fn
bdrv_co_common_block_status_above(BlockDriverState *bs,
BlockDriverState *base,
bool include_base,
- bool want_zero,
+ unsigned int mode,
int64_t offset,
int64_t bytes,
int64_t *pnum,
@@ -2650,7 +2648,7 @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
return 0;
}
- ret = bdrv_co_do_block_status(bs, want_zero, offset, bytes, pnum,
+ ret = bdrv_co_do_block_status(bs, mode, offset, bytes, pnum,
map, file);
++*depth;
if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) {
@@ -2667,7 +2665,7 @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base;
p = bdrv_filter_or_cow_bs(p))
{
- ret = bdrv_co_do_block_status(p, want_zero, offset, bytes, pnum,
+ ret = bdrv_co_do_block_status(p, mode, offset, bytes, pnum,
map, file);
++*depth;
if (ret < 0) {
@@ -2730,7 +2728,8 @@ int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
BlockDriverState **file)
{
IO_CODE();
- return bdrv_co_common_block_status_above(bs, base, false, true, offset,
+ return bdrv_co_common_block_status_above(bs, base, false,
+ BDRV_WANT_PRECISE, offset,
bytes, pnum, map, file, NULL);
}
@@ -2761,8 +2760,9 @@ int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
return 1;
}
- ret = bdrv_co_common_block_status_above(bs, NULL, false, false, offset,
- bytes, &pnum, NULL, NULL, NULL);
+ ret = bdrv_co_common_block_status_above(bs, NULL, false, BDRV_WANT_ZERO,
+ offset, bytes, &pnum, NULL, NULL,
+ NULL);
if (ret < 0) {
return ret;
@@ -2778,9 +2778,9 @@ int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset,
int64_t dummy;
IO_CODE();
- ret = bdrv_co_common_block_status_above(bs, bs, true, false, offset,
- bytes, pnum ? pnum : &dummy, NULL,
- NULL, NULL);
+ ret = bdrv_co_common_block_status_above(bs, bs, true, BDRV_WANT_ALLOCATED,
+ offset, bytes, pnum ? pnum : &dummy,
+ NULL, NULL, NULL);
if (ret < 0) {
return ret;
}
@@ -2813,7 +2813,8 @@ int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *bs,
int ret;
IO_CODE();
- ret = bdrv_co_common_block_status_above(bs, base, include_base, false,
+ ret = bdrv_co_common_block_status_above(bs, base, include_base,
+ BDRV_WANT_ALLOCATED,
offset, bytes, pnum, NULL, NULL,
&depth);
if (ret < 0) {
@@ -3710,8 +3711,8 @@ bdrv_co_preadv_snapshot(BdrvChild *child, int64_t offset, int64_t bytes,
}
int coroutine_fn
-bdrv_co_snapshot_block_status(BlockDriverState *bs,
- bool want_zero, int64_t offset, int64_t bytes,
+bdrv_co_snapshot_block_status(BlockDriverState *bs, unsigned int mode,
+ int64_t offset, int64_t bytes,
int64_t *pnum, int64_t *map,
BlockDriverState **file)
{
@@ -3729,7 +3730,7 @@ bdrv_co_snapshot_block_status(BlockDriverState *bs,
}
bdrv_inc_in_flight(bs);
- ret = drv->bdrv_co_snapshot_block_status(bs, want_zero, offset, bytes,
+ ret = drv->bdrv_co_snapshot_block_status(bs, mode, offset, bytes,
pnum, map, file);
bdrv_dec_in_flight(bs);
diff --git a/block/iscsi.c b/block/iscsi.c
index 979bf90cb7..d7caa4b363 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -694,9 +694,9 @@ out_unlock:
static int coroutine_fn iscsi_co_block_status(BlockDriverState *bs,
- bool want_zero, int64_t offset,
- int64_t bytes, int64_t *pnum,
- int64_t *map,
+ unsigned int mode,
+ int64_t offset, int64_t bytes,
+ int64_t *pnum, int64_t *map,
BlockDriverState **file)
{
IscsiLun *iscsilun = bs->opaque;
diff --git a/block/nbd.c b/block/nbd.c
index d464315766..a359aa236e 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -1397,8 +1397,8 @@ nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
}
static int coroutine_fn GRAPH_RDLOCK nbd_client_co_block_status(
- BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
- int64_t *pnum, int64_t *map, BlockDriverState **file)
+ BlockDriverState *bs, unsigned int mode, int64_t offset,
+ int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file)
{
int ret, request_ret;
NBDExtent64 extent = { 0 };
diff --git a/block/null.c b/block/null.c
index 4730acc1eb..95021230c8 100644
--- a/block/null.c
+++ b/block/null.c
@@ -227,9 +227,9 @@ static int null_reopen_prepare(BDRVReopenState *reopen_state,
}
static int coroutine_fn null_co_block_status(BlockDriverState *bs,
- bool want_zero, int64_t offset,
- int64_t bytes, int64_t *pnum,
- int64_t *map,
+ unsigned int mode,
+ int64_t offset, int64_t bytes,
+ int64_t *pnum, int64_t *map,
BlockDriverState **file)
{
BDRVNullState *s = bs->opaque;
diff --git a/block/parallels.c b/block/parallels.c
index 9205a0864f..22ea7834fd 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -416,9 +416,9 @@ parallels_co_flush_to_os(BlockDriverState *bs)
}
static int coroutine_fn GRAPH_RDLOCK
-parallels_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
- int64_t bytes, int64_t *pnum, int64_t *map,
- BlockDriverState **file)
+parallels_co_block_status(BlockDriverState *bs, unsigned int mode,
+ int64_t offset, int64_t bytes, int64_t *pnum,
+ int64_t *map, BlockDriverState **file)
{
BDRVParallelsState *s = bs->opaque;
int count;
diff --git a/block/qcow.c b/block/qcow.c
index c2f89db055..2e18c42d8f 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -530,7 +530,7 @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate,
}
static int coroutine_fn GRAPH_RDLOCK
-qcow_co_block_status(BlockDriverState *bs, bool want_zero,
+qcow_co_block_status(BlockDriverState *bs, unsigned int mode,
int64_t offset, int64_t bytes, int64_t *pnum,
int64_t *map, BlockDriverState **file)
{
diff --git a/block/qcow2.c b/block/qcow2.c
index a4cffb628c..788da07fee 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2147,9 +2147,9 @@ static void qcow2_join_options(QDict *options, QDict *old_options)
}
static int coroutine_fn GRAPH_RDLOCK
-qcow2_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
- int64_t count, int64_t *pnum, int64_t *map,
- BlockDriverState **file)
+qcow2_co_block_status(BlockDriverState *bs, unsigned int mode,
+ int64_t offset, int64_t count, int64_t *pnum,
+ int64_t *map, BlockDriverState **file)
{
BDRVQcow2State *s = bs->opaque;
uint64_t host_offset;
diff --git a/block/qed.c b/block/qed.c
index fa5bc11085..b135e981e5 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -832,9 +832,9 @@ fail:
}
static int coroutine_fn GRAPH_RDLOCK
-bdrv_qed_co_block_status(BlockDriverState *bs, bool want_zero, int64_t pos,
- int64_t bytes, int64_t *pnum, int64_t *map,
- BlockDriverState **file)
+bdrv_qed_co_block_status(BlockDriverState *bs, unsigned int mode,
+ int64_t pos, int64_t bytes, int64_t *pnum,
+ int64_t *map, BlockDriverState **file)
{
BDRVQEDState *s = bs->opaque;
size_t len = MIN(bytes, SIZE_MAX);
diff --git a/block/quorum.c b/block/quorum.c
index db8fe891c4..bb4ed9483e 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -1226,7 +1226,7 @@ static void quorum_child_perm(BlockDriverState *bs, BdrvChild *c,
* region contains zeroes, and BDRV_BLOCK_DATA otherwise.
*/
static int coroutine_fn GRAPH_RDLOCK
-quorum_co_block_status(BlockDriverState *bs, bool want_zero,
+quorum_co_block_status(BlockDriverState *bs, unsigned int mode,
int64_t offset, int64_t count,
int64_t *pnum, int64_t *map, BlockDriverState **file)
{
@@ -1238,7 +1238,7 @@ quorum_co_block_status(BlockDriverState *bs, bool want_zero,
for (i = 0; i < s->num_children; i++) {
int64_t bytes;
ret = bdrv_co_common_block_status_above(s->children[i]->bs, NULL, false,
- want_zero, offset, count,
+ mode, offset, count,
&bytes, NULL, NULL, NULL);
if (ret < 0) {
quorum_report_bad(QUORUM_OP_TYPE_READ, offset, count,
diff --git a/block/raw-format.c b/block/raw-format.c
index ac7e8495f6..623bca87a6 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -283,8 +283,8 @@ fail:
}
static int coroutine_fn GRAPH_RDLOCK
-raw_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
- int64_t bytes, int64_t *pnum, int64_t *map,
+raw_co_block_status(BlockDriverState *bs, unsigned int mode,
+ int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map,
BlockDriverState **file)
{
BDRVRawState *s = bs->opaque;
diff --git a/block/rbd.c b/block/rbd.c
index 9c0fd0cb3f..627f8eb05a 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -1504,9 +1504,9 @@ static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len,
}
static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
- bool want_zero, int64_t offset,
- int64_t bytes, int64_t *pnum,
- int64_t *map,
+ unsigned int mode,
+ int64_t offset, int64_t bytes,
+ int64_t *pnum, int64_t *map,
BlockDriverState **file)
{
BDRVRBDState *s = bs->opaque;
diff --git a/block/snapshot-access.c b/block/snapshot-access.c
index 84d0d13f86..972b8f2e68 100644
--- a/block/snapshot-access.c
+++ b/block/snapshot-access.c
@@ -41,11 +41,11 @@ snapshot_access_co_preadv_part(BlockDriverState *bs,
static int coroutine_fn GRAPH_RDLOCK
snapshot_access_co_block_status(BlockDriverState *bs,
- bool want_zero, int64_t offset,
+ unsigned int mode, int64_t offset,
int64_t bytes, int64_t *pnum,
int64_t *map, BlockDriverState **file)
{
- return bdrv_co_snapshot_block_status(bs->file->bs, want_zero, offset,
+ return bdrv_co_snapshot_block_status(bs->file->bs, mode, offset,
bytes, pnum, map, file);
}
diff --git a/block/vdi.c b/block/vdi.c
index 6363da08ce..028fe68488 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -521,8 +521,8 @@ static int vdi_reopen_prepare(BDRVReopenState *state,
}
static int coroutine_fn GRAPH_RDLOCK
-vdi_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
- int64_t bytes, int64_t *pnum, int64_t *map,
+vdi_co_block_status(BlockDriverState *bs, unsigned int mode,
+ int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map,
BlockDriverState **file)
{
BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
diff --git a/block/vmdk.c b/block/vmdk.c
index 78f6433607..6f1af82078 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -1777,7 +1777,7 @@ static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent,
}
static int coroutine_fn GRAPH_RDLOCK
-vmdk_co_block_status(BlockDriverState *bs, bool want_zero,
+vmdk_co_block_status(BlockDriverState *bs, unsigned int mode,
int64_t offset, int64_t bytes, int64_t *pnum,
int64_t *map, BlockDriverState **file)
{
diff --git a/block/vpc.c b/block/vpc.c
index d95a204612..0dd641b614 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -721,7 +721,7 @@ fail:
}
static int coroutine_fn GRAPH_RDLOCK
-vpc_co_block_status(BlockDriverState *bs, bool want_zero,
+vpc_co_block_status(BlockDriverState *bs, unsigned int mode,
int64_t offset, int64_t bytes,
int64_t *pnum, int64_t *map,
BlockDriverState **file)
diff --git a/block/vvfat.c b/block/vvfat.c
index 8ffe8b3b9b..d59231357e 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -3135,9 +3135,9 @@ vvfat_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
}
static int coroutine_fn vvfat_co_block_status(BlockDriverState *bs,
- bool want_zero, int64_t offset,
- int64_t bytes, int64_t *n,
- int64_t *map,
+ unsigned int mode,
+ int64_t offset, int64_t bytes,
+ int64_t *n, int64_t *map,
BlockDriverState **file)
{
*n = bytes;
diff --git a/include/block/block-common.h b/include/block/block-common.h
index 7030669f04..5beee6402b 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -333,6 +333,17 @@ typedef enum {
#define BDRV_BLOCK_RECURSE 0x40
#define BDRV_BLOCK_COMPRESSED 0x80
+/*
+ * Block status hints: the bitwise-or of these flags emphasize what
+ * the caller hopes to learn, and some drivers may be able to give
+ * faster answers by doing less work when the hint permits.
+ */
+#define BDRV_WANT_ZERO BDRV_BLOCK_ZERO
+#define BDRV_WANT_OFFSET_VALID BDRV_BLOCK_OFFSET_VALID
+#define BDRV_WANT_ALLOCATED BDRV_BLOCK_ALLOCATED
+#define BDRV_WANT_PRECISE (BDRV_WANT_ZERO | BDRV_WANT_OFFSET_VALID | \
+ BDRV_WANT_OFFSET_VALID)
+
typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;
typedef struct BDRVReopenState {
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index ebb4e56a50..a9c0daa2a4 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -608,15 +608,16 @@ struct BlockDriver {
* according to the current layer, and should only need to set
* BDRV_BLOCK_DATA, BDRV_BLOCK_ZERO, BDRV_BLOCK_OFFSET_VALID,
* and/or BDRV_BLOCK_RAW; if the current layer defers to a backing
- * layer, the result should be 0 (and not BDRV_BLOCK_ZERO). See
- * block.h for the overall meaning of the bits. As a hint, the
- * flag want_zero is true if the caller cares more about precise
- * mappings (favor accurate _OFFSET_VALID/_ZERO) or false for
- * overall allocation (favor larger *pnum, perhaps by reporting
- * _DATA instead of _ZERO). The block layer guarantees input
- * clamped to bdrv_getlength() and aligned to request_alignment,
- * as well as non-NULL pnum, map, and file; in turn, the driver
- * must return an error or set pnum to an aligned non-zero value.
+ * layer, the result should be 0 (and not BDRV_BLOCK_ZERO). The
+ * caller will synthesize BDRV_BLOCK_ALLOCATED based on the
+ * non-zero results. See block.h for the overall meaning of the
+ * bits. As a hint, the flags in @mode may include a bitwise-or
+ * of BDRV_WANT_ALLOCATED, BDRV_WANT_OFFSET_VALID, or
+ * BDRV_WANT_ZERO based on what the caller is looking for in the
+ * results. The block layer guarantees input clamped to
+ * bdrv_getlength() and aligned to request_alignment, as well as
+ * non-NULL pnum, map, and file; in turn, the driver must return
+ * an error or set pnum to an aligned non-zero value.
*
* Note that @bytes is just a hint on how big of a region the
* caller wants to inspect. It is not a limit on *pnum.
@@ -628,8 +629,8 @@ struct BlockDriver {
* to clamping *pnum for return to its caller.
*/
int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_block_status)(
- BlockDriverState *bs,
- bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum,
+ BlockDriverState *bs, unsigned int mode,
+ int64_t offset, int64_t bytes, int64_t *pnum,
int64_t *map, BlockDriverState **file);
/*
@@ -653,8 +654,8 @@ struct BlockDriver {
QEMUIOVector *qiov, size_t qiov_offset);
int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_snapshot_block_status)(
- BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
- int64_t *pnum, int64_t *map, BlockDriverState **file);
+ BlockDriverState *bs, unsigned int mode, int64_t offset,
+ int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file);
int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_pdiscard_snapshot)(
BlockDriverState *bs, int64_t offset, int64_t bytes);
diff --git a/include/block/block_int-io.h b/include/block/block_int-io.h
index 4a7cf2b4fd..4f94eb3c5a 100644
--- a/include/block/block_int-io.h
+++ b/include/block/block_int-io.h
@@ -38,8 +38,8 @@
int coroutine_fn GRAPH_RDLOCK bdrv_co_preadv_snapshot(BdrvChild *child,
int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset);
int coroutine_fn GRAPH_RDLOCK bdrv_co_snapshot_block_status(
- BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
- int64_t *pnum, int64_t *map, BlockDriverState **file);
+ BlockDriverState *bs, unsigned int mode, int64_t offset,
+ int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file);
int coroutine_fn GRAPH_RDLOCK bdrv_co_pdiscard_snapshot(BlockDriverState *bs,
int64_t offset, int64_t bytes);
diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c
index 3766d5de6b..373b72fdd8 100644
--- a/tests/unit/test-block-iothread.c
+++ b/tests/unit/test-block-iothread.c
@@ -63,7 +63,7 @@ bdrv_test_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
}
static int coroutine_fn bdrv_test_co_block_status(BlockDriverState *bs,
- bool want_zero,
+ unsigned int mode,
int64_t offset, int64_t count,
int64_t *pnum, int64_t *map,
BlockDriverState **file)
--
2.48.1

View File

@ -0,0 +1,90 @@
From 9f8158e56beae4221e91feb5a98cb4db9076cac4 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Fri, 9 May 2025 15:40:20 -0500
Subject: [PATCH 05/16] block: Let bdrv_co_is_zero_fast consolidate adjacent
extents
RH-Author: Eric Blake <eblake@redhat.com>
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
RH-Jira: RHEL-82906 RHEL-83015
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [3/14] 98bf9ff773d9a36f8a8e294e38629e3f20c41334 (ebblake/centos-qemu-kvm)
Some BDS drivers have a cap on how much block status they can supply
in one query (for example, NBD talking to an older server cannot
inspect more than 4G per query; and qcow2 tends to cap its answers
rather than cross a cluster boundary of an L1 table). Although the
existing callers of bdrv_co_is_zero_fast are not passing in that large
of a 'bytes' parameter, an upcoming caller wants to query the entire
image at once, and will thus benefit from being able to treat adjacent
zero regions in a coalesced manner, rather than claiming the region is
non-zero merely because pnum was truncated and didn't match the
incoming bytes.
While refactoring this into a loop, note that there is no need to
assign pnum prior to calling bdrv_co_common_block_status_above() (it
is guaranteed to be assigned deeper in the callstack).
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20250509204341.3553601-18-eblake@redhat.com>
(cherry picked from commit 31bf15d97dd1d205a3b264675f9a1b3bd1939068)
Jira: https://issues.redhat.com/browse/RHEL-82906
Jira: https://issues.redhat.com/browse/RHEL-83015
Signed-off-by: Eric Blake <eblake@redhat.com>
---
block/io.c | 27 +++++++++++++++------------
1 file changed, 15 insertions(+), 12 deletions(-)
diff --git a/block/io.c b/block/io.c
index daaafe00d7..293c5dd393 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2747,28 +2747,31 @@ int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, int64_t offset,
* by @offset and @bytes is known to read as zeroes.
* Return 1 if that is the case, 0 otherwise and -errno on error.
* This test is meant to be fast rather than accurate so returning 0
- * does not guarantee non-zero data.
+ * does not guarantee non-zero data; but a return of 1 is reliable.
*/
int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
int64_t bytes)
{
int ret;
- int64_t pnum = bytes;
+ int64_t pnum;
IO_CODE();
- if (!bytes) {
- return 1;
- }
-
- ret = bdrv_co_common_block_status_above(bs, NULL, false, BDRV_WANT_ZERO,
- offset, bytes, &pnum, NULL, NULL,
- NULL);
+ while (bytes) {
+ ret = bdrv_co_common_block_status_above(bs, NULL, false,
+ BDRV_WANT_ZERO, offset, bytes,
+ &pnum, NULL, NULL, NULL);
- if (ret < 0) {
- return ret;
+ if (ret < 0) {
+ return ret;
+ }
+ if (!(ret & BDRV_BLOCK_ZERO)) {
+ return 0;
+ }
+ offset += pnum;
+ bytes -= pnum;
}
- return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO);
+ return 1;
}
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset,
--
2.48.1

View File

@ -0,0 +1,74 @@
From e629a362860977161e43ed80bb59d1d05a06b2f2 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 17 Apr 2025 11:05:28 -0400
Subject: [PATCH 4/5] block/io: skip head/tail requests on EINVAL
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 355: file-posix: probe discard alignment on Linux block devices
RH-Jira: RHEL-86032
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
RH-Acked-by: Eric Blake <eblake@redhat.com>
RH-Commit: [2/3] 0028fb11f18e16e2aba9506eabb2383c406d17b5 (stefanha/centos-stream-qemu-kvm)
When guests send misaligned discard requests, the block layer breaks
them up into a misaligned head, an aligned main body, and a misaligned
tail.
The file-posix block driver on Linux returns -EINVAL on misaligned
discard requests. This causes bdrv_co_pdiscard() to fail and guests
configured with werror=stop will pause.
Add a special case for misaligned head/tail requests. Simply continue
when EINVAL is encountered so that the aligned main body of the request
can be completed and the guest is not paused. This is the best we can do
when guest discard limits do not match the host discard limits.
Fixes: https://issues.redhat.com/browse/RHEL-86032
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Hanna Czenczek <hreitz@redhat.com>
Message-ID: <20250417150528.76470-3-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit 4733cb0833c4b223f92ec0136980eeb5239ecb87)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
block/io.c | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/block/io.c b/block/io.c
index 301514c880..3e189837a1 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3105,11 +3105,12 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
/* Invalidate the cached block-status data range if this discard overlaps */
bdrv_bsc_invalidate_range(bs, offset, bytes);
- /* Discard is advisory, but some devices track and coalesce
+ /*
+ * Discard is advisory, but some devices track and coalesce
* unaligned requests, so we must pass everything down rather than
- * round here. Still, most devices will just silently ignore
- * unaligned requests (by returning -ENOTSUP), so we must fragment
- * the request accordingly. */
+ * round here. Still, most devices reject unaligned requests with
+ * -EINVAL or -ENOTSUP, so we must fragment the request accordingly.
+ */
align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
assert(align % bs->bl.request_alignment == 0);
head = offset % align;
@@ -3176,7 +3177,11 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
}
}
if (ret && ret != -ENOTSUP) {
- goto out;
+ if (ret == -EINVAL && (offset % align != 0 || num % align != 0)) {
+ /* Silently skip rejected unaligned head/tail requests */
+ } else {
+ goto out; /* bail out */
+ }
}
offset += num;
--
2.48.1

View File

@ -0,0 +1,48 @@
From d38bdce712f572e1920e3344132ff6600d657de2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:41 +0100
Subject: [PATCH 29/57] block: skip automatic zero-init of large array in
ioq_submit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [3/30] 301a08b3acdcd95634dec5dab1d96fcfe3abf3be (stefanha/centos-stream-qemu-kvm)
The 'ioq_submit' method has a struct array that is 8k in size.
Skip the automatic zero-init of this array to eliminate the
performance overhead in the I/O hot path.
The 'iocbs' array will selectively initialized when processing
the I/O data.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-4-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 83750c1da807c973b0b11d977d61df7e41122d03)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
block/linux-aio.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/block/linux-aio.c b/block/linux-aio.c
index e3b5ec9aba..26d9f086d2 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -291,7 +291,7 @@ static void ioq_submit(LinuxAioState *s)
{
int ret, len;
struct qemu_laiocb *aiocb;
- struct iocb *iocbs[MAX_EVENTS];
+ QEMU_UNINITIALIZED struct iocb *iocbs[MAX_EVENTS];
QSIMPLEQ_HEAD(, qemu_laiocb) completed;
do {
--
2.39.3

View File

@ -0,0 +1,49 @@
From 1e8798a3adbbfc42167aaba0ee18175deac37193 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:42 +0100
Subject: [PATCH 30/57] chardev/char-fd: skip automatic zero-init of large
array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [4/30] b16fe5c9af4756e1856cd330df02a1a09d9f33ea (stefanha/centos-stream-qemu-kvm)
The 'fd_chr_read' method has a 4k byte array used for copying
data between the socket and device. Skip the automatic zero-init
of this array to eliminate the performance overhead in the I/O
hot path.
The 'buf' array will be fully initialized when reading data off
the network socket.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-5-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit a503bdc22b91869e3bf45522e36b122889465306)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
chardev/char-fd.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/chardev/char-fd.c b/chardev/char-fd.c
index d2c4923359..8dd662c066 100644
--- a/chardev/char-fd.c
+++ b/chardev/char-fd.c
@@ -50,7 +50,7 @@ static gboolean fd_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque)
Chardev *chr = CHARDEV(opaque);
FDChardev *s = FD_CHARDEV(opaque);
int len;
- uint8_t buf[CHR_READ_BUF_LEN];
+ QEMU_UNINITIALIZED uint8_t buf[CHR_READ_BUF_LEN];
ssize_t ret;
len = sizeof(buf);
--
2.39.3

View File

@ -0,0 +1,49 @@
From 74311b0ee8e211fccff211b975e4ae9236c063dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:43 +0100
Subject: [PATCH 31/57] chardev/char-pty: skip automatic zero-init of large
array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [5/30] a3b8458c30f485551093f292c00c20b0e118df77 (stefanha/centos-stream-qemu-kvm)
The 'pty_chr_read' method has a 4k byte array used for copying
data between the PTY and device. Skip the automatic zero-init
of this array to eliminate the performance overhead in the I/O
hot path.
The 'buf' array will be fully initialized when reading data off
the PTY.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-6-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 45bb7fb21c8d18294a9f92da99d01ab3c67c7df2)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
chardev/char-pty.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/chardev/char-pty.c b/chardev/char-pty.c
index cc2f7617fe..3319ad215d 100644
--- a/chardev/char-pty.c
+++ b/chardev/char-pty.c
@@ -152,7 +152,7 @@ static gboolean pty_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque)
Chardev *chr = CHARDEV(opaque);
PtyChardev *s = PTY_CHARDEV(opaque);
gsize len;
- uint8_t buf[CHR_READ_BUF_LEN];
+ QEMU_UNINITIALIZED uint8_t buf[CHR_READ_BUF_LEN];
ssize_t ret;
len = sizeof(buf);
--
2.39.3

View File

@ -0,0 +1,49 @@
From d56a8ce56f0de70ab2de266a80e25cf309e72fda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:44 +0100
Subject: [PATCH 32/57] chardev/char-socket: skip automatic zero-init of large
array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [6/30] 86a2ac03efa1838fb30931c38945ee77de9bbe06 (stefanha/centos-stream-qemu-kvm)
The 'tcp_chr_read' method has a 4k byte array used for copying
data between the socket and device. Skip the automatic zero-init
of this array to eliminate the performance overhead in the I/O
hot path.
The 'buf' array will be fully initialized when reading data off
the network socket.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-7-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 9a23075cef1ac6e73a95a489ac72f41c573ceb9b)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
chardev/char-socket.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 1ca9441b1b..99d644e89f 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -497,7 +497,7 @@ static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque)
{
Chardev *chr = CHARDEV(opaque);
SocketChardev *s = SOCKET_CHARDEV(opaque);
- uint8_t buf[CHR_READ_BUF_LEN];
+ QEMU_UNINITIALIZED uint8_t buf[CHR_READ_BUF_LEN];
int len, size;
if ((s->state != TCP_CHARDEV_STATE_CONNECTED) ||
--
2.39.3

View File

@ -0,0 +1,53 @@
From 389c3c6b4215c9be3fd784c73af0e9795e796380 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Tue, 18 Feb 2025 19:25:35 +0100
Subject: [PATCH 5/9] docs/devel/reset: Document reset expectations for DMA and
IOMMU
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 341: Fix vIOMMU reset order
RH-Jira: RHEL-7188
RH-Acked-by: Peter Xu <peterx@redhat.com>
RH-Acked-by: Donald Dutile <None>
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Commit: [5/5] be8b9d9e34a2b301430dfa229c6785ab17d3fb16 (eauger1/centos-qemu-kvm)
To avoid any translation faults, the IOMMUs are expected to be
reset after the devices they protect. Document that we expect
DMA requests to be stopped during the 'enter' or 'hold' phase
while IOMMUs should be reset during the 'exit' phase.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Message-Id: <20250218182737.76722-6-eric.auger@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
(cherry picked from commit dd6d545e8f2d9a0e8a8c287ec16469f03ef5c198)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
docs/devel/reset.rst | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/docs/devel/reset.rst b/docs/devel/reset.rst
index 9746a4e8a0..24ab630465 100644
--- a/docs/devel/reset.rst
+++ b/docs/devel/reset.rst
@@ -123,6 +123,11 @@ The *exit* phase is executed only when the last reset operation ends. Therefore
the object does not need to care how many of reset controllers it has and how
many of them have started a reset.
+DMA capable devices are expected to cancel all outstanding DMA operations
+during either 'enter' or 'hold' phases. IOMMUs are expected to reset during
+the 'exit' phase and this sequencing makes sure no outstanding DMA request
+will fault.
+
Handling reset in a resettable object
-------------------------------------
--
2.48.1

View File

@ -0,0 +1,42 @@
From d565fe385b3c45a41fa8e25942220aff38a04fc3 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 29 Apr 2025 17:05:41 +0200
Subject: [PATCH 2/3] file-posix: Define DM_MPATH_PROBE_PATHS
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 372: file-posix: Fix multipath failover with SCSI passthrough [9.7]
RH-Jira: RHEL-95408
RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [1/2] 7615906833a6bb2b4645fa5cd60d78aa9631cb7c (kmwolf/centos-qemu-kvm)
While the kernel side isn't merged yet and we're still using old kernel
headers, just define DM_MPATH_PROBE_PATHS manually.
This is a downstream-only patch that can be removed after the next minor
release.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block/file-posix.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/block/file-posix.c b/block/file-posix.c
index 0cb4e922c0..6a5c506549 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -134,6 +134,11 @@
#define RAW_LOCK_PERM_BASE 100
#define RAW_LOCK_SHARED_BASE 200
+/* TODO Remove this when the kernel side is merged */
+#if !defined(DM_MPATH_PROBE_PATHS) && defined(DM_GET_TARGET_VERSION)
+#define DM_MPATH_PROBE_PATHS _IO(DM_IOCTL, DM_GET_TARGET_VERSION_CMD + 1)
+#endif
+
typedef struct BDRVRawState {
int fd;
bool use_lock;
--
2.48.1

View File

@ -0,0 +1,46 @@
From 3515c6541f71817727a3a8b18ec5252644b51bc0 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 29 Apr 2025 17:56:54 +0200
Subject: [PATCH 5/5] file-posix: Fix crash on discard_granularity == 0
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 355: file-posix: probe discard alignment on Linux block devices
RH-Jira: RHEL-86032
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
RH-Acked-by: Eric Blake <eblake@redhat.com>
RH-Commit: [3/3] b8139a4c5b19efff1f15c314447a6abb89db0ae7 (stefanha/centos-stream-qemu-kvm)
Block devices that don't support discard have a discard_granularity of
0. Currently, this results in a division by zero when we try to make
sure that it's a multiple of request_alignment. Only try to update
bs->bl.pdiscard_alignment when we got a non-zero discard_granularity
from sysfs.
Fixes: f605796aae4 ('file-posix: probe discard alignment on Linux block devices')
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20250429155654.102735-1-kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 71a30d54e6ab1d5c102a8bee2c263414697402ea)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
block/file-posix.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/block/file-posix.c b/block/file-posix.c
index 3d5b024459..0cb4e922c0 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1565,7 +1565,7 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
int ret;
ret = hdev_get_pdiscard_alignment(&st, &dalign);
- if (ret == 0) {
+ if (ret == 0 && dalign != 0) {
uint32_t ralign = bs->bl.request_alignment;
/* Probably never happens, but handle it just in case */
--
2.48.1

View File

@ -0,0 +1,215 @@
From 95c651ba1177bd88dbd9b52fe2ec8fedadcdb5c8 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 22 May 2025 15:08:03 +0200
Subject: [PATCH 3/3] file-posix: Probe paths and retry SG_IO on potential path
errors
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 372: file-posix: Fix multipath failover with SCSI passthrough [9.7]
RH-Jira: RHEL-95408
RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [2/2] 4312e9ec609e511afdfb6634e1d2370032d41543 (kmwolf/centos-qemu-kvm)
When scsi-block is used on a host multipath device, it runs into the
problem that the kernel dm-mpath doesn't know anything about SCSI or
SG_IO and therefore can't decide if a SG_IO request returned an error
and needs to be retried on a different path. Instead of getting working
failover, an error is returned to scsi-block and handled according to
the configured error policy. Obviously, this is not what users want,
they want working failover.
QEMU can parse the SG_IO result and determine whether this could have
been a path error, but just retrying the same request could just send it
to the same failing path again and result in the same error.
With a kernel that supports the DM_MPATH_PROBE_PATHS ioctl on dm-mpath
block devices (queued in the device mapper tree for Linux 6.16), we can
tell the kernel to probe all paths and tell us if any usable paths
remained. If so, we can now retry the SG_IO ioctl and expect it to be
sent to a working path.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20250522130803.34738-1-kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Hanna Czenczek <hreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit bf627788ef17721955bfcfba84209a07ae5f54ea)
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block/file-posix.c | 115 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 114 insertions(+), 1 deletion(-)
diff --git a/block/file-posix.c b/block/file-posix.c
index 6a5c506549..f17a3f4d10 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -41,6 +41,7 @@
#include "scsi/pr-manager.h"
#include "scsi/constants.h"
+#include "scsi/utils.h"
#if defined(__APPLE__) && (__MACH__)
#include <sys/ioctl.h>
@@ -72,6 +73,7 @@
#include <linux/blkzoned.h>
#endif
#include <linux/cdrom.h>
+#include <linux/dm-ioctl.h>
#include <linux/fd.h>
#include <linux/fs.h>
#include <linux/hdreg.h>
@@ -139,6 +141,22 @@
#define DM_MPATH_PROBE_PATHS _IO(DM_IOCTL, DM_GET_TARGET_VERSION_CMD + 1)
#endif
+/*
+ * Multiple retries are mostly meant for two separate scenarios:
+ *
+ * - DM_MPATH_PROBE_PATHS returns success, but before SG_IO completes, another
+ * path goes down.
+ *
+ * - DM_MPATH_PROBE_PATHS failed all paths in the current path group, so we have
+ * to send another SG_IO to switch to another path group to probe the paths in
+ * it.
+ *
+ * Even if each path is in a separate path group (path_grouping_policy set to
+ * failover), it's rare to have more than eight path groups - and even then
+ * pretty unlikely that only bad path groups would be chosen in eight retries.
+ */
+#define SG_IO_MAX_RETRIES 8
+
typedef struct BDRVRawState {
int fd;
bool use_lock;
@@ -166,6 +184,7 @@ typedef struct BDRVRawState {
bool use_linux_aio:1;
bool has_laio_fdsync:1;
bool use_linux_io_uring:1;
+ bool use_mpath:1;
int page_cache_inconsistent; /* errno from fdatasync failure */
bool has_fallocate;
bool needs_alignment;
@@ -4248,15 +4267,105 @@ hdev_open_Mac_error:
/* Since this does ioctl the device must be already opened */
bs->sg = hdev_is_sg(bs);
+ /* sg devices aren't even block devices and can't use dm-mpath */
+ s->use_mpath = !bs->sg;
+
return ret;
}
#if defined(__linux__)
+#if defined(DM_MPATH_PROBE_PATHS)
+static bool coroutine_fn sgio_path_error(int ret, sg_io_hdr_t *io_hdr)
+{
+ if (ret < 0) {
+ switch (ret) {
+ case -ENODEV:
+ return true;
+ case -EAGAIN:
+ /*
+ * The device is probably suspended. This happens while the dm table
+ * is reloaded, e.g. because a path is added or removed. This is an
+ * operation that should complete within 1ms, so just wait a bit and
+ * retry.
+ *
+ * If the device was suspended for another reason, we'll wait and
+ * retry SG_IO_MAX_RETRIES times. This is a tolerable delay before
+ * we return an error and potentially stop the VM.
+ */
+ qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ if (io_hdr->host_status != SCSI_HOST_OK) {
+ return true;
+ }
+
+ switch (io_hdr->status) {
+ case GOOD:
+ case CONDITION_GOOD:
+ case INTERMEDIATE_GOOD:
+ case INTERMEDIATE_C_GOOD:
+ case RESERVATION_CONFLICT:
+ case COMMAND_TERMINATED:
+ return false;
+ case CHECK_CONDITION:
+ return !scsi_sense_buf_is_guest_recoverable(io_hdr->sbp,
+ io_hdr->mx_sb_len);
+ default:
+ return true;
+ }
+}
+
+static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret)
+{
+ BDRVRawState *s = acb->bs->opaque;
+ RawPosixAIOData probe_acb;
+
+ if (!s->use_mpath) {
+ return false;
+ }
+
+ if (!sgio_path_error(ret, acb->ioctl.buf)) {
+ return false;
+ }
+
+ probe_acb = (RawPosixAIOData) {
+ .bs = acb->bs,
+ .aio_type = QEMU_AIO_IOCTL,
+ .aio_fildes = s->fd,
+ .aio_offset = 0,
+ .ioctl = {
+ .buf = NULL,
+ .cmd = DM_MPATH_PROBE_PATHS,
+ },
+ };
+
+ ret = raw_thread_pool_submit(handle_aiocb_ioctl, &probe_acb);
+ if (ret == -ENOTTY) {
+ s->use_mpath = false;
+ } else if (ret == -EAGAIN) {
+ /* The device might be suspended for a table reload, worth retrying */
+ return true;
+ }
+
+ return ret == 0;
+}
+#else
+static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret)
+{
+ return false;
+}
+#endif /* DM_MPATH_PROBE_PATHS */
+
static int coroutine_fn
hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
BDRVRawState *s = bs->opaque;
RawPosixAIOData acb;
+ int retries = SG_IO_MAX_RETRIES;
int ret;
ret = fd_open(bs);
@@ -4284,7 +4393,11 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
},
};
- return raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
+ do {
+ ret = raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
+ } while (req == SG_IO && retries-- && hdev_co_ioctl_sgio_retry(&acb, ret));
+
+ return ret;
}
#endif /* linux */
--
2.48.1

View File

@ -0,0 +1,64 @@
From 39e0c370357a414abacd64fb6a172e7b25eb4d82 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Fri, 9 May 2025 15:40:19 -0500
Subject: [PATCH 04/16] file-posix, gluster: Handle zero block status hint
better
RH-Author: Eric Blake <eblake@redhat.com>
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
RH-Jira: RHEL-82906 RHEL-83015
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [2/14] 1f7b47ce5f5fb321aee41a16accf5bce3d1bfe95 (ebblake/centos-qemu-kvm)
Although the previous patch to change 'bool want_zero' into a bitmask
made no semantic change, it is now time to differentiate. When the
caller specifically wants to know what parts of the file read as zero,
we need to use lseek and actually reporting holes, rather than
short-circuiting and advertising full allocation.
This change will be utilized in later patches to let mirroring
optimize for the case when the destination already reads as zeroes.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20250509204341.3553601-17-eblake@redhat.com>
(cherry picked from commit a6a0a7fb0e327d17594c971b4a39de14e025b415)
Jira: https://issues.redhat.com/browse/RHEL-82906
Jira: https://issues.redhat.com/browse/RHEL-83015
Signed-off-by: Eric Blake <eblake@redhat.com>
---
block/file-posix.c | 3 ++-
block/gluster.c | 2 +-
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/block/file-posix.c b/block/file-posix.c
index 9ca55620ca..ce5da2b4c2 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -3293,7 +3293,8 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
return ret;
}
- if (mode != BDRV_WANT_PRECISE) {
+ if (!(mode & BDRV_WANT_ZERO)) {
+ /* There is no backing file - all bytes are allocated in this file. */
*pnum = bytes;
*map = offset;
*file = bs;
diff --git a/block/gluster.c b/block/gluster.c
index ae5c45666b..175c70164c 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -1483,7 +1483,7 @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs,
return ret;
}
- if (mode != BDRV_WANT_PRECISE) {
+ if (!(mode & BDRV_WANT_ZERO)) {
*pnum = bytes;
*map = offset;
*file = bs;
--
2.48.1

View File

@ -0,0 +1,131 @@
From 29ae77d77cabc3582267cb8a7c4fe10d279a21e6 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 17 Apr 2025 11:05:27 -0400
Subject: [PATCH 3/5] file-posix: probe discard alignment on Linux block
devices
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 355: file-posix: probe discard alignment on Linux block devices
RH-Jira: RHEL-86032
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
RH-Acked-by: Eric Blake <eblake@redhat.com>
RH-Commit: [1/3] bb3c17b0da6edeb209874e97d4e2c3b1762a1749 (stefanha/centos-stream-qemu-kvm)
Populate the pdiscard_alignment block limit so the block layer is able
align discard requests correctly.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20250417150528.76470-2-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit f605796aae42885034400c83ed6a9b07cd6d6481)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
block/file-posix.c | 67 +++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 66 insertions(+), 1 deletion(-)
diff --git a/block/file-posix.c b/block/file-posix.c
index ff928b5e85..3d5b024459 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1268,10 +1268,10 @@ static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
}
#endif /* defined(CONFIG_BLKZONED) */
+#ifdef CONFIG_LINUX
/*
* Get a sysfs attribute value as a long integer.
*/
-#ifdef CONFIG_LINUX
static long get_sysfs_long_val(struct stat *st, const char *attribute)
{
g_autofree char *str = NULL;
@@ -1291,6 +1291,30 @@ static long get_sysfs_long_val(struct stat *st, const char *attribute)
}
return ret;
}
+
+/*
+ * Get a sysfs attribute value as a uint32_t.
+ */
+static int get_sysfs_u32_val(struct stat *st, const char *attribute,
+ uint32_t *u32)
+{
+ g_autofree char *str = NULL;
+ const char *end;
+ unsigned int val;
+ int ret;
+
+ ret = get_sysfs_str_val(st, attribute, &str);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* The file is ended with '\n', pass 'end' to accept that. */
+ ret = qemu_strtoui(str, &end, 10, &val);
+ if (ret == 0 && end && *end == '\0') {
+ *u32 = val;
+ }
+ return ret;
+}
#endif
static int hdev_get_max_segments(int fd, struct stat *st)
@@ -1310,6 +1334,23 @@ static int hdev_get_max_segments(int fd, struct stat *st)
#endif
}
+/*
+ * Fills in *dalign with the discard alignment and returns 0 on success,
+ * -errno otherwise.
+ */
+static int hdev_get_pdiscard_alignment(struct stat *st, uint32_t *dalign)
+{
+#ifdef CONFIG_LINUX
+ /*
+ * Note that Linux "discard_granularity" is QEMU "discard_alignment". Linux
+ * "discard_alignment" is something else.
+ */
+ return get_sysfs_u32_val(st, "discard_granularity", dalign);
+#else
+ return -ENOTSUP;
+#endif
+}
+
#if defined(CONFIG_BLKZONED)
/*
* If the reset_all flag is true, then the wps of zone whose state is
@@ -1519,6 +1560,30 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
}
}
+ if (S_ISBLK(st.st_mode)) {
+ uint32_t dalign = 0;
+ int ret;
+
+ ret = hdev_get_pdiscard_alignment(&st, &dalign);
+ if (ret == 0) {
+ uint32_t ralign = bs->bl.request_alignment;
+
+ /* Probably never happens, but handle it just in case */
+ if (dalign < ralign && (ralign % dalign == 0)) {
+ dalign = ralign;
+ }
+
+ /* The block layer requires a multiple of request_alignment */
+ if (dalign % ralign != 0) {
+ error_setg(errp, "Invalid pdiscard_alignment limit %u is not a "
+ "multiple of request_alignment %u", dalign, ralign);
+ return;
+ }
+
+ bs->bl.pdiscard_alignment = dalign;
+ }
+ }
+
raw_refresh_zoned_limits(bs, &st, errp);
}
--
2.48.1

View File

@ -0,0 +1,123 @@
From a3dfbe30e930c8d794057e45fffd91a9b0e6afd0 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Tue, 18 Feb 2025 19:25:33 +0100
Subject: [PATCH 3/9] hw/arm/smmuv3: Move reset to exit phase
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 341: Fix vIOMMU reset order
RH-Jira: RHEL-7188
RH-Acked-by: Peter Xu <peterx@redhat.com>
RH-Acked-by: Donald Dutile <None>
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Commit: [3/5] e291cb45c32e0fab49b200c275553bbe76b97264 (eauger1/centos-qemu-kvm)
Currently the iommu may be reset before the devices
it protects. For example this happens with virtio-scsi-pci.
when system_reset is issued from qmp monitor: spurious
"virtio: zero sized buffers are not allowed" warnings can
be observed. This happens because outstanding DMA requests
are still happening while the SMMU gets reset.
This can also happen with VFIO devices. In that case
spurious DMA translation faults can be observed on host.
Make sure the SMMU is reset in the 'exit' phase after
all DMA capable devices have been reset during the 'enter'
or 'hold' phase.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Message-Id: <20250218182737.76722-4-eric.auger@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
(cherry picked from commit e39e3f8b8dea856f141e9945167d2b18021ef445)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/arm/smmu-common.c | 9 +++++++--
hw/arm/smmuv3.c | 14 ++++++++++----
hw/arm/trace-events | 1 +
3 files changed, 18 insertions(+), 6 deletions(-)
diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index 3f82728758..f4210fcbc1 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -924,7 +924,12 @@ static void smmu_base_realize(DeviceState *dev, Error **errp)
}
}
-static void smmu_base_reset_hold(Object *obj, ResetType type)
+/*
+ * Make sure the IOMMU is reset in 'exit' phase after
+ * all outstanding DMA requests have been quiesced during
+ * the 'enter' or 'hold' reset phases
+ */
+static void smmu_base_reset_exit(Object *obj, ResetType type)
{
SMMUState *s = ARM_SMMU(obj);
@@ -950,7 +955,7 @@ static void smmu_base_class_init(ObjectClass *klass, void *data)
device_class_set_props(dc, smmu_dev_properties);
device_class_set_parent_realize(dc, smmu_base_realize,
&sbc->parent_realize);
- rc->phases.hold = smmu_base_reset_hold;
+ rc->phases.exit = smmu_base_reset_exit;
}
static const TypeInfo smmu_base_info = {
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 3971976389..2e90570915 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -1870,13 +1870,19 @@ static void smmu_init_irq(SMMUv3State *s, SysBusDevice *dev)
}
}
-static void smmu_reset_hold(Object *obj, ResetType type)
+/*
+ * Make sure the IOMMU is reset in 'exit' phase after
+ * all outstanding DMA requests have been quiesced during
+ * the 'enter' or 'hold' reset phases
+ */
+static void smmu_reset_exit(Object *obj, ResetType type)
{
SMMUv3State *s = ARM_SMMUV3(obj);
SMMUv3Class *c = ARM_SMMUV3_GET_CLASS(s);
- if (c->parent_phases.hold) {
- c->parent_phases.hold(obj, type);
+ trace_smmu_reset_exit();
+ if (c->parent_phases.exit) {
+ c->parent_phases.exit(obj, type);
}
smmuv3_init_regs(s);
@@ -1999,7 +2005,7 @@ static void smmuv3_class_init(ObjectClass *klass, void *data)
SMMUv3Class *c = ARM_SMMUV3_CLASS(klass);
dc->vmsd = &vmstate_smmuv3;
- resettable_class_set_parent_phases(rc, NULL, smmu_reset_hold, NULL,
+ resettable_class_set_parent_phases(rc, NULL, NULL, smmu_reset_exit,
&c->parent_phases);
device_class_set_parent_realize(dc, smmu_realize,
&c->parent_realize);
diff --git a/hw/arm/trace-events b/hw/arm/trace-events
index be6c8f720b..79ef347e3e 100644
--- a/hw/arm/trace-events
+++ b/hw/arm/trace-events
@@ -56,6 +56,7 @@ smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid=0x%x"
smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s"
smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s"
smmuv3_inv_notifiers_iova(const char *name, int asid, int vmid, uint64_t iova, uint8_t tg, uint64_t num_pages, int stage) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64" stage=%d"
+smmu_reset_exit(void) ""
# strongarm.c
strongarm_uart_update_parameters(const char *label, int speed, char parity, int data_bits, int stop_bits) "%s speed=%d parity=%c data=%d stop=%d"
--
2.48.1

View File

@ -0,0 +1,57 @@
From 2018f62f2242d8d4a970d83ebef9b3c2bccf6fda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:45 +0100
Subject: [PATCH 33/57] hw/audio/ac97: skip automatic zero-init of large arrays
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [7/30] 4a6b59a9b9122d9f89e99b3e44df19e6d92ed941 (stefanha/centos-stream-qemu-kvm)
The 'read_audio' & 'write_audio' methods have a 4k byte array used
for copying data between the audio backend and device. Skip the
automatic zero-init of these arrays to eliminate the performance
overhead in the I/O hot path.
The 'tmpbuf' array will be fully initialized when reading data from
the audio backend and/or device memory.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-8-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 2553d2d26a9d0f46386bf8c37d184567e5cede6c)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/audio/ac97.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/hw/audio/ac97.c b/hw/audio/ac97.c
index 3f0053f94d..681b5752a1 100644
--- a/hw/audio/ac97.c
+++ b/hw/audio/ac97.c
@@ -886,7 +886,7 @@ static void nabm_writel(void *opaque, uint32_t addr, uint32_t val)
static int write_audio(AC97LinkState *s, AC97BusMasterRegs *r,
int max, int *stop)
{
- uint8_t tmpbuf[4096];
+ QEMU_UNINITIALIZED uint8_t tmpbuf[4096];
uint32_t addr = r->bd.addr;
uint32_t temp = r->picb << 1;
uint32_t written = 0;
@@ -959,7 +959,7 @@ static void write_bup(AC97LinkState *s, int elapsed)
static int read_audio(AC97LinkState *s, AC97BusMasterRegs *r,
int max, int *stop)
{
- uint8_t tmpbuf[4096];
+ QEMU_UNINITIALIZED uint8_t tmpbuf[4096];
uint32_t addr = r->bd.addr;
uint32_t temp = r->picb << 1;
uint32_t nread = 0;
--
2.39.3

View File

@ -0,0 +1,59 @@
From bd32bb22fb324a37b31ed9ac3387524f6f4ea5be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:46 +0100
Subject: [PATCH 34/57] hw/audio/cs4231a: skip automatic zero-init of large
arrays
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [8/30] 6c454bcc2927e49896c62718287fb9e4b37b3bb9 (stefanha/centos-stream-qemu-kvm)
The 'cs_write_audio' method has a pair of byte arrays, one 4k in size
and one 8k, which are used in converting audio samples. Skip the
automatic zero-init of these arrays to eliminate the performance
overhead in the I/O hot path.
The 'tmpbuf' array will be fully initialized when reading a block of
data from the guest. The 'linbuf' array will be fully initialized
when converting the audio samples.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-9-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit ca2cc0385d97cea66cd54ee42553f385c403d4a6)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/audio/cs4231a.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/hw/audio/cs4231a.c b/hw/audio/cs4231a.c
index 9ef57f042d..5c312642cc 100644
--- a/hw/audio/cs4231a.c
+++ b/hw/audio/cs4231a.c
@@ -528,7 +528,7 @@ static int cs_write_audio (CSState *s, int nchan, int dma_pos,
int dma_len, int len)
{
int temp, net;
- uint8_t tmpbuf[4096];
+ QEMU_UNINITIALIZED uint8_t tmpbuf[4096];
IsaDmaClass *k = ISADMA_GET_CLASS(s->isa_dma);
temp = len;
@@ -547,7 +547,7 @@ static int cs_write_audio (CSState *s, int nchan, int dma_pos,
copied = k->read_memory(s->isa_dma, nchan, tmpbuf, dma_pos, to_copy);
if (s->tab) {
int i;
- int16_t linbuf[4096];
+ QEMU_UNINITIALIZED int16_t linbuf[4096];
for (i = 0; i < copied; ++i)
linbuf[i] = s->tab[tmpbuf[i]];
--
2.39.3

View File

@ -0,0 +1,49 @@
From cb12ddc6ed836091aa7724e2f77ab79cd9089cad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:47 +0100
Subject: [PATCH 35/57] hw/audio/es1370: skip automatic zero-init of large
array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [9/30] b992e4247d8d31dc09f9dc7671e7a532558174ec (stefanha/centos-stream-qemu-kvm)
The 'es1370_transfer_audio' method has a 4k byte array used for
copying data between the audio backend and device. Skip the automatic
zero-init of this array to eliminate the performance overhead in
the I/O hot path.
The 'tmpbuf' array will be fully initialized when reading data from
the audio backend and/or device memory.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-10-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 8236e206084b832d1d7ec947a4798b818f4cdf1f)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/audio/es1370.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/audio/es1370.c b/hw/audio/es1370.c
index 4ab61d3b9d..6aea934f54 100644
--- a/hw/audio/es1370.c
+++ b/hw/audio/es1370.c
@@ -604,7 +604,7 @@ static uint64_t es1370_read(void *opaque, hwaddr addr, unsigned size)
static void es1370_transfer_audio (ES1370State *s, struct chan *d, int loop_sel,
int max, bool *irq)
{
- uint8_t tmpbuf[4096];
+ QEMU_UNINITIALIZED uint8_t tmpbuf[4096];
size_t to_transfer;
uint32_t addr = d->frame_addr;
int sc = d->scount & 0xffff;
--
2.39.3

View File

@ -0,0 +1,48 @@
From 9ad7091d82fd0577488f27ab54bb7851fe957020 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:48 +0100
Subject: [PATCH 36/57] hw/audio/gus: skip automatic zero-init of large array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [10/30] 366953d0417ac31e3060fdc327fe8dade3375bf0 (stefanha/centos-stream-qemu-kvm)
The 'GUS_read_DMA' method has a 4k byte array used for copying
data between the audio backend and device. Skip the automatic
zero-init of this array to eliminate the performance overhead in
the I/O hot path.
The 'tmpbuf' array will be fully initialized when reading data
from device memory.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-11-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 2e438da4929018c62609381e1156aac0b2fe3de3)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/audio/gus.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/audio/gus.c b/hw/audio/gus.c
index 4beb3fd74e..e8b0b85d44 100644
--- a/hw/audio/gus.c
+++ b/hw/audio/gus.c
@@ -183,7 +183,7 @@ static int GUS_read_DMA (void *opaque, int nchan, int dma_pos, int dma_len)
{
GUSState *s = opaque;
IsaDmaClass *k = ISADMA_GET_CLASS(s->isa_dma);
- char tmpbuf[4096];
+ QEMU_UNINITIALIZED char tmpbuf[4096];
int pos = dma_pos, mode, left = dma_len - dma_pos;
ldebug ("read DMA %#x %d\n", dma_pos, dma_len);
--
2.39.3

View File

@ -0,0 +1,50 @@
From 5cf61823cbe80b1ace2f5bdb9cc1971956425b98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:49 +0100
Subject: [PATCH 37/57] hw/audio/marvell_88w8618: skip automatic zero-init of
large array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [11/30] e09cdb76430552081168873dadfef1b5c8f74327 (stefanha/centos-stream-qemu-kvm)
The 'mv88w8618_audio_callback' method has a 4k byte array used for
copying data between the audio backend and device. Skip the automatic
zero-init of this array to eliminate the performance overhead in
the I/O hot path.
The 'buf' array will be fully initialized when reading data from
device memory.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-12-berrange@redhat.com
[Fixed hw/audio/gus in commit message --Stefan]
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 5b6cd5c5df4229972d8a0fd9dd9a089a1644d6ba)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/audio/marvell_88w8618.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/audio/marvell_88w8618.c b/hw/audio/marvell_88w8618.c
index cc285444bc..b7b4b27272 100644
--- a/hw/audio/marvell_88w8618.c
+++ b/hw/audio/marvell_88w8618.c
@@ -66,7 +66,7 @@ static void mv88w8618_audio_callback(void *opaque, int free_out, int free_in)
{
mv88w8618_audio_state *s = opaque;
int16_t *codec_buffer;
- int8_t buf[4096];
+ QEMU_UNINITIALIZED int8_t buf[4096];
int8_t *mem_buffer;
int pos, block_size;
--
2.39.3

View File

@ -0,0 +1,48 @@
From 0b4d59d75edd49ef99f0a82fbcbe360c5b48e4f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:50 +0100
Subject: [PATCH 38/57] hw/audio/sb16: skip automatic zero-init of large array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [12/30] 6475d67546bf04745636b317e965bcd89b6fb2d2 (stefanha/centos-stream-qemu-kvm)
The 'write_audio' method has a 4k byte array used for copying data
between the audio backend and device. Skip the automatic zero-init
of this array to eliminate the performance overhead in the I/O hot
path.
The 'tmpbuf' array will be fully initialized when reading data from
device memory.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-13-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 30c82f6657c1ee9fbb5473924b4d3273f214bd6f)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/audio/sb16.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/audio/sb16.c b/hw/audio/sb16.c
index fd76e78d18..04c818ed3d 100644
--- a/hw/audio/sb16.c
+++ b/hw/audio/sb16.c
@@ -1181,7 +1181,7 @@ static int write_audio (SB16State *s, int nchan, int dma_pos,
IsaDma *isa_dma = nchan == s->dma ? s->isa_dma : s->isa_hdma;
IsaDmaClass *k = ISADMA_GET_CLASS(isa_dma);
int temp, net;
- uint8_t tmpbuf[4096];
+ QEMU_UNINITIALIZED uint8_t tmpbuf[4096];
temp = len;
net = 0;
--
2.39.3

View File

@ -0,0 +1,49 @@
From 35332282ef8bd06f59206266006eff222ffe6bec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:51 +0100
Subject: [PATCH 39/57] hw/audio/via-ac97: skip automatic zero-init of large
array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [13/30] 6391a04b29fcbb8bcdbce2c6b786758fc34f0d71 (stefanha/centos-stream-qemu-kvm)
The 'out_cb' method has a 4k byte array used for copying data
between the audio backend and device. Skip the automatic zero-init
of this array to eliminate the performance overhead in the I/O hot
path.
The 'tmpbuf' array will be fully initialized when reading data from
device memory.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-14-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit bb71d9fe1419f44529c91d1b09464718d157e647)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/audio/via-ac97.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/audio/via-ac97.c b/hw/audio/via-ac97.c
index 4c127a1def..e8fcf44e5d 100644
--- a/hw/audio/via-ac97.c
+++ b/hw/audio/via-ac97.c
@@ -175,7 +175,7 @@ static void out_cb(void *opaque, int avail)
ViaAC97SGDChannel *c = &s->aur;
int temp, to_copy, copied;
bool stop = false;
- uint8_t tmpbuf[4096];
+ QEMU_UNINITIALIZED uint8_t tmpbuf[4096];
if (c->stat & STAT_PAUSED) {
return;
--
2.39.3

View File

@ -0,0 +1,49 @@
From b0c16a93460c2dfe834a9f439d25dc833dfb7427 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:52 +0100
Subject: [PATCH 40/57] hw/char/sclpconsole-lm: skip automatic zero-init of
large array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [14/30] 1491e0147a799ec523fa67fd49649722a07299e7 (stefanha/centos-stream-qemu-kvm)
The 'process_mdb' method has a 4k byte array used for copying data
between the guest and the chardev backend. Skip the automatic zero-init
of this array to eliminate the performance overhead in the I/O hot
path.
The 'buffer' array will be selectively initialized when data is converted
between EBCDIC and ASCII.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-15-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 8b1dac1ad57082611419b0e2f347acd96115d25f)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/char/sclpconsole-lm.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/char/sclpconsole-lm.c b/hw/char/sclpconsole-lm.c
index 7719f438f6..19e64b92f6 100644
--- a/hw/char/sclpconsole-lm.c
+++ b/hw/char/sclpconsole-lm.c
@@ -214,7 +214,7 @@ static int process_mdb(SCLPEvent *event, MDBO *mdbo)
{
int rc;
int len;
- uint8_t buffer[SIZE_BUFFER];
+ QEMU_UNINITIALIZED uint8_t buffer[SIZE_BUFFER];
len = be16_to_cpu(mdbo->length);
len -= sizeof(mdbo->length) + sizeof(mdbo->type)
--
2.39.3

View File

@ -0,0 +1,49 @@
From 7b5624efccf55184278c6f4924efc2141df460f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:54 +0100
Subject: [PATCH 42/57] hw/display/vmware_vga: skip automatic zero-init of
large struct
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [16/30] 4aaf459d4356bf28164be742889b9a78d3656703 (stefanha/centos-stream-qemu-kvm)
The 'vmsvga_fifo_run' method has a struct which is a little over 20k
in size, used for holding image data for cursor changes. Skip the
automatic zero-init of this struct to eliminate the performance
overhead in the I/O hot path.
The cursor variable will be fully initialized only when processing
a cursor definition message from the guest.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-17-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 7048e70f391df76d009eecca25f8027858f9f304)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/display/vmware_vga.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/display/vmware_vga.c b/hw/display/vmware_vga.c
index 3db3ff98f7..69afe98a2f 100644
--- a/hw/display/vmware_vga.c
+++ b/hw/display/vmware_vga.c
@@ -618,7 +618,7 @@ static void vmsvga_fifo_run(struct vmsvga_state_s *s)
uint32_t cmd, colour;
int args, len, maxloop = 1024;
int x, y, dx, dy, width, height;
- struct vmsvga_cursor_definition_s cursor;
+ QEMU_UNINITIALIZED struct vmsvga_cursor_definition_s cursor;
uint32_t cmd_start;
len = vmsvga_fifo_length(s);
--
2.39.3

View File

@ -0,0 +1,47 @@
From cd3500c9e248dbefb36273046e6eee44ee0d5cbe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:53 +0100
Subject: [PATCH 41/57] hw/dma/xlnx_csu_dma: skip automatic zero-init of large
array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [15/30] 063c88269c7d3bf07ae05aaf2d3d154e2016db81 (stefanha/centos-stream-qemu-kvm)
The 'xlnx_csu_dma_src_notify' method has a 4k byte array used for
copying DMA data. Skip the automatic zero-init of this array to
eliminate the performance overhead in the I/O hot path.
The 'buf' array will be fully initialized when data is copied.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-16-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit ce14f24611aa0469b464a9512e192b4fd51dca2b)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/dma/xlnx_csu_dma.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/dma/xlnx_csu_dma.c b/hw/dma/xlnx_csu_dma.c
index ae307482f2..9d1cccc5ca 100644
--- a/hw/dma/xlnx_csu_dma.c
+++ b/hw/dma/xlnx_csu_dma.c
@@ -287,7 +287,7 @@ static uint32_t xlnx_csu_dma_advance(XlnxCSUDMA *s, uint32_t len)
static void xlnx_csu_dma_src_notify(void *opaque)
{
XlnxCSUDMA *s = XLNX_CSU_DMA(opaque);
- unsigned char buf[4 * 1024];
+ QEMU_UNINITIALIZED unsigned char buf[4 * 1024];
size_t rlen = 0;
ptimer_transaction_begin(s->src_timer);
--
2.39.3

View File

@ -0,0 +1,56 @@
From a4673aab85958c60867b12c65cc3483d734bb6e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:55 +0100
Subject: [PATCH 43/57] hw/hyperv/syndbg: skip automatic zero-init of large
array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [17/30] 5f71779c431128601baf46115fe65178532a3836 (stefanha/centos-stream-qemu-kvm)
The 'handle_recv_msg' method has a 4k byte array used for copying
data between the network socket and guest memory. Skip the automatic
zero-init of this array to eliminate the performance overhead in the
I/O hot path.
The 'data_buf' array will be fully initialized when data is read
off the network socket.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-18-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 5a1f614d0cd0bcc8e84e0b7ab6af63d56bd348a2)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Conflicts:
hw/hyperv/syndbg.c
Context conflict due to missing commit 3efb9d226221
("hw/hyperv/syndbg: common compilation unit") downstream. There is no
need to backport the commit because it's not a bug fix.
---
hw/hyperv/syndbg.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/hyperv/syndbg.c b/hw/hyperv/syndbg.c
index 065e12fb1e..c7c43c8009 100644
--- a/hw/hyperv/syndbg.c
+++ b/hw/hyperv/syndbg.c
@@ -188,7 +188,7 @@ static uint16_t handle_recv_msg(HvSynDbg *syndbg, uint64_t outgpa,
uint64_t timeout, uint32_t *retrieved_count)
{
uint16_t ret;
- uint8_t data_buf[TARGET_PAGE_SIZE - UDP_PKT_HEADER_SIZE];
+ QEMU_UNINITIALIZED uint8_t data_buf[TARGET_PAGE_SIZE - UDP_PKT_HEADER_SIZE];
hwaddr out_len;
void *out_data;
ssize_t recv_byte_count;
--
2.39.3

View File

@ -0,0 +1,87 @@
From 2bb5dff02fb393530a12f4f00219cd2f90cd442a Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@redhat.com>
Date: Thu, 15 May 2025 18:45:51 +0200
Subject: [PATCH 3/5] hw/i386: Fix machine type compatibility
RH-Author: Sebastian Ott <sebott@redhat.com>
RH-MergeRequest: 364: hw/i386: Fix machine type compatibility
RH-Jira: RHEL-91307
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [1/1] 44ddbcb3af119c65e99018d7ed90887f3948907e (seott1/cos-qemu-kvm)
Upstream Status: RHEL only
Ensure compatibility of rhel specific i440fx and q35 machine types.
Pick up missing bits from pc_compat_9_0 upstream.
Signed-off-by: Sebastian Ott <sebott@redhat.com>
---
hw/i386/pc.c | 8 ++++++++
hw/i386/pc_piix.c | 2 ++
hw/i386/pc_q35.c | 2 ++
include/hw/i386/pc.h | 3 +++
4 files changed, 15 insertions(+)
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index fa9f16cbaf..5237538640 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -298,6 +298,14 @@ GlobalProperty pc_rhel_compat[] = {
};
const size_t pc_rhel_compat_len = G_N_ELEMENTS(pc_rhel_compat);
+GlobalProperty pc_rhel_9_6_compat[] = {
+ /* pc_rhel_9_6_compat from pc_compat_9_0 */
+ { TYPE_X86_CPU, "x-amd-topoext-features-only", "false" },
+ { TYPE_X86_CPU, "x-l1-cache-per-thread", "false" },
+ { TYPE_X86_CPU, "legacy-multi-node", "on" },
+};
+const size_t pc_rhel_9_6_compat_len = G_N_ELEMENTS(pc_rhel_9_6_compat);
+
GlobalProperty pc_rhel_9_5_compat[] = {
/* pc_rhel_9_5_compat from pc_compat_pc_9_0 (backported from 9.1) */
{ TYPE_X86_CPU, "guest-phys-bits", "0" },
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 10764bf596..0687317db5 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -885,6 +885,8 @@ static void pc_i440fx_rhel_machine_7_6_0_options(MachineClass *m)
compat_props_add(m->compat_props, hw_compat_rhel_9_6,
hw_compat_rhel_9_6_len);
+ compat_props_add(m->compat_props, pc_rhel_9_6_compat,
+ pc_rhel_9_6_compat_len);
compat_props_add(m->compat_props, pc_rhel_9_5_compat,
pc_rhel_9_5_compat_len);
compat_props_add(m->compat_props, hw_compat_rhel_9_5,
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 5bf08be0fb..871c760aea 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -704,6 +704,8 @@ static void pc_q35_rhel_machine_9_4_0_options(MachineClass *m)
compat_props_add(m->compat_props, hw_compat_rhel_9_6,
hw_compat_rhel_9_6_len);
+ compat_props_add(m->compat_props, pc_rhel_9_6_compat,
+ pc_rhel_9_6_compat_len);
compat_props_add(m->compat_props, pc_rhel_9_5_compat,
pc_rhel_9_5_compat_len);
compat_props_add(m->compat_props, hw_compat_rhel_9_5,
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 75c9271cdd..2b7c18f2b0 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -305,6 +305,9 @@ extern const size_t pc_compat_2_3_len;
extern GlobalProperty pc_rhel_compat[];
extern const size_t pc_rhel_compat_len;
+extern GlobalProperty pc_rhel_9_6_compat[];
+extern const size_t pc_rhel_9_6_compat_len;
+
extern GlobalProperty pc_rhel_9_5_compat[];
extern const size_t pc_rhel_9_5_compat_len;
--
2.48.1

View File

@ -0,0 +1,117 @@
From f1ff9d3b379697a2d4627e9529067195841d86a8 Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Sun, 4 May 2025 17:04:05 +0000
Subject: [PATCH 25/57] hw/i386/amd_iommu: Allow migration when explicitly
create the AMDVI-PCI device
RH-Author: John Allen <None>
RH-MergeRequest: 380: Add ability to manually specify the AMDVI-PCI device
RH-Jira: RHEL-70925
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [2/3] a42b88116e608a79b6fae13ebe3709874f2a853f (johnalle/qemu-kvm-fork)
Add migration support for AMD IOMMU model by saving necessary AMDVIState
parameters for MMIO registers, device table, command buffer, and event
buffers.
Also change devtab_len type from size_t to uint64_t to avoid 32-bit build
issue.
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Message-Id: <20250504170405.12623-3-suravee.suthikulpanit@amd.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
(cherry picked from commit 28931c2e1591deb4bfaaf744fdc8813e96c230f1)
JIRA: https://issues.redhat.com/browse/RHEL-70925
Signed-off-by: John Allen <johnalle@redhat.com>
---
hw/i386/amd_iommu.c | 48 +++++++++++++++++++++++++++++++++++++++++++++
hw/i386/amd_iommu.h | 2 +-
2 files changed, 49 insertions(+), 1 deletion(-)
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 6a5e76cfef..a34e0c5f59 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -1611,8 +1611,55 @@ static void amdvi_sysbus_reset(DeviceState *dev)
amdvi_init(s);
}
+static const VMStateDescription vmstate_amdvi_sysbus_migratable = {
+ .name = "amd-iommu",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .priority = MIG_PRI_IOMMU,
+ .fields = (VMStateField[]) {
+ /* Updated in amdvi_handle_control_write() */
+ VMSTATE_BOOL(enabled, AMDVIState),
+ VMSTATE_BOOL(ga_enabled, AMDVIState),
+ VMSTATE_BOOL(ats_enabled, AMDVIState),
+ VMSTATE_BOOL(cmdbuf_enabled, AMDVIState),
+ VMSTATE_BOOL(completion_wait_intr, AMDVIState),
+ VMSTATE_BOOL(evtlog_enabled, AMDVIState),
+ VMSTATE_BOOL(evtlog_intr, AMDVIState),
+ /* Updated in amdvi_handle_devtab_write() */
+ VMSTATE_UINT64(devtab, AMDVIState),
+ VMSTATE_UINT64(devtab_len, AMDVIState),
+ /* Updated in amdvi_handle_cmdbase_write() */
+ VMSTATE_UINT64(cmdbuf, AMDVIState),
+ VMSTATE_UINT64(cmdbuf_len, AMDVIState),
+ /* Updated in amdvi_handle_cmdhead_write() */
+ VMSTATE_UINT32(cmdbuf_head, AMDVIState),
+ /* Updated in amdvi_handle_cmdtail_write() */
+ VMSTATE_UINT32(cmdbuf_tail, AMDVIState),
+ /* Updated in amdvi_handle_evtbase_write() */
+ VMSTATE_UINT64(evtlog, AMDVIState),
+ VMSTATE_UINT32(evtlog_len, AMDVIState),
+ /* Updated in amdvi_handle_evthead_write() */
+ VMSTATE_UINT32(evtlog_head, AMDVIState),
+ /* Updated in amdvi_handle_evttail_write() */
+ VMSTATE_UINT32(evtlog_tail, AMDVIState),
+ /* Updated in amdvi_handle_pprbase_write() */
+ VMSTATE_UINT64(ppr_log, AMDVIState),
+ VMSTATE_UINT32(pprlog_len, AMDVIState),
+ /* Updated in amdvi_handle_pprhead_write() */
+ VMSTATE_UINT32(pprlog_head, AMDVIState),
+ /* Updated in amdvi_handle_tailhead_write() */
+ VMSTATE_UINT32(pprlog_tail, AMDVIState),
+ /* MMIO registers */
+ VMSTATE_UINT8_ARRAY(mmior, AMDVIState, AMDVI_MMIO_SIZE),
+ VMSTATE_UINT8_ARRAY(romask, AMDVIState, AMDVI_MMIO_SIZE),
+ VMSTATE_UINT8_ARRAY(w1cmask, AMDVIState, AMDVI_MMIO_SIZE),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
{
+ DeviceClass *dc = (DeviceClass *) object_get_class(OBJECT(dev));
AMDVIState *s = AMD_IOMMU_DEVICE(dev);
MachineState *ms = MACHINE(qdev_get_machine());
PCMachineState *pcms = PC_MACHINE(ms);
@@ -1634,6 +1681,7 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
}
s->pci = AMD_IOMMU_PCI(pdev);
+ dc->vmsd = &vmstate_amdvi_sysbus_migratable;
} else {
s->pci = AMD_IOMMU_PCI(object_new(TYPE_AMD_IOMMU_PCI));
/* This device should take care of IOMMU PCI properties */
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
index ece71ff0b6..741dd9a910 100644
--- a/hw/i386/amd_iommu.h
+++ b/hw/i386/amd_iommu.h
@@ -329,7 +329,7 @@ struct AMDVIState {
bool excl_enabled;
hwaddr devtab; /* base address device table */
- size_t devtab_len; /* device table length */
+ uint64_t devtab_len; /* device table length */
hwaddr cmdbuf; /* command buffer base address */
uint64_t cmdbuf_len; /* command buffer length */
--
2.39.3

View File

@ -0,0 +1,57 @@
From e611119b8b4e0712ab103628051d69ea84538719 Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Tue, 25 Mar 2025 02:11:40 +0000
Subject: [PATCH 23/57] hw/i386/amd_iommu: Assign pci-id 0x1419 for the AMD
IOMMU device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: John Allen <None>
RH-MergeRequest: 379: hw/i386/amd_iommu: Assign pci-id 0x1419 for the AMD IOMMU device
RH-Jira: RHEL-70926
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [1/1] 69d847f64543caf328da3e7663e7d2ebe53cd448 (johnalle/qemu-kvm-fork)
Currently, the QEMU-emulated AMD IOMMU device use PCI vendor id 0x1022
(AMD) with device id zero (undefined). Eventhough this does not cause any
functional issue for AMD IOMMU driver since it normally uses information
in the ACPI IVRS table to probe and initialize the device per
recommendation in the AMD IOMMU specification, the device id zero causes
the Windows Device Manager utility to show the device as an unknown device.
Since Windows only recognizes AMD IOMMU device with device id 0x1419 as
listed in the machine.inf file, modify the QEMU AMD IOMMU model to use
the id 0x1419 to avoid the issue. This advertise the IOMMU as the AMD
IOMMU device for Family 15h (Models 10h-1fh).
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Message-Id: <20250325021140.5676-1-suravee.suthikulpanit@amd.com>
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Yan Vugenfirer <yvugenfi@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
(cherry picked from commit 719255486df2fcbe1b8599786b37f4bb80272f1a)
JIRA: https://issues.redhat.com/browse/RHEL-70926
Signed-off-by: John Allen <johnalle@redhat.com>
---
hw/i386/amd_iommu.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index d804656ea8..59e1a01b7c 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -1714,6 +1714,7 @@ static void amdvi_pci_class_init(ObjectClass *klass, void *data)
PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
k->vendor_id = PCI_VENDOR_ID_AMD;
+ k->device_id = 0x1419;
k->class_id = 0x0806;
k->realize = amdvi_pci_realize;
--
2.39.3

View File

@ -0,0 +1,267 @@
From 5a697d0f66360acca8216f49c06dc9702231d470 Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Sun, 4 May 2025 17:04:04 +0000
Subject: [PATCH 24/57] hw/i386/amd_iommu: Isolate AMDVI-PCI from amd-iommu
device to allow full control over the PCI device creation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: John Allen <None>
RH-MergeRequest: 380: Add ability to manually specify the AMDVI-PCI device
RH-Jira: RHEL-70925
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [1/3] 58254a72ba2d810b57c610462494f76691126521 (johnalle/qemu-kvm-fork)
Current amd-iommu model internally creates an AMDVI-PCI device. Here is
a snippet from info qtree:
bus: main-system-bus
type System
dev: amd-iommu, id ""
xtsup = false
pci-id = ""
intremap = "on"
device-iotlb = false
pt = true
...
dev: q35-pcihost, id ""
MCFG = -1 (0xffffffffffffffff)
pci-hole64-size = 34359738368 (32 GiB)
below-4g-mem-size = 134217728 (128 MiB)
above-4g-mem-size = 0 (0 B)
smm-ranges = true
x-pci-hole64-fix = true
x-config-reg-migration-enabled = true
bypass-iommu = false
bus: pcie.0
type PCIE
dev: AMDVI-PCI, id ""
addr = 01.0
romfile = ""
romsize = 4294967295 (0xffffffff)
rombar = -1 (0xffffffffffffffff)
multifunction = false
x-pcie-lnksta-dllla = true
x-pcie-extcap-init = true
failover_pair_id = ""
acpi-index = 0 (0x0)
x-pcie-err-unc-mask = true
x-pcie-ari-nextfn-1 = false
x-max-bounce-buffer-size = 4096 (4 KiB)
x-pcie-ext-tag = true
busnr = 0 (0x0)
class Class 0806, addr 00:01.0, pci id 1022:0000 (sub 1af4:1100)
...
This prohibits users from specifying the PCI topology for the amd-iommu device,
which becomes a problem when trying to support VM migration since it does not
guarantee the same enumeration of AMD IOMMU device.
Therefore, allow the 'AMDVI-PCI' device to optionally be pre-created and
associated with a 'amd-iommu' device via a new 'pci-id' parameter on the
latter.
For example:
-device AMDVI-PCI,id=iommupci0,bus=pcie.0,addr=0x05 \
-device amd-iommu,intremap=on,pt=on,xtsup=on,pci-id=iommupci0 \
For backward-compatibility, internally create the AMDVI-PCI device if not
specified on the CLI.
Co-developed-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Message-Id: <20250504170405.12623-2-suravee.suthikulpanit@amd.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
(cherry picked from commit f864a3235ea1d1d714b3cde2d9a810ea6344a7b5)
JIRA: https://issues.redhat.com/browse/RHEL-70925
Signed-off-by: John Allen <johnalle@redhat.com>
---
hw/i386/acpi-build.c | 8 +++----
hw/i386/amd_iommu.c | 53 ++++++++++++++++++++++++++------------------
hw/i386/amd_iommu.h | 3 ++-
3 files changed, 38 insertions(+), 26 deletions(-)
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 032fb1f904..236261f8aa 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2392,10 +2392,10 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id,
build_append_int_noprefix(table_data, ivhd_blob->len + 24, 2);
/* DeviceID */
build_append_int_noprefix(table_data,
- object_property_get_int(OBJECT(&s->pci), "addr",
+ object_property_get_int(OBJECT(s->pci), "addr",
&error_abort), 2);
/* Capability offset */
- build_append_int_noprefix(table_data, s->pci.capab_offset, 2);
+ build_append_int_noprefix(table_data, s->pci->capab_offset, 2);
/* IOMMU base address */
build_append_int_noprefix(table_data, s->mr_mmio.addr, 8);
/* PCI Segment Group */
@@ -2427,10 +2427,10 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id,
build_append_int_noprefix(table_data, ivhd_blob->len + 40, 2);
/* DeviceID */
build_append_int_noprefix(table_data,
- object_property_get_int(OBJECT(&s->pci), "addr",
+ object_property_get_int(OBJECT(s->pci), "addr",
&error_abort), 2);
/* Capability offset */
- build_append_int_noprefix(table_data, s->pci.capab_offset, 2);
+ build_append_int_noprefix(table_data, s->pci->capab_offset, 2);
/* IOMMU base address */
build_append_int_noprefix(table_data, s->mr_mmio.addr, 8);
/* PCI Segment Group */
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 59e1a01b7c..6a5e76cfef 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -167,11 +167,11 @@ static void amdvi_generate_msi_interrupt(AMDVIState *s)
{
MSIMessage msg = {};
MemTxAttrs attrs = {
- .requester_id = pci_requester_id(&s->pci.dev)
+ .requester_id = pci_requester_id(&s->pci->dev)
};
- if (msi_enabled(&s->pci.dev)) {
- msg = msi_get_message(&s->pci.dev, 0);
+ if (msi_enabled(&s->pci->dev)) {
+ msg = msi_get_message(&s->pci->dev, 0);
address_space_stl_le(&address_space_memory, msg.address, msg.data,
attrs, NULL);
}
@@ -239,7 +239,7 @@ static void amdvi_page_fault(AMDVIState *s, uint16_t devid,
info |= AMDVI_EVENT_IOPF_I | AMDVI_EVENT_IOPF;
amdvi_encode_event(evt, devid, addr, info);
amdvi_log_event(s, evt);
- pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
+ pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS,
PCI_STATUS_SIG_TARGET_ABORT);
}
/*
@@ -256,7 +256,7 @@ static void amdvi_log_devtab_error(AMDVIState *s, uint16_t devid,
amdvi_encode_event(evt, devid, devtab, info);
amdvi_log_event(s, evt);
- pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
+ pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS,
PCI_STATUS_SIG_TARGET_ABORT);
}
/* log an event trying to access command buffer
@@ -269,7 +269,7 @@ static void amdvi_log_command_error(AMDVIState *s, hwaddr addr)
amdvi_encode_event(evt, 0, addr, info);
amdvi_log_event(s, evt);
- pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
+ pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS,
PCI_STATUS_SIG_TARGET_ABORT);
}
/* log an illegal command event
@@ -310,7 +310,7 @@ static void amdvi_log_pagetab_error(AMDVIState *s, uint16_t devid,
info |= AMDVI_EVENT_PAGE_TAB_HW_ERROR;
amdvi_encode_event(evt, devid, addr, info);
amdvi_log_event(s, evt);
- pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
+ pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS,
PCI_STATUS_SIG_TARGET_ABORT);
}
@@ -1607,7 +1607,7 @@ static void amdvi_sysbus_reset(DeviceState *dev)
{
AMDVIState *s = AMD_IOMMU_DEVICE(dev);
- msi_reset(&s->pci.dev);
+ msi_reset(&s->pci->dev);
amdvi_init(s);
}
@@ -1619,14 +1619,32 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
X86MachineState *x86ms = X86_MACHINE(ms);
PCIBus *bus = pcms->pcibus;
- s->iotlb = g_hash_table_new_full(amdvi_uint64_hash,
- amdvi_uint64_equal, g_free, g_free);
+ if (s->pci_id) {
+ PCIDevice *pdev = NULL;
+ int ret = pci_qdev_find_device(s->pci_id, &pdev);
- /* This device should take care of IOMMU PCI properties */
- if (!qdev_realize(DEVICE(&s->pci), &bus->qbus, errp)) {
- return;
+ if (ret) {
+ error_report("Cannot find PCI device '%s'", s->pci_id);
+ return;
+ }
+
+ if (!object_dynamic_cast(OBJECT(pdev), TYPE_AMD_IOMMU_PCI)) {
+ error_report("Device '%s' must be an AMDVI-PCI device type", s->pci_id);
+ return;
+ }
+
+ s->pci = AMD_IOMMU_PCI(pdev);
+ } else {
+ s->pci = AMD_IOMMU_PCI(object_new(TYPE_AMD_IOMMU_PCI));
+ /* This device should take care of IOMMU PCI properties */
+ if (!qdev_realize(DEVICE(s->pci), &bus->qbus, errp)) {
+ return;
+ }
}
+ s->iotlb = g_hash_table_new_full(amdvi_uint64_hash,
+ amdvi_uint64_equal, g_free, g_free);
+
/* Pseudo address space under root PCI bus. */
x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID);
@@ -1668,6 +1686,7 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
static Property amdvi_properties[] = {
DEFINE_PROP_BOOL("xtsup", AMDVIState, xtsup, false),
+ DEFINE_PROP_STRING("pci-id", AMDVIState, pci_id),
DEFINE_PROP_END_OF_LIST(),
};
@@ -1676,13 +1695,6 @@ static const VMStateDescription vmstate_amdvi_sysbus = {
.unmigratable = 1
};
-static void amdvi_sysbus_instance_init(Object *klass)
-{
- AMDVIState *s = AMD_IOMMU_DEVICE(klass);
-
- object_initialize(&s->pci, sizeof(s->pci), TYPE_AMD_IOMMU_PCI);
-}
-
static void amdvi_sysbus_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
@@ -1704,7 +1716,6 @@ static const TypeInfo amdvi_sysbus = {
.name = TYPE_AMD_IOMMU_DEVICE,
.parent = TYPE_X86_IOMMU_DEVICE,
.instance_size = sizeof(AMDVIState),
- .instance_init = amdvi_sysbus_instance_init,
.class_init = amdvi_sysbus_class_init
};
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
index e0dac4d9a9..ece71ff0b6 100644
--- a/hw/i386/amd_iommu.h
+++ b/hw/i386/amd_iommu.h
@@ -315,7 +315,8 @@ struct AMDVIPCIState {
struct AMDVIState {
X86IOMMUState iommu; /* IOMMU bus device */
- AMDVIPCIState pci; /* IOMMU PCI device */
+ AMDVIPCIState *pci; /* IOMMU PCI device */
+ char *pci_id; /* ID of AMDVI-PCI device, if user created */
uint32_t version;
--
2.39.3

View File

@ -0,0 +1,96 @@
From 67b281dc1ccdae05da6c6052c264ecd94723c0b2 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Tue, 18 Feb 2025 19:25:32 +0100
Subject: [PATCH 2/9] hw/i386/intel-iommu: Migrate to 3-phase reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 341: Fix vIOMMU reset order
RH-Jira: RHEL-7188
RH-Acked-by: Peter Xu <peterx@redhat.com>
RH-Acked-by: Donald Dutile <None>
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Commit: [2/5] 5b9b60b2b796529db10b846881e82e7df4626ec1 (eauger1/centos-qemu-kvm)
Currently the IOMMU may be reset before the devices
it protects. For example this happens with virtio devices
but also with VFIO devices. In this latter case this
produces spurious translation faults on host.
Let's use 3-phase reset mechanism and reset the IOMMU on
exit phase after all DMA capable devices have been reset
on 'enter' or 'hold' phase.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Zhenzhong Duan <zhenzhong.duan@intel.com>
Message-Id: <20250218182737.76722-3-eric.auger@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
(cherry picked from commit 2aaf48bcf27d8b3da5b30af6c1ced464d3df30f7)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Conflicts: Code change
hw/i386/intel_iommu.c
We miss e3d0814368d0 ("hw: Use device_class_set_legacy_reset() instead
of opencoding") meaning that instead of removing
device_class_set_legacy_reset(dc, vtd_reset) we remove
dc->reset = vtd_reset;
---
hw/i386/intel_iommu.c | 12 +++++++++---
hw/i386/trace-events | 1 +
2 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 16d2885fcc..4acefcf5c8 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -4212,10 +4212,11 @@ static void vtd_init(IntelIOMMUState *s)
/* Should not reset address_spaces when reset because devices will still use
* the address space they got at first (won't ask the bus again).
*/
-static void vtd_reset(DeviceState *dev)
+static void vtd_reset_exit(Object *obj, ResetType type)
{
- IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
+ IntelIOMMUState *s = INTEL_IOMMU_DEVICE(obj);
+ trace_vtd_reset_exit();
vtd_init(s);
vtd_address_space_refresh_all(s);
}
@@ -4367,8 +4368,13 @@ static void vtd_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
X86IOMMUClass *x86_class = X86_IOMMU_DEVICE_CLASS(klass);
+ ResettableClass *rc = RESETTABLE_CLASS(klass);
- dc->reset = vtd_reset;
+ /*
+ * Use 'exit' reset phase to make sure all DMA requests
+ * have been quiesced during 'enter' or 'hold' phase
+ */
+ rc->phases.exit = vtd_reset_exit;
dc->vmsd = &vtd_vmstate;
device_class_set_props(dc, vtd_properties);
dc->hotpluggable = false;
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 53c02d7ac8..ac9e1a10aa 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -68,6 +68,7 @@ vtd_frr_new(int index, uint64_t hi, uint64_t lo) "index %d high 0x%"PRIx64" low
vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16
vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 0x%"PRIx16" index %d vec %d (should be: %d)"
vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 0x%"PRIx16" index %d trigger %d (should be: %d)"
+vtd_reset_exit(void) ""
# amd_iommu.c
amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32
--
2.48.1

View File

@ -0,0 +1,57 @@
From 0bfbd2c49c01ee77d3b5a21bf9fe675916cbf0ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:56 +0100
Subject: [PATCH 44/57] hw/misc/aspeed_hace: skip automatic zero-init of large
array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [18/30] ec8510be6b23b26b3eecd6767e1deb0c0c50dd58 (stefanha/centos-stream-qemu-kvm)
The 'do_hash_operation' method has a 256 element iovec array used for
holding pointers to data that is to be hashed. Skip the automatic
zero-init of this array to eliminate the performance overhead in the
I/O hot path.
The 'iovec' array will be selectively initialized based on data that
needs to be hashed.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-19-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 6992c886838282f36b20deee44b666bbfc573a8f)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Conflicts:
hw/misc/aspeed_hace.c
Context conflict due to missing commit b9ccbe212e24
("hw/misc/aspeed_hace: Extract accumulation-mode hash execution into
helper function") downstream. The commit is not a bug fix, so there is
no need to backport it.
---
hw/misc/aspeed_hace.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/misc/aspeed_hace.c b/hw/misc/aspeed_hace.c
index c06c04ddc6..d2118f1864 100644
--- a/hw/misc/aspeed_hace.c
+++ b/hw/misc/aspeed_hace.c
@@ -188,7 +188,7 @@ static int gen_acc_mode_iov(AspeedHACEState *s, struct iovec *iov, int id,
static void do_hash_operation(AspeedHACEState *s, int algo, bool sg_mode,
bool acc_mode)
{
- struct iovec iov[ASPEED_HACE_MAX_SG];
+ QEMU_UNINITIALIZED struct iovec iov[ASPEED_HACE_MAX_SG];
g_autofree uint8_t *digest_buf = NULL;
size_t digest_len = 0;
int niov = 0;
--
2.39.3

View File

@ -0,0 +1,48 @@
From cc173deaaa4d9dc6ad9188e0b03f46b7e64f26b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:57 +0100
Subject: [PATCH 45/57] hw/net/rtl8139: skip automatic zero-init of large array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [19/30] 344c720aef2feb35f84fd4b21f2b1b31e5572286 (stefanha/centos-stream-qemu-kvm)
The 'rtl8139_transmit_one' method has a 8k byte array used for
copying data between guest and host. Skip the automatic zero-init
of this array to eliminate the performance overhead in the I/O
hot path.
The 'txbuffer' will be fully initialized when reading PCI DMA
buffers.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-20-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 3ccc6489dd4925ddd1f3066bd3751389169cd7aa)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/net/rtl8139.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c
index f2fe057535..a2732bf1c1 100644
--- a/hw/net/rtl8139.c
+++ b/hw/net/rtl8139.c
@@ -1818,7 +1818,7 @@ static int rtl8139_transmit_one(RTL8139State *s, int descriptor)
PCIDevice *d = PCI_DEVICE(s);
int txsize = s->TxStatus[descriptor] & 0x1fff;
- uint8_t txbuffer[0x2000];
+ QEMU_UNINITIALIZED uint8_t txbuffer[0x2000];
DPRINTF("+++ transmit reading %d bytes from host memory at 0x%08x\n",
txsize, s->TxAddr[descriptor]);
--
2.39.3

View File

@ -0,0 +1,47 @@
From 400b5c8ae7f06a450ef91230343d7ce489142a38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:58 +0100
Subject: [PATCH 46/57] hw/net/tulip: skip automatic zero-init of large array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [20/30] b3d29de8495c0ff40c26974673adefe4eb27a417 (stefanha/centos-stream-qemu-kvm)
The 'tulip_setup_frame' method has a 4k byte array used for copynig
DMA data from the device. Skip the automatic zero-init of this array
to eliminate the performance overhead in the I/O hot path.
The 'buf' array will be fully initialized when reading data from the
device.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-21-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit e1afd5ee6eb2954f4baf3c97820e4aaf7de97d2a)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/net/tulip.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/net/tulip.c b/hw/net/tulip.c
index 1f2ef20977..5cf2b96fbd 100644
--- a/hw/net/tulip.c
+++ b/hw/net/tulip.c
@@ -629,7 +629,7 @@ static void tulip_setup_filter_addr(TULIPState *s, uint8_t *buf, int n)
static void tulip_setup_frame(TULIPState *s,
struct tulip_descriptor *desc)
{
- uint8_t buf[4096];
+ QEMU_UNINITIALIZED uint8_t buf[4096];
int len = (desc->control >> TDES1_BUF1_SIZE_SHIFT) & TDES1_BUF1_SIZE_MASK;
int i;
--
2.39.3

View File

@ -0,0 +1,54 @@
From 0925796a4537e20e033a675ebc8899e4580235f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:36:59 +0100
Subject: [PATCH 47/57] hw/net/virtio-net: skip automatic zero-init of large
arrays
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [21/30] 0450189a4c4c779b5a1850e9ea8278a5129c5f7f (stefanha/centos-stream-qemu-kvm)
The 'virtio_net_receive_rcu' method has three arrays with
VIRTQUEUE_MAX_SIZE elements, which are apprixmately 32k in
size used for copying data between guest and host. Skip the
automatic zero-init of these arrays to eliminate the
performance overhead in the I/O hot path.
The three arrays will be selectively initialized as required
when processing network buffers.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-22-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 21cf31c51a7aeff4270c9b30b37e019c536d54b2)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/net/virtio-net.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 3d2b2460ad..086ea20ea0 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -1895,9 +1895,9 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
VirtIONet *n = qemu_get_nic_opaque(nc);
VirtIONetQueue *q = virtio_net_get_subqueue(nc);
VirtIODevice *vdev = VIRTIO_DEVICE(n);
- VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE];
- size_t lens[VIRTQUEUE_MAX_SIZE];
- struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE];
+ QEMU_UNINITIALIZED VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE];
+ QEMU_UNINITIALIZED size_t lens[VIRTQUEUE_MAX_SIZE];
+ QEMU_UNINITIALIZED struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE];
struct virtio_net_hdr_v1_hash extra_hdr;
unsigned mhdr_cnt = 0;
size_t offset, i, guest_offset, j;
--
2.39.3

View File

@ -0,0 +1,47 @@
From 34116b3a243f005938a30e9b38c6f47a62752c3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:37:00 +0100
Subject: [PATCH 48/57] hw/net/xgamc: skip automatic zero-init of large array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [22/30] 63536d627705775c4bf72a511de3d68ec30ac7de (stefanha/centos-stream-qemu-kvm)
The 'xgmac_enet_send' method has a 8k byte array used for copying
data between guest and host. Skip the automatic zero-init of this
array to eliminate the performance overhead in the I/O hot path.
The 'frame' buffer will be fully initialized when reading guest
memory to fetch the data to send.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-23-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 8b723287b84a62bb5d1a7799ef0959ca8e6c293a)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/net/xgmac.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/net/xgmac.c b/hw/net/xgmac.c
index ffe3fc8dbe..eff8022aca 100644
--- a/hw/net/xgmac.c
+++ b/hw/net/xgmac.c
@@ -207,7 +207,7 @@ static void xgmac_enet_send(XgmacState *s)
struct desc bd;
int frame_size;
int len;
- uint8_t frame[8192];
+ QEMU_UNINITIALIZED uint8_t frame[8192];
uint8_t *ptr;
ptr = frame;
--
2.39.3

View File

@ -0,0 +1,72 @@
From 3e0134b45828bf9a623a26ac41d5fbb3a8d2917b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:37:01 +0100
Subject: [PATCH 49/57] hw/nvme/ctrl: skip automatic zero-init of large arrays
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [23/30] 57ce4361ffb307be4ea4d3edf9e0dac269d16908 (stefanha/centos-stream-qemu-kvm)
The 'nvme_map_sgl' method has a 256 element array used for copying
data from the device. Skip the automatic zero-init of this array
to eliminate the performance overhead in the I/O hot path.
The 'segment' array will be fully initialized when reading data from
the device.
The 'nme_changed_nslist' method has a 4k byte array that is manually
initialized with memset(). The compiler ought to be intelligent
enough to turn the memset() into a static initialization operation,
and thus not duplicate the automatic zero-init. Replacing memset()
with '{}' makes it unambiguous that the array is statically initialized.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Message-id: 20250610123709.835102-24-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 7eeb1d3acc175813ad3d5e824f26123e0992093a)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/nvme/ctrl.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index d451ee0d00..75d7f20801 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1047,7 +1047,8 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
*/
#define SEG_CHUNK_SIZE 256
- NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
+ QEMU_UNINITIALIZED NvmeSglDescriptor segment[SEG_CHUNK_SIZE];
+ NvmeSglDescriptor *sgld, *last_sgld;
uint64_t nsgld;
uint32_t seg_len;
uint16_t status;
@@ -5029,7 +5030,7 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
uint64_t off, NvmeRequest *req)
{
- uint32_t nslist[1024];
+ uint32_t nslist[1024] = {};
uint32_t trans_len;
int i = 0;
uint32_t nsid;
@@ -5039,7 +5040,6 @@ static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
return NVME_INVALID_FIELD | NVME_DNR;
}
- memset(nslist, 0x0, sizeof(nslist));
trans_len = MIN(sizeof(nslist) - off, buf_len);
while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
--
2.39.3

View File

@ -0,0 +1,242 @@
From 98b0cd83c09d35a3da0ae142c09038174355e87e Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Tue, 25 Feb 2025 14:52:25 -0700
Subject: [PATCH 2/7] hw/pci: Basic support for PCI power management
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 348: PCI: Implement basic PCI PM capability backing
RH-Jira: RHEL-7301
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Alex Williamson <None>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [2/6] 5faff6382c124711887704fff4f857e8f85e7be5 (eauger1/centos-qemu-kvm)
Conflicts: contextual conflict in include/hw/pci/pci.h
we don't have 449dca6ac93a ("pcie: enable Extended tag field support")
downstream so we don't have x-pcie-ext-tag definition.
The memory and IO BARs for devices are only accessible in the D0 power
state. In other power states the PCI spec defines that the device
responds to TLPs and messages with an Unsupported Request response.
To approximate this behavior, consider the BARs as unmapped when the
device is not in the D0 power state. This makes the BARs inaccessible
and has the additional bonus for vfio-pci that we don't attempt to DMA
map BARs for devices in a non-D0 power state.
To support this, an interface is added for devices to register the PM
capability, which allows central tracking to enforce valid transitions
and unmap BARs in non-D0 states.
NB. We currently have device models (eepro100 and pcie_pci_bridge)
that register a PM capability but do not set wmask to enable writes to
the power state field. In order to maintain migration compatibility,
this new helper does not manage the wmask to enable guest writes to
initiate a power state change. The contents and write access of the
PM capability are still managed by the caller.
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-2-alex.williamson@redhat.com
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 9461afd2008b0820fc45a6a7bc675df1b6791e4f)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/pci/pci.c | 93 ++++++++++++++++++++++++++++++++++++-
hw/pci/trace-events | 2 +
include/hw/pci/pci.h | 3 ++
include/hw/pci/pci_device.h | 3 ++
4 files changed, 99 insertions(+), 2 deletions(-)
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 83c9d5b9ea..d774ae47d2 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -365,6 +365,84 @@ static void pci_msi_trigger(PCIDevice *dev, MSIMessage msg)
attrs, NULL);
}
+/*
+ * Register and track a PM capability. If wmask is also enabled for the power
+ * state field of the pmcsr register, guest writes may change the device PM
+ * state. BAR access is only enabled while the device is in the D0 state.
+ * Return the capability offset or negative error code.
+ */
+int pci_pm_init(PCIDevice *d, uint8_t offset, Error **errp)
+{
+ int cap = pci_add_capability(d, PCI_CAP_ID_PM, offset, PCI_PM_SIZEOF, errp);
+
+ if (cap < 0) {
+ return cap;
+ }
+
+ d->pm_cap = cap;
+ d->cap_present |= QEMU_PCI_CAP_PM;
+
+ return cap;
+}
+
+static uint8_t pci_pm_state(PCIDevice *d)
+{
+ uint16_t pmcsr;
+
+ if (!(d->cap_present & QEMU_PCI_CAP_PM)) {
+ return 0;
+ }
+
+ pmcsr = pci_get_word(d->config + d->pm_cap + PCI_PM_CTRL);
+
+ return pmcsr & PCI_PM_CTRL_STATE_MASK;
+}
+
+/*
+ * Update the PM capability state based on the new value stored in config
+ * space respective to the old, pre-write state provided. If the new value
+ * is rejected (unsupported or invalid transition) restore the old value.
+ * Return the resulting PM state.
+ */
+static uint8_t pci_pm_update(PCIDevice *d, uint32_t addr, int l, uint8_t old)
+{
+ uint16_t pmc;
+ uint8_t new;
+
+ if (!(d->cap_present & QEMU_PCI_CAP_PM) ||
+ !range_covers_byte(addr, l, d->pm_cap + PCI_PM_CTRL)) {
+ return old;
+ }
+
+ new = pci_pm_state(d);
+ if (new == old) {
+ return old;
+ }
+
+ pmc = pci_get_word(d->config + d->pm_cap + PCI_PM_PMC);
+
+ /*
+ * Transitions to D1 & D2 are only allowed if supported. Devices may
+ * only transition to higher D-states or to D0.
+ */
+ if ((!(pmc & PCI_PM_CAP_D1) && new == 1) ||
+ (!(pmc & PCI_PM_CAP_D2) && new == 2) ||
+ (old && new && new < old)) {
+ pci_word_test_and_clear_mask(d->config + d->pm_cap + PCI_PM_CTRL,
+ PCI_PM_CTRL_STATE_MASK);
+ pci_word_test_and_set_mask(d->config + d->pm_cap + PCI_PM_CTRL,
+ old);
+ trace_pci_pm_bad_transition(d->name, pci_dev_bus_num(d),
+ PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+ old, new);
+ return old;
+ }
+
+ trace_pci_pm_transition(d->name, pci_dev_bus_num(d), PCI_SLOT(d->devfn),
+ PCI_FUNC(d->devfn), old, new);
+ return new;
+}
+
static void pci_reset_regions(PCIDevice *dev)
{
int r;
@@ -404,6 +482,11 @@ static void pci_do_device_reset(PCIDevice *dev)
pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) |
pci_get_word(dev->w1cmask + PCI_INTERRUPT_LINE));
dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
+ /* Default PM state is D0 */
+ if (dev->cap_present & QEMU_PCI_CAP_PM) {
+ pci_word_test_and_clear_mask(dev->config + dev->pm_cap + PCI_PM_CTRL,
+ PCI_PM_CTRL_STATE_MASK);
+ }
pci_reset_regions(dev);
pci_update_mappings(dev);
@@ -1525,7 +1608,7 @@ static void pci_update_mappings(PCIDevice *d)
continue;
new_addr = pci_bar_address(d, i, r->type, r->size);
- if (!d->enabled) {
+ if (!d->enabled || pci_pm_state(d)) {
new_addr = PCI_BAR_UNMAPPED;
}
@@ -1591,6 +1674,7 @@ uint32_t pci_default_read_config(PCIDevice *d,
void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int l)
{
+ uint8_t new_pm_state, old_pm_state = pci_pm_state(d);
int i, was_irq_disabled = pci_irq_disabled(d);
uint32_t val = val_in;
@@ -1603,11 +1687,16 @@ void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int
d->config[addr + i] = (d->config[addr + i] & ~wmask) | (val & wmask);
d->config[addr + i] &= ~(val & w1cmask); /* W1C: Write 1 to Clear */
}
+
+ new_pm_state = pci_pm_update(d, addr, l, old_pm_state);
+
if (ranges_overlap(addr, l, PCI_BASE_ADDRESS_0, 24) ||
ranges_overlap(addr, l, PCI_ROM_ADDRESS, 4) ||
ranges_overlap(addr, l, PCI_ROM_ADDRESS1, 4) ||
- range_covers_byte(addr, l, PCI_COMMAND))
+ range_covers_byte(addr, l, PCI_COMMAND) ||
+ !!new_pm_state != !!old_pm_state) {
pci_update_mappings(d);
+ }
if (ranges_overlap(addr, l, PCI_COMMAND, 2)) {
pci_update_irq_disabled(d, was_irq_disabled);
diff --git a/hw/pci/trace-events b/hw/pci/trace-events
index 19643aa8c6..c82a87ffdd 100644
--- a/hw/pci/trace-events
+++ b/hw/pci/trace-events
@@ -1,6 +1,8 @@
# See docs/devel/tracing.rst for syntax documentation.
# pci.c
+pci_pm_bad_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, uint8_t old, uint8_t new) "%s %02x:%02x.%x REJECTED PM transition D%d->D%d"
+pci_pm_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, uint8_t old, uint8_t new) "%s %02x:%02x.%x PM transition D%d->D%d"
pci_update_mappings_del(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x %d,0x%"PRIx64"+0x%"PRIx64
pci_update_mappings_add(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x %d,0x%"PRIx64"+0x%"PRIx64
pci_route_irq(int dev_irq, const char *dev_path, int parent_irq, const char *parent_path) "IRQ %d @%s -> IRQ %d @%s"
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 45365ae085..afeb5a2263 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -213,6 +213,8 @@ enum {
QEMU_PCIE_ERR_UNC_MASK = (1 << QEMU_PCIE_ERR_UNC_MASK_BITNR),
#define QEMU_PCIE_ARI_NEXTFN_1_BITNR 12
QEMU_PCIE_ARI_NEXTFN_1 = (1 << QEMU_PCIE_ARI_NEXTFN_1_BITNR),
+#define QEMU_PCI_CAP_PM_BITNR 14
+ QEMU_PCI_CAP_PM = (1 << QEMU_PCI_CAP_PM_BITNR),
};
typedef struct PCIINTxRoute {
@@ -680,5 +682,6 @@ static inline void pci_irq_pulse(PCIDevice *pci_dev)
MSIMessage pci_get_msi_message(PCIDevice *dev, int vector);
void pci_set_enabled(PCIDevice *pci_dev, bool state);
void pci_set_power(PCIDevice *pci_dev, bool state);
+int pci_pm_init(PCIDevice *pci_dev, uint8_t offset, Error **errp);
#endif
diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
index f38fb31119..325d7bcaf7 100644
--- a/include/hw/pci/pci_device.h
+++ b/include/hw/pci/pci_device.h
@@ -105,6 +105,9 @@ struct PCIDevice {
/* Capability bits */
uint32_t cap_present;
+ /* Offset of PM capability in config space */
+ uint8_t pm_cap;
+
/* Offset of MSI-X capability in config space */
uint8_t msix_cap;
--
2.48.1

View File

@ -0,0 +1,130 @@
From 8711bb1a54d4f5734d44545cd8e7262bc358f51d Mon Sep 17 00:00:00 2001
From: Akihiko Odaki <akihiko.odaki@daynix.com>
Date: Thu, 9 Jan 2025 15:29:46 +0900
Subject: [PATCH 1/7] hw/pci: Rename has_power to enabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 348: PCI: Implement basic PCI PM capability backing
RH-Jira: RHEL-7301
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Alex Williamson <None>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [1/6] ac8a7427a1203e33aa323933818a7114c0eb4520 (eauger1/centos-qemu-kvm)
The renamed state will not only represent powering state of PFs, but
also represent SR-IOV VF enablement in the future.
Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-ID: <20250109-reuse-v19-1-f541e82ca5f7@daynix.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
(cherry picked from commit c407eef162f765dd83d45e048585731be41a66fc)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/pci/pci.c | 17 +++++++++++------
hw/pci/pci_host.c | 4 ++--
include/hw/pci/pci.h | 1 +
include/hw/pci/pci_device.h | 2 +-
4 files changed, 15 insertions(+), 9 deletions(-)
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index fab86d0567..83c9d5b9ea 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -1525,7 +1525,7 @@ static void pci_update_mappings(PCIDevice *d)
continue;
new_addr = pci_bar_address(d, i, r->type, r->size);
- if (!d->has_power) {
+ if (!d->enabled) {
new_addr = PCI_BAR_UNMAPPED;
}
@@ -1613,7 +1613,7 @@ void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int
pci_update_irq_disabled(d, was_irq_disabled);
memory_region_set_enabled(&d->bus_master_enable_region,
(pci_get_word(d->config + PCI_COMMAND)
- & PCI_COMMAND_MASTER) && d->has_power);
+ & PCI_COMMAND_MASTER) && d->enabled);
}
msi_write_config(d, addr, val_in, l);
@@ -2886,16 +2886,21 @@ MSIMessage pci_get_msi_message(PCIDevice *dev, int vector)
void pci_set_power(PCIDevice *d, bool state)
{
- if (d->has_power == state) {
+ pci_set_enabled(d, state);
+}
+
+void pci_set_enabled(PCIDevice *d, bool state)
+{
+ if (d->enabled == state) {
return;
}
- d->has_power = state;
+ d->enabled = state;
pci_update_mappings(d);
memory_region_set_enabled(&d->bus_master_enable_region,
(pci_get_word(d->config + PCI_COMMAND)
- & PCI_COMMAND_MASTER) && d->has_power);
- if (!d->has_power) {
+ & PCI_COMMAND_MASTER) && d->enabled);
+ if (!d->enabled) {
pci_device_reset(d);
}
}
diff --git a/hw/pci/pci_host.c b/hw/pci/pci_host.c
index dfe6fe6184..0d82727cc9 100644
--- a/hw/pci/pci_host.c
+++ b/hw/pci/pci_host.c
@@ -86,7 +86,7 @@ void pci_host_config_write_common(PCIDevice *pci_dev, uint32_t addr,
* allowing direct removal of unexposed functions.
*/
if ((pci_dev->qdev.hotplugged && !pci_get_function_0(pci_dev)) ||
- !pci_dev->has_power || is_pci_dev_ejected(pci_dev)) {
+ !pci_dev->enabled || is_pci_dev_ejected(pci_dev)) {
return;
}
@@ -111,7 +111,7 @@ uint32_t pci_host_config_read_common(PCIDevice *pci_dev, uint32_t addr,
* allowing direct removal of unexposed functions.
*/
if ((pci_dev->qdev.hotplugged && !pci_get_function_0(pci_dev)) ||
- !pci_dev->has_power || is_pci_dev_ejected(pci_dev)) {
+ !pci_dev->enabled || is_pci_dev_ejected(pci_dev)) {
return ~0x0;
}
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index eb26cac810..45365ae085 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -678,6 +678,7 @@ static inline void pci_irq_pulse(PCIDevice *pci_dev)
}
MSIMessage pci_get_msi_message(PCIDevice *dev, int vector);
+void pci_set_enabled(PCIDevice *pci_dev, bool state);
void pci_set_power(PCIDevice *pci_dev, bool state);
#endif
diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
index 15694f2489..f38fb31119 100644
--- a/include/hw/pci/pci_device.h
+++ b/include/hw/pci/pci_device.h
@@ -57,7 +57,7 @@ typedef struct PCIReqIDCache PCIReqIDCache;
struct PCIDevice {
DeviceState qdev;
bool partially_hotplugged;
- bool has_power;
+ bool enabled;
/* PCI config space */
uint8_t *config;
--
2.48.1

View File

@ -0,0 +1,52 @@
From 4c3fe6e7b88c58713c0c499d4bf0658a055ee52e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:37:03 +0100
Subject: [PATCH 50/57] hw/ppc/spapr_tpm_proxy: skip automatic zero-init of
large arrays
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [24/30] 8d963380c64a33a27adc99738b42b52864229111 (stefanha/centos-stream-qemu-kvm)
The 'tpm_execute' method has a pair of 4k arrays used for copying
data between guest and host. Skip the automatic zero-init of these
arrays to eliminate the performance overhead in the I/O hot path.
The two arrays will be fully initialized when reading data from
guest memory or reading data from the proxy FD.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
Message-id: 20250610123709.835102-26-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 5dd9087fff74b5672526cad254e76f790fb35c7a)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/ppc/spapr_tpm_proxy.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/hw/ppc/spapr_tpm_proxy.c b/hw/ppc/spapr_tpm_proxy.c
index e10af35a18..88833d9e2e 100644
--- a/hw/ppc/spapr_tpm_proxy.c
+++ b/hw/ppc/spapr_tpm_proxy.c
@@ -41,8 +41,8 @@ static ssize_t tpm_execute(SpaprTpmProxy *tpm_proxy, target_ulong *args)
target_ulong data_in_size = args[2];
uint64_t data_out = ppc64_phys_to_real(args[3]);
target_ulong data_out_size = args[4];
- uint8_t buf_in[TPM_SPAPR_BUFSIZE];
- uint8_t buf_out[TPM_SPAPR_BUFSIZE];
+ QEMU_UNINITIALIZED uint8_t buf_in[TPM_SPAPR_BUFSIZE];
+ QEMU_UNINITIALIZED uint8_t buf_out[TPM_SPAPR_BUFSIZE];
ssize_t ret;
trace_spapr_tpm_execute(data_in, data_in_size, data_out, data_out_size);
--
2.39.3

View File

@ -0,0 +1,63 @@
From 5126609c0714c66a0ec41328017e7e8388c78bf4 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Fri, 13 Sep 2024 15:31:43 +0100
Subject: [PATCH 02/26] hw/s390/ccw-device: Convert to three-phase reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [2/26] 58f6fc2e65a101e069feac399859464d31e43045 (thuth/qemu-kvm-cs)
Convert the TYPE_CCW_DEVICE to three-phase reset. This is a
device class which is subclassed, so it needs to be three-phase
before we can convert the subclass.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Nina Schoetterl-Glausch <nsg@linux.ibm.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Acked-by: Thomas Huth <thuth@redhat.com>
Message-id: 20240830145812.1967042-2-peter.maydell@linaro.org
(cherry picked from commit 6a0e10b76b68e2f412746a1d5ed7d6efee804864)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
hw/s390x/ccw-device.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/hw/s390x/ccw-device.c b/hw/s390x/ccw-device.c
index d7bb364579..30f2fb486f 100644
--- a/hw/s390x/ccw-device.c
+++ b/hw/s390x/ccw-device.c
@@ -88,9 +88,9 @@ static Property ccw_device_properties[] = {
DEFINE_PROP_END_OF_LIST(),
};
-static void ccw_device_reset(DeviceState *d)
+static void ccw_device_reset_hold(Object *obj, ResetType type)
{
- CcwDevice *ccw_dev = CCW_DEVICE(d);
+ CcwDevice *ccw_dev = CCW_DEVICE(obj);
css_reset_sch(ccw_dev->sch);
}
@@ -99,11 +99,12 @@ static void ccw_device_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
CCWDeviceClass *k = CCW_DEVICE_CLASS(klass);
+ ResettableClass *rc = RESETTABLE_CLASS(klass);
k->realize = ccw_device_realize;
k->refill_ids = ccw_device_refill_ids;
device_class_set_props(dc, ccw_device_properties);
- dc->reset = ccw_device_reset;
+ rc->phases.hold = ccw_device_reset_hold;
dc->bus_type = TYPE_VIRTUAL_CSS_BUS;
}
--
2.48.1

View File

@ -0,0 +1,92 @@
From 7cbf9be09907407a64d739a2d0862af2ad08eaf5 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Fri, 13 Sep 2024 15:31:43 +0100
Subject: [PATCH 03/26] hw/s390/virtio-ccw: Convert to three-phase reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [3/26] e06ee194fa289a387433b905eb0999a048681a92 (thuth/qemu-kvm-cs)
Convert the virtio-ccw code to three-phase reset. This allows us to
remove a call to device_class_set_parent_reset(), replacing it with
the three-phase equivalent resettable_class_set_parent_phases().
Removing all the device_class_set_parent_reset() uses will allow us
to remove some of the glue code that interworks between three-phase
and legacy reset.
This is a simple conversion, with no behavioural changes.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Nina Schoetterl-Glausch <nsg@linux.ibm.com>
Acked-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20240830145812.1967042-3-peter.maydell@linaro.org
(cherry picked from commit 6affa00d6ebebf24485667fe146470b0d6feb90d)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
hw/s390x/virtio-ccw.c | 13 ++++++++-----
hw/s390x/virtio-ccw.h | 2 +-
2 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index b4676909dd..96747318d2 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -913,14 +913,15 @@ static void virtio_ccw_notify(DeviceState *d, uint16_t vector)
}
}
-static void virtio_ccw_reset(DeviceState *d)
+static void virtio_ccw_reset_hold(Object *obj, ResetType type)
{
- VirtioCcwDevice *dev = VIRTIO_CCW_DEVICE(d);
+ VirtioCcwDevice *dev = VIRTIO_CCW_DEVICE(obj);
VirtIOCCWDeviceClass *vdc = VIRTIO_CCW_DEVICE_GET_CLASS(dev);
virtio_ccw_reset_virtio(dev);
- if (vdc->parent_reset) {
- vdc->parent_reset(d);
+
+ if (vdc->parent_phases.hold) {
+ vdc->parent_phases.hold(obj, type);
}
}
@@ -1233,11 +1234,13 @@ static void virtio_ccw_device_class_init(ObjectClass *klass, void *data)
DeviceClass *dc = DEVICE_CLASS(klass);
CCWDeviceClass *k = CCW_DEVICE_CLASS(dc);
VirtIOCCWDeviceClass *vdc = VIRTIO_CCW_DEVICE_CLASS(klass);
+ ResettableClass *rc = RESETTABLE_CLASS(klass);
k->unplug = virtio_ccw_busdev_unplug;
dc->realize = virtio_ccw_busdev_realize;
dc->unrealize = virtio_ccw_busdev_unrealize;
- device_class_set_parent_reset(dc, virtio_ccw_reset, &vdc->parent_reset);
+ resettable_class_set_parent_phases(rc, NULL, virtio_ccw_reset_hold, NULL,
+ &vdc->parent_phases);
}
static const TypeInfo virtio_ccw_device_info = {
diff --git a/hw/s390x/virtio-ccw.h b/hw/s390x/virtio-ccw.h
index fac186c8f6..c7a830a194 100644
--- a/hw/s390x/virtio-ccw.h
+++ b/hw/s390x/virtio-ccw.h
@@ -57,7 +57,7 @@ struct VirtIOCCWDeviceClass {
CCWDeviceClass parent_class;
void (*realize)(VirtioCcwDevice *dev, Error **errp);
void (*unrealize)(VirtioCcwDevice *dev);
- void (*parent_reset)(DeviceState *dev);
+ ResettablePhases parent_phases;
};
/* Performance improves when virtqueue kick processing is decoupled from the
--
2.48.1

View File

@ -0,0 +1,47 @@
From b25bbfcad4a3df94555f6b5f238910314a5d17ea Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 25 Jun 2025 10:27:51 +0200
Subject: [PATCH 02/57] hw/s390x/ccw-device: Fix memory leak in loadparm setter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 387: s390x: Fix memory leaks related to loadparm [rhel-9]
RH-Jira: RHEL-98554
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
RH-Commit: [2/2] d85cf8b3c93ede47b51c4aa1336dc54f58b8cc3f (thuth/qemu-kvm-cs)
Commit bdf12f2a fixed the setter for the "loadparm" machine property,
which gets a string from a visitor, passes it to s390_ipl_fmt_loadparm()
and then forgot to free it. It left another instance of the same problem
unfixed in the "loadparm" device property. Fix it.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20250625082751.24896-1-kwolf@redhat.com>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Reviewed-by: Halil Pasic <pasic@linux.ibm.com>
Tested-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
(cherry picked from commit 78e3781541209b3dcd6f4bb66adf3a3e504b88a4)
---
hw/s390x/ccw-device.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/s390x/ccw-device.c b/hw/s390x/ccw-device.c
index 30f2fb486f..63e937401e 100644
--- a/hw/s390x/ccw-device.c
+++ b/hw/s390x/ccw-device.c
@@ -57,7 +57,7 @@ static void ccw_device_set_loadparm(Object *obj, Visitor *v,
Error **errp)
{
CcwDevice *dev = CCW_DEVICE(obj);
- char *val;
+ g_autofree char *val = NULL;
int index;
index = object_property_get_int(obj, "bootindex", NULL);
--
2.39.3

View File

@ -0,0 +1,49 @@
From 45884bfad1f14585407a04eff9230a75bc5095fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:37:05 +0100
Subject: [PATCH 52/57] hw/scsi/lsi53c895a: skip automatic zero-init of large
array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [26/30] 235884d43fcb3e49b320e36faa631a3656d07de6 (stefanha/centos-stream-qemu-kvm)
The 'lsi_memcpy' method has a 4k byte array used for copying data
to/from the device. Skip the automatic zero-init of this array to
eliminate the performance overhead in the I/O hot path.
The 'buf' array will be fully initialized when data is copied.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
Message-id: 20250610123709.835102-28-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 55243edf42ee87bce9f36ca251f3ab9cda1563e4)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/scsi/lsi53c895a.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c
index f1935e5328..f165705f8a 100644
--- a/hw/scsi/lsi53c895a.c
+++ b/hw/scsi/lsi53c895a.c
@@ -1112,7 +1112,7 @@ bad:
static void lsi_memcpy(LSIState *s, uint32_t dest, uint32_t src, int count)
{
int n;
- uint8_t buf[LSI_BUF_SIZE];
+ QEMU_UNINITIALIZED uint8_t buf[LSI_BUF_SIZE];
trace_lsi_memcpy(dest, src, count);
while (count) {
--
2.39.3

View File

@ -0,0 +1,73 @@
From 9f76103e90ce8406bc5bbda72a7314b82e56652e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:37:06 +0100
Subject: [PATCH 53/57] hw/scsi/megasas: skip automatic zero-init of large
arrays
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [27/30] b3a3f466fd03c64c665c52e26079b03def376f48 (stefanha/centos-stream-qemu-kvm)
The 'megasas_dcmd_pd_get_list' and 'megasas_dcmd_get_properties'
methods have 4k structs used for copying data from the device.
Skip the automatic zero-init of this array to eliminate the
performance overhead in the I/O hot path.
The 'info' structs are manually initialized with memset(). The
compiler ought to be intelligent enough to turn the memset()
into a static initialization operation, and thus not duplicate
the automatic zero-init. Replacing memset() with '{}' makes it
unambiguous that the arrays are statically initialized.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
Message-id: 20250610123709.835102-29-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit ca0559e2350c618048f7caf80cb79c1259e7cfd2)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/scsi/megasas.c | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c
index 2d0c607177..91b65accbc 100644
--- a/hw/scsi/megasas.c
+++ b/hw/scsi/megasas.c
@@ -981,13 +981,11 @@ static int megasas_event_wait(MegasasState *s, MegasasCmd *cmd)
static int megasas_dcmd_pd_get_list(MegasasState *s, MegasasCmd *cmd)
{
- struct mfi_pd_list info;
- size_t dcmd_size = sizeof(info);
+ struct mfi_pd_list info = {};
BusChild *kid;
uint32_t offset, dcmd_limit, num_pd_disks = 0, max_pd_disks;
dma_addr_t residual;
- memset(&info, 0, dcmd_size);
offset = 8;
dcmd_limit = offset + sizeof(struct mfi_pd_address);
if (cmd->iov_size < dcmd_limit) {
@@ -1429,11 +1427,10 @@ static int megasas_dcmd_cfg_read(MegasasState *s, MegasasCmd *cmd)
static int megasas_dcmd_get_properties(MegasasState *s, MegasasCmd *cmd)
{
- struct mfi_ctrl_props info;
+ struct mfi_ctrl_props info = {};
size_t dcmd_size = sizeof(info);
dma_addr_t residual;
- memset(&info, 0x0, dcmd_size);
if (cmd->iov_size < dcmd_size) {
trace_megasas_dcmd_invalid_xfer_len(cmd->index, cmd->iov_size,
dcmd_size);
--
2.39.3

View File

@ -0,0 +1,50 @@
From 3a0ae5a2f873fc7062262efc24a5403233988f5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:37:07 +0100
Subject: [PATCH 54/57] hw/ufs/lu: skip automatic zero-init of large array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [28/30] 62e7c83d15143387f6d6b366c8ec46b312d05577 (stefanha/centos-stream-qemu-kvm)
The 'ufs_emulate_scsi_cmd' method has a 4k byte array used for
copying data from the device. Skip the automatic zero-init of
this array to eliminate the performance overhead in the I/O hot
path.
The 'outbuf' array will be fully initialized when data is copied
from the guest.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
Message-id: 20250610123709.835102-30-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 7708e298180550eac262c1fd742e6e80c711a5d8)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/ufs/lu.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/ufs/lu.c b/hw/ufs/lu.c
index 81bfff9b4e..caad82dcc4 100644
--- a/hw/ufs/lu.c
+++ b/hw/ufs/lu.c
@@ -194,7 +194,7 @@ static int ufs_emulate_wlun_inquiry(UfsRequest *req, uint8_t *outbuf,
static UfsReqResult ufs_emulate_scsi_cmd(UfsLu *lu, UfsRequest *req)
{
uint8_t lun = lu->lun;
- uint8_t outbuf[4096];
+ QEMU_UNINITIALIZED uint8_t outbuf[4096];
uint8_t sense_buf[UFS_SENSE_SIZE];
uint8_t scsi_status;
int len = 0;
--
2.39.3

View File

@ -0,0 +1,50 @@
From 6d4761010ea4dc218a1623513f410fc2d1cfc832 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:37:04 +0100
Subject: [PATCH 51/57] hw/usb/hcd-ohci: skip automatic zero-init of large
array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [25/30] 721dd97d384fb755c4a6a00cfc3d867e43f25b0b (stefanha/centos-stream-qemu-kvm)
The 'ohci_service_iso_td' method has a 8k byte array used for copying
data between guest and host. Skip the automatic zero-init of this
array to eliminate the performance overhead in the I/O hot path.
The 'buf' array will be fully initialized when reading data from guest
memory.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
Message-id: 20250610123709.835102-27-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 14997d521d1cd0bb36c902ef1032f0d3f2a3c912)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/usb/hcd-ohci.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/usb/hcd-ohci.c b/hw/usb/hcd-ohci.c
index 71b54914d3..72a9f9f474 100644
--- a/hw/usb/hcd-ohci.c
+++ b/hw/usb/hcd-ohci.c
@@ -577,7 +577,7 @@ static int ohci_service_iso_td(OHCIState *ohci, struct ohci_ed *ed)
USBDevice *dev;
USBEndpoint *ep;
USBPacket *pkt;
- uint8_t buf[8192];
+ QEMU_UNINITIALIZED uint8_t buf[8192];
bool int_req;
struct ohci_iso_td iso_td;
uint32_t addr;
--
2.39.3

View File

@ -0,0 +1,61 @@
From 04f11749dd21b4df1ea2818785d650dd6eee2cbe Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Tue, 18 Feb 2025 19:25:34 +0100
Subject: [PATCH 4/9] hw/vfio/common: Add a trace point in vfio_reset_handler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 341: Fix vIOMMU reset order
RH-Jira: RHEL-7188
RH-Acked-by: Peter Xu <peterx@redhat.com>
RH-Acked-by: Donald Dutile <None>
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Commit: [4/5] 46878ffdc96997d1f6d09bde3fce350564e499fd (eauger1/centos-qemu-kvm)
To ease the debug of reset sequence, let's add a trace point
in vfio_reset_handler()
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Message-Id: <20250218182737.76722-5-eric.auger@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
(cherry picked from commit d410e709526d1cd4aa9085c6e254a622594a02a5)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 1 +
hw/vfio/trace-events | 1 +
2 files changed, 2 insertions(+)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 36d0cf6585..6982f88fc8 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1395,6 +1395,7 @@ void vfio_reset_handler(void *opaque)
{
VFIODevice *vbasedev;
+ trace_vfio_reset_handler();
QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
if (vbasedev->dev->realized) {
vbasedev->ops->vfio_compute_needs_reset(vbasedev);
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 3756ff660e..9523a9ccb0 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -120,6 +120,7 @@ vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype
vfio_legacy_dma_unmap_overflow_workaround(void) ""
vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64
vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
+vfio_reset_handler(void) ""
# platform.c
vfio_platform_realize(char *name, char *compat) "vfio device %s, compat = %s"
--
2.48.1

View File

@ -0,0 +1,74 @@
From d6a961077e753b9ad5a670a1529634fe20322ce2 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Tue, 25 Feb 2025 14:52:29 -0700
Subject: [PATCH 6/7] hw/vfio/pci: Re-order pre-reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 348: PCI: Implement basic PCI PM capability backing
RH-Jira: RHEL-7301
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Alex Williamson <None>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [6/6] c6c386ecbabda93f8a79da926ece95c2195fbc36 (eauger1/centos-qemu-kvm)
We want the device in the D0 power state going into reset, but the
config write can enable the BARs in the address space, which are
then removed from the address space once we clear the memory enable
bit in the command register. Re-order to clear the command bit
first, so the power state change doesn't enable the BARs.
Cc: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-6-alex.williamson@redhat.com
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 518a69a598916749338de3852d41d961d4503115)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/pci.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 595b5c9b25..ffe72fd1d0 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2414,6 +2414,15 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
vfio_disable_interrupts(vdev);
+ /*
+ * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
+ * Also put INTx Disable in known state.
+ */
+ cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
+ cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
+ PCI_COMMAND_INTX_DISABLE);
+ vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
+
/* Make sure the device is in D0 */
if (pdev->pm_cap) {
uint16_t pmcsr;
@@ -2433,15 +2442,6 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
}
}
}
-
- /*
- * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
- * Also put INTx Disable in known state.
- */
- cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
- cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
- PCI_COMMAND_INTX_DISABLE);
- vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
}
void vfio_pci_post_reset(VFIOPCIDevice *vdev)
--
2.48.1

View File

@ -0,0 +1,59 @@
From afa3a488f3ca52a5455987e4cd643882c4b15d8a Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Thu, 13 Mar 2025 07:35:22 +0100
Subject: [PATCH 24/26] hw/virtio: Also include md stubs in case
CONFIG_VIRTIO_PCI is not set
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [24/26] ae6307b26d01d2a317f7e5d1d3b3a16b6d5f56de (thuth/qemu-kvm-cs)
For the s390x target, it's possible to build the QEMU binary without
CONFIG_VIRTIO_PCI and only have the virtio-mem device via the ccw
transport. In that case, QEMU currently fails to link correctly:
/usr/bin/ld: libqemu-s390x-softmmu.a.p/hw_s390x_s390-virtio-ccw.c.o: in function `s390_machine_device_pre_plug':
../hw/s390x/s390-virtio-ccw.c:579:(.text+0x1e96): undefined reference to `virtio_md_pci_pre_plug'
/usr/bin/ld: libqemu-s390x-softmmu.a.p/hw_s390x_s390-virtio-ccw.c.o: in function `s390_machine_device_plug':
../hw/s390x/s390-virtio-ccw.c:608:(.text+0x21a4): undefined reference to `virtio_md_pci_plug'
/usr/bin/ld: libqemu-s390x-softmmu.a.p/hw_s390x_s390-virtio-ccw.c.o: in function `s390_machine_device_unplug_request':
../hw/s390x/s390-virtio-ccw.c:622:(.text+0x2334): undefined reference to `virtio_md_pci_unplug_request'
/usr/bin/ld: libqemu-s390x-softmmu.a.p/hw_s390x_s390-virtio-ccw.c.o: in function `s390_machine_device_unplug':
../hw/s390x/s390-virtio-ccw.c:633:(.text+0x2436): undefined reference to `virtio_md_pci_unplug'
clang: error: linker command failed with exit code 1 (use -v to see invocation)
We also need to include the stubs when CONFIG_VIRTIO_PCI is missing.
Fixes: aa910c20ec5 ("s390x: virtio-mem support")
Message-ID: <20250313063522.1348288-1-thuth@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Thomas Huth <thuth@redhat.com>
(cherry picked from commit c1a6bff276ca52ffde472532d92bb5bb122dab3f)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
hw/virtio/meson.build | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index c38bdd6fa4..e2f9c75625 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -89,7 +89,8 @@ specific_virtio_ss.add_all(when: 'CONFIG_VIRTIO_PCI', if_true: virtio_pci_ss)
system_ss.add_all(when: 'CONFIG_VIRTIO', if_true: system_virtio_ss)
system_ss.add(when: 'CONFIG_VIRTIO', if_false: files('vhost-stub.c'))
system_ss.add(when: 'CONFIG_VIRTIO', if_false: files('virtio-stub.c'))
-system_ss.add(when: 'CONFIG_VIRTIO_MD', if_false: files('virtio-md-stubs.c'))
+system_ss.add(when: ['CONFIG_VIRTIO_MD', 'CONFIG_VIRTIO_PCI'],
+ if_false: files('virtio-md-stubs.c'))
system_ss.add(files('virtio-hmp-cmds.c'))
--
2.48.1

View File

@ -0,0 +1,73 @@
From 4727c044a09fb8c4fb6d667f26eb55bb6de7554d Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 10 Jun 2025 13:36:40 +0100
Subject: [PATCH 28/57] hw/virtio/virtio: avoid cost of -ftrivial-auto-var-init
in hot path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [2/30] 1c2cc6292deaaac068f4514439703c22c9ccb300 (stefanha/centos-stream-qemu-kvm)
Since commit 7ff9ff039380 ("meson: mitigate against use of uninitialize
stack for exploits") the -ftrivial-auto-var-init=zero compiler option is
used to zero local variables. While this reduces security risks
associated with uninitialized stack data, it introduced a measurable
bottleneck in the virtqueue_split_pop() and virtqueue_packed_pop()
functions.
These virtqueue functions are in the hot path. They are called for each
element (request) that is popped from a VIRTIO device's virtqueue. Using
__attribute__((uninitialized)) on large stack variables in these
functions improves fio randread bs=4k iodepth=64 performance from 304k
to 332k IOPS (+9%).
This issue was found using perf-top(1). virtqueue_split_pop() was one of
the top CPU consumers and the "annotate" feature showed that the memory
zeroing instructions at the beginning of the functions were hot.
Fixes: 7ff9ff039380 ("meson: mitigate against use of uninitialize stack for exploits")
Cc: Daniel P. Berrangé <berrange@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20250610123709.835102-3-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit ba2868ce091cd4abe4be6de4b7e44b3be303b352)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/virtio/virtio.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 10f24a58dd..7f7b178a50 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1680,8 +1680,8 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
VirtIODevice *vdev = vq->vdev;
VirtQueueElement *elem = NULL;
unsigned out_num, in_num, elem_entries;
- hwaddr addr[VIRTQUEUE_MAX_SIZE];
- struct iovec iov[VIRTQUEUE_MAX_SIZE];
+ hwaddr QEMU_UNINITIALIZED addr[VIRTQUEUE_MAX_SIZE];
+ struct iovec QEMU_UNINITIALIZED iov[VIRTQUEUE_MAX_SIZE];
VRingDesc desc;
int rc;
@@ -1826,8 +1826,8 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz)
VirtIODevice *vdev = vq->vdev;
VirtQueueElement *elem = NULL;
unsigned out_num, in_num, elem_entries;
- hwaddr addr[VIRTQUEUE_MAX_SIZE];
- struct iovec iov[VIRTQUEUE_MAX_SIZE];
+ hwaddr QEMU_UNINITIALIZED addr[VIRTQUEUE_MAX_SIZE];
+ struct iovec QEMU_UNINITIALIZED iov[VIRTQUEUE_MAX_SIZE];
VRingPackedDesc desc;
uint16_t id;
int rc;
--
2.39.3

View File

@ -0,0 +1,96 @@
From 9ca5d7ac4f0ff5f10bf424df8104fe5abe01e431 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Tue, 18 Feb 2025 19:25:31 +0100
Subject: [PATCH 1/9] hw/virtio/virtio-iommu: Migrate to 3-phase reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 341: Fix vIOMMU reset order
RH-Jira: RHEL-7188
RH-Acked-by: Peter Xu <peterx@redhat.com>
RH-Acked-by: Donald Dutile <None>
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Commit: [1/5] 32bf47497d5d4817a448d07ffa7a844aee82ae3c (eauger1/centos-qemu-kvm)
Currently the iommu may be reset before the devices
it protects. For example this happens with virtio-net.
Let's use 3-phase reset mechanism and reset the IOMMU on
exit phase after all DMA capable devices have been
reset during the 'enter' or 'hold' phase.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20250218182737.76722-2-eric.auger@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
(cherry picked from commit d261b84d354a41a38336af813f92f636d3fb3f78)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/virtio/trace-events | 2 +-
hw/virtio/virtio-iommu.c | 14 ++++++++++----
2 files changed, 11 insertions(+), 5 deletions(-)
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 04e36ae047..76f0d458b2 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -108,7 +108,7 @@ virtio_pci_notify_write(uint64_t addr, uint64_t val, unsigned int size) "0x%" PR
virtio_pci_notify_write_pio(uint64_t addr, uint64_t val, unsigned int size) "0x%" PRIx64" = 0x%" PRIx64 " (%d)"
# hw/virtio/virtio-iommu.c
-virtio_iommu_device_reset(void) "reset!"
+virtio_iommu_device_reset_exit(void) "reset!"
virtio_iommu_system_reset(void) "system reset!"
virtio_iommu_get_features(uint64_t features) "device supports features=0x%"PRIx64
virtio_iommu_device_status(uint8_t status) "driver status = %d"
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 59ef4fb217..496200ebc5 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -1504,11 +1504,11 @@ static void virtio_iommu_device_unrealize(DeviceState *dev)
virtio_cleanup(vdev);
}
-static void virtio_iommu_device_reset(VirtIODevice *vdev)
+static void virtio_iommu_device_reset_exit(Object *obj, ResetType type)
{
- VirtIOIOMMU *s = VIRTIO_IOMMU(vdev);
+ VirtIOIOMMU *s = VIRTIO_IOMMU(obj);
- trace_virtio_iommu_device_reset();
+ trace_virtio_iommu_device_reset_exit();
if (s->domains) {
g_tree_destroy(s->domains);
@@ -1669,6 +1669,7 @@ static void virtio_iommu_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+ ResettableClass *rc = RESETTABLE_CLASS(klass);
device_class_set_props(dc, virtio_iommu_properties);
dc->vmsd = &vmstate_virtio_iommu;
@@ -1676,7 +1677,12 @@ static void virtio_iommu_class_init(ObjectClass *klass, void *data)
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
vdc->realize = virtio_iommu_device_realize;
vdc->unrealize = virtio_iommu_device_unrealize;
- vdc->reset = virtio_iommu_device_reset;
+
+ /*
+ * Use 'exit' reset phase to make sure all DMA requests
+ * have been quiesced during 'enter' or 'hold' phase
+ */
+ rc->phases.exit = virtio_iommu_device_reset_exit;
vdc->get_config = virtio_iommu_get_config;
vdc->set_config = virtio_iommu_set_config;
vdc->get_features = virtio_iommu_get_features;
--
2.48.1

View File

@ -0,0 +1,80 @@
From cf92fd8487195ac45bfbdad15168eaec70f3aaa9 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 10 Jun 2025 13:36:39 +0100
Subject: [PATCH 27/57] include/qemu/compiler: add QEMU_UNINITIALIZED attribute
macro
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [1/30] 43c2412d318b6d8e0dcb0b37340640a9d90c3188 (stefanha/centos-stream-qemu-kvm)
The QEMU_UNINITIALIZED macro is to be used to skip the default compiler
variable initialization done by -ftrivial-auto-var-init=zero.
Use this in cases where there a method in the device I/O path (or other
important hot paths), that has large variables on the stack. A rule of
thumb is that "large" means a method with 4kb data in the local stack
frame. Any variables which are KB in size, should be annotated with this
attribute, to pre-emptively eliminate any potential overhead from the
compiler zero'ing memory.
Given that this turns off a security hardening feature, when using this
to flag variables, it is important that the code is double-checked to
ensure there is no possible use of uninitialized data in the method.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Message-id: 20250610123709.835102-2-berrange@redhat.com
[DB: split off patch & rewrite guidance on when to use the annotation]
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit c653b67d1863b7ebfa67f7c9f4aec209d7b5ced5)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Conflicts:
include/qemu/compiler.h
Context conflict due to clang Thread Safety Analysis macros.
---
include/qemu/compiler.h | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h
index c06954ccb4..cc193d5b82 100644
--- a/include/qemu/compiler.h
+++ b/include/qemu/compiler.h
@@ -212,6 +212,26 @@
# define QEMU_USED
#endif
+/*
+ * Disable -ftrivial-auto-var-init on a local variable.
+ *
+ * Use this in cases where there a method in the device I/O path (or other
+ * important hot paths), that has large variables on the stack. A rule of
+ * thumb is that "large" means a method with 4kb data in the local stack
+ * frame. Any variables which are KB in size, should be annotated with this
+ * attribute, to pre-emptively eliminate any potential overhead from the
+ * compiler's implicit zero'ing of memory.
+ *
+ * Given that this turns off a security hardening feature, when using this
+ * to flag variables, it is important that the code is double-checked to
+ * ensure there is no possible use of uninitialized data in the method.
+ */
+#if __has_attribute(uninitialized)
+# define QEMU_UNINITIALIZED __attribute__((uninitialized))
+#else
+# define QEMU_UNINITIALIZED
+#endif
+
/*
* Ugly CPP trick that is like "defined FOO", but also works in C
* code. Useful to replace #ifdef with "if" statements; assumes
--
2.39.3

View File

@ -0,0 +1,73 @@
From 4545870823aea92b18a7e747b686b666d08006a4 Mon Sep 17 00:00:00 2001
From: Juraj Marcin <jmarcin@redhat.com>
Date: Wed, 21 May 2025 15:52:30 +0200
Subject: [PATCH 08/57] io: Fix partial struct copy in
qio_dns_resolver_lookup_sync_inet()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Juraj Marcin <None>
RH-MergeRequest: 369: util/qemu-sockets: Introduce inet socket options controlling TCP keep-alive
RH-Jira: RHEL-67104
RH-Acked-by: Peter Xu <peterx@redhat.com>
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [1/7] 92c8b3e63c22a3ca6e5adc76cac1a9f812034912 (JurajMarcin/centos-src-qemu-kvm)
Commit aec21d3175 (qapi: Add InetSocketAddress member keep-alive)
introduces the keep-alive flag, but this flag is not copied together
with other options in qio_dns_resolver_lookup_sync_inet().
This patch fixes this issue and also prevents future ones by copying the
entire structure first and only then overriding a few attributes that
need to be different.
Fixes: aec21d31756c (qapi: Add InetSocketAddress member keep-alive)
Signed-off-by: Juraj Marcin <jmarcin@redhat.com>
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
(cherry picked from commit 0dc051aa85e1bd68d5c5110fa8af69204e6dbd3d)
JIRA: https://issues.redhat.com/browse/RHEL-67104
Signed-off-by: Juraj Marcin <jmarcin@redhat.com>
---
io/dns-resolver.c | 21 +++++----------------
1 file changed, 5 insertions(+), 16 deletions(-)
diff --git a/io/dns-resolver.c b/io/dns-resolver.c
index 53b0e8407a..3712438f82 100644
--- a/io/dns-resolver.c
+++ b/io/dns-resolver.c
@@ -111,22 +111,11 @@ static int qio_dns_resolver_lookup_sync_inet(QIODNSResolver *resolver,
uaddr, INET6_ADDRSTRLEN, uport, 32,
NI_NUMERICHOST | NI_NUMERICSERV);
- newaddr->u.inet = (InetSocketAddress){
- .host = g_strdup(uaddr),
- .port = g_strdup(uport),
- .has_numeric = true,
- .numeric = true,
- .has_to = iaddr->has_to,
- .to = iaddr->to,
- .has_ipv4 = iaddr->has_ipv4,
- .ipv4 = iaddr->ipv4,
- .has_ipv6 = iaddr->has_ipv6,
- .ipv6 = iaddr->ipv6,
-#ifdef HAVE_IPPROTO_MPTCP
- .has_mptcp = iaddr->has_mptcp,
- .mptcp = iaddr->mptcp,
-#endif
- };
+ newaddr->u.inet = *iaddr;
+ newaddr->u.inet.host = g_strdup(uaddr),
+ newaddr->u.inet.port = g_strdup(uport),
+ newaddr->u.inet.has_numeric = true,
+ newaddr->u.inet.numeric = true,
(*addrs)[i] = newaddr;
}
--
2.39.3

View File

@ -0,0 +1,42 @@
From 8832268a98104ba3065a57dedcd3db43231512ba Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Fri, 9 May 2025 15:40:22 -0500
Subject: [PATCH 07/16] iotests: Improve iotest 194 to mirror data
RH-Author: Eric Blake <eblake@redhat.com>
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
RH-Jira: RHEL-82906 RHEL-83015
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [5/14] bfbe8eab1035480cef9d69d1974ba66b755b1b60 (ebblake/centos-qemu-kvm)
Mirroring a completely sparse image to a sparse destination should be
practically instantaneous. It isn't yet, but the test will be more
realistic if it has some non-zero to mirror as well as the holes.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20250509204341.3553601-20-eblake@redhat.com>
(cherry picked from commit eb89627899bb84148d272394e885725eff456ae9)
Jira: https://issues.redhat.com/browse/RHEL-82906
Jira: https://issues.redhat.com/browse/RHEL-83015
Signed-off-by: Eric Blake <eblake@redhat.com>
---
tests/qemu-iotests/194 | 1 +
1 file changed, 1 insertion(+)
diff --git a/tests/qemu-iotests/194 b/tests/qemu-iotests/194
index c0ce82dd25..d0b9c084f5 100755
--- a/tests/qemu-iotests/194
+++ b/tests/qemu-iotests/194
@@ -34,6 +34,7 @@ with iotests.FilePath('source.img') as source_img_path, \
img_size = '1G'
iotests.qemu_img_create('-f', iotests.imgfmt, source_img_path, img_size)
+ iotests.qemu_io('-f', iotests.imgfmt, '-c', 'write 512M 1M', source_img_path)
iotests.qemu_img_create('-f', iotests.imgfmt, dest_img_path, img_size)
iotests.log('Launching VMs...')
--
2.48.1

View File

@ -0,0 +1,68 @@
From 644f39de9e2466a9570833b1070acf47a53863ea Mon Sep 17 00:00:00 2001
From: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
Date: Fri, 9 May 2025 15:40:29 -0500
Subject: [PATCH 14/16] iotests/common.rc: add disk_usage function
RH-Author: Eric Blake <eblake@redhat.com>
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
RH-Jira: RHEL-82906 RHEL-83015
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [12/14] 0e5d4217f97fe6e952de23eedbc2b8d9c7600665 (ebblake/centos-qemu-kvm)
Move the definition from iotests/250 to common.rc. This is used to
detect real disk usage of sparse files. In particular, we want to use
it for checking subclusters-based discards.
Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
Reviewed-by: Alexander Ivanov <alexander.ivanov@virtuozzo.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-ID: <20240913163942.423050-6-andrey.drobyshev@virtuozzo.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20250509204341.3553601-27-eblake@redhat.com>
(cherry picked from commit be9bac072ede6e6aa27079f59efcf17b56bd7b26)
Jira: https://issues.redhat.com/browse/RHEL-82906
Jira: https://issues.redhat.com/browse/RHEL-83015
Signed-off-by: Eric Blake <eblake@redhat.com>
---
tests/qemu-iotests/250 | 5 -----
tests/qemu-iotests/common.rc | 6 ++++++
2 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/tests/qemu-iotests/250 b/tests/qemu-iotests/250
index af48f83aba..c0a0dbc0ff 100755
--- a/tests/qemu-iotests/250
+++ b/tests/qemu-iotests/250
@@ -52,11 +52,6 @@ _unsupported_imgopts data_file
# bdrv_co_truncate(bs->file) call in qcow2_co_truncate(), which might succeed
# anyway.
-disk_usage()
-{
- du --block-size=1 $1 | awk '{print $1}'
-}
-
size=2100M
_make_test_img -o "cluster_size=1M,preallocation=metadata" $size
diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index 95c12577dd..237f746af8 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -140,6 +140,12 @@ _optstr_add()
fi
}
+# report real disk usage for sparse files
+disk_usage()
+{
+ du --block-size=1 "$1" | awk '{print $1}'
+}
+
# Set the variables to the empty string to turn Valgrind off
# for specific processes, e.g.
# $ VALGRIND_QEMU_IO= ./check -qcow2 -valgrind 015
--
2.48.1

View File

@ -0,0 +1,110 @@
From 0277328b5a2d1df5d9843423ab5f5fa9481bad79 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Fri, 25 Apr 2025 13:17:12 +0100
Subject: [PATCH 1/5] meson/configure: add 'valgrind' option & --{en,
dis}able-valgrind flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Daniel P. Berrangé <berrange@redhat.com>
RH-MergeRequest: 359: distro: add an explicit valgrind-devel build dep
RH-Jira: RHEL-88153
RH-Acked-by: Eric Blake <eblake@redhat.com>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [1/2] ba9bc44ef9cef6fa76e2092500608575f223f1f7 (berrange/centos-src-qemu)
Currently valgrind debugging support for coroutine stacks is enabled
unconditionally when valgrind/valgrind.h is found. There is no way
to disable valgrind support if valgrind.h is present in the build env.
This is bad for distros, as an dependency far down the chain may cause
valgrind.h to become installed, inadvertently enabling QEMU's valgrind
debugging support. It also means if a distro wants valgrind support
there is no way to mandate this.
The solution is to add a 'valgrind' build feature to meson and thus
configure script.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-ID: <20250425121713.1913424-1-berrange@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
(cherry picked from commit 6b1c744ec0d66d6d568f9a156282153fc11a21cf)
Conflicts:
meson.build - context from upstream is not present in older tree
---
meson.build | 13 ++++++++++++-
meson_options.txt | 2 ++
scripts/meson-buildoptions.sh | 3 +++
3 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/meson.build b/meson.build
index 1dd97c6f49..5bb2b757c3 100644
--- a/meson.build
+++ b/meson.build
@@ -2463,7 +2463,17 @@ config_host_data.set('CONFIG_FSTRIM', qga_fstrim)
# has_header
config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h'))
config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h'))
-config_host_data.set('CONFIG_VALGRIND_H', cc.has_header('valgrind/valgrind.h'))
+valgrind = false
+if get_option('valgrind').allowed()
+ if cc.has_header('valgrind/valgrind.h')
+ valgrind = true
+ else
+ if get_option('valgrind').enabled()
+ error('valgrind requested but valgrind.h not found')
+ endif
+ endif
+endif
+config_host_data.set('CONFIG_VALGRIND_H', valgrind)
config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h'))
config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h'))
config_host_data.set('HAVE_PTY_H', cc.has_header('pty.h'))
@@ -4549,6 +4559,7 @@ summary_info += {'libdw': libdw}
if host_os == 'freebsd'
summary_info += {'libinotify-kqueue': inotify}
endif
+summary_info += {'valgrind': valgrind}
summary(summary_info, bool_yn: true, section: 'Dependencies')
if host_arch == 'unknown'
diff --git a/meson_options.txt b/meson_options.txt
index aa2ba0baef..da06441fdf 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -113,6 +113,8 @@ option('dbus_display', type: 'feature', value: 'auto',
description: '-display dbus support')
option('tpm', type : 'feature', value : 'auto',
description: 'TPM support')
+option('valgrind', type : 'feature', value: 'auto',
+ description: 'valgrind debug support for coroutine stacks')
# Do not enable it by default even for Mingw32, because it doesn't
# work on Wine.
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 5f0cbfc725..251470ea6d 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -191,6 +191,7 @@ meson_options_help() {
printf "%s\n" ' u2f U2F emulation support'
printf "%s\n" ' uadk UADK Library support'
printf "%s\n" ' usb-redir libusbredir support'
+ printf "%s\n" ' valgrind valgrind debug support for coroutine stacks'
printf "%s\n" ' vde vde network backend support'
printf "%s\n" ' vdi vdi image format support'
printf "%s\n" ' vduse-blk-export'
@@ -509,6 +510,8 @@ _meson_option_parse() {
--disable-uadk) printf "%s" -Duadk=disabled ;;
--enable-usb-redir) printf "%s" -Dusb_redir=enabled ;;
--disable-usb-redir) printf "%s" -Dusb_redir=disabled ;;
+ --enable-valgrind) printf "%s" -Dvalgrind=enabled ;;
+ --disable-valgrind) printf "%s" -Dvalgrind=disabled ;;
--enable-vde) printf "%s" -Dvde=enabled ;;
--disable-vde) printf "%s" -Dvde=disabled ;;
--enable-vdi) printf "%s" -Dvdi=enabled ;;
--
2.48.1

View File

@ -0,0 +1,180 @@
From 5d7d7a2ec6301f4d0b0dbea4fbdcab4e41a9cf07 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Thu, 20 Feb 2025 08:24:59 -0500
Subject: [PATCH 7/9] migration: Fix UAF for incoming migration on
MigrationState
RH-Author: Peter Xu <peterx@redhat.com>
RH-MergeRequest: 344: migration: Fix UAF for incoming migration on MigrationState
RH-Jira: RHEL-69775
RH-Acked-by: Juraj Marcin <None>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [1/1] 106e2b4c1c461202c912b5e3ea7e586c4ab05d8c (peterx/qemu-kvm)
On the incoming migration side, QEMU uses a coroutine to load all the VM
states. Inside, it may reference MigrationState on global states like
migration capabilities, parameters, error state, shared mutexes and more.
However there's nothing yet to make sure MigrationState won't get
destroyed (e.g. after migration_shutdown()). Meanwhile there's also no API
available to remove the incoming coroutine in migration_shutdown(),
avoiding it to access the freed elements.
There's a bug report showing this can happen and crash dest QEMU when
migration is cancelled on source.
When it happens, the dest main thread is trying to cleanup everything:
#0 qemu_aio_coroutine_enter
#1 aio_dispatch_handler
#2 aio_poll
#3 monitor_cleanup
#4 qemu_cleanup
#5 qemu_default_main
Then it found the migration incoming coroutine, schedule it (even after
migration_shutdown()), causing crash:
#0 __pthread_kill_implementation
#1 __pthread_kill_internal
#2 __GI_raise
#3 __GI_abort
#4 __assert_fail_base
#5 __assert_fail
#6 qemu_mutex_lock_impl
#7 qemu_lockable_mutex_lock
#8 qemu_lockable_lock
#9 qemu_lockable_auto_lock
#10 migrate_set_error
#11 process_incoming_migration_co
#12 coroutine_trampoline
To fix it, take a refcount after an incoming setup is properly done when
qmp_migrate_incoming() succeeded the 1st time. As it's during a QMP
handler which needs BQL, it means the main loop is still alive (without
going into cleanups, which also needs BQL).
Releasing the refcount now only until the incoming migration coroutine
finished or failed. Hence the refcount is valid for both (1) setup phase
of incoming ports, mostly IO watches (e.g. qio_channel_add_watch_full()),
and (2) the incoming coroutine itself (process_incoming_migration_co()).
Note that we can't unref in migration_incoming_state_destroy(), because
both qmp_xen_load_devices_state() and load_snapshot() will use it without
an incoming migration. Those hold BQL so they're not prone to this issue.
PS: I suspect nobody uses Xen's command at all, as it didn't register yank,
hence AFAIU the command should crash on master when trying to unregister
yank in migration_incoming_state_destroy().. but that's another story.
Also note that in some incoming failure cases we may not always unref the
MigrationState refcount, which is a trade-off to keep things simple. We
could make it accurate, but it can be an overkill. Some examples:
- Unlike most of the rest protocols, socket_start_incoming_migration()
may create net listener after incoming port setup sucessfully.
It means we can't unref in migration_channel_process_incoming() as a
generic path because socket protocol might keep using MigrationState.
- For either socket or file, multiple IO watches might be created, it
means logically each IO watch needs to take one refcount for
MigrationState so as to be 100% accurate on ownership of refcount taken.
In general, we at least need per-protocol handling to make it accurate,
which can be an overkill if we know incoming failed after all. Add a short
comment to explain that when taking the refcount in qmp_migrate_incoming().
Bugzilla: https://issues.redhat.com/browse/RHEL-69775
Tested-by: Yan Fu <yafu@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Message-ID: <20250220132459.512610-1-peterx@redhat.com>
Signed-off-by: Fabiano Rosas <farosas@suse.de>
(cherry picked from commit d657a14de5d597bbfe7b54e4c4f0646f440e98ad)
Signed-off-by: Peter Xu <peterx@redhat.com>
---
migration/migration.c | 40 ++++++++++++++++++++++++++++++++++++++--
1 file changed, 38 insertions(+), 2 deletions(-)
diff --git a/migration/migration.c b/migration/migration.c
index 999d4cac54..aabdc45c16 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -115,6 +115,27 @@ static void migration_downtime_start(MigrationState *s)
s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
}
+/*
+ * This is unfortunate: incoming migration actually needs the outgoing
+ * migration state (MigrationState) to be there too, e.g. to query
+ * capabilities, parameters, using locks, setup errors, etc.
+ *
+ * NOTE: when calling this, making sure current_migration exists and not
+ * been freed yet! Otherwise trying to access the refcount is already
+ * an use-after-free itself..
+ *
+ * TODO: Move shared part of incoming / outgoing out into separate object.
+ * Then this is not needed.
+ */
+static void migrate_incoming_ref_outgoing_state(void)
+{
+ object_ref(migrate_get_current());
+}
+static void migrate_incoming_unref_outgoing_state(void)
+{
+ object_unref(migrate_get_current());
+}
+
static void migration_downtime_end(MigrationState *s)
{
int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
@@ -821,7 +842,7 @@ process_incoming_migration_co(void *opaque)
* postcopy thread.
*/
trace_process_incoming_migration_co_postcopy_end_main();
- return;
+ goto out;
}
/* Else if something went wrong then just fall out of the normal exit */
}
@@ -837,7 +858,8 @@ process_incoming_migration_co(void *opaque)
}
migration_bh_schedule(process_incoming_migration_bh, mis);
- return;
+ goto out;
+
fail:
migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
MIGRATION_STATUS_FAILED);
@@ -854,6 +876,9 @@ fail:
exit(EXIT_FAILURE);
}
+out:
+ /* Pairs with the refcount taken in qmp_migrate_incoming() */
+ migrate_incoming_unref_outgoing_state();
}
/**
@@ -1875,6 +1900,17 @@ void qmp_migrate_incoming(const char *uri, bool has_channels,
return;
}
+ /*
+ * Making sure MigrationState is available until incoming migration
+ * completes.
+ *
+ * NOTE: QEMU _might_ leak this refcount in some failure paths, but
+ * that's OK. This is the minimum change we need to at least making
+ * sure success case is clean on the refcount. We can try harder to
+ * make it accurate for any kind of failures, but it might be an
+ * overkill and doesn't bring us much benefit.
+ */
+ migrate_incoming_ref_outgoing_state();
once = false;
}
--
2.48.1

View File

@ -0,0 +1,237 @@
From 1fa31324da8ebba64a44c1e9b64f7e59c29f3d75 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Thu, 24 Apr 2025 18:07:05 -0400
Subject: [PATCH 1/2] migration/postcopy: Spatial locality page hint for
preempt mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Peter Xu <peterx@redhat.com>
RH-MergeRequest: 358: migration/postcopy: Spatial locality page hint for preempt mode
RH-Jira: RHEL-85159
RH-Acked-by: Juraj Marcin <None>
RH-Acked-by: Daniel P. Berrangé <berrange@redhat.com>
RH-Commit: [1/1] f5bce349c80f98428c73a3898f87d4d10ec2f4bd (peterx/qemu-kvm)
The preempt mode postcopy has been introduced for a while. From latency
POV, it should always win the vanilla postcopy.
However there's one thing missing when preempt mode is enabled right now,
which is the spatial locality hint when there're page requests from the
destination side.
In vanilla postcopy, as long as a page request was unqueued, it will update
the PSS of the precopy background stream, so that after a page request the
background thread will move the pages after whatever was requested. It's
pretty much a natural behavior when there's only one channel anyway, and
one scanner to send the pages.
Preempt mode didn't follow that, because preempt mode has its own channel
and its own PSS (which doesn't linearly scan the guest memory, but
dedicated to resolve page requested from destination). So the page request
process and the background migration process are completely separate.
This patch adds the hint explicitly for preempt mode. With that, whenever
the preempt mode receives a page request on the source, it will service the
remote page fault in the return path, then it'll provide a hint to the
background thread so that we'll start sending the pages right after the
requested ones in the background, assuming the follow up pages have a
higher chance to be accessed later.
NOTE: since the background migration thread and return path thread run
completely concurrently, it doesn't always mean the hint will be applied
every single time. For example, it's possible that the return path thread
receives multiple page requests in a row without the background thread
getting the chance to consume one. In such case, the preempt thread only
provide the hint if the previous hint has been consumed. After all,
there's no point queuing hints when we only have one linear scanner.
This could measureably improve the simple sequential memory access pattern
during postcopy (when preempt is on). For random accesses, I can measure a
slight increase of remote page fault latency from ~500us -> ~600us, that
could be a trade-off to have such hint mechanism, and after all that's
still greatly improved comparing to vanilla postcopy on random (~10ms).
The patch is verified by our QE team in a video streaming test case, to
reduce the pause of the video from ~1min to a few seconds when switching
over to postcopy with preempt mode.
Reported-by: Xiaohui Li <xiaohli@redhat.com>
Tested-by: Xiaohui Li <xiaohli@redhat.com>
Reviewed-by: Juraj Marcin <jmarcin@redhat.com>
Link: https://lore.kernel.org/r/20250424220705.195544-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
(cherry picked from commit 20d82622812d888478d04a2d0d8575d70eb5d749)
Signed-off-by: Peter Xu <peterx@redhat.com>
---
migration/ram.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 96 insertions(+), 1 deletion(-)
diff --git a/migration/ram.c b/migration/ram.c
index edec1a2d07..0803f85b8a 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -112,6 +112,36 @@
XBZRLECacheStats xbzrle_counters;
+/*
+ * This structure locates a specific location of a guest page. In QEMU,
+ * it's described in a tuple of (ramblock, offset).
+ */
+struct PageLocation {
+ RAMBlock *block;
+ unsigned long offset;
+};
+typedef struct PageLocation PageLocation;
+
+/**
+ * PageLocationHint: describes a hint to a page location
+ *
+ * @valid set if the hint is vaild and to be consumed
+ * @location: the hint content
+ *
+ * In postcopy preempt mode, the urgent channel may provide hints to the
+ * background channel, so that QEMU source can try to migrate whatever is
+ * right after the requested urgent pages.
+ *
+ * This is based on the assumption that the VM (already running on the
+ * destination side) tends to access the memory with spatial locality.
+ * This is also the default behavior of vanilla postcopy (preempt off).
+ */
+struct PageLocationHint {
+ bool valid;
+ PageLocation location;
+};
+typedef struct PageLocationHint PageLocationHint;
+
/* used by the search for pages to send */
struct PageSearchStatus {
/* The migration channel used for a specific host page */
@@ -414,6 +444,13 @@ struct RAMState {
* RAM migration.
*/
unsigned int postcopy_bmap_sync_requested;
+ /*
+ * Page hint during postcopy when preempt mode is on. Return path
+ * thread sets it, while background migration thread consumes it.
+ *
+ * Protected by @bitmap_mutex.
+ */
+ PageLocationHint page_hint;
};
typedef struct RAMState RAMState;
@@ -2091,6 +2128,21 @@ static void pss_host_page_finish(PageSearchStatus *pss)
pss->host_page_start = pss->host_page_end = 0;
}
+static void ram_page_hint_update(RAMState *rs, PageSearchStatus *pss)
+{
+ PageLocationHint *hint = &rs->page_hint;
+
+ /* If there's a pending hint not consumed, don't bother */
+ if (hint->valid) {
+ return;
+ }
+
+ /* Provide a hint to the background stream otherwise */
+ hint->location.block = pss->block;
+ hint->location.offset = pss->page;
+ hint->valid = true;
+}
+
/*
* Send an urgent host page specified by `pss'. Need to be called with
* bitmap_mutex held.
@@ -2136,6 +2188,7 @@ out:
/* For urgent requests, flush immediately if sent */
if (sent) {
qemu_fflush(pss->pss_channel);
+ ram_page_hint_update(rs, pss);
}
return ret;
}
@@ -2223,6 +2276,30 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
return (res < 0 ? res : pages);
}
+static bool ram_page_hint_valid(RAMState *rs)
+{
+ /* There's only page hint during postcopy preempt mode */
+ if (!postcopy_preempt_active()) {
+ return false;
+ }
+
+ return rs->page_hint.valid;
+}
+
+static void ram_page_hint_collect(RAMState *rs, RAMBlock **block,
+ unsigned long *page)
+{
+ PageLocationHint *hint = &rs->page_hint;
+
+ assert(hint->valid);
+
+ *block = hint->location.block;
+ *page = hint->location.offset;
+
+ /* Mark the hint consumed */
+ hint->valid = false;
+}
+
/**
* ram_find_and_save_block: finds a dirty page and sends it to f
*
@@ -2239,6 +2316,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
static int ram_find_and_save_block(RAMState *rs)
{
PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
+ unsigned long next_page;
+ RAMBlock *next_block;
int pages = 0;
/* No dirty page as there is zero RAM */
@@ -2258,7 +2337,14 @@ static int ram_find_and_save_block(RAMState *rs)
rs->last_page = 0;
}
- pss_init(pss, rs->last_seen_block, rs->last_page);
+ if (ram_page_hint_valid(rs)) {
+ ram_page_hint_collect(rs, &next_block, &next_page);
+ } else {
+ next_block = rs->last_seen_block;
+ next_page = rs->last_page;
+ }
+
+ pss_init(pss, next_block, next_page);
while (true){
if (!get_queued_page(rs, pss)) {
@@ -2392,6 +2478,13 @@ static void ram_save_cleanup(void *opaque)
migration_ops = NULL;
}
+static void ram_page_hint_reset(PageLocationHint *hint)
+{
+ hint->location.block = NULL;
+ hint->location.offset = 0;
+ hint->valid = false;
+}
+
static void ram_state_reset(RAMState *rs)
{
int i;
@@ -2404,6 +2497,8 @@ static void ram_state_reset(RAMState *rs)
rs->last_page = 0;
rs->last_version = ram_list.version;
rs->xbzrle_started = false;
+
+ ram_page_hint_reset(&rs->page_hint);
}
#define MAX_WAIT 50 /* ms, half buffered_file limit */
--
2.48.1

View File

@ -0,0 +1,295 @@
From a5f6042a0c80daf3672fa071b724cb05e6f6e928 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Fri, 9 May 2025 15:40:25 -0500
Subject: [PATCH 10/16] mirror: Allow QMP override to declare target already
zero
RH-Author: Eric Blake <eblake@redhat.com>
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
RH-Jira: RHEL-82906 RHEL-83015
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [8/14] fb054864175d83e9d232464295b170808bee0e6c (ebblake/centos-qemu-kvm)
QEMU has an optimization for a just-created drive-mirror destination
that is not possible for blockdev-mirror (which can't create the
destination) - any time we know the destination starts life as all
zeroes, we can skip a pre-zeroing pass on the destination. Recent
patches have added an improved heuristic for detecting if a file
contains all zeroes, and we plan to use that heuristic in upcoming
patches. But since a heuristic cannot quickly detect all scenarios,
and there may be cases where the caller is aware of information that
QEMU cannot learn quickly, it makes sense to have a way to tell QEMU
to assume facts about the destination that can make the mirror
operation faster. Given our existing example of "qemu-img convert
--target-is-zero", it is time to expose this override in QMP for
blockdev-mirror as well.
This patch results in some slight redundancy between the older
s->zero_target (set any time mode==FULL and the destination image was
not just created - ie. clear if drive-mirror is asking to skip the
pre-zero pass) and the newly-introduced s->target_is_zero (in addition
to the QMP override, it is set when drive-mirror creates the
destination image); this will be cleaned up in the next patch.
There is also a subtlety that we must consider. When drive-mirror is
passing target_is_zero on behalf of a just-created image, we know the
image is sparse (skipping the pre-zeroing keeps it that way), so it
doesn't matter whether the destination also has "discard":"unmap" and
"detect-zeroes":"unmap". But now that we are letting the user set the
knob for target-is-zero, if the user passes a pre-existing file that
is fully allocated, it is fine to leave the file fully allocated under
"detect-zeroes":"on", but if the file is open with
"detect-zeroes":"unmap", we should really be trying harder to punch
holes in the destination for every region of zeroes copied from the
source. The easiest way to do this is to still run the pre-zeroing
pass (turning the entire destination file sparse before populating
just the allocated portions of the source), even though that currently
results in double I/O to the portions of the file that are allocated.
A later patch will add further optimizations to reduce redundant
zeroing I/O during the mirror operation.
Since "target-is-zero":true is designed for optimizations, it is okay
to silently ignore the parameter rather than erroring if the user ever
sets the parameter in a scenario where the mirror job can't exploit it
(for example, when doing "sync":"top" instead of "sync":"full", we
can't pre-zero, so setting the parameter won't make a speed
difference).
Signed-off-by: Eric Blake <eblake@redhat.com>
Acked-by: Markus Armbruster <armbru@redhat.com>
Message-ID: <20250509204341.3553601-23-eblake@redhat.com>
Reviewed-by: Sunny Zhu <sunnyzhyy@qq.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit d17a34bfb94bda3a89d7320ae67255ded1d8c939)
Jira: https://issues.redhat.com/browse/RHEL-82906
Jira: https://issues.redhat.com/browse/RHEL-83015
Signed-off-by: Eric Blake <eblake@redhat.com>
---
block/mirror.c | 27 ++++++++++++++++++++++----
blockdev.c | 18 ++++++++++-------
include/block/block_int-global-state.h | 3 ++-
qapi/block-core.json | 8 +++++++-
tests/unit/test-block-iothread.c | 2 +-
5 files changed, 44 insertions(+), 14 deletions(-)
diff --git a/block/mirror.c b/block/mirror.c
index c8bbaa0b35..bba3e3b05c 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -55,6 +55,8 @@ typedef struct MirrorBlockJob {
BlockMirrorBackingMode backing_mode;
/* Whether the target image requires explicit zero-initialization */
bool zero_target;
+ /* Whether the target should be assumed to be already zero initialized */
+ bool target_is_zero;
/*
* To be accesssed with atomics. Written only under the BQL (required by the
* current implementation of mirror_change()).
@@ -844,12 +846,26 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
BlockDriverState *target_bs = blk_bs(s->target);
int ret = -EIO;
int64_t count;
+ bool punch_holes =
+ target_bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP &&
+ bdrv_can_write_zeroes_with_unmap(target_bs);
bdrv_graph_co_rdlock();
bs = s->mirror_top_bs->backing->bs;
bdrv_graph_co_rdunlock();
- if (s->zero_target) {
+ if (s->zero_target && (!s->target_is_zero || punch_holes)) {
+ /*
+ * Here, we are in FULL mode; our goal is to avoid writing
+ * zeroes if the destination already reads as zero, except
+ * when we are trying to punch holes. This is possible if
+ * zeroing happened externally (s->target_is_zero) or if we
+ * have a fast way to pre-zero the image (the dirty bitmap
+ * will be populated later by the non-zero portions, the same
+ * as for TOP mode). If pre-zeroing is not fast, or we need
+ * to punch holes, then our only recourse is to write the
+ * entire image.
+ */
if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, s->bdev_length);
return 0;
@@ -1714,7 +1730,7 @@ static BlockJob *mirror_start_job(
uint32_t granularity, int64_t buf_size,
MirrorSyncMode sync_mode,
BlockMirrorBackingMode backing_mode,
- bool zero_target,
+ bool zero_target, bool target_is_zero,
BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
bool unmap,
@@ -1883,6 +1899,7 @@ static BlockJob *mirror_start_job(
s->sync_mode = sync_mode;
s->backing_mode = backing_mode;
s->zero_target = zero_target;
+ s->target_is_zero = target_is_zero;
qatomic_set(&s->copy_mode, copy_mode);
s->base = base;
s->base_overlay = bdrv_find_overlay(bs, base);
@@ -2011,7 +2028,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
int creation_flags, int64_t speed,
uint32_t granularity, int64_t buf_size,
MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
- bool zero_target,
+ bool zero_target, bool target_is_zero,
BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
bool unmap, const char *filter_node_name,
@@ -2034,7 +2051,8 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
mirror_start_job(job_id, bs, creation_flags, target, replaces,
speed, granularity, buf_size, mode, backing_mode,
- zero_target, on_source_error, on_target_error, unmap,
+ zero_target,
+ target_is_zero, on_source_error, on_target_error, unmap,
NULL, NULL, &mirror_job_driver, base, false,
filter_node_name, true, copy_mode, false, errp);
}
@@ -2062,6 +2080,7 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
job = mirror_start_job(
job_id, bs, creation_flags, base, NULL, speed, 0, 0,
MIRROR_SYNC_MODE_TOP, MIRROR_LEAVE_BACKING_CHAIN, false,
+ false,
on_error, on_error, true, cb, opaque,
&commit_active_job_driver, base, auto_complete,
filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND,
diff --git a/blockdev.c b/blockdev.c
index 70046b6690..db11a99312 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2795,7 +2795,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
const char *replaces,
enum MirrorSyncMode sync,
BlockMirrorBackingMode backing_mode,
- bool zero_target,
+ bool zero_target, bool target_is_zero,
bool has_speed, int64_t speed,
bool has_granularity, uint32_t granularity,
bool has_buf_size, int64_t buf_size,
@@ -2906,11 +2906,10 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
/* pass the node name to replace to mirror start since it's loose coupling
* and will allow to check whether the node still exist at mirror completion
*/
- mirror_start(job_id, bs, target,
- replaces, job_flags,
+ mirror_start(job_id, bs, target, replaces, job_flags,
speed, granularity, buf_size, sync, backing_mode, zero_target,
- on_source_error, on_target_error, unmap, filter_node_name,
- copy_mode, errp);
+ target_is_zero, on_source_error, on_target_error, unmap,
+ filter_node_name, copy_mode, errp);
}
void qmp_drive_mirror(DriveMirror *arg, Error **errp)
@@ -2925,6 +2924,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
int64_t size;
const char *format = arg->format;
bool zero_target;
+ bool target_is_zero;
int ret;
bs = qmp_get_root_bs(arg->device, errp);
@@ -3041,6 +3041,8 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
zero_target = (arg->sync == MIRROR_SYNC_MODE_FULL &&
(arg->mode == NEW_IMAGE_MODE_EXISTING ||
!bdrv_has_zero_init(target_bs)));
+ target_is_zero = (arg->mode != NEW_IMAGE_MODE_EXISTING &&
+ bdrv_has_zero_init(target_bs));
bdrv_graph_rdunlock_main_loop();
@@ -3052,7 +3054,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
blockdev_mirror_common(arg->job_id, bs, target_bs,
arg->replaces, arg->sync,
- backing_mode, zero_target,
+ backing_mode, zero_target, target_is_zero,
arg->has_speed, arg->speed,
arg->has_granularity, arg->granularity,
arg->has_buf_size, arg->buf_size,
@@ -3082,6 +3084,7 @@ void qmp_blockdev_mirror(const char *job_id,
bool has_copy_mode, MirrorCopyMode copy_mode,
bool has_auto_finalize, bool auto_finalize,
bool has_auto_dismiss, bool auto_dismiss,
+ bool has_target_is_zero, bool target_is_zero,
Error **errp)
{
BlockDriverState *bs;
@@ -3112,7 +3115,8 @@ void qmp_blockdev_mirror(const char *job_id,
blockdev_mirror_common(job_id, bs, target_bs,
replaces, sync, backing_mode,
- zero_target, has_speed, speed,
+ zero_target, has_target_is_zero && target_is_zero,
+ has_speed, speed,
has_granularity, granularity,
has_buf_size, buf_size,
has_on_source_error, on_source_error,
diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h
index eb2d92a226..8cf0003ce7 100644
--- a/include/block/block_int-global-state.h
+++ b/include/block/block_int-global-state.h
@@ -140,6 +140,7 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
* @mode: Whether to collapse all images in the chain to the target.
* @backing_mode: How to establish the target's backing chain after completion.
* @zero_target: Whether the target should be explicitly zero-initialized
+ * @target_is_zero: Whether the target already is zero-initialized.
* @on_source_error: The action to take upon error reading from the source.
* @on_target_error: The action to take upon error writing to the target.
* @unmap: Whether to unmap target where source sectors only contain zeroes.
@@ -159,7 +160,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
int creation_flags, int64_t speed,
uint32_t granularity, int64_t buf_size,
MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
- bool zero_target,
+ bool zero_target, bool target_is_zero,
BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
bool unmap, const char *filter_node_name,
diff --git a/qapi/block-core.json b/qapi/block-core.json
index c1af3d1f7d..3969c60b93 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2535,6 +2535,11 @@
# disappear from the query list without user intervention.
# Defaults to true. (Since 3.1)
#
+# @target-is-zero: Assume the destination reads as all zeroes before
+# the mirror started. Setting this to true can speed up the
+# mirror. Setting this to true when the destination is not
+# actually all zero can corrupt the destination. (Since 10.1)
+#
# Since: 2.6
#
# .. qmp-example::
@@ -2554,7 +2559,8 @@
'*on-target-error': 'BlockdevOnError',
'*filter-node-name': 'str',
'*copy-mode': 'MirrorCopyMode',
- '*auto-finalize': 'bool', '*auto-dismiss': 'bool' },
+ '*auto-finalize': 'bool', '*auto-dismiss': 'bool',
+ '*target-is-zero': 'bool'},
'allow-preconfig': true }
##
diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c
index 373b72fdd8..033711d8d7 100644
--- a/tests/unit/test-block-iothread.c
+++ b/tests/unit/test-block-iothread.c
@@ -755,7 +755,7 @@ static void test_propagate_mirror(void)
/* Start a mirror job */
mirror_start("job0", src, target, NULL, JOB_DEFAULT, 0, 0, 0,
- MIRROR_SYNC_MODE_NONE, MIRROR_OPEN_BACKING_CHAIN, false,
+ MIRROR_SYNC_MODE_NONE, MIRROR_OPEN_BACKING_CHAIN, false, false,
BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
false, "filter_node", MIRROR_COPY_MODE_BACKGROUND,
&error_abort);
--
2.48.1

View File

@ -0,0 +1,241 @@
From 5040f835f07f3355ae80b3da2ae83ce35de022e0 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Fri, 9 May 2025 15:40:26 -0500
Subject: [PATCH 11/16] mirror: Drop redundant zero_target parameter
RH-Author: Eric Blake <eblake@redhat.com>
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
RH-Jira: RHEL-82906 RHEL-83015
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [9/14] b84a938c69e3761211b9fee4c59b465d55f61855 (ebblake/centos-qemu-kvm)
The two callers to a mirror job (drive-mirror and blockdev-mirror) set
zero_target precisely when sync mode == FULL, with the one exception
that drive-mirror skips zeroing the target if it was newly created and
reads as zero. But given the previous patch, that exception is
equally captured by target_is_zero.
Meanwhile, there is another slight wrinkle, fortunately caught by
iotest 185: if the caller uses "sync":"top" but the source has no
backing file, the code in blockdev.c was changing sync to be FULL, but
only after it had set zero_target=false. In mirror.c, prior to recent
patches, this didn't matter: the only places that inspected sync were
setting is_none_mode (both TOP and FULL had set that to false), and
mirror_start() setting base = mode == MIRROR_SYNC_MODE_TOP ?
bdrv_backing_chain_next(bs) : NULL. But now that we are passing sync
around, the slammed sync mode would result in a new pre-zeroing pass
even when the user had passed "sync":"top" in an effort to skip
pre-zeroing. Fortunately, the assignment of base when bs has no
backing chain still works out to NULL if we don't slam things. So
with the forced change of sync ripped out of blockdev.c, the sync mode
is passed through the full callstack unmolested, and we can now
reliably reconstruct the same settings as what used to be passed in by
zero_target=false, without the redundant parameter.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-ID: <20250509204341.3553601-24-eblake@redhat.com>
Reviewed-by: Sunny Zhu <sunnyzhyy@qq.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
[eblake: Fix regression in iotest 185]
Signed-off-by: Eric Blake <eblake@redhat.com>
(cherry picked from commit 253b43a29077de9266351e120c600a73b82e9c49)
Jira: https://issues.redhat.com/browse/RHEL-82906
Jira: https://issues.redhat.com/browse/RHEL-83015
Signed-off-by: Eric Blake <eblake@redhat.com>
---
block/mirror.c | 13 +++++--------
blockdev.c | 19 ++++---------------
include/block/block_int-global-state.h | 3 +--
tests/unit/test-block-iothread.c | 2 +-
4 files changed, 11 insertions(+), 26 deletions(-)
diff --git a/block/mirror.c b/block/mirror.c
index bba3e3b05c..b35d12adaa 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -53,8 +53,6 @@ typedef struct MirrorBlockJob {
Error *replace_blocker;
MirrorSyncMode sync_mode;
BlockMirrorBackingMode backing_mode;
- /* Whether the target image requires explicit zero-initialization */
- bool zero_target;
/* Whether the target should be assumed to be already zero initialized */
bool target_is_zero;
/*
@@ -854,7 +852,9 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
bs = s->mirror_top_bs->backing->bs;
bdrv_graph_co_rdunlock();
- if (s->zero_target && (!s->target_is_zero || punch_holes)) {
+ if (s->sync_mode == MIRROR_SYNC_MODE_TOP) {
+ /* In TOP mode, there is no benefit to a pre-zeroing pass. */
+ } else if (!s->target_is_zero || punch_holes) {
/*
* Here, we are in FULL mode; our goal is to avoid writing
* zeroes if the destination already reads as zero, except
@@ -1730,7 +1730,7 @@ static BlockJob *mirror_start_job(
uint32_t granularity, int64_t buf_size,
MirrorSyncMode sync_mode,
BlockMirrorBackingMode backing_mode,
- bool zero_target, bool target_is_zero,
+ bool target_is_zero,
BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
bool unmap,
@@ -1898,7 +1898,6 @@ static BlockJob *mirror_start_job(
s->on_target_error = on_target_error;
s->sync_mode = sync_mode;
s->backing_mode = backing_mode;
- s->zero_target = zero_target;
s->target_is_zero = target_is_zero;
qatomic_set(&s->copy_mode, copy_mode);
s->base = base;
@@ -2028,7 +2027,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
int creation_flags, int64_t speed,
uint32_t granularity, int64_t buf_size,
MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
- bool zero_target, bool target_is_zero,
+ bool target_is_zero,
BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
bool unmap, const char *filter_node_name,
@@ -2051,7 +2050,6 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
mirror_start_job(job_id, bs, creation_flags, target, replaces,
speed, granularity, buf_size, mode, backing_mode,
- zero_target,
target_is_zero, on_source_error, on_target_error, unmap,
NULL, NULL, &mirror_job_driver, base, false,
filter_node_name, true, copy_mode, false, errp);
@@ -2080,7 +2078,6 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
job = mirror_start_job(
job_id, bs, creation_flags, base, NULL, speed, 0, 0,
MIRROR_SYNC_MODE_TOP, MIRROR_LEAVE_BACKING_CHAIN, false,
- false,
on_error, on_error, true, cb, opaque,
&commit_active_job_driver, base, auto_complete,
filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND,
diff --git a/blockdev.c b/blockdev.c
index db11a99312..04fa759e30 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2795,7 +2795,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
const char *replaces,
enum MirrorSyncMode sync,
BlockMirrorBackingMode backing_mode,
- bool zero_target, bool target_is_zero,
+ bool target_is_zero,
bool has_speed, int64_t speed,
bool has_granularity, uint32_t granularity,
bool has_buf_size, int64_t buf_size,
@@ -2862,10 +2862,6 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
return;
}
- if (!bdrv_backing_chain_next(bs) && sync == MIRROR_SYNC_MODE_TOP) {
- sync = MIRROR_SYNC_MODE_FULL;
- }
-
if (!replaces) {
/* We want to mirror from @bs, but keep implicit filters on top */
unfiltered_bs = bdrv_skip_implicit_filters(bs);
@@ -2907,7 +2903,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
* and will allow to check whether the node still exist at mirror completion
*/
mirror_start(job_id, bs, target, replaces, job_flags,
- speed, granularity, buf_size, sync, backing_mode, zero_target,
+ speed, granularity, buf_size, sync, backing_mode,
target_is_zero, on_source_error, on_target_error, unmap,
filter_node_name, copy_mode, errp);
}
@@ -2923,7 +2919,6 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
int flags;
int64_t size;
const char *format = arg->format;
- bool zero_target;
bool target_is_zero;
int ret;
@@ -3038,9 +3033,6 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
}
bdrv_graph_rdlock_main_loop();
- zero_target = (arg->sync == MIRROR_SYNC_MODE_FULL &&
- (arg->mode == NEW_IMAGE_MODE_EXISTING ||
- !bdrv_has_zero_init(target_bs)));
target_is_zero = (arg->mode != NEW_IMAGE_MODE_EXISTING &&
bdrv_has_zero_init(target_bs));
bdrv_graph_rdunlock_main_loop();
@@ -3054,7 +3046,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
blockdev_mirror_common(arg->job_id, bs, target_bs,
arg->replaces, arg->sync,
- backing_mode, zero_target, target_is_zero,
+ backing_mode, target_is_zero,
arg->has_speed, arg->speed,
arg->has_granularity, arg->granularity,
arg->has_buf_size, arg->buf_size,
@@ -3091,7 +3083,6 @@ void qmp_blockdev_mirror(const char *job_id,
BlockDriverState *target_bs;
AioContext *aio_context;
BlockMirrorBackingMode backing_mode = MIRROR_LEAVE_BACKING_CHAIN;
- bool zero_target;
int ret;
bs = qmp_get_root_bs(device, errp);
@@ -3104,8 +3095,6 @@ void qmp_blockdev_mirror(const char *job_id,
return;
}
- zero_target = (sync == MIRROR_SYNC_MODE_FULL);
-
aio_context = bdrv_get_aio_context(bs);
ret = bdrv_try_change_aio_context(target_bs, aio_context, NULL, errp);
@@ -3115,7 +3104,7 @@ void qmp_blockdev_mirror(const char *job_id,
blockdev_mirror_common(job_id, bs, target_bs,
replaces, sync, backing_mode,
- zero_target, has_target_is_zero && target_is_zero,
+ has_target_is_zero && target_is_zero,
has_speed, speed,
has_granularity, granularity,
has_buf_size, buf_size,
diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h
index 8cf0003ce7..d21bd7fd2f 100644
--- a/include/block/block_int-global-state.h
+++ b/include/block/block_int-global-state.h
@@ -139,7 +139,6 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
* @buf_size: The amount of data that can be in flight at one time.
* @mode: Whether to collapse all images in the chain to the target.
* @backing_mode: How to establish the target's backing chain after completion.
- * @zero_target: Whether the target should be explicitly zero-initialized
* @target_is_zero: Whether the target already is zero-initialized.
* @on_source_error: The action to take upon error reading from the source.
* @on_target_error: The action to take upon error writing to the target.
@@ -160,7 +159,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
int creation_flags, int64_t speed,
uint32_t granularity, int64_t buf_size,
MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
- bool zero_target, bool target_is_zero,
+ bool target_is_zero,
BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
bool unmap, const char *filter_node_name,
diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c
index 033711d8d7..373b72fdd8 100644
--- a/tests/unit/test-block-iothread.c
+++ b/tests/unit/test-block-iothread.c
@@ -755,7 +755,7 @@ static void test_propagate_mirror(void)
/* Start a mirror job */
mirror_start("job0", src, target, NULL, JOB_DEFAULT, 0, 0, 0,
- MIRROR_SYNC_MODE_NONE, MIRROR_OPEN_BACKING_CHAIN, false, false,
+ MIRROR_SYNC_MODE_NONE, MIRROR_OPEN_BACKING_CHAIN, false,
BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
false, "filter_node", MIRROR_COPY_MODE_BACKGROUND,
&error_abort);
--
2.48.1

View File

@ -0,0 +1,92 @@
From 0102da22fe5aefde9d398d539fc290ab062346f1 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Fri, 9 May 2025 15:40:23 -0500
Subject: [PATCH 08/16] mirror: Minor refactoring
RH-Author: Eric Blake <eblake@redhat.com>
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
RH-Jira: RHEL-82906 RHEL-83015
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [6/14] 886fa2e3249f48f89d3e04ba619d370031851d89 (ebblake/centos-qemu-kvm)
Commit 5791ba52 (v9.2) pre-initialized ret in mirror_dirty_init to
silence a false positive compiler warning, even though in all code
paths where ret is used, it was guaranteed to be reassigned
beforehand. But since the function returns -errno, and -1 is not
always the right errno, it's better to initialize to -EIO.
An upcoming patch wants to track two bitmaps in
do_sync_target_write(); this will be easier if the current variables
related to the dirty bitmap are renamed.
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20250509204341.3553601-21-eblake@redhat.com>
(cherry picked from commit 870f8963cf1a84f8ec929b05a6d68906974a76c5)
Conflicts:
block/mirror.c - commit 5791ba52 not present
Jira: https://issues.redhat.com/browse/RHEL-82906
Jira: https://issues.redhat.com/browse/RHEL-83015
Signed-off-by: Eric Blake <eblake@redhat.com>
---
block/mirror.c | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/block/mirror.c b/block/mirror.c
index 61f0a717b7..22f8bd98c4 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -841,7 +841,7 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
int64_t offset;
BlockDriverState *bs;
BlockDriverState *target_bs = blk_bs(s->target);
- int ret;
+ int ret = -EIO;
int64_t count;
bdrv_graph_co_rdlock();
@@ -1341,7 +1341,7 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
{
int ret;
size_t qiov_offset = 0;
- int64_t bitmap_offset, bitmap_end;
+ int64_t dirty_bitmap_offset, dirty_bitmap_end;
if (!QEMU_IS_ALIGNED(offset, job->granularity) &&
bdrv_dirty_bitmap_get(job->dirty_bitmap, offset))
@@ -1388,11 +1388,11 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
* Tails are either clean or shrunk, so for bitmap resetting
* we safely align the range down.
*/
- bitmap_offset = QEMU_ALIGN_UP(offset, job->granularity);
- bitmap_end = QEMU_ALIGN_DOWN(offset + bytes, job->granularity);
- if (bitmap_offset < bitmap_end) {
- bdrv_reset_dirty_bitmap(job->dirty_bitmap, bitmap_offset,
- bitmap_end - bitmap_offset);
+ dirty_bitmap_offset = QEMU_ALIGN_UP(offset, job->granularity);
+ dirty_bitmap_end = QEMU_ALIGN_DOWN(offset + bytes, job->granularity);
+ if (dirty_bitmap_offset < dirty_bitmap_end) {
+ bdrv_reset_dirty_bitmap(job->dirty_bitmap, dirty_bitmap_offset,
+ dirty_bitmap_end - dirty_bitmap_offset);
}
job_progress_increase_remaining(&job->common.job, bytes);
@@ -1430,10 +1430,10 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
* at function start, and they must be still dirty, as we've locked
* the region for in-flight op.
*/
- bitmap_offset = QEMU_ALIGN_DOWN(offset, job->granularity);
- bitmap_end = QEMU_ALIGN_UP(offset + bytes, job->granularity);
- bdrv_set_dirty_bitmap(job->dirty_bitmap, bitmap_offset,
- bitmap_end - bitmap_offset);
+ dirty_bitmap_offset = QEMU_ALIGN_DOWN(offset, job->granularity);
+ dirty_bitmap_end = QEMU_ALIGN_UP(offset + bytes, job->granularity);
+ bdrv_set_dirty_bitmap(job->dirty_bitmap, dirty_bitmap_offset,
+ dirty_bitmap_end - dirty_bitmap_offset);
qatomic_set(&job->actively_synced, false);
action = mirror_error_action(job, false, -ret);
--
2.48.1

View File

@ -0,0 +1,139 @@
From 482db3e637a16d5877e523e87c53ddb2579b4b66 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Fri, 9 May 2025 15:40:24 -0500
Subject: [PATCH 09/16] mirror: Pass full sync mode rather than bool to
internals
RH-Author: Eric Blake <eblake@redhat.com>
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
RH-Jira: RHEL-82906 RHEL-83015
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [7/14] f45a83a14b0eea07517176d44ab0c49db8233ea0 (ebblake/centos-qemu-kvm)
Out of the five possible values for MirrorSyncMode, INCREMENTAL and
BITMAP are already rejected up front in mirror_start, leaving NONE,
TOP, and FULL as the remaining values that the code was collapsing
into a single bool is_none_mode. Furthermore, mirror_dirty_init() is
only reachable for modes TOP and FULL, as further guided by
s->zero_target. However, upcoming patches want to further optimize
the pre-zeroing pass of a sync=full mirror in mirror_dirty_init(),
while avoiding that pass on a sync=top action. Instead of throwing
away context by collapsing these two values into
s->is_none_mode=false, it is better to pass s->sync_mode throughout
the entire operation. For active commit, the desired semantics match
sync mode TOP.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-ID: <20250509204341.3553601-22-eblake@redhat.com>
Reviewed-by: Sunny Zhu <sunnyzhyy@qq.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 9474d97bd7421b4fe7c806ab0949697514d11e88)
Jira: https://issues.redhat.com/browse/RHEL-82906
Jira: https://issues.redhat.com/browse/RHEL-83015
Signed-off-by: Eric Blake <eblake@redhat.com>
---
block/mirror.c | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/block/mirror.c b/block/mirror.c
index 22f8bd98c4..c8bbaa0b35 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -51,7 +51,7 @@ typedef struct MirrorBlockJob {
BlockDriverState *to_replace;
/* Used to block operations on the drive-mirror-replace target */
Error *replace_blocker;
- bool is_none_mode;
+ MirrorSyncMode sync_mode;
BlockMirrorBackingMode backing_mode;
/* Whether the target image requires explicit zero-initialization */
bool zero_target;
@@ -723,9 +723,10 @@ static int mirror_exit_common(Job *job)
&error_abort);
if (!abort && s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
- BlockDriverState *backing = s->is_none_mode ? src : s->base;
+ BlockDriverState *backing;
BlockDriverState *unfiltered_target = bdrv_skip_filters(target_bs);
+ backing = s->sync_mode == MIRROR_SYNC_MODE_NONE ? src : s->base;
if (bdrv_cow_bs(unfiltered_target) != backing) {
bdrv_set_backing_hd(unfiltered_target, backing, &local_err);
if (local_err) {
@@ -1020,7 +1021,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
mirror_free_init(s);
s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
- if (!s->is_none_mode) {
+ if (s->sync_mode != MIRROR_SYNC_MODE_NONE) {
ret = mirror_dirty_init(s);
if (ret < 0 || job_is_cancelled(&s->common.job)) {
goto immediate_exit;
@@ -1711,6 +1712,7 @@ static BlockJob *mirror_start_job(
int creation_flags, BlockDriverState *target,
const char *replaces, int64_t speed,
uint32_t granularity, int64_t buf_size,
+ MirrorSyncMode sync_mode,
BlockMirrorBackingMode backing_mode,
bool zero_target,
BlockdevOnError on_source_error,
@@ -1719,7 +1721,7 @@ static BlockJob *mirror_start_job(
BlockCompletionFunc *cb,
void *opaque,
const BlockJobDriver *driver,
- bool is_none_mode, BlockDriverState *base,
+ BlockDriverState *base,
bool auto_complete, const char *filter_node_name,
bool is_mirror, MirrorCopyMode copy_mode,
bool base_ro,
@@ -1878,7 +1880,7 @@ static BlockJob *mirror_start_job(
s->replaces = g_strdup(replaces);
s->on_source_error = on_source_error;
s->on_target_error = on_target_error;
- s->is_none_mode = is_none_mode;
+ s->sync_mode = sync_mode;
s->backing_mode = backing_mode;
s->zero_target = zero_target;
qatomic_set(&s->copy_mode, copy_mode);
@@ -2015,7 +2017,6 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
bool unmap, const char *filter_node_name,
MirrorCopyMode copy_mode, Error **errp)
{
- bool is_none_mode;
BlockDriverState *base;
GLOBAL_STATE_CODE();
@@ -2028,14 +2029,13 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
}
bdrv_graph_rdlock_main_loop();
- is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
base = mode == MIRROR_SYNC_MODE_TOP ? bdrv_backing_chain_next(bs) : NULL;
bdrv_graph_rdunlock_main_loop();
mirror_start_job(job_id, bs, creation_flags, target, replaces,
- speed, granularity, buf_size, backing_mode, zero_target,
- on_source_error, on_target_error, unmap, NULL, NULL,
- &mirror_job_driver, is_none_mode, base, false,
+ speed, granularity, buf_size, mode, backing_mode,
+ zero_target, on_source_error, on_target_error, unmap,
+ NULL, NULL, &mirror_job_driver, base, false,
filter_node_name, true, copy_mode, false, errp);
}
@@ -2061,9 +2061,9 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
job = mirror_start_job(
job_id, bs, creation_flags, base, NULL, speed, 0, 0,
- MIRROR_LEAVE_BACKING_CHAIN, false,
+ MIRROR_SYNC_MODE_TOP, MIRROR_LEAVE_BACKING_CHAIN, false,
on_error, on_error, true, cb, opaque,
- &commit_active_job_driver, false, base, auto_complete,
+ &commit_active_job_driver, base, auto_complete,
filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND,
base_read_only, errp);
if (!job) {
--
2.48.1

View File

@ -0,0 +1,58 @@
From be6ce2c91fe949d1c264de974ab4f6c4efc6976e Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Tue, 13 May 2025 17:00:45 -0500
Subject: [PATCH 16/16] mirror: Reduce I/O when destination is
detect-zeroes:unmap
RH-Author: Eric Blake <eblake@redhat.com>
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
RH-Jira: RHEL-82906 RHEL-83015
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [14/14] 66f3de2ba9f977c9bc1c54f67d76b366df132e62 (ebblake/centos-qemu-kvm)
If we are going to punch holes in the mirror destination even for the
portions where the source image is unallocated, it is nicer to treat
the entire image as dirty and punch as we go, rather than pre-zeroing
the entire image just to re-do I/O to the allocated portions of the
image.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-ID: <20250513220142.535200-2-eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 9abfc81246c9cc1845080eec5920779961187c07)
Jira: https://issues.redhat.com/browse/RHEL-82906
Jira: https://issues.redhat.com/browse/RHEL-83015
Signed-off-by: Eric Blake <eblake@redhat.com>
---
block/mirror.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/block/mirror.c b/block/mirror.c
index 7f3b5477ce..87c19ddf0d 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -920,11 +920,16 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
* zeroing happened externally (ret > 0) or if we have a fast
* way to pre-zero the image (the dirty bitmap will be
* populated later by the non-zero portions, the same as for
- * TOP mode). If pre-zeroing is not fast, then our only
- * recourse is to mark the entire image dirty. The act of
- * pre-zeroing will populate the zero bitmap.
+ * TOP mode). If pre-zeroing is not fast, or we need to visit
+ * the entire image in order to punch holes even in the
+ * non-allocated regions of the source, then just mark the
+ * entire image dirty and leave the zero bitmap clear at this
+ * point in time. Otherwise, it can be faster to pre-zero the
+ * image now, even if we re-write the allocated portions of
+ * the disk later, and the pre-zero pass will populate the
+ * zero bitmap.
*/
- if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
+ if (!bdrv_can_write_zeroes_with_unmap(target_bs) || punch_holes) {
bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, s->bdev_length);
return 0;
}
--
2.48.1

View File

@ -0,0 +1,180 @@
From 423ce7727eecae647330287e1264ac0d938fa7f9 Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Fri, 9 May 2025 15:40:27 -0500
Subject: [PATCH 12/16] mirror: Skip pre-zeroing destination if it is already
zero
RH-Author: Eric Blake <eblake@redhat.com>
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
RH-Jira: RHEL-82906 RHEL-83015
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [10/14] e754ae559123099f4aed322f6a4287cf3323f54d (ebblake/centos-qemu-kvm)
When doing a sync=full mirroring, we can skip pre-zeroing the
destination if it already reads as zeroes and we are not also trying
to punch holes due to detect-zeroes. With this patch, there are fewer
scenarios that have to pass in an explicit target-is-zero, while still
resulting in a sparse destination remaining sparse.
A later patch will then further improve things to skip writing to the
destination for parts of the image where the source is zero; but even
with just this patch, it is possible to see a difference for any
source that does not report itself as fully allocated, coupled with a
destination BDS that can quickly report that it already reads as zero.
(For a source that reports as fully allocated, such as a file, the
rest of mirror_dirty_init() still sets the entire dirty bitmap to
true, so even though we avoided the pre-zeroing, we are not yet
avoiding all redundant I/O).
Iotest 194 detects the difference made by this patch: for a file
source (where block status reports the entire image as allocated, and
therefore we end up writing zeroes everywhere in the destination
anyways), the job length remains the same. But for a qcow2 source and
a destination that reads as all zeroes, the dirty bitmap changes to
just tracking the allocated portions of the source, which results in
faster completion and smaller job statistics. For the test to pass
with both ./check -file and -qcow2, a new python filter is needed to
mask out the now-varying job amounts (this matches the shell filters
_filter_block_job_{offset,len} in common.filter). A later test will
also be added which further validates expected sparseness, so it does
not matter that 194 is no longer explicitly looking at how many bytes
were copied.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-ID: <20250509204341.3553601-25-eblake@redhat.com>
Reviewed-by: Sunny Zhu <sunnyzhyy@qq.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 181a63667adf16c35b57e446def3e41c70f1fea6)
Jira: https://issues.redhat.com/browse/RHEL-82906
Jira: https://issues.redhat.com/browse/RHEL-83015
Signed-off-by: Eric Blake <eblake@redhat.com>
---
block/mirror.c | 24 ++++++++++++++++--------
tests/qemu-iotests/194 | 6 ++++--
tests/qemu-iotests/194.out | 4 ++--
tests/qemu-iotests/iotests.py | 12 +++++++++++-
4 files changed, 33 insertions(+), 13 deletions(-)
diff --git a/block/mirror.c b/block/mirror.c
index b35d12adaa..29cac1777c 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -848,23 +848,31 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
target_bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP &&
bdrv_can_write_zeroes_with_unmap(target_bs);
+ /* Determine if the image is already zero, regardless of sync mode. */
bdrv_graph_co_rdlock();
bs = s->mirror_top_bs->backing->bs;
+ if (s->target_is_zero) {
+ ret = 1;
+ } else {
+ ret = bdrv_co_is_all_zeroes(target_bs);
+ }
bdrv_graph_co_rdunlock();
- if (s->sync_mode == MIRROR_SYNC_MODE_TOP) {
+ /* Determine if a pre-zeroing pass is necessary. */
+ if (ret < 0) {
+ return ret;
+ } else if (s->sync_mode == MIRROR_SYNC_MODE_TOP) {
/* In TOP mode, there is no benefit to a pre-zeroing pass. */
- } else if (!s->target_is_zero || punch_holes) {
+ } else if (ret == 0 || punch_holes) {
/*
* Here, we are in FULL mode; our goal is to avoid writing
* zeroes if the destination already reads as zero, except
* when we are trying to punch holes. This is possible if
- * zeroing happened externally (s->target_is_zero) or if we
- * have a fast way to pre-zero the image (the dirty bitmap
- * will be populated later by the non-zero portions, the same
- * as for TOP mode). If pre-zeroing is not fast, or we need
- * to punch holes, then our only recourse is to write the
- * entire image.
+ * zeroing happened externally (ret > 0) or if we have a fast
+ * way to pre-zero the image (the dirty bitmap will be
+ * populated later by the non-zero portions, the same as for
+ * TOP mode). If pre-zeroing is not fast, or we need to punch
+ * holes, then our only recourse is to write the entire image.
*/
if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, s->bdev_length);
diff --git a/tests/qemu-iotests/194 b/tests/qemu-iotests/194
index d0b9c084f5..e114c0b269 100755
--- a/tests/qemu-iotests/194
+++ b/tests/qemu-iotests/194
@@ -62,7 +62,8 @@ with iotests.FilePath('source.img') as source_img_path, \
iotests.log('Waiting for `drive-mirror` to complete...')
iotests.log(source_vm.event_wait('BLOCK_JOB_READY'),
- filters=[iotests.filter_qmp_event])
+ filters=[iotests.filter_qmp_event,
+ iotests.filter_block_job])
iotests.log('Starting migration...')
capabilities = [{'capability': 'events', 'state': True},
@@ -88,7 +89,8 @@ with iotests.FilePath('source.img') as source_img_path, \
while True:
event2 = source_vm.event_wait('BLOCK_JOB_COMPLETED')
- iotests.log(event2, filters=[iotests.filter_qmp_event])
+ iotests.log(event2, filters=[iotests.filter_qmp_event,
+ iotests.filter_block_job])
if event2['event'] == 'BLOCK_JOB_COMPLETED':
iotests.log('Stopping the NBD server on destination...')
iotests.log(dest_vm.qmp('nbd-server-stop'))
diff --git a/tests/qemu-iotests/194.out b/tests/qemu-iotests/194.out
index 376ed1d2e6..84e0fc34be 100644
--- a/tests/qemu-iotests/194.out
+++ b/tests/qemu-iotests/194.out
@@ -7,7 +7,7 @@ Launching NBD server on destination...
Starting `drive-mirror` on source...
{"return": {}}
Waiting for `drive-mirror` to complete...
-{"data": {"device": "mirror-job0", "len": 1073741824, "offset": 1073741824, "speed": 0, "type": "mirror"}, "event": "BLOCK_JOB_READY", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
+{"data": {"device": "mirror-job0", "len": "LEN", "offset": "OFFSET", "speed": 0, "type": "mirror"}, "event": "BLOCK_JOB_READY", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
Starting migration...
{"return": {}}
{"execute": "migrate-start-postcopy", "arguments": {}}
@@ -17,7 +17,7 @@ Starting migration...
{"data": {"status": "completed"}, "event": "MIGRATION", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
Gracefully ending the `drive-mirror` job on source...
{"return": {}}
-{"data": {"device": "mirror-job0", "len": 1073741824, "offset": 1073741824, "speed": 0, "type": "mirror"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
+{"data": {"device": "mirror-job0", "len": "LEN", "offset": "OFFSET", "speed": 0, "type": "mirror"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
Stopping the NBD server on destination...
{"return": {}}
Wait for migration completion on target...
diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index c8cb028c2d..978bef1499 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -601,13 +601,23 @@ def filter_chown(msg):
return chown_re.sub("chown UID:GID", msg)
def filter_qmp_event(event):
- '''Filter a QMP event dict'''
+ '''Filter the timestamp of a QMP event dict'''
event = dict(event)
if 'timestamp' in event:
event['timestamp']['seconds'] = 'SECS'
event['timestamp']['microseconds'] = 'USECS'
return event
+def filter_block_job(event):
+ '''Filter the offset and length of a QMP block job event dict'''
+ event = dict(event)
+ if 'data' in event:
+ if 'offset' in event['data']:
+ event['data']['offset'] = 'OFFSET'
+ if 'len' in event['data']:
+ event['data']['len'] = 'LEN'
+ return event
+
def filter_qmp(qmsg, filter_fn):
'''Given a string filter, filter a QMP object's values.
filter_fn takes a (key, value) pair.'''
--
2.48.1

View File

@ -0,0 +1,355 @@
From 8a2e660ff3ec7f7506fbd4197d4dc8f53db7859a Mon Sep 17 00:00:00 2001
From: Eric Blake <eblake@redhat.com>
Date: Fri, 9 May 2025 15:40:28 -0500
Subject: [PATCH 13/16] mirror: Skip writing zeroes when target is already zero
RH-Author: Eric Blake <eblake@redhat.com>
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
RH-Jira: RHEL-82906 RHEL-83015
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [11/14] f6bb5e0cecee07af0389aa18c3bddb47d6c5cf54 (ebblake/centos-qemu-kvm)
When mirroring, the goal is to ensure that the destination reads the
same as the source; this goal is met whether the destination is sparse
or fully-allocated (except when explicitly punching holes, then merely
reading zero is not enough to know if it is sparse, so we still want
to punch the hole). Avoiding a redundant write to zero (whether in
the background because the zero cluster was marked in the dirty
bitmap, or in the foreground because the guest is writing zeroes) when
the destination already reads as zero makes mirroring faster, and
avoids allocating the destination merely because the source reports as
allocated.
The effect is especially pronounced when the source is a raw file.
That's because when the source is a qcow2 file, the dirty bitmap only
visits the portions of the source that are allocated, which tend to be
non-zero. But when the source is a raw file,
bdrv_co_is_allocated_above() reports the entire file as allocated so
mirror_dirty_init sets the entire dirty bitmap, and it is only later
during mirror_iteration that we change to consulting the more precise
bdrv_co_block_status_above() to learn where the source reads as zero.
Remember that since a mirror operation can write a cluster more than
once (every time the guest changes the source, the destination is also
changed to keep up), and the guest can change whether a given cluster
reads as zero, is discarded, or has non-zero data over the course of
the mirror operation, we can't take the shortcut of relying on
s->target_is_zero (which is static for the life of the job) in
mirror_co_zero() to see if the destination is already zero, because
that information may be stale. Any solution we use must be dynamic in
the face of the guest writing or discarding a cluster while the mirror
has been ongoing.
We could just teach mirror_co_zero() to do a block_status() probe of
the destination, and skip the zeroes if the destination already reads
as zero, but we know from past experience that extra block_status()
calls are not always cheap (tmpfs, anyone?), especially when they are
random access rather than linear. Use of block_status() of the source
by the background task in a linear fashion is not our bottleneck (it's
a background task, after all); but since mirroring can be done while
the source is actively being changed, we don't want a slow
block_status() of the destination to occur on the hot path of the
guest trying to do random-access writes to the source.
So this patch takes a slightly different approach: any time we have to
track dirty clusters, we can also track which clusters are known to
read as zero. For sync=TOP or when we are punching holes from
"detect-zeroes":"unmap", the zero bitmap starts out empty, but
prevents a second write zero to a cluster that was already zero by an
earlier pass; for sync=FULL when we are not punching holes, the zero
bitmap starts out full if the destination reads as zero during
initialization. Either way, I/O to the destination can now avoid
redundant write zero to a cluster that already reads as zero, all
without having to do a block_status() per write on the destination.
With this patch, if I create a raw sparse destination file, connect it
with QMP 'blockdev-add' while leaving it at the default "discard":
"ignore", then run QMP 'blockdev-mirror' with "sync": "full", the
destination remains sparse rather than fully allocated. Meanwhile, a
destination image that is already fully allocated remains so unless it
was opened with "detect-zeroes": "unmap". And any time writing zeroes
is skipped, the job counters are not incremented.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-ID: <20250509204341.3553601-26-eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 7e277545b90874171128804e256a538fb0e8dd7e)
Jira: https://issues.redhat.com/browse/RHEL-82906
Jira: https://issues.redhat.com/browse/RHEL-83015
Signed-off-by: Eric Blake <eblake@redhat.com>
---
block/mirror.c | 107 ++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 93 insertions(+), 14 deletions(-)
diff --git a/block/mirror.c b/block/mirror.c
index 29cac1777c..7f3b5477ce 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -73,6 +73,7 @@ typedef struct MirrorBlockJob {
size_t buf_size;
int64_t bdev_length;
unsigned long *cow_bitmap;
+ unsigned long *zero_bitmap;
BdrvDirtyBitmap *dirty_bitmap;
BdrvDirtyBitmapIter *dbi;
uint8_t *buf;
@@ -108,9 +109,12 @@ struct MirrorOp {
int64_t offset;
uint64_t bytes;
- /* The pointee is set by mirror_co_read(), mirror_co_zero(), and
- * mirror_co_discard() before yielding for the first time */
+ /*
+ * These pointers are set by mirror_co_read(), mirror_co_zero(), and
+ * mirror_co_discard() before yielding for the first time
+ */
int64_t *bytes_handled;
+ bool *io_skipped;
bool is_pseudo_op;
bool is_active_write;
@@ -408,15 +412,34 @@ static void coroutine_fn mirror_co_read(void *opaque)
static void coroutine_fn mirror_co_zero(void *opaque)
{
MirrorOp *op = opaque;
- int ret;
+ bool write_needed = true;
+ int ret = 0;
op->s->in_flight++;
op->s->bytes_in_flight += op->bytes;
*op->bytes_handled = op->bytes;
op->is_in_flight = true;
- ret = blk_co_pwrite_zeroes(op->s->target, op->offset, op->bytes,
- op->s->unmap ? BDRV_REQ_MAY_UNMAP : 0);
+ if (op->s->zero_bitmap) {
+ unsigned long end = DIV_ROUND_UP(op->offset + op->bytes,
+ op->s->granularity);
+ assert(QEMU_IS_ALIGNED(op->offset, op->s->granularity));
+ assert(QEMU_IS_ALIGNED(op->bytes, op->s->granularity) ||
+ op->offset + op->bytes == op->s->bdev_length);
+ if (find_next_zero_bit(op->s->zero_bitmap, end,
+ op->offset / op->s->granularity) == end) {
+ write_needed = false;
+ *op->io_skipped = true;
+ }
+ }
+ if (write_needed) {
+ ret = blk_co_pwrite_zeroes(op->s->target, op->offset, op->bytes,
+ op->s->unmap ? BDRV_REQ_MAY_UNMAP : 0);
+ }
+ if (ret >= 0 && op->s->zero_bitmap) {
+ bitmap_set(op->s->zero_bitmap, op->offset / op->s->granularity,
+ DIV_ROUND_UP(op->bytes, op->s->granularity));
+ }
mirror_write_complete(op, ret);
}
@@ -435,29 +458,43 @@ static void coroutine_fn mirror_co_discard(void *opaque)
}
static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset,
- unsigned bytes, MirrorMethod mirror_method)
+ unsigned bytes, MirrorMethod mirror_method,
+ bool *io_skipped)
{
MirrorOp *op;
Coroutine *co;
int64_t bytes_handled = -1;
+ assert(QEMU_IS_ALIGNED(offset, s->granularity));
+ assert(QEMU_IS_ALIGNED(bytes, s->granularity) ||
+ offset + bytes == s->bdev_length);
op = g_new(MirrorOp, 1);
*op = (MirrorOp){
.s = s,
.offset = offset,
.bytes = bytes,
.bytes_handled = &bytes_handled,
+ .io_skipped = io_skipped,
};
qemu_co_queue_init(&op->waiting_requests);
switch (mirror_method) {
case MIRROR_METHOD_COPY:
+ if (s->zero_bitmap) {
+ bitmap_clear(s->zero_bitmap, offset / s->granularity,
+ DIV_ROUND_UP(bytes, s->granularity));
+ }
co = qemu_coroutine_create(mirror_co_read, op);
break;
case MIRROR_METHOD_ZERO:
+ /* s->zero_bitmap handled in mirror_co_zero */
co = qemu_coroutine_create(mirror_co_zero, op);
break;
case MIRROR_METHOD_DISCARD:
+ if (s->zero_bitmap) {
+ bitmap_clear(s->zero_bitmap, offset / s->granularity,
+ DIV_ROUND_UP(bytes, s->granularity));
+ }
co = qemu_coroutine_create(mirror_co_discard, op);
break;
default:
@@ -568,6 +605,7 @@ static void coroutine_fn GRAPH_UNLOCKED mirror_iteration(MirrorBlockJob *s)
int ret;
int64_t io_bytes;
int64_t io_bytes_acct;
+ bool io_skipped = false;
MirrorMethod mirror_method = MIRROR_METHOD_COPY;
assert(!(offset % s->granularity));
@@ -611,8 +649,10 @@ static void coroutine_fn GRAPH_UNLOCKED mirror_iteration(MirrorBlockJob *s)
}
io_bytes = mirror_clip_bytes(s, offset, io_bytes);
- io_bytes = mirror_perform(s, offset, io_bytes, mirror_method);
- if (mirror_method != MIRROR_METHOD_COPY && write_zeroes_ok) {
+ io_bytes = mirror_perform(s, offset, io_bytes, mirror_method,
+ &io_skipped);
+ if (io_skipped ||
+ (mirror_method != MIRROR_METHOD_COPY && write_zeroes_ok)) {
io_bytes_acct = 0;
} else {
io_bytes_acct = io_bytes;
@@ -847,8 +887,10 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
bool punch_holes =
target_bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP &&
bdrv_can_write_zeroes_with_unmap(target_bs);
+ int64_t bitmap_length = DIV_ROUND_UP(s->bdev_length, s->granularity);
/* Determine if the image is already zero, regardless of sync mode. */
+ s->zero_bitmap = bitmap_new(bitmap_length);
bdrv_graph_co_rdlock();
bs = s->mirror_top_bs->backing->bs;
if (s->target_is_zero) {
@@ -862,7 +904,14 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
if (ret < 0) {
return ret;
} else if (s->sync_mode == MIRROR_SYNC_MODE_TOP) {
- /* In TOP mode, there is no benefit to a pre-zeroing pass. */
+ /*
+ * In TOP mode, there is no benefit to a pre-zeroing pass, but
+ * the zero bitmap can be set if the destination already reads
+ * as zero and we are not punching holes.
+ */
+ if (ret > 0 && !punch_holes) {
+ bitmap_set(s->zero_bitmap, 0, bitmap_length);
+ }
} else if (ret == 0 || punch_holes) {
/*
* Here, we are in FULL mode; our goal is to avoid writing
@@ -871,8 +920,9 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
* zeroing happened externally (ret > 0) or if we have a fast
* way to pre-zero the image (the dirty bitmap will be
* populated later by the non-zero portions, the same as for
- * TOP mode). If pre-zeroing is not fast, or we need to punch
- * holes, then our only recourse is to write the entire image.
+ * TOP mode). If pre-zeroing is not fast, then our only
+ * recourse is to mark the entire image dirty. The act of
+ * pre-zeroing will populate the zero bitmap.
*/
if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, s->bdev_length);
@@ -883,6 +933,7 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
for (offset = 0; offset < s->bdev_length; ) {
int bytes = MIN(s->bdev_length - offset,
QEMU_ALIGN_DOWN(INT_MAX, s->granularity));
+ bool ignored;
mirror_throttle(s);
@@ -898,12 +949,15 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
continue;
}
- mirror_perform(s, offset, bytes, MIRROR_METHOD_ZERO);
+ mirror_perform(s, offset, bytes, MIRROR_METHOD_ZERO, &ignored);
offset += bytes;
}
mirror_wait_for_all_io(s);
s->initial_zeroing_ongoing = false;
+ } else {
+ /* In FULL mode, and image already reads as zero. */
+ bitmap_set(s->zero_bitmap, 0, bitmap_length);
}
/* First part, loop on the sectors and initialize the dirty bitmap. */
@@ -1188,6 +1242,7 @@ immediate_exit:
assert(s->in_flight == 0);
qemu_vfree(s->buf);
g_free(s->cow_bitmap);
+ g_free(s->zero_bitmap);
g_free(s->in_flight_bitmap);
bdrv_dirty_iter_free(s->dbi);
@@ -1367,6 +1422,7 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
int ret;
size_t qiov_offset = 0;
int64_t dirty_bitmap_offset, dirty_bitmap_end;
+ int64_t zero_bitmap_offset, zero_bitmap_end;
if (!QEMU_IS_ALIGNED(offset, job->granularity) &&
bdrv_dirty_bitmap_get(job->dirty_bitmap, offset))
@@ -1410,8 +1466,9 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
}
/*
- * Tails are either clean or shrunk, so for bitmap resetting
- * we safely align the range down.
+ * Tails are either clean or shrunk, so for dirty bitmap resetting
+ * we safely align the range narrower. But for zero bitmap, round
+ * range wider for checking or clearing, and narrower for setting.
*/
dirty_bitmap_offset = QEMU_ALIGN_UP(offset, job->granularity);
dirty_bitmap_end = QEMU_ALIGN_DOWN(offset + bytes, job->granularity);
@@ -1419,22 +1476,44 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
bdrv_reset_dirty_bitmap(job->dirty_bitmap, dirty_bitmap_offset,
dirty_bitmap_end - dirty_bitmap_offset);
}
+ zero_bitmap_offset = offset / job->granularity;
+ zero_bitmap_end = DIV_ROUND_UP(offset + bytes, job->granularity);
job_progress_increase_remaining(&job->common.job, bytes);
job->active_write_bytes_in_flight += bytes;
switch (method) {
case MIRROR_METHOD_COPY:
+ if (job->zero_bitmap) {
+ bitmap_clear(job->zero_bitmap, zero_bitmap_offset,
+ zero_bitmap_end - zero_bitmap_offset);
+ }
ret = blk_co_pwritev_part(job->target, offset, bytes,
qiov, qiov_offset, flags);
break;
case MIRROR_METHOD_ZERO:
+ if (job->zero_bitmap) {
+ if (find_next_zero_bit(job->zero_bitmap, zero_bitmap_end,
+ zero_bitmap_offset) == zero_bitmap_end) {
+ ret = 0;
+ break;
+ }
+ }
assert(!qiov);
ret = blk_co_pwrite_zeroes(job->target, offset, bytes, flags);
+ if (job->zero_bitmap && ret >= 0) {
+ bitmap_set(job->zero_bitmap, dirty_bitmap_offset / job->granularity,
+ (dirty_bitmap_end - dirty_bitmap_offset) /
+ job->granularity);
+ }
break;
case MIRROR_METHOD_DISCARD:
+ if (job->zero_bitmap) {
+ bitmap_clear(job->zero_bitmap, zero_bitmap_offset,
+ zero_bitmap_end - zero_bitmap_offset);
+ }
assert(!qiov);
ret = blk_co_pdiscard(job->target, offset, bytes);
break;
--
2.48.1

View File

@ -0,0 +1,49 @@
From 4b9a1a9154467fd65ac2a0a26959d3342d8fcd49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:37:08 +0100
Subject: [PATCH 55/57] net/socket: skip automatic zero-init of large array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [29/30] 645ad4d138d1222ea9bd1b2ac3b84d9ff83e2fa2 (stefanha/centos-stream-qemu-kvm)
The 'net_socket_send' method has a 68k byte array used for copying
data between guest and host. Skip the automatic zero-init of this
array to eliminate the performance overhead in the I/O hot path.
The 'buf1' array will be fully initialized when reading data off
the network socket.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
Message-id: 20250610123709.835102-31-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 751b0e79f1e0e7f88fad2fe2f22595ad03d78859)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
net/socket.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/socket.c b/net/socket.c
index 8e3702e1f3..784dda686f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -157,7 +157,7 @@ static void net_socket_send(void *opaque)
NetSocketState *s = opaque;
int size;
int ret;
- uint8_t buf1[NET_BUFSIZE];
+ QEMU_UNINITIALIZED uint8_t buf1[NET_BUFSIZE];
const uint8_t *buf;
size = recv(s->fd, buf1, sizeof(buf1), 0);
--
2.39.3

View File

@ -0,0 +1,49 @@
From 94310a4168257297e52058d5d6aea4a2d06630c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Tue, 10 Jun 2025 13:37:09 +0100
Subject: [PATCH 56/57] net/stream: skip automatic zero-init of large array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
RH-Jira: RHEL-99888
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Commit: [30/30] 9dfec5c0e6358e3557bf58d66eee8e4ba6e93621 (stefanha/centos-stream-qemu-kvm)
The 'net_stream_send' method has a 68k byte array used for copying
data between guest and host. Skip the automatic zero-init of this
array to eliminate the performance overhead in the I/O hot path.
The 'buf1' array will be fully initialized when reading data off
the network socket.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
Message-id: 20250610123709.835102-32-berrange@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
(cherry picked from commit 837b87c4c5ba9ac7a255133c6642b8d578272a70)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
net/stream.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/stream.c b/net/stream.c
index 97e6ec6679..12384ffee5 100644
--- a/net/stream.c
+++ b/net/stream.c
@@ -148,7 +148,7 @@ static gboolean net_stream_send(QIOChannel *ioc,
NetStreamState *s = data;
int size;
int ret;
- char buf1[NET_BUFSIZE];
+ QEMU_UNINITIALIZED char buf1[NET_BUFSIZE];
const char *buf;
size = qio_channel_read(s->ioc, buf1, sizeof(buf1), NULL);
--
2.39.3

View File

@ -0,0 +1,133 @@
From b6de1e19ba778547e92997c6cad77d7cf755c78b Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Mon, 17 Feb 2025 10:25:50 +0100
Subject: [PATCH 1/3] net: vhost-user: add QAPI events to report connection
state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Laurent Vivier <lvivier@redhat.com>
RH-MergeRequest: 371: net: vhost-user: add QAPI events to report connection state
RH-Jira: RHEL-95120
RH-Acked-by: Eugenio Pérez <eperezma@redhat.com>
RH-Acked-by: Cindy Lu <lulu@redhat.com>
RH-Commit: [1/1] c8f65026e3548891fe713a1622438388e285dbf3 (lvivier/qemu-kvm-centos)
The netdev reports NETDEV_VHOST_USER_CONNECTED event when
the chardev is connected, and NETDEV_VHOST_USER_DISCONNECTED
when it is disconnected.
The NETDEV_VHOST_USER_CONNECTED event includes the chardev id.
This allows a system manager like libvirt to detect when the server
fails.
For instance with passt:
{ 'execute': 'qmp_capabilities' }
{ "return": { } }
[killing passt here]
{ "timestamp": { "seconds": 1739538634, "microseconds": 920450 },
"event": "NETDEV_VHOST_USER_DISCONNECTED",
"data": { "netdev-id": "netdev0" } }
[automatic reconnection with reconnect-ms]
{ "timestamp": { "seconds": 1739538638, "microseconds": 354181 },
"event": "NETDEV_VHOST_USER_CONNECTED",
"data": { "netdev-id": "netdev0", "chardev-id": "chr0" } }
Tested-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Message-Id: <20250217092550.1172055-1-lvivier@redhat.com>
Acked-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
(cherry picked from commit 02fd9f8aeeb184276b283ae2f404bc3acf1e7b7a)
---
net/vhost-user.c | 3 +++
qapi/net.json | 40 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 43 insertions(+)
diff --git a/net/vhost-user.c b/net/vhost-user.c
index 12555518e8..0b235e50c6 100644
--- a/net/vhost-user.c
+++ b/net/vhost-user.c
@@ -16,6 +16,7 @@
#include "chardev/char-fe.h"
#include "qapi/error.h"
#include "qapi/qapi-commands-net.h"
+#include "qapi/qapi-events-net.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qemu/option.h"
@@ -271,6 +272,7 @@ static void chr_closed_bh(void *opaque)
if (err) {
error_report_err(err);
}
+ qapi_event_send_netdev_vhost_user_disconnected(name);
}
static void net_vhost_user_event(void *opaque, QEMUChrEvent event)
@@ -300,6 +302,7 @@ static void net_vhost_user_event(void *opaque, QEMUChrEvent event)
net_vhost_user_watch, s);
qmp_set_link(name, true, &err);
s->started = true;
+ qapi_event_send_netdev_vhost_user_connected(name, chr->label);
break;
case CHR_EVENT_CLOSED:
/* a close event may happen during a read/write, but vhost
diff --git a/qapi/net.json b/qapi/net.json
index 87fc0d0b28..7bd1eaa1ba 100644
--- a/qapi/net.json
+++ b/qapi/net.json
@@ -1020,3 +1020,43 @@
##
{ 'event': 'NETDEV_STREAM_DISCONNECTED',
'data': { 'netdev-id': 'str' } }
+
+##
+# @NETDEV_VHOST_USER_CONNECTED:
+#
+# Emitted when the vhost-user chardev is connected
+#
+# @netdev-id: QEMU netdev id that is connected
+#
+# @chardev-id: The character device id used by the QEMU netdev
+#
+# Since: 10.0
+#
+# .. qmp-example::
+#
+# <- { "timestamp": {"seconds": 1739538638, "microseconds": 354181 },
+# "event": "NETDEV_VHOST_USER_CONNECTED",
+# "data": { "netdev-id": "netdev0", "chardev-id": "chr0" } }
+#
+##
+{ 'event': 'NETDEV_VHOST_USER_CONNECTED',
+ 'data': { 'netdev-id': 'str', 'chardev-id': 'str' } }
+
+##
+# @NETDEV_VHOST_USER_DISCONNECTED:
+#
+# Emitted when the vhost-user chardev is disconnected
+#
+# @netdev-id: QEMU netdev id that is disconnected
+#
+# Since: 10.0
+#
+# .. qmp-example::
+#
+# <- { "timestamp": { "seconds": 1739538634, "microseconds": 920450 },
+# "event": "NETDEV_VHOST_USER_DISCONNECTED",
+# "data": { "netdev-id": "netdev0" } }
+#
+##
+{ 'event': 'NETDEV_VHOST_USER_DISCONNECTED',
+ 'data': { 'netdev-id': 'str' } }
--
2.48.1

View File

@ -0,0 +1,153 @@
From 978951b390bb7073293c792c4714516ad40cba73 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Tue, 25 Feb 2025 14:52:26 -0700
Subject: [PATCH 3/7] pci: Use PCI PM capability initializer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 348: PCI: Implement basic PCI PM capability backing
RH-Jira: RHEL-7301
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Alex Williamson <None>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [3/6] fd862caa094490a9b8a04b00ad39ba58e0b46a7a (eauger1/centos-qemu-kvm)
Switch callers directly initializing the PCI PM capability with
pci_add_capability() to use pci_pm_init().
Cc: Dmitry Fleytman <dmitry.fleytman@gmail.com>
Cc: Akihiko Odaki <akihiko.odaki@daynix.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Stefan Weil <sw@weilnetz.de>
Cc: Sriram Yagnaraman <sriram.yagnaraman@ericsson.com>
Cc: Keith Busch <kbusch@kernel.org>
Cc: Klaus Jensen <its@irrelevant.dk>
Cc: Jesper Devantier <foss@defmacro.it>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Cc: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-3-alex.williamson@redhat.com
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 0681ec253141d838210b3c5e6bc0d2d71f2e111e)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/net/e1000e.c | 3 +--
hw/net/eepro100.c | 4 +---
hw/net/igb.c | 3 +--
hw/nvme/ctrl.c | 3 +--
hw/pci-bridge/pcie_pci_bridge.c | 2 +-
hw/vfio/pci.c | 7 ++++++-
hw/virtio/virtio-pci.c | 3 +--
7 files changed, 12 insertions(+), 13 deletions(-)
diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
index 843892ce09..9eb93d049d 100644
--- a/hw/net/e1000e.c
+++ b/hw/net/e1000e.c
@@ -372,8 +372,7 @@ static int
e1000e_add_pm_capability(PCIDevice *pdev, uint8_t offset, uint16_t pmc)
{
Error *local_err = NULL;
- int ret = pci_add_capability(pdev, PCI_CAP_ID_PM, offset,
- PCI_PM_SIZEOF, &local_err);
+ int ret = pci_pm_init(pdev, offset, &local_err);
if (local_err) {
error_report_err(local_err);
diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c
index d9a70c4544..668a410055 100644
--- a/hw/net/eepro100.c
+++ b/hw/net/eepro100.c
@@ -549,9 +549,7 @@ static void e100_pci_reset(EEPRO100State *s, Error **errp)
if (info->power_management) {
/* Power Management Capabilities */
int cfg_offset = 0xdc;
- int r = pci_add_capability(&s->dev, PCI_CAP_ID_PM,
- cfg_offset, PCI_PM_SIZEOF,
- errp);
+ int r = pci_pm_init(&s->dev, cfg_offset, errp);
if (r < 0) {
return;
}
diff --git a/hw/net/igb.c b/hw/net/igb.c
index b92bba402e..a3c22e2391 100644
--- a/hw/net/igb.c
+++ b/hw/net/igb.c
@@ -356,8 +356,7 @@ static int
igb_add_pm_capability(PCIDevice *pdev, uint8_t offset, uint16_t pmc)
{
Error *local_err = NULL;
- int ret = pci_add_capability(pdev, PCI_CAP_ID_PM, offset,
- PCI_PM_SIZEOF, &local_err);
+ int ret = pci_pm_init(pdev, offset, &local_err);
if (local_err) {
error_report_err(local_err);
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 9f277b81d8..d451ee0d00 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -8293,8 +8293,7 @@ static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
Error *err = NULL;
int ret;
- ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset,
- PCI_PM_SIZEOF, &err);
+ ret = pci_pm_init(pci_dev, offset, &err);
if (err) {
error_report_err(err);
return ret;
diff --git a/hw/pci-bridge/pcie_pci_bridge.c b/hw/pci-bridge/pcie_pci_bridge.c
index 7646ac2397..2f098e3a13 100644
--- a/hw/pci-bridge/pcie_pci_bridge.c
+++ b/hw/pci-bridge/pcie_pci_bridge.c
@@ -52,7 +52,7 @@ static void pcie_pci_bridge_realize(PCIDevice *d, Error **errp)
goto cap_error;
}
- pos = pci_add_capability(d, PCI_CAP_ID_PM, 0, PCI_PM_SIZEOF, errp);
+ pos = pci_pm_init(d, 0, errp);
if (pos < 0) {
goto pm_error;
}
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 82a47edc89..e18b57d864 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2220,7 +2220,12 @@ static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
case PCI_CAP_ID_PM:
vfio_check_pm_reset(vdev, pos);
vdev->pm_cap = pos;
- ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0;
+ ret = pci_pm_init(pdev, pos, errp) >= 0;
+ /*
+ * PCI-core config space emulation needs write access to the power
+ * state enabled for tracking BAR mapping relative to PM state.
+ */
+ pci_set_word(pdev->wmask + pos + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK);
break;
case PCI_CAP_ID_AF:
vfio_check_af_flr(vdev, pos);
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 524b63e5c7..4b2aeaad8d 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -2195,8 +2195,7 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
pos = pcie_endpoint_cap_init(pci_dev, 0);
assert(pos > 0);
- pos = pci_add_capability(pci_dev, PCI_CAP_ID_PM, 0,
- PCI_PM_SIZEOF, errp);
+ pos = pci_pm_init(pci_dev, 0, errp);
if (pos < 0) {
return;
}
--
2.48.1

View File

@ -0,0 +1,99 @@
From 274e81bcf091c981d1e27e49fbe98e63d5308472 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Tue, 25 Feb 2025 14:52:28 -0700
Subject: [PATCH 5/7] pcie, virtio: Remove redundant pm_cap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 348: PCI: Implement basic PCI PM capability backing
RH-Jira: RHEL-7301
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Alex Williamson <None>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [5/6] 81c6e3c9c52a0b3f0b9269b4ac7f56e8e4b5d68b (eauger1/centos-qemu-kvm)
The pm_cap on the PCIExpressDevice object can be distilled down
to the new instance on the PCIDevice object.
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-5-alex.williamson@redhat.com
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 8b8d08cf293b930d0f55b2d5385d8dd27e0c6b41)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/pci-bridge/pcie_pci_bridge.c | 1 -
hw/virtio/virtio-pci.c | 8 +++-----
include/hw/pci/pcie.h | 2 --
3 files changed, 3 insertions(+), 8 deletions(-)
diff --git a/hw/pci-bridge/pcie_pci_bridge.c b/hw/pci-bridge/pcie_pci_bridge.c
index 2f098e3a13..c0ba6d7928 100644
--- a/hw/pci-bridge/pcie_pci_bridge.c
+++ b/hw/pci-bridge/pcie_pci_bridge.c
@@ -56,7 +56,6 @@ static void pcie_pci_bridge_realize(PCIDevice *d, Error **errp)
if (pos < 0) {
goto pm_error;
}
- d->exp.pm_cap = pos;
pci_set_word(d->config + pos + PCI_PM_PMC, 0x3);
pcie_cap_arifwd_init(d);
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 4b2aeaad8d..a85787b837 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -2200,8 +2200,6 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
return;
}
- pci_dev->exp.pm_cap = pos;
-
/*
* Indicates that this function complies with revision 1.2 of the
* PCI Power Management Interface Specification.
@@ -2295,11 +2293,11 @@ static bool virtio_pci_no_soft_reset(PCIDevice *dev)
{
uint16_t pmcsr;
- if (!pci_is_express(dev) || !dev->exp.pm_cap) {
+ if (!pci_is_express(dev) || !(dev->cap_present & QEMU_PCI_CAP_PM)) {
return false;
}
- pmcsr = pci_get_word(dev->config + dev->exp.pm_cap + PCI_PM_CTRL);
+ pmcsr = pci_get_word(dev->config + dev->pm_cap + PCI_PM_CTRL);
/*
* When No_Soft_Reset bit is set and the device
@@ -2328,7 +2326,7 @@ static void virtio_pci_bus_reset_hold(Object *obj, ResetType type)
if (proxy->flags & VIRTIO_PCI_FLAG_INIT_PM) {
pci_word_test_and_clear_mask(
- dev->config + dev->exp.pm_cap + PCI_PM_CTRL,
+ dev->config + dev->pm_cap + PCI_PM_CTRL,
PCI_PM_CTRL_STATE_MASK);
}
}
diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
index 5eddb90976..8a30d07fd0 100644
--- a/include/hw/pci/pcie.h
+++ b/include/hw/pci/pcie.h
@@ -58,8 +58,6 @@ typedef enum {
struct PCIExpressDevice {
/* Offset of express capability in config space */
uint8_t exp_cap;
- /* Offset of Power Management capability in config space */
- uint8_t pm_cap;
/* SLOT */
bool hpev_notified; /* Logical AND of conditions for hot plug event.
--
2.48.1

View File

@ -0,0 +1,139 @@
From 22f26a93ab94bf87c0724891a5886797a38c23b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
Date: Mon, 2 Dec 2024 12:19:27 +0000
Subject: [PATCH 6/9] qga: implement a 'guest-get-load' command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Konstantin Kostiuk <None>
RH-MergeRequest: 343: RHEL-69622: qga: implement a 'guest-get-load' command
RH-Jira: RHEL-69622
RH-Acked-by: Daniel P. Berrangé <berrange@redhat.com>
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
RH-Commit: [1/1] 9284c70737ad9f700d37f8c3833f855f2354acb7 (kkostiuk/redhat-centos-stream-src-qemu-kvm)
Provide a way to report the process load average, via a new
'guest-get-load' command.
This is only implemented for POSIX platforms providing 'getloadavg'.
Example illustrated with qmp-shell:
(QEMU) guest-get-load
{
"return": {
"load15m": 1.546875,
"load1m": 1.669921875,
"load5m": 1.9306640625
}
}
Windows has no native equivalent API, but it would be possible to
simulate it as illustrated here (BSD-3-Clause):
https://github.com/giampaolo/psutil/pull/1485
This is left as an exercise for future contributors.
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Konstantin Kostiuk <kkostiuk@redhat.com>
Message-ID: <20241202121927.864335-1-berrange@redhat.com>
Signed-off-by: Konstantin Kostiuk <kkostiuk@redhat.com>
---
meson.build | 1 +
qga/commands-posix.c | 20 ++++++++++++++++++++
qga/qapi-schema.json | 37 +++++++++++++++++++++++++++++++++++++
3 files changed, 58 insertions(+)
diff --git a/meson.build b/meson.build
index b3529aa0e1..1dd97c6f49 100644
--- a/meson.build
+++ b/meson.build
@@ -2497,6 +2497,7 @@ config_host_data.set('CONFIG_SETNS', cc.has_function('setns') and cc.has_functio
config_host_data.set('CONFIG_SYNCFS', cc.has_function('syncfs'))
config_host_data.set('CONFIG_SYNC_FILE_RANGE', cc.has_function('sync_file_range'))
config_host_data.set('CONFIG_TIMERFD', cc.has_function('timerfd_create'))
+config_host_data.set('CONFIG_GETLOADAVG', cc.has_function('getloadavg'))
config_host_data.set('HAVE_COPY_FILE_RANGE', cc.has_function('copy_file_range'))
config_host_data.set('HAVE_GETIFADDRS', cc.has_function('getifaddrs'))
config_host_data.set('HAVE_GLIB_WITH_SLICE_ALLOCATOR', glib_has_gslice)
diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 49e40f9127..abfa53d6e9 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -1371,3 +1371,23 @@ char *qga_get_host_name(Error **errp)
return g_steal_pointer(&hostname);
}
+
+#ifdef CONFIG_GETLOADAVG
+GuestLoadAverage *qmp_guest_get_load(Error **errp)
+{
+ double loadavg[3];
+ GuestLoadAverage *ret = NULL;
+
+ if (getloadavg(loadavg, G_N_ELEMENTS(loadavg)) < 0) {
+ error_setg_errno(errp, errno,
+ "cannot query load average");
+ return NULL;
+ }
+
+ ret = g_new0(GuestLoadAverage, 1);
+ ret->load1m = loadavg[0];
+ ret->load5m = loadavg[1];
+ ret->load15m = loadavg[2];
+ return ret;
+}
+#endif
diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
index 495706cf73..739f008ff2 100644
--- a/qga/qapi-schema.json
+++ b/qga/qapi-schema.json
@@ -1852,6 +1852,43 @@
'if': 'CONFIG_LINUX'
}
+
+##
+# @GuestLoadAverage:
+#
+# Statistics about process load information
+#
+# @load1m: 1-minute load avage
+#
+# @load5m: 5-minute load avage
+#
+# @load15m: 15-minute load avage
+#
+# Since: 10.0
+##
+{ 'struct': 'GuestLoadAverage',
+ 'data': {
+ 'load1m': 'number',
+ 'load5m': 'number',
+ 'load15m': 'number'
+ },
+ 'if': 'CONFIG_GETLOADAVG'
+}
+
+##
+# @guest-get-load:
+#
+# Retrieve CPU process load information
+#
+# Returns: load information
+#
+# Since: 10.0
+##
+{ 'command': 'guest-get-load',
+ 'returns': 'GuestLoadAverage',
+ 'if': 'CONFIG_GETLOADAVG'
+}
+
##
# @GuestNetworkRoute:
#
--
2.48.1

View File

@ -0,0 +1,273 @@
From 181b9ca805f3ae09c24a925eea0460525f30c90e Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Mon, 11 Aug 2025 15:40:10 +0200
Subject: [PATCH] rbd: Fix .bdrv_get_specific_info implementation
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 400: rbd: Fix .bdrv_get_specific_info implementation
RH-Jira: RHEL-108726
RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [1/1] 5a488d6e2355adcec7fc4fd686c6be001808a146 (kmwolf/centos-qemu-kvm)
qemu_rbd_get_specific_info() has at least two problems:
The first is that it issues a blocking rbd_read() call in order to probe
the encryption format for the image while querying the node. This means
that if the connection to the server goes down, not only I/O is stuck
(which is unavoidable), but query-names-block-nodes will actually make
the whole QEMU instance unresponsive. .bdrv_get_specific_info
implementations shouldn't perform blocking operations, but only return
what is already known.
The second is that the information returned isn't even correct. If the
image is already opened with encryption enabled at the RBD level, we'll
probe for "double encryption", i.e. if the encrypted data contains
another encryption header. If it doesn't (which is the normal case), we
won't return the encryption format. If it does, we return misleading
information because it looks like we're talking about the outer level
(the encryption format of the image itself) while the information is
about an encryption header in the guest data.
Fix this by storing the encryption format in BDRVRBDState when the image
is opened (and we do blocking operations anyway) and returning only the
stored information in qemu_rbd_get_specific_info().
The information we'll store is either the actual encryption format that
we enabled on the RBD level, or if the image is unencrypted, the result
of the same probing as we previously did when querying the node. Probing
image formats based on content that can be modified by the guest has
long been known as problematic, but as long as we only output it to the
user instead of making decisions based on it, it should be okay. It is
undoubtedly useful in the context of 'qemu-img info' when you're trying
to figure out which encryption options you have to use to open the
image successfully.
Fixes: 42e4ac9ef5a6 ("block/rbd: Add support for rbd image encryption")
Buglink: https://issues.redhat.com/browse/RHEL-105440
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20250811134010.81787-1-kwolf@redhat.com>
Reviewed-by: Hanna Czenczek <hreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit 4af976ef398e4e823addc00bf1c58787ba4952fe)
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block/rbd.c | 104 ++++++++++++++++++++++++++++---------------
qapi/block-core.json | 9 +++-
2 files changed, 76 insertions(+), 37 deletions(-)
diff --git a/block/rbd.c b/block/rbd.c
index 627f8eb05a..d5546da71b 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -99,6 +99,14 @@ typedef struct BDRVRBDState {
char *namespace;
uint64_t image_size;
uint64_t object_size;
+
+ /*
+ * If @bs->encrypted is true, this is the encryption format actually loaded
+ * at the librbd level. If it is false, it is the result of probing.
+ * RBD_IMAGE_ENCRYPTION_FORMAT__MAX means that encryption is not enabled and
+ * probing didn't find any known encryption header either.
+ */
+ RbdImageEncryptionFormat encryption_format;
} BDRVRBDState;
typedef struct RBDTask {
@@ -471,10 +479,12 @@ static int qemu_rbd_encryption_format(rbd_image_t image,
return 0;
}
-static int qemu_rbd_encryption_load(rbd_image_t image,
+static int qemu_rbd_encryption_load(BlockDriverState *bs,
+ rbd_image_t image,
RbdEncryptionOptions *encrypt,
Error **errp)
{
+ BDRVRBDState *s = bs->opaque;
int r = 0;
g_autofree char *passphrase = NULL;
rbd_encryption_luks1_format_options_t luks_opts;
@@ -545,15 +555,19 @@ static int qemu_rbd_encryption_load(rbd_image_t image,
error_setg_errno(errp, -r, "encryption load fail");
return r;
}
+ bs->encrypted = true;
+ s->encryption_format = encrypt->format;
return 0;
}
#ifdef LIBRBD_SUPPORTS_ENCRYPTION_LOAD2
-static int qemu_rbd_encryption_load2(rbd_image_t image,
+static int qemu_rbd_encryption_load2(BlockDriverState *bs,
+ rbd_image_t image,
RbdEncryptionOptions *encrypt,
Error **errp)
{
+ BDRVRBDState *s = bs->opaque;
int r = 0;
int encrypt_count = 1;
int i;
@@ -639,6 +653,8 @@ static int qemu_rbd_encryption_load2(rbd_image_t image,
error_setg_errno(errp, -r, "layered encryption load fail");
goto exit;
}
+ bs->encrypted = true;
+ s->encryption_format = encrypt->format;
exit:
for (i = 0; i < encrypt_count; ++i) {
@@ -672,6 +688,45 @@ exit:
#endif
#endif
+/*
+ * For an image without encryption enabled on the rbd layer, probe the start of
+ * the image if it could be opened as an encrypted image so that we can display
+ * it when the user queries the node (most importantly in qemu-img).
+ *
+ * If the guest writes an encryption header to its disk after this probing, this
+ * won't be reflected when queried, but that's okay. There is no reason why the
+ * user should want to apply encryption at the rbd level while the image is
+ * still in use. This is just guest data.
+ */
+static void qemu_rbd_encryption_probe(BlockDriverState *bs)
+{
+ BDRVRBDState *s = bs->opaque;
+ char buf[RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN] = {0};
+ int r;
+
+ assert(s->encryption_format == RBD_IMAGE_ENCRYPTION_FORMAT__MAX);
+
+ r = rbd_read(s->image, 0,
+ RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN, buf);
+ if (r < RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) {
+ return;
+ }
+
+ if (memcmp(buf, rbd_luks_header_verification,
+ RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) {
+ s->encryption_format = RBD_IMAGE_ENCRYPTION_FORMAT_LUKS;
+ } else if (memcmp(buf, rbd_luks2_header_verification,
+ RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) {
+ s->encryption_format = RBD_IMAGE_ENCRYPTION_FORMAT_LUKS2;
+ } else if (memcmp(buf, rbd_layered_luks_header_verification,
+ RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) {
+ s->encryption_format = RBD_IMAGE_ENCRYPTION_FORMAT_LUKS;
+ } else if (memcmp(buf, rbd_layered_luks2_header_verification,
+ RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) {
+ s->encryption_format = RBD_IMAGE_ENCRYPTION_FORMAT_LUKS2;
+ }
+}
+
/* FIXME Deprecate and remove keypairs or make it available in QMP. */
static int qemu_rbd_do_create(BlockdevCreateOptions *options,
const char *keypairs, const char *password_secret,
@@ -1134,17 +1189,18 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
goto failed_open;
}
+ s->encryption_format = RBD_IMAGE_ENCRYPTION_FORMAT__MAX;
if (opts->encrypt) {
#ifdef LIBRBD_SUPPORTS_ENCRYPTION
if (opts->encrypt->parent) {
#ifdef LIBRBD_SUPPORTS_ENCRYPTION_LOAD2
- r = qemu_rbd_encryption_load2(s->image, opts->encrypt, errp);
+ r = qemu_rbd_encryption_load2(bs, s->image, opts->encrypt, errp);
#else
r = -ENOTSUP;
error_setg(errp, "RBD library does not support layered encryption");
#endif
} else {
- r = qemu_rbd_encryption_load(s->image, opts->encrypt, errp);
+ r = qemu_rbd_encryption_load(bs, s->image, opts->encrypt, errp);
}
if (r < 0) {
goto failed_post_open;
@@ -1154,6 +1210,8 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
error_setg(errp, "RBD library does not support image encryption");
goto failed_post_open;
#endif
+ } else {
+ qemu_rbd_encryption_probe(bs);
}
r = rbd_stat(s->image, &info, sizeof(info));
@@ -1413,17 +1471,6 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
{
BDRVRBDState *s = bs->opaque;
ImageInfoSpecific *spec_info;
- char buf[RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN] = {0};
- int r;
-
- if (s->image_size >= RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) {
- r = rbd_read(s->image, 0,
- RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN, buf);
- if (r < 0) {
- error_setg_errno(errp, -r, "cannot read image start for probe");
- return NULL;
- }
- }
spec_info = g_new(ImageInfoSpecific, 1);
*spec_info = (ImageInfoSpecific){
@@ -1431,28 +1478,13 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
.u.rbd.data = g_new0(ImageInfoSpecificRbd, 1),
};
- if (memcmp(buf, rbd_luks_header_verification,
- RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) {
- spec_info->u.rbd.data->encryption_format =
- RBD_IMAGE_ENCRYPTION_FORMAT_LUKS;
- spec_info->u.rbd.data->has_encryption_format = true;
- } else if (memcmp(buf, rbd_luks2_header_verification,
- RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) {
- spec_info->u.rbd.data->encryption_format =
- RBD_IMAGE_ENCRYPTION_FORMAT_LUKS2;
- spec_info->u.rbd.data->has_encryption_format = true;
- } else if (memcmp(buf, rbd_layered_luks_header_verification,
- RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) {
- spec_info->u.rbd.data->encryption_format =
- RBD_IMAGE_ENCRYPTION_FORMAT_LUKS;
- spec_info->u.rbd.data->has_encryption_format = true;
- } else if (memcmp(buf, rbd_layered_luks2_header_verification,
- RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) {
- spec_info->u.rbd.data->encryption_format =
- RBD_IMAGE_ENCRYPTION_FORMAT_LUKS2;
- spec_info->u.rbd.data->has_encryption_format = true;
+ if (s->encryption_format == RBD_IMAGE_ENCRYPTION_FORMAT__MAX) {
+ assert(!bs->encrypted);
} else {
- spec_info->u.rbd.data->has_encryption_format = false;
+ ImageInfoSpecificRbd *rbd_info = spec_info->u.rbd.data;
+
+ rbd_info->has_encryption_format = true;
+ rbd_info->encryption_format = s->encryption_format;
}
return spec_info;
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 3969c60b93..15b91e2d4a 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -158,7 +158,14 @@
##
# @ImageInfoSpecificRbd:
#
-# @encryption-format: Image encryption format
+# @encryption-format: Image encryption format. If encryption is enabled for the
+# image (see encrypted in BlockNodeInfo), this is the actual format in which the
+# image is accessed. If encryption is not enabled, this is the result of
+# probing when the image was opened, to give a suggestion which encryption
+# format could be enabled. Note that probing results can be changed by the
+# guest by writing a (possibly partial) encryption format header to the
+# image, so don't treat this information as trusted if the guest is not
+# trusted.
#
# Since: 6.1
##
--
2.50.1

View File

@ -0,0 +1,36 @@
From 7300a435547b7e999227648fd1451db00e9c4867 Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Mon, 24 Mar 2025 18:09:26 +0100
Subject: [PATCH 26/26] redhat: Enable virtio-mem on s390x
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [26/26] 076b44c8f0262e903c5e17eda676614aec6f5c98 (thuth/qemu-kvm-cs)
JIRA: https://issues.redhat.com/browse/RHEL-72977
Enable virtio-mem on s390x now, too.
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
configs/devices/s390x-softmmu/s390x-rh-devices.mak | 1 +
1 file changed, 1 insertion(+)
diff --git a/configs/devices/s390x-softmmu/s390x-rh-devices.mak b/configs/devices/s390x-softmmu/s390x-rh-devices.mak
index 24cf6dbd03..834281d872 100644
--- a/configs/devices/s390x-softmmu/s390x-rh-devices.mak
+++ b/configs/devices/s390x-softmmu/s390x-rh-devices.mak
@@ -12,6 +12,7 @@ CONFIG_VFIO_CCW=y
CONFIG_VFIO_PCI=y
CONFIG_VHOST_USER=y
CONFIG_VIRTIO_CCW=y
+CONFIG_VIRTIO_MEM=y
CONFIG_WDT_DIAG288=y
CONFIG_VHOST_VSOCK=y
CONFIG_VHOST_USER_VSOCK=y
--
2.48.1

View File

@ -0,0 +1,94 @@
From 2de79d978c2cd29ad686dd91e74a86dbf2121f1f Mon Sep 17 00:00:00 2001
From: Juraj Marcin <jmarcin@redhat.com>
Date: Wed, 4 Sep 2024 12:37:13 +0200
Subject: [PATCH 06/26] reset: Add RESET_TYPE_WAKEUP
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [6/26] 6169fe25bfa5715340c180ee8711d0ad61832106 (thuth/qemu-kvm-cs)
Some devices need to distinguish cold start reset from waking up from a
suspended state. This patch adds new value to the enum, and updates the
i386 wakeup method to use this new reset type.
Message-ID: <20240904103722.946194-3-jmarcin@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Juraj Marcin <jmarcin@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
(cherry picked from commit 759cbb4ee971da13ddfa8ad73befc2351d542044)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
docs/devel/reset.rst | 12 +++++++++++-
hw/i386/pc.c | 2 +-
include/hw/resettable.h | 2 ++
3 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/docs/devel/reset.rst b/docs/devel/reset.rst
index d2799eba7a..44bd51b42e 100644
--- a/docs/devel/reset.rst
+++ b/docs/devel/reset.rst
@@ -44,6 +44,17 @@ The Resettable interface handles reset types with an enum ``ResetType``:
value on each cold reset, such as RNG seed information, and which they
must not reinitialize on a snapshot-load reset.
+``RESET_TYPE_WAKEUP``
+ If the machine supports waking up from a suspended state and needs to reset
+ its devices during wake-up (from the ``MachineClass::wakeup()`` method), this
+ reset type should be used for such a request. Devices can utilize this reset
+ type to differentiate the reset requested during machine wake-up from other
+ reset requests. For example, RAM content must not be lost during wake-up, and
+ memory devices like virtio-mem that provide additional RAM must not reset
+ such state during wake-ups, but might do so during cold resets. However, this
+ reset type should not be used for wake-up detection, as not every machine
+ type issues a device reset request during wake-up.
+
``RESET_TYPE_S390_CPU_NORMAL``
This is only used for S390 CPU objects; it clears interrupts, stops
processing, and clears the TLB, but does not touch register contents.
@@ -53,7 +64,6 @@ The Resettable interface handles reset types with an enum ``ResetType``:
``RESET_TYPE_S390_CPU_NORMAL`` does and also clears the PSW, prefix,
FPC, timer and control registers. It does not touch gprs, fprs or acrs.
-
Devices which implement reset methods must treat any unknown ``ResetType``
as equivalent to ``RESET_TYPE_COLD``; this will reduce the amount of
existing code we need to change if we add more types in future.
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index fedcf2a65f..fa9f16cbaf 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1889,7 +1889,7 @@ static void pc_machine_reset(MachineState *machine, ResetType type)
static void pc_machine_wakeup(MachineState *machine)
{
cpu_synchronize_all_states();
- pc_machine_reset(machine, RESET_TYPE_COLD);
+ pc_machine_reset(machine, RESET_TYPE_WAKEUP);
cpu_synchronize_all_post_reset();
}
diff --git a/include/hw/resettable.h b/include/hw/resettable.h
index 83b561fc83..cf37cd5ead 100644
--- a/include/hw/resettable.h
+++ b/include/hw/resettable.h
@@ -29,6 +29,7 @@ typedef struct ResettableState ResettableState;
* Types of reset.
*
* + Cold: reset resulting from a power cycle of the object.
+ * + Wakeup: reset resulting from a wake-up from a suspended state.
*
* TODO: Support has to be added to handle more types. In particular,
* ResettableState structure needs to be expanded.
@@ -36,6 +37,7 @@ typedef struct ResettableState ResettableState;
typedef enum ResetType {
RESET_TYPE_COLD,
RESET_TYPE_SNAPSHOT_LOAD,
+ RESET_TYPE_WAKEUP,
RESET_TYPE_S390_CPU_INITIAL,
RESET_TYPE_S390_CPU_NORMAL,
} ResetType;
--
2.48.1

View File

@ -0,0 +1,360 @@
From 8d48193b5a661f31c1c1db068d241b31ae379339 Mon Sep 17 00:00:00 2001
From: Juraj Marcin <jmarcin@redhat.com>
Date: Wed, 4 Sep 2024 12:37:12 +0200
Subject: [PATCH 05/26] reset: Use ResetType for qemu_devices_reset() and
MachineClass::reset()
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [5/26] ea1324b27885d979bcc54cc355dbdf940686776c (thuth/qemu-kvm-cs)
Currently, both qemu_devices_reset() and MachineClass::reset() use
ShutdownCause for the reason of the reset. However, the Resettable
interface uses ResetState, so ShutdownCause needs to be translated to
ResetType somewhere. Translating it qemu_devices_reset() makes adding
new reset types harder, as they cannot always be matched to a single
ShutdownCause here, and devices may need to check the ResetType to
determine what to reset and if to reset at all.
This patch moves this translation up in the call stack to
qemu_system_reset() and updates all MachineClass children to use the
ResetType instead.
Message-ID: <20240904103722.946194-2-jmarcin@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Juraj Marcin <jmarcin@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
(cherry picked from commit 1b063fe2df002052cc2d10799764979b8c583480)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
hw/arm/aspeed.c | 4 ++--
hw/arm/mps2-tz.c | 4 ++--
hw/core/reset.c | 5 +----
hw/hppa/machine.c | 4 ++--
hw/i386/microvm.c | 4 ++--
hw/i386/pc.c | 6 +++---
hw/ppc/pegasos2.c | 4 ++--
hw/ppc/pnv.c | 4 ++--
hw/ppc/spapr.c | 6 +++---
hw/s390x/s390-virtio-ccw.c | 4 ++--
include/hw/boards.h | 3 ++-
include/sysemu/reset.h | 5 +++--
system/runstate.c | 13 +++++++++++--
13 files changed, 37 insertions(+), 29 deletions(-)
diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c
index fd5603f7aa..cbca7685da 100644
--- a/hw/arm/aspeed.c
+++ b/hw/arm/aspeed.c
@@ -1529,12 +1529,12 @@ static void aspeed_machine_bletchley_class_init(ObjectClass *oc, void *data)
aspeed_machine_class_init_cpus_defaults(mc);
}
-static void fby35_reset(MachineState *state, ShutdownCause reason)
+static void fby35_reset(MachineState *state, ResetType type)
{
AspeedMachineState *bmc = ASPEED_MACHINE(state);
AspeedGPIOState *gpio = &bmc->soc->gpio;
- qemu_devices_reset(reason);
+ qemu_devices_reset(type);
/* Board ID: 7 (Class-1, 4 slots) */
object_property_set_bool(OBJECT(gpio), "gpioV4", true, &error_fatal);
diff --git a/hw/arm/mps2-tz.c b/hw/arm/mps2-tz.c
index aec57c0d68..8edf57a66d 100644
--- a/hw/arm/mps2-tz.c
+++ b/hw/arm/mps2-tz.c
@@ -1254,7 +1254,7 @@ static void mps2_set_remap(Object *obj, const char *value, Error **errp)
}
}
-static void mps2_machine_reset(MachineState *machine, ShutdownCause reason)
+static void mps2_machine_reset(MachineState *machine, ResetType type)
{
MPS2TZMachineState *mms = MPS2TZ_MACHINE(machine);
@@ -1264,7 +1264,7 @@ static void mps2_machine_reset(MachineState *machine, ShutdownCause reason)
* reset see the correct mapping.
*/
remap_memory(mms, mms->remap);
- qemu_devices_reset(reason);
+ qemu_devices_reset(type);
}
static void mps2tz_class_init(ObjectClass *oc, void *data)
diff --git a/hw/core/reset.c b/hw/core/reset.c
index 58dfc8db3d..14a2639fbf 100644
--- a/hw/core/reset.c
+++ b/hw/core/reset.c
@@ -170,11 +170,8 @@ void qemu_unregister_resettable(Object *obj)
resettable_container_remove(get_root_reset_container(), obj);
}
-void qemu_devices_reset(ShutdownCause reason)
+void qemu_devices_reset(ResetType type)
{
- ResetType type = (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD) ?
- RESET_TYPE_SNAPSHOT_LOAD : RESET_TYPE_COLD;
-
/* Reset the simulation */
resettable_reset(OBJECT(get_root_reset_container()), type);
}
diff --git a/hw/hppa/machine.c b/hw/hppa/machine.c
index 5d0a8739de..8259fe2e38 100644
--- a/hw/hppa/machine.c
+++ b/hw/hppa/machine.c
@@ -642,12 +642,12 @@ static void machine_HP_C3700_init(MachineState *machine)
machine_HP_common_init_tail(machine, pci_bus, translate);
}
-static void hppa_machine_reset(MachineState *ms, ShutdownCause reason)
+static void hppa_machine_reset(MachineState *ms, ResetType type)
{
unsigned int smp_cpus = ms->smp.cpus;
int i;
- qemu_devices_reset(reason);
+ qemu_devices_reset(type);
/* Start all CPUs at the firmware entry point.
* Monarch CPU will initialize firmware, secondary CPUs
diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c
index 40edcee7af..8ae4dff7f2 100644
--- a/hw/i386/microvm.c
+++ b/hw/i386/microvm.c
@@ -462,7 +462,7 @@ static void microvm_machine_state_init(MachineState *machine)
microvm_devices_init(mms);
}
-static void microvm_machine_reset(MachineState *machine, ShutdownCause reason)
+static void microvm_machine_reset(MachineState *machine, ResetType type)
{
MicrovmMachineState *mms = MICROVM_MACHINE(machine);
CPUState *cs;
@@ -475,7 +475,7 @@ static void microvm_machine_reset(MachineState *machine, ShutdownCause reason)
mms->kernel_cmdline_fixed = true;
}
- qemu_devices_reset(reason);
+ qemu_devices_reset(type);
CPU_FOREACH(cs) {
cpu = X86_CPU(cs);
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index fa0e42d072..fedcf2a65f 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1869,12 +1869,12 @@ static void pc_machine_initfn(Object *obj)
qemu_add_machine_init_done_notifier(&pcms->machine_done);
}
-static void pc_machine_reset(MachineState *machine, ShutdownCause reason)
+static void pc_machine_reset(MachineState *machine, ResetType type)
{
CPUState *cs;
X86CPU *cpu;
- qemu_devices_reset(reason);
+ qemu_devices_reset(type);
/* Reset APIC after devices have been reset to cancel
* any changes that qemu_devices_reset() might have done.
@@ -1889,7 +1889,7 @@ static void pc_machine_reset(MachineState *machine, ShutdownCause reason)
static void pc_machine_wakeup(MachineState *machine)
{
cpu_synchronize_all_states();
- pc_machine_reset(machine, SHUTDOWN_CAUSE_NONE);
+ pc_machine_reset(machine, RESET_TYPE_COLD);
cpu_synchronize_all_post_reset();
}
diff --git a/hw/ppc/pegasos2.c b/hw/ppc/pegasos2.c
index 9b0a6b70ab..8ff4a00c34 100644
--- a/hw/ppc/pegasos2.c
+++ b/hw/ppc/pegasos2.c
@@ -291,14 +291,14 @@ static void pegasos2_superio_write(uint8_t addr, uint8_t val)
cpu_physical_memory_write(PCI1_IO_BASE + 0x3f1, &val, 1);
}
-static void pegasos2_machine_reset(MachineState *machine, ShutdownCause reason)
+static void pegasos2_machine_reset(MachineState *machine, ResetType type)
{
Pegasos2MachineState *pm = PEGASOS2_MACHINE(machine);
void *fdt;
uint64_t d[2];
int sz;
- qemu_devices_reset(reason);
+ qemu_devices_reset(type);
if (!pm->vof) {
return; /* Firmware should set up machine so nothing to do */
}
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 3526852685..988fd55d88 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -709,13 +709,13 @@ static void pnv_powerdown_notify(Notifier *n, void *opaque)
}
}
-static void pnv_reset(MachineState *machine, ShutdownCause reason)
+static void pnv_reset(MachineState *machine, ResetType type)
{
PnvMachineState *pnv = PNV_MACHINE(machine);
IPMIBmc *bmc;
void *fdt;
- qemu_devices_reset(reason);
+ qemu_devices_reset(type);
/*
* The machine should provide by default an internal BMC simulator.
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 29e66f1b3f..11c953669a 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1725,7 +1725,7 @@ void spapr_check_mmu_mode(bool guest_radix)
}
}
-static void spapr_machine_reset(MachineState *machine, ShutdownCause reason)
+static void spapr_machine_reset(MachineState *machine, ResetType type)
{
SpaprMachineState *spapr = SPAPR_MACHINE(machine);
PowerPCCPU *first_ppc_cpu;
@@ -1733,7 +1733,7 @@ static void spapr_machine_reset(MachineState *machine, ShutdownCause reason)
void *fdt;
int rc;
- if (reason != SHUTDOWN_CAUSE_SNAPSHOT_LOAD) {
+ if (type != RESET_TYPE_SNAPSHOT_LOAD) {
/*
* Record-replay snapshot load must not consume random, this was
* already replayed from initial machine reset.
@@ -1769,7 +1769,7 @@ static void spapr_machine_reset(MachineState *machine, ShutdownCause reason)
spapr_setup_hpt(spapr);
}
- qemu_devices_reset(reason);
+ qemu_devices_reset(type);
spapr_ovec_cleanup(spapr->ov5_cas);
spapr->ov5_cas = spapr_ovec_new();
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index ef2a9687c7..94cad1705b 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -434,7 +434,7 @@ static void s390_pv_prepare_reset(S390CcwMachineState *ms)
s390_pv_prep_reset();
}
-static void s390_machine_reset(MachineState *machine, ShutdownCause reason)
+static void s390_machine_reset(MachineState *machine, ResetType type)
{
S390CcwMachineState *ms = S390_CCW_MACHINE(machine);
enum s390_reset reset_type;
@@ -466,7 +466,7 @@ static void s390_machine_reset(MachineState *machine, ShutdownCause reason)
* Device reset includes CPU clear resets so this has to be
* done AFTER the unprotect call above.
*/
- qemu_devices_reset(reason);
+ qemu_devices_reset(type);
s390_crypto_reset();
/* configure and start the ipl CPU only */
diff --git a/include/hw/boards.h b/include/hw/boards.h
index ffefc0a625..fe011b1e86 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -10,6 +10,7 @@
#include "qemu/module.h"
#include "qom/object.h"
#include "hw/core/cpu.h"
+#include "hw/resettable.h"
#define TYPE_MACHINE_SUFFIX "-machine"
@@ -253,7 +254,7 @@ struct MachineClass {
const char *deprecation_reason;
void (*init)(MachineState *state);
- void (*reset)(MachineState *state, ShutdownCause reason);
+ void (*reset)(MachineState *state, ResetType type);
void (*wakeup)(MachineState *state);
int (*kvm_type)(MachineState *machine, const char *arg);
diff --git a/include/sysemu/reset.h b/include/sysemu/reset.h
index ae436044a9..0e297c0e02 100644
--- a/include/sysemu/reset.h
+++ b/include/sysemu/reset.h
@@ -27,6 +27,7 @@
#ifndef QEMU_SYSEMU_RESET_H
#define QEMU_SYSEMU_RESET_H
+#include "hw/resettable.h"
#include "qapi/qapi-events-run-state.h"
typedef void QEMUResetHandler(void *opaque);
@@ -110,7 +111,7 @@ void qemu_unregister_reset(QEMUResetHandler *func, void *opaque);
/**
* qemu_devices_reset: Perform a complete system reset
- * @reason: reason for the reset
+ * @reason: type of the reset
*
* This function performs the low-level work needed to do a complete reset
* of the system (calling all the callbacks registered with
@@ -121,6 +122,6 @@ void qemu_unregister_reset(QEMUResetHandler *func, void *opaque);
* If you want to trigger a system reset from, for instance, a device
* model, don't use this function. Use qemu_system_reset_request().
*/
-void qemu_devices_reset(ShutdownCause reason);
+void qemu_devices_reset(ResetType type);
#endif
diff --git a/system/runstate.c b/system/runstate.c
index a0e2a5fd22..c2c9afa905 100644
--- a/system/runstate.c
+++ b/system/runstate.c
@@ -32,6 +32,7 @@
#include "exec/cpu-common.h"
#include "gdbstub/syscalls.h"
#include "hw/boards.h"
+#include "hw/resettable.h"
#include "migration/misc.h"
#include "migration/postcopy-ram.h"
#include "monitor/monitor.h"
@@ -507,15 +508,23 @@ static int qemu_debug_requested(void)
void qemu_system_reset(ShutdownCause reason)
{
MachineClass *mc;
+ ResetType type;
mc = current_machine ? MACHINE_GET_CLASS(current_machine) : NULL;
cpu_synchronize_all_states();
+ switch (reason) {
+ case SHUTDOWN_CAUSE_SNAPSHOT_LOAD:
+ type = RESET_TYPE_SNAPSHOT_LOAD;
+ break;
+ default:
+ type = RESET_TYPE_COLD;
+ }
if (mc && mc->reset) {
- mc->reset(current_machine, reason);
+ mc->reset(current_machine, type);
} else {
- qemu_devices_reset(reason);
+ qemu_devices_reset(type);
}
switch (reason) {
case SHUTDOWN_CAUSE_NONE:
--
2.48.1

View File

@ -0,0 +1,60 @@
From 4f627e0ae8efb96380070b6a8d50e88c71f40477 Mon Sep 17 00:00:00 2001
From: Fabiano Rosas <farosas@suse.de>
Date: Fri, 9 May 2025 14:49:38 -0300
Subject: [PATCH 01/57] s390x: Fix leak in machine_set_loadparm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 387: s390x: Fix memory leaks related to loadparm [rhel-9]
RH-Jira: RHEL-98554
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
RH-Commit: [1/2] dadf5b9e187a644e0a8a8c565b1b913ef7f4dcc8 (thuth/qemu-kvm-cs)
ASAN spotted a leaking string in machine_set_loadparm():
Direct leak of 9 byte(s) in 1 object(s) allocated from:
#0 0x560ffb5bb379 in malloc ../projects/compiler-rt/lib/asan/asan_malloc_linux.cpp:69:3
#1 0x7f1aca926518 in g_malloc ../glib/gmem.c:106
#2 0x7f1aca94113e in g_strdup ../glib/gstrfuncs.c:364
#3 0x560ffc8afbf9 in qobject_input_type_str ../qapi/qobject-input-visitor.c:542:12
#4 0x560ffc8a80ff in visit_type_str ../qapi/qapi-visit-core.c:349:10
#5 0x560ffbe6053a in machine_set_loadparm ../hw/s390x/s390-virtio-ccw.c:802:10
#6 0x560ffc0c5e52 in object_property_set ../qom/object.c:1450:5
#7 0x560ffc0d4175 in object_property_set_qobject ../qom/qom-qobject.c:28:10
#8 0x560ffc0c6004 in object_property_set_str ../qom/object.c:1458:15
#9 0x560ffbe2ae60 in update_machine_ipl_properties ../hw/s390x/ipl.c:569:9
#10 0x560ffbe2aa65 in s390_ipl_update_diag308 ../hw/s390x/ipl.c:594:5
#11 0x560ffbdee132 in handle_diag_308 ../target/s390x/diag.c:147:9
#12 0x560ffbebb956 in helper_diag ../target/s390x/tcg/misc_helper.c:137:9
#13 0x7f1a3c51c730 (/memfd:tcg-jit (deleted)+0x39730)
Cc: qemu-stable@nongnu.org
Signed-off-by: Fabiano Rosas <farosas@suse.de>
Message-ID: <20250509174938.25935-1-farosas@suse.de>
Fixes: 1fd396e3228 ("s390x: Register TYPE_S390_CCW_MACHINE properties as class properties")
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Thomas Huth <thuth@redhat.com>
(cherry picked from commit bdf12f2a56bf3f13c52eb51f0a994bbfe40706b2)
---
hw/s390x/s390-virtio-ccw.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index 77a1bde71e..fc18ab575f 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -782,6 +782,7 @@ static void machine_set_loadparm(Object *obj, Visitor *v,
}
s390_ipl_fmt_loadparm(ms->loadparm, val, errp);
+ g_free(val);
}
static void ccw_machine_class_init(ObjectClass *oc, void *data)
--
2.39.3

View File

@ -0,0 +1,144 @@
From 1dd38383832fc27f2980f33bb5e10ec1af7e3fc3 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 19 Dec 2024 15:41:07 +0100
Subject: [PATCH 15/26] s390x: introduce s390_get_memory_limit()
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [15/26] 5ae6a624a6541283cb15e90ebeb8fef3940c823b (thuth/qemu-kvm-cs)
Let's add s390_get_memory_limit(), to query what has been successfully
set via s390_set_memory_limit(). Allow setting the limit only once.
We'll remember the limit in the machine state. Move
s390_set_memory_limit() to machine code, merging it into
set_memory_limit(), because this really is a machine property.
Message-ID: <20241219144115.2820241-7-david@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
(cherry picked from commit 27221b69a3ea49339a1f82b9622126f3928e0915)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
hw/s390x/s390-virtio-ccw.c | 17 ++++++++++++-----
include/hw/s390x/s390-virtio-ccw.h | 8 ++++++++
target/s390x/cpu-sysemu.c | 8 --------
target/s390x/cpu.h | 1 -
4 files changed, 20 insertions(+), 14 deletions(-)
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index 248ac28d20..f5f147eb92 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -45,6 +45,7 @@
#include "migration/blocker.h"
#include "qapi/visitor.h"
#include "hw/s390x/cpu-topology.h"
+#include "kvm/kvm_s390x.h"
#include CONFIG_DEVICES
static Error *pv_mig_blocker;
@@ -121,12 +122,16 @@ static void subsystem_reset(void)
}
}
-static void set_memory_limit(uint64_t new_limit)
+static void s390_set_memory_limit(S390CcwMachineState *s390ms,
+ uint64_t new_limit)
{
- uint64_t hw_limit;
- int ret;
+ uint64_t hw_limit = 0;
+ int ret = 0;
- ret = s390_set_memory_limit(new_limit, &hw_limit);
+ assert(!s390ms->memory_limit && new_limit);
+ if (kvm_enabled()) {
+ ret = kvm_s390_set_mem_limit(new_limit, &hw_limit);
+ }
if (ret == -E2BIG) {
error_report("host supports a maximum of %" PRIu64 " GB",
hw_limit / GiB);
@@ -135,10 +140,12 @@ static void set_memory_limit(uint64_t new_limit)
error_report("setting the guest size failed");
exit(EXIT_FAILURE);
}
+ s390ms->memory_limit = new_limit;
}
static void s390_memory_init(MachineState *machine)
{
+ S390CcwMachineState *s390ms = S390_CCW_MACHINE(machine);
MemoryRegion *sysmem = get_system_memory();
MemoryRegion *ram = machine->ram;
uint64_t ram_size = memory_region_size(ram);
@@ -154,7 +161,7 @@ static void s390_memory_init(MachineState *machine)
exit(EXIT_FAILURE);
}
- set_memory_limit(ram_size);
+ s390_set_memory_limit(s390ms, ram_size);
/* Map the initial memory. Must happen after setting the memory limit. */
memory_region_add_subregion(sysmem, 0, ram);
diff --git a/include/hw/s390x/s390-virtio-ccw.h b/include/hw/s390x/s390-virtio-ccw.h
index 996864a34e..de04336c5a 100644
--- a/include/hw/s390x/s390-virtio-ccw.h
+++ b/include/hw/s390x/s390-virtio-ccw.h
@@ -29,10 +29,18 @@ struct S390CcwMachineState {
bool dea_key_wrap;
bool pv;
uint8_t loadparm[8];
+ uint64_t memory_limit;
SCLPDevice *sclp;
};
+static inline uint64_t s390_get_memory_limit(S390CcwMachineState *s390ms)
+{
+ /* We expect to be called only after the limit was set. */
+ assert(s390ms->memory_limit);
+ return s390ms->memory_limit;
+}
+
#define S390_PTF_REASON_NONE (0x00 << 8)
#define S390_PTF_REASON_DONE (0x01 << 8)
#define S390_PTF_REASON_BUSY (0x02 << 8)
diff --git a/target/s390x/cpu-sysemu.c b/target/s390x/cpu-sysemu.c
index 1cd30c1d84..3118a25fee 100644
--- a/target/s390x/cpu-sysemu.c
+++ b/target/s390x/cpu-sysemu.c
@@ -255,14 +255,6 @@ unsigned int s390_cpu_set_state(uint8_t cpu_state, S390CPU *cpu)
return s390_count_running_cpus();
}
-int s390_set_memory_limit(uint64_t new_limit, uint64_t *hw_limit)
-{
- if (kvm_enabled()) {
- return kvm_s390_set_mem_limit(new_limit, hw_limit);
- }
- return 0;
-}
-
void s390_set_max_pagesize(uint64_t pagesize, Error **errp)
{
if (kvm_enabled()) {
diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
index 6a64472403..ecaf3191d2 100644
--- a/target/s390x/cpu.h
+++ b/target/s390x/cpu.h
@@ -881,7 +881,6 @@ static inline void s390_do_cpu_load_normal(CPUState *cs, run_on_cpu_data arg)
/* cpu.c */
void s390_crypto_reset(void);
-int s390_set_memory_limit(uint64_t new_limit, uint64_t *hw_limit);
void s390_set_max_pagesize(uint64_t pagesize, Error **errp);
void s390_cmma_reset(void);
void s390_enable_css_support(S390CPU *cpu);
--
2.48.1

View File

@ -0,0 +1,256 @@
From c60d0770ff3f9124e6e9d7beb03e1ef8067e8e26 Mon Sep 17 00:00:00 2001
From: Christoph Schlameuss <cschlame@redhat.com>
Date: Thu, 12 Jun 2025 13:25:32 +0200
Subject: [PATCH 01/16] s390x/pci: add support for guests that request direct
mapping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Christoph Schlameuss <None>
RH-MergeRequest: 376: Draft: KVM: Performance Enhanced Refresh PCI Translation
RH-Jira: RHEL-11430
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Commit: [1/2] 11d1dd9a5add55ae43d5d922588a33945ecbfe27 (cschlame/qemu-kvm)
JIRA: https://issues.redhat.com/browse/RHEL-11430
Conflicts: hw/s390x/s390-pci-bus.c old s390_pci_device_properties[] still has DEFINE_PROP_END_OF_LIST()
hw/s390x/s390-pci-inst.c hw_accel.h is still in sysemu
hw/s390x/s390-virtio-ccw.c changes from ccw_machine_9_2_class_options() moved to ccw_rhel_machine_9_6_0_class_options()
commit dfcee1ea4c52ac60e0a06221eafb7b6253eb10c3
Author: Matthew Rosato <mjrosato@linux.ibm.com>
Date: Wed Feb 26 16:00:12 2025 -0500
s390x/pci: add support for guests that request direct mapping
When receiving a guest mpcifc(4) or mpcifc(6) instruction without the T
bit set, treat this as a request to perform direct mapping instead of
address translation. In order to facilitate this, pin the entirety of
guest memory into the host iommu.
Pinning for the direct mapping case is handled via vfio and its memory
listener. Additionally, ram discard settings are inherited from vfio:
coordinated discards (e.g. virtio-mem) are allowed while uncoordinated
discards (e.g. virtio-balloon) are disabled.
Subsequent guest DMA operations are all expected to be of the format
guest_phys+sdma, allowing them to be used as lookup into the host
iommu table.
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Message-ID: <20250226210013.238349-2-mjrosato@linux.ibm.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Christoph Schlameuss <cschlame@redhat.com>
---
hw/s390x/s390-pci-bus.c | 39 +++++++++++++++++++++++++++++++--
hw/s390x/s390-pci-inst.c | 13 +++++++++--
hw/s390x/s390-pci-vfio.c | 23 +++++++++++++++----
hw/s390x/s390-virtio-ccw.c | 5 +++++
include/hw/s390x/s390-pci-bus.h | 3 +++
5 files changed, 75 insertions(+), 8 deletions(-)
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index 3e57d5faca..13bc02d837 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -18,6 +18,8 @@
#include "hw/s390x/s390-pci-inst.h"
#include "hw/s390x/s390-pci-kvm.h"
#include "hw/s390x/s390-pci-vfio.h"
+#include "hw/s390x/s390-virtio-ccw.h"
+#include "hw/boards.h"
#include "hw/pci/pci_bus.h"
#include "hw/qdev-properties.h"
#include "hw/pci/pci_bridge.h"
@@ -724,12 +726,42 @@ void s390_pci_iommu_enable(S390PCIIOMMU *iommu)
g_free(name);
}
+void s390_pci_iommu_direct_map_enable(S390PCIIOMMU *iommu)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ S390CcwMachineState *s390ms = S390_CCW_MACHINE(ms);
+
+ /*
+ * For direct-mapping we must map the entire guest address space. Rather
+ * than using an iommu, create a memory region alias that maps GPA X to
+ * IOVA X + SDMA. VFIO will handle pinning via its memory listener.
+ */
+ g_autofree char *name = g_strdup_printf("iommu-dm-s390-%04x",
+ iommu->pbdev->uid);
+
+ iommu->dm_mr = g_malloc0(sizeof(*iommu->dm_mr));
+ memory_region_init_alias(iommu->dm_mr, OBJECT(&iommu->mr), name,
+ get_system_memory(), 0,
+ s390_get_memory_limit(s390ms));
+ iommu->enabled = true;
+ memory_region_add_subregion(&iommu->mr, iommu->pbdev->zpci_fn.sdma,
+ iommu->dm_mr);
+}
+
void s390_pci_iommu_disable(S390PCIIOMMU *iommu)
{
iommu->enabled = false;
g_hash_table_remove_all(iommu->iotlb);
- memory_region_del_subregion(&iommu->mr, MEMORY_REGION(&iommu->iommu_mr));
- object_unparent(OBJECT(&iommu->iommu_mr));
+ if (iommu->dm_mr) {
+ memory_region_del_subregion(&iommu->mr, iommu->dm_mr);
+ object_unparent(OBJECT(iommu->dm_mr));
+ g_free(iommu->dm_mr);
+ iommu->dm_mr = NULL;
+ } else {
+ memory_region_del_subregion(&iommu->mr,
+ MEMORY_REGION(&iommu->iommu_mr));
+ object_unparent(OBJECT(&iommu->iommu_mr));
+ }
}
static void s390_pci_iommu_free(S390pciState *s, PCIBus *bus, int32_t devfn)
@@ -1130,6 +1162,7 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
/* Always intercept emulated devices */
pbdev->interp = false;
pbdev->forwarding_assist = false;
+ pbdev->rtr_avail = false;
}
if (s390_pci_msix_init(pbdev) && !pbdev->interp) {
@@ -1488,6 +1521,8 @@ static Property s390_pci_device_properties[] = {
DEFINE_PROP_BOOL("interpret", S390PCIBusDevice, interp, true),
DEFINE_PROP_BOOL("forwarding-assist", S390PCIBusDevice, forwarding_assist,
true),
+ DEFINE_PROP_BOOL("relaxed-translation", S390PCIBusDevice, rtr_avail,
+ true),
DEFINE_PROP_END_OF_LIST(),
};
diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c
index 30149546c0..803ebcd9b3 100644
--- a/hw/s390x/s390-pci-inst.c
+++ b/hw/s390x/s390-pci-inst.c
@@ -16,6 +16,7 @@
#include "exec/memory.h"
#include "qemu/error-report.h"
#include "sysemu/hw_accel.h"
+#include "hw/boards.h"
#include "hw/pci/pci_device.h"
#include "hw/s390x/s390-pci-inst.h"
#include "hw/s390x/s390-pci-bus.h"
@@ -1008,17 +1009,25 @@ static int reg_ioat(CPUS390XState *env, S390PCIBusDevice *pbdev, ZpciFib fib,
}
/* currently we only support designation type 1 with translation */
- if (!(dt == ZPCI_IOTA_RTTO && t)) {
+ if (t && dt != ZPCI_IOTA_RTTO) {
error_report("unsupported ioat dt %d t %d", dt, t);
s390_program_interrupt(env, PGM_OPERAND, ra);
return -EINVAL;
+ } else if (!t && !pbdev->rtr_avail) {
+ error_report("relaxed translation not allowed");
+ s390_program_interrupt(env, PGM_OPERAND, ra);
+ return -EINVAL;
}
iommu->pba = pba;
iommu->pal = pal;
iommu->g_iota = g_iota;
- s390_pci_iommu_enable(iommu);
+ if (t) {
+ s390_pci_iommu_enable(iommu);
+ } else {
+ s390_pci_iommu_direct_map_enable(iommu);
+ }
return 0;
}
diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
index 7dbbc76823..443e222912 100644
--- a/hw/s390x/s390-pci-vfio.c
+++ b/hw/s390x/s390-pci-vfio.c
@@ -131,13 +131,28 @@ static void s390_pci_read_base(S390PCIBusDevice *pbdev,
/* Store function type separately for type-specific behavior */
pbdev->pft = cap->pft;
+ /*
+ * If the device is a passthrough ISM device, disallow relaxed
+ * translation.
+ */
+ if (pbdev->pft == ZPCI_PFT_ISM) {
+ pbdev->rtr_avail = false;
+ }
+
/*
* If appropriate, reduce the size of the supported DMA aperture reported
- * to the guest based upon the vfio DMA limit.
+ * to the guest based upon the vfio DMA limit. This is applicable for
+ * devices that are guaranteed to not use relaxed translation. If the
+ * device is capable of relaxed translation then we must advertise the
+ * full aperture. In this case, if translation is used then we will
+ * rely on the vfio DMA limit counting and use RPCIT CC1 / status 16
+ * to request that the guest free DMA mappings as necessary.
*/
- vfio_size = pbdev->iommu->max_dma_limit << TARGET_PAGE_BITS;
- if (vfio_size > 0 && vfio_size < cap->end_dma - cap->start_dma + 1) {
- pbdev->zpci_fn.edma = cap->start_dma + vfio_size - 1;
+ if (!pbdev->rtr_avail) {
+ vfio_size = pbdev->iommu->max_dma_limit << TARGET_PAGE_BITS;
+ if (vfio_size > 0 && vfio_size < cap->end_dma - cap->start_dma + 1) {
+ pbdev->zpci_fn.edma = cap->start_dma + vfio_size - 1;
+ }
}
}
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index 312e8f18aa..77a1bde71e 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -1348,8 +1348,13 @@ static void ccw_rhel_machine_9_6_0_instance_options(MachineState *machine)
static void ccw_rhel_machine_9_6_0_class_options(MachineClass *mc)
{
+ static GlobalProperty compat[] = {
+ { TYPE_S390_PCI_DEVICE, "relaxed-translation", "off", },
+ };
+
/* NB: remember to move this line to the *latest* RHEL 9 machine */
compat_props_add(mc->compat_props, hw_compat_rhel_9, hw_compat_rhel_9_len);
+ compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
}
DEFINE_CCW_MACHINE_AS_LATEST(9, 6, 0);
diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h
index 2c43ea123f..04944d4fed 100644
--- a/include/hw/s390x/s390-pci-bus.h
+++ b/include/hw/s390x/s390-pci-bus.h
@@ -277,6 +277,7 @@ struct S390PCIIOMMU {
AddressSpace as;
MemoryRegion mr;
IOMMUMemoryRegion iommu_mr;
+ MemoryRegion *dm_mr;
bool enabled;
uint64_t g_iota;
uint64_t pba;
@@ -362,6 +363,7 @@ struct S390PCIBusDevice {
bool interp;
bool forwarding_assist;
bool aif;
+ bool rtr_avail;
QTAILQ_ENTRY(S390PCIBusDevice) link;
};
@@ -389,6 +391,7 @@ int pci_chsc_sei_nt2_have_event(void);
void s390_pci_sclp_configure(SCCB *sccb);
void s390_pci_sclp_deconfigure(SCCB *sccb);
void s390_pci_iommu_enable(S390PCIIOMMU *iommu);
+void s390_pci_iommu_direct_map_enable(S390PCIIOMMU *iommu);
void s390_pci_iommu_disable(S390PCIIOMMU *iommu);
void s390_pci_generate_error_event(uint16_t pec, uint32_t fh, uint32_t fid,
uint64_t faddr, uint32_t e);
--
2.48.1

View File

@ -0,0 +1,72 @@
From 13e8ddbd282da692c8199a6cb9ca847334089e29 Mon Sep 17 00:00:00 2001
From: Christoph Schlameuss <cschlame@redhat.com>
Date: Thu, 12 Jun 2025 11:48:41 +0200
Subject: [PATCH 02/16] s390x/pci: indicate QEMU supports relaxed translation
for passthrough
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Christoph Schlameuss <None>
RH-MergeRequest: 376: Draft: KVM: Performance Enhanced Refresh PCI Translation
RH-Jira: RHEL-11430
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Commit: [2/2] afd514268347d0b434a60d7c6c09d20b84e5d902 (cschlame/qemu-kvm)
JIRA: https://issues.redhat.com/browse/RHEL-11430
commit d9b5dfc7122559e5b5959ecf534788b90c3dd102
Author: Matthew Rosato <mjrosato@linux.ibm.com>
Date: Wed Feb 26 16:00:13 2025 -0500
s390x/pci: indicate QEMU supports relaxed translation for passthrough
Specifying this bit in the guest CLP response indicates that the guest
can optionally choose to skip translation and instead use
identity-mapped operations.
Tested-by: Niklas Schnelle <schnelle@linux.ibm.com>
Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
Message-ID: <20250226210013.238349-3-mjrosato@linux.ibm.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Christoph Schlameuss <cschlame@redhat.com>
---
hw/s390x/s390-pci-vfio.c | 5 ++++-
include/hw/s390x/s390-pci-clp.h | 1 +
2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
index 443e222912..6236ac7f1e 100644
--- a/hw/s390x/s390-pci-vfio.c
+++ b/hw/s390x/s390-pci-vfio.c
@@ -238,8 +238,11 @@ static void s390_pci_read_group(S390PCIBusDevice *pbdev,
pbdev->pci_group = s390_group_create(pbdev->zpci_fn.pfgid, start_gid);
resgrp = &pbdev->pci_group->zpci_group;
+ if (pbdev->rtr_avail) {
+ resgrp->fr |= CLP_RSP_QPCIG_MASK_RTR;
+ }
if (cap->flags & VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH) {
- resgrp->fr = 1;
+ resgrp->fr |= CLP_RSP_QPCIG_MASK_REFRESH;
}
resgrp->dasm = cap->dasm;
resgrp->msia = cap->msi_addr;
diff --git a/include/hw/s390x/s390-pci-clp.h b/include/hw/s390x/s390-pci-clp.h
index 03b7f9ba5f..6a635d693b 100644
--- a/include/hw/s390x/s390-pci-clp.h
+++ b/include/hw/s390x/s390-pci-clp.h
@@ -158,6 +158,7 @@ typedef struct ClpRspQueryPciGrp {
#define CLP_RSP_QPCIG_MASK_NOI 0xfff
uint16_t i;
uint8_t version;
+#define CLP_RSP_QPCIG_MASK_RTR 0x20
#define CLP_RSP_QPCIG_MASK_FRAME 0x2
#define CLP_RSP_QPCIG_MASK_REFRESH 0x1
uint8_t fr;
--
2.48.1

View File

@ -0,0 +1,46 @@
From 9d5420c4370b74d60f082f2aa1225b19150ee629 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 19 Dec 2024 15:41:12 +0100
Subject: [PATCH 20/26] s390x/pv: prepare for memory devices
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [20/26] cdbe71168b9afa9657b94f1e7500568314c707a8 (thuth/qemu-kvm-cs)
Let's avoid checking for the maxram_size, and instead rely on the memory
limit determined in s390_memory_init(), that might be larger than
maxram_size, for example due to alignment purposes.
This check now correctly mimics what the kernel will check in
kvm_s390_pv_set_aside(), whereby a VM <= 2 GiB VM would end up using
a segment type ASCE.
Message-ID: <20241219144115.2820241-12-david@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Nina Schoetterl-Glausch <nsg@linux.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
(cherry picked from commit a056332e732110c8ef0d40ffd49bd03afc2f04ca)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
target/s390x/kvm/pv.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/target/s390x/kvm/pv.c b/target/s390x/kvm/pv.c
index 424cce75ca..fa66607e7b 100644
--- a/target/s390x/kvm/pv.c
+++ b/target/s390x/kvm/pv.c
@@ -133,7 +133,7 @@ bool s390_pv_vm_try_disable_async(S390CcwMachineState *ms)
* If the feature is not present or if the VM is not larger than 2 GiB,
* KVM_PV_ASYNC_CLEANUP_PREPARE fill fail; no point in attempting it.
*/
- if ((MACHINE(ms)->ram_size <= 2 * GiB) ||
+ if (s390_get_memory_limit(ms) <= 2 * GiB ||
!kvm_check_extension(kvm_state, KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) {
return false;
}
--
2.48.1

View File

@ -0,0 +1,107 @@
From 5a311d410bca4a5530a51c0b789ce8525d2d0653 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 19 Dec 2024 15:41:13 +0100
Subject: [PATCH 21/26] s390x: remember the maximum page size
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [21/26] 3b97c555b153d42e4fcb27dbb65fbf3edac622a4 (thuth/qemu-kvm-cs)
Let's remember the value (successfully) set via s390_set_max_pagesize().
This will be helpful to reject hotplugged memory devices that would exceed
this initially set page size.
Handle it just like how we handle s390_get_memory_limit(), storing it in
the machine, and moving the handling to machine code.
Message-ID: <20241219144115.2820241-13-david@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
(cherry picked from commit df2ac211a62e6ced7f1495b634fa6f78962f2321)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
hw/s390x/s390-virtio-ccw.c | 12 +++++++++++-
include/hw/s390x/s390-virtio-ccw.h | 1 +
target/s390x/cpu-sysemu.c | 7 -------
target/s390x/cpu.h | 1 -
4 files changed, 12 insertions(+), 9 deletions(-)
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index 824c73536a..bd05a22b4e 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -143,6 +143,16 @@ static void s390_set_memory_limit(S390CcwMachineState *s390ms,
s390ms->memory_limit = new_limit;
}
+static void s390_set_max_pagesize(S390CcwMachineState *s390ms,
+ uint64_t pagesize)
+{
+ assert(!s390ms->max_pagesize && pagesize);
+ if (kvm_enabled()) {
+ kvm_s390_set_max_pagesize(pagesize, &error_fatal);
+ }
+ s390ms->max_pagesize = pagesize;
+}
+
static void s390_memory_init(MachineState *machine)
{
S390CcwMachineState *s390ms = S390_CCW_MACHINE(machine);
@@ -191,7 +201,7 @@ static void s390_memory_init(MachineState *machine)
* Configure the maximum page size. As no memory devices were created
* yet, this is the page size of initial memory only.
*/
- s390_set_max_pagesize(qemu_maxrampagesize(), &error_fatal);
+ s390_set_max_pagesize(s390ms, qemu_maxrampagesize());
/* Initialize storage key device */
s390_skeys_init();
/* Initialize storage attributes device */
diff --git a/include/hw/s390x/s390-virtio-ccw.h b/include/hw/s390x/s390-virtio-ccw.h
index de04336c5a..599740a998 100644
--- a/include/hw/s390x/s390-virtio-ccw.h
+++ b/include/hw/s390x/s390-virtio-ccw.h
@@ -30,6 +30,7 @@ struct S390CcwMachineState {
bool pv;
uint8_t loadparm[8];
uint64_t memory_limit;
+ uint64_t max_pagesize;
SCLPDevice *sclp;
};
diff --git a/target/s390x/cpu-sysemu.c b/target/s390x/cpu-sysemu.c
index 3118a25fee..706a5c53e2 100644
--- a/target/s390x/cpu-sysemu.c
+++ b/target/s390x/cpu-sysemu.c
@@ -255,13 +255,6 @@ unsigned int s390_cpu_set_state(uint8_t cpu_state, S390CPU *cpu)
return s390_count_running_cpus();
}
-void s390_set_max_pagesize(uint64_t pagesize, Error **errp)
-{
- if (kvm_enabled()) {
- kvm_s390_set_max_pagesize(pagesize, errp);
- }
-}
-
void s390_cmma_reset(void)
{
if (kvm_enabled()) {
diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
index ecaf3191d2..9770a62ac9 100644
--- a/target/s390x/cpu.h
+++ b/target/s390x/cpu.h
@@ -881,7 +881,6 @@ static inline void s390_do_cpu_load_normal(CPUState *cs, run_on_cpu_data arg)
/* cpu.c */
void s390_crypto_reset(void);
-void s390_set_max_pagesize(uint64_t pagesize, Error **errp);
void s390_cmma_reset(void);
void s390_enable_css_support(S390CPU *cpu);
void s390_do_cpu_set_diag318(CPUState *cs, run_on_cpu_data arg);
--
2.48.1

View File

@ -0,0 +1,113 @@
From 2fbdf7e3cf23daea470aaa4a29e16641feb76f3c Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 19 Dec 2024 15:41:05 +0100
Subject: [PATCH 13/26] s390x: rename s390-virtio-hcall* to s390-hypercall*
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [13/26] 3c1ef3cbb137517b306871f0a88a61a59740af5a (thuth/qemu-kvm-cs)
Let's make it clearer that we are talking about general
QEMU/KVM-specific hypercalls.
Message-ID: <20241219144115.2820241-5-david@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
(cherry picked from commit 85489fc3652d0c4433c940f1a80a952e8cb5d3cb)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
hw/s390x/meson.build | 2 +-
hw/s390x/{s390-virtio-hcall.c => s390-hypercall.c} | 2 +-
hw/s390x/{s390-virtio-hcall.h => s390-hypercall.h} | 6 +++---
target/s390x/kvm/kvm.c | 2 +-
target/s390x/tcg/misc_helper.c | 2 +-
5 files changed, 7 insertions(+), 7 deletions(-)
rename hw/s390x/{s390-virtio-hcall.c => s390-hypercall.c} (97%)
rename hw/s390x/{s390-virtio-hcall.h => s390-hypercall.h} (86%)
diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build
index d6c8c33915..e344a3bd8c 100644
--- a/hw/s390x/meson.build
+++ b/hw/s390x/meson.build
@@ -29,7 +29,7 @@ s390x_ss.add(when: 'CONFIG_TCG', if_true: files(
))
s390x_ss.add(when: 'CONFIG_S390_CCW_VIRTIO', if_true: files(
's390-virtio-ccw.c',
- 's390-virtio-hcall.c',
+ 's390-hypercall.c',
))
s390x_ss.add(when: 'CONFIG_TERMINAL3270', if_true: files('3270-ccw.c'))
s390x_ss.add(when: 'CONFIG_VFIO', if_true: files('s390-pci-vfio.c'))
diff --git a/hw/s390x/s390-virtio-hcall.c b/hw/s390x/s390-hypercall.c
similarity index 97%
rename from hw/s390x/s390-virtio-hcall.c
rename to hw/s390x/s390-hypercall.c
index 5fb78a719e..f816c2b1ef 100644
--- a/hw/s390x/s390-virtio-hcall.c
+++ b/hw/s390x/s390-hypercall.c
@@ -12,7 +12,7 @@
#include "qemu/osdep.h"
#include "cpu.h"
#include "hw/boards.h"
-#include "hw/s390x/s390-virtio-hcall.h"
+#include "hw/s390x/s390-hypercall.h"
#include "hw/s390x/ioinst.h"
#include "hw/s390x/css.h"
#include "virtio-ccw.h"
diff --git a/hw/s390x/s390-virtio-hcall.h b/hw/s390x/s390-hypercall.h
similarity index 86%
rename from hw/s390x/s390-virtio-hcall.h
rename to hw/s390x/s390-hypercall.h
index dca456b926..2fa81dbfdd 100644
--- a/hw/s390x/s390-virtio-hcall.h
+++ b/hw/s390x/s390-hypercall.h
@@ -9,8 +9,8 @@
* directory.
*/
-#ifndef HW_S390_VIRTIO_HCALL_H
-#define HW_S390_VIRTIO_HCALL_H
+#ifndef HW_S390_HYPERCALL_H
+#define HW_S390_HYPERCALL_H
#include "cpu.h"
@@ -21,4 +21,4 @@
void handle_diag_500(S390CPU *cpu, uintptr_t ra);
-#endif /* HW_S390_VIRTIO_HCALL_H */
+#endif /* HW_S390_HYPERCALL_H */
diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c
index 42d6a54126..afc8d570c9 100644
--- a/target/s390x/kvm/kvm.c
+++ b/target/s390x/kvm/kvm.c
@@ -49,7 +49,7 @@
#include "hw/s390x/ebcdic.h"
#include "exec/memattrs.h"
#include "hw/s390x/s390-virtio-ccw.h"
-#include "hw/s390x/s390-virtio-hcall.h"
+#include "hw/s390x/s390-hypercall.h"
#include "target/s390x/kvm/pv.h"
#include CONFIG_DEVICES
diff --git a/target/s390x/tcg/misc_helper.c b/target/s390x/tcg/misc_helper.c
index 2b4310003b..b726a95352 100644
--- a/target/s390x/tcg/misc_helper.c
+++ b/target/s390x/tcg/misc_helper.c
@@ -36,7 +36,7 @@
#include "sysemu/cpus.h"
#include "sysemu/sysemu.h"
#include "hw/s390x/ebcdic.h"
-#include "hw/s390x/s390-virtio-hcall.h"
+#include "hw/s390x/s390-hypercall.h"
#include "hw/s390x/sclp.h"
#include "hw/s390x/s390_flic.h"
#include "hw/s390x/ioinst.h"
--
2.48.1

View File

@ -0,0 +1,99 @@
From 86417a068f24964422d4fd5ea301d70a0f8142d2 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 19 Dec 2024 15:41:08 +0100
Subject: [PATCH 16/26] s390x/s390-hypercall: introduce DIAG500 STORAGE_LIMIT
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [16/26] c1c341227388735450ddbba0201e7523e0658c07 (thuth/qemu-kvm-cs)
A guest OS that supports memory hotplug / memory devices must during
boot be aware of the maximum possible physical memory address that it might
have to handle at a later stage during its runtime.
For example, the maximum possible memory address might be required to
prepare the kernel virtual address space accordingly (e.g., select page
table hierarchy depth).
On s390x there is currently no such mechanism that is compatible with
paravirtualized memory devices, because the whole SCLP interface was
designed around the idea of "storage increments" and "standby memory".
Paravirtualized memory devices we want to support, such as virtio-mem, have
no intersection with any of that, but could co-exist with them in the
future if ever needed.
In particular, a guest OS must never detect and use device memory
without the help of a proper device driver. Device memory must not be
exposed in any firmware-provided memory map (SCLP or diag260 on s390x).
For this reason, these memory devices will be places in memory *above*
the "maximum storage increment" exposed via SCLP.
Let's provide a new diag500 subcode to query the memory limit determined in
s390_memory_init().
Message-ID: <20241219144115.2820241-8-david@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
(cherry picked from commit f7c168657816486527727d860b73747d41f0c5f6)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
hw/s390x/s390-hypercall.c | 12 +++++++++++-
hw/s390x/s390-hypercall.h | 1 +
2 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/hw/s390x/s390-hypercall.c b/hw/s390x/s390-hypercall.c
index f816c2b1ef..ac1b08b2cd 100644
--- a/hw/s390x/s390-hypercall.c
+++ b/hw/s390x/s390-hypercall.c
@@ -11,7 +11,7 @@
#include "qemu/osdep.h"
#include "cpu.h"
-#include "hw/boards.h"
+#include "hw/s390x/s390-virtio-ccw.h"
#include "hw/s390x/s390-hypercall.h"
#include "hw/s390x/ioinst.h"
#include "hw/s390x/css.h"
@@ -57,6 +57,13 @@ static int handle_virtio_ccw_notify(uint64_t subch_id, uint64_t data)
return 0;
}
+static uint64_t handle_storage_limit(void)
+{
+ S390CcwMachineState *s390ms = S390_CCW_MACHINE(qdev_get_machine());
+
+ return s390_get_memory_limit(s390ms) - 1;
+}
+
void handle_diag_500(S390CPU *cpu, uintptr_t ra)
{
CPUS390XState *env = &cpu->env;
@@ -69,6 +76,9 @@ void handle_diag_500(S390CPU *cpu, uintptr_t ra)
case DIAG500_VIRTIO_CCW_NOTIFY:
env->regs[2] = handle_virtio_ccw_notify(env->regs[2], env->regs[3]);
break;
+ case DIAG500_STORAGE_LIMIT:
+ env->regs[2] = handle_storage_limit();
+ break;
default:
s390_program_interrupt(env, PGM_SPECIFICATION, ra);
}
diff --git a/hw/s390x/s390-hypercall.h b/hw/s390x/s390-hypercall.h
index 2fa81dbfdd..4f07209128 100644
--- a/hw/s390x/s390-hypercall.h
+++ b/hw/s390x/s390-hypercall.h
@@ -18,6 +18,7 @@
#define DIAG500_VIRTIO_RESET 1 /* legacy */
#define DIAG500_VIRTIO_SET_STATUS 2 /* legacy */
#define DIAG500_VIRTIO_CCW_NOTIFY 3 /* KVM_S390_VIRTIO_CCW_NOTIFY */
+#define DIAG500_STORAGE_LIMIT 4
void handle_diag_500(S390CPU *cpu, uintptr_t ra);
--
2.48.1

View File

@ -0,0 +1,56 @@
From 53d1b43699c6b30583f41a18a33c28893718aeac Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 19 Dec 2024 15:41:10 +0100
Subject: [PATCH 18/26] s390x/s390-skeys: prepare for memory devices
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [18/26] 47edda0eeb6d5932f81633f2d9d294b1ca5f413c (thuth/qemu-kvm-cs)
With memory devices, we will have storage keys for memory that
exceeds the initial ram size.
The TODO already states that current handling is subopimal,
but we won't worry about improving that (TCG-only) thing for now.
Message-ID: <20241219144115.2820241-10-david@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
(cherry picked from commit d1e3c2ac41b3f73708682e4e8212c32ad35013b9)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
hw/s390x/s390-skeys.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/hw/s390x/s390-skeys.c b/hw/s390x/s390-skeys.c
index bf22d6863e..e4297b3b8a 100644
--- a/hw/s390x/s390-skeys.c
+++ b/hw/s390x/s390-skeys.c
@@ -11,7 +11,7 @@
#include "qemu/osdep.h"
#include "qemu/units.h"
-#include "hw/boards.h"
+#include "hw/s390x/s390-virtio-ccw.h"
#include "hw/qdev-properties.h"
#include "hw/s390x/storage-keys.h"
#include "qapi/error.h"
@@ -251,9 +251,9 @@ static bool qemu_s390_enable_skeys(S390SKeysState *ss)
* g_once_init_enter() is good enough.
*/
if (g_once_init_enter(&initialized)) {
- MachineState *machine = MACHINE(qdev_get_machine());
+ S390CcwMachineState *s390ms = S390_CCW_MACHINE(qdev_get_machine());
- skeys->key_count = machine->ram_size / TARGET_PAGE_SIZE;
+ skeys->key_count = s390_get_memory_limit(s390ms) / TARGET_PAGE_SIZE;
skeys->keydata = g_malloc0(skeys->key_count);
g_once_init_leave(&initialized, 1);
}
--
2.48.1

View File

@ -0,0 +1,155 @@
From 1195c91d10892a888870248fd881612955b9e1eb Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 19 Dec 2024 15:41:09 +0100
Subject: [PATCH 17/26] s390x/s390-stattrib-kvm: prepare for memory devices and
sparse memory layouts
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [17/26] 799aa7b2b9cc2a948e9f391bc0ecf739254c78b1 (thuth/qemu-kvm-cs)
With memory devices, we will have storage attributes for memory that
exceeds the initial ram size. Further, we can easily have memory holes,
for which there (currently) are no storage attributes.
In particular, with memory holes, KVM_S390_SET_CMMA_BITS will fail to set
some storage attributes.
So let's do it like we handle storage keys migration, relying on
guest_phys_blocks_append(). However, in contrast to storage key
migration, we will handle it on the migration destination.
This is a preparation for virtio-mem support. Note that ever since the
"early migration" feature was added (x-early-migration), the state
of device blocks (plugged/unplugged) is migrated early such that
guest_phys_blocks_append() will properly consider all currently plugged
memory blocks and skip any unplugged ones.
In the future, we should try getting rid of the large temporary buffer
and also not send any attributes for any memory holes, just so they
get ignored on the destination.
Message-ID: <20241219144115.2820241-9-david@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
(cherry picked from commit 241e6b2d27b090b17cda5b011b2064544b0c458b)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
hw/s390x/s390-stattrib-kvm.c | 67 +++++++++++++++++++++++-------------
1 file changed, 43 insertions(+), 24 deletions(-)
diff --git a/hw/s390x/s390-stattrib-kvm.c b/hw/s390x/s390-stattrib-kvm.c
index eeaa811098..33ec91422a 100644
--- a/hw/s390x/s390-stattrib-kvm.c
+++ b/hw/s390x/s390-stattrib-kvm.c
@@ -10,11 +10,12 @@
*/
#include "qemu/osdep.h"
-#include "hw/boards.h"
+#include "hw/s390x/s390-virtio-ccw.h"
#include "migration/qemu-file.h"
#include "hw/s390x/storage-attributes.h"
#include "qemu/error-report.h"
#include "sysemu/kvm.h"
+#include "sysemu/memory_mapping.h"
#include "exec/ram_addr.h"
#include "kvm/kvm_s390x.h"
#include "qapi/error.h"
@@ -84,8 +85,8 @@ static int kvm_s390_stattrib_set_stattr(S390StAttribState *sa,
uint8_t *values)
{
KVMS390StAttribState *sas = KVM_S390_STATTRIB(sa);
- MachineState *machine = MACHINE(qdev_get_machine());
- unsigned long max = machine->ram_size / TARGET_PAGE_SIZE;
+ S390CcwMachineState *s390ms = S390_CCW_MACHINE(qdev_get_machine());
+ unsigned long max = s390_get_memory_limit(s390ms) / TARGET_PAGE_SIZE;
if (start_gfn + count > max) {
error_report("Out of memory bounds when setting storage attributes");
@@ -103,39 +104,57 @@ static int kvm_s390_stattrib_set_stattr(S390StAttribState *sa,
static void kvm_s390_stattrib_synchronize(S390StAttribState *sa)
{
KVMS390StAttribState *sas = KVM_S390_STATTRIB(sa);
- MachineState *machine = MACHINE(qdev_get_machine());
- unsigned long max = machine->ram_size / TARGET_PAGE_SIZE;
- /* We do not need to reach the maximum buffer size allowed */
- unsigned long cx, len = KVM_S390_SKEYS_MAX / 2;
+ S390CcwMachineState *s390ms = S390_CCW_MACHINE(qdev_get_machine());
+ unsigned long max = s390_get_memory_limit(s390ms) / TARGET_PAGE_SIZE;
+ unsigned long start_gfn, end_gfn, pages;
+ GuestPhysBlockList guest_phys_blocks;
+ GuestPhysBlock *block;
int r;
struct kvm_s390_cmma_log clog = {
.flags = 0,
.mask = ~0ULL,
};
- if (sas->incoming_buffer) {
- for (cx = 0; cx + len <= max; cx += len) {
- clog.start_gfn = cx;
- clog.count = len;
- clog.values = (uint64_t)(sas->incoming_buffer + cx);
- r = kvm_vm_ioctl(kvm_state, KVM_S390_SET_CMMA_BITS, &clog);
- if (r) {
- error_report("KVM_S390_SET_CMMA_BITS failed: %s", strerror(-r));
- return;
- }
- }
- if (cx < max) {
- clog.start_gfn = cx;
- clog.count = max - cx;
- clog.values = (uint64_t)(sas->incoming_buffer + cx);
+ if (!sas->incoming_buffer) {
+ return;
+ }
+ guest_phys_blocks_init(&guest_phys_blocks);
+ guest_phys_blocks_append(&guest_phys_blocks);
+
+ QTAILQ_FOREACH(block, &guest_phys_blocks.head, next) {
+ assert(QEMU_IS_ALIGNED(block->target_start, TARGET_PAGE_SIZE));
+ assert(QEMU_IS_ALIGNED(block->target_end, TARGET_PAGE_SIZE));
+
+ start_gfn = block->target_start / TARGET_PAGE_SIZE;
+ end_gfn = block->target_end / TARGET_PAGE_SIZE;
+
+ while (start_gfn < end_gfn) {
+ /* Don't exceed the maximum buffer size. */
+ pages = MIN(end_gfn - start_gfn, KVM_S390_SKEYS_MAX / 2);
+
+ /*
+ * If we ever get guest physical memory beyond the configured
+ * memory limit, something went very wrong.
+ */
+ assert(start_gfn + pages <= max);
+
+ clog.start_gfn = start_gfn;
+ clog.count = pages;
+ clog.values = (uint64_t)(sas->incoming_buffer + start_gfn);
r = kvm_vm_ioctl(kvm_state, KVM_S390_SET_CMMA_BITS, &clog);
if (r) {
error_report("KVM_S390_SET_CMMA_BITS failed: %s", strerror(-r));
+ goto out;
}
+
+ start_gfn += pages;
}
- g_free(sas->incoming_buffer);
- sas->incoming_buffer = NULL;
}
+
+out:
+ guest_phys_blocks_free(&guest_phys_blocks);
+ g_free(sas->incoming_buffer);
+ sas->incoming_buffer = NULL;
}
static int kvm_s390_stattrib_set_migrationmode(S390StAttribState *sa, bool val,
--
2.48.1

View File

@ -0,0 +1,63 @@
From 4ee3076ac566622929f9410636483c4f0b2da967 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 19 Dec 2024 15:41:02 +0100
Subject: [PATCH 10/26] s390x/s390-virtio-ccw: don't crash on weird RAM sizes
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [10/26] 55738da52f3cf4746bee2b17780a10720fa05863 (thuth/qemu-kvm-cs)
KVM is not happy when starting a VM with weird RAM sizes:
# qemu-system-s390x --enable-kvm --nographic -m 1234K
qemu-system-s390x: kvm_set_user_memory_region: KVM_SET_USER_MEMORY_REGION
failed, slot=0, start=0x0, size=0x244000: Invalid argument
kvm_set_phys_mem: error registering slot: Invalid argument
Aborted (core dumped)
Let's handle that in a better way by rejecting such weird RAM sizes
right from the start:
# qemu-system-s390x --enable-kvm --nographic -m 1234K
qemu-system-s390x: ram size must be multiples of 1 MiB
Message-ID: <20241219144115.2820241-2-david@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Acked-by: Janosch Frank <frankja@linux.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
(cherry picked from commit 14e568ab4836347481af2e334009c385f456a734)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
hw/s390x/s390-virtio-ccw.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index 94cad1705b..82ded9666c 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -180,6 +180,17 @@ static void s390_memory_init(MemoryRegion *ram)
{
MemoryRegion *sysmem = get_system_memory();
+ if (!QEMU_IS_ALIGNED(memory_region_size(ram), 1 * MiB)) {
+ /*
+ * SCLP cannot possibly expose smaller granularity right now and KVM
+ * cannot handle smaller granularity. As we don't support NUMA, the
+ * region size directly corresponds to machine->ram_size, and the region
+ * is a single RAM memory region.
+ */
+ error_report("ram size must be multiples of 1 MiB");
+ exit(EXIT_FAILURE);
+ }
+
/* allocate RAM for core */
memory_region_add_subregion(sysmem, 0, ram);
--
2.48.1

View File

@ -0,0 +1,140 @@
From 9ec2d356210f1e66f50519cc4d58633a13db9004 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 19 Dec 2024 15:41:06 +0100
Subject: [PATCH 14/26] s390x/s390-virtio-ccw: move setting the maximum guest
size from sclp to machine code
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [14/26] a5970c1c6d8d09a473a25a7eee533ec3a6711ec8 (thuth/qemu-kvm-cs)
Nowadays, it feels more natural to have that code located in
s390_memory_init(), where we also have direct access to the machine
object.
While at it, use the actual RAM size, not the maximum RAM size which
cannot currently be reached without support for any memory devices.
Consequently update s390_pv_vm_try_disable_async() to rely on the RAM size
as well, to avoid temporary issues while we further rework that
handling.
set_memory_limit() is temporary, we'll merge it with
s390_set_memory_limit() next.
Message-ID: <20241219144115.2820241-6-david@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
(cherry picked from commit 3c6fb557d295949bea291c3bf88ee9c83392e78c)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
hw/s390x/s390-virtio-ccw.c | 28 ++++++++++++++++++++++++----
hw/s390x/sclp.c | 11 -----------
target/s390x/kvm/pv.c | 2 +-
3 files changed, 25 insertions(+), 16 deletions(-)
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index d47e99028e..248ac28d20 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -121,11 +121,29 @@ static void subsystem_reset(void)
}
}
-static void s390_memory_init(MemoryRegion *ram)
+static void set_memory_limit(uint64_t new_limit)
+{
+ uint64_t hw_limit;
+ int ret;
+
+ ret = s390_set_memory_limit(new_limit, &hw_limit);
+ if (ret == -E2BIG) {
+ error_report("host supports a maximum of %" PRIu64 " GB",
+ hw_limit / GiB);
+ exit(EXIT_FAILURE);
+ } else if (ret) {
+ error_report("setting the guest size failed");
+ exit(EXIT_FAILURE);
+ }
+}
+
+static void s390_memory_init(MachineState *machine)
{
MemoryRegion *sysmem = get_system_memory();
+ MemoryRegion *ram = machine->ram;
+ uint64_t ram_size = memory_region_size(ram);
- if (!QEMU_IS_ALIGNED(memory_region_size(ram), 1 * MiB)) {
+ if (!QEMU_IS_ALIGNED(ram_size, 1 * MiB)) {
/*
* SCLP cannot possibly expose smaller granularity right now and KVM
* cannot handle smaller granularity. As we don't support NUMA, the
@@ -136,7 +154,9 @@ static void s390_memory_init(MemoryRegion *ram)
exit(EXIT_FAILURE);
}
- /* allocate RAM for core */
+ set_memory_limit(ram_size);
+
+ /* Map the initial memory. Must happen after setting the memory limit. */
memory_region_add_subregion(sysmem, 0, ram);
/*
@@ -211,7 +231,7 @@ static void ccw_init(MachineState *machine)
qdev_realize_and_unref(DEVICE(ms->sclp), NULL, &error_fatal);
/* init memory + setup max page size. Required for the CPU model */
- s390_memory_init(machine->ram);
+ s390_memory_init(machine);
/* init CPUs (incl. CPU model) early so s390_has_feature() works */
s390_init_cpus(machine);
diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c
index 8757626b5c..73e88ab4eb 100644
--- a/hw/s390x/sclp.c
+++ b/hw/s390x/sclp.c
@@ -376,10 +376,7 @@ void sclp_service_interrupt(uint32_t sccb)
/* qemu object creation and initialization functions */
static void sclp_realize(DeviceState *dev, Error **errp)
{
- MachineState *machine = MACHINE(qdev_get_machine());
SCLPDevice *sclp = SCLP(dev);
- uint64_t hw_limit;
- int ret;
/*
* qdev_device_add searches the sysbus for TYPE_SCLP_EVENTS_BUS. As long
@@ -389,14 +386,6 @@ static void sclp_realize(DeviceState *dev, Error **errp)
if (!sysbus_realize(SYS_BUS_DEVICE(sclp->event_facility), errp)) {
return;
}
-
- ret = s390_set_memory_limit(machine->maxram_size, &hw_limit);
- if (ret == -E2BIG) {
- error_setg(errp, "host supports a maximum of %" PRIu64 " GB",
- hw_limit / GiB);
- } else if (ret) {
- error_setg(errp, "setting the guest size failed");
- }
}
static void sclp_memory_init(SCLPDevice *sclp)
diff --git a/target/s390x/kvm/pv.c b/target/s390x/kvm/pv.c
index dde836d21a..424cce75ca 100644
--- a/target/s390x/kvm/pv.c
+++ b/target/s390x/kvm/pv.c
@@ -133,7 +133,7 @@ bool s390_pv_vm_try_disable_async(S390CcwMachineState *ms)
* If the feature is not present or if the VM is not larger than 2 GiB,
* KVM_PV_ASYNC_CLEANUP_PREPARE fill fail; no point in attempting it.
*/
- if ((MACHINE(ms)->maxram_size <= 2 * GiB) ||
+ if ((MACHINE(ms)->ram_size <= 2 * GiB) ||
!kvm_check_extension(kvm_state, KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) {
return false;
}
--
2.48.1

View File

@ -0,0 +1,117 @@
From 0e7d7bf86fb242c1ea90bf9648fb061626790eda Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 19 Dec 2024 15:41:11 +0100
Subject: [PATCH 19/26] s390x/s390-virtio-ccw: prepare for memory devices
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [19/26] 2441c8c5f5a06d5ca93188dd44e8a08f06d1722b (thuth/qemu-kvm-cs)
Let's prepare our address space for memory devices if enabled via
"maxmem" and if we have CONFIG_MEM_DEVICE enabled at all. Note that
CONFIG_MEM_DEVICE will be selected automatically once we add support
for devices.
Just like on other architectures, the region container for memory devices
is placed directly above our initial memory. For now, we only align the
start address of the region up to 1 GiB, but we won't add any additional
space to the region for internal alignment purposes; this can be done in
the future if really required.
The RAM size returned via SCLP is not modified, as this only
covers initial RAM (and standby memory we don't implement) and not memory
devices; clarify that in the docs of read_SCP_info(). Existing OSes without
support for memory devices will keep working as is, even when memory
devices would be attached the VM.
Guest OSs which support memory devices, such as virtio-mem, will
consult diag500(), to find out the maximum possible pfn. Guest OSes that
don't support memory devices, don't have to be changed and will continue
relying on information provided by SCLP.
There are no remaining maxram_size users in s390x code, and the remaining
ram_size users only care about initial RAM:
* hw/s390x/ipl.c
* hw/s390x/s390-hypercall.c
* hw/s390x/sclp.c
* target/s390x/kvm/pv.c
Message-ID: <20241219144115.2820241-11-david@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
(cherry picked from commit 1e86400298cf0fed5f7d49427db477775b859093)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
hw/s390x/s390-virtio-ccw.c | 23 ++++++++++++++++++++++-
hw/s390x/sclp.c | 6 +++++-
2 files changed, 27 insertions(+), 2 deletions(-)
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index f5f147eb92..824c73536a 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -149,6 +149,7 @@ static void s390_memory_init(MachineState *machine)
MemoryRegion *sysmem = get_system_memory();
MemoryRegion *ram = machine->ram;
uint64_t ram_size = memory_region_size(ram);
+ uint64_t devmem_base, devmem_size;
if (!QEMU_IS_ALIGNED(ram_size, 1 * MiB)) {
/*
@@ -161,11 +162,31 @@ static void s390_memory_init(MachineState *machine)
exit(EXIT_FAILURE);
}
- s390_set_memory_limit(s390ms, ram_size);
+ devmem_size = 0;
+ devmem_base = ram_size;
+#ifdef CONFIG_MEM_DEVICE
+ if (machine->ram_size < machine->maxram_size) {
+
+ /*
+ * Make sure memory devices have a sane default alignment, even
+ * when weird initial memory sizes are specified.
+ */
+ devmem_base = QEMU_ALIGN_UP(devmem_base, 1 * GiB);
+ devmem_size = machine->maxram_size - machine->ram_size;
+ }
+#endif
+ s390_set_memory_limit(s390ms, devmem_base + devmem_size);
/* Map the initial memory. Must happen after setting the memory limit. */
memory_region_add_subregion(sysmem, 0, ram);
+ /* Initialize address space for memory devices. */
+#ifdef CONFIG_MEM_DEVICE
+ if (devmem_size) {
+ machine_memory_devices_init(machine, devmem_base, devmem_size);
+ }
+#endif /* CONFIG_MEM_DEVICE */
+
/*
* Configure the maximum page size. As no memory devices were created
* yet, this is the page size of initial memory only.
diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c
index 73e88ab4eb..5945c9b1d8 100644
--- a/hw/s390x/sclp.c
+++ b/hw/s390x/sclp.c
@@ -161,7 +161,11 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb)
read_info->rnsize2 = cpu_to_be32(rnsize);
}
- /* we don't support standby memory, maxram_size is never exposed */
+ /*
+ * We don't support standby memory. maxram_size is used for sizing the
+ * memory device region, which is not exposed through SCLP but through
+ * diag500.
+ */
rnmax = machine->ram_size >> sclp->increment_size;
if (rnmax < 0x10000) {
read_info->rnmax = cpu_to_be16(rnmax);
--
2.48.1

View File

@ -0,0 +1,163 @@
From d2764db41fc6edcead9ad27b8d31e7bff524c0c0 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 19 Dec 2024 15:41:04 +0100
Subject: [PATCH 12/26] s390x/s390-virtio-hcall: prepare for more diag500
hypercalls
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [12/26] 6573602d71b9e70679a48315f913309be29d6239 (thuth/qemu-kvm-cs)
Let's generalize, abstracting the virtio bits. diag500 is now a generic
hypercall to handle QEMU/KVM specific things. Explicitly specify all
already defined subcodes, including legacy ones (so we know what we can
use for new hypercalls).
Move the PGM_SPECIFICATION injection into the renamed function
handle_diag_500(), so we can turn it into a void function.
We'll rename the files separately, so git properly detects the rename.
Message-ID: <20241219144115.2820241-4-david@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
(cherry picked from commit 6e9cc2da4e8b997fd6ff3249034f436b84fc7974)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
hw/s390x/s390-virtio-hcall.c | 15 ++++++++-------
hw/s390x/s390-virtio-hcall.h | 11 ++++++-----
target/s390x/kvm/kvm.c | 20 +++-----------------
target/s390x/tcg/misc_helper.c | 5 +++--
4 files changed, 20 insertions(+), 31 deletions(-)
diff --git a/hw/s390x/s390-virtio-hcall.c b/hw/s390x/s390-virtio-hcall.c
index ca49e3cd22..5fb78a719e 100644
--- a/hw/s390x/s390-virtio-hcall.c
+++ b/hw/s390x/s390-virtio-hcall.c
@@ -1,5 +1,5 @@
/*
- * Support for virtio hypercalls on s390
+ * Support for QEMU/KVM hypercalls on s390
*
* Copyright 2012 IBM Corp.
* Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
@@ -57,18 +57,19 @@ static int handle_virtio_ccw_notify(uint64_t subch_id, uint64_t data)
return 0;
}
-int s390_virtio_hypercall(CPUS390XState *env)
+void handle_diag_500(S390CPU *cpu, uintptr_t ra)
{
+ CPUS390XState *env = &cpu->env;
const uint64_t subcode = env->regs[1];
switch (subcode) {
- case KVM_S390_VIRTIO_NOTIFY:
+ case DIAG500_VIRTIO_NOTIFY:
env->regs[2] = handle_virtio_notify(env->regs[2]);
- return 0;
- case KVM_S390_VIRTIO_CCW_NOTIFY:
+ break;
+ case DIAG500_VIRTIO_CCW_NOTIFY:
env->regs[2] = handle_virtio_ccw_notify(env->regs[2], env->regs[3]);
- return 0;
+ break;
default:
- return -EINVAL;
+ s390_program_interrupt(env, PGM_SPECIFICATION, ra);
}
}
diff --git a/hw/s390x/s390-virtio-hcall.h b/hw/s390x/s390-virtio-hcall.h
index 3d9fe147d2..dca456b926 100644
--- a/hw/s390x/s390-virtio-hcall.h
+++ b/hw/s390x/s390-virtio-hcall.h
@@ -1,5 +1,5 @@
/*
- * Support for virtio hypercalls on s390x
+ * Support for QEMU/KVM hypercalls on s390x
*
* Copyright IBM Corp. 2012, 2017
* Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
@@ -12,12 +12,13 @@
#ifndef HW_S390_VIRTIO_HCALL_H
#define HW_S390_VIRTIO_HCALL_H
-#include "standard-headers/asm-s390/virtio-ccw.h"
#include "cpu.h"
-/* The only thing that we need from the old kvm_virtio.h file */
-#define KVM_S390_VIRTIO_NOTIFY 0
+#define DIAG500_VIRTIO_NOTIFY 0 /* legacy, implemented as a NOP */
+#define DIAG500_VIRTIO_RESET 1 /* legacy */
+#define DIAG500_VIRTIO_SET_STATUS 2 /* legacy */
+#define DIAG500_VIRTIO_CCW_NOTIFY 3 /* KVM_S390_VIRTIO_CCW_NOTIFY */
-int s390_virtio_hypercall(CPUS390XState *env);
+void handle_diag_500(S390CPU *cpu, uintptr_t ra);
#endif /* HW_S390_VIRTIO_HCALL_H */
diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c
index 5947dda829..42d6a54126 100644
--- a/target/s390x/kvm/kvm.c
+++ b/target/s390x/kvm/kvm.c
@@ -1492,22 +1492,6 @@ static int handle_e3(S390CPU *cpu, struct kvm_run *run, uint8_t ipbl)
return r;
}
-static int handle_hypercall(S390CPU *cpu, struct kvm_run *run)
-{
- CPUS390XState *env = &cpu->env;
- int ret = -EINVAL;
-
-#ifdef CONFIG_S390_CCW_VIRTIO
- ret = s390_virtio_hypercall(env);
-#endif /* CONFIG_S390_CCW_VIRTIO */
- if (ret == -EINVAL) {
- kvm_s390_program_interrupt(cpu, PGM_SPECIFICATION);
- return 0;
- }
-
- return ret;
-}
-
static void kvm_handle_diag_288(S390CPU *cpu, struct kvm_run *run)
{
uint64_t r1, r3;
@@ -1603,9 +1587,11 @@ static int handle_diag(S390CPU *cpu, struct kvm_run *run, uint32_t ipb)
case DIAG_SET_CONTROL_PROGRAM_CODES:
handle_diag_318(cpu, run);
break;
+#ifdef CONFIG_S390_CCW_VIRTIO
case DIAG_KVM_HYPERCALL:
- r = handle_hypercall(cpu, run);
+ handle_diag_500(cpu, RA_IGNORED);
break;
+#endif /* CONFIG_S390_CCW_VIRTIO */
case DIAG_KVM_BREAKPOINT:
r = handle_sw_breakpoint(cpu, run);
break;
diff --git a/target/s390x/tcg/misc_helper.c b/target/s390x/tcg/misc_helper.c
index f44136a568..2b4310003b 100644
--- a/target/s390x/tcg/misc_helper.c
+++ b/target/s390x/tcg/misc_helper.c
@@ -119,10 +119,11 @@ void HELPER(diag)(CPUS390XState *env, uint32_t r1, uint32_t r3, uint32_t num)
switch (num) {
#ifdef CONFIG_S390_CCW_VIRTIO
case 0x500:
- /* KVM hypercall */
+ /* QEMU/KVM hypercall */
bql_lock();
- r = s390_virtio_hypercall(env);
+ handle_diag_500(env_archcpu(env), GETPC());
bql_unlock();
+ r = 0;
break;
#endif /* CONFIG_S390_CCW_VIRTIO */
case 0x44:
--
2.48.1

View File

@ -0,0 +1,296 @@
From 16ccb16d393a3e63936dc993c30c67fdecb1f120 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 19 Dec 2024 15:41:03 +0100
Subject: [PATCH 11/26] s390x/s390-virtio-hcall: remove hypercall registration
mechanism
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [11/26] 5e8d2720fe9fd6e6e24487d71988821f1cf27f17 (thuth/qemu-kvm-cs)
Nowadays, we only have a single machine type in QEMU, everything is based
on virtio-ccw and the traditional virtio machine does no longer exist. No
need to dynamically register diag500 handlers. Move the two existing
handlers into s390-virtio-hcall.c.
Message-ID: <20241219144115.2820241-3-david@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
(cherry picked from commit 4be0fce498d0a08f18b3a9accdb9ded79484d30a)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
hw/s390x/meson.build | 6 ++--
hw/s390x/s390-virtio-ccw.c | 58 ------------------------------
hw/s390x/s390-virtio-hcall.c | 65 +++++++++++++++++++++++++---------
hw/s390x/s390-virtio-hcall.h | 2 --
target/s390x/kvm/kvm.c | 5 ++-
target/s390x/tcg/misc_helper.c | 3 ++
6 files changed, 60 insertions(+), 79 deletions(-)
diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build
index 482fd13420..d6c8c33915 100644
--- a/hw/s390x/meson.build
+++ b/hw/s390x/meson.build
@@ -12,7 +12,6 @@ s390x_ss.add(files(
's390-pci-inst.c',
's390-skeys.c',
's390-stattrib.c',
- 's390-virtio-hcall.c',
'sclp.c',
'sclpcpu.c',
'sclpquiesce.c',
@@ -28,7 +27,10 @@ s390x_ss.add(when: 'CONFIG_KVM', if_true: files(
s390x_ss.add(when: 'CONFIG_TCG', if_true: files(
'tod-tcg.c',
))
-s390x_ss.add(when: 'CONFIG_S390_CCW_VIRTIO', if_true: files('s390-virtio-ccw.c'))
+s390x_ss.add(when: 'CONFIG_S390_CCW_VIRTIO', if_true: files(
+ 's390-virtio-ccw.c',
+ 's390-virtio-hcall.c',
+))
s390x_ss.add(when: 'CONFIG_TERMINAL3270', if_true: files('3270-ccw.c'))
s390x_ss.add(when: 'CONFIG_VFIO', if_true: files('s390-pci-vfio.c'))
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index 82ded9666c..d47e99028e 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -16,11 +16,8 @@
#include "exec/ram_addr.h"
#include "exec/confidential-guest-support.h"
#include "hw/boards.h"
-#include "hw/s390x/s390-virtio-hcall.h"
#include "hw/s390x/sclp.h"
#include "hw/s390x/s390_flic.h"
-#include "hw/s390x/ioinst.h"
-#include "hw/s390x/css.h"
#include "virtio-ccw.h"
#include "qemu/config-file.h"
#include "qemu/ctype.h"
@@ -124,58 +121,6 @@ static void subsystem_reset(void)
}
}
-static int virtio_ccw_hcall_notify(const uint64_t *args)
-{
- uint64_t subch_id = args[0];
- uint64_t data = args[1];
- SubchDev *sch;
- VirtIODevice *vdev;
- int cssid, ssid, schid, m;
- uint16_t vq_idx = data;
-
- if (ioinst_disassemble_sch_ident(subch_id, &m, &cssid, &ssid, &schid)) {
- return -EINVAL;
- }
- sch = css_find_subch(m, cssid, ssid, schid);
- if (!sch || !css_subch_visible(sch)) {
- return -EINVAL;
- }
-
- vdev = virtio_ccw_get_vdev(sch);
- if (vq_idx >= VIRTIO_QUEUE_MAX || !virtio_queue_get_num(vdev, vq_idx)) {
- return -EINVAL;
- }
-
- if (virtio_vdev_has_feature(vdev, VIRTIO_F_NOTIFICATION_DATA)) {
- virtio_queue_set_shadow_avail_idx(virtio_get_queue(vdev, vq_idx),
- (data >> 16) & 0xFFFF);
- }
-
- virtio_queue_notify(vdev, vq_idx);
- return 0;
-}
-
-static int virtio_ccw_hcall_early_printk(const uint64_t *args)
-{
- uint64_t mem = args[0];
- MachineState *ms = MACHINE(qdev_get_machine());
-
- if (mem < ms->ram_size) {
- /* Early printk */
- return 0;
- }
- return -EINVAL;
-}
-
-static void virtio_ccw_register_hcalls(void)
-{
- s390_register_virtio_hypercall(KVM_S390_VIRTIO_CCW_NOTIFY,
- virtio_ccw_hcall_notify);
- /* Tolerate early printk. */
- s390_register_virtio_hypercall(KVM_S390_VIRTIO_NOTIFY,
- virtio_ccw_hcall_early_printk);
-}
-
static void s390_memory_init(MemoryRegion *ram)
{
MemoryRegion *sysmem = get_system_memory();
@@ -296,9 +241,6 @@ static void ccw_init(MachineState *machine)
OBJECT(dev));
sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
- /* register hypercalls */
- virtio_ccw_register_hcalls();
-
s390_enable_css_support(s390_cpu_addr2state(0));
ret = css_create_css_image(VIRTUAL_CSSID, true);
diff --git a/hw/s390x/s390-virtio-hcall.c b/hw/s390x/s390-virtio-hcall.c
index ec7cf8beb3..ca49e3cd22 100644
--- a/hw/s390x/s390-virtio-hcall.c
+++ b/hw/s390x/s390-virtio-hcall.c
@@ -11,31 +11,64 @@
#include "qemu/osdep.h"
#include "cpu.h"
+#include "hw/boards.h"
#include "hw/s390x/s390-virtio-hcall.h"
+#include "hw/s390x/ioinst.h"
+#include "hw/s390x/css.h"
+#include "virtio-ccw.h"
-#define MAX_DIAG_SUBCODES 255
+static int handle_virtio_notify(uint64_t mem)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
-static s390_virtio_fn s390_diag500_table[MAX_DIAG_SUBCODES];
+ if (mem < ms->ram_size) {
+ /* Early printk */
+ return 0;
+ }
+ return -EINVAL;
+}
-void s390_register_virtio_hypercall(uint64_t code, s390_virtio_fn fn)
+static int handle_virtio_ccw_notify(uint64_t subch_id, uint64_t data)
{
- assert(code < MAX_DIAG_SUBCODES);
- assert(!s390_diag500_table[code]);
+ SubchDev *sch;
+ VirtIODevice *vdev;
+ int cssid, ssid, schid, m;
+ uint16_t vq_idx = data;
+
+ if (ioinst_disassemble_sch_ident(subch_id, &m, &cssid, &ssid, &schid)) {
+ return -EINVAL;
+ }
+ sch = css_find_subch(m, cssid, ssid, schid);
+ if (!sch || !css_subch_visible(sch)) {
+ return -EINVAL;
+ }
- s390_diag500_table[code] = fn;
+ vdev = virtio_ccw_get_vdev(sch);
+ if (vq_idx >= VIRTIO_QUEUE_MAX || !virtio_queue_get_num(vdev, vq_idx)) {
+ return -EINVAL;
+ }
+
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_NOTIFICATION_DATA)) {
+ virtio_queue_set_shadow_avail_idx(virtio_get_queue(vdev, vq_idx),
+ (data >> 16) & 0xFFFF);
+ }
+
+ virtio_queue_notify(vdev, vq_idx);
+ return 0;
}
int s390_virtio_hypercall(CPUS390XState *env)
{
- s390_virtio_fn fn;
-
- if (env->regs[1] < MAX_DIAG_SUBCODES) {
- fn = s390_diag500_table[env->regs[1]];
- if (fn) {
- env->regs[2] = fn(&env->regs[2]);
- return 0;
- }
- }
+ const uint64_t subcode = env->regs[1];
- return -EINVAL;
+ switch (subcode) {
+ case KVM_S390_VIRTIO_NOTIFY:
+ env->regs[2] = handle_virtio_notify(env->regs[2]);
+ return 0;
+ case KVM_S390_VIRTIO_CCW_NOTIFY:
+ env->regs[2] = handle_virtio_ccw_notify(env->regs[2], env->regs[3]);
+ return 0;
+ default:
+ return -EINVAL;
+ }
}
diff --git a/hw/s390x/s390-virtio-hcall.h b/hw/s390x/s390-virtio-hcall.h
index 3ae6d6ae3a..3d9fe147d2 100644
--- a/hw/s390x/s390-virtio-hcall.h
+++ b/hw/s390x/s390-virtio-hcall.h
@@ -18,8 +18,6 @@
/* The only thing that we need from the old kvm_virtio.h file */
#define KVM_S390_VIRTIO_NOTIFY 0
-typedef int (*s390_virtio_fn)(const uint64_t *args);
-void s390_register_virtio_hypercall(uint64_t code, s390_virtio_fn fn);
int s390_virtio_hypercall(CPUS390XState *env);
#endif /* HW_S390_VIRTIO_HCALL_H */
diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c
index 7a0ca5570f..5947dda829 100644
--- a/target/s390x/kvm/kvm.c
+++ b/target/s390x/kvm/kvm.c
@@ -51,6 +51,7 @@
#include "hw/s390x/s390-virtio-ccw.h"
#include "hw/s390x/s390-virtio-hcall.h"
#include "target/s390x/kvm/pv.h"
+#include CONFIG_DEVICES
#define kvm_vm_check_mem_attr(s, attr) \
kvm_vm_check_attr(s, KVM_S390_VM_MEM_CTRL, attr)
@@ -1494,9 +1495,11 @@ static int handle_e3(S390CPU *cpu, struct kvm_run *run, uint8_t ipbl)
static int handle_hypercall(S390CPU *cpu, struct kvm_run *run)
{
CPUS390XState *env = &cpu->env;
- int ret;
+ int ret = -EINVAL;
+#ifdef CONFIG_S390_CCW_VIRTIO
ret = s390_virtio_hypercall(env);
+#endif /* CONFIG_S390_CCW_VIRTIO */
if (ret == -EINVAL) {
kvm_s390_program_interrupt(cpu, PGM_SPECIFICATION);
return 0;
diff --git a/target/s390x/tcg/misc_helper.c b/target/s390x/tcg/misc_helper.c
index 303f86d363..f44136a568 100644
--- a/target/s390x/tcg/misc_helper.c
+++ b/target/s390x/tcg/misc_helper.c
@@ -43,6 +43,7 @@
#include "hw/s390x/s390-pci-inst.h"
#include "hw/boards.h"
#include "hw/s390x/tod.h"
+#include CONFIG_DEVICES
#endif
/* #define DEBUG_HELPER */
@@ -116,12 +117,14 @@ void HELPER(diag)(CPUS390XState *env, uint32_t r1, uint32_t r3, uint32_t num)
uint64_t r;
switch (num) {
+#ifdef CONFIG_S390_CCW_VIRTIO
case 0x500:
/* KVM hypercall */
bql_lock();
r = s390_virtio_hypercall(env);
bql_unlock();
break;
+#endif /* CONFIG_S390_CCW_VIRTIO */
case 0x44:
/* yield */
r = 0;
--
2.48.1

View File

@ -0,0 +1,423 @@
From 6b82fca2ecac0c7b30780ebb71ce5bad0421b9b4 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 19 Dec 2024 15:41:14 +0100
Subject: [PATCH 22/26] s390x/virtio-ccw: add support for virtio based memory
devices
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [22/26] 270a9fbe7e5bacfa6c9377815a01da26c4d26097 (thuth/qemu-kvm-cs)
Let's implement support for abstract virtio based memory devices, using
the virtio-pci implementation as an orientation. Wire them up in the
machine hotplug handler, taking care of s390x page size limitations.
As we neither support virtio-mem or virtio-pmem yet, the code is
effectively unused. We'll implement support for virtio-mem based on this
next.
Note that we won't wire up the virtio-pci variant (should currently be
impossible due to lack of support for MSI-X), but we'll add a safety net
to reject plugging them in the pre-plug handler.
Message-ID: <20241219144115.2820241-14-david@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
(cherry picked from commit 88d86f6f1e36741ba9e1625da19a7ccf1a343d39)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
MAINTAINERS | 3 +
hw/s390x/meson.build | 3 +
hw/s390x/s390-virtio-ccw.c | 47 +++++++++-
hw/s390x/virtio-ccw-md-stubs.c | 24 ++++++
hw/s390x/virtio-ccw-md.c | 153 +++++++++++++++++++++++++++++++++
hw/s390x/virtio-ccw-md.h | 44 ++++++++++
hw/virtio/Kconfig | 1 +
7 files changed, 274 insertions(+), 1 deletion(-)
create mode 100644 hw/s390x/virtio-ccw-md-stubs.c
create mode 100644 hw/s390x/virtio-ccw-md.c
create mode 100644 hw/s390x/virtio-ccw-md.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 3584d6a6c6..f21dc3fa75 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2387,6 +2387,9 @@ F: include/hw/virtio/virtio-crypto.h
virtio based memory device
M: David Hildenbrand <david@redhat.com>
S: Supported
+F: hw/s390x/virtio-ccw-md.c
+F: hw/s390x/virtio-ccw-md.h
+F: hw/s390x/virtio-ccw-md-stubs.c
F: hw/virtio/virtio-md-pci.c
F: include/hw/virtio/virtio-md-pci.h
F: stubs/virtio-md-pci.c
diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build
index e344a3bd8c..4431868408 100644
--- a/hw/s390x/meson.build
+++ b/hw/s390x/meson.build
@@ -50,8 +50,11 @@ endif
virtio_ss.add(when: 'CONFIG_VHOST_SCSI', if_true: files('vhost-scsi-ccw.c'))
virtio_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: files('vhost-vsock-ccw.c'))
virtio_ss.add(when: 'CONFIG_VHOST_USER_FS', if_true: files('vhost-user-fs-ccw.c'))
+virtio_ss.add(when: 'CONFIG_VIRTIO_MD', if_true: files('virtio-ccw-md.c'))
s390x_ss.add_all(when: 'CONFIG_VIRTIO_CCW', if_true: virtio_ss)
+s390x_ss.add(when: 'CONFIG_VIRTIO_MD', if_false: files('virtio-ccw-md-stubs.c'))
+
hw_arch += {'s390x': s390x_ss}
hw_s390x_modules = {}
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index bd05a22b4e..9f4ad01789 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -46,6 +46,8 @@
#include "qapi/visitor.h"
#include "hw/s390x/cpu-topology.h"
#include "kvm/kvm_s390x.h"
+#include "hw/virtio/virtio-md-pci.h"
+#include "hw/s390x/virtio-ccw-md.h"
#include CONFIG_DEVICES
static Error *pv_mig_blocker;
@@ -546,11 +548,39 @@ static void s390_machine_reset(MachineState *machine, ResetType type)
s390_ipl_clear_reset_request();
}
+static void s390_machine_device_pre_plug(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_CCW)) {
+ virtio_ccw_md_pre_plug(VIRTIO_MD_CCW(dev), MACHINE(hotplug_dev), errp);
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
+ error_setg(errp,
+ "PCI-attached virtio based memory devices not supported");
+ }
+}
+
static void s390_machine_device_plug(HotplugHandler *hotplug_dev,
DeviceState *dev, Error **errp)
{
+ S390CcwMachineState *s390ms = S390_CCW_MACHINE(hotplug_dev);
+
if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
s390_cpu_plug(hotplug_dev, dev, errp);
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_CCW)) {
+ /*
+ * At this point, the device is realized and set all memdevs mapped, so
+ * qemu_maxrampagesize() will pick up the page sizes of these memdevs
+ * as well. Before we plug the device and expose any RAM memory regions
+ * to the system, make sure we don't exceed the previously set max page
+ * size. While only relevant for KVM, there is not really any use case
+ * for this with TCG, so we'll unconditionally reject it.
+ */
+ if (qemu_maxrampagesize() != s390ms->max_pagesize) {
+ error_setg(errp, "Memory device uses a bigger page size than"
+ " initial memory");
+ return;
+ }
+ virtio_ccw_md_plug(VIRTIO_MD_CCW(dev), MACHINE(hotplug_dev), errp);
}
}
@@ -560,9 +590,20 @@ static void s390_machine_device_unplug_request(HotplugHandler *hotplug_dev,
if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
error_setg(errp, "CPU hot unplug not supported on this machine");
return;
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_CCW)) {
+ virtio_ccw_md_unplug_request(VIRTIO_MD_CCW(dev), MACHINE(hotplug_dev),
+ errp);
}
}
+static void s390_machine_device_unplug(HotplugHandler *hotplug_dev,
+ DeviceState *dev, Error **errp)
+{
+ if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_CCW)) {
+ virtio_ccw_md_unplug(VIRTIO_MD_CCW(dev), MACHINE(hotplug_dev), errp);
+ }
+ }
+
static CpuInstanceProperties s390_cpu_index_to_props(MachineState *ms,
unsigned cpu_index)
{
@@ -609,7 +650,9 @@ static const CPUArchIdList *s390_possible_cpu_arch_ids(MachineState *ms)
static HotplugHandler *s390_get_hotplug_handler(MachineState *machine,
DeviceState *dev)
{
- if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
+ if (object_dynamic_cast(OBJECT(dev), TYPE_CPU) ||
+ object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_CCW) ||
+ object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
return HOTPLUG_HANDLER(machine);
}
return NULL;
@@ -769,8 +812,10 @@ static void ccw_machine_class_init(ObjectClass *oc, void *data)
mc->possible_cpu_arch_ids = s390_possible_cpu_arch_ids;
/* it is overridden with 'host' cpu *in kvm_arch_init* */
mc->default_cpu_type = S390_CPU_TYPE_NAME("qemu");
+ hc->pre_plug = s390_machine_device_pre_plug;
hc->plug = s390_machine_device_plug;
hc->unplug_request = s390_machine_device_unplug_request;
+ hc->unplug = s390_machine_device_unplug;
nc->nmi_monitor_handler = s390_nmi;
mc->default_ram_id = "s390.ram";
mc->default_nic = "virtio-net-ccw";
diff --git a/hw/s390x/virtio-ccw-md-stubs.c b/hw/s390x/virtio-ccw-md-stubs.c
new file mode 100644
index 0000000000..e937865550
--- /dev/null
+++ b/hw/s390x/virtio-ccw-md-stubs.c
@@ -0,0 +1,24 @@
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "hw/s390x/virtio-ccw-md.h"
+
+void virtio_ccw_md_pre_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp)
+{
+ error_setg(errp, "virtio based memory devices not supported");
+}
+
+void virtio_ccw_md_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp)
+{
+ error_setg(errp, "virtio based memory devices not supported");
+}
+
+void virtio_ccw_md_unplug_request(VirtIOMDCcw *vmd, MachineState *ms,
+ Error **errp)
+{
+ error_setg(errp, "virtio based memory devices not supported");
+}
+
+void virtio_ccw_md_unplug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp)
+{
+ error_setg(errp, "virtio based memory devices not supported");
+}
diff --git a/hw/s390x/virtio-ccw-md.c b/hw/s390x/virtio-ccw-md.c
new file mode 100644
index 0000000000..de333282df
--- /dev/null
+++ b/hw/s390x/virtio-ccw-md.c
@@ -0,0 +1,153 @@
+/*
+ * Virtio CCW support for abstract virtio based memory device
+ *
+ * Copyright (C) 2024 Red Hat, Inc.
+ *
+ * Authors:
+ * David Hildenbrand <david@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/s390x/virtio-ccw-md.h"
+#include "hw/mem/memory-device.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+
+void virtio_ccw_md_pre_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp)
+{
+ DeviceState *dev = DEVICE(vmd);
+ HotplugHandler *bus_handler = qdev_get_bus_hotplug_handler(dev);
+ MemoryDeviceState *md = MEMORY_DEVICE(vmd);
+ Error *local_err = NULL;
+
+ if (!bus_handler && dev->hotplugged) {
+ /*
+ * Without a bus hotplug handler, we cannot control the plug/unplug
+ * order. We should never reach this point when hotplugging, but
+ * better add a safety net.
+ */
+ error_setg(errp, "hotplug of virtio based memory devices not supported"
+ " on this bus.");
+ return;
+ }
+
+ /*
+ * First, see if we can plug this memory device at all. If that
+ * succeeds, branch of to the actual hotplug handler.
+ */
+ memory_device_pre_plug(md, ms, &local_err);
+ if (!local_err && bus_handler) {
+ hotplug_handler_pre_plug(bus_handler, dev, &local_err);
+ }
+ error_propagate(errp, local_err);
+}
+
+void virtio_ccw_md_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp)
+{
+ DeviceState *dev = DEVICE(vmd);
+ HotplugHandler *bus_handler = qdev_get_bus_hotplug_handler(dev);
+ MemoryDeviceState *md = MEMORY_DEVICE(vmd);
+ Error *local_err = NULL;
+
+ /*
+ * Plug the memory device first and then branch off to the actual
+ * hotplug handler. If that one fails, we can easily undo the memory
+ * device bits.
+ */
+ memory_device_plug(md, ms);
+ if (bus_handler) {
+ hotplug_handler_plug(bus_handler, dev, &local_err);
+ if (local_err) {
+ memory_device_unplug(md, ms);
+ }
+ }
+ error_propagate(errp, local_err);
+}
+
+void virtio_ccw_md_unplug_request(VirtIOMDCcw *vmd, MachineState *ms,
+ Error **errp)
+{
+ VirtIOMDCcwClass *vmdc = VIRTIO_MD_CCW_GET_CLASS(vmd);
+ DeviceState *dev = DEVICE(vmd);
+ HotplugHandler *bus_handler = qdev_get_bus_hotplug_handler(dev);
+ HotplugHandlerClass *hdc;
+ Error *local_err = NULL;
+
+ if (!vmdc->unplug_request_check) {
+ error_setg(errp,
+ "this virtio based memory devices cannot be unplugged");
+ return;
+ }
+
+ if (!bus_handler) {
+ error_setg(errp, "hotunplug of virtio based memory devices not"
+ "supported on this bus");
+ return;
+ }
+
+ vmdc->unplug_request_check(vmd, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+
+ /*
+ * Forward the async request or turn it into a sync request (handling it
+ * like qdev_unplug()).
+ */
+ hdc = HOTPLUG_HANDLER_GET_CLASS(bus_handler);
+ if (hdc->unplug_request) {
+ hotplug_handler_unplug_request(bus_handler, dev, &local_err);
+ } else {
+ virtio_ccw_md_unplug(vmd, ms, &local_err);
+ if (!local_err) {
+ object_unparent(OBJECT(dev));
+ }
+ }
+}
+
+void virtio_ccw_md_unplug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp)
+{
+ DeviceState *dev = DEVICE(vmd);
+ HotplugHandler *bus_handler = qdev_get_bus_hotplug_handler(dev);
+ MemoryDeviceState *md = MEMORY_DEVICE(vmd);
+ Error *local_err = NULL;
+
+ /* Unplug the memory device while it is still realized. */
+ memory_device_unplug(md, ms);
+
+ if (bus_handler) {
+ hotplug_handler_unplug(bus_handler, dev, &local_err);
+ if (local_err) {
+ /* Not expected to fail ... but still try to recover. */
+ memory_device_plug(md, ms);
+ error_propagate(errp, local_err);
+ return;
+ }
+ } else {
+ /* Very unexpected, but let's just try to do the right thing. */
+ warn_report("Unexpected unplug of virtio based memory device");
+ qdev_unrealize(dev);
+ }
+}
+
+static const TypeInfo virtio_ccw_md_info = {
+ .name = TYPE_VIRTIO_MD_CCW,
+ .parent = TYPE_VIRTIO_CCW_DEVICE,
+ .instance_size = sizeof(VirtIOMDCcw),
+ .class_size = sizeof(VirtIOMDCcwClass),
+ .abstract = true,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_MEMORY_DEVICE },
+ { }
+ },
+};
+
+static void virtio_ccw_md_register(void)
+{
+ type_register_static(&virtio_ccw_md_info);
+}
+type_init(virtio_ccw_md_register)
diff --git a/hw/s390x/virtio-ccw-md.h b/hw/s390x/virtio-ccw-md.h
new file mode 100644
index 0000000000..39ba864c92
--- /dev/null
+++ b/hw/s390x/virtio-ccw-md.h
@@ -0,0 +1,44 @@
+/*
+ * Virtio CCW support for abstract virtio based memory device
+ *
+ * Copyright (C) 2024 Red Hat, Inc.
+ *
+ * Authors:
+ * David Hildenbrand <david@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef HW_S390X_VIRTIO_CCW_MD_H
+#define HW_S390X_VIRTIO_CCW_MD_H
+
+#include "virtio-ccw.h"
+#include "qom/object.h"
+
+/*
+ * virtio-md-ccw: This extends VirtioCcwDevice.
+ */
+#define TYPE_VIRTIO_MD_CCW "virtio-md-ccw"
+
+OBJECT_DECLARE_TYPE(VirtIOMDCcw, VirtIOMDCcwClass, VIRTIO_MD_CCW)
+
+struct VirtIOMDCcwClass {
+ /* private */
+ VirtIOCCWDeviceClass parent;
+
+ /* public */
+ void (*unplug_request_check)(VirtIOMDCcw *vmd, Error **errp);
+};
+
+struct VirtIOMDCcw {
+ VirtioCcwDevice parent_obj;
+};
+
+void virtio_ccw_md_pre_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp);
+void virtio_ccw_md_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp);
+void virtio_ccw_md_unplug_request(VirtIOMDCcw *vmd, MachineState *ms,
+ Error **errp);
+void virtio_ccw_md_unplug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp);
+
+#endif /* HW_S390X_VIRTIO_CCW_MD_H */
diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig
index 0afec2ae92..f4b14e1a44 100644
--- a/hw/virtio/Kconfig
+++ b/hw/virtio/Kconfig
@@ -25,6 +25,7 @@ config VIRTIO_MMIO
config VIRTIO_CCW
bool
select VIRTIO
+ select VIRTIO_MD_SUPPORTED
config VIRTIO_BALLOON
bool
--
2.48.1

View File

@ -0,0 +1,459 @@
From fa68427f55bee8d18d846e03ebf9f1eeb80f274d Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 19 Dec 2024 15:41:15 +0100
Subject: [PATCH 23/26] s390x: virtio-mem support
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 351: Enable virtio-mem support on s390x
RH-Jira: RHEL-72977
RH-Acked-by: David Hildenbrand <david@redhat.com>
RH-Acked-by: Juraj Marcin <None>
RH-Commit: [23/26] 4c59ba9025ce5ba7686a7f3e01bb70e8c580709f (thuth/qemu-kvm-cs)
Let's add our virtio-mem-ccw proxy device and wire it up. We should
be supporting everything (e.g., device unplug, "dynamic-memslots") that
we already support for the virtio-pci variant.
With a Linux guest that supports virtio-mem (and has automatic memory
onlining properly configured) the following example will work:
1. Start a VM with 4G initial memory and a virtio-mem device with a maximum
capacity of 16GB:
qemu/build/qemu-system-s390x \
--enable-kvm \
-m 4G,maxmem=20G \
-nographic \
-smp 8 \
-hda Fedora-Server-KVM-40-1.14.s390x.qcow2 \
-chardev socket,id=monitor,path=/var/tmp/monitor,server,nowait \
-mon chardev=monitor,mode=readline \
-object memory-backend-ram,id=mem0,size=16G,reserve=off \
-device virtio-mem-ccw,id=vmem0,memdev=mem0,dynamic-memslots=on
2. Query the current size of virtio-mem device:
(qemu) info memory-devices
Memory device [virtio-mem]: "vmem0"
memaddr: 0x100000000
node: 0
requested-size: 0
size: 0
max-size: 17179869184
block-size: 1048576
memdev: /objects/mem0
3. Request to grow it to 8GB (hotplug 8GB):
(qemu) qom-set vmem0 requested-size 8G
(qemu) info memory-devices
Memory device [virtio-mem]: "vmem0"
memaddr: 0x100000000
node: 0
requested-size: 8589934592
size: 8589934592
max-size: 17179869184
block-size: 1048576
memdev: /objects/mem0
4. Request to grow to 16GB (hotplug another 8GB):
(qemu) qom-set vmem0 requested-size 16G
(qemu) info memory-devices
Memory device [virtio-mem]: "vmem0"
memaddr: 0x100000000
node: 0
requested-size: 17179869184
size: 17179869184
max-size: 17179869184
block-size: 1048576
memdev: /objects/mem0
5. Try to hotunplug all memory again, shrinking to 0GB:
(qemu) qom-set vmem0 requested-size 0G
(qemu) info memory-devices
Memory device [virtio-mem]: "vmem0"
memaddr: 0x100000000
node: 0
requested-size: 0
size: 0
max-size: 17179869184
block-size: 1048576
memdev: /objects/mem0
6. If it worked, unplug the device
(qemu) device_del vmem0
(qemu) info memory-devices
(qemu) object_del mem0
7. Hotplug a new device with a smaller capacity and directly size it to 1GB
(qemu) object_add memory-backend-ram,id=mem0,size=8G,reserve=off
(qemu) device_add virtio-mem-ccw,id=vmem0,memdev=mem0,\
dynamic-memslots=on,requested-size=1G
(qemu) info memory-devices
Memory device [virtio-mem]: "vmem0"
memaddr: 0x100000000
node: 0
requested-size: 1073741824
size: 1073741824
max-size: 8589934592
block-size: 1048576
memdev: /objects/mem0
Trying to use a virtio-mem device backed by hugetlb into a !hugetlb VM
correctly results in the error:
... Memory device uses a bigger page size than initial memory
Note that the virtio-mem driver in Linux will supports 1 MiB (pageblock)
granularity.
Message-ID: <20241219144115.2820241-15-david@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
(cherry picked from commit aa910c20ec5f3b10551da19e441b3e2b54406e25)
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
MAINTAINERS | 2 +
hw/s390x/Kconfig | 1 +
hw/s390x/meson.build | 1 +
hw/s390x/virtio-ccw-mem.c | 226 ++++++++++++++++++++++++++++++++++++++
hw/s390x/virtio-ccw-mem.h | 34 ++++++
hw/virtio/virtio-mem.c | 4 +-
6 files changed, 267 insertions(+), 1 deletion(-)
create mode 100644 hw/s390x/virtio-ccw-mem.c
create mode 100644 hw/s390x/virtio-ccw-mem.h
diff --git a/MAINTAINERS b/MAINTAINERS
index f21dc3fa75..f7b7ceffc4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2401,6 +2401,8 @@ W: https://virtio-mem.gitlab.io/
F: hw/virtio/virtio-mem.c
F: hw/virtio/virtio-mem-pci.h
F: hw/virtio/virtio-mem-pci.c
+F: hw/s390x/virtio-ccw-mem.c
+F: hw/s390x/virtio-ccw-mem.h
F: include/hw/virtio/virtio-mem.h
virtio-snd
diff --git a/hw/s390x/Kconfig b/hw/s390x/Kconfig
index 3bbf4ae56e..5d57daff77 100644
--- a/hw/s390x/Kconfig
+++ b/hw/s390x/Kconfig
@@ -15,3 +15,4 @@ config S390_CCW_VIRTIO
select SCLPCONSOLE
select VIRTIO_CCW
select MSI_NONBROKEN
+ select VIRTIO_MEM_SUPPORTED
diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build
index 4431868408..3bbebfd817 100644
--- a/hw/s390x/meson.build
+++ b/hw/s390x/meson.build
@@ -51,6 +51,7 @@ virtio_ss.add(when: 'CONFIG_VHOST_SCSI', if_true: files('vhost-scsi-ccw.c'))
virtio_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: files('vhost-vsock-ccw.c'))
virtio_ss.add(when: 'CONFIG_VHOST_USER_FS', if_true: files('vhost-user-fs-ccw.c'))
virtio_ss.add(when: 'CONFIG_VIRTIO_MD', if_true: files('virtio-ccw-md.c'))
+virtio_ss.add(when: 'CONFIG_VIRTIO_MEM', if_true: files('virtio-ccw-mem.c'))
s390x_ss.add_all(when: 'CONFIG_VIRTIO_CCW', if_true: virtio_ss)
s390x_ss.add(when: 'CONFIG_VIRTIO_MD', if_false: files('virtio-ccw-md-stubs.c'))
diff --git a/hw/s390x/virtio-ccw-mem.c b/hw/s390x/virtio-ccw-mem.c
new file mode 100644
index 0000000000..bee0d560cb
--- /dev/null
+++ b/hw/s390x/virtio-ccw-mem.c
@@ -0,0 +1,226 @@
+/*
+ * virtio-mem CCW implementation
+ *
+ * Copyright (C) 2024 Red Hat, Inc.
+ *
+ * Authors:
+ * David Hildenbrand <david@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "qemu/module.h"
+#include "virtio-ccw-mem.h"
+#include "hw/mem/memory-device.h"
+#include "qapi/qapi-events-machine.h"
+#include "qapi/qapi-events-misc.h"
+
+static void virtio_ccw_mem_realize(VirtioCcwDevice *ccw_dev, Error **errp)
+{
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(ccw_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+
+ qdev_realize(vdev, BUS(&ccw_dev->bus), errp);
+}
+
+static void virtio_ccw_mem_set_addr(MemoryDeviceState *md, uint64_t addr,
+ Error **errp)
+{
+ object_property_set_uint(OBJECT(md), VIRTIO_MEM_ADDR_PROP, addr, errp);
+}
+
+static uint64_t virtio_ccw_mem_get_addr(const MemoryDeviceState *md)
+{
+ return object_property_get_uint(OBJECT(md), VIRTIO_MEM_ADDR_PROP,
+ &error_abort);
+}
+
+static MemoryRegion *virtio_ccw_mem_get_memory_region(MemoryDeviceState *md,
+ Error **errp)
+{
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(md);
+ VirtIOMEM *vmem = &dev->vdev;
+ VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem);
+
+ return vmc->get_memory_region(vmem, errp);
+}
+
+static void virtio_ccw_mem_decide_memslots(MemoryDeviceState *md,
+ unsigned int limit)
+{
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(md);
+ VirtIOMEM *vmem = VIRTIO_MEM(&dev->vdev);
+ VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem);
+
+ vmc->decide_memslots(vmem, limit);
+}
+
+static unsigned int virtio_ccw_mem_get_memslots(MemoryDeviceState *md)
+{
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(md);
+ VirtIOMEM *vmem = VIRTIO_MEM(&dev->vdev);
+ VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem);
+
+ return vmc->get_memslots(vmem);
+}
+
+static uint64_t virtio_ccw_mem_get_plugged_size(const MemoryDeviceState *md,
+ Error **errp)
+{
+ return object_property_get_uint(OBJECT(md), VIRTIO_MEM_SIZE_PROP,
+ errp);
+}
+
+static void virtio_ccw_mem_fill_device_info(const MemoryDeviceState *md,
+ MemoryDeviceInfo *info)
+{
+ VirtioMEMDeviceInfo *vi = g_new0(VirtioMEMDeviceInfo, 1);
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(md);
+ VirtIOMEM *vmem = &dev->vdev;
+ VirtIOMEMClass *vpc = VIRTIO_MEM_GET_CLASS(vmem);
+ DeviceState *vdev = DEVICE(md);
+
+ if (vdev->id) {
+ vi->id = g_strdup(vdev->id);
+ }
+
+ /* let the real device handle everything else */
+ vpc->fill_device_info(vmem, vi);
+
+ info->u.virtio_mem.data = vi;
+ info->type = MEMORY_DEVICE_INFO_KIND_VIRTIO_MEM;
+}
+
+static uint64_t virtio_ccw_mem_get_min_alignment(const MemoryDeviceState *md)
+{
+ return object_property_get_uint(OBJECT(md), VIRTIO_MEM_BLOCK_SIZE_PROP,
+ &error_abort);
+}
+
+static void virtio_ccw_mem_size_change_notify(Notifier *notifier, void *data)
+{
+ VirtIOMEMCcw *dev = container_of(notifier, VirtIOMEMCcw,
+ size_change_notifier);
+ DeviceState *vdev = DEVICE(dev);
+ char *qom_path = object_get_canonical_path(OBJECT(dev));
+ const uint64_t * const size_p = data;
+
+ qapi_event_send_memory_device_size_change(vdev->id, *size_p, qom_path);
+ g_free(qom_path);
+}
+
+static void virtio_ccw_mem_unplug_request_check(VirtIOMDCcw *vmd, Error **errp)
+{
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(vmd);
+ VirtIOMEM *vmem = &dev->vdev;
+ VirtIOMEMClass *vpc = VIRTIO_MEM_GET_CLASS(vmem);
+
+ vpc->unplug_request_check(vmem, errp);
+}
+
+static void virtio_ccw_mem_get_requested_size(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(obj);
+
+ object_property_get(OBJECT(&dev->vdev), name, v, errp);
+}
+
+static void virtio_ccw_mem_set_requested_size(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(obj);
+ DeviceState *vdev = DEVICE(obj);
+
+ /*
+ * If we passed virtio_ccw_mem_unplug_request_check(), making sure that
+ * the requested size is 0, don't allow modifying the requested size
+ * anymore, otherwise the VM might end up hotplugging memory before
+ * handling the unplug request.
+ */
+ if (vdev->pending_deleted_event) {
+ error_setg(errp, "'%s' cannot be changed if the device is in the"
+ " process of unplug", name);
+ return;
+ }
+
+ object_property_set(OBJECT(&dev->vdev), name, v, errp);
+}
+
+static Property virtio_ccw_mem_properties[] = {
+ DEFINE_PROP_BIT("ioeventfd", VirtioCcwDevice, flags,
+ VIRTIO_CCW_FLAG_USE_IOEVENTFD_BIT, true),
+ DEFINE_PROP_UINT32("max_revision", VirtioCcwDevice, max_rev,
+ VIRTIO_CCW_MAX_REV),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_ccw_mem_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtIOCCWDeviceClass *k = VIRTIO_CCW_DEVICE_CLASS(klass);
+ MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass);
+ VirtIOMDCcwClass *vmdc = VIRTIO_MD_CCW_CLASS(klass);
+
+ k->realize = virtio_ccw_mem_realize;
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ device_class_set_props(dc, virtio_ccw_mem_properties);
+
+ mdc->get_addr = virtio_ccw_mem_get_addr;
+ mdc->set_addr = virtio_ccw_mem_set_addr;
+ mdc->get_plugged_size = virtio_ccw_mem_get_plugged_size;
+ mdc->get_memory_region = virtio_ccw_mem_get_memory_region;
+ mdc->decide_memslots = virtio_ccw_mem_decide_memslots;
+ mdc->get_memslots = virtio_ccw_mem_get_memslots;
+ mdc->fill_device_info = virtio_ccw_mem_fill_device_info;
+ mdc->get_min_alignment = virtio_ccw_mem_get_min_alignment;
+
+ vmdc->unplug_request_check = virtio_ccw_mem_unplug_request_check;
+}
+
+static void virtio_ccw_mem_instance_init(Object *obj)
+{
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(obj);
+ VirtIOMEMClass *vmc;
+ VirtIOMEM *vmem;
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_MEM);
+
+ dev->size_change_notifier.notify = virtio_ccw_mem_size_change_notify;
+ vmem = &dev->vdev;
+ vmc = VIRTIO_MEM_GET_CLASS(vmem);
+ /*
+ * We never remove the notifier again, as we expect both devices to
+ * disappear at the same time.
+ */
+ vmc->add_size_change_notifier(vmem, &dev->size_change_notifier);
+
+ object_property_add_alias(obj, VIRTIO_MEM_BLOCK_SIZE_PROP,
+ OBJECT(&dev->vdev), VIRTIO_MEM_BLOCK_SIZE_PROP);
+ object_property_add_alias(obj, VIRTIO_MEM_SIZE_PROP, OBJECT(&dev->vdev),
+ VIRTIO_MEM_SIZE_PROP);
+ object_property_add(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP, "size",
+ virtio_ccw_mem_get_requested_size,
+ virtio_ccw_mem_set_requested_size, NULL, NULL);
+}
+
+static const TypeInfo virtio_ccw_mem = {
+ .name = TYPE_VIRTIO_MEM_CCW,
+ .parent = TYPE_VIRTIO_MD_CCW,
+ .instance_size = sizeof(VirtIOMEMCcw),
+ .instance_init = virtio_ccw_mem_instance_init,
+ .class_init = virtio_ccw_mem_class_init,
+};
+
+static void virtio_ccw_mem_register_types(void)
+{
+ type_register_static(&virtio_ccw_mem);
+}
+type_init(virtio_ccw_mem_register_types)
diff --git a/hw/s390x/virtio-ccw-mem.h b/hw/s390x/virtio-ccw-mem.h
new file mode 100644
index 0000000000..738ab2c744
--- /dev/null
+++ b/hw/s390x/virtio-ccw-mem.h
@@ -0,0 +1,34 @@
+/*
+ * Virtio MEM CCW device
+ *
+ * Copyright (C) 2024 Red Hat, Inc.
+ *
+ * Authors:
+ * David Hildenbrand <david@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef HW_S390X_VIRTIO_CCW_MEM_H
+#define HW_S390X_VIRTIO_CCW_MEM_H
+
+#include "virtio-ccw-md.h"
+#include "hw/virtio/virtio-mem.h"
+#include "qom/object.h"
+
+typedef struct VirtIOMEMCcw VirtIOMEMCcw;
+
+/*
+ * virtio-mem-ccw: This extends VirtIOMDCcw
+ */
+#define TYPE_VIRTIO_MEM_CCW "virtio-mem-ccw"
+DECLARE_INSTANCE_CHECKER(VirtIOMEMCcw, VIRTIO_MEM_CCW, TYPE_VIRTIO_MEM_CCW)
+
+struct VirtIOMEMCcw {
+ VirtIOMDCcw parent_obj;
+ VirtIOMEM vdev;
+ Notifier size_change_notifier;
+};
+
+#endif /* HW_S390X_VIRTIO_CCW_MEM_H */
diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index 00da98b6e1..c9f8a23bbc 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -61,6 +61,8 @@ static uint32_t virtio_mem_default_thp_size(void)
} else if (qemu_real_host_page_size() == 64 * KiB) {
default_thp_size = 512 * MiB;
}
+#elif defined(__s390x__)
+ default_thp_size = 1 * MiB;
#endif
return default_thp_size;
@@ -161,7 +163,7 @@ static bool virtio_mem_has_shared_zeropage(RAMBlock *rb)
* necessary (as the section size can change). But it's more likely that the
* section size will rather get smaller and not bigger over time.
*/
-#if defined(TARGET_X86_64) || defined(TARGET_I386)
+#if defined(TARGET_X86_64) || defined(TARGET_I386) || defined(TARGET_S390X)
#define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB))
#elif defined(TARGET_ARM)
#define VIRTIO_MEM_USABLE_EXTENT (2 * (512 * MiB))
--
2.48.1

Some files were not shown because too many files have changed in this diff Show More