import CS qemu-kvm-9.1.0-26.el9
This commit is contained in:
parent
8a741c4808
commit
ab9af75c93
38
SOURCES/kvm-Enable-amd-iommu-device.patch
Normal file
38
SOURCES/kvm-Enable-amd-iommu-device.patch
Normal file
@ -0,0 +1,38 @@
|
||||
From 0608561efc441f234d9aaf45f1867ffb5c43cffe Mon Sep 17 00:00:00 2001
|
||||
From: John Allen <john.allen@amd.com>
|
||||
Date: Wed, 11 Jun 2025 15:41:14 -0500
|
||||
Subject: [PATCH 26/57] Enable amd-iommu device
|
||||
|
||||
RH-Author: John Allen <None>
|
||||
RH-MergeRequest: 380: Add ability to manually specify the AMDVI-PCI device
|
||||
RH-Jira: RHEL-70925
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [3/3] 852500a18275e14bcd94d598ccd0ee33b76578dc (johnalle/qemu-kvm-fork)
|
||||
|
||||
Now that the amdvi-pci device that amd-iommu creates can be specified
|
||||
manually, the amd-iommu device can be enabled.
|
||||
|
||||
JIRA: https://issues.redhat.com/browse/RHEL-70925
|
||||
|
||||
Upstream: RHEL ONLY
|
||||
|
||||
Signed-off-by: John Allen <johnalle@redhat.com>
|
||||
---
|
||||
configs/devices/x86_64-softmmu/x86_64-rh-devices.mak | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak b/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak
|
||||
index 3e5f693b62..2b15fdc2db 100644
|
||||
--- a/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak
|
||||
+++ b/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak
|
||||
@@ -97,6 +97,7 @@ CONFIG_VIRTIO_MEM=y
|
||||
CONFIG_VIRTIO_PCI=y
|
||||
CONFIG_VIRTIO_VGA=y
|
||||
CONFIG_VIRTIO_IOMMU=y
|
||||
+CONFIG_AMD_IOMMU=y
|
||||
CONFIG_VMMOUSE=y
|
||||
CONFIG_VMPORT=y
|
||||
CONFIG_VTD=y
|
||||
--
|
||||
2.39.3
|
||||
|
||||
141
SOURCES/kvm-amd_iommu-Add-support-for-pass-though-mode.patch
Normal file
141
SOURCES/kvm-amd_iommu-Add-support-for-pass-though-mode.patch
Normal file
@ -0,0 +1,141 @@
|
||||
From 4114553452f7187283aefa001bc8342fc65b6b72 Mon Sep 17 00:00:00 2001
|
||||
From: John Allen <john.allen@amd.com>
|
||||
Date: Wed, 11 Dec 2024 15:06:48 -0600
|
||||
Subject: [PATCH 04/57] amd_iommu: Add support for pass though mode
|
||||
|
||||
RH-Author: John Allen <None>
|
||||
RH-MergeRequest: 303: Interrupt Remap support for emulated amd viommu
|
||||
RH-Jira: RHEL-66202
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [2/5] 0434fefd554baf27fb9d93026af513c621f8cdb0 (johnalle/qemu-kvm-fork)
|
||||
|
||||
JIRA: https://issues.redhat.com/browse/RHEL-66202
|
||||
|
||||
commit c1f46999ef506d9854534560a94d02cf3cf9edd1
|
||||
Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
|
||||
Date: Fri Sep 27 12:29:10 2024 -0500
|
||||
|
||||
amd_iommu: Add support for pass though mode
|
||||
|
||||
Introduce 'nodma' shared memory region to support PT mode
|
||||
so that for each device, we only create an alias to shared memory
|
||||
region when DMA-remapping is disabled.
|
||||
|
||||
Reviewed-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
|
||||
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
|
||||
Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
|
||||
Message-Id: <20240927172913.121477-3-santosh.shukla@amd.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
|
||||
Signed-off-by: John Allen <john.allen@amd.com>
|
||||
---
|
||||
hw/i386/amd_iommu.c | 49 ++++++++++++++++++++++++++++++++++++---------
|
||||
hw/i386/amd_iommu.h | 2 ++
|
||||
2 files changed, 42 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
|
||||
index 148b5ee51d..567cb8adc9 100644
|
||||
--- a/hw/i386/amd_iommu.c
|
||||
+++ b/hw/i386/amd_iommu.c
|
||||
@@ -60,8 +60,9 @@ struct AMDVIAddressSpace {
|
||||
uint8_t bus_num; /* bus number */
|
||||
uint8_t devfn; /* device function */
|
||||
AMDVIState *iommu_state; /* AMDVI - one per machine */
|
||||
- MemoryRegion root; /* AMDVI Root memory map region */
|
||||
+ MemoryRegion root; /* AMDVI Root memory map region */
|
||||
IOMMUMemoryRegion iommu; /* Device's address translation region */
|
||||
+ MemoryRegion iommu_nodma; /* Alias of shared nodma memory region */
|
||||
MemoryRegion iommu_ir; /* Device's interrupt remapping region */
|
||||
AddressSpace as; /* device's corresponding address space */
|
||||
};
|
||||
@@ -1412,6 +1413,7 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
|
||||
AMDVIState *s = opaque;
|
||||
AMDVIAddressSpace **iommu_as, *amdvi_dev_as;
|
||||
int bus_num = pci_bus_num(bus);
|
||||
+ X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
|
||||
|
||||
iommu_as = s->address_spaces[bus_num];
|
||||
|
||||
@@ -1436,13 +1438,13 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
|
||||
* Memory region relationships looks like (Address range shows
|
||||
* only lower 32 bits to make it short in length...):
|
||||
*
|
||||
- * |-----------------+-------------------+----------|
|
||||
- * | Name | Address range | Priority |
|
||||
- * |-----------------+-------------------+----------+
|
||||
- * | amdvi_root | 00000000-ffffffff | 0 |
|
||||
- * | amdvi_iommu | 00000000-ffffffff | 1 |
|
||||
- * | amdvi_iommu_ir | fee00000-feefffff | 64 |
|
||||
- * |-----------------+-------------------+----------|
|
||||
+ * |--------------------+-------------------+----------|
|
||||
+ * | Name | Address range | Priority |
|
||||
+ * |--------------------+-------------------+----------+
|
||||
+ * | amdvi-root | 00000000-ffffffff | 0 |
|
||||
+ * | amdvi-iommu_nodma | 00000000-ffffffff | 0 |
|
||||
+ * | amdvi-iommu_ir | fee00000-feefffff | 64 |
|
||||
+ * |--------------------+-------------------+----------|
|
||||
*/
|
||||
memory_region_init_iommu(&amdvi_dev_as->iommu,
|
||||
sizeof(amdvi_dev_as->iommu),
|
||||
@@ -1461,7 +1463,25 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
|
||||
64);
|
||||
memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0,
|
||||
MEMORY_REGION(&amdvi_dev_as->iommu),
|
||||
- 1);
|
||||
+ 0);
|
||||
+
|
||||
+ /* Build the DMA Disabled alias to shared memory */
|
||||
+ memory_region_init_alias(&amdvi_dev_as->iommu_nodma, OBJECT(s),
|
||||
+ "amdvi-sys", &s->mr_sys, 0,
|
||||
+ memory_region_size(&s->mr_sys));
|
||||
+ memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0,
|
||||
+ &amdvi_dev_as->iommu_nodma,
|
||||
+ 0);
|
||||
+
|
||||
+ if (!x86_iommu->pt_supported) {
|
||||
+ memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, false);
|
||||
+ memory_region_set_enabled(MEMORY_REGION(&amdvi_dev_as->iommu),
|
||||
+ true);
|
||||
+ } else {
|
||||
+ memory_region_set_enabled(MEMORY_REGION(&amdvi_dev_as->iommu),
|
||||
+ false);
|
||||
+ memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, true);
|
||||
+ }
|
||||
}
|
||||
return &iommu_as[devfn]->as;
|
||||
}
|
||||
@@ -1602,6 +1622,17 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
|
||||
"amdvi-mmio", AMDVI_MMIO_SIZE);
|
||||
memory_region_add_subregion(get_system_memory(), AMDVI_BASE_ADDR,
|
||||
&s->mr_mmio);
|
||||
+
|
||||
+ /* Create the share memory regions by all devices */
|
||||
+ memory_region_init(&s->mr_sys, OBJECT(s), "amdvi-sys", UINT64_MAX);
|
||||
+
|
||||
+ /* set up the DMA disabled memory region */
|
||||
+ memory_region_init_alias(&s->mr_nodma, OBJECT(s),
|
||||
+ "amdvi-nodma", get_system_memory(), 0,
|
||||
+ memory_region_size(get_system_memory()));
|
||||
+ memory_region_add_subregion_overlap(&s->mr_sys, 0,
|
||||
+ &s->mr_nodma, 0);
|
||||
+
|
||||
pci_setup_iommu(bus, &amdvi_iommu_ops, s);
|
||||
amdvi_init(s);
|
||||
}
|
||||
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
|
||||
index e5c2ae94f2..be417e51c4 100644
|
||||
--- a/hw/i386/amd_iommu.h
|
||||
+++ b/hw/i386/amd_iommu.h
|
||||
@@ -354,6 +354,8 @@ struct AMDVIState {
|
||||
uint32_t pprlog_tail; /* ppr log tail */
|
||||
|
||||
MemoryRegion mr_mmio; /* MMIO region */
|
||||
+ MemoryRegion mr_sys;
|
||||
+ MemoryRegion mr_nodma;
|
||||
uint8_t mmior[AMDVI_MMIO_SIZE]; /* read/write MMIO */
|
||||
uint8_t w1cmask[AMDVI_MMIO_SIZE]; /* read/write 1 clear mask */
|
||||
uint8_t romask[AMDVI_MMIO_SIZE]; /* MMIO read/only mask */
|
||||
--
|
||||
2.39.3
|
||||
|
||||
66
SOURCES/kvm-amd_iommu-Check-APIC-ID-255-for-XTSup.patch
Normal file
66
SOURCES/kvm-amd_iommu-Check-APIC-ID-255-for-XTSup.patch
Normal file
@ -0,0 +1,66 @@
|
||||
From 0397ebacdba6539147d9986255c3f81cbfdabf1e Mon Sep 17 00:00:00 2001
|
||||
From: John Allen <john.allen@amd.com>
|
||||
Date: Wed, 11 Dec 2024 15:07:03 -0600
|
||||
Subject: [PATCH 07/57] amd_iommu: Check APIC ID > 255 for XTSup
|
||||
|
||||
RH-Author: John Allen <None>
|
||||
RH-MergeRequest: 303: Interrupt Remap support for emulated amd viommu
|
||||
RH-Jira: RHEL-66202
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [5/5] f39b3e3cdefc2b562f1ad2ef939a37bf404f355a (johnalle/qemu-kvm-fork)
|
||||
|
||||
JIRA: https://issues.redhat.com/browse/RHEL-66202
|
||||
|
||||
commit b12cb3819baf6d9ee8140d4dd6d36fa829e2c6d9
|
||||
Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
|
||||
Date: Fri Sep 27 12:29:13 2024 -0500
|
||||
|
||||
amd_iommu: Check APIC ID > 255 for XTSup
|
||||
|
||||
The XTSup mode enables x2APIC support for AMD IOMMU, which is needed
|
||||
to support vcpu w/ APIC ID > 255.
|
||||
|
||||
Reviewed-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
|
||||
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
|
||||
Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
|
||||
Message-Id: <20240927172913.121477-6-santosh.shukla@amd.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
|
||||
Signed-off-by: John Allen <john.allen@amd.com>
|
||||
---
|
||||
hw/i386/amd_iommu.c | 11 +++++++++++
|
||||
1 file changed, 11 insertions(+)
|
||||
|
||||
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
|
||||
index 82d76dfca9..d804656ea8 100644
|
||||
--- a/hw/i386/amd_iommu.c
|
||||
+++ b/hw/i386/amd_iommu.c
|
||||
@@ -32,6 +32,7 @@
|
||||
#include "trace.h"
|
||||
#include "hw/i386/apic-msidef.h"
|
||||
#include "hw/qdev-properties.h"
|
||||
+#include "kvm/kvm_i386.h"
|
||||
|
||||
/* used AMD-Vi MMIO registers */
|
||||
const char *amdvi_mmio_low[] = {
|
||||
@@ -1651,6 +1652,16 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
|
||||
memory_region_add_subregion_overlap(&s->mr_sys, AMDVI_INT_ADDR_FIRST,
|
||||
&s->mr_ir, 1);
|
||||
|
||||
+ /* AMD IOMMU with x2APIC mode requires xtsup=on */
|
||||
+ if (x86ms->apic_id_limit > 255 && !s->xtsup) {
|
||||
+ error_report("AMD IOMMU with x2APIC confguration requires xtsup=on");
|
||||
+ exit(EXIT_FAILURE);
|
||||
+ }
|
||||
+ if (s->xtsup && kvm_irqchip_is_split() && !kvm_enable_x2apic()) {
|
||||
+ error_report("AMD IOMMU xtsup=on requires support on the KVM side");
|
||||
+ exit(EXIT_FAILURE);
|
||||
+ }
|
||||
+
|
||||
pci_setup_iommu(bus, &amdvi_iommu_ops, s);
|
||||
amdvi_init(s);
|
||||
}
|
||||
--
|
||||
2.39.3
|
||||
|
||||
94
SOURCES/kvm-amd_iommu-Rename-variable-mmio-to-mr_mmio.patch
Normal file
94
SOURCES/kvm-amd_iommu-Rename-variable-mmio-to-mr_mmio.patch
Normal file
@ -0,0 +1,94 @@
|
||||
From f733325d3d91576ae9f6e341faabc301542fc6c8 Mon Sep 17 00:00:00 2001
|
||||
From: John Allen <john.allen@amd.com>
|
||||
Date: Wed, 11 Dec 2024 15:06:44 -0600
|
||||
Subject: [PATCH 03/57] amd_iommu: Rename variable mmio to mr_mmio
|
||||
|
||||
RH-Author: John Allen <None>
|
||||
RH-MergeRequest: 303: Interrupt Remap support for emulated amd viommu
|
||||
RH-Jira: RHEL-66202
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [1/5] 1996a48efb7210d4d1e0b929be2d115d672e1a02 (johnalle/qemu-kvm-fork)
|
||||
|
||||
JIRA: https://issues.redhat.com/browse/RHEL-66202
|
||||
|
||||
commit 2e6f051cfc58e69dcb392cd245d8f01b0c2e963f
|
||||
Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
|
||||
Date: Fri Sep 27 12:29:09 2024 -0500
|
||||
|
||||
amd_iommu: Rename variable mmio to mr_mmio
|
||||
|
||||
Rename the MMIO memory region variable 'mmio' to 'mr_mmio'
|
||||
so that its name correctly aligns with the struct AMDVIState::variable type.
|
||||
|
||||
No functional change intended.
|
||||
|
||||
Reviewed-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
|
||||
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
|
||||
Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
|
||||
Message-Id: <20240927172913.121477-2-santosh.shukla@amd.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
|
||||
Signed-off-by: John Allen <john.allen@amd.com>
|
||||
---
|
||||
hw/i386/acpi-build.c | 4 ++--
|
||||
hw/i386/amd_iommu.c | 6 +++---
|
||||
hw/i386/amd_iommu.h | 2 +-
|
||||
3 files changed, 6 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
|
||||
index 5d4bd2b710..032fb1f904 100644
|
||||
--- a/hw/i386/acpi-build.c
|
||||
+++ b/hw/i386/acpi-build.c
|
||||
@@ -2397,7 +2397,7 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id,
|
||||
/* Capability offset */
|
||||
build_append_int_noprefix(table_data, s->pci.capab_offset, 2);
|
||||
/* IOMMU base address */
|
||||
- build_append_int_noprefix(table_data, s->mmio.addr, 8);
|
||||
+ build_append_int_noprefix(table_data, s->mr_mmio.addr, 8);
|
||||
/* PCI Segment Group */
|
||||
build_append_int_noprefix(table_data, 0, 2);
|
||||
/* IOMMU info */
|
||||
@@ -2432,7 +2432,7 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id,
|
||||
/* Capability offset */
|
||||
build_append_int_noprefix(table_data, s->pci.capab_offset, 2);
|
||||
/* IOMMU base address */
|
||||
- build_append_int_noprefix(table_data, s->mmio.addr, 8);
|
||||
+ build_append_int_noprefix(table_data, s->mr_mmio.addr, 8);
|
||||
/* PCI Segment Group */
|
||||
build_append_int_noprefix(table_data, 0, 2);
|
||||
/* IOMMU info */
|
||||
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
|
||||
index 87643d2891..148b5ee51d 100644
|
||||
--- a/hw/i386/amd_iommu.c
|
||||
+++ b/hw/i386/amd_iommu.c
|
||||
@@ -1598,10 +1598,10 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
|
||||
x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID);
|
||||
|
||||
/* set up MMIO */
|
||||
- memory_region_init_io(&s->mmio, OBJECT(s), &mmio_mem_ops, s, "amdvi-mmio",
|
||||
- AMDVI_MMIO_SIZE);
|
||||
+ memory_region_init_io(&s->mr_mmio, OBJECT(s), &mmio_mem_ops, s,
|
||||
+ "amdvi-mmio", AMDVI_MMIO_SIZE);
|
||||
memory_region_add_subregion(get_system_memory(), AMDVI_BASE_ADDR,
|
||||
- &s->mmio);
|
||||
+ &s->mr_mmio);
|
||||
pci_setup_iommu(bus, &amdvi_iommu_ops, s);
|
||||
amdvi_init(s);
|
||||
}
|
||||
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
|
||||
index 73619fe9ea..e5c2ae94f2 100644
|
||||
--- a/hw/i386/amd_iommu.h
|
||||
+++ b/hw/i386/amd_iommu.h
|
||||
@@ -353,7 +353,7 @@ struct AMDVIState {
|
||||
uint32_t pprlog_head; /* ppr log head */
|
||||
uint32_t pprlog_tail; /* ppr log tail */
|
||||
|
||||
- MemoryRegion mmio; /* MMIO region */
|
||||
+ MemoryRegion mr_mmio; /* MMIO region */
|
||||
uint8_t mmior[AMDVI_MMIO_SIZE]; /* read/write MMIO */
|
||||
uint8_t w1cmask[AMDVI_MMIO_SIZE]; /* read/write 1 clear mask */
|
||||
uint8_t romask[AMDVI_MMIO_SIZE]; /* MMIO read/only mask */
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,81 @@
|
||||
From 17ce6ac0d8edb04ba79bb39d3f695cd0506a9dc2 Mon Sep 17 00:00:00 2001
|
||||
From: John Allen <john.allen@amd.com>
|
||||
Date: Wed, 11 Dec 2024 15:06:59 -0600
|
||||
Subject: [PATCH 06/57] amd_iommu: Send notification when invalidate interrupt
|
||||
entry cache
|
||||
|
||||
RH-Author: John Allen <None>
|
||||
RH-MergeRequest: 303: Interrupt Remap support for emulated amd viommu
|
||||
RH-Jira: RHEL-66202
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [4/5] d57e8fb4e69f3c01d32673bf658aae5067d6b969 (johnalle/qemu-kvm-fork)
|
||||
|
||||
JIRA: https://issues.redhat.com/browse/RHEL-66202
|
||||
|
||||
commit f84aad4d718b83d2a4d90485992e5421430032e1
|
||||
Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
|
||||
Date: Fri Sep 27 12:29:12 2024 -0500
|
||||
|
||||
amd_iommu: Send notification when invalidate interrupt entry cache
|
||||
|
||||
In order to support AMD IOMMU interrupt remapping emulation with PCI
|
||||
pass-through devices, QEMU needs to notify VFIO when guest IOMMU driver
|
||||
updates and invalidates the guest interrupt remapping table (IRT), and
|
||||
communicate information so that the host IOMMU driver can update
|
||||
the shadowed interrupt remapping table in the host IOMMU.
|
||||
|
||||
Therefore, send notification when guest IOMMU emulates the IRT
|
||||
invalidation commands.
|
||||
|
||||
Reviewed-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
|
||||
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
|
||||
Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
|
||||
Message-Id: <20240927172913.121477-5-santosh.shukla@amd.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
|
||||
Signed-off-by: John Allen <john.allen@amd.com>
|
||||
---
|
||||
hw/i386/amd_iommu.c | 12 ++++++++++++
|
||||
1 file changed, 12 insertions(+)
|
||||
|
||||
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
|
||||
index 8fcf5eacb4..82d76dfca9 100644
|
||||
--- a/hw/i386/amd_iommu.c
|
||||
+++ b/hw/i386/amd_iommu.c
|
||||
@@ -431,6 +431,12 @@ static void amdvi_complete_ppr(AMDVIState *s, uint64_t *cmd)
|
||||
trace_amdvi_ppr_exec();
|
||||
}
|
||||
|
||||
+static void amdvi_intremap_inval_notify_all(AMDVIState *s, bool global,
|
||||
+ uint32_t index, uint32_t mask)
|
||||
+{
|
||||
+ x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask);
|
||||
+}
|
||||
+
|
||||
static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd)
|
||||
{
|
||||
if (extract64(cmd[0], 0, 60) || cmd[1]) {
|
||||
@@ -438,6 +444,9 @@ static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd)
|
||||
s->cmdbuf + s->cmdbuf_head);
|
||||
}
|
||||
|
||||
+ /* Notify global invalidation */
|
||||
+ amdvi_intremap_inval_notify_all(s, true, 0, 0);
|
||||
+
|
||||
amdvi_iotlb_reset(s);
|
||||
trace_amdvi_all_inval();
|
||||
}
|
||||
@@ -486,6 +495,9 @@ static void amdvi_inval_inttable(AMDVIState *s, uint64_t *cmd)
|
||||
return;
|
||||
}
|
||||
|
||||
+ /* Notify global invalidation */
|
||||
+ amdvi_intremap_inval_notify_all(s, true, 0, 0);
|
||||
+
|
||||
trace_amdvi_intr_inval();
|
||||
}
|
||||
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,105 @@
|
||||
From 4859d41adfaae8933e074dcefdc81edd3832c914 Mon Sep 17 00:00:00 2001
|
||||
From: John Allen <john.allen@amd.com>
|
||||
Date: Wed, 11 Dec 2024 15:06:55 -0600
|
||||
Subject: [PATCH 05/57] amd_iommu: Use shared memory region for Interrupt
|
||||
Remapping
|
||||
|
||||
RH-Author: John Allen <None>
|
||||
RH-MergeRequest: 303: Interrupt Remap support for emulated amd viommu
|
||||
RH-Jira: RHEL-66202
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [3/5] 48c0513c80257bfbd12c2cf3bab2503bd95d0b1c (johnalle/qemu-kvm-fork)
|
||||
|
||||
JIRA: https://issues.redhat.com/browse/RHEL-66202
|
||||
|
||||
commit 9fc9dbac61ddde7d8df37e84c8e02cec249d3222
|
||||
Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
|
||||
Date: Fri Sep 27 12:29:11 2024 -0500
|
||||
|
||||
amd_iommu: Use shared memory region for Interrupt Remapping
|
||||
|
||||
Use shared memory region for interrupt remapping which can be
|
||||
aliased by all devices.
|
||||
|
||||
Reviewed-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
|
||||
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
|
||||
Signed-off-by: Santosh Shukla <santosh.shukla@amd.com>
|
||||
Message-Id: <20240927172913.121477-4-santosh.shukla@amd.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
|
||||
Signed-off-by: John Allen <john.allen@amd.com>
|
||||
---
|
||||
hw/i386/amd_iommu.c | 22 ++++++++++++++--------
|
||||
hw/i386/amd_iommu.h | 1 +
|
||||
2 files changed, 15 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
|
||||
index 567cb8adc9..8fcf5eacb4 100644
|
||||
--- a/hw/i386/amd_iommu.c
|
||||
+++ b/hw/i386/amd_iommu.c
|
||||
@@ -1443,7 +1443,7 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
|
||||
* |--------------------+-------------------+----------+
|
||||
* | amdvi-root | 00000000-ffffffff | 0 |
|
||||
* | amdvi-iommu_nodma | 00000000-ffffffff | 0 |
|
||||
- * | amdvi-iommu_ir | fee00000-feefffff | 64 |
|
||||
+ * | amdvi-iommu_ir | fee00000-feefffff | 1 |
|
||||
* |--------------------+-------------------+----------|
|
||||
*/
|
||||
memory_region_init_iommu(&amdvi_dev_as->iommu,
|
||||
@@ -1454,13 +1454,6 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
|
||||
memory_region_init(&amdvi_dev_as->root, OBJECT(s),
|
||||
"amdvi_root", UINT64_MAX);
|
||||
address_space_init(&amdvi_dev_as->as, &amdvi_dev_as->root, name);
|
||||
- memory_region_init_io(&amdvi_dev_as->iommu_ir, OBJECT(s),
|
||||
- &amdvi_ir_ops, s, "amd_iommu_ir",
|
||||
- AMDVI_INT_ADDR_SIZE);
|
||||
- memory_region_add_subregion_overlap(&amdvi_dev_as->root,
|
||||
- AMDVI_INT_ADDR_FIRST,
|
||||
- &amdvi_dev_as->iommu_ir,
|
||||
- 64);
|
||||
memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0,
|
||||
MEMORY_REGION(&amdvi_dev_as->iommu),
|
||||
0);
|
||||
@@ -1472,6 +1465,13 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
|
||||
memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0,
|
||||
&amdvi_dev_as->iommu_nodma,
|
||||
0);
|
||||
+ /* Build the Interrupt Remapping alias to shared memory */
|
||||
+ memory_region_init_alias(&amdvi_dev_as->iommu_ir, OBJECT(s),
|
||||
+ "amdvi-ir", &s->mr_ir, 0,
|
||||
+ memory_region_size(&s->mr_ir));
|
||||
+ memory_region_add_subregion_overlap(MEMORY_REGION(&amdvi_dev_as->iommu),
|
||||
+ AMDVI_INT_ADDR_FIRST,
|
||||
+ &amdvi_dev_as->iommu_ir, 1);
|
||||
|
||||
if (!x86_iommu->pt_supported) {
|
||||
memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, false);
|
||||
@@ -1633,6 +1633,12 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
|
||||
memory_region_add_subregion_overlap(&s->mr_sys, 0,
|
||||
&s->mr_nodma, 0);
|
||||
|
||||
+ /* set up the Interrupt Remapping memory region */
|
||||
+ memory_region_init_io(&s->mr_ir, OBJECT(s), &amdvi_ir_ops,
|
||||
+ s, "amdvi-ir", AMDVI_INT_ADDR_SIZE);
|
||||
+ memory_region_add_subregion_overlap(&s->mr_sys, AMDVI_INT_ADDR_FIRST,
|
||||
+ &s->mr_ir, 1);
|
||||
+
|
||||
pci_setup_iommu(bus, &amdvi_iommu_ops, s);
|
||||
amdvi_init(s);
|
||||
}
|
||||
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
|
||||
index be417e51c4..e0dac4d9a9 100644
|
||||
--- a/hw/i386/amd_iommu.h
|
||||
+++ b/hw/i386/amd_iommu.h
|
||||
@@ -356,6 +356,7 @@ struct AMDVIState {
|
||||
MemoryRegion mr_mmio; /* MMIO region */
|
||||
MemoryRegion mr_sys;
|
||||
MemoryRegion mr_nodma;
|
||||
+ MemoryRegion mr_ir;
|
||||
uint8_t mmior[AMDVI_MMIO_SIZE]; /* read/write MMIO */
|
||||
uint8_t w1cmask[AMDVI_MMIO_SIZE]; /* read/write 1 clear mask */
|
||||
uint8_t romask[AMDVI_MMIO_SIZE]; /* MMIO read/only mask */
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,53 @@
|
||||
From 173beb6698538dcffefab36772e107ffb0b4fbbd Mon Sep 17 00:00:00 2001
|
||||
From: Shaoqin Huang <shahuang@redhat.com>
|
||||
Date: Mon, 28 Apr 2025 04:34:27 -0400
|
||||
Subject: [PATCH 2/5] arm: Use arm_virt_compat_set() to apply the compat
|
||||
|
||||
RH-Author: Shaoqin Huang <shahuang@redhat.com>
|
||||
RH-MergeRequest: 353: virtio-net: disable USO for virt-rhel9.6
|
||||
RH-Jira: RHEL-80313
|
||||
RH-Acked-by: Thomas Huth <thuth@redhat.com>
|
||||
RH-Acked-by: Eric Auger <eric.auger@redhat.com>
|
||||
RH-Commit: [2/2] 6e7a158e65296928040e70622b3cee59e45c1c36 (shahuang/qemu-kvm)
|
||||
|
||||
JIRA: https://issues.redhat.com/browse/RHEL-80313
|
||||
Upstream Status: RHEL only
|
||||
|
||||
Since the pauth and uso both should apply for the latest machine type,
|
||||
move them to the arm_virt_compat_set() which applies the compat to all
|
||||
machine types automatically.
|
||||
|
||||
Signed-off-by: Shaoqin Huang <shahuang@redhat.com>
|
||||
---
|
||||
hw/arm/virt.c | 8 ++++----
|
||||
1 file changed, 4 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
|
||||
index 896deaa025..2aef94e776 100644
|
||||
--- a/hw/arm/virt.c
|
||||
+++ b/hw/arm/virt.c
|
||||
@@ -127,6 +127,10 @@ static void arm_virt_compat_set(MachineClass *mc)
|
||||
arm_virt_compat_len);
|
||||
compat_props_add(mc->compat_props, arm_rhel_compat,
|
||||
arm_rhel_compat_len);
|
||||
+ compat_props_add(mc->compat_props, arm_rhel9_compat,
|
||||
+ arm_rhel9_compat_len);
|
||||
+ compat_props_add(mc->compat_props, hw_compat_rhel_9,
|
||||
+ hw_compat_rhel_9_len);
|
||||
}
|
||||
|
||||
#define DEFINE_VIRT_MACHINE_IMPL(latest, ...) \
|
||||
@@ -3599,10 +3603,6 @@ DEFINE_VIRT_MACHINE(2, 6)
|
||||
|
||||
static void virt_rhel_machine_9_6_0_options(MachineClass *mc)
|
||||
{
|
||||
- compat_props_add(mc->compat_props, arm_rhel9_compat, arm_rhel9_compat_len);
|
||||
-
|
||||
- /* NB: remember to move this line to the *latest* RHEL 9 machine */
|
||||
- compat_props_add(mc->compat_props, hw_compat_rhel_9, hw_compat_rhel_9_len);
|
||||
}
|
||||
DEFINE_VIRT_MACHINE_AS_LATEST(9, 6, 0)
|
||||
|
||||
--
|
||||
2.48.1
|
||||
|
||||
145
SOURCES/kvm-block-Add-new-bdrv_co_is_all_zeroes-function.patch
Normal file
145
SOURCES/kvm-block-Add-new-bdrv_co_is_all_zeroes-function.patch
Normal file
@ -0,0 +1,145 @@
|
||||
From f2cd96a040dd7863484d22a3995a2904605dadde Mon Sep 17 00:00:00 2001
|
||||
From: Eric Blake <eblake@redhat.com>
|
||||
Date: Fri, 9 May 2025 15:40:21 -0500
|
||||
Subject: [PATCH 06/16] block: Add new bdrv_co_is_all_zeroes() function
|
||||
|
||||
RH-Author: Eric Blake <eblake@redhat.com>
|
||||
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
|
||||
RH-Jira: RHEL-82906 RHEL-83015
|
||||
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [4/14] aabcba8323df698a72842f299e9242a5eee3aea6 (ebblake/centos-qemu-kvm)
|
||||
|
||||
There are some optimizations that require knowing if an image starts
|
||||
out as reading all zeroes, such as making blockdev-mirror faster by
|
||||
skipping the copying of source zeroes to the destination. The
|
||||
existing bdrv_co_is_zero_fast() is a good building block for answering
|
||||
this question, but it tends to give an answer of 0 for a file we just
|
||||
created via QMP 'blockdev-create' or similar (such as 'qemu-img create
|
||||
-f raw'). Why? Because file-posix.c insists on allocating a tiny
|
||||
header to any file rather than leaving it 100% sparse, due to some
|
||||
filesystems that are unable to answer alignment probes on a hole. But
|
||||
teaching file-posix.c to read the tiny header doesn't scale - the
|
||||
problem of a small header is also visible when libvirt sets up an NBD
|
||||
client to a just-created file on a migration destination host.
|
||||
|
||||
So, we need a wrapper function that handles a bit more complexity in a
|
||||
common manner for all block devices - when the BDS is mostly a hole,
|
||||
but has a small non-hole header, it is still worth the time to read
|
||||
that header and check if it reads as all zeroes before giving up and
|
||||
returning a pessimistic answer.
|
||||
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-ID: <20250509204341.3553601-19-eblake@redhat.com>
|
||||
(cherry picked from commit 52726096707c5c8b90597c445de897fa64d56e73)
|
||||
Conflicts:
|
||||
block/io.c - context with header names
|
||||
Jira: https://issues.redhat.com/browse/RHEL-82906
|
||||
Jira: https://issues.redhat.com/browse/RHEL-83015
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
---
|
||||
block/io.c | 62 ++++++++++++++++++++++++++++++++++++++++
|
||||
include/block/block-io.h | 2 ++
|
||||
2 files changed, 64 insertions(+)
|
||||
|
||||
diff --git a/block/io.c b/block/io.c
|
||||
index 293c5dd393..1f01337599 100644
|
||||
--- a/block/io.c
|
||||
+++ b/block/io.c
|
||||
@@ -38,10 +38,14 @@
|
||||
#include "qemu/error-report.h"
|
||||
#include "qemu/main-loop.h"
|
||||
#include "sysemu/replay.h"
|
||||
+#include "qemu/units.h"
|
||||
|
||||
/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
|
||||
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
|
||||
|
||||
+/* Maximum read size for checking if data reads as zero, in bytes */
|
||||
+#define MAX_ZERO_CHECK_BUFFER (128 * KiB)
|
||||
+
|
||||
static void coroutine_fn GRAPH_RDLOCK
|
||||
bdrv_parent_cb_resize(BlockDriverState *bs);
|
||||
|
||||
@@ -2774,6 +2778,64 @@ int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
|
||||
return 1;
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * Check @bs (and its backing chain) to see if the entire image is known
|
||||
+ * to read as zeroes.
|
||||
+ * Return 1 if that is the case, 0 otherwise and -errno on error.
|
||||
+ * This test is meant to be fast rather than accurate so returning 0
|
||||
+ * does not guarantee non-zero data; however, a return of 1 is reliable,
|
||||
+ * and this function can report 1 in more cases than bdrv_co_is_zero_fast.
|
||||
+ */
|
||||
+int coroutine_fn bdrv_co_is_all_zeroes(BlockDriverState *bs)
|
||||
+{
|
||||
+ int ret;
|
||||
+ int64_t pnum, bytes;
|
||||
+ char *buf;
|
||||
+ QEMUIOVector local_qiov;
|
||||
+ IO_CODE();
|
||||
+
|
||||
+ bytes = bdrv_co_getlength(bs);
|
||||
+ if (bytes < 0) {
|
||||
+ return bytes;
|
||||
+ }
|
||||
+
|
||||
+ /* First probe - see if the entire image reads as zero */
|
||||
+ ret = bdrv_co_common_block_status_above(bs, NULL, false, BDRV_WANT_ZERO,
|
||||
+ 0, bytes, &pnum, NULL, NULL,
|
||||
+ NULL);
|
||||
+ if (ret < 0) {
|
||||
+ return ret;
|
||||
+ }
|
||||
+ if (ret & BDRV_BLOCK_ZERO) {
|
||||
+ return bdrv_co_is_zero_fast(bs, pnum, bytes - pnum);
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * Because of the way 'blockdev-create' works, raw files tend to
|
||||
+ * be created with a non-sparse region at the front to make
|
||||
+ * alignment probing easier. If the block starts with only a
|
||||
+ * small allocated region, it is still worth the effort to see if
|
||||
+ * the rest of the image is still sparse, coupled with manually
|
||||
+ * reading the first region to see if it reads zero after all.
|
||||
+ */
|
||||
+ if (pnum > MAX_ZERO_CHECK_BUFFER) {
|
||||
+ return 0;
|
||||
+ }
|
||||
+ ret = bdrv_co_is_zero_fast(bs, pnum, bytes - pnum);
|
||||
+ if (ret <= 0) {
|
||||
+ return ret;
|
||||
+ }
|
||||
+ /* Only the head of the image is unknown, and it's small. Read it. */
|
||||
+ buf = qemu_blockalign(bs, pnum);
|
||||
+ qemu_iovec_init_buf(&local_qiov, buf, pnum);
|
||||
+ ret = bdrv_driver_preadv(bs, 0, pnum, &local_qiov, 0, 0);
|
||||
+ if (ret >= 0) {
|
||||
+ ret = buffer_is_zero(buf, pnum);
|
||||
+ }
|
||||
+ qemu_vfree(buf);
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset,
|
||||
int64_t bytes, int64_t *pnum)
|
||||
{
|
||||
diff --git a/include/block/block-io.h b/include/block/block-io.h
|
||||
index b49e0537dd..b99cc98d26 100644
|
||||
--- a/include/block/block-io.h
|
||||
+++ b/include/block/block-io.h
|
||||
@@ -161,6 +161,8 @@ bdrv_is_allocated_above(BlockDriverState *bs, BlockDriverState *base,
|
||||
|
||||
int coroutine_fn GRAPH_RDLOCK
|
||||
bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, int64_t bytes);
|
||||
+int coroutine_fn GRAPH_RDLOCK
|
||||
+bdrv_co_is_all_zeroes(BlockDriverState *bs);
|
||||
|
||||
int GRAPH_RDLOCK
|
||||
bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg,
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,689 @@
|
||||
From 26f5d221dd16137bed3527ee120cdf085e2c7e23 Mon Sep 17 00:00:00 2001
|
||||
From: Eric Blake <eblake@redhat.com>
|
||||
Date: Fri, 9 May 2025 15:40:18 -0500
|
||||
Subject: [PATCH 03/16] block: Expand block status mode from bool to flags
|
||||
|
||||
RH-Author: Eric Blake <eblake@redhat.com>
|
||||
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
|
||||
RH-Jira: RHEL-82906 RHEL-83015
|
||||
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [1/14] 9de5245def80e9815ed306e4abce9caec56cef6f (ebblake/centos-qemu-kvm)
|
||||
|
||||
This patch is purely mechanical, changing bool want_zero into an
|
||||
unsigned int for bitwise-or of flags. As of this patch, all
|
||||
implementations are unchanged (the old want_zero==true is now
|
||||
mode==BDRV_WANT_PRECISE which is a superset of BDRV_WANT_ZERO); but
|
||||
the callers in io.c that used to pass want_zero==false are now
|
||||
prepared for future driver changes that can now distinguish between
|
||||
BDRV_WANT_ZERO vs. BDRV_WANT_ALLOCATED. The next patch will actually
|
||||
change the file-posix driver along those lines, now that we have
|
||||
more-specific hints.
|
||||
|
||||
As for the background why this patch is useful: right now, the
|
||||
file-posix driver recognizes that if allocation is being queried, the
|
||||
entire image can be reported as allocated (there is no backing file to
|
||||
refer to) - but this throws away information on whether the entire
|
||||
image reads as zero (trivially true if lseek(SEEK_HOLE) at offset 0
|
||||
returns -ENXIO, a bit more complicated to prove if the raw file was
|
||||
created with 'qemu-img create' since we intentionally allocate a small
|
||||
chunk of all-zero data to help with alignment probing). Later patches
|
||||
will add a generic algorithm for seeing if an entire file reads as
|
||||
zeroes.
|
||||
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-ID: <20250509204341.3553601-16-eblake@redhat.com>
|
||||
(cherry picked from commit c33159dec79069514f78faecfe268439226b0f5b)
|
||||
Jira: https://issues.redhat.com/browse/RHEL-82906
|
||||
Jira: https://issues.redhat.com/browse/RHEL-83015
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
---
|
||||
block/blkdebug.c | 6 ++--
|
||||
block/copy-before-write.c | 4 +--
|
||||
block/coroutines.h | 4 +--
|
||||
block/file-posix.c | 4 +--
|
||||
block/gluster.c | 4 +--
|
||||
block/io.c | 51 ++++++++++++++++----------------
|
||||
block/iscsi.c | 6 ++--
|
||||
block/nbd.c | 4 +--
|
||||
block/null.c | 6 ++--
|
||||
block/parallels.c | 6 ++--
|
||||
block/qcow.c | 2 +-
|
||||
block/qcow2.c | 6 ++--
|
||||
block/qed.c | 6 ++--
|
||||
block/quorum.c | 4 +--
|
||||
block/raw-format.c | 4 +--
|
||||
block/rbd.c | 6 ++--
|
||||
block/snapshot-access.c | 4 +--
|
||||
block/vdi.c | 4 +--
|
||||
block/vmdk.c | 2 +-
|
||||
block/vpc.c | 2 +-
|
||||
block/vvfat.c | 6 ++--
|
||||
include/block/block-common.h | 11 +++++++
|
||||
include/block/block_int-common.h | 27 +++++++++--------
|
||||
include/block/block_int-io.h | 4 +--
|
||||
tests/unit/test-block-iothread.c | 2 +-
|
||||
25 files changed, 99 insertions(+), 86 deletions(-)
|
||||
|
||||
diff --git a/block/blkdebug.c b/block/blkdebug.c
|
||||
index c95c818c38..736ae2b56b 100644
|
||||
--- a/block/blkdebug.c
|
||||
+++ b/block/blkdebug.c
|
||||
@@ -751,9 +751,9 @@ blkdebug_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
|
||||
}
|
||||
|
||||
static int coroutine_fn GRAPH_RDLOCK
|
||||
-blkdebug_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
|
||||
- int64_t bytes, int64_t *pnum, int64_t *map,
|
||||
- BlockDriverState **file)
|
||||
+blkdebug_co_block_status(BlockDriverState *bs, unsigned int mode,
|
||||
+ int64_t offset, int64_t bytes, int64_t *pnum,
|
||||
+ int64_t *map, BlockDriverState **file)
|
||||
{
|
||||
int err;
|
||||
|
||||
diff --git a/block/copy-before-write.c b/block/copy-before-write.c
|
||||
index 853e01a1eb..36488cdeca 100644
|
||||
--- a/block/copy-before-write.c
|
||||
+++ b/block/copy-before-write.c
|
||||
@@ -290,8 +290,8 @@ cbw_co_preadv_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes,
|
||||
}
|
||||
|
||||
static int coroutine_fn GRAPH_RDLOCK
|
||||
-cbw_co_snapshot_block_status(BlockDriverState *bs,
|
||||
- bool want_zero, int64_t offset, int64_t bytes,
|
||||
+cbw_co_snapshot_block_status(BlockDriverState *bs, unsigned int mode,
|
||||
+ int64_t offset, int64_t bytes,
|
||||
int64_t *pnum, int64_t *map,
|
||||
BlockDriverState **file)
|
||||
{
|
||||
diff --git a/block/coroutines.h b/block/coroutines.h
|
||||
index f3226682d6..811ef12e43 100644
|
||||
--- a/block/coroutines.h
|
||||
+++ b/block/coroutines.h
|
||||
@@ -47,7 +47,7 @@ int coroutine_fn GRAPH_RDLOCK
|
||||
bdrv_co_common_block_status_above(BlockDriverState *bs,
|
||||
BlockDriverState *base,
|
||||
bool include_base,
|
||||
- bool want_zero,
|
||||
+ unsigned int mode,
|
||||
int64_t offset,
|
||||
int64_t bytes,
|
||||
int64_t *pnum,
|
||||
@@ -78,7 +78,7 @@ int co_wrapper_mixed_bdrv_rdlock
|
||||
bdrv_common_block_status_above(BlockDriverState *bs,
|
||||
BlockDriverState *base,
|
||||
bool include_base,
|
||||
- bool want_zero,
|
||||
+ unsigned int mode,
|
||||
int64_t offset,
|
||||
int64_t bytes,
|
||||
int64_t *pnum,
|
||||
diff --git a/block/file-posix.c b/block/file-posix.c
|
||||
index f17a3f4d10..9ca55620ca 100644
|
||||
--- a/block/file-posix.c
|
||||
+++ b/block/file-posix.c
|
||||
@@ -3277,7 +3277,7 @@ static int find_allocation(BlockDriverState *bs, off_t start,
|
||||
* well exceed it.
|
||||
*/
|
||||
static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
|
||||
- bool want_zero,
|
||||
+ unsigned int mode,
|
||||
int64_t offset,
|
||||
int64_t bytes, int64_t *pnum,
|
||||
int64_t *map,
|
||||
@@ -3293,7 +3293,7 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
|
||||
return ret;
|
||||
}
|
||||
|
||||
- if (!want_zero) {
|
||||
+ if (mode != BDRV_WANT_PRECISE) {
|
||||
*pnum = bytes;
|
||||
*map = offset;
|
||||
*file = bs;
|
||||
diff --git a/block/gluster.c b/block/gluster.c
|
||||
index f8b415f381..ae5c45666b 100644
|
||||
--- a/block/gluster.c
|
||||
+++ b/block/gluster.c
|
||||
@@ -1466,7 +1466,7 @@ exit:
|
||||
* (Based on raw_co_block_status() from file-posix.c.)
|
||||
*/
|
||||
static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs,
|
||||
- bool want_zero,
|
||||
+ unsigned int mode,
|
||||
int64_t offset,
|
||||
int64_t bytes,
|
||||
int64_t *pnum,
|
||||
@@ -1483,7 +1483,7 @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs,
|
||||
return ret;
|
||||
}
|
||||
|
||||
- if (!want_zero) {
|
||||
+ if (mode != BDRV_WANT_PRECISE) {
|
||||
*pnum = bytes;
|
||||
*map = offset;
|
||||
*file = bs;
|
||||
diff --git a/block/io.c b/block/io.c
|
||||
index 3e189837a1..daaafe00d7 100644
|
||||
--- a/block/io.c
|
||||
+++ b/block/io.c
|
||||
@@ -2360,10 +2360,8 @@ int bdrv_flush_all(void)
|
||||
* Drivers not implementing the functionality are assumed to not support
|
||||
* backing files, hence all their sectors are reported as allocated.
|
||||
*
|
||||
- * If 'want_zero' is true, the caller is querying for mapping
|
||||
- * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
|
||||
- * _ZERO where possible; otherwise, the result favors larger 'pnum',
|
||||
- * with a focus on accurate BDRV_BLOCK_ALLOCATED.
|
||||
+ * 'mode' serves as a hint as to which results are favored; see the
|
||||
+ * BDRV_WANT_* macros for details.
|
||||
*
|
||||
* If 'offset' is beyond the end of the disk image the return value is
|
||||
* BDRV_BLOCK_EOF and 'pnum' is set to 0.
|
||||
@@ -2383,7 +2381,7 @@ int bdrv_flush_all(void)
|
||||
* set to the host mapping and BDS corresponding to the guest offset.
|
||||
*/
|
||||
static int coroutine_fn GRAPH_RDLOCK
|
||||
-bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero,
|
||||
+bdrv_co_do_block_status(BlockDriverState *bs, unsigned int mode,
|
||||
int64_t offset, int64_t bytes,
|
||||
int64_t *pnum, int64_t *map, BlockDriverState **file)
|
||||
{
|
||||
@@ -2472,7 +2470,7 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero,
|
||||
local_file = bs;
|
||||
local_map = aligned_offset;
|
||||
} else {
|
||||
- ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
|
||||
+ ret = bs->drv->bdrv_co_block_status(bs, mode, aligned_offset,
|
||||
aligned_bytes, pnum, &local_map,
|
||||
&local_file);
|
||||
|
||||
@@ -2484,10 +2482,10 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero,
|
||||
* the cache requires an RCU update, so double check here to avoid
|
||||
* such an update if possible.
|
||||
*
|
||||
- * Check want_zero, because we only want to update the cache when we
|
||||
+ * Check mode, because we only want to update the cache when we
|
||||
* have accurate information about what is zero and what is data.
|
||||
*/
|
||||
- if (want_zero &&
|
||||
+ if (mode == BDRV_WANT_PRECISE &&
|
||||
ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) &&
|
||||
QLIST_EMPTY(&bs->children))
|
||||
{
|
||||
@@ -2544,7 +2542,7 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero,
|
||||
|
||||
if (ret & BDRV_BLOCK_RAW) {
|
||||
assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
|
||||
- ret = bdrv_co_do_block_status(local_file, want_zero, local_map,
|
||||
+ ret = bdrv_co_do_block_status(local_file, mode, local_map,
|
||||
*pnum, pnum, &local_map, &local_file);
|
||||
goto out;
|
||||
}
|
||||
@@ -2556,7 +2554,7 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero,
|
||||
|
||||
if (!cow_bs) {
|
||||
ret |= BDRV_BLOCK_ZERO;
|
||||
- } else if (want_zero) {
|
||||
+ } else if (mode == BDRV_WANT_PRECISE) {
|
||||
int64_t size2 = bdrv_co_getlength(cow_bs);
|
||||
|
||||
if (size2 >= 0 && offset >= size2) {
|
||||
@@ -2565,14 +2563,14 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero,
|
||||
}
|
||||
}
|
||||
|
||||
- if (want_zero && ret & BDRV_BLOCK_RECURSE &&
|
||||
+ if (mode == BDRV_WANT_PRECISE && ret & BDRV_BLOCK_RECURSE &&
|
||||
local_file && local_file != bs &&
|
||||
(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
|
||||
(ret & BDRV_BLOCK_OFFSET_VALID)) {
|
||||
int64_t file_pnum;
|
||||
int ret2;
|
||||
|
||||
- ret2 = bdrv_co_do_block_status(local_file, want_zero, local_map,
|
||||
+ ret2 = bdrv_co_do_block_status(local_file, mode, local_map,
|
||||
*pnum, &file_pnum, NULL, NULL);
|
||||
if (ret2 >= 0) {
|
||||
/* Ignore errors. This is just providing extra information, it
|
||||
@@ -2623,7 +2621,7 @@ int coroutine_fn
|
||||
bdrv_co_common_block_status_above(BlockDriverState *bs,
|
||||
BlockDriverState *base,
|
||||
bool include_base,
|
||||
- bool want_zero,
|
||||
+ unsigned int mode,
|
||||
int64_t offset,
|
||||
int64_t bytes,
|
||||
int64_t *pnum,
|
||||
@@ -2650,7 +2648,7 @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
|
||||
return 0;
|
||||
}
|
||||
|
||||
- ret = bdrv_co_do_block_status(bs, want_zero, offset, bytes, pnum,
|
||||
+ ret = bdrv_co_do_block_status(bs, mode, offset, bytes, pnum,
|
||||
map, file);
|
||||
++*depth;
|
||||
if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) {
|
||||
@@ -2667,7 +2665,7 @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
|
||||
for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base;
|
||||
p = bdrv_filter_or_cow_bs(p))
|
||||
{
|
||||
- ret = bdrv_co_do_block_status(p, want_zero, offset, bytes, pnum,
|
||||
+ ret = bdrv_co_do_block_status(p, mode, offset, bytes, pnum,
|
||||
map, file);
|
||||
++*depth;
|
||||
if (ret < 0) {
|
||||
@@ -2730,7 +2728,8 @@ int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
|
||||
BlockDriverState **file)
|
||||
{
|
||||
IO_CODE();
|
||||
- return bdrv_co_common_block_status_above(bs, base, false, true, offset,
|
||||
+ return bdrv_co_common_block_status_above(bs, base, false,
|
||||
+ BDRV_WANT_PRECISE, offset,
|
||||
bytes, pnum, map, file, NULL);
|
||||
}
|
||||
|
||||
@@ -2761,8 +2760,9 @@ int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
|
||||
return 1;
|
||||
}
|
||||
|
||||
- ret = bdrv_co_common_block_status_above(bs, NULL, false, false, offset,
|
||||
- bytes, &pnum, NULL, NULL, NULL);
|
||||
+ ret = bdrv_co_common_block_status_above(bs, NULL, false, BDRV_WANT_ZERO,
|
||||
+ offset, bytes, &pnum, NULL, NULL,
|
||||
+ NULL);
|
||||
|
||||
if (ret < 0) {
|
||||
return ret;
|
||||
@@ -2778,9 +2778,9 @@ int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset,
|
||||
int64_t dummy;
|
||||
IO_CODE();
|
||||
|
||||
- ret = bdrv_co_common_block_status_above(bs, bs, true, false, offset,
|
||||
- bytes, pnum ? pnum : &dummy, NULL,
|
||||
- NULL, NULL);
|
||||
+ ret = bdrv_co_common_block_status_above(bs, bs, true, BDRV_WANT_ALLOCATED,
|
||||
+ offset, bytes, pnum ? pnum : &dummy,
|
||||
+ NULL, NULL, NULL);
|
||||
if (ret < 0) {
|
||||
return ret;
|
||||
}
|
||||
@@ -2813,7 +2813,8 @@ int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *bs,
|
||||
int ret;
|
||||
IO_CODE();
|
||||
|
||||
- ret = bdrv_co_common_block_status_above(bs, base, include_base, false,
|
||||
+ ret = bdrv_co_common_block_status_above(bs, base, include_base,
|
||||
+ BDRV_WANT_ALLOCATED,
|
||||
offset, bytes, pnum, NULL, NULL,
|
||||
&depth);
|
||||
if (ret < 0) {
|
||||
@@ -3710,8 +3711,8 @@ bdrv_co_preadv_snapshot(BdrvChild *child, int64_t offset, int64_t bytes,
|
||||
}
|
||||
|
||||
int coroutine_fn
|
||||
-bdrv_co_snapshot_block_status(BlockDriverState *bs,
|
||||
- bool want_zero, int64_t offset, int64_t bytes,
|
||||
+bdrv_co_snapshot_block_status(BlockDriverState *bs, unsigned int mode,
|
||||
+ int64_t offset, int64_t bytes,
|
||||
int64_t *pnum, int64_t *map,
|
||||
BlockDriverState **file)
|
||||
{
|
||||
@@ -3729,7 +3730,7 @@ bdrv_co_snapshot_block_status(BlockDriverState *bs,
|
||||
}
|
||||
|
||||
bdrv_inc_in_flight(bs);
|
||||
- ret = drv->bdrv_co_snapshot_block_status(bs, want_zero, offset, bytes,
|
||||
+ ret = drv->bdrv_co_snapshot_block_status(bs, mode, offset, bytes,
|
||||
pnum, map, file);
|
||||
bdrv_dec_in_flight(bs);
|
||||
|
||||
diff --git a/block/iscsi.c b/block/iscsi.c
|
||||
index 979bf90cb7..d7caa4b363 100644
|
||||
--- a/block/iscsi.c
|
||||
+++ b/block/iscsi.c
|
||||
@@ -694,9 +694,9 @@ out_unlock:
|
||||
|
||||
|
||||
static int coroutine_fn iscsi_co_block_status(BlockDriverState *bs,
|
||||
- bool want_zero, int64_t offset,
|
||||
- int64_t bytes, int64_t *pnum,
|
||||
- int64_t *map,
|
||||
+ unsigned int mode,
|
||||
+ int64_t offset, int64_t bytes,
|
||||
+ int64_t *pnum, int64_t *map,
|
||||
BlockDriverState **file)
|
||||
{
|
||||
IscsiLun *iscsilun = bs->opaque;
|
||||
diff --git a/block/nbd.c b/block/nbd.c
|
||||
index d464315766..a359aa236e 100644
|
||||
--- a/block/nbd.c
|
||||
+++ b/block/nbd.c
|
||||
@@ -1397,8 +1397,8 @@ nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
|
||||
}
|
||||
|
||||
static int coroutine_fn GRAPH_RDLOCK nbd_client_co_block_status(
|
||||
- BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
|
||||
- int64_t *pnum, int64_t *map, BlockDriverState **file)
|
||||
+ BlockDriverState *bs, unsigned int mode, int64_t offset,
|
||||
+ int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file)
|
||||
{
|
||||
int ret, request_ret;
|
||||
NBDExtent64 extent = { 0 };
|
||||
diff --git a/block/null.c b/block/null.c
|
||||
index 4730acc1eb..95021230c8 100644
|
||||
--- a/block/null.c
|
||||
+++ b/block/null.c
|
||||
@@ -227,9 +227,9 @@ static int null_reopen_prepare(BDRVReopenState *reopen_state,
|
||||
}
|
||||
|
||||
static int coroutine_fn null_co_block_status(BlockDriverState *bs,
|
||||
- bool want_zero, int64_t offset,
|
||||
- int64_t bytes, int64_t *pnum,
|
||||
- int64_t *map,
|
||||
+ unsigned int mode,
|
||||
+ int64_t offset, int64_t bytes,
|
||||
+ int64_t *pnum, int64_t *map,
|
||||
BlockDriverState **file)
|
||||
{
|
||||
BDRVNullState *s = bs->opaque;
|
||||
diff --git a/block/parallels.c b/block/parallels.c
|
||||
index 9205a0864f..22ea7834fd 100644
|
||||
--- a/block/parallels.c
|
||||
+++ b/block/parallels.c
|
||||
@@ -416,9 +416,9 @@ parallels_co_flush_to_os(BlockDriverState *bs)
|
||||
}
|
||||
|
||||
static int coroutine_fn GRAPH_RDLOCK
|
||||
-parallels_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
|
||||
- int64_t bytes, int64_t *pnum, int64_t *map,
|
||||
- BlockDriverState **file)
|
||||
+parallels_co_block_status(BlockDriverState *bs, unsigned int mode,
|
||||
+ int64_t offset, int64_t bytes, int64_t *pnum,
|
||||
+ int64_t *map, BlockDriverState **file)
|
||||
{
|
||||
BDRVParallelsState *s = bs->opaque;
|
||||
int count;
|
||||
diff --git a/block/qcow.c b/block/qcow.c
|
||||
index c2f89db055..2e18c42d8f 100644
|
||||
--- a/block/qcow.c
|
||||
+++ b/block/qcow.c
|
||||
@@ -530,7 +530,7 @@ get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate,
|
||||
}
|
||||
|
||||
static int coroutine_fn GRAPH_RDLOCK
|
||||
-qcow_co_block_status(BlockDriverState *bs, bool want_zero,
|
||||
+qcow_co_block_status(BlockDriverState *bs, unsigned int mode,
|
||||
int64_t offset, int64_t bytes, int64_t *pnum,
|
||||
int64_t *map, BlockDriverState **file)
|
||||
{
|
||||
diff --git a/block/qcow2.c b/block/qcow2.c
|
||||
index a4cffb628c..788da07fee 100644
|
||||
--- a/block/qcow2.c
|
||||
+++ b/block/qcow2.c
|
||||
@@ -2147,9 +2147,9 @@ static void qcow2_join_options(QDict *options, QDict *old_options)
|
||||
}
|
||||
|
||||
static int coroutine_fn GRAPH_RDLOCK
|
||||
-qcow2_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
|
||||
- int64_t count, int64_t *pnum, int64_t *map,
|
||||
- BlockDriverState **file)
|
||||
+qcow2_co_block_status(BlockDriverState *bs, unsigned int mode,
|
||||
+ int64_t offset, int64_t count, int64_t *pnum,
|
||||
+ int64_t *map, BlockDriverState **file)
|
||||
{
|
||||
BDRVQcow2State *s = bs->opaque;
|
||||
uint64_t host_offset;
|
||||
diff --git a/block/qed.c b/block/qed.c
|
||||
index fa5bc11085..b135e981e5 100644
|
||||
--- a/block/qed.c
|
||||
+++ b/block/qed.c
|
||||
@@ -832,9 +832,9 @@ fail:
|
||||
}
|
||||
|
||||
static int coroutine_fn GRAPH_RDLOCK
|
||||
-bdrv_qed_co_block_status(BlockDriverState *bs, bool want_zero, int64_t pos,
|
||||
- int64_t bytes, int64_t *pnum, int64_t *map,
|
||||
- BlockDriverState **file)
|
||||
+bdrv_qed_co_block_status(BlockDriverState *bs, unsigned int mode,
|
||||
+ int64_t pos, int64_t bytes, int64_t *pnum,
|
||||
+ int64_t *map, BlockDriverState **file)
|
||||
{
|
||||
BDRVQEDState *s = bs->opaque;
|
||||
size_t len = MIN(bytes, SIZE_MAX);
|
||||
diff --git a/block/quorum.c b/block/quorum.c
|
||||
index db8fe891c4..bb4ed9483e 100644
|
||||
--- a/block/quorum.c
|
||||
+++ b/block/quorum.c
|
||||
@@ -1226,7 +1226,7 @@ static void quorum_child_perm(BlockDriverState *bs, BdrvChild *c,
|
||||
* region contains zeroes, and BDRV_BLOCK_DATA otherwise.
|
||||
*/
|
||||
static int coroutine_fn GRAPH_RDLOCK
|
||||
-quorum_co_block_status(BlockDriverState *bs, bool want_zero,
|
||||
+quorum_co_block_status(BlockDriverState *bs, unsigned int mode,
|
||||
int64_t offset, int64_t count,
|
||||
int64_t *pnum, int64_t *map, BlockDriverState **file)
|
||||
{
|
||||
@@ -1238,7 +1238,7 @@ quorum_co_block_status(BlockDriverState *bs, bool want_zero,
|
||||
for (i = 0; i < s->num_children; i++) {
|
||||
int64_t bytes;
|
||||
ret = bdrv_co_common_block_status_above(s->children[i]->bs, NULL, false,
|
||||
- want_zero, offset, count,
|
||||
+ mode, offset, count,
|
||||
&bytes, NULL, NULL, NULL);
|
||||
if (ret < 0) {
|
||||
quorum_report_bad(QUORUM_OP_TYPE_READ, offset, count,
|
||||
diff --git a/block/raw-format.c b/block/raw-format.c
|
||||
index ac7e8495f6..623bca87a6 100644
|
||||
--- a/block/raw-format.c
|
||||
+++ b/block/raw-format.c
|
||||
@@ -283,8 +283,8 @@ fail:
|
||||
}
|
||||
|
||||
static int coroutine_fn GRAPH_RDLOCK
|
||||
-raw_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
|
||||
- int64_t bytes, int64_t *pnum, int64_t *map,
|
||||
+raw_co_block_status(BlockDriverState *bs, unsigned int mode,
|
||||
+ int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map,
|
||||
BlockDriverState **file)
|
||||
{
|
||||
BDRVRawState *s = bs->opaque;
|
||||
diff --git a/block/rbd.c b/block/rbd.c
|
||||
index 9c0fd0cb3f..627f8eb05a 100644
|
||||
--- a/block/rbd.c
|
||||
+++ b/block/rbd.c
|
||||
@@ -1504,9 +1504,9 @@ static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len,
|
||||
}
|
||||
|
||||
static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
|
||||
- bool want_zero, int64_t offset,
|
||||
- int64_t bytes, int64_t *pnum,
|
||||
- int64_t *map,
|
||||
+ unsigned int mode,
|
||||
+ int64_t offset, int64_t bytes,
|
||||
+ int64_t *pnum, int64_t *map,
|
||||
BlockDriverState **file)
|
||||
{
|
||||
BDRVRBDState *s = bs->opaque;
|
||||
diff --git a/block/snapshot-access.c b/block/snapshot-access.c
|
||||
index 84d0d13f86..972b8f2e68 100644
|
||||
--- a/block/snapshot-access.c
|
||||
+++ b/block/snapshot-access.c
|
||||
@@ -41,11 +41,11 @@ snapshot_access_co_preadv_part(BlockDriverState *bs,
|
||||
|
||||
static int coroutine_fn GRAPH_RDLOCK
|
||||
snapshot_access_co_block_status(BlockDriverState *bs,
|
||||
- bool want_zero, int64_t offset,
|
||||
+ unsigned int mode, int64_t offset,
|
||||
int64_t bytes, int64_t *pnum,
|
||||
int64_t *map, BlockDriverState **file)
|
||||
{
|
||||
- return bdrv_co_snapshot_block_status(bs->file->bs, want_zero, offset,
|
||||
+ return bdrv_co_snapshot_block_status(bs->file->bs, mode, offset,
|
||||
bytes, pnum, map, file);
|
||||
}
|
||||
|
||||
diff --git a/block/vdi.c b/block/vdi.c
|
||||
index 6363da08ce..028fe68488 100644
|
||||
--- a/block/vdi.c
|
||||
+++ b/block/vdi.c
|
||||
@@ -521,8 +521,8 @@ static int vdi_reopen_prepare(BDRVReopenState *state,
|
||||
}
|
||||
|
||||
static int coroutine_fn GRAPH_RDLOCK
|
||||
-vdi_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
|
||||
- int64_t bytes, int64_t *pnum, int64_t *map,
|
||||
+vdi_co_block_status(BlockDriverState *bs, unsigned int mode,
|
||||
+ int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map,
|
||||
BlockDriverState **file)
|
||||
{
|
||||
BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
|
||||
diff --git a/block/vmdk.c b/block/vmdk.c
|
||||
index 78f6433607..6f1af82078 100644
|
||||
--- a/block/vmdk.c
|
||||
+++ b/block/vmdk.c
|
||||
@@ -1777,7 +1777,7 @@ static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent,
|
||||
}
|
||||
|
||||
static int coroutine_fn GRAPH_RDLOCK
|
||||
-vmdk_co_block_status(BlockDriverState *bs, bool want_zero,
|
||||
+vmdk_co_block_status(BlockDriverState *bs, unsigned int mode,
|
||||
int64_t offset, int64_t bytes, int64_t *pnum,
|
||||
int64_t *map, BlockDriverState **file)
|
||||
{
|
||||
diff --git a/block/vpc.c b/block/vpc.c
|
||||
index d95a204612..0dd641b614 100644
|
||||
--- a/block/vpc.c
|
||||
+++ b/block/vpc.c
|
||||
@@ -721,7 +721,7 @@ fail:
|
||||
}
|
||||
|
||||
static int coroutine_fn GRAPH_RDLOCK
|
||||
-vpc_co_block_status(BlockDriverState *bs, bool want_zero,
|
||||
+vpc_co_block_status(BlockDriverState *bs, unsigned int mode,
|
||||
int64_t offset, int64_t bytes,
|
||||
int64_t *pnum, int64_t *map,
|
||||
BlockDriverState **file)
|
||||
diff --git a/block/vvfat.c b/block/vvfat.c
|
||||
index 8ffe8b3b9b..d59231357e 100644
|
||||
--- a/block/vvfat.c
|
||||
+++ b/block/vvfat.c
|
||||
@@ -3135,9 +3135,9 @@ vvfat_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
|
||||
}
|
||||
|
||||
static int coroutine_fn vvfat_co_block_status(BlockDriverState *bs,
|
||||
- bool want_zero, int64_t offset,
|
||||
- int64_t bytes, int64_t *n,
|
||||
- int64_t *map,
|
||||
+ unsigned int mode,
|
||||
+ int64_t offset, int64_t bytes,
|
||||
+ int64_t *n, int64_t *map,
|
||||
BlockDriverState **file)
|
||||
{
|
||||
*n = bytes;
|
||||
diff --git a/include/block/block-common.h b/include/block/block-common.h
|
||||
index 7030669f04..5beee6402b 100644
|
||||
--- a/include/block/block-common.h
|
||||
+++ b/include/block/block-common.h
|
||||
@@ -333,6 +333,17 @@ typedef enum {
|
||||
#define BDRV_BLOCK_RECURSE 0x40
|
||||
#define BDRV_BLOCK_COMPRESSED 0x80
|
||||
|
||||
+/*
|
||||
+ * Block status hints: the bitwise-or of these flags emphasizes what
|
||||
+ * the caller hopes to learn, and some drivers may be able to give
|
||||
+ * faster answers by doing less work when the hint permits.
|
||||
+ */
|
||||
+#define BDRV_WANT_ZERO BDRV_BLOCK_ZERO
|
||||
+#define BDRV_WANT_OFFSET_VALID BDRV_BLOCK_OFFSET_VALID
|
||||
+#define BDRV_WANT_ALLOCATED BDRV_BLOCK_ALLOCATED
|
||||
+#define BDRV_WANT_PRECISE (BDRV_WANT_ZERO | BDRV_WANT_OFFSET_VALID | \
|
||||
+ BDRV_WANT_ALLOCATED)
|
||||
+
|
||||
typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;
|
||||
|
||||
typedef struct BDRVReopenState {
|
||||
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
|
||||
index ebb4e56a50..a9c0daa2a4 100644
|
||||
--- a/include/block/block_int-common.h
|
||||
+++ b/include/block/block_int-common.h
|
||||
@@ -608,15 +608,16 @@ struct BlockDriver {
|
||||
* according to the current layer, and should only need to set
|
||||
* BDRV_BLOCK_DATA, BDRV_BLOCK_ZERO, BDRV_BLOCK_OFFSET_VALID,
|
||||
* and/or BDRV_BLOCK_RAW; if the current layer defers to a backing
|
||||
- * layer, the result should be 0 (and not BDRV_BLOCK_ZERO). See
|
||||
- * block.h for the overall meaning of the bits. As a hint, the
|
||||
- * flag want_zero is true if the caller cares more about precise
|
||||
- * mappings (favor accurate _OFFSET_VALID/_ZERO) or false for
|
||||
- * overall allocation (favor larger *pnum, perhaps by reporting
|
||||
- * _DATA instead of _ZERO). The block layer guarantees input
|
||||
- * clamped to bdrv_getlength() and aligned to request_alignment,
|
||||
- * as well as non-NULL pnum, map, and file; in turn, the driver
|
||||
- * must return an error or set pnum to an aligned non-zero value.
|
||||
+ * layer, the result should be 0 (and not BDRV_BLOCK_ZERO). The
|
||||
+ * caller will synthesize BDRV_BLOCK_ALLOCATED based on the
|
||||
+ * non-zero results. See block.h for the overall meaning of the
|
||||
+ * bits. As a hint, the flags in @mode may include a bitwise-or
|
||||
+ * of BDRV_WANT_ALLOCATED, BDRV_WANT_OFFSET_VALID, or
|
||||
+ * BDRV_WANT_ZERO based on what the caller is looking for in the
|
||||
+ * results. The block layer guarantees input clamped to
|
||||
+ * bdrv_getlength() and aligned to request_alignment, as well as
|
||||
+ * non-NULL pnum, map, and file; in turn, the driver must return
|
||||
+ * an error or set pnum to an aligned non-zero value.
|
||||
*
|
||||
* Note that @bytes is just a hint on how big of a region the
|
||||
* caller wants to inspect. It is not a limit on *pnum.
|
||||
@@ -628,8 +629,8 @@ struct BlockDriver {
|
||||
* to clamping *pnum for return to its caller.
|
||||
*/
|
||||
int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_block_status)(
|
||||
- BlockDriverState *bs,
|
||||
- bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum,
|
||||
+ BlockDriverState *bs, unsigned int mode,
|
||||
+ int64_t offset, int64_t bytes, int64_t *pnum,
|
||||
int64_t *map, BlockDriverState **file);
|
||||
|
||||
/*
|
||||
@@ -653,8 +654,8 @@ struct BlockDriver {
|
||||
QEMUIOVector *qiov, size_t qiov_offset);
|
||||
|
||||
int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_snapshot_block_status)(
|
||||
- BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
|
||||
- int64_t *pnum, int64_t *map, BlockDriverState **file);
|
||||
+ BlockDriverState *bs, unsigned int mode, int64_t offset,
|
||||
+ int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file);
|
||||
|
||||
int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_pdiscard_snapshot)(
|
||||
BlockDriverState *bs, int64_t offset, int64_t bytes);
|
||||
diff --git a/include/block/block_int-io.h b/include/block/block_int-io.h
|
||||
index 4a7cf2b4fd..4f94eb3c5a 100644
|
||||
--- a/include/block/block_int-io.h
|
||||
+++ b/include/block/block_int-io.h
|
||||
@@ -38,8 +38,8 @@
|
||||
int coroutine_fn GRAPH_RDLOCK bdrv_co_preadv_snapshot(BdrvChild *child,
|
||||
int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset);
|
||||
int coroutine_fn GRAPH_RDLOCK bdrv_co_snapshot_block_status(
|
||||
- BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
|
||||
- int64_t *pnum, int64_t *map, BlockDriverState **file);
|
||||
+ BlockDriverState *bs, unsigned int mode, int64_t offset,
|
||||
+ int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file);
|
||||
int coroutine_fn GRAPH_RDLOCK bdrv_co_pdiscard_snapshot(BlockDriverState *bs,
|
||||
int64_t offset, int64_t bytes);
|
||||
|
||||
diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c
|
||||
index 3766d5de6b..373b72fdd8 100644
|
||||
--- a/tests/unit/test-block-iothread.c
|
||||
+++ b/tests/unit/test-block-iothread.c
|
||||
@@ -63,7 +63,7 @@ bdrv_test_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
|
||||
}
|
||||
|
||||
static int coroutine_fn bdrv_test_co_block_status(BlockDriverState *bs,
|
||||
- bool want_zero,
|
||||
+ unsigned int mode,
|
||||
int64_t offset, int64_t count,
|
||||
int64_t *pnum, int64_t *map,
|
||||
BlockDriverState **file)
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,90 @@
|
||||
From 9f8158e56beae4221e91feb5a98cb4db9076cac4 Mon Sep 17 00:00:00 2001
|
||||
From: Eric Blake <eblake@redhat.com>
|
||||
Date: Fri, 9 May 2025 15:40:20 -0500
|
||||
Subject: [PATCH 05/16] block: Let bdrv_co_is_zero_fast consolidate adjacent
|
||||
extents
|
||||
|
||||
RH-Author: Eric Blake <eblake@redhat.com>
|
||||
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
|
||||
RH-Jira: RHEL-82906 RHEL-83015
|
||||
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [3/14] 98bf9ff773d9a36f8a8e294e38629e3f20c41334 (ebblake/centos-qemu-kvm)
|
||||
|
||||
Some BDS drivers have a cap on how much block status they can supply
|
||||
in one query (for example, NBD talking to an older server cannot
|
||||
inspect more than 4G per query; and qcow2 tends to cap its answers
|
||||
rather than cross a cluster boundary of an L1 table). Although the
|
||||
existing callers of bdrv_co_is_zero_fast are not passing in that large
|
||||
of a 'bytes' parameter, an upcoming caller wants to query the entire
|
||||
image at once, and will thus benefit from being able to treat adjacent
|
||||
zero regions in a coalesced manner, rather than claiming the region is
|
||||
non-zero merely because pnum was truncated and didn't match the
|
||||
incoming bytes.
|
||||
|
||||
While refactoring this into a loop, note that there is no need to
|
||||
assign pnum prior to calling bdrv_co_common_block_status_above() (it
|
||||
is guaranteed to be assigned deeper in the callstack).
|
||||
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-ID: <20250509204341.3553601-18-eblake@redhat.com>
|
||||
(cherry picked from commit 31bf15d97dd1d205a3b264675f9a1b3bd1939068)
|
||||
Jira: https://issues.redhat.com/browse/RHEL-82906
|
||||
Jira: https://issues.redhat.com/browse/RHEL-83015
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
---
|
||||
block/io.c | 27 +++++++++++++++------------
|
||||
1 file changed, 15 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/block/io.c b/block/io.c
|
||||
index daaafe00d7..293c5dd393 100644
|
||||
--- a/block/io.c
|
||||
+++ b/block/io.c
|
||||
@@ -2747,28 +2747,31 @@ int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, int64_t offset,
|
||||
* by @offset and @bytes is known to read as zeroes.
|
||||
* Return 1 if that is the case, 0 otherwise and -errno on error.
|
||||
* This test is meant to be fast rather than accurate so returning 0
|
||||
- * does not guarantee non-zero data.
|
||||
+ * does not guarantee non-zero data; but a return of 1 is reliable.
|
||||
*/
|
||||
int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
|
||||
int64_t bytes)
|
||||
{
|
||||
int ret;
|
||||
- int64_t pnum = bytes;
|
||||
+ int64_t pnum;
|
||||
IO_CODE();
|
||||
|
||||
- if (!bytes) {
|
||||
- return 1;
|
||||
- }
|
||||
-
|
||||
- ret = bdrv_co_common_block_status_above(bs, NULL, false, BDRV_WANT_ZERO,
|
||||
- offset, bytes, &pnum, NULL, NULL,
|
||||
- NULL);
|
||||
+ while (bytes) {
|
||||
+ ret = bdrv_co_common_block_status_above(bs, NULL, false,
|
||||
+ BDRV_WANT_ZERO, offset, bytes,
|
||||
+ &pnum, NULL, NULL, NULL);
|
||||
|
||||
- if (ret < 0) {
|
||||
- return ret;
|
||||
+ if (ret < 0) {
|
||||
+ return ret;
|
||||
+ }
|
||||
+ if (!(ret & BDRV_BLOCK_ZERO)) {
|
||||
+ return 0;
|
||||
+ }
|
||||
+ offset += pnum;
|
||||
+ bytes -= pnum;
|
||||
}
|
||||
|
||||
- return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO);
|
||||
+ return 1;
|
||||
}
|
||||
|
||||
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset,
|
||||
--
|
||||
2.48.1
|
||||
|
||||
74
SOURCES/kvm-block-io-skip-head-tail-requests-on-EINVAL.patch
Normal file
74
SOURCES/kvm-block-io-skip-head-tail-requests-on-EINVAL.patch
Normal file
@ -0,0 +1,74 @@
|
||||
From e629a362860977161e43ed80bb59d1d05a06b2f2 Mon Sep 17 00:00:00 2001
|
||||
From: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Date: Thu, 17 Apr 2025 11:05:28 -0400
|
||||
Subject: [PATCH 4/5] block/io: skip head/tail requests on EINVAL
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 355: file-posix: probe discard alignment on Linux block devices
|
||||
RH-Jira: RHEL-86032
|
||||
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
|
||||
RH-Acked-by: Eric Blake <eblake@redhat.com>
|
||||
RH-Commit: [2/3] 0028fb11f18e16e2aba9506eabb2383c406d17b5 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
When guests send misaligned discard requests, the block layer breaks
|
||||
them up into a misaligned head, an aligned main body, and a misaligned
|
||||
tail.
|
||||
|
||||
The file-posix block driver on Linux returns -EINVAL on misaligned
|
||||
discard requests. This causes bdrv_co_pdiscard() to fail and guests
|
||||
configured with werror=stop will pause.
|
||||
|
||||
Add a special case for misaligned head/tail requests. Simply continue
|
||||
when EINVAL is encountered so that the aligned main body of the request
|
||||
can be completed and the guest is not paused. This is the best we can do
|
||||
when guest discard limits do not match the host discard limits.
|
||||
|
||||
Fixes: https://issues.redhat.com/browse/RHEL-86032
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Reviewed-by: Hanna Czenczek <hreitz@redhat.com>
|
||||
Message-ID: <20250417150528.76470-3-stefanha@redhat.com>
|
||||
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
|
||||
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
|
||||
(cherry picked from commit 4733cb0833c4b223f92ec0136980eeb5239ecb87)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
block/io.c | 15 ++++++++++-----
|
||||
1 file changed, 10 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/block/io.c b/block/io.c
|
||||
index 301514c880..3e189837a1 100644
|
||||
--- a/block/io.c
|
||||
+++ b/block/io.c
|
||||
@@ -3105,11 +3105,12 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
|
||||
/* Invalidate the cached block-status data range if this discard overlaps */
|
||||
bdrv_bsc_invalidate_range(bs, offset, bytes);
|
||||
|
||||
- /* Discard is advisory, but some devices track and coalesce
|
||||
+ /*
|
||||
+ * Discard is advisory, but some devices track and coalesce
|
||||
* unaligned requests, so we must pass everything down rather than
|
||||
- * round here. Still, most devices will just silently ignore
|
||||
- * unaligned requests (by returning -ENOTSUP), so we must fragment
|
||||
- * the request accordingly. */
|
||||
+ * round here. Still, most devices reject unaligned requests with
|
||||
+ * -EINVAL or -ENOTSUP, so we must fragment the request accordingly.
|
||||
+ */
|
||||
align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
|
||||
assert(align % bs->bl.request_alignment == 0);
|
||||
head = offset % align;
|
||||
@@ -3176,7 +3177,11 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
|
||||
}
|
||||
}
|
||||
if (ret && ret != -ENOTSUP) {
|
||||
- goto out;
|
||||
+ if (ret == -EINVAL && (offset % align != 0 || num % align != 0)) {
|
||||
+ /* Silently skip rejected unaligned head/tail requests */
|
||||
+ } else {
|
||||
+ goto out; /* bail out */
|
||||
+ }
|
||||
}
|
||||
|
||||
offset += num;
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,48 @@
|
||||
From d38bdce712f572e1920e3344132ff6600d657de2 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:41 +0100
|
||||
Subject: [PATCH 29/57] block: skip automatic zero-init of large array in
|
||||
ioq_submit
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [3/30] 301a08b3acdcd95634dec5dab1d96fcfe3abf3be (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'ioq_submit' method has a struct array that is 8k in size.
|
||||
Skip the automatic zero-init of this array to eliminate the
|
||||
performance overhead in the I/O hot path.
|
||||
|
||||
The 'iocbs' array will be selectively initialized when processing
|
||||
the I/O data.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-4-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 83750c1da807c973b0b11d977d61df7e41122d03)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
block/linux-aio.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/block/linux-aio.c b/block/linux-aio.c
|
||||
index e3b5ec9aba..26d9f086d2 100644
|
||||
--- a/block/linux-aio.c
|
||||
+++ b/block/linux-aio.c
|
||||
@@ -291,7 +291,7 @@ static void ioq_submit(LinuxAioState *s)
|
||||
{
|
||||
int ret, len;
|
||||
struct qemu_laiocb *aiocb;
|
||||
- struct iocb *iocbs[MAX_EVENTS];
|
||||
+ QEMU_UNINITIALIZED struct iocb *iocbs[MAX_EVENTS];
|
||||
QSIMPLEQ_HEAD(, qemu_laiocb) completed;
|
||||
|
||||
do {
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,49 @@
|
||||
From 1e8798a3adbbfc42167aaba0ee18175deac37193 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:42 +0100
|
||||
Subject: [PATCH 30/57] chardev/char-fd: skip automatic zero-init of large
|
||||
array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [4/30] b16fe5c9af4756e1856cd330df02a1a09d9f33ea (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'fd_chr_read' method has a 4k byte array used for copying
|
||||
data between the socket and device. Skip the automatic zero-init
|
||||
of this array to eliminate the performance overhead in the I/O
|
||||
hot path.
|
||||
|
||||
The 'buf' array will be fully initialized when reading data off
|
||||
the network socket.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-5-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit a503bdc22b91869e3bf45522e36b122889465306)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
chardev/char-fd.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/chardev/char-fd.c b/chardev/char-fd.c
|
||||
index d2c4923359..8dd662c066 100644
|
||||
--- a/chardev/char-fd.c
|
||||
+++ b/chardev/char-fd.c
|
||||
@@ -50,7 +50,7 @@ static gboolean fd_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque)
|
||||
Chardev *chr = CHARDEV(opaque);
|
||||
FDChardev *s = FD_CHARDEV(opaque);
|
||||
int len;
|
||||
- uint8_t buf[CHR_READ_BUF_LEN];
|
||||
+ QEMU_UNINITIALIZED uint8_t buf[CHR_READ_BUF_LEN];
|
||||
ssize_t ret;
|
||||
|
||||
len = sizeof(buf);
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,49 @@
|
||||
From 74311b0ee8e211fccff211b975e4ae9236c063dc Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:43 +0100
|
||||
Subject: [PATCH 31/57] chardev/char-pty: skip automatic zero-init of large
|
||||
array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [5/30] a3b8458c30f485551093f292c00c20b0e118df77 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'pty_chr_read' method has a 4k byte array used for copying
|
||||
data between the PTY and device. Skip the automatic zero-init
|
||||
of this array to eliminate the performance overhead in the I/O
|
||||
hot path.
|
||||
|
||||
The 'buf' array will be fully initialized when reading data off
|
||||
the PTY.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-6-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 45bb7fb21c8d18294a9f92da99d01ab3c67c7df2)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
chardev/char-pty.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/chardev/char-pty.c b/chardev/char-pty.c
|
||||
index cc2f7617fe..3319ad215d 100644
|
||||
--- a/chardev/char-pty.c
|
||||
+++ b/chardev/char-pty.c
|
||||
@@ -152,7 +152,7 @@ static gboolean pty_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque)
|
||||
Chardev *chr = CHARDEV(opaque);
|
||||
PtyChardev *s = PTY_CHARDEV(opaque);
|
||||
gsize len;
|
||||
- uint8_t buf[CHR_READ_BUF_LEN];
|
||||
+ QEMU_UNINITIALIZED uint8_t buf[CHR_READ_BUF_LEN];
|
||||
ssize_t ret;
|
||||
|
||||
len = sizeof(buf);
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,49 @@
|
||||
From d56a8ce56f0de70ab2de266a80e25cf309e72fda Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:44 +0100
|
||||
Subject: [PATCH 32/57] chardev/char-socket: skip automatic zero-init of large
|
||||
array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [6/30] 86a2ac03efa1838fb30931c38945ee77de9bbe06 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'tcp_chr_read' method has a 4k byte array used for copying
|
||||
data between the socket and device. Skip the automatic zero-init
|
||||
of this array to eliminate the performance overhead in the I/O
|
||||
hot path.
|
||||
|
||||
The 'buf' array will be fully initialized when reading data off
|
||||
the network socket.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-7-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 9a23075cef1ac6e73a95a489ac72f41c573ceb9b)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
chardev/char-socket.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/chardev/char-socket.c b/chardev/char-socket.c
|
||||
index 1ca9441b1b..99d644e89f 100644
|
||||
--- a/chardev/char-socket.c
|
||||
+++ b/chardev/char-socket.c
|
||||
@@ -497,7 +497,7 @@ static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque)
|
||||
{
|
||||
Chardev *chr = CHARDEV(opaque);
|
||||
SocketChardev *s = SOCKET_CHARDEV(opaque);
|
||||
- uint8_t buf[CHR_READ_BUF_LEN];
|
||||
+ QEMU_UNINITIALIZED uint8_t buf[CHR_READ_BUF_LEN];
|
||||
int len, size;
|
||||
|
||||
if ((s->state != TCP_CHARDEV_STATE_CONNECTED) ||
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,53 @@
|
||||
From 389c3c6b4215c9be3fd784c73af0e9795e796380 Mon Sep 17 00:00:00 2001
|
||||
From: Eric Auger <eric.auger@redhat.com>
|
||||
Date: Tue, 18 Feb 2025 19:25:35 +0100
|
||||
Subject: [PATCH 5/9] docs/devel/reset: Document reset expectations for DMA and
|
||||
IOMMU
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Eric Auger <eric.auger@redhat.com>
|
||||
RH-MergeRequest: 341: Fix vIOMMU reset order
|
||||
RH-Jira: RHEL-7188
|
||||
RH-Acked-by: Peter Xu <peterx@redhat.com>
|
||||
RH-Acked-by: Donald Dutile <None>
|
||||
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
|
||||
RH-Commit: [5/5] be8b9d9e34a2b301430dfa229c6785ab17d3fb16 (eauger1/centos-qemu-kvm)
|
||||
|
||||
To avoid any translation faults, the IOMMUs are expected to be
|
||||
reset after the devices they protect. Document that we expect
|
||||
DMA requests to be stopped during the 'enter' or 'hold' phase
|
||||
while IOMMUs should be reset during the 'exit' phase.
|
||||
|
||||
Signed-off-by: Eric Auger <eric.auger@redhat.com>
|
||||
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
|
||||
Message-Id: <20250218182737.76722-6-eric.auger@redhat.com>
|
||||
Reviewed-by: Peter Xu <peterx@redhat.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
(cherry picked from commit dd6d545e8f2d9a0e8a8c287ec16469f03ef5c198)
|
||||
Signed-off-by: Eric Auger <eric.auger@redhat.com>
|
||||
---
|
||||
docs/devel/reset.rst | 5 +++++
|
||||
1 file changed, 5 insertions(+)
|
||||
|
||||
diff --git a/docs/devel/reset.rst b/docs/devel/reset.rst
|
||||
index 9746a4e8a0..24ab630465 100644
|
||||
--- a/docs/devel/reset.rst
|
||||
+++ b/docs/devel/reset.rst
|
||||
@@ -123,6 +123,11 @@ The *exit* phase is executed only when the last reset operation ends. Therefore
|
||||
the object does not need to care how many of reset controllers it has and how
|
||||
many of them have started a reset.
|
||||
|
||||
+DMA capable devices are expected to cancel all outstanding DMA operations
|
||||
+during either 'enter' or 'hold' phases. IOMMUs are expected to reset during
|
||||
+the 'exit' phase and this sequencing makes sure no outstanding DMA request
|
||||
+will fault.
|
||||
+
|
||||
|
||||
Handling reset in a resettable object
|
||||
-------------------------------------
|
||||
--
|
||||
2.48.1
|
||||
|
||||
42
SOURCES/kvm-file-posix-Define-DM_MPATH_PROBE_PATHS.patch
Normal file
42
SOURCES/kvm-file-posix-Define-DM_MPATH_PROBE_PATHS.patch
Normal file
@ -0,0 +1,42 @@
|
||||
From d565fe385b3c45a41fa8e25942220aff38a04fc3 Mon Sep 17 00:00:00 2001
|
||||
From: Kevin Wolf <kwolf@redhat.com>
|
||||
Date: Tue, 29 Apr 2025 17:05:41 +0200
|
||||
Subject: [PATCH 2/3] file-posix: Define DM_MPATH_PROBE_PATHS
|
||||
|
||||
RH-Author: Kevin Wolf <kwolf@redhat.com>
|
||||
RH-MergeRequest: 372: file-posix: Fix multipath failover with SCSI passthrough [9.7]
|
||||
RH-Jira: RHEL-95408
|
||||
RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
|
||||
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-Commit: [1/2] 7615906833a6bb2b4645fa5cd60d78aa9631cb7c (kmwolf/centos-qemu-kvm)
|
||||
|
||||
While the kernel side isn't merged yet and we're still using old kernel
|
||||
headers, just define DM_MPATH_PROBE_PATHS manually.
|
||||
|
||||
This is a downstream-only patch that can be removed after the next minor
|
||||
release.
|
||||
|
||||
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
|
||||
---
|
||||
block/file-posix.c | 5 +++++
|
||||
1 file changed, 5 insertions(+)
|
||||
|
||||
diff --git a/block/file-posix.c b/block/file-posix.c
|
||||
index 0cb4e922c0..6a5c506549 100644
|
||||
--- a/block/file-posix.c
|
||||
+++ b/block/file-posix.c
|
||||
@@ -134,6 +134,11 @@
|
||||
#define RAW_LOCK_PERM_BASE 100
|
||||
#define RAW_LOCK_SHARED_BASE 200
|
||||
|
||||
+/* TODO Remove this when the kernel side is merged */
|
||||
+#if !defined(DM_MPATH_PROBE_PATHS) && defined(DM_GET_TARGET_VERSION)
|
||||
+#define DM_MPATH_PROBE_PATHS _IO(DM_IOCTL, DM_GET_TARGET_VERSION_CMD + 1)
|
||||
+#endif
|
||||
+
|
||||
typedef struct BDRVRawState {
|
||||
int fd;
|
||||
bool use_lock;
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,46 @@
|
||||
From 3515c6541f71817727a3a8b18ec5252644b51bc0 Mon Sep 17 00:00:00 2001
|
||||
From: Kevin Wolf <kwolf@redhat.com>
|
||||
Date: Tue, 29 Apr 2025 17:56:54 +0200
|
||||
Subject: [PATCH 5/5] file-posix: Fix crash on discard_granularity == 0
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 355: file-posix: probe discard alignment on Linux block devices
|
||||
RH-Jira: RHEL-86032
|
||||
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
|
||||
RH-Acked-by: Eric Blake <eblake@redhat.com>
|
||||
RH-Commit: [3/3] b8139a4c5b19efff1f15c314447a6abb89db0ae7 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
Block devices that don't support discard have a discard_granularity of
|
||||
0. Currently, this results in a division by zero when we try to make
|
||||
sure that it's a multiple of request_alignment. Only try to update
|
||||
bs->bl.pdiscard_alignment when we got a non-zero discard_granularity
|
||||
from sysfs.
|
||||
|
||||
Fixes: f605796aae4 ('file-posix: probe discard alignment on Linux block devices')
|
||||
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Reviewed-by: Eric Blake <eblake@redhat.com>
|
||||
Message-ID: <20250429155654.102735-1-kwolf@redhat.com>
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 71a30d54e6ab1d5c102a8bee2c263414697402ea)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
block/file-posix.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/block/file-posix.c b/block/file-posix.c
|
||||
index 3d5b024459..0cb4e922c0 100644
|
||||
--- a/block/file-posix.c
|
||||
+++ b/block/file-posix.c
|
||||
@@ -1565,7 +1565,7 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
|
||||
int ret;
|
||||
|
||||
ret = hdev_get_pdiscard_alignment(&st, &dalign);
|
||||
- if (ret == 0) {
|
||||
+ if (ret == 0 && dalign != 0) {
|
||||
uint32_t ralign = bs->bl.request_alignment;
|
||||
|
||||
/* Probably never happens, but handle it just in case */
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,215 @@
|
||||
From 95c651ba1177bd88dbd9b52fe2ec8fedadcdb5c8 Mon Sep 17 00:00:00 2001
|
||||
From: Kevin Wolf <kwolf@redhat.com>
|
||||
Date: Thu, 22 May 2025 15:08:03 +0200
|
||||
Subject: [PATCH 3/3] file-posix: Probe paths and retry SG_IO on potential path
|
||||
errors
|
||||
|
||||
RH-Author: Kevin Wolf <kwolf@redhat.com>
|
||||
RH-MergeRequest: 372: file-posix: Fix multipath failover with SCSI passthrough [9.7]
|
||||
RH-Jira: RHEL-95408
|
||||
RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
|
||||
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-Commit: [2/2] 4312e9ec609e511afdfb6634e1d2370032d41543 (kmwolf/centos-qemu-kvm)
|
||||
|
||||
When scsi-block is used on a host multipath device, it runs into the
|
||||
problem that the kernel dm-mpath doesn't know anything about SCSI or
|
||||
SG_IO and therefore can't decide if a SG_IO request returned an error
|
||||
and needs to be retried on a different path. Instead of getting working
|
||||
failover, an error is returned to scsi-block and handled according to
|
||||
the configured error policy. Obviously, this is not what users want,
|
||||
they want working failover.
|
||||
|
||||
QEMU can parse the SG_IO result and determine whether this could have
|
||||
been a path error, but just retrying the same request could just send it
|
||||
to the same failing path again and result in the same error.
|
||||
|
||||
With a kernel that supports the DM_MPATH_PROBE_PATHS ioctl on dm-mpath
|
||||
block devices (queued in the device mapper tree for Linux 6.16), we can
|
||||
tell the kernel to probe all paths and tell us if any usable paths
|
||||
remained. If so, we can now retry the SG_IO ioctl and expect it to be
|
||||
sent to a working path.
|
||||
|
||||
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
|
||||
Message-ID: <20250522130803.34738-1-kwolf@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Reviewed-by: Hanna Czenczek <hreitz@redhat.com>
|
||||
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
|
||||
(cherry picked from commit bf627788ef17721955bfcfba84209a07ae5f54ea)
|
||||
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
|
||||
---
|
||||
block/file-posix.c | 115 ++++++++++++++++++++++++++++++++++++++++++++-
|
||||
1 file changed, 114 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/block/file-posix.c b/block/file-posix.c
|
||||
index 6a5c506549..f17a3f4d10 100644
|
||||
--- a/block/file-posix.c
|
||||
+++ b/block/file-posix.c
|
||||
@@ -41,6 +41,7 @@
|
||||
|
||||
#include "scsi/pr-manager.h"
|
||||
#include "scsi/constants.h"
|
||||
+#include "scsi/utils.h"
|
||||
|
||||
#if defined(__APPLE__) && (__MACH__)
|
||||
#include <sys/ioctl.h>
|
||||
@@ -72,6 +73,7 @@
|
||||
#include <linux/blkzoned.h>
|
||||
#endif
|
||||
#include <linux/cdrom.h>
|
||||
+#include <linux/dm-ioctl.h>
|
||||
#include <linux/fd.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/hdreg.h>
|
||||
@@ -139,6 +141,22 @@
|
||||
#define DM_MPATH_PROBE_PATHS _IO(DM_IOCTL, DM_GET_TARGET_VERSION_CMD + 1)
|
||||
#endif
|
||||
|
||||
+/*
|
||||
+ * Multiple retries are mostly meant for two separate scenarios:
|
||||
+ *
|
||||
+ * - DM_MPATH_PROBE_PATHS returns success, but before SG_IO completes, another
|
||||
+ * path goes down.
|
||||
+ *
|
||||
+ * - DM_MPATH_PROBE_PATHS failed all paths in the current path group, so we have
|
||||
+ * to send another SG_IO to switch to another path group to probe the paths in
|
||||
+ * it.
|
||||
+ *
|
||||
+ * Even if each path is in a separate path group (path_grouping_policy set to
|
||||
+ * failover), it's rare to have more than eight path groups - and even then
|
||||
+ * pretty unlikely that only bad path groups would be chosen in eight retries.
|
||||
+ */
|
||||
+#define SG_IO_MAX_RETRIES 8
|
||||
+
|
||||
typedef struct BDRVRawState {
|
||||
int fd;
|
||||
bool use_lock;
|
||||
@@ -166,6 +184,7 @@ typedef struct BDRVRawState {
|
||||
bool use_linux_aio:1;
|
||||
bool has_laio_fdsync:1;
|
||||
bool use_linux_io_uring:1;
|
||||
+ bool use_mpath:1;
|
||||
int page_cache_inconsistent; /* errno from fdatasync failure */
|
||||
bool has_fallocate;
|
||||
bool needs_alignment;
|
||||
@@ -4248,15 +4267,105 @@ hdev_open_Mac_error:
|
||||
/* Since this does ioctl the device must be already opened */
|
||||
bs->sg = hdev_is_sg(bs);
|
||||
|
||||
+ /* sg devices aren't even block devices and can't use dm-mpath */
|
||||
+ s->use_mpath = !bs->sg;
|
||||
+
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if defined(__linux__)
|
||||
+#if defined(DM_MPATH_PROBE_PATHS)
|
||||
+static bool coroutine_fn sgio_path_error(int ret, sg_io_hdr_t *io_hdr)
|
||||
+{
|
||||
+ if (ret < 0) {
|
||||
+ switch (ret) {
|
||||
+ case -ENODEV:
|
||||
+ return true;
|
||||
+ case -EAGAIN:
|
||||
+ /*
|
||||
+ * The device is probably suspended. This happens while the dm table
|
||||
+ * is reloaded, e.g. because a path is added or removed. This is an
|
||||
+ * operation that should complete within 1ms, so just wait a bit and
|
||||
+ * retry.
|
||||
+ *
|
||||
+ * If the device was suspended for another reason, we'll wait and
|
||||
+ * retry SG_IO_MAX_RETRIES times. This is a tolerable delay before
|
||||
+ * we return an error and potentially stop the VM.
|
||||
+ */
|
||||
+ qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
|
||||
+ return true;
|
||||
+ default:
|
||||
+ return false;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (io_hdr->host_status != SCSI_HOST_OK) {
|
||||
+ return true;
|
||||
+ }
|
||||
+
|
||||
+ switch (io_hdr->status) {
|
||||
+ case GOOD:
|
||||
+ case CONDITION_GOOD:
|
||||
+ case INTERMEDIATE_GOOD:
|
||||
+ case INTERMEDIATE_C_GOOD:
|
||||
+ case RESERVATION_CONFLICT:
|
||||
+ case COMMAND_TERMINATED:
|
||||
+ return false;
|
||||
+ case CHECK_CONDITION:
|
||||
+ return !scsi_sense_buf_is_guest_recoverable(io_hdr->sbp,
|
||||
+ io_hdr->mx_sb_len);
|
||||
+ default:
|
||||
+ return true;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret)
|
||||
+{
|
||||
+ BDRVRawState *s = acb->bs->opaque;
|
||||
+ RawPosixAIOData probe_acb;
|
||||
+
|
||||
+ if (!s->use_mpath) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ if (!sgio_path_error(ret, acb->ioctl.buf)) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ probe_acb = (RawPosixAIOData) {
|
||||
+ .bs = acb->bs,
|
||||
+ .aio_type = QEMU_AIO_IOCTL,
|
||||
+ .aio_fildes = s->fd,
|
||||
+ .aio_offset = 0,
|
||||
+ .ioctl = {
|
||||
+ .buf = NULL,
|
||||
+ .cmd = DM_MPATH_PROBE_PATHS,
|
||||
+ },
|
||||
+ };
|
||||
+
|
||||
+ ret = raw_thread_pool_submit(handle_aiocb_ioctl, &probe_acb);
|
||||
+ if (ret == -ENOTTY) {
|
||||
+ s->use_mpath = false;
|
||||
+ } else if (ret == -EAGAIN) {
|
||||
+ /* The device might be suspended for a table reload, worth retrying */
|
||||
+ return true;
|
||||
+ }
|
||||
+
|
||||
+ return ret == 0;
|
||||
+}
|
||||
+#else
|
||||
+static bool coroutine_fn hdev_co_ioctl_sgio_retry(RawPosixAIOData *acb, int ret)
|
||||
+{
|
||||
+ return false;
|
||||
+}
|
||||
+#endif /* DM_MPATH_PROBE_PATHS */
|
||||
+
|
||||
static int coroutine_fn
|
||||
hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
|
||||
{
|
||||
BDRVRawState *s = bs->opaque;
|
||||
RawPosixAIOData acb;
|
||||
+ int retries = SG_IO_MAX_RETRIES;
|
||||
int ret;
|
||||
|
||||
ret = fd_open(bs);
|
||||
@@ -4284,7 +4393,11 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
|
||||
},
|
||||
};
|
||||
|
||||
- return raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
|
||||
+ do {
|
||||
+ ret = raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
|
||||
+ } while (req == SG_IO && retries-- && hdev_co_ioctl_sgio_retry(&acb, ret));
|
||||
+
|
||||
+ return ret;
|
||||
}
|
||||
#endif /* linux */
|
||||
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,64 @@
|
||||
From 39e0c370357a414abacd64fb6a172e7b25eb4d82 Mon Sep 17 00:00:00 2001
|
||||
From: Eric Blake <eblake@redhat.com>
|
||||
Date: Fri, 9 May 2025 15:40:19 -0500
|
||||
Subject: [PATCH 04/16] file-posix, gluster: Handle zero block status hint
|
||||
better
|
||||
|
||||
RH-Author: Eric Blake <eblake@redhat.com>
|
||||
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
|
||||
RH-Jira: RHEL-82906 RHEL-83015
|
||||
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [2/14] 1f7b47ce5f5fb321aee41a16accf5bce3d1bfe95 (ebblake/centos-qemu-kvm)
|
||||
|
||||
Although the previous patch to change 'bool want_zero' into a bitmask
|
||||
made no semantic change, it is now time to differentiate. When the
|
||||
caller specifically wants to know what parts of the file read as zero,
|
||||
we need to use lseek and actually report holes, rather than
|
||||
short-circuiting and advertising full allocation.
|
||||
|
||||
This change will be utilized in later patches to let mirroring
|
||||
optimize for the case when the destination already reads as zeroes.
|
||||
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-ID: <20250509204341.3553601-17-eblake@redhat.com>
|
||||
(cherry picked from commit a6a0a7fb0e327d17594c971b4a39de14e025b415)
|
||||
Jira: https://issues.redhat.com/browse/RHEL-82906
|
||||
Jira: https://issues.redhat.com/browse/RHEL-83015
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
---
|
||||
block/file-posix.c | 3 ++-
|
||||
block/gluster.c | 2 +-
|
||||
2 files changed, 3 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/block/file-posix.c b/block/file-posix.c
|
||||
index 9ca55620ca..ce5da2b4c2 100644
|
||||
--- a/block/file-posix.c
|
||||
+++ b/block/file-posix.c
|
||||
@@ -3293,7 +3293,8 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
|
||||
return ret;
|
||||
}
|
||||
|
||||
- if (mode != BDRV_WANT_PRECISE) {
|
||||
+ if (!(mode & BDRV_WANT_ZERO)) {
|
||||
+ /* There is no backing file - all bytes are allocated in this file. */
|
||||
*pnum = bytes;
|
||||
*map = offset;
|
||||
*file = bs;
|
||||
diff --git a/block/gluster.c b/block/gluster.c
|
||||
index ae5c45666b..175c70164c 100644
|
||||
--- a/block/gluster.c
|
||||
+++ b/block/gluster.c
|
||||
@@ -1483,7 +1483,7 @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs,
|
||||
return ret;
|
||||
}
|
||||
|
||||
- if (mode != BDRV_WANT_PRECISE) {
|
||||
+ if (!(mode & BDRV_WANT_ZERO)) {
|
||||
*pnum = bytes;
|
||||
*map = offset;
|
||||
*file = bs;
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,131 @@
|
||||
From 29ae77d77cabc3582267cb8a7c4fe10d279a21e6 Mon Sep 17 00:00:00 2001
|
||||
From: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Date: Thu, 17 Apr 2025 11:05:27 -0400
|
||||
Subject: [PATCH 3/5] file-posix: probe discard alignment on Linux block
|
||||
devices
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 355: file-posix: probe discard alignment on Linux block devices
|
||||
RH-Jira: RHEL-86032
|
||||
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
|
||||
RH-Acked-by: Eric Blake <eblake@redhat.com>
|
||||
RH-Commit: [1/3] bb3c17b0da6edeb209874e97d4e2c3b1762a1749 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
Populate the pdiscard_alignment block limit so the block layer is able
|
||||
to align discard requests correctly.
|
||||
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-ID: <20250417150528.76470-2-stefanha@redhat.com>
|
||||
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
|
||||
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
|
||||
(cherry picked from commit f605796aae42885034400c83ed6a9b07cd6d6481)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
block/file-posix.c | 67 +++++++++++++++++++++++++++++++++++++++++++++-
|
||||
1 file changed, 66 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/block/file-posix.c b/block/file-posix.c
|
||||
index ff928b5e85..3d5b024459 100644
|
||||
--- a/block/file-posix.c
|
||||
+++ b/block/file-posix.c
|
||||
@@ -1268,10 +1268,10 @@ static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
|
||||
}
|
||||
#endif /* defined(CONFIG_BLKZONED) */
|
||||
|
||||
+#ifdef CONFIG_LINUX
|
||||
/*
|
||||
* Get a sysfs attribute value as a long integer.
|
||||
*/
|
||||
-#ifdef CONFIG_LINUX
|
||||
static long get_sysfs_long_val(struct stat *st, const char *attribute)
|
||||
{
|
||||
g_autofree char *str = NULL;
|
||||
@@ -1291,6 +1291,30 @@ static long get_sysfs_long_val(struct stat *st, const char *attribute)
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
+
|
||||
+/*
|
||||
+ * Get a sysfs attribute value as a uint32_t.
|
||||
+ */
|
||||
+static int get_sysfs_u32_val(struct stat *st, const char *attribute,
|
||||
+ uint32_t *u32)
|
||||
+{
|
||||
+ g_autofree char *str = NULL;
|
||||
+ const char *end;
|
||||
+ unsigned int val;
|
||||
+ int ret;
|
||||
+
|
||||
+ ret = get_sysfs_str_val(st, attribute, &str);
|
||||
+ if (ret < 0) {
|
||||
+ return ret;
|
||||
+ }
|
||||
+
|
||||
+ /* The file is ended with '\n', pass 'end' to accept that. */
|
||||
+ ret = qemu_strtoui(str, &end, 10, &val);
|
||||
+ if (ret == 0 && end && *end == '\0') {
|
||||
+ *u32 = val;
|
||||
+ }
|
||||
+ return ret;
|
||||
+}
|
||||
#endif
|
||||
|
||||
static int hdev_get_max_segments(int fd, struct stat *st)
|
||||
@@ -1310,6 +1334,23 @@ static int hdev_get_max_segments(int fd, struct stat *st)
|
||||
#endif
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * Fills in *dalign with the discard alignment and returns 0 on success,
|
||||
+ * -errno otherwise.
|
||||
+ */
|
||||
+static int hdev_get_pdiscard_alignment(struct stat *st, uint32_t *dalign)
|
||||
+{
|
||||
+#ifdef CONFIG_LINUX
|
||||
+ /*
|
||||
+ * Note that Linux "discard_granularity" is QEMU "discard_alignment". Linux
|
||||
+ * "discard_alignment" is something else.
|
||||
+ */
|
||||
+ return get_sysfs_u32_val(st, "discard_granularity", dalign);
|
||||
+#else
|
||||
+ return -ENOTSUP;
|
||||
+#endif
|
||||
+}
|
||||
+
|
||||
#if defined(CONFIG_BLKZONED)
|
||||
/*
|
||||
* If the reset_all flag is true, then the wps of zone whose state is
|
||||
@@ -1519,6 +1560,30 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
|
||||
}
|
||||
}
|
||||
|
||||
+ if (S_ISBLK(st.st_mode)) {
|
||||
+ uint32_t dalign = 0;
|
||||
+ int ret;
|
||||
+
|
||||
+ ret = hdev_get_pdiscard_alignment(&st, &dalign);
|
||||
+ if (ret == 0) {
|
||||
+ uint32_t ralign = bs->bl.request_alignment;
|
||||
+
|
||||
+ /* Probably never happens, but handle it just in case */
|
||||
+ if (dalign < ralign && (ralign % dalign == 0)) {
|
||||
+ dalign = ralign;
|
||||
+ }
|
||||
+
|
||||
+ /* The block layer requires a multiple of request_alignment */
|
||||
+ if (dalign % ralign != 0) {
|
||||
+ error_setg(errp, "Invalid pdiscard_alignment limit %u is not a "
|
||||
+ "multiple of request_alignment %u", dalign, ralign);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ bs->bl.pdiscard_alignment = dalign;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
raw_refresh_zoned_limits(bs, &st, errp);
|
||||
}
|
||||
|
||||
--
|
||||
2.48.1
|
||||
|
||||
123
SOURCES/kvm-hw-arm-smmuv3-Move-reset-to-exit-phase.patch
Normal file
123
SOURCES/kvm-hw-arm-smmuv3-Move-reset-to-exit-phase.patch
Normal file
@ -0,0 +1,123 @@
|
||||
From a3dfbe30e930c8d794057e45fffd91a9b0e6afd0 Mon Sep 17 00:00:00 2001
|
||||
From: Eric Auger <eric.auger@redhat.com>
|
||||
Date: Tue, 18 Feb 2025 19:25:33 +0100
|
||||
Subject: [PATCH 3/9] hw/arm/smmuv3: Move reset to exit phase
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Eric Auger <eric.auger@redhat.com>
|
||||
RH-MergeRequest: 341: Fix vIOMMU reset order
|
||||
RH-Jira: RHEL-7188
|
||||
RH-Acked-by: Peter Xu <peterx@redhat.com>
|
||||
RH-Acked-by: Donald Dutile <None>
|
||||
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
|
||||
RH-Commit: [3/5] e291cb45c32e0fab49b200c275553bbe76b97264 (eauger1/centos-qemu-kvm)
|
||||
|
||||
Currently the iommu may be reset before the devices
|
||||
it protects. For example, this happens with virtio-scsi-pci
|
||||
when system_reset is issued from qmp monitor: spurious
|
||||
"virtio: zero sized buffers are not allowed" warnings can
|
||||
be observed. This happens because outstanding DMA requests
|
||||
are still happening while the SMMU gets reset.
|
||||
|
||||
This can also happen with VFIO devices. In that case
|
||||
spurious DMA translation faults can be observed on host.
|
||||
|
||||
Make sure the SMMU is reset in the 'exit' phase after
|
||||
all DMA capable devices have been reset during the 'enter'
|
||||
or 'hold' phase.
|
||||
|
||||
Signed-off-by: Eric Auger <eric.auger@redhat.com>
|
||||
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
|
||||
|
||||
Message-Id: <20250218182737.76722-4-eric.auger@redhat.com>
|
||||
Reviewed-by: Peter Xu <peterx@redhat.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
(cherry picked from commit e39e3f8b8dea856f141e9945167d2b18021ef445)
|
||||
Signed-off-by: Eric Auger <eric.auger@redhat.com>
|
||||
---
|
||||
hw/arm/smmu-common.c | 9 +++++++--
|
||||
hw/arm/smmuv3.c | 14 ++++++++++----
|
||||
hw/arm/trace-events | 1 +
|
||||
3 files changed, 18 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
|
||||
index 3f82728758..f4210fcbc1 100644
|
||||
--- a/hw/arm/smmu-common.c
|
||||
+++ b/hw/arm/smmu-common.c
|
||||
@@ -924,7 +924,12 @@ static void smmu_base_realize(DeviceState *dev, Error **errp)
|
||||
}
|
||||
}
|
||||
|
||||
-static void smmu_base_reset_hold(Object *obj, ResetType type)
|
||||
+/*
|
||||
+ * Make sure the IOMMU is reset in 'exit' phase after
|
||||
+ * all outstanding DMA requests have been quiesced during
|
||||
+ * the 'enter' or 'hold' reset phases
|
||||
+ */
|
||||
+static void smmu_base_reset_exit(Object *obj, ResetType type)
|
||||
{
|
||||
SMMUState *s = ARM_SMMU(obj);
|
||||
|
||||
@@ -950,7 +955,7 @@ static void smmu_base_class_init(ObjectClass *klass, void *data)
|
||||
device_class_set_props(dc, smmu_dev_properties);
|
||||
device_class_set_parent_realize(dc, smmu_base_realize,
|
||||
&sbc->parent_realize);
|
||||
- rc->phases.hold = smmu_base_reset_hold;
|
||||
+ rc->phases.exit = smmu_base_reset_exit;
|
||||
}
|
||||
|
||||
static const TypeInfo smmu_base_info = {
|
||||
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
|
||||
index 3971976389..2e90570915 100644
|
||||
--- a/hw/arm/smmuv3.c
|
||||
+++ b/hw/arm/smmuv3.c
|
||||
@@ -1870,13 +1870,19 @@ static void smmu_init_irq(SMMUv3State *s, SysBusDevice *dev)
|
||||
}
|
||||
}
|
||||
|
||||
-static void smmu_reset_hold(Object *obj, ResetType type)
|
||||
+/*
|
||||
+ * Make sure the IOMMU is reset in 'exit' phase after
|
||||
+ * all outstanding DMA requests have been quiesced during
|
||||
+ * the 'enter' or 'hold' reset phases
|
||||
+ */
|
||||
+static void smmu_reset_exit(Object *obj, ResetType type)
|
||||
{
|
||||
SMMUv3State *s = ARM_SMMUV3(obj);
|
||||
SMMUv3Class *c = ARM_SMMUV3_GET_CLASS(s);
|
||||
|
||||
- if (c->parent_phases.hold) {
|
||||
- c->parent_phases.hold(obj, type);
|
||||
+ trace_smmu_reset_exit();
|
||||
+ if (c->parent_phases.exit) {
|
||||
+ c->parent_phases.exit(obj, type);
|
||||
}
|
||||
|
||||
smmuv3_init_regs(s);
|
||||
@@ -1999,7 +2005,7 @@ static void smmuv3_class_init(ObjectClass *klass, void *data)
|
||||
SMMUv3Class *c = ARM_SMMUV3_CLASS(klass);
|
||||
|
||||
dc->vmsd = &vmstate_smmuv3;
|
||||
- resettable_class_set_parent_phases(rc, NULL, smmu_reset_hold, NULL,
|
||||
+ resettable_class_set_parent_phases(rc, NULL, NULL, smmu_reset_exit,
|
||||
&c->parent_phases);
|
||||
device_class_set_parent_realize(dc, smmu_realize,
|
||||
&c->parent_realize);
|
||||
diff --git a/hw/arm/trace-events b/hw/arm/trace-events
|
||||
index be6c8f720b..79ef347e3e 100644
|
||||
--- a/hw/arm/trace-events
|
||||
+++ b/hw/arm/trace-events
|
||||
@@ -56,6 +56,7 @@ smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid=0x%x"
|
||||
smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s"
|
||||
smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s"
|
||||
smmuv3_inv_notifiers_iova(const char *name, int asid, int vmid, uint64_t iova, uint8_t tg, uint64_t num_pages, int stage) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64" stage=%d"
|
||||
+smmu_reset_exit(void) ""
|
||||
|
||||
# strongarm.c
|
||||
strongarm_uart_update_parameters(const char *label, int speed, char parity, int data_bits, int stop_bits) "%s speed=%d parity=%c data=%d stop=%d"
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,57 @@
|
||||
From 2018f62f2242d8d4a970d83ebef9b3c2bccf6fda Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:45 +0100
|
||||
Subject: [PATCH 33/57] hw/audio/ac97: skip automatic zero-init of large arrays
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [7/30] 4a6b59a9b9122d9f89e99b3e44df19e6d92ed941 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'read_audio' & 'write_audio' methods have a 4k byte array used
|
||||
for copying data between the audio backend and device. Skip the
|
||||
automatic zero-init of these arrays to eliminate the performance
|
||||
overhead in the I/O hot path.
|
||||
|
||||
The 'tmpbuf' array will be fully initialized when reading data from
|
||||
the audio backend and/or device memory.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-8-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 2553d2d26a9d0f46386bf8c37d184567e5cede6c)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/audio/ac97.c | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/hw/audio/ac97.c b/hw/audio/ac97.c
|
||||
index 3f0053f94d..681b5752a1 100644
|
||||
--- a/hw/audio/ac97.c
|
||||
+++ b/hw/audio/ac97.c
|
||||
@@ -886,7 +886,7 @@ static void nabm_writel(void *opaque, uint32_t addr, uint32_t val)
|
||||
static int write_audio(AC97LinkState *s, AC97BusMasterRegs *r,
|
||||
int max, int *stop)
|
||||
{
|
||||
- uint8_t tmpbuf[4096];
|
||||
+ QEMU_UNINITIALIZED uint8_t tmpbuf[4096];
|
||||
uint32_t addr = r->bd.addr;
|
||||
uint32_t temp = r->picb << 1;
|
||||
uint32_t written = 0;
|
||||
@@ -959,7 +959,7 @@ static void write_bup(AC97LinkState *s, int elapsed)
|
||||
static int read_audio(AC97LinkState *s, AC97BusMasterRegs *r,
|
||||
int max, int *stop)
|
||||
{
|
||||
- uint8_t tmpbuf[4096];
|
||||
+ QEMU_UNINITIALIZED uint8_t tmpbuf[4096];
|
||||
uint32_t addr = r->bd.addr;
|
||||
uint32_t temp = r->picb << 1;
|
||||
uint32_t nread = 0;
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,59 @@
|
||||
From bd32bb22fb324a37b31ed9ac3387524f6f4ea5be Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:46 +0100
|
||||
Subject: [PATCH 34/57] hw/audio/cs4231a: skip automatic zero-init of large
|
||||
arrays
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [8/30] 6c454bcc2927e49896c62718287fb9e4b37b3bb9 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'cs_write_audio' method has a pair of byte arrays, one 4k in size
|
||||
and one 8k, which are used in converting audio samples. Skip the
|
||||
automatic zero-init of these arrays to eliminate the performance
|
||||
overhead in the I/O hot path.
|
||||
|
||||
The 'tmpbuf' array will be fully initialized when reading a block of
|
||||
data from the guest. The 'linbuf' array will be fully initialized
|
||||
when converting the audio samples.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-9-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit ca2cc0385d97cea66cd54ee42553f385c403d4a6)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/audio/cs4231a.c | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/hw/audio/cs4231a.c b/hw/audio/cs4231a.c
|
||||
index 9ef57f042d..5c312642cc 100644
|
||||
--- a/hw/audio/cs4231a.c
|
||||
+++ b/hw/audio/cs4231a.c
|
||||
@@ -528,7 +528,7 @@ static int cs_write_audio (CSState *s, int nchan, int dma_pos,
|
||||
int dma_len, int len)
|
||||
{
|
||||
int temp, net;
|
||||
- uint8_t tmpbuf[4096];
|
||||
+ QEMU_UNINITIALIZED uint8_t tmpbuf[4096];
|
||||
IsaDmaClass *k = ISADMA_GET_CLASS(s->isa_dma);
|
||||
|
||||
temp = len;
|
||||
@@ -547,7 +547,7 @@ static int cs_write_audio (CSState *s, int nchan, int dma_pos,
|
||||
copied = k->read_memory(s->isa_dma, nchan, tmpbuf, dma_pos, to_copy);
|
||||
if (s->tab) {
|
||||
int i;
|
||||
- int16_t linbuf[4096];
|
||||
+ QEMU_UNINITIALIZED int16_t linbuf[4096];
|
||||
|
||||
for (i = 0; i < copied; ++i)
|
||||
linbuf[i] = s->tab[tmpbuf[i]];
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,49 @@
|
||||
From cb12ddc6ed836091aa7724e2f77ab79cd9089cad Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:47 +0100
|
||||
Subject: [PATCH 35/57] hw/audio/es1370: skip automatic zero-init of large
|
||||
array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [9/30] b992e4247d8d31dc09f9dc7671e7a532558174ec (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'es1370_transfer_audio' method has a 4k byte array used for
|
||||
copying data between the audio backend and device. Skip the automatic
|
||||
zero-init of this array to eliminate the performance overhead in
|
||||
the I/O hot path.
|
||||
|
||||
The 'tmpbuf' array will be fully initialized when reading data from
|
||||
the audio backend and/or device memory.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-10-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 8236e206084b832d1d7ec947a4798b818f4cdf1f)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/audio/es1370.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/audio/es1370.c b/hw/audio/es1370.c
|
||||
index 4ab61d3b9d..6aea934f54 100644
|
||||
--- a/hw/audio/es1370.c
|
||||
+++ b/hw/audio/es1370.c
|
||||
@@ -604,7 +604,7 @@ static uint64_t es1370_read(void *opaque, hwaddr addr, unsigned size)
|
||||
static void es1370_transfer_audio (ES1370State *s, struct chan *d, int loop_sel,
|
||||
int max, bool *irq)
|
||||
{
|
||||
- uint8_t tmpbuf[4096];
|
||||
+ QEMU_UNINITIALIZED uint8_t tmpbuf[4096];
|
||||
size_t to_transfer;
|
||||
uint32_t addr = d->frame_addr;
|
||||
int sc = d->scount & 0xffff;
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,48 @@
|
||||
From 9ad7091d82fd0577488f27ab54bb7851fe957020 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:48 +0100
|
||||
Subject: [PATCH 36/57] hw/audio/gus: skip automatic zero-init of large array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [10/30] 366953d0417ac31e3060fdc327fe8dade3375bf0 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'GUS_read_DMA' method has a 4k byte array used for copying
|
||||
data between the audio backend and device. Skip the automatic
|
||||
zero-init of this array to eliminate the performance overhead in
|
||||
the I/O hot path.
|
||||
|
||||
The 'tmpbuf' array will be fully initialized when reading data
|
||||
from device memory.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-11-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 2e438da4929018c62609381e1156aac0b2fe3de3)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/audio/gus.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/audio/gus.c b/hw/audio/gus.c
|
||||
index 4beb3fd74e..e8b0b85d44 100644
|
||||
--- a/hw/audio/gus.c
|
||||
+++ b/hw/audio/gus.c
|
||||
@@ -183,7 +183,7 @@ static int GUS_read_DMA (void *opaque, int nchan, int dma_pos, int dma_len)
|
||||
{
|
||||
GUSState *s = opaque;
|
||||
IsaDmaClass *k = ISADMA_GET_CLASS(s->isa_dma);
|
||||
- char tmpbuf[4096];
|
||||
+ QEMU_UNINITIALIZED char tmpbuf[4096];
|
||||
int pos = dma_pos, mode, left = dma_len - dma_pos;
|
||||
|
||||
ldebug ("read DMA %#x %d\n", dma_pos, dma_len);
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,50 @@
|
||||
From 5cf61823cbe80b1ace2f5bdb9cc1971956425b98 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:49 +0100
|
||||
Subject: [PATCH 37/57] hw/audio/marvell_88w8618: skip automatic zero-init of
|
||||
large array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [11/30] e09cdb76430552081168873dadfef1b5c8f74327 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'mv88w8618_audio_callback' method has a 4k byte array used for
|
||||
copying data between the audio backend and device. Skip the automatic
|
||||
zero-init of this array to eliminate the performance overhead in
|
||||
the I/O hot path.
|
||||
|
||||
The 'buf' array will be fully initialized when reading data from
|
||||
device memory.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-12-berrange@redhat.com
|
||||
[Fixed hw/audio/gus in commit message --Stefan]
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 5b6cd5c5df4229972d8a0fd9dd9a089a1644d6ba)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/audio/marvell_88w8618.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/audio/marvell_88w8618.c b/hw/audio/marvell_88w8618.c
|
||||
index cc285444bc..b7b4b27272 100644
|
||||
--- a/hw/audio/marvell_88w8618.c
|
||||
+++ b/hw/audio/marvell_88w8618.c
|
||||
@@ -66,7 +66,7 @@ static void mv88w8618_audio_callback(void *opaque, int free_out, int free_in)
|
||||
{
|
||||
mv88w8618_audio_state *s = opaque;
|
||||
int16_t *codec_buffer;
|
||||
- int8_t buf[4096];
|
||||
+ QEMU_UNINITIALIZED int8_t buf[4096];
|
||||
int8_t *mem_buffer;
|
||||
int pos, block_size;
|
||||
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,48 @@
|
||||
From 0b4d59d75edd49ef99f0a82fbcbe360c5b48e4f8 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:50 +0100
|
||||
Subject: [PATCH 38/57] hw/audio/sb16: skip automatic zero-init of large array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [12/30] 6475d67546bf04745636b317e965bcd89b6fb2d2 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'write_audio' method has a 4k byte array used for copying data
|
||||
between the audio backend and device. Skip the automatic zero-init
|
||||
of this array to eliminate the performance overhead in the I/O hot
|
||||
path.
|
||||
|
||||
The 'tmpbuf' array will be fully initialized when reading data from
|
||||
device memory.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-13-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 30c82f6657c1ee9fbb5473924b4d3273f214bd6f)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/audio/sb16.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/audio/sb16.c b/hw/audio/sb16.c
|
||||
index fd76e78d18..04c818ed3d 100644
|
||||
--- a/hw/audio/sb16.c
|
||||
+++ b/hw/audio/sb16.c
|
||||
@@ -1181,7 +1181,7 @@ static int write_audio (SB16State *s, int nchan, int dma_pos,
|
||||
IsaDma *isa_dma = nchan == s->dma ? s->isa_dma : s->isa_hdma;
|
||||
IsaDmaClass *k = ISADMA_GET_CLASS(isa_dma);
|
||||
int temp, net;
|
||||
- uint8_t tmpbuf[4096];
|
||||
+ QEMU_UNINITIALIZED uint8_t tmpbuf[4096];
|
||||
|
||||
temp = len;
|
||||
net = 0;
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,49 @@
|
||||
From 35332282ef8bd06f59206266006eff222ffe6bec Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:51 +0100
|
||||
Subject: [PATCH 39/57] hw/audio/via-ac97: skip automatic zero-init of large
|
||||
array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [13/30] 6391a04b29fcbb8bcdbce2c6b786758fc34f0d71 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'out_cb' method has a 4k byte array used for copying data
|
||||
between the audio backend and device. Skip the automatic zero-init
|
||||
of this array to eliminate the performance overhead in the I/O hot
|
||||
path.
|
||||
|
||||
The 'tmpbuf' array will be fully initialized when reading data from
|
||||
device memory.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-14-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit bb71d9fe1419f44529c91d1b09464718d157e647)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/audio/via-ac97.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/audio/via-ac97.c b/hw/audio/via-ac97.c
|
||||
index 4c127a1def..e8fcf44e5d 100644
|
||||
--- a/hw/audio/via-ac97.c
|
||||
+++ b/hw/audio/via-ac97.c
|
||||
@@ -175,7 +175,7 @@ static void out_cb(void *opaque, int avail)
|
||||
ViaAC97SGDChannel *c = &s->aur;
|
||||
int temp, to_copy, copied;
|
||||
bool stop = false;
|
||||
- uint8_t tmpbuf[4096];
|
||||
+ QEMU_UNINITIALIZED uint8_t tmpbuf[4096];
|
||||
|
||||
if (c->stat & STAT_PAUSED) {
|
||||
return;
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,49 @@
|
||||
From b0c16a93460c2dfe834a9f439d25dc833dfb7427 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:52 +0100
|
||||
Subject: [PATCH 40/57] hw/char/sclpconsole-lm: skip automatic zero-init of
|
||||
large array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [14/30] 1491e0147a799ec523fa67fd49649722a07299e7 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'process_mdb' method has a 4k byte array used for copying data
|
||||
between the guest and the chardev backend. Skip the automatic zero-init
|
||||
of this array to eliminate the performance overhead in the I/O hot
|
||||
path.
|
||||
|
||||
The 'buffer' array will be selectively initialized when data is converted
|
||||
between EBCDIC and ASCII.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-15-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 8b1dac1ad57082611419b0e2f347acd96115d25f)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/char/sclpconsole-lm.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/char/sclpconsole-lm.c b/hw/char/sclpconsole-lm.c
|
||||
index 7719f438f6..19e64b92f6 100644
|
||||
--- a/hw/char/sclpconsole-lm.c
|
||||
+++ b/hw/char/sclpconsole-lm.c
|
||||
@@ -214,7 +214,7 @@ static int process_mdb(SCLPEvent *event, MDBO *mdbo)
|
||||
{
|
||||
int rc;
|
||||
int len;
|
||||
- uint8_t buffer[SIZE_BUFFER];
|
||||
+ QEMU_UNINITIALIZED uint8_t buffer[SIZE_BUFFER];
|
||||
|
||||
len = be16_to_cpu(mdbo->length);
|
||||
len -= sizeof(mdbo->length) + sizeof(mdbo->type)
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,49 @@
|
||||
From 7b5624efccf55184278c6f4924efc2141df460f0 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:54 +0100
|
||||
Subject: [PATCH 42/57] hw/display/vmware_vga: skip automatic zero-init of
|
||||
large struct
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [16/30] 4aaf459d4356bf28164be742889b9a78d3656703 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'vmsvga_fifo_run' method has a struct which is a little over 20k
|
||||
in size, used for holding image data for cursor changes. Skip the
|
||||
automatic zero-init of this struct to eliminate the performance
|
||||
overhead in the I/O hot path.
|
||||
|
||||
The cursor variable will be fully initialized only when processing
|
||||
a cursor definition message from the guest.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-17-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 7048e70f391df76d009eecca25f8027858f9f304)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/display/vmware_vga.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/display/vmware_vga.c b/hw/display/vmware_vga.c
|
||||
index 3db3ff98f7..69afe98a2f 100644
|
||||
--- a/hw/display/vmware_vga.c
|
||||
+++ b/hw/display/vmware_vga.c
|
||||
@@ -618,7 +618,7 @@ static void vmsvga_fifo_run(struct vmsvga_state_s *s)
|
||||
uint32_t cmd, colour;
|
||||
int args, len, maxloop = 1024;
|
||||
int x, y, dx, dy, width, height;
|
||||
- struct vmsvga_cursor_definition_s cursor;
|
||||
+ QEMU_UNINITIALIZED struct vmsvga_cursor_definition_s cursor;
|
||||
uint32_t cmd_start;
|
||||
|
||||
len = vmsvga_fifo_length(s);
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,47 @@
|
||||
From cd3500c9e248dbefb36273046e6eee44ee0d5cbe Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:53 +0100
|
||||
Subject: [PATCH 41/57] hw/dma/xlnx_csu_dma: skip automatic zero-init of large
|
||||
array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [15/30] 063c88269c7d3bf07ae05aaf2d3d154e2016db81 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'xlnx_csu_dma_src_notify' method has a 4k byte array used for
|
||||
copying DMA data. Skip the automatic zero-init of this array to
|
||||
eliminate the performance overhead in the I/O hot path.
|
||||
|
||||
The 'buf' array will be fully initialized when data is copied.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-16-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit ce14f24611aa0469b464a9512e192b4fd51dca2b)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/dma/xlnx_csu_dma.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/dma/xlnx_csu_dma.c b/hw/dma/xlnx_csu_dma.c
|
||||
index ae307482f2..9d1cccc5ca 100644
|
||||
--- a/hw/dma/xlnx_csu_dma.c
|
||||
+++ b/hw/dma/xlnx_csu_dma.c
|
||||
@@ -287,7 +287,7 @@ static uint32_t xlnx_csu_dma_advance(XlnxCSUDMA *s, uint32_t len)
|
||||
static void xlnx_csu_dma_src_notify(void *opaque)
|
||||
{
|
||||
XlnxCSUDMA *s = XLNX_CSU_DMA(opaque);
|
||||
- unsigned char buf[4 * 1024];
|
||||
+ QEMU_UNINITIALIZED unsigned char buf[4 * 1024];
|
||||
size_t rlen = 0;
|
||||
|
||||
ptimer_transaction_begin(s->src_timer);
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,56 @@
|
||||
From a4673aab85958c60867b12c65cc3483d734bb6e0 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:55 +0100
|
||||
Subject: [PATCH 43/57] hw/hyperv/syndbg: skip automatic zero-init of large
|
||||
array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [17/30] 5f71779c431128601baf46115fe65178532a3836 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'handle_recv_msg' method has a 4k byte array used for copying
|
||||
data between the network socket and guest memory. Skip the automatic
|
||||
zero-init of this array to eliminate the performance overhead in the
|
||||
I/O hot path.
|
||||
|
||||
The 'data_buf' array will be fully initialized when data is read
|
||||
off the network socket.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-18-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 5a1f614d0cd0bcc8e84e0b7ab6af63d56bd348a2)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
|
||||
Conflicts:
|
||||
hw/hyperv/syndbg.c
|
||||
|
||||
Context conflict due to missing commit 3efb9d226221
|
||||
("hw/hyperv/syndbg: common compilation unit") downstream. There is no
|
||||
need to backport the commit because it's not a bug fix.
|
||||
---
|
||||
hw/hyperv/syndbg.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/hyperv/syndbg.c b/hw/hyperv/syndbg.c
|
||||
index 065e12fb1e..c7c43c8009 100644
|
||||
--- a/hw/hyperv/syndbg.c
|
||||
+++ b/hw/hyperv/syndbg.c
|
||||
@@ -188,7 +188,7 @@ static uint16_t handle_recv_msg(HvSynDbg *syndbg, uint64_t outgpa,
|
||||
uint64_t timeout, uint32_t *retrieved_count)
|
||||
{
|
||||
uint16_t ret;
|
||||
- uint8_t data_buf[TARGET_PAGE_SIZE - UDP_PKT_HEADER_SIZE];
|
||||
+ QEMU_UNINITIALIZED uint8_t data_buf[TARGET_PAGE_SIZE - UDP_PKT_HEADER_SIZE];
|
||||
hwaddr out_len;
|
||||
void *out_data;
|
||||
ssize_t recv_byte_count;
|
||||
--
|
||||
2.39.3
|
||||
|
||||
87
SOURCES/kvm-hw-i386-Fix-machine-type-compatibility.patch
Normal file
87
SOURCES/kvm-hw-i386-Fix-machine-type-compatibility.patch
Normal file
@ -0,0 +1,87 @@
|
||||
From 2bb5dff02fb393530a12f4f00219cd2f90cd442a Mon Sep 17 00:00:00 2001
|
||||
From: Sebastian Ott <sebott@redhat.com>
|
||||
Date: Thu, 15 May 2025 18:45:51 +0200
|
||||
Subject: [PATCH 3/5] hw/i386: Fix machine type compatibility
|
||||
|
||||
RH-Author: Sebastian Ott <sebott@redhat.com>
|
||||
RH-MergeRequest: 364: hw/i386: Fix machine type compatibility
|
||||
RH-Jira: RHEL-91307
|
||||
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [1/1] 44ddbcb3af119c65e99018d7ed90887f3948907e (seott1/cos-qemu-kvm)
|
||||
|
||||
Upstream Status: RHEL only
|
||||
|
||||
Ensure compatibility of rhel specific i440fx and q35 machine types.
|
||||
Pick up missing bits from pc_compat_9_0 upstream.
|
||||
|
||||
Signed-off-by: Sebastian Ott <sebott@redhat.com>
|
||||
---
|
||||
hw/i386/pc.c | 8 ++++++++
|
||||
hw/i386/pc_piix.c | 2 ++
|
||||
hw/i386/pc_q35.c | 2 ++
|
||||
include/hw/i386/pc.h | 3 +++
|
||||
4 files changed, 15 insertions(+)
|
||||
|
||||
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
|
||||
index fa9f16cbaf..5237538640 100644
|
||||
--- a/hw/i386/pc.c
|
||||
+++ b/hw/i386/pc.c
|
||||
@@ -298,6 +298,14 @@ GlobalProperty pc_rhel_compat[] = {
|
||||
};
|
||||
const size_t pc_rhel_compat_len = G_N_ELEMENTS(pc_rhel_compat);
|
||||
|
||||
+GlobalProperty pc_rhel_9_6_compat[] = {
|
||||
+ /* pc_rhel_9_6_compat from pc_compat_9_0 */
|
||||
+ { TYPE_X86_CPU, "x-amd-topoext-features-only", "false" },
|
||||
+ { TYPE_X86_CPU, "x-l1-cache-per-thread", "false" },
|
||||
+ { TYPE_X86_CPU, "legacy-multi-node", "on" },
|
||||
+};
|
||||
+const size_t pc_rhel_9_6_compat_len = G_N_ELEMENTS(pc_rhel_9_6_compat);
|
||||
+
|
||||
GlobalProperty pc_rhel_9_5_compat[] = {
|
||||
/* pc_rhel_9_5_compat from pc_compat_pc_9_0 (backported from 9.1) */
|
||||
{ TYPE_X86_CPU, "guest-phys-bits", "0" },
|
||||
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
|
||||
index 10764bf596..0687317db5 100644
|
||||
--- a/hw/i386/pc_piix.c
|
||||
+++ b/hw/i386/pc_piix.c
|
||||
@@ -885,6 +885,8 @@ static void pc_i440fx_rhel_machine_7_6_0_options(MachineClass *m)
|
||||
|
||||
compat_props_add(m->compat_props, hw_compat_rhel_9_6,
|
||||
hw_compat_rhel_9_6_len);
|
||||
+ compat_props_add(m->compat_props, pc_rhel_9_6_compat,
|
||||
+ pc_rhel_9_6_compat_len);
|
||||
compat_props_add(m->compat_props, pc_rhel_9_5_compat,
|
||||
pc_rhel_9_5_compat_len);
|
||||
compat_props_add(m->compat_props, hw_compat_rhel_9_5,
|
||||
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
|
||||
index 5bf08be0fb..871c760aea 100644
|
||||
--- a/hw/i386/pc_q35.c
|
||||
+++ b/hw/i386/pc_q35.c
|
||||
@@ -704,6 +704,8 @@ static void pc_q35_rhel_machine_9_4_0_options(MachineClass *m)
|
||||
|
||||
compat_props_add(m->compat_props, hw_compat_rhel_9_6,
|
||||
hw_compat_rhel_9_6_len);
|
||||
+ compat_props_add(m->compat_props, pc_rhel_9_6_compat,
|
||||
+ pc_rhel_9_6_compat_len);
|
||||
compat_props_add(m->compat_props, pc_rhel_9_5_compat,
|
||||
pc_rhel_9_5_compat_len);
|
||||
compat_props_add(m->compat_props, hw_compat_rhel_9_5,
|
||||
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
|
||||
index 75c9271cdd..2b7c18f2b0 100644
|
||||
--- a/include/hw/i386/pc.h
|
||||
+++ b/include/hw/i386/pc.h
|
||||
@@ -305,6 +305,9 @@ extern const size_t pc_compat_2_3_len;
|
||||
extern GlobalProperty pc_rhel_compat[];
|
||||
extern const size_t pc_rhel_compat_len;
|
||||
|
||||
+extern GlobalProperty pc_rhel_9_6_compat[];
|
||||
+extern const size_t pc_rhel_9_6_compat_len;
|
||||
+
|
||||
extern GlobalProperty pc_rhel_9_5_compat[];
|
||||
extern const size_t pc_rhel_9_5_compat_len;
|
||||
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,117 @@
|
||||
From f1ff9d3b379697a2d4627e9529067195841d86a8 Mon Sep 17 00:00:00 2001
|
||||
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
|
||||
Date: Sun, 4 May 2025 17:04:05 +0000
|
||||
Subject: [PATCH 25/57] hw/i386/amd_iommu: Allow migration when explicitly
|
||||
create the AMDVI-PCI device
|
||||
|
||||
RH-Author: John Allen <None>
|
||||
RH-MergeRequest: 380: Add ability to manually specify the AMDVI-PCI device
|
||||
RH-Jira: RHEL-70925
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [2/3] a42b88116e608a79b6fae13ebe3709874f2a853f (johnalle/qemu-kvm-fork)
|
||||
|
||||
Add migration support for AMD IOMMU model by saving necessary AMDVIState
|
||||
parameters for MMIO registers, device table, command buffer, and event
|
||||
buffers.
|
||||
|
||||
Also change devtab_len type from size_t to uint64_t to avoid 32-bit build
|
||||
issue.
|
||||
|
||||
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
|
||||
Message-Id: <20250504170405.12623-3-suravee.suthikulpanit@amd.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
(cherry picked from commit 28931c2e1591deb4bfaaf744fdc8813e96c230f1)
|
||||
|
||||
JIRA: https://issues.redhat.com/browse/RHEL-70925
|
||||
|
||||
Signed-off-by: John Allen <johnalle@redhat.com>
|
||||
---
|
||||
hw/i386/amd_iommu.c | 48 +++++++++++++++++++++++++++++++++++++++++++++
|
||||
hw/i386/amd_iommu.h | 2 +-
|
||||
2 files changed, 49 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
|
||||
index 6a5e76cfef..a34e0c5f59 100644
|
||||
--- a/hw/i386/amd_iommu.c
|
||||
+++ b/hw/i386/amd_iommu.c
|
||||
@@ -1611,8 +1611,55 @@ static void amdvi_sysbus_reset(DeviceState *dev)
|
||||
amdvi_init(s);
|
||||
}
|
||||
|
||||
+static const VMStateDescription vmstate_amdvi_sysbus_migratable = {
|
||||
+ .name = "amd-iommu",
|
||||
+ .version_id = 1,
|
||||
+ .minimum_version_id = 1,
|
||||
+ .priority = MIG_PRI_IOMMU,
|
||||
+ .fields = (VMStateField[]) {
|
||||
+ /* Updated in amdvi_handle_control_write() */
|
||||
+ VMSTATE_BOOL(enabled, AMDVIState),
|
||||
+ VMSTATE_BOOL(ga_enabled, AMDVIState),
|
||||
+ VMSTATE_BOOL(ats_enabled, AMDVIState),
|
||||
+ VMSTATE_BOOL(cmdbuf_enabled, AMDVIState),
|
||||
+ VMSTATE_BOOL(completion_wait_intr, AMDVIState),
|
||||
+ VMSTATE_BOOL(evtlog_enabled, AMDVIState),
|
||||
+ VMSTATE_BOOL(evtlog_intr, AMDVIState),
|
||||
+ /* Updated in amdvi_handle_devtab_write() */
|
||||
+ VMSTATE_UINT64(devtab, AMDVIState),
|
||||
+ VMSTATE_UINT64(devtab_len, AMDVIState),
|
||||
+ /* Updated in amdvi_handle_cmdbase_write() */
|
||||
+ VMSTATE_UINT64(cmdbuf, AMDVIState),
|
||||
+ VMSTATE_UINT64(cmdbuf_len, AMDVIState),
|
||||
+ /* Updated in amdvi_handle_cmdhead_write() */
|
||||
+ VMSTATE_UINT32(cmdbuf_head, AMDVIState),
|
||||
+ /* Updated in amdvi_handle_cmdtail_write() */
|
||||
+ VMSTATE_UINT32(cmdbuf_tail, AMDVIState),
|
||||
+ /* Updated in amdvi_handle_evtbase_write() */
|
||||
+ VMSTATE_UINT64(evtlog, AMDVIState),
|
||||
+ VMSTATE_UINT32(evtlog_len, AMDVIState),
|
||||
+ /* Updated in amdvi_handle_evthead_write() */
|
||||
+ VMSTATE_UINT32(evtlog_head, AMDVIState),
|
||||
+ /* Updated in amdvi_handle_evttail_write() */
|
||||
+ VMSTATE_UINT32(evtlog_tail, AMDVIState),
|
||||
+ /* Updated in amdvi_handle_pprbase_write() */
|
||||
+ VMSTATE_UINT64(ppr_log, AMDVIState),
|
||||
+ VMSTATE_UINT32(pprlog_len, AMDVIState),
|
||||
+ /* Updated in amdvi_handle_pprhead_write() */
|
||||
+ VMSTATE_UINT32(pprlog_head, AMDVIState),
|
||||
+ /* Updated in amdvi_handle_tailhead_write() */
|
||||
+ VMSTATE_UINT32(pprlog_tail, AMDVIState),
|
||||
+ /* MMIO registers */
|
||||
+ VMSTATE_UINT8_ARRAY(mmior, AMDVIState, AMDVI_MMIO_SIZE),
|
||||
+ VMSTATE_UINT8_ARRAY(romask, AMDVIState, AMDVI_MMIO_SIZE),
|
||||
+ VMSTATE_UINT8_ARRAY(w1cmask, AMDVIState, AMDVI_MMIO_SIZE),
|
||||
+ VMSTATE_END_OF_LIST()
|
||||
+ }
|
||||
+};
|
||||
+
|
||||
static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
|
||||
{
|
||||
+ DeviceClass *dc = (DeviceClass *) object_get_class(OBJECT(dev));
|
||||
AMDVIState *s = AMD_IOMMU_DEVICE(dev);
|
||||
MachineState *ms = MACHINE(qdev_get_machine());
|
||||
PCMachineState *pcms = PC_MACHINE(ms);
|
||||
@@ -1634,6 +1681,7 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
|
||||
}
|
||||
|
||||
s->pci = AMD_IOMMU_PCI(pdev);
|
||||
+ dc->vmsd = &vmstate_amdvi_sysbus_migratable;
|
||||
} else {
|
||||
s->pci = AMD_IOMMU_PCI(object_new(TYPE_AMD_IOMMU_PCI));
|
||||
/* This device should take care of IOMMU PCI properties */
|
||||
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
|
||||
index ece71ff0b6..741dd9a910 100644
|
||||
--- a/hw/i386/amd_iommu.h
|
||||
+++ b/hw/i386/amd_iommu.h
|
||||
@@ -329,7 +329,7 @@ struct AMDVIState {
|
||||
bool excl_enabled;
|
||||
|
||||
hwaddr devtab; /* base address device table */
|
||||
- size_t devtab_len; /* device table length */
|
||||
+ uint64_t devtab_len; /* device table length */
|
||||
|
||||
hwaddr cmdbuf; /* command buffer base address */
|
||||
uint64_t cmdbuf_len; /* command buffer length */
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,57 @@
|
||||
From e611119b8b4e0712ab103628051d69ea84538719 Mon Sep 17 00:00:00 2001
|
||||
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
|
||||
Date: Tue, 25 Mar 2025 02:11:40 +0000
|
||||
Subject: [PATCH 23/57] hw/i386/amd_iommu: Assign pci-id 0x1419 for the AMD
|
||||
IOMMU device
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: John Allen <None>
|
||||
RH-MergeRequest: 379: hw/i386/amd_iommu: Assign pci-id 0x1419 for the AMD IOMMU device
|
||||
RH-Jira: RHEL-70926
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [1/1] 69d847f64543caf328da3e7663e7d2ebe53cd448 (johnalle/qemu-kvm-fork)
|
||||
|
||||
Currently, the QEMU-emulated AMD IOMMU device uses PCI vendor id 0x1022
|
||||
(AMD) with device id zero (undefined). Even though this does not cause any
|
||||
functional issue for AMD IOMMU driver since it normally uses information
|
||||
in the ACPI IVRS table to probe and initialize the device per
|
||||
recommendation in the AMD IOMMU specification, the device id zero causes
|
||||
the Windows Device Manager utility to show the device as an unknown device.
|
||||
|
||||
Since Windows only recognizes AMD IOMMU device with device id 0x1419 as
|
||||
listed in the machine.inf file, modify the QEMU AMD IOMMU model to use
|
||||
the id 0x1419 to avoid the issue. This advertises the IOMMU as the AMD
|
||||
IOMMU device for Family 15h (Models 10h-1fh).
|
||||
|
||||
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
|
||||
Message-Id: <20250325021140.5676-1-suravee.suthikulpanit@amd.com>
|
||||
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Yan Vugenfirer <yvugenfi@redhat.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
(cherry picked from commit 719255486df2fcbe1b8599786b37f4bb80272f1a)
|
||||
|
||||
JIRA: https://issues.redhat.com/browse/RHEL-70926
|
||||
|
||||
Signed-off-by: John Allen <johnalle@redhat.com>
|
||||
---
|
||||
hw/i386/amd_iommu.c | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
|
||||
index d804656ea8..59e1a01b7c 100644
|
||||
--- a/hw/i386/amd_iommu.c
|
||||
+++ b/hw/i386/amd_iommu.c
|
||||
@@ -1714,6 +1714,7 @@ static void amdvi_pci_class_init(ObjectClass *klass, void *data)
|
||||
PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
|
||||
|
||||
k->vendor_id = PCI_VENDOR_ID_AMD;
|
||||
+ k->device_id = 0x1419;
|
||||
k->class_id = 0x0806;
|
||||
k->realize = amdvi_pci_realize;
|
||||
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,267 @@
|
||||
From 5a697d0f66360acca8216f49c06dc9702231d470 Mon Sep 17 00:00:00 2001
|
||||
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
|
||||
Date: Sun, 4 May 2025 17:04:04 +0000
|
||||
Subject: [PATCH 24/57] hw/i386/amd_iommu: Isolate AMDVI-PCI from amd-iommu
|
||||
device to allow full control over the PCI device creation
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: John Allen <None>
|
||||
RH-MergeRequest: 380: Add ability to manually specify the AMDVI-PCI device
|
||||
RH-Jira: RHEL-70925
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [1/3] 58254a72ba2d810b57c610462494f76691126521 (johnalle/qemu-kvm-fork)
|
||||
|
||||
Current amd-iommu model internally creates an AMDVI-PCI device. Here is
|
||||
a snippet from info qtree:
|
||||
|
||||
bus: main-system-bus
|
||||
type System
|
||||
dev: amd-iommu, id ""
|
||||
xtsup = false
|
||||
pci-id = ""
|
||||
intremap = "on"
|
||||
device-iotlb = false
|
||||
pt = true
|
||||
...
|
||||
dev: q35-pcihost, id ""
|
||||
MCFG = -1 (0xffffffffffffffff)
|
||||
pci-hole64-size = 34359738368 (32 GiB)
|
||||
below-4g-mem-size = 134217728 (128 MiB)
|
||||
above-4g-mem-size = 0 (0 B)
|
||||
smm-ranges = true
|
||||
x-pci-hole64-fix = true
|
||||
x-config-reg-migration-enabled = true
|
||||
bypass-iommu = false
|
||||
bus: pcie.0
|
||||
type PCIE
|
||||
dev: AMDVI-PCI, id ""
|
||||
addr = 01.0
|
||||
romfile = ""
|
||||
romsize = 4294967295 (0xffffffff)
|
||||
rombar = -1 (0xffffffffffffffff)
|
||||
multifunction = false
|
||||
x-pcie-lnksta-dllla = true
|
||||
x-pcie-extcap-init = true
|
||||
failover_pair_id = ""
|
||||
acpi-index = 0 (0x0)
|
||||
x-pcie-err-unc-mask = true
|
||||
x-pcie-ari-nextfn-1 = false
|
||||
x-max-bounce-buffer-size = 4096 (4 KiB)
|
||||
x-pcie-ext-tag = true
|
||||
busnr = 0 (0x0)
|
||||
class Class 0806, addr 00:01.0, pci id 1022:0000 (sub 1af4:1100)
|
||||
...
|
||||
|
||||
This prohibits users from specifying the PCI topology for the amd-iommu device,
|
||||
which becomes a problem when trying to support VM migration since it does not
|
||||
guarantee the same enumeration of AMD IOMMU device.
|
||||
|
||||
Therefore, allow the 'AMDVI-PCI' device to optionally be pre-created and
|
||||
associated with an 'amd-iommu' device via a new 'pci-id' parameter on the
|
||||
latter.
|
||||
|
||||
For example:
|
||||
-device AMDVI-PCI,id=iommupci0,bus=pcie.0,addr=0x05 \
|
||||
-device amd-iommu,intremap=on,pt=on,xtsup=on,pci-id=iommupci0 \
|
||||
|
||||
For backward-compatibility, internally create the AMDVI-PCI device if not
|
||||
specified on the CLI.
|
||||
|
||||
Co-developed-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
|
||||
Message-Id: <20250504170405.12623-2-suravee.suthikulpanit@amd.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
(cherry picked from commit f864a3235ea1d1d714b3cde2d9a810ea6344a7b5)
|
||||
|
||||
JIRA: https://issues.redhat.com/browse/RHEL-70925
|
||||
|
||||
Signed-off-by: John Allen <johnalle@redhat.com>
|
||||
---
|
||||
hw/i386/acpi-build.c | 8 +++----
|
||||
hw/i386/amd_iommu.c | 53 ++++++++++++++++++++++++++------------------
|
||||
hw/i386/amd_iommu.h | 3 ++-
|
||||
3 files changed, 38 insertions(+), 26 deletions(-)
|
||||
|
||||
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
|
||||
index 032fb1f904..236261f8aa 100644
|
||||
--- a/hw/i386/acpi-build.c
|
||||
+++ b/hw/i386/acpi-build.c
|
||||
@@ -2392,10 +2392,10 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id,
|
||||
build_append_int_noprefix(table_data, ivhd_blob->len + 24, 2);
|
||||
/* DeviceID */
|
||||
build_append_int_noprefix(table_data,
|
||||
- object_property_get_int(OBJECT(&s->pci), "addr",
|
||||
+ object_property_get_int(OBJECT(s->pci), "addr",
|
||||
&error_abort), 2);
|
||||
/* Capability offset */
|
||||
- build_append_int_noprefix(table_data, s->pci.capab_offset, 2);
|
||||
+ build_append_int_noprefix(table_data, s->pci->capab_offset, 2);
|
||||
/* IOMMU base address */
|
||||
build_append_int_noprefix(table_data, s->mr_mmio.addr, 8);
|
||||
/* PCI Segment Group */
|
||||
@@ -2427,10 +2427,10 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id,
|
||||
build_append_int_noprefix(table_data, ivhd_blob->len + 40, 2);
|
||||
/* DeviceID */
|
||||
build_append_int_noprefix(table_data,
|
||||
- object_property_get_int(OBJECT(&s->pci), "addr",
|
||||
+ object_property_get_int(OBJECT(s->pci), "addr",
|
||||
&error_abort), 2);
|
||||
/* Capability offset */
|
||||
- build_append_int_noprefix(table_data, s->pci.capab_offset, 2);
|
||||
+ build_append_int_noprefix(table_data, s->pci->capab_offset, 2);
|
||||
/* IOMMU base address */
|
||||
build_append_int_noprefix(table_data, s->mr_mmio.addr, 8);
|
||||
/* PCI Segment Group */
|
||||
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
|
||||
index 59e1a01b7c..6a5e76cfef 100644
|
||||
--- a/hw/i386/amd_iommu.c
|
||||
+++ b/hw/i386/amd_iommu.c
|
||||
@@ -167,11 +167,11 @@ static void amdvi_generate_msi_interrupt(AMDVIState *s)
|
||||
{
|
||||
MSIMessage msg = {};
|
||||
MemTxAttrs attrs = {
|
||||
- .requester_id = pci_requester_id(&s->pci.dev)
|
||||
+ .requester_id = pci_requester_id(&s->pci->dev)
|
||||
};
|
||||
|
||||
- if (msi_enabled(&s->pci.dev)) {
|
||||
- msg = msi_get_message(&s->pci.dev, 0);
|
||||
+ if (msi_enabled(&s->pci->dev)) {
|
||||
+ msg = msi_get_message(&s->pci->dev, 0);
|
||||
address_space_stl_le(&address_space_memory, msg.address, msg.data,
|
||||
attrs, NULL);
|
||||
}
|
||||
@@ -239,7 +239,7 @@ static void amdvi_page_fault(AMDVIState *s, uint16_t devid,
|
||||
info |= AMDVI_EVENT_IOPF_I | AMDVI_EVENT_IOPF;
|
||||
amdvi_encode_event(evt, devid, addr, info);
|
||||
amdvi_log_event(s, evt);
|
||||
- pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
|
||||
+ pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS,
|
||||
PCI_STATUS_SIG_TARGET_ABORT);
|
||||
}
|
||||
/*
|
||||
@@ -256,7 +256,7 @@ static void amdvi_log_devtab_error(AMDVIState *s, uint16_t devid,
|
||||
|
||||
amdvi_encode_event(evt, devid, devtab, info);
|
||||
amdvi_log_event(s, evt);
|
||||
- pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
|
||||
+ pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS,
|
||||
PCI_STATUS_SIG_TARGET_ABORT);
|
||||
}
|
||||
/* log an event trying to access command buffer
|
||||
@@ -269,7 +269,7 @@ static void amdvi_log_command_error(AMDVIState *s, hwaddr addr)
|
||||
|
||||
amdvi_encode_event(evt, 0, addr, info);
|
||||
amdvi_log_event(s, evt);
|
||||
- pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
|
||||
+ pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS,
|
||||
PCI_STATUS_SIG_TARGET_ABORT);
|
||||
}
|
||||
/* log an illegal command event
|
||||
@@ -310,7 +310,7 @@ static void amdvi_log_pagetab_error(AMDVIState *s, uint16_t devid,
|
||||
info |= AMDVI_EVENT_PAGE_TAB_HW_ERROR;
|
||||
amdvi_encode_event(evt, devid, addr, info);
|
||||
amdvi_log_event(s, evt);
|
||||
- pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
|
||||
+ pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS,
|
||||
PCI_STATUS_SIG_TARGET_ABORT);
|
||||
}
|
||||
|
||||
@@ -1607,7 +1607,7 @@ static void amdvi_sysbus_reset(DeviceState *dev)
|
||||
{
|
||||
AMDVIState *s = AMD_IOMMU_DEVICE(dev);
|
||||
|
||||
- msi_reset(&s->pci.dev);
|
||||
+ msi_reset(&s->pci->dev);
|
||||
amdvi_init(s);
|
||||
}
|
||||
|
||||
@@ -1619,14 +1619,32 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
|
||||
X86MachineState *x86ms = X86_MACHINE(ms);
|
||||
PCIBus *bus = pcms->pcibus;
|
||||
|
||||
- s->iotlb = g_hash_table_new_full(amdvi_uint64_hash,
|
||||
- amdvi_uint64_equal, g_free, g_free);
|
||||
+ if (s->pci_id) {
|
||||
+ PCIDevice *pdev = NULL;
|
||||
+ int ret = pci_qdev_find_device(s->pci_id, &pdev);
|
||||
|
||||
- /* This device should take care of IOMMU PCI properties */
|
||||
- if (!qdev_realize(DEVICE(&s->pci), &bus->qbus, errp)) {
|
||||
- return;
|
||||
+ if (ret) {
|
||||
+ error_report("Cannot find PCI device '%s'", s->pci_id);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ if (!object_dynamic_cast(OBJECT(pdev), TYPE_AMD_IOMMU_PCI)) {
|
||||
+ error_report("Device '%s' must be an AMDVI-PCI device type", s->pci_id);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ s->pci = AMD_IOMMU_PCI(pdev);
|
||||
+ } else {
|
||||
+ s->pci = AMD_IOMMU_PCI(object_new(TYPE_AMD_IOMMU_PCI));
|
||||
+ /* This device should take care of IOMMU PCI properties */
|
||||
+ if (!qdev_realize(DEVICE(s->pci), &bus->qbus, errp)) {
|
||||
+ return;
|
||||
+ }
|
||||
}
|
||||
|
||||
+ s->iotlb = g_hash_table_new_full(amdvi_uint64_hash,
|
||||
+ amdvi_uint64_equal, g_free, g_free);
|
||||
+
|
||||
/* Pseudo address space under root PCI bus. */
|
||||
x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID);
|
||||
|
||||
@@ -1668,6 +1686,7 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
|
||||
|
||||
static Property amdvi_properties[] = {
|
||||
DEFINE_PROP_BOOL("xtsup", AMDVIState, xtsup, false),
|
||||
+ DEFINE_PROP_STRING("pci-id", AMDVIState, pci_id),
|
||||
DEFINE_PROP_END_OF_LIST(),
|
||||
};
|
||||
|
||||
@@ -1676,13 +1695,6 @@ static const VMStateDescription vmstate_amdvi_sysbus = {
|
||||
.unmigratable = 1
|
||||
};
|
||||
|
||||
-static void amdvi_sysbus_instance_init(Object *klass)
|
||||
-{
|
||||
- AMDVIState *s = AMD_IOMMU_DEVICE(klass);
|
||||
-
|
||||
- object_initialize(&s->pci, sizeof(s->pci), TYPE_AMD_IOMMU_PCI);
|
||||
-}
|
||||
-
|
||||
static void amdvi_sysbus_class_init(ObjectClass *klass, void *data)
|
||||
{
|
||||
DeviceClass *dc = DEVICE_CLASS(klass);
|
||||
@@ -1704,7 +1716,6 @@ static const TypeInfo amdvi_sysbus = {
|
||||
.name = TYPE_AMD_IOMMU_DEVICE,
|
||||
.parent = TYPE_X86_IOMMU_DEVICE,
|
||||
.instance_size = sizeof(AMDVIState),
|
||||
- .instance_init = amdvi_sysbus_instance_init,
|
||||
.class_init = amdvi_sysbus_class_init
|
||||
};
|
||||
|
||||
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
|
||||
index e0dac4d9a9..ece71ff0b6 100644
|
||||
--- a/hw/i386/amd_iommu.h
|
||||
+++ b/hw/i386/amd_iommu.h
|
||||
@@ -315,7 +315,8 @@ struct AMDVIPCIState {
|
||||
|
||||
struct AMDVIState {
|
||||
X86IOMMUState iommu; /* IOMMU bus device */
|
||||
- AMDVIPCIState pci; /* IOMMU PCI device */
|
||||
+ AMDVIPCIState *pci; /* IOMMU PCI device */
|
||||
+ char *pci_id; /* ID of AMDVI-PCI device, if user created */
|
||||
|
||||
uint32_t version;
|
||||
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,96 @@
|
||||
From 67b281dc1ccdae05da6c6052c264ecd94723c0b2 Mon Sep 17 00:00:00 2001
|
||||
From: Eric Auger <eric.auger@redhat.com>
|
||||
Date: Tue, 18 Feb 2025 19:25:32 +0100
|
||||
Subject: [PATCH 2/9] hw/i386/intel-iommu: Migrate to 3-phase reset
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Eric Auger <eric.auger@redhat.com>
|
||||
RH-MergeRequest: 341: Fix vIOMMU reset order
|
||||
RH-Jira: RHEL-7188
|
||||
RH-Acked-by: Peter Xu <peterx@redhat.com>
|
||||
RH-Acked-by: Donald Dutile <None>
|
||||
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
|
||||
RH-Commit: [2/5] 5b9b60b2b796529db10b846881e82e7df4626ec1 (eauger1/centos-qemu-kvm)
|
||||
|
||||
Currently the IOMMU may be reset before the devices
|
||||
it protects. For example this happens with virtio devices
|
||||
but also with VFIO devices. In this latter case this
|
||||
produces spurious translation faults on host.
|
||||
|
||||
Let's use 3-phase reset mechanism and reset the IOMMU on
|
||||
exit phase after all DMA capable devices have been reset
|
||||
on 'enter' or 'hold' phase.
|
||||
|
||||
Signed-off-by: Eric Auger <eric.auger@redhat.com>
|
||||
Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Acked-by: Jason Wang <jasowang@redhat.com>
|
||||
Zhenzhong Duan <zhenzhong.duan@intel.com>
|
||||
|
||||
Message-Id: <20250218182737.76722-3-eric.auger@redhat.com>
|
||||
Reviewed-by: Peter Xu <peterx@redhat.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
(cherry picked from commit 2aaf48bcf27d8b3da5b30af6c1ced464d3df30f7)
|
||||
Signed-off-by: Eric Auger <eric.auger@redhat.com>
|
||||
|
||||
Conflicts: Code change
|
||||
hw/i386/intel_iommu.c
|
||||
We miss e3d0814368d0 ("hw: Use device_class_set_legacy_reset() instead
|
||||
of opencoding") meaning that instead of removing
|
||||
device_class_set_legacy_reset(dc, vtd_reset) we remove
|
||||
dc->reset = vtd_reset;
|
||||
---
|
||||
hw/i386/intel_iommu.c | 12 +++++++++---
|
||||
hw/i386/trace-events | 1 +
|
||||
2 files changed, 10 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
|
||||
index 16d2885fcc..4acefcf5c8 100644
|
||||
--- a/hw/i386/intel_iommu.c
|
||||
+++ b/hw/i386/intel_iommu.c
|
||||
@@ -4212,10 +4212,11 @@ static void vtd_init(IntelIOMMUState *s)
|
||||
/* Should not reset address_spaces when reset because devices will still use
|
||||
* the address space they got at first (won't ask the bus again).
|
||||
*/
|
||||
-static void vtd_reset(DeviceState *dev)
|
||||
+static void vtd_reset_exit(Object *obj, ResetType type)
|
||||
{
|
||||
- IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
|
||||
+ IntelIOMMUState *s = INTEL_IOMMU_DEVICE(obj);
|
||||
|
||||
+ trace_vtd_reset_exit();
|
||||
vtd_init(s);
|
||||
vtd_address_space_refresh_all(s);
|
||||
}
|
||||
@@ -4367,8 +4368,13 @@ static void vtd_class_init(ObjectClass *klass, void *data)
|
||||
{
|
||||
DeviceClass *dc = DEVICE_CLASS(klass);
|
||||
X86IOMMUClass *x86_class = X86_IOMMU_DEVICE_CLASS(klass);
|
||||
+ ResettableClass *rc = RESETTABLE_CLASS(klass);
|
||||
|
||||
- dc->reset = vtd_reset;
|
||||
+ /*
|
||||
+ * Use 'exit' reset phase to make sure all DMA requests
|
||||
+ * have been quiesced during 'enter' or 'hold' phase
|
||||
+ */
|
||||
+ rc->phases.exit = vtd_reset_exit;
|
||||
dc->vmsd = &vtd_vmstate;
|
||||
device_class_set_props(dc, vtd_properties);
|
||||
dc->hotpluggable = false;
|
||||
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
|
||||
index 53c02d7ac8..ac9e1a10aa 100644
|
||||
--- a/hw/i386/trace-events
|
||||
+++ b/hw/i386/trace-events
|
||||
@@ -68,6 +68,7 @@ vtd_frr_new(int index, uint64_t hi, uint64_t lo) "index %d high 0x%"PRIx64" low
|
||||
vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16
|
||||
vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 0x%"PRIx16" index %d vec %d (should be: %d)"
|
||||
vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 0x%"PRIx16" index %d trigger %d (should be: %d)"
|
||||
+vtd_reset_exit(void) ""
|
||||
|
||||
# amd_iommu.c
|
||||
amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,57 @@
|
||||
From 0bfbd2c49c01ee77d3b5a21bf9fe675916cbf0ed Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:56 +0100
|
||||
Subject: [PATCH 44/57] hw/misc/aspeed_hace: skip automatic zero-init of large
|
||||
array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [18/30] ec8510be6b23b26b3eecd6767e1deb0c0c50dd58 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'do_hash_operation' method has a 256 element iovec array used for
|
||||
holding pointers to data that is to be hashed. Skip the automatic
|
||||
zero-init of this array to eliminate the performance overhead in the
|
||||
I/O hot path.
|
||||
|
||||
The 'iovec' array will be selectively initialized based on data that
|
||||
needs to be hashed.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-19-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 6992c886838282f36b20deee44b666bbfc573a8f)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
|
||||
Conflicts:
|
||||
hw/misc/aspeed_hace.c
|
||||
|
||||
Context conflict due to missing commit b9ccbe212e24
|
||||
("hw/misc/aspeed_hace: Extract accumulation-mode hash execution into
|
||||
helper function") downstream. The commit is not a bug fix, so there is
|
||||
no need to backport it.
|
||||
---
|
||||
hw/misc/aspeed_hace.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/misc/aspeed_hace.c b/hw/misc/aspeed_hace.c
|
||||
index c06c04ddc6..d2118f1864 100644
|
||||
--- a/hw/misc/aspeed_hace.c
|
||||
+++ b/hw/misc/aspeed_hace.c
|
||||
@@ -188,7 +188,7 @@ static int gen_acc_mode_iov(AspeedHACEState *s, struct iovec *iov, int id,
|
||||
static void do_hash_operation(AspeedHACEState *s, int algo, bool sg_mode,
|
||||
bool acc_mode)
|
||||
{
|
||||
- struct iovec iov[ASPEED_HACE_MAX_SG];
|
||||
+ QEMU_UNINITIALIZED struct iovec iov[ASPEED_HACE_MAX_SG];
|
||||
g_autofree uint8_t *digest_buf = NULL;
|
||||
size_t digest_len = 0;
|
||||
int niov = 0;
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,48 @@
|
||||
From cc173deaaa4d9dc6ad9188e0b03f46b7e64f26b2 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:57 +0100
|
||||
Subject: [PATCH 45/57] hw/net/rtl8139: skip automatic zero-init of large array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [19/30] 344c720aef2feb35f84fd4b21f2b1b31e5572286 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'rtl8139_transmit_one' method has an 8k byte array used for
|
||||
copying data between guest and host. Skip the automatic zero-init
|
||||
of this array to eliminate the performance overhead in the I/O
|
||||
hot path.
|
||||
|
||||
The 'txbuffer' will be fully initialized when reading PCI DMA
|
||||
buffers.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-20-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 3ccc6489dd4925ddd1f3066bd3751389169cd7aa)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/net/rtl8139.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c
|
||||
index f2fe057535..a2732bf1c1 100644
|
||||
--- a/hw/net/rtl8139.c
|
||||
+++ b/hw/net/rtl8139.c
|
||||
@@ -1818,7 +1818,7 @@ static int rtl8139_transmit_one(RTL8139State *s, int descriptor)
|
||||
|
||||
PCIDevice *d = PCI_DEVICE(s);
|
||||
int txsize = s->TxStatus[descriptor] & 0x1fff;
|
||||
- uint8_t txbuffer[0x2000];
|
||||
+ QEMU_UNINITIALIZED uint8_t txbuffer[0x2000];
|
||||
|
||||
DPRINTF("+++ transmit reading %d bytes from host memory at 0x%08x\n",
|
||||
txsize, s->TxAddr[descriptor]);
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,47 @@
|
||||
From 400b5c8ae7f06a450ef91230343d7ce489142a38 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:58 +0100
|
||||
Subject: [PATCH 46/57] hw/net/tulip: skip automatic zero-init of large array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [20/30] b3d29de8495c0ff40c26974673adefe4eb27a417 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'tulip_setup_frame' method has a 4k byte array used for copying
|
||||
DMA data from the device. Skip the automatic zero-init of this array
|
||||
to eliminate the performance overhead in the I/O hot path.
|
||||
|
||||
The 'buf' array will be fully initialized when reading data from the
|
||||
device.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-21-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit e1afd5ee6eb2954f4baf3c97820e4aaf7de97d2a)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/net/tulip.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/net/tulip.c b/hw/net/tulip.c
|
||||
index 1f2ef20977..5cf2b96fbd 100644
|
||||
--- a/hw/net/tulip.c
|
||||
+++ b/hw/net/tulip.c
|
||||
@@ -629,7 +629,7 @@ static void tulip_setup_filter_addr(TULIPState *s, uint8_t *buf, int n)
|
||||
static void tulip_setup_frame(TULIPState *s,
|
||||
struct tulip_descriptor *desc)
|
||||
{
|
||||
- uint8_t buf[4096];
|
||||
+ QEMU_UNINITIALIZED uint8_t buf[4096];
|
||||
int len = (desc->control >> TDES1_BUF1_SIZE_SHIFT) & TDES1_BUF1_SIZE_MASK;
|
||||
int i;
|
||||
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,54 @@
|
||||
From 0925796a4537e20e033a675ebc8899e4580235f3 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:59 +0100
|
||||
Subject: [PATCH 47/57] hw/net/virtio-net: skip automatic zero-init of large
|
||||
arrays
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [21/30] 0450189a4c4c779b5a1850e9ea8278a5129c5f7f (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'virtio_net_receive_rcu' method has three arrays with
|
||||
VIRTQUEUE_MAX_SIZE elements, which are approximately 32k in
|
||||
size used for copying data between guest and host. Skip the
|
||||
automatic zero-init of these arrays to eliminate the
|
||||
performance overhead in the I/O hot path.
|
||||
|
||||
The three arrays will be selectively initialized as required
|
||||
when processing network buffers.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-22-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 21cf31c51a7aeff4270c9b30b37e019c536d54b2)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/net/virtio-net.c | 6 +++---
|
||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
|
||||
index 3d2b2460ad..086ea20ea0 100644
|
||||
--- a/hw/net/virtio-net.c
|
||||
+++ b/hw/net/virtio-net.c
|
||||
@@ -1895,9 +1895,9 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
|
||||
VirtIONet *n = qemu_get_nic_opaque(nc);
|
||||
VirtIONetQueue *q = virtio_net_get_subqueue(nc);
|
||||
VirtIODevice *vdev = VIRTIO_DEVICE(n);
|
||||
- VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE];
|
||||
- size_t lens[VIRTQUEUE_MAX_SIZE];
|
||||
- struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE];
|
||||
+ QEMU_UNINITIALIZED VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE];
|
||||
+ QEMU_UNINITIALIZED size_t lens[VIRTQUEUE_MAX_SIZE];
|
||||
+ QEMU_UNINITIALIZED struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE];
|
||||
struct virtio_net_hdr_v1_hash extra_hdr;
|
||||
unsigned mhdr_cnt = 0;
|
||||
size_t offset, i, guest_offset, j;
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,47 @@
|
||||
From 34116b3a243f005938a30e9b38c6f47a62752c3e Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:37:00 +0100
|
||||
Subject: [PATCH 48/57] hw/net/xgmac: skip automatic zero-init of large array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [22/30] 63536d627705775c4bf72a511de3d68ec30ac7de (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'xgmac_enet_send' method has an 8k byte array used for copying
|
||||
data between guest and host. Skip the automatic zero-init of this
|
||||
array to eliminate the performance overhead in the I/O hot path.
|
||||
|
||||
The 'frame' buffer will be fully initialized when reading guest
|
||||
memory to fetch the data to send.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-23-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 8b723287b84a62bb5d1a7799ef0959ca8e6c293a)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/net/xgmac.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/net/xgmac.c b/hw/net/xgmac.c
|
||||
index ffe3fc8dbe..eff8022aca 100644
|
||||
--- a/hw/net/xgmac.c
|
||||
+++ b/hw/net/xgmac.c
|
||||
@@ -207,7 +207,7 @@ static void xgmac_enet_send(XgmacState *s)
|
||||
struct desc bd;
|
||||
int frame_size;
|
||||
int len;
|
||||
- uint8_t frame[8192];
|
||||
+ QEMU_UNINITIALIZED uint8_t frame[8192];
|
||||
uint8_t *ptr;
|
||||
|
||||
ptr = frame;
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,72 @@
|
||||
From 3e0134b45828bf9a623a26ac41d5fbb3a8d2917b Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:37:01 +0100
|
||||
Subject: [PATCH 49/57] hw/nvme/ctrl: skip automatic zero-init of large arrays
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [23/30] 57ce4361ffb307be4ea4d3edf9e0dac269d16908 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'nvme_map_sgl' method has a 256 element array used for copying
|
||||
data from the device. Skip the automatic zero-init of this array
|
||||
to eliminate the performance overhead in the I/O hot path.
|
||||
|
||||
The 'segment' array will be fully initialized when reading data from
|
||||
the device.
|
||||
|
||||
The 'nvme_changed_nslist' method has a 4k byte array that is manually
|
||||
initialized with memset(). The compiler ought to be intelligent
|
||||
enough to turn the memset() into a static initialization operation,
|
||||
and thus not duplicate the automatic zero-init. Replacing memset()
|
||||
with '{}' makes it unambiguous that the array is statically initialized.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
|
||||
Message-id: 20250610123709.835102-24-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 7eeb1d3acc175813ad3d5e824f26123e0992093a)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/nvme/ctrl.c | 6 +++---
|
||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
|
||||
index d451ee0d00..75d7f20801 100644
|
||||
--- a/hw/nvme/ctrl.c
|
||||
+++ b/hw/nvme/ctrl.c
|
||||
@@ -1047,7 +1047,8 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
|
||||
*/
|
||||
#define SEG_CHUNK_SIZE 256
|
||||
|
||||
- NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
|
||||
+ QEMU_UNINITIALIZED NvmeSglDescriptor segment[SEG_CHUNK_SIZE];
|
||||
+ NvmeSglDescriptor *sgld, *last_sgld;
|
||||
uint64_t nsgld;
|
||||
uint32_t seg_len;
|
||||
uint16_t status;
|
||||
@@ -5029,7 +5030,7 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
|
||||
static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
|
||||
uint64_t off, NvmeRequest *req)
|
||||
{
|
||||
- uint32_t nslist[1024];
|
||||
+ uint32_t nslist[1024] = {};
|
||||
uint32_t trans_len;
|
||||
int i = 0;
|
||||
uint32_t nsid;
|
||||
@@ -5039,7 +5040,6 @@ static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
|
||||
return NVME_INVALID_FIELD | NVME_DNR;
|
||||
}
|
||||
|
||||
- memset(nslist, 0x0, sizeof(nslist));
|
||||
trans_len = MIN(sizeof(nslist) - off, buf_len);
|
||||
|
||||
while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
|
||||
--
|
||||
2.39.3
|
||||
|
||||
242
SOURCES/kvm-hw-pci-Basic-support-for-PCI-power-management.patch
Normal file
242
SOURCES/kvm-hw-pci-Basic-support-for-PCI-power-management.patch
Normal file
@ -0,0 +1,242 @@
|
||||
From 98b0cd83c09d35a3da0ae142c09038174355e87e Mon Sep 17 00:00:00 2001
|
||||
From: Alex Williamson <alex.williamson@redhat.com>
|
||||
Date: Tue, 25 Feb 2025 14:52:25 -0700
|
||||
Subject: [PATCH 2/7] hw/pci: Basic support for PCI power management
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Eric Auger <eric.auger@redhat.com>
|
||||
RH-MergeRequest: 348: PCI: Implement basic PCI PM capability backing
|
||||
RH-Jira: RHEL-7301
|
||||
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
|
||||
RH-Acked-by: Alex Williamson <None>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [2/6] 5faff6382c124711887704fff4f857e8f85e7be5 (eauger1/centos-qemu-kvm)
|
||||
|
||||
Conflicts: contextual conflict in include/hw/pci/pci.h
|
||||
we don't have 449dca6ac93a ("pcie: enable Extended tag field support")
|
||||
downstream so we don't have x-pcie-ext-tag definition.
|
||||
|
||||
The memory and IO BARs for devices are only accessible in the D0 power
|
||||
state. In other power states the PCI spec defines that the device
|
||||
responds to TLPs and messages with an Unsupported Request response.
|
||||
|
||||
To approximate this behavior, consider the BARs as unmapped when the
|
||||
device is not in the D0 power state. This makes the BARs inaccessible
|
||||
and has the additional bonus for vfio-pci that we don't attempt to DMA
|
||||
map BARs for devices in a non-D0 power state.
|
||||
|
||||
To support this, an interface is added for devices to register the PM
|
||||
capability, which allows central tracking to enforce valid transitions
|
||||
and unmap BARs in non-D0 states.
|
||||
|
||||
NB. We currently have device models (eepro100 and pcie_pci_bridge)
|
||||
that register a PM capability but do not set wmask to enable writes to
|
||||
the power state field. In order to maintain migration compatibility,
|
||||
this new helper does not manage the wmask to enable guest writes to
|
||||
initiate a power state change. The contents and write access of the
|
||||
PM capability are still managed by the caller.
|
||||
|
||||
Cc: Michael S. Tsirkin <mst@redhat.com>
|
||||
Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
|
||||
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
|
||||
Reviewed-by: Eric Auger <eric.auger@redhat.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-2-alex.williamson@redhat.com
|
||||
Signed-off-by: Cédric Le Goater <clg@redhat.com>
|
||||
(cherry picked from commit 9461afd2008b0820fc45a6a7bc675df1b6791e4f)
|
||||
Signed-off-by: Eric Auger <eric.auger@redhat.com>
|
||||
---
|
||||
hw/pci/pci.c | 93 ++++++++++++++++++++++++++++++++++++-
|
||||
hw/pci/trace-events | 2 +
|
||||
include/hw/pci/pci.h | 3 ++
|
||||
include/hw/pci/pci_device.h | 3 ++
|
||||
4 files changed, 99 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
|
||||
index 83c9d5b9ea..d774ae47d2 100644
|
||||
--- a/hw/pci/pci.c
|
||||
+++ b/hw/pci/pci.c
|
||||
@@ -365,6 +365,84 @@ static void pci_msi_trigger(PCIDevice *dev, MSIMessage msg)
|
||||
attrs, NULL);
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * Register and track a PM capability. If wmask is also enabled for the power
|
||||
+ * state field of the pmcsr register, guest writes may change the device PM
|
||||
+ * state. BAR access is only enabled while the device is in the D0 state.
|
||||
+ * Return the capability offset or negative error code.
|
||||
+ */
|
||||
+int pci_pm_init(PCIDevice *d, uint8_t offset, Error **errp)
|
||||
+{
|
||||
+ int cap = pci_add_capability(d, PCI_CAP_ID_PM, offset, PCI_PM_SIZEOF, errp);
|
||||
+
|
||||
+ if (cap < 0) {
|
||||
+ return cap;
|
||||
+ }
|
||||
+
|
||||
+ d->pm_cap = cap;
|
||||
+ d->cap_present |= QEMU_PCI_CAP_PM;
|
||||
+
|
||||
+ return cap;
|
||||
+}
|
||||
+
|
||||
+static uint8_t pci_pm_state(PCIDevice *d)
|
||||
+{
|
||||
+ uint16_t pmcsr;
|
||||
+
|
||||
+ if (!(d->cap_present & QEMU_PCI_CAP_PM)) {
|
||||
+ return 0;
|
||||
+ }
|
||||
+
|
||||
+ pmcsr = pci_get_word(d->config + d->pm_cap + PCI_PM_CTRL);
|
||||
+
|
||||
+ return pmcsr & PCI_PM_CTRL_STATE_MASK;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * Update the PM capability state based on the new value stored in config
|
||||
+ * space respective to the old, pre-write state provided. If the new value
|
||||
+ * is rejected (unsupported or invalid transition) restore the old value.
|
||||
+ * Return the resulting PM state.
|
||||
+ */
|
||||
+static uint8_t pci_pm_update(PCIDevice *d, uint32_t addr, int l, uint8_t old)
|
||||
+{
|
||||
+ uint16_t pmc;
|
||||
+ uint8_t new;
|
||||
+
|
||||
+ if (!(d->cap_present & QEMU_PCI_CAP_PM) ||
|
||||
+ !range_covers_byte(addr, l, d->pm_cap + PCI_PM_CTRL)) {
|
||||
+ return old;
|
||||
+ }
|
||||
+
|
||||
+ new = pci_pm_state(d);
|
||||
+ if (new == old) {
|
||||
+ return old;
|
||||
+ }
|
||||
+
|
||||
+ pmc = pci_get_word(d->config + d->pm_cap + PCI_PM_PMC);
|
||||
+
|
||||
+ /*
|
||||
+ * Transitions to D1 & D2 are only allowed if supported. Devices may
|
||||
+ * only transition to higher D-states or to D0.
|
||||
+ */
|
||||
+ if ((!(pmc & PCI_PM_CAP_D1) && new == 1) ||
|
||||
+ (!(pmc & PCI_PM_CAP_D2) && new == 2) ||
|
||||
+ (old && new && new < old)) {
|
||||
+ pci_word_test_and_clear_mask(d->config + d->pm_cap + PCI_PM_CTRL,
|
||||
+ PCI_PM_CTRL_STATE_MASK);
|
||||
+ pci_word_test_and_set_mask(d->config + d->pm_cap + PCI_PM_CTRL,
|
||||
+ old);
|
||||
+ trace_pci_pm_bad_transition(d->name, pci_dev_bus_num(d),
|
||||
+ PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
|
||||
+ old, new);
|
||||
+ return old;
|
||||
+ }
|
||||
+
|
||||
+ trace_pci_pm_transition(d->name, pci_dev_bus_num(d), PCI_SLOT(d->devfn),
|
||||
+ PCI_FUNC(d->devfn), old, new);
|
||||
+ return new;
|
||||
+}
|
||||
+
|
||||
static void pci_reset_regions(PCIDevice *dev)
|
||||
{
|
||||
int r;
|
||||
@@ -404,6 +482,11 @@ static void pci_do_device_reset(PCIDevice *dev)
|
||||
pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) |
|
||||
pci_get_word(dev->w1cmask + PCI_INTERRUPT_LINE));
|
||||
dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
|
||||
+ /* Default PM state is D0 */
|
||||
+ if (dev->cap_present & QEMU_PCI_CAP_PM) {
|
||||
+ pci_word_test_and_clear_mask(dev->config + dev->pm_cap + PCI_PM_CTRL,
|
||||
+ PCI_PM_CTRL_STATE_MASK);
|
||||
+ }
|
||||
pci_reset_regions(dev);
|
||||
pci_update_mappings(dev);
|
||||
|
||||
@@ -1525,7 +1608,7 @@ static void pci_update_mappings(PCIDevice *d)
|
||||
continue;
|
||||
|
||||
new_addr = pci_bar_address(d, i, r->type, r->size);
|
||||
- if (!d->enabled) {
|
||||
+ if (!d->enabled || pci_pm_state(d)) {
|
||||
new_addr = PCI_BAR_UNMAPPED;
|
||||
}
|
||||
|
||||
@@ -1591,6 +1674,7 @@ uint32_t pci_default_read_config(PCIDevice *d,
|
||||
|
||||
void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int l)
|
||||
{
|
||||
+ uint8_t new_pm_state, old_pm_state = pci_pm_state(d);
|
||||
int i, was_irq_disabled = pci_irq_disabled(d);
|
||||
uint32_t val = val_in;
|
||||
|
||||
@@ -1603,11 +1687,16 @@ void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int
|
||||
d->config[addr + i] = (d->config[addr + i] & ~wmask) | (val & wmask);
|
||||
d->config[addr + i] &= ~(val & w1cmask); /* W1C: Write 1 to Clear */
|
||||
}
|
||||
+
|
||||
+ new_pm_state = pci_pm_update(d, addr, l, old_pm_state);
|
||||
+
|
||||
if (ranges_overlap(addr, l, PCI_BASE_ADDRESS_0, 24) ||
|
||||
ranges_overlap(addr, l, PCI_ROM_ADDRESS, 4) ||
|
||||
ranges_overlap(addr, l, PCI_ROM_ADDRESS1, 4) ||
|
||||
- range_covers_byte(addr, l, PCI_COMMAND))
|
||||
+ range_covers_byte(addr, l, PCI_COMMAND) ||
|
||||
+ !!new_pm_state != !!old_pm_state) {
|
||||
pci_update_mappings(d);
|
||||
+ }
|
||||
|
||||
if (ranges_overlap(addr, l, PCI_COMMAND, 2)) {
|
||||
pci_update_irq_disabled(d, was_irq_disabled);
|
||||
diff --git a/hw/pci/trace-events b/hw/pci/trace-events
|
||||
index 19643aa8c6..c82a87ffdd 100644
|
||||
--- a/hw/pci/trace-events
|
||||
+++ b/hw/pci/trace-events
|
||||
@@ -1,6 +1,8 @@
|
||||
# See docs/devel/tracing.rst for syntax documentation.
|
||||
|
||||
# pci.c
|
||||
+pci_pm_bad_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, uint8_t old, uint8_t new) "%s %02x:%02x.%x REJECTED PM transition D%d->D%d"
|
||||
+pci_pm_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, uint8_t old, uint8_t new) "%s %02x:%02x.%x PM transition D%d->D%d"
|
||||
pci_update_mappings_del(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x %d,0x%"PRIx64"+0x%"PRIx64
|
||||
pci_update_mappings_add(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x %d,0x%"PRIx64"+0x%"PRIx64
|
||||
pci_route_irq(int dev_irq, const char *dev_path, int parent_irq, const char *parent_path) "IRQ %d @%s -> IRQ %d @%s"
|
||||
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
|
||||
index 45365ae085..afeb5a2263 100644
|
||||
--- a/include/hw/pci/pci.h
|
||||
+++ b/include/hw/pci/pci.h
|
||||
@@ -213,6 +213,8 @@ enum {
|
||||
QEMU_PCIE_ERR_UNC_MASK = (1 << QEMU_PCIE_ERR_UNC_MASK_BITNR),
|
||||
#define QEMU_PCIE_ARI_NEXTFN_1_BITNR 12
|
||||
QEMU_PCIE_ARI_NEXTFN_1 = (1 << QEMU_PCIE_ARI_NEXTFN_1_BITNR),
|
||||
+#define QEMU_PCI_CAP_PM_BITNR 14
|
||||
+ QEMU_PCI_CAP_PM = (1 << QEMU_PCI_CAP_PM_BITNR),
|
||||
};
|
||||
|
||||
typedef struct PCIINTxRoute {
|
||||
@@ -680,5 +682,6 @@ static inline void pci_irq_pulse(PCIDevice *pci_dev)
|
||||
MSIMessage pci_get_msi_message(PCIDevice *dev, int vector);
|
||||
void pci_set_enabled(PCIDevice *pci_dev, bool state);
|
||||
void pci_set_power(PCIDevice *pci_dev, bool state);
|
||||
+int pci_pm_init(PCIDevice *pci_dev, uint8_t offset, Error **errp);
|
||||
|
||||
#endif
|
||||
diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
|
||||
index f38fb31119..325d7bcaf7 100644
|
||||
--- a/include/hw/pci/pci_device.h
|
||||
+++ b/include/hw/pci/pci_device.h
|
||||
@@ -105,6 +105,9 @@ struct PCIDevice {
|
||||
/* Capability bits */
|
||||
uint32_t cap_present;
|
||||
|
||||
+ /* Offset of PM capability in config space */
|
||||
+ uint8_t pm_cap;
|
||||
+
|
||||
/* Offset of MSI-X capability in config space */
|
||||
uint8_t msix_cap;
|
||||
|
||||
--
|
||||
2.48.1
|
||||
|
||||
130
SOURCES/kvm-hw-pci-Rename-has_power-to-enabled.patch
Normal file
130
SOURCES/kvm-hw-pci-Rename-has_power-to-enabled.patch
Normal file
@ -0,0 +1,130 @@
|
||||
From 8711bb1a54d4f5734d44545cd8e7262bc358f51d Mon Sep 17 00:00:00 2001
|
||||
From: Akihiko Odaki <akihiko.odaki@daynix.com>
|
||||
Date: Thu, 9 Jan 2025 15:29:46 +0900
|
||||
Subject: [PATCH 1/7] hw/pci: Rename has_power to enabled
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Eric Auger <eric.auger@redhat.com>
|
||||
RH-MergeRequest: 348: PCI: Implement basic PCI PM capability backing
|
||||
RH-Jira: RHEL-7301
|
||||
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
|
||||
RH-Acked-by: Alex Williamson <None>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [1/6] ac8a7427a1203e33aa323933818a7114c0eb4520 (eauger1/centos-qemu-kvm)
|
||||
|
||||
The renamed state will not only represent powering state of PFs, but
|
||||
also represent SR-IOV VF enablement in the future.
|
||||
|
||||
Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
|
||||
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
|
||||
Message-ID: <20250109-reuse-v19-1-f541e82ca5f7@daynix.com>
|
||||
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
|
||||
(cherry picked from commit c407eef162f765dd83d45e048585731be41a66fc)
|
||||
Signed-off-by: Eric Auger <eric.auger@redhat.com>
|
||||
---
|
||||
hw/pci/pci.c | 17 +++++++++++------
|
||||
hw/pci/pci_host.c | 4 ++--
|
||||
include/hw/pci/pci.h | 1 +
|
||||
include/hw/pci/pci_device.h | 2 +-
|
||||
4 files changed, 15 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
|
||||
index fab86d0567..83c9d5b9ea 100644
|
||||
--- a/hw/pci/pci.c
|
||||
+++ b/hw/pci/pci.c
|
||||
@@ -1525,7 +1525,7 @@ static void pci_update_mappings(PCIDevice *d)
|
||||
continue;
|
||||
|
||||
new_addr = pci_bar_address(d, i, r->type, r->size);
|
||||
- if (!d->has_power) {
|
||||
+ if (!d->enabled) {
|
||||
new_addr = PCI_BAR_UNMAPPED;
|
||||
}
|
||||
|
||||
@@ -1613,7 +1613,7 @@ void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int
|
||||
pci_update_irq_disabled(d, was_irq_disabled);
|
||||
memory_region_set_enabled(&d->bus_master_enable_region,
|
||||
(pci_get_word(d->config + PCI_COMMAND)
|
||||
- & PCI_COMMAND_MASTER) && d->has_power);
|
||||
+ & PCI_COMMAND_MASTER) && d->enabled);
|
||||
}
|
||||
|
||||
msi_write_config(d, addr, val_in, l);
|
||||
@@ -2886,16 +2886,21 @@ MSIMessage pci_get_msi_message(PCIDevice *dev, int vector)
|
||||
|
||||
void pci_set_power(PCIDevice *d, bool state)
|
||||
{
|
||||
- if (d->has_power == state) {
|
||||
+ pci_set_enabled(d, state);
|
||||
+}
|
||||
+
|
||||
+void pci_set_enabled(PCIDevice *d, bool state)
|
||||
+{
|
||||
+ if (d->enabled == state) {
|
||||
return;
|
||||
}
|
||||
|
||||
- d->has_power = state;
|
||||
+ d->enabled = state;
|
||||
pci_update_mappings(d);
|
||||
memory_region_set_enabled(&d->bus_master_enable_region,
|
||||
(pci_get_word(d->config + PCI_COMMAND)
|
||||
- & PCI_COMMAND_MASTER) && d->has_power);
|
||||
- if (!d->has_power) {
|
||||
+ & PCI_COMMAND_MASTER) && d->enabled);
|
||||
+ if (!d->enabled) {
|
||||
pci_device_reset(d);
|
||||
}
|
||||
}
|
||||
diff --git a/hw/pci/pci_host.c b/hw/pci/pci_host.c
|
||||
index dfe6fe6184..0d82727cc9 100644
|
||||
--- a/hw/pci/pci_host.c
|
||||
+++ b/hw/pci/pci_host.c
|
||||
@@ -86,7 +86,7 @@ void pci_host_config_write_common(PCIDevice *pci_dev, uint32_t addr,
|
||||
* allowing direct removal of unexposed functions.
|
||||
*/
|
||||
if ((pci_dev->qdev.hotplugged && !pci_get_function_0(pci_dev)) ||
|
||||
- !pci_dev->has_power || is_pci_dev_ejected(pci_dev)) {
|
||||
+ !pci_dev->enabled || is_pci_dev_ejected(pci_dev)) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -111,7 +111,7 @@ uint32_t pci_host_config_read_common(PCIDevice *pci_dev, uint32_t addr,
|
||||
* allowing direct removal of unexposed functions.
|
||||
*/
|
||||
if ((pci_dev->qdev.hotplugged && !pci_get_function_0(pci_dev)) ||
|
||||
- !pci_dev->has_power || is_pci_dev_ejected(pci_dev)) {
|
||||
+ !pci_dev->enabled || is_pci_dev_ejected(pci_dev)) {
|
||||
return ~0x0;
|
||||
}
|
||||
|
||||
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
|
||||
index eb26cac810..45365ae085 100644
|
||||
--- a/include/hw/pci/pci.h
|
||||
+++ b/include/hw/pci/pci.h
|
||||
@@ -678,6 +678,7 @@ static inline void pci_irq_pulse(PCIDevice *pci_dev)
|
||||
}
|
||||
|
||||
MSIMessage pci_get_msi_message(PCIDevice *dev, int vector);
|
||||
+void pci_set_enabled(PCIDevice *pci_dev, bool state);
|
||||
void pci_set_power(PCIDevice *pci_dev, bool state);
|
||||
|
||||
#endif
|
||||
diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
|
||||
index 15694f2489..f38fb31119 100644
|
||||
--- a/include/hw/pci/pci_device.h
|
||||
+++ b/include/hw/pci/pci_device.h
|
||||
@@ -57,7 +57,7 @@ typedef struct PCIReqIDCache PCIReqIDCache;
|
||||
struct PCIDevice {
|
||||
DeviceState qdev;
|
||||
bool partially_hotplugged;
|
||||
- bool has_power;
|
||||
+ bool enabled;
|
||||
|
||||
/* PCI config space */
|
||||
uint8_t *config;
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,52 @@
|
||||
From 4c3fe6e7b88c58713c0c499d4bf0658a055ee52e Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:37:03 +0100
|
||||
Subject: [PATCH 50/57] hw/ppc/spapr_tpm_proxy: skip automatic zero-init of
|
||||
large arrays
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [24/30] 8d963380c64a33a27adc99738b42b52864229111 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'tpm_execute' method has a pair of 4k arrays used for copying
|
||||
data between guest and host. Skip the automatic zero-init of these
|
||||
arrays to eliminate the performance overhead in the I/O hot path.
|
||||
|
||||
The two arrays will be fully initialized when reading data from
|
||||
guest memory or reading data from the proxy FD.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
|
||||
Reviewed-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
|
||||
Message-id: 20250610123709.835102-26-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 5dd9087fff74b5672526cad254e76f790fb35c7a)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/ppc/spapr_tpm_proxy.c | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/hw/ppc/spapr_tpm_proxy.c b/hw/ppc/spapr_tpm_proxy.c
|
||||
index e10af35a18..88833d9e2e 100644
|
||||
--- a/hw/ppc/spapr_tpm_proxy.c
|
||||
+++ b/hw/ppc/spapr_tpm_proxy.c
|
||||
@@ -41,8 +41,8 @@ static ssize_t tpm_execute(SpaprTpmProxy *tpm_proxy, target_ulong *args)
|
||||
target_ulong data_in_size = args[2];
|
||||
uint64_t data_out = ppc64_phys_to_real(args[3]);
|
||||
target_ulong data_out_size = args[4];
|
||||
- uint8_t buf_in[TPM_SPAPR_BUFSIZE];
|
||||
- uint8_t buf_out[TPM_SPAPR_BUFSIZE];
|
||||
+ QEMU_UNINITIALIZED uint8_t buf_in[TPM_SPAPR_BUFSIZE];
|
||||
+ QEMU_UNINITIALIZED uint8_t buf_out[TPM_SPAPR_BUFSIZE];
|
||||
ssize_t ret;
|
||||
|
||||
trace_spapr_tpm_execute(data_in, data_in_size, data_out, data_out_size);
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,63 @@
|
||||
From 5126609c0714c66a0ec41328017e7e8388c78bf4 Mon Sep 17 00:00:00 2001
|
||||
From: Peter Maydell <peter.maydell@linaro.org>
|
||||
Date: Fri, 13 Sep 2024 15:31:43 +0100
|
||||
Subject: [PATCH 02/26] hw/s390/ccw-device: Convert to three-phase reset
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [2/26] 58f6fc2e65a101e069feac399859464d31e43045 (thuth/qemu-kvm-cs)
|
||||
|
||||
Convert the TYPE_CCW_DEVICE to three-phase reset. This is a
|
||||
device class which is subclassed, so it needs to be three-phase
|
||||
before we can convert the subclass.
|
||||
|
||||
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
|
||||
Reviewed-by: Nina Schoetterl-Glausch <nsg@linux.ibm.com>
|
||||
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
|
||||
Acked-by: Thomas Huth <thuth@redhat.com>
|
||||
Message-id: 20240830145812.1967042-2-peter.maydell@linaro.org
|
||||
(cherry picked from commit 6a0e10b76b68e2f412746a1d5ed7d6efee804864)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
hw/s390x/ccw-device.c | 7 ++++---
|
||||
1 file changed, 4 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/hw/s390x/ccw-device.c b/hw/s390x/ccw-device.c
|
||||
index d7bb364579..30f2fb486f 100644
|
||||
--- a/hw/s390x/ccw-device.c
|
||||
+++ b/hw/s390x/ccw-device.c
|
||||
@@ -88,9 +88,9 @@ static Property ccw_device_properties[] = {
|
||||
DEFINE_PROP_END_OF_LIST(),
|
||||
};
|
||||
|
||||
-static void ccw_device_reset(DeviceState *d)
|
||||
+static void ccw_device_reset_hold(Object *obj, ResetType type)
|
||||
{
|
||||
- CcwDevice *ccw_dev = CCW_DEVICE(d);
|
||||
+ CcwDevice *ccw_dev = CCW_DEVICE(obj);
|
||||
|
||||
css_reset_sch(ccw_dev->sch);
|
||||
}
|
||||
@@ -99,11 +99,12 @@ static void ccw_device_class_init(ObjectClass *klass, void *data)
|
||||
{
|
||||
DeviceClass *dc = DEVICE_CLASS(klass);
|
||||
CCWDeviceClass *k = CCW_DEVICE_CLASS(klass);
|
||||
+ ResettableClass *rc = RESETTABLE_CLASS(klass);
|
||||
|
||||
k->realize = ccw_device_realize;
|
||||
k->refill_ids = ccw_device_refill_ids;
|
||||
device_class_set_props(dc, ccw_device_properties);
|
||||
- dc->reset = ccw_device_reset;
|
||||
+ rc->phases.hold = ccw_device_reset_hold;
|
||||
dc->bus_type = TYPE_VIRTUAL_CSS_BUS;
|
||||
}
|
||||
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,92 @@
|
||||
From 7cbf9be09907407a64d739a2d0862af2ad08eaf5 Mon Sep 17 00:00:00 2001
|
||||
From: Peter Maydell <peter.maydell@linaro.org>
|
||||
Date: Fri, 13 Sep 2024 15:31:43 +0100
|
||||
Subject: [PATCH 03/26] hw/s390/virtio-ccw: Convert to three-phase reset
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [3/26] e06ee194fa289a387433b905eb0999a048681a92 (thuth/qemu-kvm-cs)
|
||||
|
||||
Convert the virtio-ccw code to three-phase reset. This allows us to
|
||||
remove a call to device_class_set_parent_reset(), replacing it with
|
||||
the three-phase equivalent resettable_class_set_parent_phases().
|
||||
Removing all the device_class_set_parent_reset() uses will allow us
|
||||
to remove some of the glue code that interworks between three-phase
|
||||
and legacy reset.
|
||||
|
||||
This is a simple conversion, with no behavioural changes.
|
||||
|
||||
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
|
||||
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
|
||||
Reviewed-by: Nina Schoetterl-Glausch <nsg@linux.ibm.com>
|
||||
Acked-by: Thomas Huth <thuth@redhat.com>
|
||||
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
|
||||
Message-id: 20240830145812.1967042-3-peter.maydell@linaro.org
|
||||
(cherry picked from commit 6affa00d6ebebf24485667fe146470b0d6feb90d)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
hw/s390x/virtio-ccw.c | 13 ++++++++-----
|
||||
hw/s390x/virtio-ccw.h | 2 +-
|
||||
2 files changed, 9 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
|
||||
index b4676909dd..96747318d2 100644
|
||||
--- a/hw/s390x/virtio-ccw.c
|
||||
+++ b/hw/s390x/virtio-ccw.c
|
||||
@@ -913,14 +913,15 @@ static void virtio_ccw_notify(DeviceState *d, uint16_t vector)
|
||||
}
|
||||
}
|
||||
|
||||
-static void virtio_ccw_reset(DeviceState *d)
|
||||
+static void virtio_ccw_reset_hold(Object *obj, ResetType type)
|
||||
{
|
||||
- VirtioCcwDevice *dev = VIRTIO_CCW_DEVICE(d);
|
||||
+ VirtioCcwDevice *dev = VIRTIO_CCW_DEVICE(obj);
|
||||
VirtIOCCWDeviceClass *vdc = VIRTIO_CCW_DEVICE_GET_CLASS(dev);
|
||||
|
||||
virtio_ccw_reset_virtio(dev);
|
||||
- if (vdc->parent_reset) {
|
||||
- vdc->parent_reset(d);
|
||||
+
|
||||
+ if (vdc->parent_phases.hold) {
|
||||
+ vdc->parent_phases.hold(obj, type);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1233,11 +1234,13 @@ static void virtio_ccw_device_class_init(ObjectClass *klass, void *data)
|
||||
DeviceClass *dc = DEVICE_CLASS(klass);
|
||||
CCWDeviceClass *k = CCW_DEVICE_CLASS(dc);
|
||||
VirtIOCCWDeviceClass *vdc = VIRTIO_CCW_DEVICE_CLASS(klass);
|
||||
+ ResettableClass *rc = RESETTABLE_CLASS(klass);
|
||||
|
||||
k->unplug = virtio_ccw_busdev_unplug;
|
||||
dc->realize = virtio_ccw_busdev_realize;
|
||||
dc->unrealize = virtio_ccw_busdev_unrealize;
|
||||
- device_class_set_parent_reset(dc, virtio_ccw_reset, &vdc->parent_reset);
|
||||
+ resettable_class_set_parent_phases(rc, NULL, virtio_ccw_reset_hold, NULL,
|
||||
+ &vdc->parent_phases);
|
||||
}
|
||||
|
||||
static const TypeInfo virtio_ccw_device_info = {
|
||||
diff --git a/hw/s390x/virtio-ccw.h b/hw/s390x/virtio-ccw.h
|
||||
index fac186c8f6..c7a830a194 100644
|
||||
--- a/hw/s390x/virtio-ccw.h
|
||||
+++ b/hw/s390x/virtio-ccw.h
|
||||
@@ -57,7 +57,7 @@ struct VirtIOCCWDeviceClass {
|
||||
CCWDeviceClass parent_class;
|
||||
void (*realize)(VirtioCcwDevice *dev, Error **errp);
|
||||
void (*unrealize)(VirtioCcwDevice *dev);
|
||||
- void (*parent_reset)(DeviceState *dev);
|
||||
+ ResettablePhases parent_phases;
|
||||
};
|
||||
|
||||
/* Performance improves when virtqueue kick processing is decoupled from the
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,47 @@
|
||||
From b25bbfcad4a3df94555f6b5f238910314a5d17ea Mon Sep 17 00:00:00 2001
|
||||
From: Kevin Wolf <kwolf@redhat.com>
|
||||
Date: Wed, 25 Jun 2025 10:27:51 +0200
|
||||
Subject: [PATCH 02/57] hw/s390x/ccw-device: Fix memory leak in loadparm setter
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 387: s390x: Fix memory leaks related to loadparm [rhel-9]
|
||||
RH-Jira: RHEL-98554
|
||||
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
|
||||
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
|
||||
RH-Commit: [2/2] d85cf8b3c93ede47b51c4aa1336dc54f58b8cc3f (thuth/qemu-kvm-cs)
|
||||
|
||||
Commit bdf12f2a fixed the setter for the "loadparm" machine property,
|
||||
which gets a string from a visitor, passes it to s390_ipl_fmt_loadparm()
|
||||
and then forgot to free it. It left another instance of the same problem
|
||||
unfixed in the "loadparm" device property. Fix it.
|
||||
|
||||
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
|
||||
Message-ID: <20250625082751.24896-1-kwolf@redhat.com>
|
||||
Reviewed-by: Eric Farman <farman@linux.ibm.com>
|
||||
Reviewed-by: Halil Pasic <pasic@linux.ibm.com>
|
||||
Tested-by: Thomas Huth <thuth@redhat.com>
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
(cherry picked from commit 78e3781541209b3dcd6f4bb66adf3a3e504b88a4)
|
||||
---
|
||||
hw/s390x/ccw-device.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/s390x/ccw-device.c b/hw/s390x/ccw-device.c
|
||||
index 30f2fb486f..63e937401e 100644
|
||||
--- a/hw/s390x/ccw-device.c
|
||||
+++ b/hw/s390x/ccw-device.c
|
||||
@@ -57,7 +57,7 @@ static void ccw_device_set_loadparm(Object *obj, Visitor *v,
|
||||
Error **errp)
|
||||
{
|
||||
CcwDevice *dev = CCW_DEVICE(obj);
|
||||
- char *val;
|
||||
+ g_autofree char *val = NULL;
|
||||
int index;
|
||||
|
||||
index = object_property_get_int(obj, "bootindex", NULL);
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,49 @@
|
||||
From 45884bfad1f14585407a04eff9230a75bc5095fa Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:37:05 +0100
|
||||
Subject: [PATCH 52/57] hw/scsi/lsi53c895a: skip automatic zero-init of large
|
||||
array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [26/30] 235884d43fcb3e49b320e36faa631a3656d07de6 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'lsi_memcpy' method has a 4k byte array used for copying data
|
||||
to/from the device. Skip the automatic zero-init of this array to
|
||||
eliminate the performance overhead in the I/O hot path.
|
||||
|
||||
The 'buf' array will be fully initialized when data is copied.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
|
||||
Reviewed-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
|
||||
Message-id: 20250610123709.835102-28-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 55243edf42ee87bce9f36ca251f3ab9cda1563e4)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/scsi/lsi53c895a.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c
|
||||
index f1935e5328..f165705f8a 100644
|
||||
--- a/hw/scsi/lsi53c895a.c
|
||||
+++ b/hw/scsi/lsi53c895a.c
|
||||
@@ -1112,7 +1112,7 @@ bad:
|
||||
static void lsi_memcpy(LSIState *s, uint32_t dest, uint32_t src, int count)
|
||||
{
|
||||
int n;
|
||||
- uint8_t buf[LSI_BUF_SIZE];
|
||||
+ QEMU_UNINITIALIZED uint8_t buf[LSI_BUF_SIZE];
|
||||
|
||||
trace_lsi_memcpy(dest, src, count);
|
||||
while (count) {
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,73 @@
|
||||
From 9f76103e90ce8406bc5bbda72a7314b82e56652e Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:37:06 +0100
|
||||
Subject: [PATCH 53/57] hw/scsi/megasas: skip automatic zero-init of large
|
||||
arrays
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [27/30] b3a3f466fd03c64c665c52e26079b03def376f48 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'megasas_dcmd_pd_get_list' and 'megasas_dcmd_get_properties'
|
||||
methods have 4k structs used for copying data from the device.
|
||||
Skip the automatic zero-init of this array to eliminate the
|
||||
performance overhead in the I/O hot path.
|
||||
|
||||
The 'info' structs are manually initialized with memset(). The
|
||||
compiler ought to be intelligent enough to turn the memset()
|
||||
into a static initialization operation, and thus not duplicate
|
||||
the automatic zero-init. Replacing memset() with '{}' makes it
|
||||
unambiguous that the arrays are statically initialized.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
|
||||
Reviewed-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
|
||||
Message-id: 20250610123709.835102-29-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit ca0559e2350c618048f7caf80cb79c1259e7cfd2)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/scsi/megasas.c | 7 ++-----
|
||||
1 file changed, 2 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c
|
||||
index 2d0c607177..91b65accbc 100644
|
||||
--- a/hw/scsi/megasas.c
|
||||
+++ b/hw/scsi/megasas.c
|
||||
@@ -981,13 +981,11 @@ static int megasas_event_wait(MegasasState *s, MegasasCmd *cmd)
|
||||
|
||||
static int megasas_dcmd_pd_get_list(MegasasState *s, MegasasCmd *cmd)
|
||||
{
|
||||
- struct mfi_pd_list info;
|
||||
- size_t dcmd_size = sizeof(info);
|
||||
+ struct mfi_pd_list info = {};
|
||||
BusChild *kid;
|
||||
uint32_t offset, dcmd_limit, num_pd_disks = 0, max_pd_disks;
|
||||
dma_addr_t residual;
|
||||
|
||||
- memset(&info, 0, dcmd_size);
|
||||
offset = 8;
|
||||
dcmd_limit = offset + sizeof(struct mfi_pd_address);
|
||||
if (cmd->iov_size < dcmd_limit) {
|
||||
@@ -1429,11 +1427,10 @@ static int megasas_dcmd_cfg_read(MegasasState *s, MegasasCmd *cmd)
|
||||
|
||||
static int megasas_dcmd_get_properties(MegasasState *s, MegasasCmd *cmd)
|
||||
{
|
||||
- struct mfi_ctrl_props info;
|
||||
+ struct mfi_ctrl_props info = {};
|
||||
size_t dcmd_size = sizeof(info);
|
||||
dma_addr_t residual;
|
||||
|
||||
- memset(&info, 0x0, dcmd_size);
|
||||
if (cmd->iov_size < dcmd_size) {
|
||||
trace_megasas_dcmd_invalid_xfer_len(cmd->index, cmd->iov_size,
|
||||
dcmd_size);
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,50 @@
|
||||
From 3a0ae5a2f873fc7062262efc24a5403233988f5f Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:37:07 +0100
|
||||
Subject: [PATCH 54/57] hw/ufs/lu: skip automatic zero-init of large array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [28/30] 62e7c83d15143387f6d6b366c8ec46b312d05577 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'ufs_emulate_scsi_cmd' method has a 4k byte array used for
|
||||
copying data from the device. Skip the automatic zero-init of
|
||||
this array to eliminate the performance overhead in the I/O hot
|
||||
path.
|
||||
|
||||
The 'outbuf' array will be fully initialized when data is copied
|
||||
from the guest.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
|
||||
Reviewed-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
|
||||
Message-id: 20250610123709.835102-30-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 7708e298180550eac262c1fd742e6e80c711a5d8)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/ufs/lu.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/ufs/lu.c b/hw/ufs/lu.c
|
||||
index 81bfff9b4e..caad82dcc4 100644
|
||||
--- a/hw/ufs/lu.c
|
||||
+++ b/hw/ufs/lu.c
|
||||
@@ -194,7 +194,7 @@ static int ufs_emulate_wlun_inquiry(UfsRequest *req, uint8_t *outbuf,
|
||||
static UfsReqResult ufs_emulate_scsi_cmd(UfsLu *lu, UfsRequest *req)
|
||||
{
|
||||
uint8_t lun = lu->lun;
|
||||
- uint8_t outbuf[4096];
|
||||
+ QEMU_UNINITIALIZED uint8_t outbuf[4096];
|
||||
uint8_t sense_buf[UFS_SENSE_SIZE];
|
||||
uint8_t scsi_status;
|
||||
int len = 0;
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,50 @@
|
||||
From 6d4761010ea4dc218a1623513f410fc2d1cfc832 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:37:04 +0100
|
||||
Subject: [PATCH 51/57] hw/usb/hcd-ohci: skip automatic zero-init of large
|
||||
array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [25/30] 721dd97d384fb755c4a6a00cfc3d867e43f25b0b (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'ohci_service_iso_td' method has an 8k byte array used for copying
|
||||
data between guest and host. Skip the automatic zero-init of this
|
||||
array to eliminate the performance overhead in the I/O hot path.
|
||||
|
||||
The 'buf' array will be fully initialized when reading data from guest
|
||||
memory.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
|
||||
Reviewed-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
|
||||
Message-id: 20250610123709.835102-27-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 14997d521d1cd0bb36c902ef1032f0d3f2a3c912)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/usb/hcd-ohci.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/usb/hcd-ohci.c b/hw/usb/hcd-ohci.c
|
||||
index 71b54914d3..72a9f9f474 100644
|
||||
--- a/hw/usb/hcd-ohci.c
|
||||
+++ b/hw/usb/hcd-ohci.c
|
||||
@@ -577,7 +577,7 @@ static int ohci_service_iso_td(OHCIState *ohci, struct ohci_ed *ed)
|
||||
USBDevice *dev;
|
||||
USBEndpoint *ep;
|
||||
USBPacket *pkt;
|
||||
- uint8_t buf[8192];
|
||||
+ QEMU_UNINITIALIZED uint8_t buf[8192];
|
||||
bool int_req;
|
||||
struct ohci_iso_td iso_td;
|
||||
uint32_t addr;
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,61 @@
|
||||
From 04f11749dd21b4df1ea2818785d650dd6eee2cbe Mon Sep 17 00:00:00 2001
|
||||
From: Eric Auger <eric.auger@redhat.com>
|
||||
Date: Tue, 18 Feb 2025 19:25:34 +0100
|
||||
Subject: [PATCH 4/9] hw/vfio/common: Add a trace point in vfio_reset_handler
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Eric Auger <eric.auger@redhat.com>
|
||||
RH-MergeRequest: 341: Fix vIOMMU reset order
|
||||
RH-Jira: RHEL-7188
|
||||
RH-Acked-by: Peter Xu <peterx@redhat.com>
|
||||
RH-Acked-by: Donald Dutile <None>
|
||||
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
|
||||
RH-Commit: [4/5] 46878ffdc96997d1f6d09bde3fce350564e499fd (eauger1/centos-qemu-kvm)
|
||||
|
||||
To ease debugging of the reset sequence, let's add a trace point
|
||||
in vfio_reset_handler()
|
||||
|
||||
Signed-off-by: Eric Auger <eric.auger@redhat.com>
|
||||
Reviewed-by: Cédric Le Goater <clg@redhat.com>
|
||||
Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
|
||||
Message-Id: <20250218182737.76722-5-eric.auger@redhat.com>
|
||||
Reviewed-by: Peter Xu <peterx@redhat.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
(cherry picked from commit d410e709526d1cd4aa9085c6e254a622594a02a5)
|
||||
Signed-off-by: Eric Auger <eric.auger@redhat.com>
|
||||
---
|
||||
hw/vfio/common.c | 1 +
|
||||
hw/vfio/trace-events | 1 +
|
||||
2 files changed, 2 insertions(+)
|
||||
|
||||
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
|
||||
index 36d0cf6585..6982f88fc8 100644
|
||||
--- a/hw/vfio/common.c
|
||||
+++ b/hw/vfio/common.c
|
||||
@@ -1395,6 +1395,7 @@ void vfio_reset_handler(void *opaque)
|
||||
{
|
||||
VFIODevice *vbasedev;
|
||||
|
||||
+ trace_vfio_reset_handler();
|
||||
QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
|
||||
if (vbasedev->dev->realized) {
|
||||
vbasedev->ops->vfio_compute_needs_reset(vbasedev);
|
||||
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
|
||||
index 3756ff660e..9523a9ccb0 100644
|
||||
--- a/hw/vfio/trace-events
|
||||
+++ b/hw/vfio/trace-events
|
||||
@@ -120,6 +120,7 @@ vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype
|
||||
vfio_legacy_dma_unmap_overflow_workaround(void) ""
|
||||
vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64
|
||||
vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
|
||||
+vfio_reset_handler(void) ""
|
||||
|
||||
# platform.c
|
||||
vfio_platform_realize(char *name, char *compat) "vfio device %s, compat = %s"
|
||||
--
|
||||
2.48.1
|
||||
|
||||
74
SOURCES/kvm-hw-vfio-pci-Re-order-pre-reset.patch
Normal file
74
SOURCES/kvm-hw-vfio-pci-Re-order-pre-reset.patch
Normal file
@ -0,0 +1,74 @@
|
||||
From d6a961077e753b9ad5a670a1529634fe20322ce2 Mon Sep 17 00:00:00 2001
|
||||
From: Alex Williamson <alex.williamson@redhat.com>
|
||||
Date: Tue, 25 Feb 2025 14:52:29 -0700
|
||||
Subject: [PATCH 6/7] hw/vfio/pci: Re-order pre-reset
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Eric Auger <eric.auger@redhat.com>
|
||||
RH-MergeRequest: 348: PCI: Implement basic PCI PM capability backing
|
||||
RH-Jira: RHEL-7301
|
||||
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
|
||||
RH-Acked-by: Alex Williamson <None>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [6/6] c6c386ecbabda93f8a79da926ece95c2195fbc36 (eauger1/centos-qemu-kvm)
|
||||
|
||||
We want the device in the D0 power state going into reset, but the
|
||||
config write can enable the BARs in the address space, which are
|
||||
then removed from the address space once we clear the memory enable
|
||||
bit in the command register. Re-order to clear the command bit
|
||||
first, so the power state change doesn't enable the BARs.
|
||||
|
||||
Cc: Cédric Le Goater <clg@redhat.com>
|
||||
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
|
||||
Reviewed-by: Eric Auger <eric.auger@redhat.com>
|
||||
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-6-alex.williamson@redhat.com
|
||||
Signed-off-by: Cédric Le Goater <clg@redhat.com>
|
||||
(cherry picked from commit 518a69a598916749338de3852d41d961d4503115)
|
||||
Signed-off-by: Eric Auger <eric.auger@redhat.com>
|
||||
---
|
||||
hw/vfio/pci.c | 18 +++++++++---------
|
||||
1 file changed, 9 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
|
||||
index 595b5c9b25..ffe72fd1d0 100644
|
||||
--- a/hw/vfio/pci.c
|
||||
+++ b/hw/vfio/pci.c
|
||||
@@ -2414,6 +2414,15 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
|
||||
|
||||
vfio_disable_interrupts(vdev);
|
||||
|
||||
+ /*
|
||||
+ * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
|
||||
+ * Also put INTx Disable in known state.
|
||||
+ */
|
||||
+ cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
|
||||
+ cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
|
||||
+ PCI_COMMAND_INTX_DISABLE);
|
||||
+ vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
|
||||
+
|
||||
/* Make sure the device is in D0 */
|
||||
if (pdev->pm_cap) {
|
||||
uint16_t pmcsr;
|
||||
@@ -2433,15 +2442,6 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
|
||||
}
|
||||
}
|
||||
}
|
||||
-
|
||||
- /*
|
||||
- * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
|
||||
- * Also put INTx Disable in known state.
|
||||
- */
|
||||
- cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
|
||||
- cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
|
||||
- PCI_COMMAND_INTX_DISABLE);
|
||||
- vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
|
||||
}
|
||||
|
||||
void vfio_pci_post_reset(VFIOPCIDevice *vdev)
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,59 @@
|
||||
From afa3a488f3ca52a5455987e4cd643882c4b15d8a Mon Sep 17 00:00:00 2001
|
||||
From: Thomas Huth <thuth@redhat.com>
|
||||
Date: Thu, 13 Mar 2025 07:35:22 +0100
|
||||
Subject: [PATCH 24/26] hw/virtio: Also include md stubs in case
|
||||
CONFIG_VIRTIO_PCI is not set
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [24/26] ae6307b26d01d2a317f7e5d1d3b3a16b6d5f56de (thuth/qemu-kvm-cs)
|
||||
|
||||
For the s390x target, it's possible to build the QEMU binary without
|
||||
CONFIG_VIRTIO_PCI and only have the virtio-mem device via the ccw
|
||||
transport. In that case, QEMU currently fails to link correctly:
|
||||
|
||||
/usr/bin/ld: libqemu-s390x-softmmu.a.p/hw_s390x_s390-virtio-ccw.c.o: in function `s390_machine_device_pre_plug':
|
||||
../hw/s390x/s390-virtio-ccw.c:579:(.text+0x1e96): undefined reference to `virtio_md_pci_pre_plug'
|
||||
/usr/bin/ld: libqemu-s390x-softmmu.a.p/hw_s390x_s390-virtio-ccw.c.o: in function `s390_machine_device_plug':
|
||||
../hw/s390x/s390-virtio-ccw.c:608:(.text+0x21a4): undefined reference to `virtio_md_pci_plug'
|
||||
/usr/bin/ld: libqemu-s390x-softmmu.a.p/hw_s390x_s390-virtio-ccw.c.o: in function `s390_machine_device_unplug_request':
|
||||
../hw/s390x/s390-virtio-ccw.c:622:(.text+0x2334): undefined reference to `virtio_md_pci_unplug_request'
|
||||
/usr/bin/ld: libqemu-s390x-softmmu.a.p/hw_s390x_s390-virtio-ccw.c.o: in function `s390_machine_device_unplug':
|
||||
../hw/s390x/s390-virtio-ccw.c:633:(.text+0x2436): undefined reference to `virtio_md_pci_unplug'
|
||||
clang: error: linker command failed with exit code 1 (use -v to see invocation)
|
||||
|
||||
We also need to include the stubs when CONFIG_VIRTIO_PCI is missing.
|
||||
|
||||
Fixes: aa910c20ec5 ("s390x: virtio-mem support")
|
||||
Message-ID: <20250313063522.1348288-1-thuth@redhat.com>
|
||||
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
(cherry picked from commit c1a6bff276ca52ffde472532d92bb5bb122dab3f)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
hw/virtio/meson.build | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
|
||||
index c38bdd6fa4..e2f9c75625 100644
|
||||
--- a/hw/virtio/meson.build
|
||||
+++ b/hw/virtio/meson.build
|
||||
@@ -89,7 +89,8 @@ specific_virtio_ss.add_all(when: 'CONFIG_VIRTIO_PCI', if_true: virtio_pci_ss)
|
||||
system_ss.add_all(when: 'CONFIG_VIRTIO', if_true: system_virtio_ss)
|
||||
system_ss.add(when: 'CONFIG_VIRTIO', if_false: files('vhost-stub.c'))
|
||||
system_ss.add(when: 'CONFIG_VIRTIO', if_false: files('virtio-stub.c'))
|
||||
-system_ss.add(when: 'CONFIG_VIRTIO_MD', if_false: files('virtio-md-stubs.c'))
|
||||
+system_ss.add(when: ['CONFIG_VIRTIO_MD', 'CONFIG_VIRTIO_PCI'],
|
||||
+ if_false: files('virtio-md-stubs.c'))
|
||||
|
||||
system_ss.add(files('virtio-hmp-cmds.c'))
|
||||
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,73 @@
|
||||
From 4727c044a09fb8c4fb6d667f26eb55bb6de7554d Mon Sep 17 00:00:00 2001
|
||||
From: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:40 +0100
|
||||
Subject: [PATCH 28/57] hw/virtio/virtio: avoid cost of -ftrivial-auto-var-init
|
||||
in hot path
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [2/30] 1c2cc6292deaaac068f4514439703c22c9ccb300 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
Since commit 7ff9ff039380 ("meson: mitigate against use of uninitialize
|
||||
stack for exploits") the -ftrivial-auto-var-init=zero compiler option is
|
||||
used to zero local variables. While this reduces security risks
|
||||
associated with uninitialized stack data, it introduced a measurable
|
||||
bottleneck in the virtqueue_split_pop() and virtqueue_packed_pop()
|
||||
functions.
|
||||
|
||||
These virtqueue functions are in the hot path. They are called for each
|
||||
element (request) that is popped from a VIRTIO device's virtqueue. Using
|
||||
__attribute__((uninitialized)) on large stack variables in these
|
||||
functions improves fio randread bs=4k iodepth=64 performance from 304k
|
||||
to 332k IOPS (+9%).
|
||||
|
||||
This issue was found using perf-top(1). virtqueue_split_pop() was one of
|
||||
the top CPU consumers and the "annotate" feature showed that the memory
|
||||
zeroing instructions at the beginning of the functions were hot.
|
||||
|
||||
Fixes: 7ff9ff039380 ("meson: mitigate against use of uninitialize stack for exploits")
|
||||
Cc: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-id: 20250610123709.835102-3-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit ba2868ce091cd4abe4be6de4b7e44b3be303b352)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
hw/virtio/virtio.c | 8 ++++----
|
||||
1 file changed, 4 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
|
||||
index 10f24a58dd..7f7b178a50 100644
|
||||
--- a/hw/virtio/virtio.c
|
||||
+++ b/hw/virtio/virtio.c
|
||||
@@ -1680,8 +1680,8 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
|
||||
VirtIODevice *vdev = vq->vdev;
|
||||
VirtQueueElement *elem = NULL;
|
||||
unsigned out_num, in_num, elem_entries;
|
||||
- hwaddr addr[VIRTQUEUE_MAX_SIZE];
|
||||
- struct iovec iov[VIRTQUEUE_MAX_SIZE];
|
||||
+ hwaddr QEMU_UNINITIALIZED addr[VIRTQUEUE_MAX_SIZE];
|
||||
+ struct iovec QEMU_UNINITIALIZED iov[VIRTQUEUE_MAX_SIZE];
|
||||
VRingDesc desc;
|
||||
int rc;
|
||||
|
||||
@@ -1826,8 +1826,8 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz)
|
||||
VirtIODevice *vdev = vq->vdev;
|
||||
VirtQueueElement *elem = NULL;
|
||||
unsigned out_num, in_num, elem_entries;
|
||||
- hwaddr addr[VIRTQUEUE_MAX_SIZE];
|
||||
- struct iovec iov[VIRTQUEUE_MAX_SIZE];
|
||||
+ hwaddr QEMU_UNINITIALIZED addr[VIRTQUEUE_MAX_SIZE];
|
||||
+ struct iovec QEMU_UNINITIALIZED iov[VIRTQUEUE_MAX_SIZE];
|
||||
VRingPackedDesc desc;
|
||||
uint16_t id;
|
||||
int rc;
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,96 @@
|
||||
From 9ca5d7ac4f0ff5f10bf424df8104fe5abe01e431 Mon Sep 17 00:00:00 2001
|
||||
From: Eric Auger <eric.auger@redhat.com>
|
||||
Date: Tue, 18 Feb 2025 19:25:31 +0100
|
||||
Subject: [PATCH 1/9] hw/virtio/virtio-iommu: Migrate to 3-phase reset
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Eric Auger <eric.auger@redhat.com>
|
||||
RH-MergeRequest: 341: Fix vIOMMU reset order
|
||||
RH-Jira: RHEL-7188
|
||||
RH-Acked-by: Peter Xu <peterx@redhat.com>
|
||||
RH-Acked-by: Donald Dutile <None>
|
||||
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
|
||||
RH-Commit: [1/5] 32bf47497d5d4817a448d07ffa7a844aee82ae3c (eauger1/centos-qemu-kvm)
|
||||
|
||||
Currently the iommu may be reset before the devices
|
||||
it protects. For example this happens with virtio-net.
|
||||
|
||||
Let's use the 3-phase reset mechanism and reset the IOMMU on
|
||||
exit phase after all DMA capable devices have been
|
||||
reset during the 'enter' or 'hold' phase.
|
||||
|
||||
Signed-off-by: Eric Auger <eric.auger@redhat.com>
|
||||
Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
|
||||
Acked-by: Jason Wang <jasowang@redhat.com>
|
||||
|
||||
Message-Id: <20250218182737.76722-2-eric.auger@redhat.com>
|
||||
Reviewed-by: Peter Xu <peterx@redhat.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
(cherry picked from commit d261b84d354a41a38336af813f92f636d3fb3f78)
|
||||
Signed-off-by: Eric Auger <eric.auger@redhat.com>
|
||||
---
|
||||
hw/virtio/trace-events | 2 +-
|
||||
hw/virtio/virtio-iommu.c | 14 ++++++++++----
|
||||
2 files changed, 11 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
|
||||
index 04e36ae047..76f0d458b2 100644
|
||||
--- a/hw/virtio/trace-events
|
||||
+++ b/hw/virtio/trace-events
|
||||
@@ -108,7 +108,7 @@ virtio_pci_notify_write(uint64_t addr, uint64_t val, unsigned int size) "0x%" PR
|
||||
virtio_pci_notify_write_pio(uint64_t addr, uint64_t val, unsigned int size) "0x%" PRIx64" = 0x%" PRIx64 " (%d)"
|
||||
|
||||
# hw/virtio/virtio-iommu.c
|
||||
-virtio_iommu_device_reset(void) "reset!"
|
||||
+virtio_iommu_device_reset_exit(void) "reset!"
|
||||
virtio_iommu_system_reset(void) "system reset!"
|
||||
virtio_iommu_get_features(uint64_t features) "device supports features=0x%"PRIx64
|
||||
virtio_iommu_device_status(uint8_t status) "driver status = %d"
|
||||
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
|
||||
index 59ef4fb217..496200ebc5 100644
|
||||
--- a/hw/virtio/virtio-iommu.c
|
||||
+++ b/hw/virtio/virtio-iommu.c
|
||||
@@ -1504,11 +1504,11 @@ static void virtio_iommu_device_unrealize(DeviceState *dev)
|
||||
virtio_cleanup(vdev);
|
||||
}
|
||||
|
||||
-static void virtio_iommu_device_reset(VirtIODevice *vdev)
|
||||
+static void virtio_iommu_device_reset_exit(Object *obj, ResetType type)
|
||||
{
|
||||
- VirtIOIOMMU *s = VIRTIO_IOMMU(vdev);
|
||||
+ VirtIOIOMMU *s = VIRTIO_IOMMU(obj);
|
||||
|
||||
- trace_virtio_iommu_device_reset();
|
||||
+ trace_virtio_iommu_device_reset_exit();
|
||||
|
||||
if (s->domains) {
|
||||
g_tree_destroy(s->domains);
|
||||
@@ -1669,6 +1669,7 @@ static void virtio_iommu_class_init(ObjectClass *klass, void *data)
|
||||
{
|
||||
DeviceClass *dc = DEVICE_CLASS(klass);
|
||||
VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
|
||||
+ ResettableClass *rc = RESETTABLE_CLASS(klass);
|
||||
|
||||
device_class_set_props(dc, virtio_iommu_properties);
|
||||
dc->vmsd = &vmstate_virtio_iommu;
|
||||
@@ -1676,7 +1677,12 @@ static void virtio_iommu_class_init(ObjectClass *klass, void *data)
|
||||
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
|
||||
vdc->realize = virtio_iommu_device_realize;
|
||||
vdc->unrealize = virtio_iommu_device_unrealize;
|
||||
- vdc->reset = virtio_iommu_device_reset;
|
||||
+
|
||||
+ /*
|
||||
+ * Use 'exit' reset phase to make sure all DMA requests
|
||||
+ * have been quiesced during 'enter' or 'hold' phase
|
||||
+ */
|
||||
+ rc->phases.exit = virtio_iommu_device_reset_exit;
|
||||
vdc->get_config = virtio_iommu_get_config;
|
||||
vdc->set_config = virtio_iommu_set_config;
|
||||
vdc->get_features = virtio_iommu_get_features;
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,80 @@
|
||||
From cf92fd8487195ac45bfbdad15168eaec70f3aaa9 Mon Sep 17 00:00:00 2001
|
||||
From: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:36:39 +0100
|
||||
Subject: [PATCH 27/57] include/qemu/compiler: add QEMU_UNINITIALIZED attribute
|
||||
macro
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [1/30] 43c2412d318b6d8e0dcb0b37340640a9d90c3188 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The QEMU_UNINITIALIZED macro is to be used to skip the default compiler
|
||||
variable initialization done by -ftrivial-auto-var-init=zero.
|
||||
|
||||
Use this in cases where there is a method in the device I/O path (or other
|
||||
important hot paths), that has large variables on the stack. A rule of
|
||||
thumb is that "large" means a method with 4kb data in the local stack
|
||||
frame. Any variables which are KB in size, should be annotated with this
|
||||
attribute, to pre-emptively eliminate any potential overhead from the
|
||||
compiler zero'ing memory.
|
||||
|
||||
Given that this turns off a security hardening feature, when using this
|
||||
to flag variables, it is important that the code is double-checked to
|
||||
ensure there is no possible use of uninitialized data in the method.
|
||||
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Message-id: 20250610123709.835102-2-berrange@redhat.com
|
||||
[DB: split off patch & rewrite guidance on when to use the annotation]
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit c653b67d1863b7ebfa67f7c9f4aec209d7b5ced5)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
|
||||
Conflicts:
|
||||
include/qemu/compiler.h
|
||||
Context conflict due to clang Thread Safety Analysis macros.
|
||||
---
|
||||
include/qemu/compiler.h | 20 ++++++++++++++++++++
|
||||
1 file changed, 20 insertions(+)
|
||||
|
||||
diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h
|
||||
index c06954ccb4..cc193d5b82 100644
|
||||
--- a/include/qemu/compiler.h
|
||||
+++ b/include/qemu/compiler.h
|
||||
@@ -212,6 +212,26 @@
|
||||
# define QEMU_USED
|
||||
#endif
|
||||
|
||||
+/*
|
||||
+ * Disable -ftrivial-auto-var-init on a local variable.
|
||||
+ *
|
||||
+ * Use this in cases where there is a method in the device I/O path (or other
|
||||
+ * important hot paths), that has large variables on the stack. A rule of
|
||||
+ * thumb is that "large" means a method with 4kb data in the local stack
|
||||
+ * frame. Any variables which are KB in size, should be annotated with this
|
||||
+ * attribute, to pre-emptively eliminate any potential overhead from the
|
||||
+ * compiler's implicit zero'ing of memory.
|
||||
+ *
|
||||
+ * Given that this turns off a security hardening feature, when using this
|
||||
+ * to flag variables, it is important that the code is double-checked to
|
||||
+ * ensure there is no possible use of uninitialized data in the method.
|
||||
+ */
|
||||
+#if __has_attribute(uninitialized)
|
||||
+# define QEMU_UNINITIALIZED __attribute__((uninitialized))
|
||||
+#else
|
||||
+# define QEMU_UNINITIALIZED
|
||||
+#endif
|
||||
+
|
||||
/*
|
||||
* Ugly CPP trick that is like "defined FOO", but also works in C
|
||||
* code. Useful to replace #ifdef with "if" statements; assumes
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,73 @@
|
||||
From 4545870823aea92b18a7e747b686b666d08006a4 Mon Sep 17 00:00:00 2001
|
||||
From: Juraj Marcin <jmarcin@redhat.com>
|
||||
Date: Wed, 21 May 2025 15:52:30 +0200
|
||||
Subject: [PATCH 08/57] io: Fix partial struct copy in
|
||||
qio_dns_resolver_lookup_sync_inet()
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Juraj Marcin <None>
|
||||
RH-MergeRequest: 369: util/qemu-sockets: Introduce inet socket options controlling TCP keep-alive
|
||||
RH-Jira: RHEL-67104
|
||||
RH-Acked-by: Peter Xu <peterx@redhat.com>
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [1/7] 92c8b3e63c22a3ca6e5adc76cac1a9f812034912 (JurajMarcin/centos-src-qemu-kvm)
|
||||
|
||||
Commit aec21d3175 (qapi: Add InetSocketAddress member keep-alive)
|
||||
introduces the keep-alive flag, but this flag is not copied together
|
||||
with other options in qio_dns_resolver_lookup_sync_inet().
|
||||
|
||||
This patch fixes this issue and also prevents future ones by copying the
|
||||
entire structure first and only then overriding a few attributes that
|
||||
need to be different.
|
||||
|
||||
Fixes: aec21d31756c (qapi: Add InetSocketAddress member keep-alive)
|
||||
Signed-off-by: Juraj Marcin <jmarcin@redhat.com>
|
||||
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
|
||||
(cherry picked from commit 0dc051aa85e1bd68d5c5110fa8af69204e6dbd3d)
|
||||
|
||||
JIRA: https://issues.redhat.com/browse/RHEL-67104
|
||||
|
||||
Signed-off-by: Juraj Marcin <jmarcin@redhat.com>
|
||||
---
|
||||
io/dns-resolver.c | 21 +++++----------------
|
||||
1 file changed, 5 insertions(+), 16 deletions(-)
|
||||
|
||||
diff --git a/io/dns-resolver.c b/io/dns-resolver.c
|
||||
index 53b0e8407a..3712438f82 100644
|
||||
--- a/io/dns-resolver.c
|
||||
+++ b/io/dns-resolver.c
|
||||
@@ -111,22 +111,11 @@ static int qio_dns_resolver_lookup_sync_inet(QIODNSResolver *resolver,
|
||||
uaddr, INET6_ADDRSTRLEN, uport, 32,
|
||||
NI_NUMERICHOST | NI_NUMERICSERV);
|
||||
|
||||
- newaddr->u.inet = (InetSocketAddress){
|
||||
- .host = g_strdup(uaddr),
|
||||
- .port = g_strdup(uport),
|
||||
- .has_numeric = true,
|
||||
- .numeric = true,
|
||||
- .has_to = iaddr->has_to,
|
||||
- .to = iaddr->to,
|
||||
- .has_ipv4 = iaddr->has_ipv4,
|
||||
- .ipv4 = iaddr->ipv4,
|
||||
- .has_ipv6 = iaddr->has_ipv6,
|
||||
- .ipv6 = iaddr->ipv6,
|
||||
-#ifdef HAVE_IPPROTO_MPTCP
|
||||
- .has_mptcp = iaddr->has_mptcp,
|
||||
- .mptcp = iaddr->mptcp,
|
||||
-#endif
|
||||
- };
|
||||
+ newaddr->u.inet = *iaddr;
|
||||
+ newaddr->u.inet.host = g_strdup(uaddr),
|
||||
+ newaddr->u.inet.port = g_strdup(uport),
|
||||
+ newaddr->u.inet.has_numeric = true,
|
||||
+ newaddr->u.inet.numeric = true,
|
||||
|
||||
(*addrs)[i] = newaddr;
|
||||
}
|
||||
--
|
||||
2.39.3
|
||||
|
||||
42
SOURCES/kvm-iotests-Improve-iotest-194-to-mirror-data.patch
Normal file
42
SOURCES/kvm-iotests-Improve-iotest-194-to-mirror-data.patch
Normal file
@ -0,0 +1,42 @@
|
||||
From 8832268a98104ba3065a57dedcd3db43231512ba Mon Sep 17 00:00:00 2001
|
||||
From: Eric Blake <eblake@redhat.com>
|
||||
Date: Fri, 9 May 2025 15:40:22 -0500
|
||||
Subject: [PATCH 07/16] iotests: Improve iotest 194 to mirror data
|
||||
|
||||
RH-Author: Eric Blake <eblake@redhat.com>
|
||||
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
|
||||
RH-Jira: RHEL-82906 RHEL-83015
|
||||
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [5/14] bfbe8eab1035480cef9d69d1974ba66b755b1b60 (ebblake/centos-qemu-kvm)
|
||||
|
||||
Mirroring a completely sparse image to a sparse destination should be
|
||||
practically instantaneous. It isn't yet, but the test will be more
|
||||
realistic if it has some non-zero to mirror as well as the holes.
|
||||
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-ID: <20250509204341.3553601-20-eblake@redhat.com>
|
||||
(cherry picked from commit eb89627899bb84148d272394e885725eff456ae9)
|
||||
Jira: https://issues.redhat.com/browse/RHEL-82906
|
||||
Jira: https://issues.redhat.com/browse/RHEL-83015
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
---
|
||||
tests/qemu-iotests/194 | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/tests/qemu-iotests/194 b/tests/qemu-iotests/194
|
||||
index c0ce82dd25..d0b9c084f5 100755
|
||||
--- a/tests/qemu-iotests/194
|
||||
+++ b/tests/qemu-iotests/194
|
||||
@@ -34,6 +34,7 @@ with iotests.FilePath('source.img') as source_img_path, \
|
||||
|
||||
img_size = '1G'
|
||||
iotests.qemu_img_create('-f', iotests.imgfmt, source_img_path, img_size)
|
||||
+ iotests.qemu_io('-f', iotests.imgfmt, '-c', 'write 512M 1M', source_img_path)
|
||||
iotests.qemu_img_create('-f', iotests.imgfmt, dest_img_path, img_size)
|
||||
|
||||
iotests.log('Launching VMs...')
|
||||
--
|
||||
2.48.1
|
||||
|
||||
68
SOURCES/kvm-iotests-common.rc-add-disk_usage-function.patch
Normal file
68
SOURCES/kvm-iotests-common.rc-add-disk_usage-function.patch
Normal file
@ -0,0 +1,68 @@
|
||||
From 644f39de9e2466a9570833b1070acf47a53863ea Mon Sep 17 00:00:00 2001
|
||||
From: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
|
||||
Date: Fri, 9 May 2025 15:40:29 -0500
|
||||
Subject: [PATCH 14/16] iotests/common.rc: add disk_usage function
|
||||
|
||||
RH-Author: Eric Blake <eblake@redhat.com>
|
||||
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
|
||||
RH-Jira: RHEL-82906 RHEL-83015
|
||||
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [12/14] 0e5d4217f97fe6e952de23eedbc2b8d9c7600665 (ebblake/centos-qemu-kvm)
|
||||
|
||||
Move the definition from iotests/250 to common.rc. This is used to
|
||||
detect real disk usage of sparse files. In particular, we want to use
|
||||
it for checking subclusters-based discards.
|
||||
|
||||
Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
|
||||
Reviewed-by: Alexander Ivanov <alexander.ivanov@virtuozzo.com>
|
||||
Reviewed-by: Alberto Garcia <berto@igalia.com>
|
||||
Message-ID: <20240913163942.423050-6-andrey.drobyshev@virtuozzo.com>
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-ID: <20250509204341.3553601-27-eblake@redhat.com>
|
||||
(cherry picked from commit be9bac072ede6e6aa27079f59efcf17b56bd7b26)
|
||||
Jira: https://issues.redhat.com/browse/RHEL-82906
|
||||
Jira: https://issues.redhat.com/browse/RHEL-83015
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
---
|
||||
tests/qemu-iotests/250 | 5 -----
|
||||
tests/qemu-iotests/common.rc | 6 ++++++
|
||||
2 files changed, 6 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/tests/qemu-iotests/250 b/tests/qemu-iotests/250
|
||||
index af48f83aba..c0a0dbc0ff 100755
|
||||
--- a/tests/qemu-iotests/250
|
||||
+++ b/tests/qemu-iotests/250
|
||||
@@ -52,11 +52,6 @@ _unsupported_imgopts data_file
|
||||
# bdrv_co_truncate(bs->file) call in qcow2_co_truncate(), which might succeed
|
||||
# anyway.
|
||||
|
||||
-disk_usage()
|
||||
-{
|
||||
- du --block-size=1 $1 | awk '{print $1}'
|
||||
-}
|
||||
-
|
||||
size=2100M
|
||||
|
||||
_make_test_img -o "cluster_size=1M,preallocation=metadata" $size
|
||||
diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
|
||||
index 95c12577dd..237f746af8 100644
|
||||
--- a/tests/qemu-iotests/common.rc
|
||||
+++ b/tests/qemu-iotests/common.rc
|
||||
@@ -140,6 +140,12 @@ _optstr_add()
|
||||
fi
|
||||
}
|
||||
|
||||
+# report real disk usage for sparse files
|
||||
+disk_usage()
|
||||
+{
|
||||
+ du --block-size=1 "$1" | awk '{print $1}'
|
||||
+}
|
||||
+
|
||||
# Set the variables to the empty string to turn Valgrind off
|
||||
# for specific processes, e.g.
|
||||
# $ VALGRIND_QEMU_IO= ./check -qcow2 -valgrind 015
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,110 @@
|
||||
From 0277328b5a2d1df5d9843423ab5f5fa9481bad79 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Fri, 25 Apr 2025 13:17:12 +0100
|
||||
Subject: [PATCH 1/5] meson/configure: add 'valgrind' option & --{en,
|
||||
dis}able-valgrind flag
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Daniel P. Berrangé <berrange@redhat.com>
|
||||
RH-MergeRequest: 359: distro: add an explicit valgrind-devel build dep
|
||||
RH-Jira: RHEL-88153
|
||||
RH-Acked-by: Eric Blake <eblake@redhat.com>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [1/2] ba9bc44ef9cef6fa76e2092500608575f223f1f7 (berrange/centos-src-qemu)
|
||||
|
||||
Currently valgrind debugging support for coroutine stacks is enabled
|
||||
unconditionally when valgrind/valgrind.h is found. There is no way
|
||||
to disable valgrind support if valgrind.h is present in the build env.
|
||||
|
||||
This is bad for distros, as a dependency far down the chain may cause
|
||||
valgrind.h to become installed, inadvertently enabling QEMU's valgrind
|
||||
debugging support. It also means if a distro wants valgrind support
|
||||
there is no way to mandate this.
|
||||
|
||||
The solution is to add a 'valgrind' build feature to meson and thus
|
||||
configure script.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Thomas Huth <thuth@redhat.com>
|
||||
Message-ID: <20250425121713.1913424-1-berrange@redhat.com>
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
(cherry picked from commit 6b1c744ec0d66d6d568f9a156282153fc11a21cf)
|
||||
|
||||
Conflicts:
|
||||
meson.build - context from upstream is not present in older tree
|
||||
---
|
||||
meson.build | 13 ++++++++++++-
|
||||
meson_options.txt | 2 ++
|
||||
scripts/meson-buildoptions.sh | 3 +++
|
||||
3 files changed, 17 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/meson.build b/meson.build
|
||||
index 1dd97c6f49..5bb2b757c3 100644
|
||||
--- a/meson.build
|
||||
+++ b/meson.build
|
||||
@@ -2463,7 +2463,17 @@ config_host_data.set('CONFIG_FSTRIM', qga_fstrim)
|
||||
# has_header
|
||||
config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h'))
|
||||
config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h'))
|
||||
-config_host_data.set('CONFIG_VALGRIND_H', cc.has_header('valgrind/valgrind.h'))
|
||||
+valgrind = false
|
||||
+if get_option('valgrind').allowed()
|
||||
+ if cc.has_header('valgrind/valgrind.h')
|
||||
+ valgrind = true
|
||||
+ else
|
||||
+ if get_option('valgrind').enabled()
|
||||
+ error('valgrind requested but valgrind.h not found')
|
||||
+ endif
|
||||
+ endif
|
||||
+endif
|
||||
+config_host_data.set('CONFIG_VALGRIND_H', valgrind)
|
||||
config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h'))
|
||||
config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h'))
|
||||
config_host_data.set('HAVE_PTY_H', cc.has_header('pty.h'))
|
||||
@@ -4549,6 +4559,7 @@ summary_info += {'libdw': libdw}
|
||||
if host_os == 'freebsd'
|
||||
summary_info += {'libinotify-kqueue': inotify}
|
||||
endif
|
||||
+summary_info += {'valgrind': valgrind}
|
||||
summary(summary_info, bool_yn: true, section: 'Dependencies')
|
||||
|
||||
if host_arch == 'unknown'
|
||||
diff --git a/meson_options.txt b/meson_options.txt
|
||||
index aa2ba0baef..da06441fdf 100644
|
||||
--- a/meson_options.txt
|
||||
+++ b/meson_options.txt
|
||||
@@ -113,6 +113,8 @@ option('dbus_display', type: 'feature', value: 'auto',
|
||||
description: '-display dbus support')
|
||||
option('tpm', type : 'feature', value : 'auto',
|
||||
description: 'TPM support')
|
||||
+option('valgrind', type : 'feature', value: 'auto',
|
||||
+ description: 'valgrind debug support for coroutine stacks')
|
||||
|
||||
# Do not enable it by default even for Mingw32, because it doesn't
|
||||
# work on Wine.
|
||||
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
|
||||
index 5f0cbfc725..251470ea6d 100644
|
||||
--- a/scripts/meson-buildoptions.sh
|
||||
+++ b/scripts/meson-buildoptions.sh
|
||||
@@ -191,6 +191,7 @@ meson_options_help() {
|
||||
printf "%s\n" ' u2f U2F emulation support'
|
||||
printf "%s\n" ' uadk UADK Library support'
|
||||
printf "%s\n" ' usb-redir libusbredir support'
|
||||
+ printf "%s\n" ' valgrind valgrind debug support for coroutine stacks'
|
||||
printf "%s\n" ' vde vde network backend support'
|
||||
printf "%s\n" ' vdi vdi image format support'
|
||||
printf "%s\n" ' vduse-blk-export'
|
||||
@@ -509,6 +510,8 @@ _meson_option_parse() {
|
||||
--disable-uadk) printf "%s" -Duadk=disabled ;;
|
||||
--enable-usb-redir) printf "%s" -Dusb_redir=enabled ;;
|
||||
--disable-usb-redir) printf "%s" -Dusb_redir=disabled ;;
|
||||
+ --enable-valgrind) printf "%s" -Dvalgrind=enabled ;;
|
||||
+ --disable-valgrind) printf "%s" -Dvalgrind=disabled ;;
|
||||
--enable-vde) printf "%s" -Dvde=enabled ;;
|
||||
--disable-vde) printf "%s" -Dvde=disabled ;;
|
||||
--enable-vdi) printf "%s" -Dvdi=enabled ;;
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,180 @@
|
||||
From 5d7d7a2ec6301f4d0b0dbea4fbdcab4e41a9cf07 Mon Sep 17 00:00:00 2001
|
||||
From: Peter Xu <peterx@redhat.com>
|
||||
Date: Thu, 20 Feb 2025 08:24:59 -0500
|
||||
Subject: [PATCH 7/9] migration: Fix UAF for incoming migration on
|
||||
MigrationState
|
||||
|
||||
RH-Author: Peter Xu <peterx@redhat.com>
|
||||
RH-MergeRequest: 344: migration: Fix UAF for incoming migration on MigrationState
|
||||
RH-Jira: RHEL-69775
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [1/1] 106e2b4c1c461202c912b5e3ea7e586c4ab05d8c (peterx/qemu-kvm)
|
||||
|
||||
On the incoming migration side, QEMU uses a coroutine to load all the VM
|
||||
states. Inside, it may reference MigrationState on global states like
|
||||
migration capabilities, parameters, error state, shared mutexes and more.
|
||||
|
||||
However there's nothing yet to make sure MigrationState won't get
|
||||
destroyed (e.g. after migration_shutdown()). Meanwhile there's also no API
|
||||
available to remove the incoming coroutine in migration_shutdown(),
|
||||
preventing it from accessing the freed elements.
|
||||
|
||||
There's a bug report showing this can happen and crash dest QEMU when
|
||||
migration is cancelled on source.
|
||||
|
||||
When it happens, the dest main thread is trying to cleanup everything:
|
||||
|
||||
#0 qemu_aio_coroutine_enter
|
||||
#1 aio_dispatch_handler
|
||||
#2 aio_poll
|
||||
#3 monitor_cleanup
|
||||
#4 qemu_cleanup
|
||||
#5 qemu_default_main
|
||||
|
||||
Then it finds the incoming migration coroutine and schedules it (even after
|
||||
migration_shutdown()), causing crash:
|
||||
|
||||
#0 __pthread_kill_implementation
|
||||
#1 __pthread_kill_internal
|
||||
#2 __GI_raise
|
||||
#3 __GI_abort
|
||||
#4 __assert_fail_base
|
||||
#5 __assert_fail
|
||||
#6 qemu_mutex_lock_impl
|
||||
#7 qemu_lockable_mutex_lock
|
||||
#8 qemu_lockable_lock
|
||||
#9 qemu_lockable_auto_lock
|
||||
#10 migrate_set_error
|
||||
#11 process_incoming_migration_co
|
||||
#12 coroutine_trampoline
|
||||
|
||||
To fix it, take a refcount after an incoming setup is properly done when
|
||||
qmp_migrate_incoming() succeeded the 1st time. As it's during a QMP
|
||||
handler which needs BQL, it means the main loop is still alive (without
|
||||
going into cleanups, which also needs BQL).
|
||||
|
||||
The refcount is now released only after the incoming migration coroutine
|
||||
finished or failed. Hence the refcount is valid for both (1) setup phase
|
||||
of incoming ports, mostly IO watches (e.g. qio_channel_add_watch_full()),
|
||||
and (2) the incoming coroutine itself (process_incoming_migration_co()).
|
||||
|
||||
Note that we can't unref in migration_incoming_state_destroy(), because
|
||||
both qmp_xen_load_devices_state() and load_snapshot() will use it without
|
||||
an incoming migration. Those hold BQL so they're not prone to this issue.
|
||||
|
||||
PS: I suspect nobody uses Xen's command at all, as it didn't register yank,
|
||||
hence AFAIU the command should crash on master when trying to unregister
|
||||
yank in migration_incoming_state_destroy().. but that's another story.
|
||||
|
||||
Also note that in some incoming failure cases we may not always unref the
|
||||
MigrationState refcount, which is a trade-off to keep things simple. We
|
||||
could make it accurate, but it can be an overkill. Some examples:
|
||||
|
||||
- Unlike most of the rest protocols, socket_start_incoming_migration()
|
||||
may create net listener after incoming port setup successfully.
|
||||
It means we can't unref in migration_channel_process_incoming() as a
|
||||
generic path because socket protocol might keep using MigrationState.
|
||||
|
||||
- For either socket or file, multiple IO watches might be created, it
|
||||
means logically each IO watch needs to take one refcount for
|
||||
MigrationState so as to be 100% accurate on ownership of refcount taken.
|
||||
|
||||
In general, we at least need per-protocol handling to make it accurate,
|
||||
which can be an overkill if we know incoming failed after all. Add a short
|
||||
comment to explain that when taking the refcount in qmp_migrate_incoming().
|
||||
|
||||
Bugzilla: https://issues.redhat.com/browse/RHEL-69775
|
||||
Tested-by: Yan Fu <yafu@redhat.com>
|
||||
Signed-off-by: Peter Xu <peterx@redhat.com>
|
||||
Reviewed-by: Fabiano Rosas <farosas@suse.de>
|
||||
Message-ID: <20250220132459.512610-1-peterx@redhat.com>
|
||||
Signed-off-by: Fabiano Rosas <farosas@suse.de>
|
||||
(cherry picked from commit d657a14de5d597bbfe7b54e4c4f0646f440e98ad)
|
||||
Signed-off-by: Peter Xu <peterx@redhat.com>
|
||||
---
|
||||
migration/migration.c | 40 ++++++++++++++++++++++++++++++++++++++--
|
||||
1 file changed, 38 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/migration/migration.c b/migration/migration.c
|
||||
index 999d4cac54..aabdc45c16 100644
|
||||
--- a/migration/migration.c
|
||||
+++ b/migration/migration.c
|
||||
@@ -115,6 +115,27 @@ static void migration_downtime_start(MigrationState *s)
|
||||
s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * This is unfortunate: incoming migration actually needs the outgoing
|
||||
+ * migration state (MigrationState) to be there too, e.g. to query
|
||||
+ * capabilities, parameters, using locks, setup errors, etc.
|
||||
+ *
|
||||
+ * NOTE: when calling this, making sure current_migration exists and not
|
||||
+ * been freed yet! Otherwise trying to access the refcount is already
|
||||
+ * an use-after-free itself..
|
||||
+ *
|
||||
+ * TODO: Move shared part of incoming / outgoing out into separate object.
|
||||
+ * Then this is not needed.
|
||||
+ */
|
||||
+static void migrate_incoming_ref_outgoing_state(void)
|
||||
+{
|
||||
+ object_ref(migrate_get_current());
|
||||
+}
|
||||
+static void migrate_incoming_unref_outgoing_state(void)
|
||||
+{
|
||||
+ object_unref(migrate_get_current());
|
||||
+}
|
||||
+
|
||||
static void migration_downtime_end(MigrationState *s)
|
||||
{
|
||||
int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
|
||||
@@ -821,7 +842,7 @@ process_incoming_migration_co(void *opaque)
|
||||
* postcopy thread.
|
||||
*/
|
||||
trace_process_incoming_migration_co_postcopy_end_main();
|
||||
- return;
|
||||
+ goto out;
|
||||
}
|
||||
/* Else if something went wrong then just fall out of the normal exit */
|
||||
}
|
||||
@@ -837,7 +858,8 @@ process_incoming_migration_co(void *opaque)
|
||||
}
|
||||
|
||||
migration_bh_schedule(process_incoming_migration_bh, mis);
|
||||
- return;
|
||||
+ goto out;
|
||||
+
|
||||
fail:
|
||||
migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
|
||||
MIGRATION_STATUS_FAILED);
|
||||
@@ -854,6 +876,9 @@ fail:
|
||||
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
+out:
|
||||
+ /* Pairs with the refcount taken in qmp_migrate_incoming() */
|
||||
+ migrate_incoming_unref_outgoing_state();
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1875,6 +1900,17 @@ void qmp_migrate_incoming(const char *uri, bool has_channels,
|
||||
return;
|
||||
}
|
||||
|
||||
+ /*
|
||||
+ * Making sure MigrationState is available until incoming migration
|
||||
+ * completes.
|
||||
+ *
|
||||
+ * NOTE: QEMU _might_ leak this refcount in some failure paths, but
|
||||
+ * that's OK. This is the minimum change we need to at least making
|
||||
+ * sure success case is clean on the refcount. We can try harder to
|
||||
+ * make it accurate for any kind of failures, but it might be an
|
||||
+ * overkill and doesn't bring us much benefit.
|
||||
+ */
|
||||
+ migrate_incoming_ref_outgoing_state();
|
||||
once = false;
|
||||
}
|
||||
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,237 @@
|
||||
From 1fa31324da8ebba64a44c1e9b64f7e59c29f3d75 Mon Sep 17 00:00:00 2001
|
||||
From: Peter Xu <peterx@redhat.com>
|
||||
Date: Thu, 24 Apr 2025 18:07:05 -0400
|
||||
Subject: [PATCH 1/2] migration/postcopy: Spatial locality page hint for
|
||||
preempt mode
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Peter Xu <peterx@redhat.com>
|
||||
RH-MergeRequest: 358: migration/postcopy: Spatial locality page hint for preempt mode
|
||||
RH-Jira: RHEL-85159
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Acked-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
RH-Commit: [1/1] f5bce349c80f98428c73a3898f87d4d10ec2f4bd (peterx/qemu-kvm)
|
||||
|
||||
The preempt mode postcopy has been introduced for a while. From latency
|
||||
POV, it should always beat vanilla postcopy.
|
||||
|
||||
However there's one thing missing when preempt mode is enabled right now,
|
||||
which is the spatial locality hint when there're page requests from the
|
||||
destination side.
|
||||
|
||||
In vanilla postcopy, as long as a page request was unqueued, it will update
|
||||
the PSS of the precopy background stream, so that after a page request the
|
||||
background thread will move the pages after whatever was requested. It's
|
||||
pretty much a natural behavior when there's only one channel anyway, and
|
||||
one scanner to send the pages.
|
||||
|
||||
Preempt mode didn't follow that, because preempt mode has its own channel
|
||||
and its own PSS (which doesn't linearly scan the guest memory, but
|
||||
is dedicated to resolving pages requested from the destination). So the page request
|
||||
process and the background migration process are completely separate.
|
||||
|
||||
This patch adds the hint explicitly for preempt mode. With that, whenever
|
||||
the preempt mode receives a page request on the source, it will service the
|
||||
remote page fault in the return path, then it'll provide a hint to the
|
||||
background thread so that we'll start sending the pages right after the
|
||||
requested ones in the background, assuming the follow up pages have a
|
||||
higher chance to be accessed later.
|
||||
|
||||
NOTE: since the background migration thread and return path thread run
|
||||
completely concurrently, it doesn't always mean the hint will be applied
|
||||
every single time. For example, it's possible that the return path thread
|
||||
receives multiple page requests in a row without the background thread
|
||||
getting the chance to consume one. In such case, the preempt thread only
|
||||
provides the hint if the previous hint has been consumed. After all,
|
||||
there's no point queuing hints when we only have one linear scanner.
|
||||
|
||||
This could measurably improve the simple sequential memory access pattern
|
||||
during postcopy (when preempt is on). For random accesses, I can measure a
|
||||
slight increase of remote page fault latency from ~500us -> ~600us, that
|
||||
could be a trade-off to have such hint mechanism, and after all that's
|
||||
still greatly improved comparing to vanilla postcopy on random (~10ms).
|
||||
|
||||
The patch is verified by our QE team in a video streaming test case, to
|
||||
reduce the pause of the video from ~1min to a few seconds when switching
|
||||
over to postcopy with preempt mode.
|
||||
|
||||
Reported-by: Xiaohui Li <xiaohli@redhat.com>
|
||||
Tested-by: Xiaohui Li <xiaohli@redhat.com>
|
||||
Reviewed-by: Juraj Marcin <jmarcin@redhat.com>
|
||||
Link: https://lore.kernel.org/r/20250424220705.195544-1-peterx@redhat.com
|
||||
Signed-off-by: Peter Xu <peterx@redhat.com>
|
||||
(cherry picked from commit 20d82622812d888478d04a2d0d8575d70eb5d749)
|
||||
Signed-off-by: Peter Xu <peterx@redhat.com>
|
||||
---
|
||||
migration/ram.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++-
|
||||
1 file changed, 96 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/migration/ram.c b/migration/ram.c
|
||||
index edec1a2d07..0803f85b8a 100644
|
||||
--- a/migration/ram.c
|
||||
+++ b/migration/ram.c
|
||||
@@ -112,6 +112,36 @@
|
||||
|
||||
XBZRLECacheStats xbzrle_counters;
|
||||
|
||||
+/*
|
||||
+ * This structure locates a specific location of a guest page. In QEMU,
|
||||
+ * it's described in a tuple of (ramblock, offset).
|
||||
+ */
|
||||
+struct PageLocation {
|
||||
+ RAMBlock *block;
|
||||
+ unsigned long offset;
|
||||
+};
|
||||
+typedef struct PageLocation PageLocation;
|
||||
+
|
||||
+/**
|
||||
+ * PageLocationHint: describes a hint to a page location
|
||||
+ *
|
||||
+ * @valid: set if the hint is valid and to be consumed
|
||||
+ * @location: the hint content
|
||||
+ *
|
||||
+ * In postcopy preempt mode, the urgent channel may provide hints to the
|
||||
+ * background channel, so that QEMU source can try to migrate whatever is
|
||||
+ * right after the requested urgent pages.
|
||||
+ *
|
||||
+ * This is based on the assumption that the VM (already running on the
|
||||
+ * destination side) tends to access the memory with spatial locality.
|
||||
+ * This is also the default behavior of vanilla postcopy (preempt off).
|
||||
+ */
|
||||
+struct PageLocationHint {
|
||||
+ bool valid;
|
||||
+ PageLocation location;
|
||||
+};
|
||||
+typedef struct PageLocationHint PageLocationHint;
|
||||
+
|
||||
/* used by the search for pages to send */
|
||||
struct PageSearchStatus {
|
||||
/* The migration channel used for a specific host page */
|
||||
@@ -414,6 +444,13 @@ struct RAMState {
|
||||
* RAM migration.
|
||||
*/
|
||||
unsigned int postcopy_bmap_sync_requested;
|
||||
+ /*
|
||||
+ * Page hint during postcopy when preempt mode is on. Return path
|
||||
+ * thread sets it, while background migration thread consumes it.
|
||||
+ *
|
||||
+ * Protected by @bitmap_mutex.
|
||||
+ */
|
||||
+ PageLocationHint page_hint;
|
||||
};
|
||||
typedef struct RAMState RAMState;
|
||||
|
||||
@@ -2091,6 +2128,21 @@ static void pss_host_page_finish(PageSearchStatus *pss)
|
||||
pss->host_page_start = pss->host_page_end = 0;
|
||||
}
|
||||
|
||||
+static void ram_page_hint_update(RAMState *rs, PageSearchStatus *pss)
|
||||
+{
|
||||
+ PageLocationHint *hint = &rs->page_hint;
|
||||
+
|
||||
+ /* If there's a pending hint not consumed, don't bother */
|
||||
+ if (hint->valid) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ /* Provide a hint to the background stream otherwise */
|
||||
+ hint->location.block = pss->block;
|
||||
+ hint->location.offset = pss->page;
|
||||
+ hint->valid = true;
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* Send an urgent host page specified by `pss'. Need to be called with
|
||||
* bitmap_mutex held.
|
||||
@@ -2136,6 +2188,7 @@ out:
|
||||
/* For urgent requests, flush immediately if sent */
|
||||
if (sent) {
|
||||
qemu_fflush(pss->pss_channel);
|
||||
+ ram_page_hint_update(rs, pss);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
@@ -2223,6 +2276,30 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
|
||||
return (res < 0 ? res : pages);
|
||||
}
|
||||
|
||||
+static bool ram_page_hint_valid(RAMState *rs)
|
||||
+{
|
||||
+ /* There's only page hint during postcopy preempt mode */
|
||||
+ if (!postcopy_preempt_active()) {
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ return rs->page_hint.valid;
|
||||
+}
|
||||
+
|
||||
+static void ram_page_hint_collect(RAMState *rs, RAMBlock **block,
|
||||
+ unsigned long *page)
|
||||
+{
|
||||
+ PageLocationHint *hint = &rs->page_hint;
|
||||
+
|
||||
+ assert(hint->valid);
|
||||
+
|
||||
+ *block = hint->location.block;
|
||||
+ *page = hint->location.offset;
|
||||
+
|
||||
+ /* Mark the hint consumed */
|
||||
+ hint->valid = false;
|
||||
+}
|
||||
+
|
||||
/**
|
||||
* ram_find_and_save_block: finds a dirty page and sends it to f
|
||||
*
|
||||
@@ -2239,6 +2316,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
|
||||
static int ram_find_and_save_block(RAMState *rs)
|
||||
{
|
||||
PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
|
||||
+ unsigned long next_page;
|
||||
+ RAMBlock *next_block;
|
||||
int pages = 0;
|
||||
|
||||
/* No dirty page as there is zero RAM */
|
||||
@@ -2258,7 +2337,14 @@ static int ram_find_and_save_block(RAMState *rs)
|
||||
rs->last_page = 0;
|
||||
}
|
||||
|
||||
- pss_init(pss, rs->last_seen_block, rs->last_page);
|
||||
+ if (ram_page_hint_valid(rs)) {
|
||||
+ ram_page_hint_collect(rs, &next_block, &next_page);
|
||||
+ } else {
|
||||
+ next_block = rs->last_seen_block;
|
||||
+ next_page = rs->last_page;
|
||||
+ }
|
||||
+
|
||||
+ pss_init(pss, next_block, next_page);
|
||||
|
||||
while (true){
|
||||
if (!get_queued_page(rs, pss)) {
|
||||
@@ -2392,6 +2478,13 @@ static void ram_save_cleanup(void *opaque)
|
||||
migration_ops = NULL;
|
||||
}
|
||||
|
||||
+static void ram_page_hint_reset(PageLocationHint *hint)
|
||||
+{
|
||||
+ hint->location.block = NULL;
|
||||
+ hint->location.offset = 0;
|
||||
+ hint->valid = false;
|
||||
+}
|
||||
+
|
||||
static void ram_state_reset(RAMState *rs)
|
||||
{
|
||||
int i;
|
||||
@@ -2404,6 +2497,8 @@ static void ram_state_reset(RAMState *rs)
|
||||
rs->last_page = 0;
|
||||
rs->last_version = ram_list.version;
|
||||
rs->xbzrle_started = false;
|
||||
+
|
||||
+ ram_page_hint_reset(&rs->page_hint);
|
||||
}
|
||||
|
||||
#define MAX_WAIT 50 /* ms, half buffered_file limit */
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,295 @@
|
||||
From a5f6042a0c80daf3672fa071b724cb05e6f6e928 Mon Sep 17 00:00:00 2001
|
||||
From: Eric Blake <eblake@redhat.com>
|
||||
Date: Fri, 9 May 2025 15:40:25 -0500
|
||||
Subject: [PATCH 10/16] mirror: Allow QMP override to declare target already
|
||||
zero
|
||||
|
||||
RH-Author: Eric Blake <eblake@redhat.com>
|
||||
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
|
||||
RH-Jira: RHEL-82906 RHEL-83015
|
||||
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [8/14] fb054864175d83e9d232464295b170808bee0e6c (ebblake/centos-qemu-kvm)
|
||||
|
||||
QEMU has an optimization for a just-created drive-mirror destination
|
||||
that is not possible for blockdev-mirror (which can't create the
|
||||
destination) - any time we know the destination starts life as all
|
||||
zeroes, we can skip a pre-zeroing pass on the destination. Recent
|
||||
patches have added an improved heuristic for detecting if a file
|
||||
contains all zeroes, and we plan to use that heuristic in upcoming
|
||||
patches. But since a heuristic cannot quickly detect all scenarios,
|
||||
and there may be cases where the caller is aware of information that
|
||||
QEMU cannot learn quickly, it makes sense to have a way to tell QEMU
|
||||
to assume facts about the destination that can make the mirror
|
||||
operation faster. Given our existing example of "qemu-img convert
|
||||
--target-is-zero", it is time to expose this override in QMP for
|
||||
blockdev-mirror as well.
|
||||
|
||||
This patch results in some slight redundancy between the older
|
||||
s->zero_target (set any time mode==FULL and the destination image was
|
||||
not just created - i.e. clear if drive-mirror is asking to skip the
|
||||
pre-zero pass) and the newly-introduced s->target_is_zero (in addition
|
||||
to the QMP override, it is set when drive-mirror creates the
|
||||
destination image); this will be cleaned up in the next patch.
|
||||
|
||||
There is also a subtlety that we must consider. When drive-mirror is
|
||||
passing target_is_zero on behalf of a just-created image, we know the
|
||||
image is sparse (skipping the pre-zeroing keeps it that way), so it
|
||||
doesn't matter whether the destination also has "discard":"unmap" and
|
||||
"detect-zeroes":"unmap". But now that we are letting the user set the
|
||||
knob for target-is-zero, if the user passes a pre-existing file that
|
||||
is fully allocated, it is fine to leave the file fully allocated under
|
||||
"detect-zeroes":"on", but if the file is open with
|
||||
"detect-zeroes":"unmap", we should really be trying harder to punch
|
||||
holes in the destination for every region of zeroes copied from the
|
||||
source. The easiest way to do this is to still run the pre-zeroing
|
||||
pass (turning the entire destination file sparse before populating
|
||||
just the allocated portions of the source), even though that currently
|
||||
results in double I/O to the portions of the file that are allocated.
|
||||
A later patch will add further optimizations to reduce redundant
|
||||
zeroing I/O during the mirror operation.
|
||||
|
||||
Since "target-is-zero":true is designed for optimizations, it is okay
|
||||
to silently ignore the parameter rather than erroring if the user ever
|
||||
sets the parameter in a scenario where the mirror job can't exploit it
|
||||
(for example, when doing "sync":"top" instead of "sync":"full", we
|
||||
can't pre-zero, so setting the parameter won't make a speed
|
||||
difference).
|
||||
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
Acked-by: Markus Armbruster <armbru@redhat.com>
|
||||
Message-ID: <20250509204341.3553601-23-eblake@redhat.com>
|
||||
Reviewed-by: Sunny Zhu <sunnyzhyy@qq.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit d17a34bfb94bda3a89d7320ae67255ded1d8c939)
|
||||
Jira: https://issues.redhat.com/browse/RHEL-82906
|
||||
Jira: https://issues.redhat.com/browse/RHEL-83015
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
---
|
||||
block/mirror.c | 27 ++++++++++++++++++++++----
|
||||
blockdev.c | 18 ++++++++++-------
|
||||
include/block/block_int-global-state.h | 3 ++-
|
||||
qapi/block-core.json | 8 +++++++-
|
||||
tests/unit/test-block-iothread.c | 2 +-
|
||||
5 files changed, 44 insertions(+), 14 deletions(-)
|
||||
|
||||
diff --git a/block/mirror.c b/block/mirror.c
|
||||
index c8bbaa0b35..bba3e3b05c 100644
|
||||
--- a/block/mirror.c
|
||||
+++ b/block/mirror.c
|
||||
@@ -55,6 +55,8 @@ typedef struct MirrorBlockJob {
|
||||
BlockMirrorBackingMode backing_mode;
|
||||
/* Whether the target image requires explicit zero-initialization */
|
||||
bool zero_target;
|
||||
+ /* Whether the target should be assumed to be already zero initialized */
|
||||
+ bool target_is_zero;
|
||||
/*
|
||||
* To be accesssed with atomics. Written only under the BQL (required by the
|
||||
* current implementation of mirror_change()).
|
||||
@@ -844,12 +846,26 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
|
||||
BlockDriverState *target_bs = blk_bs(s->target);
|
||||
int ret = -EIO;
|
||||
int64_t count;
|
||||
+ bool punch_holes =
|
||||
+ target_bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP &&
|
||||
+ bdrv_can_write_zeroes_with_unmap(target_bs);
|
||||
|
||||
bdrv_graph_co_rdlock();
|
||||
bs = s->mirror_top_bs->backing->bs;
|
||||
bdrv_graph_co_rdunlock();
|
||||
|
||||
- if (s->zero_target) {
|
||||
+ if (s->zero_target && (!s->target_is_zero || punch_holes)) {
|
||||
+ /*
|
||||
+ * Here, we are in FULL mode; our goal is to avoid writing
|
||||
+ * zeroes if the destination already reads as zero, except
|
||||
+ * when we are trying to punch holes. This is possible if
|
||||
+ * zeroing happened externally (s->target_is_zero) or if we
|
||||
+ * have a fast way to pre-zero the image (the dirty bitmap
|
||||
+ * will be populated later by the non-zero portions, the same
|
||||
+ * as for TOP mode). If pre-zeroing is not fast, or we need
|
||||
+ * to punch holes, then our only recourse is to write the
|
||||
+ * entire image.
|
||||
+ */
|
||||
if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
|
||||
bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, s->bdev_length);
|
||||
return 0;
|
||||
@@ -1714,7 +1730,7 @@ static BlockJob *mirror_start_job(
|
||||
uint32_t granularity, int64_t buf_size,
|
||||
MirrorSyncMode sync_mode,
|
||||
BlockMirrorBackingMode backing_mode,
|
||||
- bool zero_target,
|
||||
+ bool zero_target, bool target_is_zero,
|
||||
BlockdevOnError on_source_error,
|
||||
BlockdevOnError on_target_error,
|
||||
bool unmap,
|
||||
@@ -1883,6 +1899,7 @@ static BlockJob *mirror_start_job(
|
||||
s->sync_mode = sync_mode;
|
||||
s->backing_mode = backing_mode;
|
||||
s->zero_target = zero_target;
|
||||
+ s->target_is_zero = target_is_zero;
|
||||
qatomic_set(&s->copy_mode, copy_mode);
|
||||
s->base = base;
|
||||
s->base_overlay = bdrv_find_overlay(bs, base);
|
||||
@@ -2011,7 +2028,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
|
||||
int creation_flags, int64_t speed,
|
||||
uint32_t granularity, int64_t buf_size,
|
||||
MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
|
||||
- bool zero_target,
|
||||
+ bool zero_target, bool target_is_zero,
|
||||
BlockdevOnError on_source_error,
|
||||
BlockdevOnError on_target_error,
|
||||
bool unmap, const char *filter_node_name,
|
||||
@@ -2034,7 +2051,8 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
|
||||
|
||||
mirror_start_job(job_id, bs, creation_flags, target, replaces,
|
||||
speed, granularity, buf_size, mode, backing_mode,
|
||||
- zero_target, on_source_error, on_target_error, unmap,
|
||||
+ zero_target,
|
||||
+ target_is_zero, on_source_error, on_target_error, unmap,
|
||||
NULL, NULL, &mirror_job_driver, base, false,
|
||||
filter_node_name, true, copy_mode, false, errp);
|
||||
}
|
||||
@@ -2062,6 +2080,7 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
|
||||
job = mirror_start_job(
|
||||
job_id, bs, creation_flags, base, NULL, speed, 0, 0,
|
||||
MIRROR_SYNC_MODE_TOP, MIRROR_LEAVE_BACKING_CHAIN, false,
|
||||
+ false,
|
||||
on_error, on_error, true, cb, opaque,
|
||||
&commit_active_job_driver, base, auto_complete,
|
||||
filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND,
|
||||
diff --git a/blockdev.c b/blockdev.c
|
||||
index 70046b6690..db11a99312 100644
|
||||
--- a/blockdev.c
|
||||
+++ b/blockdev.c
|
||||
@@ -2795,7 +2795,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
|
||||
const char *replaces,
|
||||
enum MirrorSyncMode sync,
|
||||
BlockMirrorBackingMode backing_mode,
|
||||
- bool zero_target,
|
||||
+ bool zero_target, bool target_is_zero,
|
||||
bool has_speed, int64_t speed,
|
||||
bool has_granularity, uint32_t granularity,
|
||||
bool has_buf_size, int64_t buf_size,
|
||||
@@ -2906,11 +2906,10 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
|
||||
/* pass the node name to replace to mirror start since it's loose coupling
|
||||
* and will allow to check whether the node still exist at mirror completion
|
||||
*/
|
||||
- mirror_start(job_id, bs, target,
|
||||
- replaces, job_flags,
|
||||
+ mirror_start(job_id, bs, target, replaces, job_flags,
|
||||
speed, granularity, buf_size, sync, backing_mode, zero_target,
|
||||
- on_source_error, on_target_error, unmap, filter_node_name,
|
||||
- copy_mode, errp);
|
||||
+ target_is_zero, on_source_error, on_target_error, unmap,
|
||||
+ filter_node_name, copy_mode, errp);
|
||||
}
|
||||
|
||||
void qmp_drive_mirror(DriveMirror *arg, Error **errp)
|
||||
@@ -2925,6 +2924,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
|
||||
int64_t size;
|
||||
const char *format = arg->format;
|
||||
bool zero_target;
|
||||
+ bool target_is_zero;
|
||||
int ret;
|
||||
|
||||
bs = qmp_get_root_bs(arg->device, errp);
|
||||
@@ -3041,6 +3041,8 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
|
||||
zero_target = (arg->sync == MIRROR_SYNC_MODE_FULL &&
|
||||
(arg->mode == NEW_IMAGE_MODE_EXISTING ||
|
||||
!bdrv_has_zero_init(target_bs)));
|
||||
+ target_is_zero = (arg->mode != NEW_IMAGE_MODE_EXISTING &&
|
||||
+ bdrv_has_zero_init(target_bs));
|
||||
bdrv_graph_rdunlock_main_loop();
|
||||
|
||||
|
||||
@@ -3052,7 +3054,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
|
||||
|
||||
blockdev_mirror_common(arg->job_id, bs, target_bs,
|
||||
arg->replaces, arg->sync,
|
||||
- backing_mode, zero_target,
|
||||
+ backing_mode, zero_target, target_is_zero,
|
||||
arg->has_speed, arg->speed,
|
||||
arg->has_granularity, arg->granularity,
|
||||
arg->has_buf_size, arg->buf_size,
|
||||
@@ -3082,6 +3084,7 @@ void qmp_blockdev_mirror(const char *job_id,
|
||||
bool has_copy_mode, MirrorCopyMode copy_mode,
|
||||
bool has_auto_finalize, bool auto_finalize,
|
||||
bool has_auto_dismiss, bool auto_dismiss,
|
||||
+ bool has_target_is_zero, bool target_is_zero,
|
||||
Error **errp)
|
||||
{
|
||||
BlockDriverState *bs;
|
||||
@@ -3112,7 +3115,8 @@ void qmp_blockdev_mirror(const char *job_id,
|
||||
|
||||
blockdev_mirror_common(job_id, bs, target_bs,
|
||||
replaces, sync, backing_mode,
|
||||
- zero_target, has_speed, speed,
|
||||
+ zero_target, has_target_is_zero && target_is_zero,
|
||||
+ has_speed, speed,
|
||||
has_granularity, granularity,
|
||||
has_buf_size, buf_size,
|
||||
has_on_source_error, on_source_error,
|
||||
diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h
|
||||
index eb2d92a226..8cf0003ce7 100644
|
||||
--- a/include/block/block_int-global-state.h
|
||||
+++ b/include/block/block_int-global-state.h
|
||||
@@ -140,6 +140,7 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
|
||||
* @mode: Whether to collapse all images in the chain to the target.
|
||||
* @backing_mode: How to establish the target's backing chain after completion.
|
||||
* @zero_target: Whether the target should be explicitly zero-initialized
|
||||
+ * @target_is_zero: Whether the target already is zero-initialized.
|
||||
* @on_source_error: The action to take upon error reading from the source.
|
||||
* @on_target_error: The action to take upon error writing to the target.
|
||||
* @unmap: Whether to unmap target where source sectors only contain zeroes.
|
||||
@@ -159,7 +160,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
|
||||
int creation_flags, int64_t speed,
|
||||
uint32_t granularity, int64_t buf_size,
|
||||
MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
|
||||
- bool zero_target,
|
||||
+ bool zero_target, bool target_is_zero,
|
||||
BlockdevOnError on_source_error,
|
||||
BlockdevOnError on_target_error,
|
||||
bool unmap, const char *filter_node_name,
|
||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
||||
index c1af3d1f7d..3969c60b93 100644
|
||||
--- a/qapi/block-core.json
|
||||
+++ b/qapi/block-core.json
|
||||
@@ -2535,6 +2535,11 @@
|
||||
# disappear from the query list without user intervention.
|
||||
# Defaults to true. (Since 3.1)
|
||||
#
|
||||
+# @target-is-zero: Assume the destination reads as all zeroes before
|
||||
+# the mirror started. Setting this to true can speed up the
|
||||
+# mirror. Setting this to true when the destination is not
|
||||
+# actually all zero can corrupt the destination. (Since 10.1)
|
||||
+#
|
||||
# Since: 2.6
|
||||
#
|
||||
# .. qmp-example::
|
||||
@@ -2554,7 +2559,8 @@
|
||||
'*on-target-error': 'BlockdevOnError',
|
||||
'*filter-node-name': 'str',
|
||||
'*copy-mode': 'MirrorCopyMode',
|
||||
- '*auto-finalize': 'bool', '*auto-dismiss': 'bool' },
|
||||
+ '*auto-finalize': 'bool', '*auto-dismiss': 'bool',
|
||||
+ '*target-is-zero': 'bool'},
|
||||
'allow-preconfig': true }
|
||||
|
||||
##
|
||||
diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c
|
||||
index 373b72fdd8..033711d8d7 100644
|
||||
--- a/tests/unit/test-block-iothread.c
|
||||
+++ b/tests/unit/test-block-iothread.c
|
||||
@@ -755,7 +755,7 @@ static void test_propagate_mirror(void)
|
||||
|
||||
/* Start a mirror job */
|
||||
mirror_start("job0", src, target, NULL, JOB_DEFAULT, 0, 0, 0,
|
||||
- MIRROR_SYNC_MODE_NONE, MIRROR_OPEN_BACKING_CHAIN, false,
|
||||
+ MIRROR_SYNC_MODE_NONE, MIRROR_OPEN_BACKING_CHAIN, false, false,
|
||||
BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
|
||||
false, "filter_node", MIRROR_COPY_MODE_BACKGROUND,
|
||||
&error_abort);
|
||||
--
|
||||
2.48.1
|
||||
|
||||
241
SOURCES/kvm-mirror-Drop-redundant-zero_target-parameter.patch
Normal file
241
SOURCES/kvm-mirror-Drop-redundant-zero_target-parameter.patch
Normal file
@ -0,0 +1,241 @@
|
||||
From 5040f835f07f3355ae80b3da2ae83ce35de022e0 Mon Sep 17 00:00:00 2001
|
||||
From: Eric Blake <eblake@redhat.com>
|
||||
Date: Fri, 9 May 2025 15:40:26 -0500
|
||||
Subject: [PATCH 11/16] mirror: Drop redundant zero_target parameter
|
||||
|
||||
RH-Author: Eric Blake <eblake@redhat.com>
|
||||
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
|
||||
RH-Jira: RHEL-82906 RHEL-83015
|
||||
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [9/14] b84a938c69e3761211b9fee4c59b465d55f61855 (ebblake/centos-qemu-kvm)
|
||||
|
||||
The two callers to a mirror job (drive-mirror and blockdev-mirror) set
|
||||
zero_target precisely when sync mode == FULL, with the one exception
|
||||
that drive-mirror skips zeroing the target if it was newly created and
|
||||
reads as zero. But given the previous patch, that exception is
|
||||
equally captured by target_is_zero.
|
||||
|
||||
Meanwhile, there is another slight wrinkle, fortunately caught by
|
||||
iotest 185: if the caller uses "sync":"top" but the source has no
|
||||
backing file, the code in blockdev.c was changing sync to be FULL, but
|
||||
only after it had set zero_target=false. In mirror.c, prior to recent
|
||||
patches, this didn't matter: the only places that inspected sync were
|
||||
setting is_none_mode (both TOP and FULL had set that to false), and
|
||||
mirror_start() setting base = mode == MIRROR_SYNC_MODE_TOP ?
|
||||
bdrv_backing_chain_next(bs) : NULL. But now that we are passing sync
|
||||
around, the slammed sync mode would result in a new pre-zeroing pass
|
||||
even when the user had passed "sync":"top" in an effort to skip
|
||||
pre-zeroing. Fortunately, the assignment of base when bs has no
|
||||
backing chain still works out to NULL if we don't slam things. So
|
||||
with the forced change of sync ripped out of blockdev.c, the sync mode
|
||||
is passed through the full callstack unmolested, and we can now
|
||||
reliably reconstruct the same settings as what used to be passed in by
|
||||
zero_target=false, without the redundant parameter.
|
||||
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
Message-ID: <20250509204341.3553601-24-eblake@redhat.com>
|
||||
Reviewed-by: Sunny Zhu <sunnyzhyy@qq.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
[eblake: Fix regression in iotest 185]
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
(cherry picked from commit 253b43a29077de9266351e120c600a73b82e9c49)
|
||||
Jira: https://issues.redhat.com/browse/RHEL-82906
|
||||
Jira: https://issues.redhat.com/browse/RHEL-83015
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
---
|
||||
block/mirror.c | 13 +++++--------
|
||||
blockdev.c | 19 ++++---------------
|
||||
include/block/block_int-global-state.h | 3 +--
|
||||
tests/unit/test-block-iothread.c | 2 +-
|
||||
4 files changed, 11 insertions(+), 26 deletions(-)
|
||||
|
||||
diff --git a/block/mirror.c b/block/mirror.c
|
||||
index bba3e3b05c..b35d12adaa 100644
|
||||
--- a/block/mirror.c
|
||||
+++ b/block/mirror.c
|
||||
@@ -53,8 +53,6 @@ typedef struct MirrorBlockJob {
|
||||
Error *replace_blocker;
|
||||
MirrorSyncMode sync_mode;
|
||||
BlockMirrorBackingMode backing_mode;
|
||||
- /* Whether the target image requires explicit zero-initialization */
|
||||
- bool zero_target;
|
||||
/* Whether the target should be assumed to be already zero initialized */
|
||||
bool target_is_zero;
|
||||
/*
|
||||
@@ -854,7 +852,9 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
|
||||
bs = s->mirror_top_bs->backing->bs;
|
||||
bdrv_graph_co_rdunlock();
|
||||
|
||||
- if (s->zero_target && (!s->target_is_zero || punch_holes)) {
|
||||
+ if (s->sync_mode == MIRROR_SYNC_MODE_TOP) {
|
||||
+ /* In TOP mode, there is no benefit to a pre-zeroing pass. */
|
||||
+ } else if (!s->target_is_zero || punch_holes) {
|
||||
/*
|
||||
* Here, we are in FULL mode; our goal is to avoid writing
|
||||
* zeroes if the destination already reads as zero, except
|
||||
@@ -1730,7 +1730,7 @@ static BlockJob *mirror_start_job(
|
||||
uint32_t granularity, int64_t buf_size,
|
||||
MirrorSyncMode sync_mode,
|
||||
BlockMirrorBackingMode backing_mode,
|
||||
- bool zero_target, bool target_is_zero,
|
||||
+ bool target_is_zero,
|
||||
BlockdevOnError on_source_error,
|
||||
BlockdevOnError on_target_error,
|
||||
bool unmap,
|
||||
@@ -1898,7 +1898,6 @@ static BlockJob *mirror_start_job(
|
||||
s->on_target_error = on_target_error;
|
||||
s->sync_mode = sync_mode;
|
||||
s->backing_mode = backing_mode;
|
||||
- s->zero_target = zero_target;
|
||||
s->target_is_zero = target_is_zero;
|
||||
qatomic_set(&s->copy_mode, copy_mode);
|
||||
s->base = base;
|
||||
@@ -2028,7 +2027,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
|
||||
int creation_flags, int64_t speed,
|
||||
uint32_t granularity, int64_t buf_size,
|
||||
MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
|
||||
- bool zero_target, bool target_is_zero,
|
||||
+ bool target_is_zero,
|
||||
BlockdevOnError on_source_error,
|
||||
BlockdevOnError on_target_error,
|
||||
bool unmap, const char *filter_node_name,
|
||||
@@ -2051,7 +2050,6 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
|
||||
|
||||
mirror_start_job(job_id, bs, creation_flags, target, replaces,
|
||||
speed, granularity, buf_size, mode, backing_mode,
|
||||
- zero_target,
|
||||
target_is_zero, on_source_error, on_target_error, unmap,
|
||||
NULL, NULL, &mirror_job_driver, base, false,
|
||||
filter_node_name, true, copy_mode, false, errp);
|
||||
@@ -2080,7 +2078,6 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
|
||||
job = mirror_start_job(
|
||||
job_id, bs, creation_flags, base, NULL, speed, 0, 0,
|
||||
MIRROR_SYNC_MODE_TOP, MIRROR_LEAVE_BACKING_CHAIN, false,
|
||||
- false,
|
||||
on_error, on_error, true, cb, opaque,
|
||||
&commit_active_job_driver, base, auto_complete,
|
||||
filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND,
|
||||
diff --git a/blockdev.c b/blockdev.c
|
||||
index db11a99312..04fa759e30 100644
|
||||
--- a/blockdev.c
|
||||
+++ b/blockdev.c
|
||||
@@ -2795,7 +2795,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
|
||||
const char *replaces,
|
||||
enum MirrorSyncMode sync,
|
||||
BlockMirrorBackingMode backing_mode,
|
||||
- bool zero_target, bool target_is_zero,
|
||||
+ bool target_is_zero,
|
||||
bool has_speed, int64_t speed,
|
||||
bool has_granularity, uint32_t granularity,
|
||||
bool has_buf_size, int64_t buf_size,
|
||||
@@ -2862,10 +2862,6 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
|
||||
return;
|
||||
}
|
||||
|
||||
- if (!bdrv_backing_chain_next(bs) && sync == MIRROR_SYNC_MODE_TOP) {
|
||||
- sync = MIRROR_SYNC_MODE_FULL;
|
||||
- }
|
||||
-
|
||||
if (!replaces) {
|
||||
/* We want to mirror from @bs, but keep implicit filters on top */
|
||||
unfiltered_bs = bdrv_skip_implicit_filters(bs);
|
||||
@@ -2907,7 +2903,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
|
||||
* and will allow to check whether the node still exist at mirror completion
|
||||
*/
|
||||
mirror_start(job_id, bs, target, replaces, job_flags,
|
||||
- speed, granularity, buf_size, sync, backing_mode, zero_target,
|
||||
+ speed, granularity, buf_size, sync, backing_mode,
|
||||
target_is_zero, on_source_error, on_target_error, unmap,
|
||||
filter_node_name, copy_mode, errp);
|
||||
}
|
||||
@@ -2923,7 +2919,6 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
|
||||
int flags;
|
||||
int64_t size;
|
||||
const char *format = arg->format;
|
||||
- bool zero_target;
|
||||
bool target_is_zero;
|
||||
int ret;
|
||||
|
||||
@@ -3038,9 +3033,6 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
|
||||
}
|
||||
|
||||
bdrv_graph_rdlock_main_loop();
|
||||
- zero_target = (arg->sync == MIRROR_SYNC_MODE_FULL &&
|
||||
- (arg->mode == NEW_IMAGE_MODE_EXISTING ||
|
||||
- !bdrv_has_zero_init(target_bs)));
|
||||
target_is_zero = (arg->mode != NEW_IMAGE_MODE_EXISTING &&
|
||||
bdrv_has_zero_init(target_bs));
|
||||
bdrv_graph_rdunlock_main_loop();
|
||||
@@ -3054,7 +3046,7 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
|
||||
|
||||
blockdev_mirror_common(arg->job_id, bs, target_bs,
|
||||
arg->replaces, arg->sync,
|
||||
- backing_mode, zero_target, target_is_zero,
|
||||
+ backing_mode, target_is_zero,
|
||||
arg->has_speed, arg->speed,
|
||||
arg->has_granularity, arg->granularity,
|
||||
arg->has_buf_size, arg->buf_size,
|
||||
@@ -3091,7 +3083,6 @@ void qmp_blockdev_mirror(const char *job_id,
|
||||
BlockDriverState *target_bs;
|
||||
AioContext *aio_context;
|
||||
BlockMirrorBackingMode backing_mode = MIRROR_LEAVE_BACKING_CHAIN;
|
||||
- bool zero_target;
|
||||
int ret;
|
||||
|
||||
bs = qmp_get_root_bs(device, errp);
|
||||
@@ -3104,8 +3095,6 @@ void qmp_blockdev_mirror(const char *job_id,
|
||||
return;
|
||||
}
|
||||
|
||||
- zero_target = (sync == MIRROR_SYNC_MODE_FULL);
|
||||
-
|
||||
aio_context = bdrv_get_aio_context(bs);
|
||||
|
||||
ret = bdrv_try_change_aio_context(target_bs, aio_context, NULL, errp);
|
||||
@@ -3115,7 +3104,7 @@ void qmp_blockdev_mirror(const char *job_id,
|
||||
|
||||
blockdev_mirror_common(job_id, bs, target_bs,
|
||||
replaces, sync, backing_mode,
|
||||
- zero_target, has_target_is_zero && target_is_zero,
|
||||
+ has_target_is_zero && target_is_zero,
|
||||
has_speed, speed,
|
||||
has_granularity, granularity,
|
||||
has_buf_size, buf_size,
|
||||
diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h
|
||||
index 8cf0003ce7..d21bd7fd2f 100644
|
||||
--- a/include/block/block_int-global-state.h
|
||||
+++ b/include/block/block_int-global-state.h
|
||||
@@ -139,7 +139,6 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
|
||||
* @buf_size: The amount of data that can be in flight at one time.
|
||||
* @mode: Whether to collapse all images in the chain to the target.
|
||||
* @backing_mode: How to establish the target's backing chain after completion.
|
||||
- * @zero_target: Whether the target should be explicitly zero-initialized
|
||||
* @target_is_zero: Whether the target already is zero-initialized.
|
||||
* @on_source_error: The action to take upon error reading from the source.
|
||||
* @on_target_error: The action to take upon error writing to the target.
|
||||
@@ -160,7 +159,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
|
||||
int creation_flags, int64_t speed,
|
||||
uint32_t granularity, int64_t buf_size,
|
||||
MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
|
||||
- bool zero_target, bool target_is_zero,
|
||||
+ bool target_is_zero,
|
||||
BlockdevOnError on_source_error,
|
||||
BlockdevOnError on_target_error,
|
||||
bool unmap, const char *filter_node_name,
|
||||
diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c
|
||||
index 033711d8d7..373b72fdd8 100644
|
||||
--- a/tests/unit/test-block-iothread.c
|
||||
+++ b/tests/unit/test-block-iothread.c
|
||||
@@ -755,7 +755,7 @@ static void test_propagate_mirror(void)
|
||||
|
||||
/* Start a mirror job */
|
||||
mirror_start("job0", src, target, NULL, JOB_DEFAULT, 0, 0, 0,
|
||||
- MIRROR_SYNC_MODE_NONE, MIRROR_OPEN_BACKING_CHAIN, false, false,
|
||||
+ MIRROR_SYNC_MODE_NONE, MIRROR_OPEN_BACKING_CHAIN, false,
|
||||
BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
|
||||
false, "filter_node", MIRROR_COPY_MODE_BACKGROUND,
|
||||
&error_abort);
|
||||
--
|
||||
2.48.1
|
||||
|
||||
92
SOURCES/kvm-mirror-Minor-refactoring.patch
Normal file
92
SOURCES/kvm-mirror-Minor-refactoring.patch
Normal file
@ -0,0 +1,92 @@
|
||||
From 0102da22fe5aefde9d398d539fc290ab062346f1 Mon Sep 17 00:00:00 2001
|
||||
From: Eric Blake <eblake@redhat.com>
|
||||
Date: Fri, 9 May 2025 15:40:23 -0500
|
||||
Subject: [PATCH 08/16] mirror: Minor refactoring
|
||||
|
||||
RH-Author: Eric Blake <eblake@redhat.com>
|
||||
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
|
||||
RH-Jira: RHEL-82906 RHEL-83015
|
||||
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [6/14] 886fa2e3249f48f89d3e04ba619d370031851d89 (ebblake/centos-qemu-kvm)
|
||||
|
||||
Commit 5791ba52 (v9.2) pre-initialized ret in mirror_dirty_init to
|
||||
silence a false positive compiler warning, even though in all code
|
||||
paths where ret is used, it was guaranteed to be reassigned
|
||||
beforehand. But since the function returns -errno, and -1 is not
|
||||
always the right errno, it's better to initialize to -EIO.
|
||||
|
||||
An upcoming patch wants to track two bitmaps in
|
||||
do_sync_target_write(); this will be easier if the current variables
|
||||
related to the dirty bitmap are renamed.
|
||||
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Message-ID: <20250509204341.3553601-21-eblake@redhat.com>
|
||||
(cherry picked from commit 870f8963cf1a84f8ec929b05a6d68906974a76c5)
|
||||
Conflicts:
|
||||
block/mirror.c - commit 5791ba52 not present
|
||||
Jira: https://issues.redhat.com/browse/RHEL-82906
|
||||
Jira: https://issues.redhat.com/browse/RHEL-83015
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
---
|
||||
block/mirror.c | 22 +++++++++++-----------
|
||||
1 file changed, 11 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/block/mirror.c b/block/mirror.c
|
||||
index 61f0a717b7..22f8bd98c4 100644
|
||||
--- a/block/mirror.c
|
||||
+++ b/block/mirror.c
|
||||
@@ -841,7 +841,7 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
|
||||
int64_t offset;
|
||||
BlockDriverState *bs;
|
||||
BlockDriverState *target_bs = blk_bs(s->target);
|
||||
- int ret;
|
||||
+ int ret = -EIO;
|
||||
int64_t count;
|
||||
|
||||
bdrv_graph_co_rdlock();
|
||||
@@ -1341,7 +1341,7 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
|
||||
{
|
||||
int ret;
|
||||
size_t qiov_offset = 0;
|
||||
- int64_t bitmap_offset, bitmap_end;
|
||||
+ int64_t dirty_bitmap_offset, dirty_bitmap_end;
|
||||
|
||||
if (!QEMU_IS_ALIGNED(offset, job->granularity) &&
|
||||
bdrv_dirty_bitmap_get(job->dirty_bitmap, offset))
|
||||
@@ -1388,11 +1388,11 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
|
||||
* Tails are either clean or shrunk, so for bitmap resetting
|
||||
* we safely align the range down.
|
||||
*/
|
||||
- bitmap_offset = QEMU_ALIGN_UP(offset, job->granularity);
|
||||
- bitmap_end = QEMU_ALIGN_DOWN(offset + bytes, job->granularity);
|
||||
- if (bitmap_offset < bitmap_end) {
|
||||
- bdrv_reset_dirty_bitmap(job->dirty_bitmap, bitmap_offset,
|
||||
- bitmap_end - bitmap_offset);
|
||||
+ dirty_bitmap_offset = QEMU_ALIGN_UP(offset, job->granularity);
|
||||
+ dirty_bitmap_end = QEMU_ALIGN_DOWN(offset + bytes, job->granularity);
|
||||
+ if (dirty_bitmap_offset < dirty_bitmap_end) {
|
||||
+ bdrv_reset_dirty_bitmap(job->dirty_bitmap, dirty_bitmap_offset,
|
||||
+ dirty_bitmap_end - dirty_bitmap_offset);
|
||||
}
|
||||
|
||||
job_progress_increase_remaining(&job->common.job, bytes);
|
||||
@@ -1430,10 +1430,10 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
|
||||
* at function start, and they must be still dirty, as we've locked
|
||||
* the region for in-flight op.
|
||||
*/
|
||||
- bitmap_offset = QEMU_ALIGN_DOWN(offset, job->granularity);
|
||||
- bitmap_end = QEMU_ALIGN_UP(offset + bytes, job->granularity);
|
||||
- bdrv_set_dirty_bitmap(job->dirty_bitmap, bitmap_offset,
|
||||
- bitmap_end - bitmap_offset);
|
||||
+ dirty_bitmap_offset = QEMU_ALIGN_DOWN(offset, job->granularity);
|
||||
+ dirty_bitmap_end = QEMU_ALIGN_UP(offset + bytes, job->granularity);
|
||||
+ bdrv_set_dirty_bitmap(job->dirty_bitmap, dirty_bitmap_offset,
|
||||
+ dirty_bitmap_end - dirty_bitmap_offset);
|
||||
qatomic_set(&job->actively_synced, false);
|
||||
|
||||
action = mirror_error_action(job, false, -ret);
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,139 @@
|
||||
From 482db3e637a16d5877e523e87c53ddb2579b4b66 Mon Sep 17 00:00:00 2001
|
||||
From: Eric Blake <eblake@redhat.com>
|
||||
Date: Fri, 9 May 2025 15:40:24 -0500
|
||||
Subject: [PATCH 09/16] mirror: Pass full sync mode rather than bool to
|
||||
internals
|
||||
|
||||
RH-Author: Eric Blake <eblake@redhat.com>
|
||||
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
|
||||
RH-Jira: RHEL-82906 RHEL-83015
|
||||
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [7/14] f45a83a14b0eea07517176d44ab0c49db8233ea0 (ebblake/centos-qemu-kvm)
|
||||
|
||||
Out of the five possible values for MirrorSyncMode, INCREMENTAL and
|
||||
BITMAP are already rejected up front in mirror_start, leaving NONE,
|
||||
TOP, and FULL as the remaining values that the code was collapsing
|
||||
into a single bool is_none_mode. Furthermore, mirror_dirty_init() is
|
||||
only reachable for modes TOP and FULL, as further guided by
|
||||
s->zero_target. However, upcoming patches want to further optimize
|
||||
the pre-zeroing pass of a sync=full mirror in mirror_dirty_init(),
|
||||
while avoiding that pass on a sync=top action. Instead of throwing
|
||||
away context by collapsing these two values into
|
||||
s->is_none_mode=false, it is better to pass s->sync_mode throughout
|
||||
the entire operation. For active commit, the desired semantics match
|
||||
sync mode TOP.
|
||||
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
Message-ID: <20250509204341.3553601-22-eblake@redhat.com>
|
||||
Reviewed-by: Sunny Zhu <sunnyzhyy@qq.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 9474d97bd7421b4fe7c806ab0949697514d11e88)
|
||||
Jira: https://issues.redhat.com/browse/RHEL-82906
|
||||
Jira: https://issues.redhat.com/browse/RHEL-83015
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
---
|
||||
block/mirror.c | 24 ++++++++++++------------
|
||||
1 file changed, 12 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/block/mirror.c b/block/mirror.c
|
||||
index 22f8bd98c4..c8bbaa0b35 100644
|
||||
--- a/block/mirror.c
|
||||
+++ b/block/mirror.c
|
||||
@@ -51,7 +51,7 @@ typedef struct MirrorBlockJob {
|
||||
BlockDriverState *to_replace;
|
||||
/* Used to block operations on the drive-mirror-replace target */
|
||||
Error *replace_blocker;
|
||||
- bool is_none_mode;
|
||||
+ MirrorSyncMode sync_mode;
|
||||
BlockMirrorBackingMode backing_mode;
|
||||
/* Whether the target image requires explicit zero-initialization */
|
||||
bool zero_target;
|
||||
@@ -723,9 +723,10 @@ static int mirror_exit_common(Job *job)
|
||||
&error_abort);
|
||||
|
||||
if (!abort && s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
|
||||
- BlockDriverState *backing = s->is_none_mode ? src : s->base;
|
||||
+ BlockDriverState *backing;
|
||||
BlockDriverState *unfiltered_target = bdrv_skip_filters(target_bs);
|
||||
|
||||
+ backing = s->sync_mode == MIRROR_SYNC_MODE_NONE ? src : s->base;
|
||||
if (bdrv_cow_bs(unfiltered_target) != backing) {
|
||||
bdrv_set_backing_hd(unfiltered_target, backing, &local_err);
|
||||
if (local_err) {
|
||||
@@ -1020,7 +1021,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
|
||||
mirror_free_init(s);
|
||||
|
||||
s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
|
||||
- if (!s->is_none_mode) {
|
||||
+ if (s->sync_mode != MIRROR_SYNC_MODE_NONE) {
|
||||
ret = mirror_dirty_init(s);
|
||||
if (ret < 0 || job_is_cancelled(&s->common.job)) {
|
||||
goto immediate_exit;
|
||||
@@ -1711,6 +1712,7 @@ static BlockJob *mirror_start_job(
|
||||
int creation_flags, BlockDriverState *target,
|
||||
const char *replaces, int64_t speed,
|
||||
uint32_t granularity, int64_t buf_size,
|
||||
+ MirrorSyncMode sync_mode,
|
||||
BlockMirrorBackingMode backing_mode,
|
||||
bool zero_target,
|
||||
BlockdevOnError on_source_error,
|
||||
@@ -1719,7 +1721,7 @@ static BlockJob *mirror_start_job(
|
||||
BlockCompletionFunc *cb,
|
||||
void *opaque,
|
||||
const BlockJobDriver *driver,
|
||||
- bool is_none_mode, BlockDriverState *base,
|
||||
+ BlockDriverState *base,
|
||||
bool auto_complete, const char *filter_node_name,
|
||||
bool is_mirror, MirrorCopyMode copy_mode,
|
||||
bool base_ro,
|
||||
@@ -1878,7 +1880,7 @@ static BlockJob *mirror_start_job(
|
||||
s->replaces = g_strdup(replaces);
|
||||
s->on_source_error = on_source_error;
|
||||
s->on_target_error = on_target_error;
|
||||
- s->is_none_mode = is_none_mode;
|
||||
+ s->sync_mode = sync_mode;
|
||||
s->backing_mode = backing_mode;
|
||||
s->zero_target = zero_target;
|
||||
qatomic_set(&s->copy_mode, copy_mode);
|
||||
@@ -2015,7 +2017,6 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
|
||||
bool unmap, const char *filter_node_name,
|
||||
MirrorCopyMode copy_mode, Error **errp)
|
||||
{
|
||||
- bool is_none_mode;
|
||||
BlockDriverState *base;
|
||||
|
||||
GLOBAL_STATE_CODE();
|
||||
@@ -2028,14 +2029,13 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
|
||||
}
|
||||
|
||||
bdrv_graph_rdlock_main_loop();
|
||||
- is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
|
||||
base = mode == MIRROR_SYNC_MODE_TOP ? bdrv_backing_chain_next(bs) : NULL;
|
||||
bdrv_graph_rdunlock_main_loop();
|
||||
|
||||
mirror_start_job(job_id, bs, creation_flags, target, replaces,
|
||||
- speed, granularity, buf_size, backing_mode, zero_target,
|
||||
- on_source_error, on_target_error, unmap, NULL, NULL,
|
||||
- &mirror_job_driver, is_none_mode, base, false,
|
||||
+ speed, granularity, buf_size, mode, backing_mode,
|
||||
+ zero_target, on_source_error, on_target_error, unmap,
|
||||
+ NULL, NULL, &mirror_job_driver, base, false,
|
||||
filter_node_name, true, copy_mode, false, errp);
|
||||
}
|
||||
|
||||
@@ -2061,9 +2061,9 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
|
||||
|
||||
job = mirror_start_job(
|
||||
job_id, bs, creation_flags, base, NULL, speed, 0, 0,
|
||||
- MIRROR_LEAVE_BACKING_CHAIN, false,
|
||||
+ MIRROR_SYNC_MODE_TOP, MIRROR_LEAVE_BACKING_CHAIN, false,
|
||||
on_error, on_error, true, cb, opaque,
|
||||
- &commit_active_job_driver, false, base, auto_complete,
|
||||
+ &commit_active_job_driver, base, auto_complete,
|
||||
filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND,
|
||||
base_read_only, errp);
|
||||
if (!job) {
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,58 @@
|
||||
From be6ce2c91fe949d1c264de974ab4f6c4efc6976e Mon Sep 17 00:00:00 2001
|
||||
From: Eric Blake <eblake@redhat.com>
|
||||
Date: Tue, 13 May 2025 17:00:45 -0500
|
||||
Subject: [PATCH 16/16] mirror: Reduce I/O when destination is
|
||||
detect-zeroes:unmap
|
||||
|
||||
RH-Author: Eric Blake <eblake@redhat.com>
|
||||
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
|
||||
RH-Jira: RHEL-82906 RHEL-83015
|
||||
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [14/14] 66f3de2ba9f977c9bc1c54f67d76b366df132e62 (ebblake/centos-qemu-kvm)
|
||||
|
||||
If we are going to punch holes in the mirror destination even for the
|
||||
portions where the source image is unallocated, it is nicer to treat
|
||||
the entire image as dirty and punch as we go, rather than pre-zeroing
|
||||
the entire image just to re-do I/O to the allocated portions of the
|
||||
image.
|
||||
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
Message-ID: <20250513220142.535200-2-eblake@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 9abfc81246c9cc1845080eec5920779961187c07)
|
||||
Jira: https://issues.redhat.com/browse/RHEL-82906
|
||||
Jira: https://issues.redhat.com/browse/RHEL-83015
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
---
|
||||
block/mirror.c | 13 +++++++++----
|
||||
1 file changed, 9 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/block/mirror.c b/block/mirror.c
|
||||
index 7f3b5477ce..87c19ddf0d 100644
|
||||
--- a/block/mirror.c
|
||||
+++ b/block/mirror.c
|
||||
@@ -920,11 +920,16 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
|
||||
* zeroing happened externally (ret > 0) or if we have a fast
|
||||
* way to pre-zero the image (the dirty bitmap will be
|
||||
* populated later by the non-zero portions, the same as for
|
||||
- * TOP mode). If pre-zeroing is not fast, then our only
|
||||
- * recourse is to mark the entire image dirty. The act of
|
||||
- * pre-zeroing will populate the zero bitmap.
|
||||
+ * TOP mode). If pre-zeroing is not fast, or we need to visit
|
||||
+ * the entire image in order to punch holes even in the
|
||||
+ * non-allocated regions of the source, then just mark the
|
||||
+ * entire image dirty and leave the zero bitmap clear at this
|
||||
+ * point in time. Otherwise, it can be faster to pre-zero the
|
||||
+ * image now, even if we re-write the allocated portions of
|
||||
+ * the disk later, and the pre-zero pass will populate the
|
||||
+ * zero bitmap.
|
||||
*/
|
||||
- if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
|
||||
+ if (!bdrv_can_write_zeroes_with_unmap(target_bs) || punch_holes) {
|
||||
bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, s->bdev_length);
|
||||
return 0;
|
||||
}
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,180 @@
|
||||
From 423ce7727eecae647330287e1264ac0d938fa7f9 Mon Sep 17 00:00:00 2001
|
||||
From: Eric Blake <eblake@redhat.com>
|
||||
Date: Fri, 9 May 2025 15:40:27 -0500
|
||||
Subject: [PATCH 12/16] mirror: Skip pre-zeroing destination if it is already
|
||||
zero
|
||||
|
||||
RH-Author: Eric Blake <eblake@redhat.com>
|
||||
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
|
||||
RH-Jira: RHEL-82906 RHEL-83015
|
||||
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [10/14] e754ae559123099f4aed322f6a4287cf3323f54d (ebblake/centos-qemu-kvm)
|
||||
|
||||
When doing a sync=full mirroring, we can skip pre-zeroing the
|
||||
destination if it already reads as zeroes and we are not also trying
|
||||
to punch holes due to detect-zeroes. With this patch, there are fewer
|
||||
scenarios that have to pass in an explicit target-is-zero, while still
|
||||
resulting in a sparse destination remaining sparse.
|
||||
|
||||
A later patch will then further improve things to skip writing to the
|
||||
destination for parts of the image where the source is zero; but even
|
||||
with just this patch, it is possible to see a difference for any
|
||||
source that does not report itself as fully allocated, coupled with a
|
||||
destination BDS that can quickly report that it already reads as zero.
|
||||
(For a source that reports as fully allocated, such as a file, the
|
||||
rest of mirror_dirty_init() still sets the entire dirty bitmap to
|
||||
true, so even though we avoided the pre-zeroing, we are not yet
|
||||
avoiding all redundant I/O).
|
||||
|
||||
Iotest 194 detects the difference made by this patch: for a file
|
||||
source (where block status reports the entire image as allocated, and
|
||||
therefore we end up writing zeroes everywhere in the destination
|
||||
anyways), the job length remains the same. But for a qcow2 source and
|
||||
a destination that reads as all zeroes, the dirty bitmap changes to
|
||||
just tracking the allocated portions of the source, which results in
|
||||
faster completion and smaller job statistics. For the test to pass
|
||||
with both ./check -file and -qcow2, a new Python filter is needed to
|
||||
mask out the now-varying job amounts (this matches the shell filters
|
||||
_filter_block_job_{offset,len} in common.filter). A later test will
|
||||
also be added which further validates expected sparseness, so it does
|
||||
not matter that 194 is no longer explicitly looking at how many bytes
|
||||
were copied.
|
||||
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
Message-ID: <20250509204341.3553601-25-eblake@redhat.com>
|
||||
Reviewed-by: Sunny Zhu <sunnyzhyy@qq.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 181a63667adf16c35b57e446def3e41c70f1fea6)
|
||||
Jira: https://issues.redhat.com/browse/RHEL-82906
|
||||
Jira: https://issues.redhat.com/browse/RHEL-83015
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
---
|
||||
block/mirror.c | 24 ++++++++++++++++--------
|
||||
tests/qemu-iotests/194 | 6 ++++--
|
||||
tests/qemu-iotests/194.out | 4 ++--
|
||||
tests/qemu-iotests/iotests.py | 12 +++++++++++-
|
||||
4 files changed, 33 insertions(+), 13 deletions(-)
|
||||
|
||||
diff --git a/block/mirror.c b/block/mirror.c
|
||||
index b35d12adaa..29cac1777c 100644
|
||||
--- a/block/mirror.c
|
||||
+++ b/block/mirror.c
|
||||
@@ -848,23 +848,31 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
|
||||
target_bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP &&
|
||||
bdrv_can_write_zeroes_with_unmap(target_bs);
|
||||
|
||||
+ /* Determine if the image is already zero, regardless of sync mode. */
|
||||
bdrv_graph_co_rdlock();
|
||||
bs = s->mirror_top_bs->backing->bs;
|
||||
+ if (s->target_is_zero) {
|
||||
+ ret = 1;
|
||||
+ } else {
|
||||
+ ret = bdrv_co_is_all_zeroes(target_bs);
|
||||
+ }
|
||||
bdrv_graph_co_rdunlock();
|
||||
|
||||
- if (s->sync_mode == MIRROR_SYNC_MODE_TOP) {
|
||||
+ /* Determine if a pre-zeroing pass is necessary. */
|
||||
+ if (ret < 0) {
|
||||
+ return ret;
|
||||
+ } else if (s->sync_mode == MIRROR_SYNC_MODE_TOP) {
|
||||
/* In TOP mode, there is no benefit to a pre-zeroing pass. */
|
||||
- } else if (!s->target_is_zero || punch_holes) {
|
||||
+ } else if (ret == 0 || punch_holes) {
|
||||
/*
|
||||
* Here, we are in FULL mode; our goal is to avoid writing
|
||||
* zeroes if the destination already reads as zero, except
|
||||
* when we are trying to punch holes. This is possible if
|
||||
- * zeroing happened externally (s->target_is_zero) or if we
|
||||
- * have a fast way to pre-zero the image (the dirty bitmap
|
||||
- * will be populated later by the non-zero portions, the same
|
||||
- * as for TOP mode). If pre-zeroing is not fast, or we need
|
||||
- * to punch holes, then our only recourse is to write the
|
||||
- * entire image.
|
||||
+ * zeroing happened externally (ret > 0) or if we have a fast
|
||||
+ * way to pre-zero the image (the dirty bitmap will be
|
||||
+ * populated later by the non-zero portions, the same as for
|
||||
+ * TOP mode). If pre-zeroing is not fast, or we need to punch
|
||||
+ * holes, then our only recourse is to write the entire image.
|
||||
*/
|
||||
if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
|
||||
bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, s->bdev_length);
|
||||
diff --git a/tests/qemu-iotests/194 b/tests/qemu-iotests/194
|
||||
index d0b9c084f5..e114c0b269 100755
|
||||
--- a/tests/qemu-iotests/194
|
||||
+++ b/tests/qemu-iotests/194
|
||||
@@ -62,7 +62,8 @@ with iotests.FilePath('source.img') as source_img_path, \
|
||||
|
||||
iotests.log('Waiting for `drive-mirror` to complete...')
|
||||
iotests.log(source_vm.event_wait('BLOCK_JOB_READY'),
|
||||
- filters=[iotests.filter_qmp_event])
|
||||
+ filters=[iotests.filter_qmp_event,
|
||||
+ iotests.filter_block_job])
|
||||
|
||||
iotests.log('Starting migration...')
|
||||
capabilities = [{'capability': 'events', 'state': True},
|
||||
@@ -88,7 +89,8 @@ with iotests.FilePath('source.img') as source_img_path, \
|
||||
|
||||
while True:
|
||||
event2 = source_vm.event_wait('BLOCK_JOB_COMPLETED')
|
||||
- iotests.log(event2, filters=[iotests.filter_qmp_event])
|
||||
+ iotests.log(event2, filters=[iotests.filter_qmp_event,
|
||||
+ iotests.filter_block_job])
|
||||
if event2['event'] == 'BLOCK_JOB_COMPLETED':
|
||||
iotests.log('Stopping the NBD server on destination...')
|
||||
iotests.log(dest_vm.qmp('nbd-server-stop'))
|
||||
diff --git a/tests/qemu-iotests/194.out b/tests/qemu-iotests/194.out
|
||||
index 376ed1d2e6..84e0fc34be 100644
|
||||
--- a/tests/qemu-iotests/194.out
|
||||
+++ b/tests/qemu-iotests/194.out
|
||||
@@ -7,7 +7,7 @@ Launching NBD server on destination...
|
||||
Starting `drive-mirror` on source...
|
||||
{"return": {}}
|
||||
Waiting for `drive-mirror` to complete...
|
||||
-{"data": {"device": "mirror-job0", "len": 1073741824, "offset": 1073741824, "speed": 0, "type": "mirror"}, "event": "BLOCK_JOB_READY", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
|
||||
+{"data": {"device": "mirror-job0", "len": "LEN", "offset": "OFFSET", "speed": 0, "type": "mirror"}, "event": "BLOCK_JOB_READY", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
|
||||
Starting migration...
|
||||
{"return": {}}
|
||||
{"execute": "migrate-start-postcopy", "arguments": {}}
|
||||
@@ -17,7 +17,7 @@ Starting migration...
|
||||
{"data": {"status": "completed"}, "event": "MIGRATION", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
|
||||
Gracefully ending the `drive-mirror` job on source...
|
||||
{"return": {}}
|
||||
-{"data": {"device": "mirror-job0", "len": 1073741824, "offset": 1073741824, "speed": 0, "type": "mirror"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
|
||||
+{"data": {"device": "mirror-job0", "len": "LEN", "offset": "OFFSET", "speed": 0, "type": "mirror"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
|
||||
Stopping the NBD server on destination...
|
||||
{"return": {}}
|
||||
Wait for migration completion on target...
|
||||
diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
|
||||
index c8cb028c2d..978bef1499 100644
|
||||
--- a/tests/qemu-iotests/iotests.py
|
||||
+++ b/tests/qemu-iotests/iotests.py
|
||||
@@ -601,13 +601,23 @@ def filter_chown(msg):
|
||||
return chown_re.sub("chown UID:GID", msg)
|
||||
|
||||
def filter_qmp_event(event):
|
||||
- '''Filter a QMP event dict'''
|
||||
+ '''Filter the timestamp of a QMP event dict'''
|
||||
event = dict(event)
|
||||
if 'timestamp' in event:
|
||||
event['timestamp']['seconds'] = 'SECS'
|
||||
event['timestamp']['microseconds'] = 'USECS'
|
||||
return event
|
||||
|
||||
+def filter_block_job(event):
|
||||
+ '''Filter the offset and length of a QMP block job event dict'''
|
||||
+ event = dict(event)
|
||||
+ if 'data' in event:
|
||||
+ if 'offset' in event['data']:
|
||||
+ event['data']['offset'] = 'OFFSET'
|
||||
+ if 'len' in event['data']:
|
||||
+ event['data']['len'] = 'LEN'
|
||||
+ return event
|
||||
+
|
||||
def filter_qmp(qmsg, filter_fn):
|
||||
'''Given a string filter, filter a QMP object's values.
|
||||
filter_fn takes a (key, value) pair.'''
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,355 @@
|
||||
From 8a2e660ff3ec7f7506fbd4197d4dc8f53db7859a Mon Sep 17 00:00:00 2001
|
||||
From: Eric Blake <eblake@redhat.com>
|
||||
Date: Fri, 9 May 2025 15:40:28 -0500
|
||||
Subject: [PATCH 13/16] mirror: Skip writing zeroes when target is already zero
|
||||
|
||||
RH-Author: Eric Blake <eblake@redhat.com>
|
||||
RH-MergeRequest: 365: blockdev-mirror: More efficient handling of sparse mirrors
|
||||
RH-Jira: RHEL-82906 RHEL-83015
|
||||
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [11/14] f6bb5e0cecee07af0389aa18c3bddb47d6c5cf54 (ebblake/centos-qemu-kvm)
|
||||
|
||||
When mirroring, the goal is to ensure that the destination reads the
|
||||
same as the source; this goal is met whether the destination is sparse
|
||||
or fully-allocated (except when explicitly punching holes, then merely
|
||||
reading zero is not enough to know if it is sparse, so we still want
|
||||
to punch the hole). Avoiding a redundant write to zero (whether in
|
||||
the background because the zero cluster was marked in the dirty
|
||||
bitmap, or in the foreground because the guest is writing zeroes) when
|
||||
the destination already reads as zero makes mirroring faster, and
|
||||
avoids allocating the destination merely because the source reports as
|
||||
allocated.
|
||||
|
||||
The effect is especially pronounced when the source is a raw file.
|
||||
That's because when the source is a qcow2 file, the dirty bitmap only
|
||||
visits the portions of the source that are allocated, which tend to be
|
||||
non-zero. But when the source is a raw file,
|
||||
bdrv_co_is_allocated_above() reports the entire file as allocated so
|
||||
mirror_dirty_init sets the entire dirty bitmap, and it is only later
|
||||
during mirror_iteration that we change to consulting the more precise
|
||||
bdrv_co_block_status_above() to learn where the source reads as zero.
|
||||
|
||||
Remember that since a mirror operation can write a cluster more than
|
||||
once (every time the guest changes the source, the destination is also
|
||||
changed to keep up), and the guest can change whether a given cluster
|
||||
reads as zero, is discarded, or has non-zero data over the course of
|
||||
the mirror operation, we can't take the shortcut of relying on
|
||||
s->target_is_zero (which is static for the life of the job) in
|
||||
mirror_co_zero() to see if the destination is already zero, because
|
||||
that information may be stale. Any solution we use must be dynamic in
|
||||
the face of the guest writing or discarding a cluster while the mirror
|
||||
has been ongoing.
|
||||
|
||||
We could just teach mirror_co_zero() to do a block_status() probe of
|
||||
the destination, and skip the zeroes if the destination already reads
|
||||
as zero, but we know from past experience that extra block_status()
|
||||
calls are not always cheap (tmpfs, anyone?), especially when they are
|
||||
random access rather than linear. Use of block_status() of the source
|
||||
by the background task in a linear fashion is not our bottleneck (it's
|
||||
a background task, after all); but since mirroring can be done while
|
||||
the source is actively being changed, we don't want a slow
|
||||
block_status() of the destination to occur on the hot path of the
|
||||
guest trying to do random-access writes to the source.
|
||||
|
||||
So this patch takes a slightly different approach: any time we have to
|
||||
track dirty clusters, we can also track which clusters are known to
|
||||
read as zero. For sync=TOP or when we are punching holes from
|
||||
"detect-zeroes":"unmap", the zero bitmap starts out empty, but
|
||||
prevents a second write zero to a cluster that was already zeroed by an
|
||||
earlier pass; for sync=FULL when we are not punching holes, the zero
|
||||
bitmap starts out full if the destination reads as zero during
|
||||
initialization. Either way, I/O to the destination can now avoid
|
||||
redundant write zero to a cluster that already reads as zero, all
|
||||
without having to do a block_status() per write on the destination.
|
||||
|
||||
With this patch, if I create a raw sparse destination file, connect it
|
||||
with QMP 'blockdev-add' while leaving it at the default "discard":
|
||||
"ignore", then run QMP 'blockdev-mirror' with "sync": "full", the
|
||||
destination remains sparse rather than fully allocated. Meanwhile, a
|
||||
destination image that is already fully allocated remains so unless it
|
||||
was opened with "detect-zeroes": "unmap". And any time writing zeroes
|
||||
is skipped, the job counters are not incremented.
|
||||
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
Message-ID: <20250509204341.3553601-26-eblake@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 7e277545b90874171128804e256a538fb0e8dd7e)
|
||||
Jira: https://issues.redhat.com/browse/RHEL-82906
|
||||
Jira: https://issues.redhat.com/browse/RHEL-83015
|
||||
Signed-off-by: Eric Blake <eblake@redhat.com>
|
||||
---
|
||||
block/mirror.c | 107 ++++++++++++++++++++++++++++++++++++++++++-------
|
||||
1 file changed, 93 insertions(+), 14 deletions(-)
|
||||
|
||||
diff --git a/block/mirror.c b/block/mirror.c
|
||||
index 29cac1777c..7f3b5477ce 100644
|
||||
--- a/block/mirror.c
|
||||
+++ b/block/mirror.c
|
||||
@@ -73,6 +73,7 @@ typedef struct MirrorBlockJob {
|
||||
size_t buf_size;
|
||||
int64_t bdev_length;
|
||||
unsigned long *cow_bitmap;
|
||||
+ unsigned long *zero_bitmap;
|
||||
BdrvDirtyBitmap *dirty_bitmap;
|
||||
BdrvDirtyBitmapIter *dbi;
|
||||
uint8_t *buf;
|
||||
@@ -108,9 +109,12 @@ struct MirrorOp {
|
||||
int64_t offset;
|
||||
uint64_t bytes;
|
||||
|
||||
- /* The pointee is set by mirror_co_read(), mirror_co_zero(), and
|
||||
- * mirror_co_discard() before yielding for the first time */
|
||||
+ /*
|
||||
+ * These pointers are set by mirror_co_read(), mirror_co_zero(), and
|
||||
+ * mirror_co_discard() before yielding for the first time
|
||||
+ */
|
||||
int64_t *bytes_handled;
|
||||
+ bool *io_skipped;
|
||||
|
||||
bool is_pseudo_op;
|
||||
bool is_active_write;
|
||||
@@ -408,15 +412,34 @@ static void coroutine_fn mirror_co_read(void *opaque)
|
||||
static void coroutine_fn mirror_co_zero(void *opaque)
|
||||
{
|
||||
MirrorOp *op = opaque;
|
||||
- int ret;
|
||||
+ bool write_needed = true;
|
||||
+ int ret = 0;
|
||||
|
||||
op->s->in_flight++;
|
||||
op->s->bytes_in_flight += op->bytes;
|
||||
*op->bytes_handled = op->bytes;
|
||||
op->is_in_flight = true;
|
||||
|
||||
- ret = blk_co_pwrite_zeroes(op->s->target, op->offset, op->bytes,
|
||||
- op->s->unmap ? BDRV_REQ_MAY_UNMAP : 0);
|
||||
+ if (op->s->zero_bitmap) {
|
||||
+ unsigned long end = DIV_ROUND_UP(op->offset + op->bytes,
|
||||
+ op->s->granularity);
|
||||
+ assert(QEMU_IS_ALIGNED(op->offset, op->s->granularity));
|
||||
+ assert(QEMU_IS_ALIGNED(op->bytes, op->s->granularity) ||
|
||||
+ op->offset + op->bytes == op->s->bdev_length);
|
||||
+ if (find_next_zero_bit(op->s->zero_bitmap, end,
|
||||
+ op->offset / op->s->granularity) == end) {
|
||||
+ write_needed = false;
|
||||
+ *op->io_skipped = true;
|
||||
+ }
|
||||
+ }
|
||||
+ if (write_needed) {
|
||||
+ ret = blk_co_pwrite_zeroes(op->s->target, op->offset, op->bytes,
|
||||
+ op->s->unmap ? BDRV_REQ_MAY_UNMAP : 0);
|
||||
+ }
|
||||
+ if (ret >= 0 && op->s->zero_bitmap) {
|
||||
+ bitmap_set(op->s->zero_bitmap, op->offset / op->s->granularity,
|
||||
+ DIV_ROUND_UP(op->bytes, op->s->granularity));
|
||||
+ }
|
||||
mirror_write_complete(op, ret);
|
||||
}
|
||||
|
||||
@@ -435,29 +458,43 @@ static void coroutine_fn mirror_co_discard(void *opaque)
|
||||
}
|
||||
|
||||
static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset,
|
||||
- unsigned bytes, MirrorMethod mirror_method)
|
||||
+ unsigned bytes, MirrorMethod mirror_method,
|
||||
+ bool *io_skipped)
|
||||
{
|
||||
MirrorOp *op;
|
||||
Coroutine *co;
|
||||
int64_t bytes_handled = -1;
|
||||
|
||||
+ assert(QEMU_IS_ALIGNED(offset, s->granularity));
|
||||
+ assert(QEMU_IS_ALIGNED(bytes, s->granularity) ||
|
||||
+ offset + bytes == s->bdev_length);
|
||||
op = g_new(MirrorOp, 1);
|
||||
*op = (MirrorOp){
|
||||
.s = s,
|
||||
.offset = offset,
|
||||
.bytes = bytes,
|
||||
.bytes_handled = &bytes_handled,
|
||||
+ .io_skipped = io_skipped,
|
||||
};
|
||||
qemu_co_queue_init(&op->waiting_requests);
|
||||
|
||||
switch (mirror_method) {
|
||||
case MIRROR_METHOD_COPY:
|
||||
+ if (s->zero_bitmap) {
|
||||
+ bitmap_clear(s->zero_bitmap, offset / s->granularity,
|
||||
+ DIV_ROUND_UP(bytes, s->granularity));
|
||||
+ }
|
||||
co = qemu_coroutine_create(mirror_co_read, op);
|
||||
break;
|
||||
case MIRROR_METHOD_ZERO:
|
||||
+ /* s->zero_bitmap handled in mirror_co_zero */
|
||||
co = qemu_coroutine_create(mirror_co_zero, op);
|
||||
break;
|
||||
case MIRROR_METHOD_DISCARD:
|
||||
+ if (s->zero_bitmap) {
|
||||
+ bitmap_clear(s->zero_bitmap, offset / s->granularity,
|
||||
+ DIV_ROUND_UP(bytes, s->granularity));
|
||||
+ }
|
||||
co = qemu_coroutine_create(mirror_co_discard, op);
|
||||
break;
|
||||
default:
|
||||
@@ -568,6 +605,7 @@ static void coroutine_fn GRAPH_UNLOCKED mirror_iteration(MirrorBlockJob *s)
|
||||
int ret;
|
||||
int64_t io_bytes;
|
||||
int64_t io_bytes_acct;
|
||||
+ bool io_skipped = false;
|
||||
MirrorMethod mirror_method = MIRROR_METHOD_COPY;
|
||||
|
||||
assert(!(offset % s->granularity));
|
||||
@@ -611,8 +649,10 @@ static void coroutine_fn GRAPH_UNLOCKED mirror_iteration(MirrorBlockJob *s)
|
||||
}
|
||||
|
||||
io_bytes = mirror_clip_bytes(s, offset, io_bytes);
|
||||
- io_bytes = mirror_perform(s, offset, io_bytes, mirror_method);
|
||||
- if (mirror_method != MIRROR_METHOD_COPY && write_zeroes_ok) {
|
||||
+ io_bytes = mirror_perform(s, offset, io_bytes, mirror_method,
|
||||
+ &io_skipped);
|
||||
+ if (io_skipped ||
|
||||
+ (mirror_method != MIRROR_METHOD_COPY && write_zeroes_ok)) {
|
||||
io_bytes_acct = 0;
|
||||
} else {
|
||||
io_bytes_acct = io_bytes;
|
||||
@@ -847,8 +887,10 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
|
||||
bool punch_holes =
|
||||
target_bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP &&
|
||||
bdrv_can_write_zeroes_with_unmap(target_bs);
|
||||
+ int64_t bitmap_length = DIV_ROUND_UP(s->bdev_length, s->granularity);
|
||||
|
||||
/* Determine if the image is already zero, regardless of sync mode. */
|
||||
+ s->zero_bitmap = bitmap_new(bitmap_length);
|
||||
bdrv_graph_co_rdlock();
|
||||
bs = s->mirror_top_bs->backing->bs;
|
||||
if (s->target_is_zero) {
|
||||
@@ -862,7 +904,14 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
|
||||
if (ret < 0) {
|
||||
return ret;
|
||||
} else if (s->sync_mode == MIRROR_SYNC_MODE_TOP) {
|
||||
- /* In TOP mode, there is no benefit to a pre-zeroing pass. */
|
||||
+ /*
|
||||
+ * In TOP mode, there is no benefit to a pre-zeroing pass, but
|
||||
+ * the zero bitmap can be set if the destination already reads
|
||||
+ * as zero and we are not punching holes.
|
||||
+ */
|
||||
+ if (ret > 0 && !punch_holes) {
|
||||
+ bitmap_set(s->zero_bitmap, 0, bitmap_length);
|
||||
+ }
|
||||
} else if (ret == 0 || punch_holes) {
|
||||
/*
|
||||
* Here, we are in FULL mode; our goal is to avoid writing
|
||||
@@ -871,8 +920,9 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
|
||||
* zeroing happened externally (ret > 0) or if we have a fast
|
||||
* way to pre-zero the image (the dirty bitmap will be
|
||||
* populated later by the non-zero portions, the same as for
|
||||
- * TOP mode). If pre-zeroing is not fast, or we need to punch
|
||||
- * holes, then our only recourse is to write the entire image.
|
||||
+ * TOP mode). If pre-zeroing is not fast, then our only
|
||||
+ * recourse is to mark the entire image dirty. The act of
|
||||
+ * pre-zeroing will populate the zero bitmap.
|
||||
*/
|
||||
if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
|
||||
bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, s->bdev_length);
|
||||
@@ -883,6 +933,7 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
|
||||
for (offset = 0; offset < s->bdev_length; ) {
|
||||
int bytes = MIN(s->bdev_length - offset,
|
||||
QEMU_ALIGN_DOWN(INT_MAX, s->granularity));
|
||||
+ bool ignored;
|
||||
|
||||
mirror_throttle(s);
|
||||
|
||||
@@ -898,12 +949,15 @@ static int coroutine_fn GRAPH_UNLOCKED mirror_dirty_init(MirrorBlockJob *s)
|
||||
continue;
|
||||
}
|
||||
|
||||
- mirror_perform(s, offset, bytes, MIRROR_METHOD_ZERO);
|
||||
+ mirror_perform(s, offset, bytes, MIRROR_METHOD_ZERO, &ignored);
|
||||
offset += bytes;
|
||||
}
|
||||
|
||||
mirror_wait_for_all_io(s);
|
||||
s->initial_zeroing_ongoing = false;
|
||||
+ } else {
|
||||
+ /* In FULL mode, and image already reads as zero. */
|
||||
+ bitmap_set(s->zero_bitmap, 0, bitmap_length);
|
||||
}
|
||||
|
||||
/* First part, loop on the sectors and initialize the dirty bitmap. */
|
||||
@@ -1188,6 +1242,7 @@ immediate_exit:
|
||||
assert(s->in_flight == 0);
|
||||
qemu_vfree(s->buf);
|
||||
g_free(s->cow_bitmap);
|
||||
+ g_free(s->zero_bitmap);
|
||||
g_free(s->in_flight_bitmap);
|
||||
bdrv_dirty_iter_free(s->dbi);
|
||||
|
||||
@@ -1367,6 +1422,7 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
|
||||
int ret;
|
||||
size_t qiov_offset = 0;
|
||||
int64_t dirty_bitmap_offset, dirty_bitmap_end;
|
||||
+ int64_t zero_bitmap_offset, zero_bitmap_end;
|
||||
|
||||
if (!QEMU_IS_ALIGNED(offset, job->granularity) &&
|
||||
bdrv_dirty_bitmap_get(job->dirty_bitmap, offset))
|
||||
@@ -1410,8 +1466,9 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
|
||||
}
|
||||
|
||||
/*
|
||||
- * Tails are either clean or shrunk, so for bitmap resetting
|
||||
- * we safely align the range down.
|
||||
+ * Tails are either clean or shrunk, so for dirty bitmap resetting
|
||||
+ * we safely align the range narrower. But for zero bitmap, round
|
||||
+ * range wider for checking or clearing, and narrower for setting.
|
||||
*/
|
||||
dirty_bitmap_offset = QEMU_ALIGN_UP(offset, job->granularity);
|
||||
dirty_bitmap_end = QEMU_ALIGN_DOWN(offset + bytes, job->granularity);
|
||||
@@ -1419,22 +1476,44 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
|
||||
bdrv_reset_dirty_bitmap(job->dirty_bitmap, dirty_bitmap_offset,
|
||||
dirty_bitmap_end - dirty_bitmap_offset);
|
||||
}
|
||||
+ zero_bitmap_offset = offset / job->granularity;
|
||||
+ zero_bitmap_end = DIV_ROUND_UP(offset + bytes, job->granularity);
|
||||
|
||||
job_progress_increase_remaining(&job->common.job, bytes);
|
||||
job->active_write_bytes_in_flight += bytes;
|
||||
|
||||
switch (method) {
|
||||
case MIRROR_METHOD_COPY:
|
||||
+ if (job->zero_bitmap) {
|
||||
+ bitmap_clear(job->zero_bitmap, zero_bitmap_offset,
|
||||
+ zero_bitmap_end - zero_bitmap_offset);
|
||||
+ }
|
||||
ret = blk_co_pwritev_part(job->target, offset, bytes,
|
||||
qiov, qiov_offset, flags);
|
||||
break;
|
||||
|
||||
case MIRROR_METHOD_ZERO:
|
||||
+ if (job->zero_bitmap) {
|
||||
+ if (find_next_zero_bit(job->zero_bitmap, zero_bitmap_end,
|
||||
+ zero_bitmap_offset) == zero_bitmap_end) {
|
||||
+ ret = 0;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
assert(!qiov);
|
||||
ret = blk_co_pwrite_zeroes(job->target, offset, bytes, flags);
|
||||
+ if (job->zero_bitmap && ret >= 0) {
|
||||
+ bitmap_set(job->zero_bitmap, dirty_bitmap_offset / job->granularity,
|
||||
+ (dirty_bitmap_end - dirty_bitmap_offset) /
|
||||
+ job->granularity);
|
||||
+ }
|
||||
break;
|
||||
|
||||
case MIRROR_METHOD_DISCARD:
|
||||
+ if (job->zero_bitmap) {
|
||||
+ bitmap_clear(job->zero_bitmap, zero_bitmap_offset,
|
||||
+ zero_bitmap_end - zero_bitmap_offset);
|
||||
+ }
|
||||
assert(!qiov);
|
||||
ret = blk_co_pdiscard(job->target, offset, bytes);
|
||||
break;
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,49 @@
|
||||
From 4b9a1a9154467fd65ac2a0a26959d3342d8fcd49 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:37:08 +0100
|
||||
Subject: [PATCH 55/57] net/socket: skip automatic zero-init of large array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [29/30] 645ad4d138d1222ea9bd1b2ac3b84d9ff83e2fa2 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'net_socket_send' method has a 68k byte array used for copying
|
||||
data between guest and host. Skip the automatic zero-init of this
|
||||
array to eliminate the performance overhead in the I/O hot path.
|
||||
|
||||
The 'buf1' array will be fully initialized when reading data off
|
||||
the network socket.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
|
||||
Reviewed-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
|
||||
Message-id: 20250610123709.835102-31-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 751b0e79f1e0e7f88fad2fe2f22595ad03d78859)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
net/socket.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/net/socket.c b/net/socket.c
|
||||
index 8e3702e1f3..784dda686f 100644
|
||||
--- a/net/socket.c
|
||||
+++ b/net/socket.c
|
||||
@@ -157,7 +157,7 @@ static void net_socket_send(void *opaque)
|
||||
NetSocketState *s = opaque;
|
||||
int size;
|
||||
int ret;
|
||||
- uint8_t buf1[NET_BUFSIZE];
|
||||
+ QEMU_UNINITIALIZED uint8_t buf1[NET_BUFSIZE];
|
||||
const uint8_t *buf;
|
||||
|
||||
size = recv(s->fd, buf1, sizeof(buf1), 0);
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,49 @@
|
||||
From 94310a4168257297e52058d5d6aea4a2d06630c6 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 13:37:09 +0100
|
||||
Subject: [PATCH 56/57] net/stream: skip automatic zero-init of large array
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-MergeRequest: 382: Solve -ftrivial-auto-var-init performance regression with QEMU_UNINITIALIZED
|
||||
RH-Jira: RHEL-99888
|
||||
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||||
RH-Commit: [30/30] 9dfec5c0e6358e3557bf58d66eee8e4ba6e93621 (stefanha/centos-stream-qemu-kvm)
|
||||
|
||||
The 'net_stream_send' method has a 68k byte array used for copying
|
||||
data between guest and host. Skip the automatic zero-init of this
|
||||
array to eliminate the performance overhead in the I/O hot path.
|
||||
|
||||
The 'buf1' array will be fully initialized when reading data off
|
||||
the network socket.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
|
||||
Reviewed-by: Harsh Prateek Bora <harshpb@linux.ibm.com>
|
||||
Message-id: 20250610123709.835102-32-berrange@redhat.com
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
(cherry picked from commit 837b87c4c5ba9ac7a255133c6642b8d578272a70)
|
||||
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
---
|
||||
net/stream.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/net/stream.c b/net/stream.c
|
||||
index 97e6ec6679..12384ffee5 100644
|
||||
--- a/net/stream.c
|
||||
+++ b/net/stream.c
|
||||
@@ -148,7 +148,7 @@ static gboolean net_stream_send(QIOChannel *ioc,
|
||||
NetStreamState *s = data;
|
||||
int size;
|
||||
int ret;
|
||||
- char buf1[NET_BUFSIZE];
|
||||
+ QEMU_UNINITIALIZED char buf1[NET_BUFSIZE];
|
||||
const char *buf;
|
||||
|
||||
size = qio_channel_read(s->ioc, buf1, sizeof(buf1), NULL);
|
||||
--
|
||||
2.39.3
|
||||
|
||||
@ -0,0 +1,133 @@
|
||||
From b6de1e19ba778547e92997c6cad77d7cf755c78b Mon Sep 17 00:00:00 2001
|
||||
From: Laurent Vivier <lvivier@redhat.com>
|
||||
Date: Mon, 17 Feb 2025 10:25:50 +0100
|
||||
Subject: [PATCH 1/3] net: vhost-user: add QAPI events to report connection
|
||||
state
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Laurent Vivier <lvivier@redhat.com>
|
||||
RH-MergeRequest: 371: net: vhost-user: add QAPI events to report connection state
|
||||
RH-Jira: RHEL-95120
|
||||
RH-Acked-by: Eugenio Pérez <eperezma@redhat.com>
|
||||
RH-Acked-by: Cindy Lu <lulu@redhat.com>
|
||||
RH-Commit: [1/1] c8f65026e3548891fe713a1622438388e285dbf3 (lvivier/qemu-kvm-centos)
|
||||
|
||||
The netdev reports NETDEV_VHOST_USER_CONNECTED event when
|
||||
the chardev is connected, and NETDEV_VHOST_USER_DISCONNECTED
|
||||
when it is disconnected.
|
||||
|
||||
The NETDEV_VHOST_USER_CONNECTED event includes the chardev id.
|
||||
|
||||
This allows a system manager like libvirt to detect when the server
|
||||
fails.
|
||||
|
||||
For instance with passt:
|
||||
|
||||
{ 'execute': 'qmp_capabilities' }
|
||||
{ "return": { } }
|
||||
|
||||
[killing passt here]
|
||||
|
||||
{ "timestamp": { "seconds": 1739538634, "microseconds": 920450 },
|
||||
"event": "NETDEV_VHOST_USER_DISCONNECTED",
|
||||
"data": { "netdev-id": "netdev0" } }
|
||||
|
||||
[automatic reconnection with reconnect-ms]
|
||||
|
||||
{ "timestamp": { "seconds": 1739538638, "microseconds": 354181 },
|
||||
"event": "NETDEV_VHOST_USER_CONNECTED",
|
||||
"data": { "netdev-id": "netdev0", "chardev-id": "chr0" } }
|
||||
|
||||
Tested-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
|
||||
Message-Id: <20250217092550.1172055-1-lvivier@redhat.com>
|
||||
Acked-by: Markus Armbruster <armbru@redhat.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
(cherry picked from commit 02fd9f8aeeb184276b283ae2f404bc3acf1e7b7a)
|
||||
---
|
||||
net/vhost-user.c | 3 +++
|
||||
qapi/net.json | 40 ++++++++++++++++++++++++++++++++++++++++
|
||||
2 files changed, 43 insertions(+)
|
||||
|
||||
diff --git a/net/vhost-user.c b/net/vhost-user.c
|
||||
index 12555518e8..0b235e50c6 100644
|
||||
--- a/net/vhost-user.c
|
||||
+++ b/net/vhost-user.c
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "chardev/char-fe.h"
|
||||
#include "qapi/error.h"
|
||||
#include "qapi/qapi-commands-net.h"
|
||||
+#include "qapi/qapi-events-net.h"
|
||||
#include "qemu/config-file.h"
|
||||
#include "qemu/error-report.h"
|
||||
#include "qemu/option.h"
|
||||
@@ -271,6 +272,7 @@ static void chr_closed_bh(void *opaque)
|
||||
if (err) {
|
||||
error_report_err(err);
|
||||
}
|
||||
+ qapi_event_send_netdev_vhost_user_disconnected(name);
|
||||
}
|
||||
|
||||
static void net_vhost_user_event(void *opaque, QEMUChrEvent event)
|
||||
@@ -300,6 +302,7 @@ static void net_vhost_user_event(void *opaque, QEMUChrEvent event)
|
||||
net_vhost_user_watch, s);
|
||||
qmp_set_link(name, true, &err);
|
||||
s->started = true;
|
||||
+ qapi_event_send_netdev_vhost_user_connected(name, chr->label);
|
||||
break;
|
||||
case CHR_EVENT_CLOSED:
|
||||
/* a close event may happen during a read/write, but vhost
|
||||
diff --git a/qapi/net.json b/qapi/net.json
|
||||
index 87fc0d0b28..7bd1eaa1ba 100644
|
||||
--- a/qapi/net.json
|
||||
+++ b/qapi/net.json
|
||||
@@ -1020,3 +1020,43 @@
|
||||
##
|
||||
{ 'event': 'NETDEV_STREAM_DISCONNECTED',
|
||||
'data': { 'netdev-id': 'str' } }
|
||||
+
|
||||
+##
|
||||
+# @NETDEV_VHOST_USER_CONNECTED:
|
||||
+#
|
||||
+# Emitted when the vhost-user chardev is connected
|
||||
+#
|
||||
+# @netdev-id: QEMU netdev id that is connected
|
||||
+#
|
||||
+# @chardev-id: The character device id used by the QEMU netdev
|
||||
+#
|
||||
+# Since: 10.0
|
||||
+#
|
||||
+# .. qmp-example::
|
||||
+#
|
||||
+# <- { "timestamp": {"seconds": 1739538638, "microseconds": 354181 },
|
||||
+# "event": "NETDEV_VHOST_USER_CONNECTED",
|
||||
+# "data": { "netdev-id": "netdev0", "chardev-id": "chr0" } }
|
||||
+#
|
||||
+##
|
||||
+{ 'event': 'NETDEV_VHOST_USER_CONNECTED',
|
||||
+ 'data': { 'netdev-id': 'str', 'chardev-id': 'str' } }
|
||||
+
|
||||
+##
|
||||
+# @NETDEV_VHOST_USER_DISCONNECTED:
|
||||
+#
|
||||
+# Emitted when the vhost-user chardev is disconnected
|
||||
+#
|
||||
+# @netdev-id: QEMU netdev id that is disconnected
|
||||
+#
|
||||
+# Since: 10.0
|
||||
+#
|
||||
+# .. qmp-example::
|
||||
+#
|
||||
+# <- { "timestamp": { "seconds": 1739538634, "microseconds": 920450 },
|
||||
+# "event": "NETDEV_VHOST_USER_DISCONNECTED",
|
||||
+# "data": { "netdev-id": "netdev0" } }
|
||||
+#
|
||||
+##
|
||||
+{ 'event': 'NETDEV_VHOST_USER_DISCONNECTED',
|
||||
+ 'data': { 'netdev-id': 'str' } }
|
||||
--
|
||||
2.48.1
|
||||
|
||||
153
SOURCES/kvm-pci-Use-PCI-PM-capability-initializer.patch
Normal file
153
SOURCES/kvm-pci-Use-PCI-PM-capability-initializer.patch
Normal file
@ -0,0 +1,153 @@
|
||||
From 978951b390bb7073293c792c4714516ad40cba73 Mon Sep 17 00:00:00 2001
|
||||
From: Alex Williamson <alex.williamson@redhat.com>
|
||||
Date: Tue, 25 Feb 2025 14:52:26 -0700
|
||||
Subject: [PATCH 3/7] pci: Use PCI PM capability initializer
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Eric Auger <eric.auger@redhat.com>
|
||||
RH-MergeRequest: 348: PCI: Implement basic PCI PM capability backing
|
||||
RH-Jira: RHEL-7301
|
||||
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
|
||||
RH-Acked-by: Alex Williamson <None>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [3/6] fd862caa094490a9b8a04b00ad39ba58e0b46a7a (eauger1/centos-qemu-kvm)
|
||||
|
||||
Switch callers directly initializing the PCI PM capability with
|
||||
pci_add_capability() to use pci_pm_init().
|
||||
|
||||
Cc: Dmitry Fleytman <dmitry.fleytman@gmail.com>
|
||||
Cc: Akihiko Odaki <akihiko.odaki@daynix.com>
|
||||
Cc: Jason Wang <jasowang@redhat.com>
|
||||
Cc: Stefan Weil <sw@weilnetz.de>
|
||||
Cc: Sriram Yagnaraman <sriram.yagnaraman@ericsson.com>
|
||||
Cc: Keith Busch <kbusch@kernel.org>
|
||||
Cc: Klaus Jensen <its@irrelevant.dk>
|
||||
Cc: Jesper Devantier <foss@defmacro.it>
|
||||
Cc: Michael S. Tsirkin <mst@redhat.com>
|
||||
Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
|
||||
Cc: Cédric Le Goater <clg@redhat.com>
|
||||
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
|
||||
Reviewed-by: Eric Auger <eric.auger@redhat.com>
|
||||
Reviewed-by: Akihiko Odaki <akihiko.odaki@daynix.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-3-alex.williamson@redhat.com
|
||||
Signed-off-by: Cédric Le Goater <clg@redhat.com>
|
||||
(cherry picked from commit 0681ec253141d838210b3c5e6bc0d2d71f2e111e)
|
||||
Signed-off-by: Eric Auger <eric.auger@redhat.com>
|
||||
---
|
||||
hw/net/e1000e.c | 3 +--
|
||||
hw/net/eepro100.c | 4 +---
|
||||
hw/net/igb.c | 3 +--
|
||||
hw/nvme/ctrl.c | 3 +--
|
||||
hw/pci-bridge/pcie_pci_bridge.c | 2 +-
|
||||
hw/vfio/pci.c | 7 ++++++-
|
||||
hw/virtio/virtio-pci.c | 3 +--
|
||||
7 files changed, 12 insertions(+), 13 deletions(-)
|
||||
|
||||
diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
|
||||
index 843892ce09..9eb93d049d 100644
|
||||
--- a/hw/net/e1000e.c
|
||||
+++ b/hw/net/e1000e.c
|
||||
@@ -372,8 +372,7 @@ static int
|
||||
e1000e_add_pm_capability(PCIDevice *pdev, uint8_t offset, uint16_t pmc)
|
||||
{
|
||||
Error *local_err = NULL;
|
||||
- int ret = pci_add_capability(pdev, PCI_CAP_ID_PM, offset,
|
||||
- PCI_PM_SIZEOF, &local_err);
|
||||
+ int ret = pci_pm_init(pdev, offset, &local_err);
|
||||
|
||||
if (local_err) {
|
||||
error_report_err(local_err);
|
||||
diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c
|
||||
index d9a70c4544..668a410055 100644
|
||||
--- a/hw/net/eepro100.c
|
||||
+++ b/hw/net/eepro100.c
|
||||
@@ -549,9 +549,7 @@ static void e100_pci_reset(EEPRO100State *s, Error **errp)
|
||||
if (info->power_management) {
|
||||
/* Power Management Capabilities */
|
||||
int cfg_offset = 0xdc;
|
||||
- int r = pci_add_capability(&s->dev, PCI_CAP_ID_PM,
|
||||
- cfg_offset, PCI_PM_SIZEOF,
|
||||
- errp);
|
||||
+ int r = pci_pm_init(&s->dev, cfg_offset, errp);
|
||||
if (r < 0) {
|
||||
return;
|
||||
}
|
||||
diff --git a/hw/net/igb.c b/hw/net/igb.c
|
||||
index b92bba402e..a3c22e2391 100644
|
||||
--- a/hw/net/igb.c
|
||||
+++ b/hw/net/igb.c
|
||||
@@ -356,8 +356,7 @@ static int
|
||||
igb_add_pm_capability(PCIDevice *pdev, uint8_t offset, uint16_t pmc)
|
||||
{
|
||||
Error *local_err = NULL;
|
||||
- int ret = pci_add_capability(pdev, PCI_CAP_ID_PM, offset,
|
||||
- PCI_PM_SIZEOF, &local_err);
|
||||
+ int ret = pci_pm_init(pdev, offset, &local_err);
|
||||
|
||||
if (local_err) {
|
||||
error_report_err(local_err);
|
||||
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
|
||||
index 9f277b81d8..d451ee0d00 100644
|
||||
--- a/hw/nvme/ctrl.c
|
||||
+++ b/hw/nvme/ctrl.c
|
||||
@@ -8293,8 +8293,7 @@ static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
|
||||
Error *err = NULL;
|
||||
int ret;
|
||||
|
||||
- ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset,
|
||||
- PCI_PM_SIZEOF, &err);
|
||||
+ ret = pci_pm_init(pci_dev, offset, &err);
|
||||
if (err) {
|
||||
error_report_err(err);
|
||||
return ret;
|
||||
diff --git a/hw/pci-bridge/pcie_pci_bridge.c b/hw/pci-bridge/pcie_pci_bridge.c
|
||||
index 7646ac2397..2f098e3a13 100644
|
||||
--- a/hw/pci-bridge/pcie_pci_bridge.c
|
||||
+++ b/hw/pci-bridge/pcie_pci_bridge.c
|
||||
@@ -52,7 +52,7 @@ static void pcie_pci_bridge_realize(PCIDevice *d, Error **errp)
|
||||
goto cap_error;
|
||||
}
|
||||
|
||||
- pos = pci_add_capability(d, PCI_CAP_ID_PM, 0, PCI_PM_SIZEOF, errp);
|
||||
+ pos = pci_pm_init(d, 0, errp);
|
||||
if (pos < 0) {
|
||||
goto pm_error;
|
||||
}
|
||||
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
|
||||
index 82a47edc89..e18b57d864 100644
|
||||
--- a/hw/vfio/pci.c
|
||||
+++ b/hw/vfio/pci.c
|
||||
@@ -2220,7 +2220,12 @@ static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
|
||||
case PCI_CAP_ID_PM:
|
||||
vfio_check_pm_reset(vdev, pos);
|
||||
vdev->pm_cap = pos;
|
||||
- ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0;
|
||||
+ ret = pci_pm_init(pdev, pos, errp) >= 0;
|
||||
+ /*
|
||||
+ * PCI-core config space emulation needs write access to the power
|
||||
+ * state enabled for tracking BAR mapping relative to PM state.
|
||||
+ */
|
||||
+ pci_set_word(pdev->wmask + pos + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK);
|
||||
break;
|
||||
case PCI_CAP_ID_AF:
|
||||
vfio_check_af_flr(vdev, pos);
|
||||
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
|
||||
index 524b63e5c7..4b2aeaad8d 100644
|
||||
--- a/hw/virtio/virtio-pci.c
|
||||
+++ b/hw/virtio/virtio-pci.c
|
||||
@@ -2195,8 +2195,7 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
|
||||
pos = pcie_endpoint_cap_init(pci_dev, 0);
|
||||
assert(pos > 0);
|
||||
|
||||
- pos = pci_add_capability(pci_dev, PCI_CAP_ID_PM, 0,
|
||||
- PCI_PM_SIZEOF, errp);
|
||||
+ pos = pci_pm_init(pci_dev, 0, errp);
|
||||
if (pos < 0) {
|
||||
return;
|
||||
}
|
||||
--
|
||||
2.48.1
|
||||
|
||||
99
SOURCES/kvm-pcie-virtio-Remove-redundant-pm_cap.patch
Normal file
99
SOURCES/kvm-pcie-virtio-Remove-redundant-pm_cap.patch
Normal file
@ -0,0 +1,99 @@
|
||||
From 274e81bcf091c981d1e27e49fbe98e63d5308472 Mon Sep 17 00:00:00 2001
|
||||
From: Alex Williamson <alex.williamson@redhat.com>
|
||||
Date: Tue, 25 Feb 2025 14:52:28 -0700
|
||||
Subject: [PATCH 5/7] pcie, virtio: Remove redundant pm_cap
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Eric Auger <eric.auger@redhat.com>
|
||||
RH-MergeRequest: 348: PCI: Implement basic PCI PM capability backing
|
||||
RH-Jira: RHEL-7301
|
||||
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
|
||||
RH-Acked-by: Alex Williamson <None>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [5/6] 81c6e3c9c52a0b3f0b9269b4ac7f56e8e4b5d68b (eauger1/centos-qemu-kvm)
|
||||
|
||||
The pm_cap on the PCIExpressDevice object can be distilled down
|
||||
to the new instance on the PCIDevice object.
|
||||
|
||||
Cc: Michael S. Tsirkin <mst@redhat.com>
|
||||
Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
|
||||
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
|
||||
Reviewed-by: Eric Auger <eric.auger@redhat.com>
|
||||
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
|
||||
Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-5-alex.williamson@redhat.com
|
||||
Signed-off-by: Cédric Le Goater <clg@redhat.com>
|
||||
(cherry picked from commit 8b8d08cf293b930d0f55b2d5385d8dd27e0c6b41)
|
||||
Signed-off-by: Eric Auger <eric.auger@redhat.com>
|
||||
---
|
||||
hw/pci-bridge/pcie_pci_bridge.c | 1 -
|
||||
hw/virtio/virtio-pci.c | 8 +++-----
|
||||
include/hw/pci/pcie.h | 2 --
|
||||
3 files changed, 3 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/hw/pci-bridge/pcie_pci_bridge.c b/hw/pci-bridge/pcie_pci_bridge.c
|
||||
index 2f098e3a13..c0ba6d7928 100644
|
||||
--- a/hw/pci-bridge/pcie_pci_bridge.c
|
||||
+++ b/hw/pci-bridge/pcie_pci_bridge.c
|
||||
@@ -56,7 +56,6 @@ static void pcie_pci_bridge_realize(PCIDevice *d, Error **errp)
|
||||
if (pos < 0) {
|
||||
goto pm_error;
|
||||
}
|
||||
- d->exp.pm_cap = pos;
|
||||
pci_set_word(d->config + pos + PCI_PM_PMC, 0x3);
|
||||
|
||||
pcie_cap_arifwd_init(d);
|
||||
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
|
||||
index 4b2aeaad8d..a85787b837 100644
|
||||
--- a/hw/virtio/virtio-pci.c
|
||||
+++ b/hw/virtio/virtio-pci.c
|
||||
@@ -2200,8 +2200,6 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
|
||||
return;
|
||||
}
|
||||
|
||||
- pci_dev->exp.pm_cap = pos;
|
||||
-
|
||||
/*
|
||||
* Indicates that this function complies with revision 1.2 of the
|
||||
* PCI Power Management Interface Specification.
|
||||
@@ -2295,11 +2293,11 @@ static bool virtio_pci_no_soft_reset(PCIDevice *dev)
|
||||
{
|
||||
uint16_t pmcsr;
|
||||
|
||||
- if (!pci_is_express(dev) || !dev->exp.pm_cap) {
|
||||
+ if (!pci_is_express(dev) || !(dev->cap_present & QEMU_PCI_CAP_PM)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
- pmcsr = pci_get_word(dev->config + dev->exp.pm_cap + PCI_PM_CTRL);
|
||||
+ pmcsr = pci_get_word(dev->config + dev->pm_cap + PCI_PM_CTRL);
|
||||
|
||||
/*
|
||||
* When No_Soft_Reset bit is set and the device
|
||||
@@ -2328,7 +2326,7 @@ static void virtio_pci_bus_reset_hold(Object *obj, ResetType type)
|
||||
|
||||
if (proxy->flags & VIRTIO_PCI_FLAG_INIT_PM) {
|
||||
pci_word_test_and_clear_mask(
|
||||
- dev->config + dev->exp.pm_cap + PCI_PM_CTRL,
|
||||
+ dev->config + dev->pm_cap + PCI_PM_CTRL,
|
||||
PCI_PM_CTRL_STATE_MASK);
|
||||
}
|
||||
}
|
||||
diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
|
||||
index 5eddb90976..8a30d07fd0 100644
|
||||
--- a/include/hw/pci/pcie.h
|
||||
+++ b/include/hw/pci/pcie.h
|
||||
@@ -58,8 +58,6 @@ typedef enum {
|
||||
struct PCIExpressDevice {
|
||||
/* Offset of express capability in config space */
|
||||
uint8_t exp_cap;
|
||||
- /* Offset of Power Management capability in config space */
|
||||
- uint8_t pm_cap;
|
||||
|
||||
/* SLOT */
|
||||
bool hpev_notified; /* Logical AND of conditions for hot plug event.
|
||||
--
|
||||
2.48.1
|
||||
|
||||
139
SOURCES/kvm-qga-implement-a-guest-get-load-command.patch
Normal file
139
SOURCES/kvm-qga-implement-a-guest-get-load-command.patch
Normal file
@ -0,0 +1,139 @@
|
||||
From 22f26a93ab94bf87c0724891a5886797a38c23b4 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= <berrange@redhat.com>
|
||||
Date: Mon, 2 Dec 2024 12:19:27 +0000
|
||||
Subject: [PATCH 6/9] qga: implement a 'guest-get-load' command
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Konstantin Kostiuk <None>
|
||||
RH-MergeRequest: 343: RHEL-69622: qga: implement a 'guest-get-load' command
|
||||
RH-Jira: RHEL-69622
|
||||
RH-Acked-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
||||
RH-Commit: [1/1] 9284c70737ad9f700d37f8c3833f855f2354acb7 (kkostiuk/redhat-centos-stream-src-qemu-kvm)
|
||||
|
||||
Provide a way to report the process load average, via a new
|
||||
'guest-get-load' command.
|
||||
|
||||
This is only implemented for POSIX platforms providing 'getloadavg'.
|
||||
|
||||
Example illustrated with qmp-shell:
|
||||
|
||||
(QEMU) guest-get-load
|
||||
{
|
||||
"return": {
|
||||
"load15m": 1.546875,
|
||||
"load1m": 1.669921875,
|
||||
"load5m": 1.9306640625
|
||||
}
|
||||
}
|
||||
|
||||
Windows has no native equivalent API, but it would be possible to
|
||||
simulate it as illustrated here (BSD-3-Clause):
|
||||
|
||||
https://github.com/giampaolo/psutil/pull/1485
|
||||
|
||||
This is left as an exercise for future contributors.
|
||||
|
||||
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
|
||||
Reviewed-by: Konstantin Kostiuk <kkostiuk@redhat.com>
|
||||
Message-ID: <20241202121927.864335-1-berrange@redhat.com>
|
||||
Signed-off-by: Konstantin Kostiuk <kkostiuk@redhat.com>
|
||||
---
|
||||
meson.build | 1 +
|
||||
qga/commands-posix.c | 20 ++++++++++++++++++++
|
||||
qga/qapi-schema.json | 37 +++++++++++++++++++++++++++++++++++++
|
||||
3 files changed, 58 insertions(+)
|
||||
|
||||
diff --git a/meson.build b/meson.build
|
||||
index b3529aa0e1..1dd97c6f49 100644
|
||||
--- a/meson.build
|
||||
+++ b/meson.build
|
||||
@@ -2497,6 +2497,7 @@ config_host_data.set('CONFIG_SETNS', cc.has_function('setns') and cc.has_functio
|
||||
config_host_data.set('CONFIG_SYNCFS', cc.has_function('syncfs'))
|
||||
config_host_data.set('CONFIG_SYNC_FILE_RANGE', cc.has_function('sync_file_range'))
|
||||
config_host_data.set('CONFIG_TIMERFD', cc.has_function('timerfd_create'))
|
||||
+config_host_data.set('CONFIG_GETLOADAVG', cc.has_function('getloadavg'))
|
||||
config_host_data.set('HAVE_COPY_FILE_RANGE', cc.has_function('copy_file_range'))
|
||||
config_host_data.set('HAVE_GETIFADDRS', cc.has_function('getifaddrs'))
|
||||
config_host_data.set('HAVE_GLIB_WITH_SLICE_ALLOCATOR', glib_has_gslice)
|
||||
diff --git a/qga/commands-posix.c b/qga/commands-posix.c
|
||||
index 49e40f9127..abfa53d6e9 100644
|
||||
--- a/qga/commands-posix.c
|
||||
+++ b/qga/commands-posix.c
|
||||
@@ -1371,3 +1371,23 @@ char *qga_get_host_name(Error **errp)
|
||||
|
||||
return g_steal_pointer(&hostname);
|
||||
}
|
||||
+
|
||||
+#ifdef CONFIG_GETLOADAVG
|
||||
+GuestLoadAverage *qmp_guest_get_load(Error **errp)
|
||||
+{
|
||||
+ double loadavg[3];
|
||||
+ GuestLoadAverage *ret = NULL;
|
||||
+
|
||||
+ if (getloadavg(loadavg, G_N_ELEMENTS(loadavg)) < 0) {
|
||||
+ error_setg_errno(errp, errno,
|
||||
+ "cannot query load average");
|
||||
+ return NULL;
|
||||
+ }
|
||||
+
|
||||
+ ret = g_new0(GuestLoadAverage, 1);
|
||||
+ ret->load1m = loadavg[0];
|
||||
+ ret->load5m = loadavg[1];
|
||||
+ ret->load15m = loadavg[2];
|
||||
+ return ret;
|
||||
+}
|
||||
+#endif
|
||||
diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
|
||||
index 495706cf73..739f008ff2 100644
|
||||
--- a/qga/qapi-schema.json
|
||||
+++ b/qga/qapi-schema.json
|
||||
@@ -1852,6 +1852,43 @@
|
||||
'if': 'CONFIG_LINUX'
|
||||
}
|
||||
|
||||
+
|
||||
+##
|
||||
+# @GuestLoadAverage:
|
||||
+#
|
||||
+# Statistics about process load information
|
||||
+#
|
||||
+# @load1m: 1-minute load average
|
||||
+#
|
||||
+# @load5m: 5-minute load average
|
||||
+#
|
||||
+# @load15m: 15-minute load average
|
||||
+#
|
||||
+# Since: 10.0
|
||||
+##
|
||||
+{ 'struct': 'GuestLoadAverage',
|
||||
+ 'data': {
|
||||
+ 'load1m': 'number',
|
||||
+ 'load5m': 'number',
|
||||
+ 'load15m': 'number'
|
||||
+ },
|
||||
+ 'if': 'CONFIG_GETLOADAVG'
|
||||
+}
|
||||
+
|
||||
+##
|
||||
+# @guest-get-load:
|
||||
+#
|
||||
+# Retrieve CPU process load information
|
||||
+#
|
||||
+# Returns: load information
|
||||
+#
|
||||
+# Since: 10.0
|
||||
+##
|
||||
+{ 'command': 'guest-get-load',
|
||||
+ 'returns': 'GuestLoadAverage',
|
||||
+ 'if': 'CONFIG_GETLOADAVG'
|
||||
+}
|
||||
+
|
||||
##
|
||||
# @GuestNetworkRoute:
|
||||
#
|
||||
--
|
||||
2.48.1
|
||||
|
||||
273
SOURCES/kvm-rbd-Fix-.bdrv_get_specific_info-implementation.patch
Normal file
273
SOURCES/kvm-rbd-Fix-.bdrv_get_specific_info-implementation.patch
Normal file
@ -0,0 +1,273 @@
|
||||
From 181b9ca805f3ae09c24a925eea0460525f30c90e Mon Sep 17 00:00:00 2001
|
||||
From: Kevin Wolf <kwolf@redhat.com>
|
||||
Date: Mon, 11 Aug 2025 15:40:10 +0200
|
||||
Subject: [PATCH] rbd: Fix .bdrv_get_specific_info implementation
|
||||
|
||||
RH-Author: Kevin Wolf <kwolf@redhat.com>
|
||||
RH-MergeRequest: 400: rbd: Fix .bdrv_get_specific_info implementation
|
||||
RH-Jira: RHEL-108726
|
||||
RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
|
||||
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-Commit: [1/1] 5a488d6e2355adcec7fc4fd686c6be001808a146 (kmwolf/centos-qemu-kvm)
|
||||
|
||||
qemu_rbd_get_specific_info() has at least two problems:
|
||||
|
||||
The first is that it issues a blocking rbd_read() call in order to probe
|
||||
the encryption format for the image while querying the node. This means
|
||||
that if the connection to the server goes down, not only I/O is stuck
|
||||
(which is unavoidable), but query-named-block-nodes will actually make
|
||||
the whole QEMU instance unresponsive. .bdrv_get_specific_info
|
||||
implementations shouldn't perform blocking operations, but only return
|
||||
what is already known.
|
||||
|
||||
The second is that the information returned isn't even correct. If the
|
||||
image is already opened with encryption enabled at the RBD level, we'll
|
||||
probe for "double encryption", i.e. if the encrypted data contains
|
||||
another encryption header. If it doesn't (which is the normal case), we
|
||||
won't return the encryption format. If it does, we return misleading
|
||||
information because it looks like we're talking about the outer level
|
||||
(the encryption format of the image itself) while the information is
|
||||
about an encryption header in the guest data.
|
||||
|
||||
Fix this by storing the encryption format in BDRVRBDState when the image
|
||||
is opened (and we do blocking operations anyway) and returning only the
|
||||
stored information in qemu_rbd_get_specific_info().
|
||||
|
||||
The information we'll store is either the actual encryption format that
|
||||
we enabled on the RBD level, or if the image is unencrypted, the result
|
||||
of the same probing as we previously did when querying the node. Probing
|
||||
image formats based on content that can be modified by the guest has
|
||||
long been known as problematic, but as long as we only output it to the
|
||||
user instead of making decisions based on it, it should be okay. It is
|
||||
undoubtedly useful in the context of 'qemu-img info' when you're trying
|
||||
to figure out which encryption options you have to use to open the
|
||||
image successfully.
|
||||
|
||||
Fixes: 42e4ac9ef5a6 ("block/rbd: Add support for rbd image encryption")
|
||||
Buglink: https://issues.redhat.com/browse/RHEL-105440
|
||||
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
|
||||
Message-ID: <20250811134010.81787-1-kwolf@redhat.com>
|
||||
Reviewed-by: Hanna Czenczek <hreitz@redhat.com>
|
||||
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
|
||||
(cherry picked from commit 4af976ef398e4e823addc00bf1c58787ba4952fe)
|
||||
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
|
||||
---
|
||||
block/rbd.c | 104 ++++++++++++++++++++++++++++---------------
|
||||
qapi/block-core.json | 9 +++-
|
||||
2 files changed, 76 insertions(+), 37 deletions(-)
|
||||
|
||||
diff --git a/block/rbd.c b/block/rbd.c
|
||||
index 627f8eb05a..d5546da71b 100644
|
||||
--- a/block/rbd.c
|
||||
+++ b/block/rbd.c
|
||||
@@ -99,6 +99,14 @@ typedef struct BDRVRBDState {
|
||||
char *namespace;
|
||||
uint64_t image_size;
|
||||
uint64_t object_size;
|
||||
+
|
||||
+ /*
|
||||
+ * If @bs->encrypted is true, this is the encryption format actually loaded
|
||||
+ * at the librbd level. If it is false, it is the result of probing.
|
||||
+ * RBD_IMAGE_ENCRYPTION_FORMAT__MAX means that encryption is not enabled and
|
||||
+ * probing didn't find any known encryption header either.
|
||||
+ */
|
||||
+ RbdImageEncryptionFormat encryption_format;
|
||||
} BDRVRBDState;
|
||||
|
||||
typedef struct RBDTask {
|
||||
@@ -471,10 +479,12 @@ static int qemu_rbd_encryption_format(rbd_image_t image,
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static int qemu_rbd_encryption_load(rbd_image_t image,
|
||||
+static int qemu_rbd_encryption_load(BlockDriverState *bs,
|
||||
+ rbd_image_t image,
|
||||
RbdEncryptionOptions *encrypt,
|
||||
Error **errp)
|
||||
{
|
||||
+ BDRVRBDState *s = bs->opaque;
|
||||
int r = 0;
|
||||
g_autofree char *passphrase = NULL;
|
||||
rbd_encryption_luks1_format_options_t luks_opts;
|
||||
@@ -545,15 +555,19 @@ static int qemu_rbd_encryption_load(rbd_image_t image,
|
||||
error_setg_errno(errp, -r, "encryption load fail");
|
||||
return r;
|
||||
}
|
||||
+ bs->encrypted = true;
|
||||
+ s->encryption_format = encrypt->format;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef LIBRBD_SUPPORTS_ENCRYPTION_LOAD2
|
||||
-static int qemu_rbd_encryption_load2(rbd_image_t image,
|
||||
+static int qemu_rbd_encryption_load2(BlockDriverState *bs,
|
||||
+ rbd_image_t image,
|
||||
RbdEncryptionOptions *encrypt,
|
||||
Error **errp)
|
||||
{
|
||||
+ BDRVRBDState *s = bs->opaque;
|
||||
int r = 0;
|
||||
int encrypt_count = 1;
|
||||
int i;
|
||||
@@ -639,6 +653,8 @@ static int qemu_rbd_encryption_load2(rbd_image_t image,
|
||||
error_setg_errno(errp, -r, "layered encryption load fail");
|
||||
goto exit;
|
||||
}
|
||||
+ bs->encrypted = true;
|
||||
+ s->encryption_format = encrypt->format;
|
||||
|
||||
exit:
|
||||
for (i = 0; i < encrypt_count; ++i) {
|
||||
@@ -672,6 +688,45 @@ exit:
|
||||
#endif
|
||||
#endif
|
||||
|
||||
+/*
|
||||
+ * For an image without encryption enabled on the rbd layer, probe the start of
|
||||
+ * the image if it could be opened as an encrypted image so that we can display
|
||||
+ * it when the user queries the node (most importantly in qemu-img).
|
||||
+ *
|
||||
+ * If the guest writes an encryption header to its disk after this probing, this
|
||||
+ * won't be reflected when queried, but that's okay. There is no reason why the
|
||||
+ * user should want to apply encryption at the rbd level while the image is
|
||||
+ * still in use. This is just guest data.
|
||||
+ */
|
||||
+static void qemu_rbd_encryption_probe(BlockDriverState *bs)
|
||||
+{
|
||||
+ BDRVRBDState *s = bs->opaque;
|
||||
+ char buf[RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN] = {0};
|
||||
+ int r;
|
||||
+
|
||||
+ assert(s->encryption_format == RBD_IMAGE_ENCRYPTION_FORMAT__MAX);
|
||||
+
|
||||
+ r = rbd_read(s->image, 0,
|
||||
+ RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN, buf);
|
||||
+ if (r < RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ if (memcmp(buf, rbd_luks_header_verification,
|
||||
+ RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) {
|
||||
+ s->encryption_format = RBD_IMAGE_ENCRYPTION_FORMAT_LUKS;
|
||||
+ } else if (memcmp(buf, rbd_luks2_header_verification,
|
||||
+ RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) {
|
||||
+ s->encryption_format = RBD_IMAGE_ENCRYPTION_FORMAT_LUKS2;
|
||||
+ } else if (memcmp(buf, rbd_layered_luks_header_verification,
|
||||
+ RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) {
|
||||
+ s->encryption_format = RBD_IMAGE_ENCRYPTION_FORMAT_LUKS;
|
||||
+ } else if (memcmp(buf, rbd_layered_luks2_header_verification,
|
||||
+ RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) {
|
||||
+ s->encryption_format = RBD_IMAGE_ENCRYPTION_FORMAT_LUKS2;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
/* FIXME Deprecate and remove keypairs or make it available in QMP. */
|
||||
static int qemu_rbd_do_create(BlockdevCreateOptions *options,
|
||||
const char *keypairs, const char *password_secret,
|
||||
@@ -1134,17 +1189,18 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
|
||||
goto failed_open;
|
||||
}
|
||||
|
||||
+ s->encryption_format = RBD_IMAGE_ENCRYPTION_FORMAT__MAX;
|
||||
if (opts->encrypt) {
|
||||
#ifdef LIBRBD_SUPPORTS_ENCRYPTION
|
||||
if (opts->encrypt->parent) {
|
||||
#ifdef LIBRBD_SUPPORTS_ENCRYPTION_LOAD2
|
||||
- r = qemu_rbd_encryption_load2(s->image, opts->encrypt, errp);
|
||||
+ r = qemu_rbd_encryption_load2(bs, s->image, opts->encrypt, errp);
|
||||
#else
|
||||
r = -ENOTSUP;
|
||||
error_setg(errp, "RBD library does not support layered encryption");
|
||||
#endif
|
||||
} else {
|
||||
- r = qemu_rbd_encryption_load(s->image, opts->encrypt, errp);
|
||||
+ r = qemu_rbd_encryption_load(bs, s->image, opts->encrypt, errp);
|
||||
}
|
||||
if (r < 0) {
|
||||
goto failed_post_open;
|
||||
@@ -1154,6 +1210,8 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
|
||||
error_setg(errp, "RBD library does not support image encryption");
|
||||
goto failed_post_open;
|
||||
#endif
|
||||
+ } else {
|
||||
+ qemu_rbd_encryption_probe(bs);
|
||||
}
|
||||
|
||||
r = rbd_stat(s->image, &info, sizeof(info));
|
||||
@@ -1413,17 +1471,6 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
|
||||
{
|
||||
BDRVRBDState *s = bs->opaque;
|
||||
ImageInfoSpecific *spec_info;
|
||||
- char buf[RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN] = {0};
|
||||
- int r;
|
||||
-
|
||||
- if (s->image_size >= RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) {
|
||||
- r = rbd_read(s->image, 0,
|
||||
- RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN, buf);
|
||||
- if (r < 0) {
|
||||
- error_setg_errno(errp, -r, "cannot read image start for probe");
|
||||
- return NULL;
|
||||
- }
|
||||
- }
|
||||
|
||||
spec_info = g_new(ImageInfoSpecific, 1);
|
||||
*spec_info = (ImageInfoSpecific){
|
||||
@@ -1431,28 +1478,13 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
|
||||
.u.rbd.data = g_new0(ImageInfoSpecificRbd, 1),
|
||||
};
|
||||
|
||||
- if (memcmp(buf, rbd_luks_header_verification,
|
||||
- RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) {
|
||||
- spec_info->u.rbd.data->encryption_format =
|
||||
- RBD_IMAGE_ENCRYPTION_FORMAT_LUKS;
|
||||
- spec_info->u.rbd.data->has_encryption_format = true;
|
||||
- } else if (memcmp(buf, rbd_luks2_header_verification,
|
||||
- RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) {
|
||||
- spec_info->u.rbd.data->encryption_format =
|
||||
- RBD_IMAGE_ENCRYPTION_FORMAT_LUKS2;
|
||||
- spec_info->u.rbd.data->has_encryption_format = true;
|
||||
- } else if (memcmp(buf, rbd_layered_luks_header_verification,
|
||||
- RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) {
|
||||
- spec_info->u.rbd.data->encryption_format =
|
||||
- RBD_IMAGE_ENCRYPTION_FORMAT_LUKS;
|
||||
- spec_info->u.rbd.data->has_encryption_format = true;
|
||||
- } else if (memcmp(buf, rbd_layered_luks2_header_verification,
|
||||
- RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) {
|
||||
- spec_info->u.rbd.data->encryption_format =
|
||||
- RBD_IMAGE_ENCRYPTION_FORMAT_LUKS2;
|
||||
- spec_info->u.rbd.data->has_encryption_format = true;
|
||||
+ if (s->encryption_format == RBD_IMAGE_ENCRYPTION_FORMAT__MAX) {
|
||||
+ assert(!bs->encrypted);
|
||||
} else {
|
||||
- spec_info->u.rbd.data->has_encryption_format = false;
|
||||
+ ImageInfoSpecificRbd *rbd_info = spec_info->u.rbd.data;
|
||||
+
|
||||
+ rbd_info->has_encryption_format = true;
|
||||
+ rbd_info->encryption_format = s->encryption_format;
|
||||
}
|
||||
|
||||
return spec_info;
|
||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
||||
index 3969c60b93..15b91e2d4a 100644
|
||||
--- a/qapi/block-core.json
|
||||
+++ b/qapi/block-core.json
|
||||
@@ -158,7 +158,14 @@
|
||||
##
|
||||
# @ImageInfoSpecificRbd:
|
||||
#
|
||||
-# @encryption-format: Image encryption format
|
||||
+# @encryption-format: Image encryption format. If encryption is enabled for the
|
||||
+# image (see encrypted in BlockNodeInfo), this is the actual format in which the
|
||||
+# image is accessed. If encryption is not enabled, this is the result of
|
||||
+# probing when the image was opened, to give a suggestion which encryption
|
||||
+# format could be enabled. Note that probing results can be changed by the
|
||||
+# guest by writing a (possibly partial) encryption format header to the
|
||||
+# image, so don't treat this information as trusted if the guest is not
|
||||
+# trusted.
|
||||
#
|
||||
# Since: 6.1
|
||||
##
|
||||
--
|
||||
2.50.1
|
||||
|
||||
36
SOURCES/kvm-redhat-Enable-virtio-mem-on-s390x.patch
Normal file
36
SOURCES/kvm-redhat-Enable-virtio-mem-on-s390x.patch
Normal file
@ -0,0 +1,36 @@
|
||||
From 7300a435547b7e999227648fd1451db00e9c4867 Mon Sep 17 00:00:00 2001
|
||||
From: Thomas Huth <thuth@redhat.com>
|
||||
Date: Mon, 24 Mar 2025 18:09:26 +0100
|
||||
Subject: [PATCH 26/26] redhat: Enable virtio-mem on s390x
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [26/26] 076b44c8f0262e903c5e17eda676614aec6f5c98 (thuth/qemu-kvm-cs)
|
||||
|
||||
JIRA: https://issues.redhat.com/browse/RHEL-72977
|
||||
|
||||
Enable virtio-mem on s390x now, too.
|
||||
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
configs/devices/s390x-softmmu/s390x-rh-devices.mak | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/configs/devices/s390x-softmmu/s390x-rh-devices.mak b/configs/devices/s390x-softmmu/s390x-rh-devices.mak
|
||||
index 24cf6dbd03..834281d872 100644
|
||||
--- a/configs/devices/s390x-softmmu/s390x-rh-devices.mak
|
||||
+++ b/configs/devices/s390x-softmmu/s390x-rh-devices.mak
|
||||
@@ -12,6 +12,7 @@ CONFIG_VFIO_CCW=y
|
||||
CONFIG_VFIO_PCI=y
|
||||
CONFIG_VHOST_USER=y
|
||||
CONFIG_VIRTIO_CCW=y
|
||||
+CONFIG_VIRTIO_MEM=y
|
||||
CONFIG_WDT_DIAG288=y
|
||||
CONFIG_VHOST_VSOCK=y
|
||||
CONFIG_VHOST_USER_VSOCK=y
|
||||
--
|
||||
2.48.1
|
||||
|
||||
94
SOURCES/kvm-reset-Add-RESET_TYPE_WAKEUP.patch
Normal file
94
SOURCES/kvm-reset-Add-RESET_TYPE_WAKEUP.patch
Normal file
@ -0,0 +1,94 @@
|
||||
From 2de79d978c2cd29ad686dd91e74a86dbf2121f1f Mon Sep 17 00:00:00 2001
|
||||
From: Juraj Marcin <jmarcin@redhat.com>
|
||||
Date: Wed, 4 Sep 2024 12:37:13 +0200
|
||||
Subject: [PATCH 06/26] reset: Add RESET_TYPE_WAKEUP
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [6/26] 6169fe25bfa5715340c180ee8711d0ad61832106 (thuth/qemu-kvm-cs)
|
||||
|
||||
Some devices need to distinguish cold start reset from waking up from a
|
||||
suspended state. This patch adds a new value to the enum, and updates the
|
||||
i386 wakeup method to use this new reset type.
|
||||
|
||||
Message-ID: <20240904103722.946194-3-jmarcin@redhat.com>
|
||||
Reviewed-by: David Hildenbrand <david@redhat.com>
|
||||
Signed-off-by: Juraj Marcin <jmarcin@redhat.com>
|
||||
Signed-off-by: David Hildenbrand <david@redhat.com>
|
||||
(cherry picked from commit 759cbb4ee971da13ddfa8ad73befc2351d542044)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
docs/devel/reset.rst | 12 +++++++++++-
|
||||
hw/i386/pc.c | 2 +-
|
||||
include/hw/resettable.h | 2 ++
|
||||
3 files changed, 14 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/docs/devel/reset.rst b/docs/devel/reset.rst
|
||||
index d2799eba7a..44bd51b42e 100644
|
||||
--- a/docs/devel/reset.rst
|
||||
+++ b/docs/devel/reset.rst
|
||||
@@ -44,6 +44,17 @@ The Resettable interface handles reset types with an enum ``ResetType``:
|
||||
value on each cold reset, such as RNG seed information, and which they
|
||||
must not reinitialize on a snapshot-load reset.
|
||||
|
||||
+``RESET_TYPE_WAKEUP``
|
||||
+ If the machine supports waking up from a suspended state and needs to reset
|
||||
+ its devices during wake-up (from the ``MachineClass::wakeup()`` method), this
|
||||
+ reset type should be used for such a request. Devices can utilize this reset
|
||||
+ type to differentiate the reset requested during machine wake-up from other
|
||||
+ reset requests. For example, RAM content must not be lost during wake-up, and
|
||||
+ memory devices like virtio-mem that provide additional RAM must not reset
|
||||
+ such state during wake-ups, but might do so during cold resets. However, this
|
||||
+ reset type should not be used for wake-up detection, as not every machine
|
||||
+ type issues a device reset request during wake-up.
|
||||
+
|
||||
``RESET_TYPE_S390_CPU_NORMAL``
|
||||
This is only used for S390 CPU objects; it clears interrupts, stops
|
||||
processing, and clears the TLB, but does not touch register contents.
|
||||
@@ -53,7 +64,6 @@ The Resettable interface handles reset types with an enum ``ResetType``:
|
||||
``RESET_TYPE_S390_CPU_NORMAL`` does and also clears the PSW, prefix,
|
||||
FPC, timer and control registers. It does not touch gprs, fprs or acrs.
|
||||
|
||||
-
|
||||
Devices which implement reset methods must treat any unknown ``ResetType``
|
||||
as equivalent to ``RESET_TYPE_COLD``; this will reduce the amount of
|
||||
existing code we need to change if we add more types in future.
|
||||
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
|
||||
index fedcf2a65f..fa9f16cbaf 100644
|
||||
--- a/hw/i386/pc.c
|
||||
+++ b/hw/i386/pc.c
|
||||
@@ -1889,7 +1889,7 @@ static void pc_machine_reset(MachineState *machine, ResetType type)
|
||||
static void pc_machine_wakeup(MachineState *machine)
|
||||
{
|
||||
cpu_synchronize_all_states();
|
||||
- pc_machine_reset(machine, RESET_TYPE_COLD);
|
||||
+ pc_machine_reset(machine, RESET_TYPE_WAKEUP);
|
||||
cpu_synchronize_all_post_reset();
|
||||
}
|
||||
|
||||
diff --git a/include/hw/resettable.h b/include/hw/resettable.h
|
||||
index 83b561fc83..cf37cd5ead 100644
|
||||
--- a/include/hw/resettable.h
|
||||
+++ b/include/hw/resettable.h
|
||||
@@ -29,6 +29,7 @@ typedef struct ResettableState ResettableState;
|
||||
* Types of reset.
|
||||
*
|
||||
* + Cold: reset resulting from a power cycle of the object.
|
||||
+ * + Wakeup: reset resulting from a wake-up from a suspended state.
|
||||
*
|
||||
* TODO: Support has to be added to handle more types. In particular,
|
||||
* ResettableState structure needs to be expanded.
|
||||
@@ -36,6 +37,7 @@ typedef struct ResettableState ResettableState;
|
||||
typedef enum ResetType {
|
||||
RESET_TYPE_COLD,
|
||||
RESET_TYPE_SNAPSHOT_LOAD,
|
||||
+ RESET_TYPE_WAKEUP,
|
||||
RESET_TYPE_S390_CPU_INITIAL,
|
||||
RESET_TYPE_S390_CPU_NORMAL,
|
||||
} ResetType;
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,360 @@
|
||||
From 8d48193b5a661f31c1c1db068d241b31ae379339 Mon Sep 17 00:00:00 2001
|
||||
From: Juraj Marcin <jmarcin@redhat.com>
|
||||
Date: Wed, 4 Sep 2024 12:37:12 +0200
|
||||
Subject: [PATCH 05/26] reset: Use ResetType for qemu_devices_reset() and
|
||||
MachineClass::reset()
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [5/26] ea1324b27885d979bcc54cc355dbdf940686776c (thuth/qemu-kvm-cs)
|
||||
|
||||
Currently, both qemu_devices_reset() and MachineClass::reset() use
|
||||
ShutdownCause for the reason of the reset. However, the Resettable
|
||||
interface uses ResetType, so ShutdownCause needs to be translated to
|
||||
ResetType somewhere. Translating it in qemu_devices_reset() makes adding
|
||||
new reset types harder, as they cannot always be matched to a single
|
||||
ShutdownCause here, and devices may need to check the ResetType to
|
||||
determine what to reset and if to reset at all.
|
||||
|
||||
This patch moves this translation up in the call stack to
|
||||
qemu_system_reset() and updates all MachineClass children to use the
|
||||
ResetType instead.
|
||||
|
||||
Message-ID: <20240904103722.946194-2-jmarcin@redhat.com>
|
||||
Reviewed-by: David Hildenbrand <david@redhat.com>
|
||||
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
|
||||
Signed-off-by: Juraj Marcin <jmarcin@redhat.com>
|
||||
Signed-off-by: David Hildenbrand <david@redhat.com>
|
||||
(cherry picked from commit 1b063fe2df002052cc2d10799764979b8c583480)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
hw/arm/aspeed.c | 4 ++--
|
||||
hw/arm/mps2-tz.c | 4 ++--
|
||||
hw/core/reset.c | 5 +----
|
||||
hw/hppa/machine.c | 4 ++--
|
||||
hw/i386/microvm.c | 4 ++--
|
||||
hw/i386/pc.c | 6 +++---
|
||||
hw/ppc/pegasos2.c | 4 ++--
|
||||
hw/ppc/pnv.c | 4 ++--
|
||||
hw/ppc/spapr.c | 6 +++---
|
||||
hw/s390x/s390-virtio-ccw.c | 4 ++--
|
||||
include/hw/boards.h | 3 ++-
|
||||
include/sysemu/reset.h | 5 +++--
|
||||
system/runstate.c | 13 +++++++++++--
|
||||
13 files changed, 37 insertions(+), 29 deletions(-)
|
||||
|
||||
diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c
|
||||
index fd5603f7aa..cbca7685da 100644
|
||||
--- a/hw/arm/aspeed.c
|
||||
+++ b/hw/arm/aspeed.c
|
||||
@@ -1529,12 +1529,12 @@ static void aspeed_machine_bletchley_class_init(ObjectClass *oc, void *data)
|
||||
aspeed_machine_class_init_cpus_defaults(mc);
|
||||
}
|
||||
|
||||
-static void fby35_reset(MachineState *state, ShutdownCause reason)
|
||||
+static void fby35_reset(MachineState *state, ResetType type)
|
||||
{
|
||||
AspeedMachineState *bmc = ASPEED_MACHINE(state);
|
||||
AspeedGPIOState *gpio = &bmc->soc->gpio;
|
||||
|
||||
- qemu_devices_reset(reason);
|
||||
+ qemu_devices_reset(type);
|
||||
|
||||
/* Board ID: 7 (Class-1, 4 slots) */
|
||||
object_property_set_bool(OBJECT(gpio), "gpioV4", true, &error_fatal);
|
||||
diff --git a/hw/arm/mps2-tz.c b/hw/arm/mps2-tz.c
|
||||
index aec57c0d68..8edf57a66d 100644
|
||||
--- a/hw/arm/mps2-tz.c
|
||||
+++ b/hw/arm/mps2-tz.c
|
||||
@@ -1254,7 +1254,7 @@ static void mps2_set_remap(Object *obj, const char *value, Error **errp)
|
||||
}
|
||||
}
|
||||
|
||||
-static void mps2_machine_reset(MachineState *machine, ShutdownCause reason)
|
||||
+static void mps2_machine_reset(MachineState *machine, ResetType type)
|
||||
{
|
||||
MPS2TZMachineState *mms = MPS2TZ_MACHINE(machine);
|
||||
|
||||
@@ -1264,7 +1264,7 @@ static void mps2_machine_reset(MachineState *machine, ShutdownCause reason)
|
||||
* reset see the correct mapping.
|
||||
*/
|
||||
remap_memory(mms, mms->remap);
|
||||
- qemu_devices_reset(reason);
|
||||
+ qemu_devices_reset(type);
|
||||
}
|
||||
|
||||
static void mps2tz_class_init(ObjectClass *oc, void *data)
|
||||
diff --git a/hw/core/reset.c b/hw/core/reset.c
|
||||
index 58dfc8db3d..14a2639fbf 100644
|
||||
--- a/hw/core/reset.c
|
||||
+++ b/hw/core/reset.c
|
||||
@@ -170,11 +170,8 @@ void qemu_unregister_resettable(Object *obj)
|
||||
resettable_container_remove(get_root_reset_container(), obj);
|
||||
}
|
||||
|
||||
-void qemu_devices_reset(ShutdownCause reason)
|
||||
+void qemu_devices_reset(ResetType type)
|
||||
{
|
||||
- ResetType type = (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD) ?
|
||||
- RESET_TYPE_SNAPSHOT_LOAD : RESET_TYPE_COLD;
|
||||
-
|
||||
/* Reset the simulation */
|
||||
resettable_reset(OBJECT(get_root_reset_container()), type);
|
||||
}
|
||||
diff --git a/hw/hppa/machine.c b/hw/hppa/machine.c
|
||||
index 5d0a8739de..8259fe2e38 100644
|
||||
--- a/hw/hppa/machine.c
|
||||
+++ b/hw/hppa/machine.c
|
||||
@@ -642,12 +642,12 @@ static void machine_HP_C3700_init(MachineState *machine)
|
||||
machine_HP_common_init_tail(machine, pci_bus, translate);
|
||||
}
|
||||
|
||||
-static void hppa_machine_reset(MachineState *ms, ShutdownCause reason)
|
||||
+static void hppa_machine_reset(MachineState *ms, ResetType type)
|
||||
{
|
||||
unsigned int smp_cpus = ms->smp.cpus;
|
||||
int i;
|
||||
|
||||
- qemu_devices_reset(reason);
|
||||
+ qemu_devices_reset(type);
|
||||
|
||||
/* Start all CPUs at the firmware entry point.
|
||||
* Monarch CPU will initialize firmware, secondary CPUs
|
||||
diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c
|
||||
index 40edcee7af..8ae4dff7f2 100644
|
||||
--- a/hw/i386/microvm.c
|
||||
+++ b/hw/i386/microvm.c
|
||||
@@ -462,7 +462,7 @@ static void microvm_machine_state_init(MachineState *machine)
|
||||
microvm_devices_init(mms);
|
||||
}
|
||||
|
||||
-static void microvm_machine_reset(MachineState *machine, ShutdownCause reason)
|
||||
+static void microvm_machine_reset(MachineState *machine, ResetType type)
|
||||
{
|
||||
MicrovmMachineState *mms = MICROVM_MACHINE(machine);
|
||||
CPUState *cs;
|
||||
@@ -475,7 +475,7 @@ static void microvm_machine_reset(MachineState *machine, ShutdownCause reason)
|
||||
mms->kernel_cmdline_fixed = true;
|
||||
}
|
||||
|
||||
- qemu_devices_reset(reason);
|
||||
+ qemu_devices_reset(type);
|
||||
|
||||
CPU_FOREACH(cs) {
|
||||
cpu = X86_CPU(cs);
|
||||
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
|
||||
index fa0e42d072..fedcf2a65f 100644
|
||||
--- a/hw/i386/pc.c
|
||||
+++ b/hw/i386/pc.c
|
||||
@@ -1869,12 +1869,12 @@ static void pc_machine_initfn(Object *obj)
|
||||
qemu_add_machine_init_done_notifier(&pcms->machine_done);
|
||||
}
|
||||
|
||||
-static void pc_machine_reset(MachineState *machine, ShutdownCause reason)
|
||||
+static void pc_machine_reset(MachineState *machine, ResetType type)
|
||||
{
|
||||
CPUState *cs;
|
||||
X86CPU *cpu;
|
||||
|
||||
- qemu_devices_reset(reason);
|
||||
+ qemu_devices_reset(type);
|
||||
|
||||
/* Reset APIC after devices have been reset to cancel
|
||||
* any changes that qemu_devices_reset() might have done.
|
||||
@@ -1889,7 +1889,7 @@ static void pc_machine_reset(MachineState *machine, ShutdownCause reason)
|
||||
static void pc_machine_wakeup(MachineState *machine)
|
||||
{
|
||||
cpu_synchronize_all_states();
|
||||
- pc_machine_reset(machine, SHUTDOWN_CAUSE_NONE);
|
||||
+ pc_machine_reset(machine, RESET_TYPE_COLD);
|
||||
cpu_synchronize_all_post_reset();
|
||||
}
|
||||
|
||||
diff --git a/hw/ppc/pegasos2.c b/hw/ppc/pegasos2.c
|
||||
index 9b0a6b70ab..8ff4a00c34 100644
|
||||
--- a/hw/ppc/pegasos2.c
|
||||
+++ b/hw/ppc/pegasos2.c
|
||||
@@ -291,14 +291,14 @@ static void pegasos2_superio_write(uint8_t addr, uint8_t val)
|
||||
cpu_physical_memory_write(PCI1_IO_BASE + 0x3f1, &val, 1);
|
||||
}
|
||||
|
||||
-static void pegasos2_machine_reset(MachineState *machine, ShutdownCause reason)
|
||||
+static void pegasos2_machine_reset(MachineState *machine, ResetType type)
|
||||
{
|
||||
Pegasos2MachineState *pm = PEGASOS2_MACHINE(machine);
|
||||
void *fdt;
|
||||
uint64_t d[2];
|
||||
int sz;
|
||||
|
||||
- qemu_devices_reset(reason);
|
||||
+ qemu_devices_reset(type);
|
||||
if (!pm->vof) {
|
||||
return; /* Firmware should set up machine so nothing to do */
|
||||
}
|
||||
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
|
||||
index 3526852685..988fd55d88 100644
|
||||
--- a/hw/ppc/pnv.c
|
||||
+++ b/hw/ppc/pnv.c
|
||||
@@ -709,13 +709,13 @@ static void pnv_powerdown_notify(Notifier *n, void *opaque)
|
||||
}
|
||||
}
|
||||
|
||||
-static void pnv_reset(MachineState *machine, ShutdownCause reason)
|
||||
+static void pnv_reset(MachineState *machine, ResetType type)
|
||||
{
|
||||
PnvMachineState *pnv = PNV_MACHINE(machine);
|
||||
IPMIBmc *bmc;
|
||||
void *fdt;
|
||||
|
||||
- qemu_devices_reset(reason);
|
||||
+ qemu_devices_reset(type);
|
||||
|
||||
/*
|
||||
* The machine should provide by default an internal BMC simulator.
|
||||
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
|
||||
index 29e66f1b3f..11c953669a 100644
|
||||
--- a/hw/ppc/spapr.c
|
||||
+++ b/hw/ppc/spapr.c
|
||||
@@ -1725,7 +1725,7 @@ void spapr_check_mmu_mode(bool guest_radix)
|
||||
}
|
||||
}
|
||||
|
||||
-static void spapr_machine_reset(MachineState *machine, ShutdownCause reason)
|
||||
+static void spapr_machine_reset(MachineState *machine, ResetType type)
|
||||
{
|
||||
SpaprMachineState *spapr = SPAPR_MACHINE(machine);
|
||||
PowerPCCPU *first_ppc_cpu;
|
||||
@@ -1733,7 +1733,7 @@ static void spapr_machine_reset(MachineState *machine, ShutdownCause reason)
|
||||
void *fdt;
|
||||
int rc;
|
||||
|
||||
- if (reason != SHUTDOWN_CAUSE_SNAPSHOT_LOAD) {
|
||||
+ if (type != RESET_TYPE_SNAPSHOT_LOAD) {
|
||||
/*
|
||||
* Record-replay snapshot load must not consume random, this was
|
||||
* already replayed from initial machine reset.
|
||||
@@ -1769,7 +1769,7 @@ static void spapr_machine_reset(MachineState *machine, ShutdownCause reason)
|
||||
spapr_setup_hpt(spapr);
|
||||
}
|
||||
|
||||
- qemu_devices_reset(reason);
|
||||
+ qemu_devices_reset(type);
|
||||
|
||||
spapr_ovec_cleanup(spapr->ov5_cas);
|
||||
spapr->ov5_cas = spapr_ovec_new();
|
||||
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
|
||||
index ef2a9687c7..94cad1705b 100644
|
||||
--- a/hw/s390x/s390-virtio-ccw.c
|
||||
+++ b/hw/s390x/s390-virtio-ccw.c
|
||||
@@ -434,7 +434,7 @@ static void s390_pv_prepare_reset(S390CcwMachineState *ms)
|
||||
s390_pv_prep_reset();
|
||||
}
|
||||
|
||||
-static void s390_machine_reset(MachineState *machine, ShutdownCause reason)
|
||||
+static void s390_machine_reset(MachineState *machine, ResetType type)
|
||||
{
|
||||
S390CcwMachineState *ms = S390_CCW_MACHINE(machine);
|
||||
enum s390_reset reset_type;
|
||||
@@ -466,7 +466,7 @@ static void s390_machine_reset(MachineState *machine, ShutdownCause reason)
|
||||
* Device reset includes CPU clear resets so this has to be
|
||||
* done AFTER the unprotect call above.
|
||||
*/
|
||||
- qemu_devices_reset(reason);
|
||||
+ qemu_devices_reset(type);
|
||||
s390_crypto_reset();
|
||||
|
||||
/* configure and start the ipl CPU only */
|
||||
diff --git a/include/hw/boards.h b/include/hw/boards.h
|
||||
index ffefc0a625..fe011b1e86 100644
|
||||
--- a/include/hw/boards.h
|
||||
+++ b/include/hw/boards.h
|
||||
@@ -10,6 +10,7 @@
|
||||
#include "qemu/module.h"
|
||||
#include "qom/object.h"
|
||||
#include "hw/core/cpu.h"
|
||||
+#include "hw/resettable.h"
|
||||
|
||||
#define TYPE_MACHINE_SUFFIX "-machine"
|
||||
|
||||
@@ -253,7 +254,7 @@ struct MachineClass {
|
||||
const char *deprecation_reason;
|
||||
|
||||
void (*init)(MachineState *state);
|
||||
- void (*reset)(MachineState *state, ShutdownCause reason);
|
||||
+ void (*reset)(MachineState *state, ResetType type);
|
||||
void (*wakeup)(MachineState *state);
|
||||
int (*kvm_type)(MachineState *machine, const char *arg);
|
||||
|
||||
diff --git a/include/sysemu/reset.h b/include/sysemu/reset.h
|
||||
index ae436044a9..0e297c0e02 100644
|
||||
--- a/include/sysemu/reset.h
|
||||
+++ b/include/sysemu/reset.h
|
||||
@@ -27,6 +27,7 @@
|
||||
#ifndef QEMU_SYSEMU_RESET_H
|
||||
#define QEMU_SYSEMU_RESET_H
|
||||
|
||||
+#include "hw/resettable.h"
|
||||
#include "qapi/qapi-events-run-state.h"
|
||||
|
||||
typedef void QEMUResetHandler(void *opaque);
|
||||
@@ -110,7 +111,7 @@ void qemu_unregister_reset(QEMUResetHandler *func, void *opaque);
|
||||
|
||||
/**
|
||||
* qemu_devices_reset: Perform a complete system reset
|
||||
- * @reason: reason for the reset
|
||||
+ * @type: type of the reset
|
||||
*
|
||||
* This function performs the low-level work needed to do a complete reset
|
||||
* of the system (calling all the callbacks registered with
|
||||
@@ -121,6 +122,6 @@ void qemu_unregister_reset(QEMUResetHandler *func, void *opaque);
|
||||
* If you want to trigger a system reset from, for instance, a device
|
||||
* model, don't use this function. Use qemu_system_reset_request().
|
||||
*/
|
||||
-void qemu_devices_reset(ShutdownCause reason);
|
||||
+void qemu_devices_reset(ResetType type);
|
||||
|
||||
#endif
|
||||
diff --git a/system/runstate.c b/system/runstate.c
|
||||
index a0e2a5fd22..c2c9afa905 100644
|
||||
--- a/system/runstate.c
|
||||
+++ b/system/runstate.c
|
||||
@@ -32,6 +32,7 @@
|
||||
#include "exec/cpu-common.h"
|
||||
#include "gdbstub/syscalls.h"
|
||||
#include "hw/boards.h"
|
||||
+#include "hw/resettable.h"
|
||||
#include "migration/misc.h"
|
||||
#include "migration/postcopy-ram.h"
|
||||
#include "monitor/monitor.h"
|
||||
@@ -507,15 +508,23 @@ static int qemu_debug_requested(void)
|
||||
void qemu_system_reset(ShutdownCause reason)
|
||||
{
|
||||
MachineClass *mc;
|
||||
+ ResetType type;
|
||||
|
||||
mc = current_machine ? MACHINE_GET_CLASS(current_machine) : NULL;
|
||||
|
||||
cpu_synchronize_all_states();
|
||||
|
||||
+ switch (reason) {
|
||||
+ case SHUTDOWN_CAUSE_SNAPSHOT_LOAD:
|
||||
+ type = RESET_TYPE_SNAPSHOT_LOAD;
|
||||
+ break;
|
||||
+ default:
|
||||
+ type = RESET_TYPE_COLD;
|
||||
+ }
|
||||
if (mc && mc->reset) {
|
||||
- mc->reset(current_machine, reason);
|
||||
+ mc->reset(current_machine, type);
|
||||
} else {
|
||||
- qemu_devices_reset(reason);
|
||||
+ qemu_devices_reset(type);
|
||||
}
|
||||
switch (reason) {
|
||||
case SHUTDOWN_CAUSE_NONE:
|
||||
--
|
||||
2.48.1
|
||||
|
||||
60
SOURCES/kvm-s390x-Fix-leak-in-machine_set_loadparm.patch
Normal file
60
SOURCES/kvm-s390x-Fix-leak-in-machine_set_loadparm.patch
Normal file
@ -0,0 +1,60 @@
|
||||
From 4f627e0ae8efb96380070b6a8d50e88c71f40477 Mon Sep 17 00:00:00 2001
|
||||
From: Fabiano Rosas <farosas@suse.de>
|
||||
Date: Fri, 9 May 2025 14:49:38 -0300
|
||||
Subject: [PATCH 01/57] s390x: Fix leak in machine_set_loadparm
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 387: s390x: Fix memory leaks related to loadparm [rhel-9]
|
||||
RH-Jira: RHEL-98554
|
||||
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
|
||||
RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
|
||||
RH-Commit: [1/2] dadf5b9e187a644e0a8a8c565b1b913ef7f4dcc8 (thuth/qemu-kvm-cs)
|
||||
|
||||
ASAN spotted a leaking string in machine_set_loadparm():
|
||||
|
||||
Direct leak of 9 byte(s) in 1 object(s) allocated from:
|
||||
#0 0x560ffb5bb379 in malloc ../projects/compiler-rt/lib/asan/asan_malloc_linux.cpp:69:3
|
||||
#1 0x7f1aca926518 in g_malloc ../glib/gmem.c:106
|
||||
#2 0x7f1aca94113e in g_strdup ../glib/gstrfuncs.c:364
|
||||
#3 0x560ffc8afbf9 in qobject_input_type_str ../qapi/qobject-input-visitor.c:542:12
|
||||
#4 0x560ffc8a80ff in visit_type_str ../qapi/qapi-visit-core.c:349:10
|
||||
#5 0x560ffbe6053a in machine_set_loadparm ../hw/s390x/s390-virtio-ccw.c:802:10
|
||||
#6 0x560ffc0c5e52 in object_property_set ../qom/object.c:1450:5
|
||||
#7 0x560ffc0d4175 in object_property_set_qobject ../qom/qom-qobject.c:28:10
|
||||
#8 0x560ffc0c6004 in object_property_set_str ../qom/object.c:1458:15
|
||||
#9 0x560ffbe2ae60 in update_machine_ipl_properties ../hw/s390x/ipl.c:569:9
|
||||
#10 0x560ffbe2aa65 in s390_ipl_update_diag308 ../hw/s390x/ipl.c:594:5
|
||||
#11 0x560ffbdee132 in handle_diag_308 ../target/s390x/diag.c:147:9
|
||||
#12 0x560ffbebb956 in helper_diag ../target/s390x/tcg/misc_helper.c:137:9
|
||||
#13 0x7f1a3c51c730 (/memfd:tcg-jit (deleted)+0x39730)
|
||||
|
||||
Cc: qemu-stable@nongnu.org
|
||||
Signed-off-by: Fabiano Rosas <farosas@suse.de>
|
||||
Message-ID: <20250509174938.25935-1-farosas@suse.de>
|
||||
Fixes: 1fd396e3228 ("s390x: Register TYPE_S390_CCW_MACHINE properties as class properties")
|
||||
Reviewed-by: Thomas Huth <thuth@redhat.com>
|
||||
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
(cherry picked from commit bdf12f2a56bf3f13c52eb51f0a994bbfe40706b2)
|
||||
---
|
||||
hw/s390x/s390-virtio-ccw.c | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
|
||||
index 77a1bde71e..fc18ab575f 100644
|
||||
--- a/hw/s390x/s390-virtio-ccw.c
|
||||
+++ b/hw/s390x/s390-virtio-ccw.c
|
||||
@@ -782,6 +782,7 @@ static void machine_set_loadparm(Object *obj, Visitor *v,
|
||||
}
|
||||
|
||||
s390_ipl_fmt_loadparm(ms->loadparm, val, errp);
|
||||
+ g_free(val);
|
||||
}
|
||||
|
||||
static void ccw_machine_class_init(ObjectClass *oc, void *data)
|
||||
--
|
||||
2.39.3
|
||||
|
||||
144
SOURCES/kvm-s390x-introduce-s390_get_memory_limit.patch
Normal file
144
SOURCES/kvm-s390x-introduce-s390_get_memory_limit.patch
Normal file
@ -0,0 +1,144 @@
|
||||
From 1dd38383832fc27f2980f33bb5e10ec1af7e3fc3 Mon Sep 17 00:00:00 2001
|
||||
From: David Hildenbrand <david@redhat.com>
|
||||
Date: Thu, 19 Dec 2024 15:41:07 +0100
|
||||
Subject: [PATCH 15/26] s390x: introduce s390_get_memory_limit()
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [15/26] 5ae6a624a6541283cb15e90ebeb8fef3940c823b (thuth/qemu-kvm-cs)
|
||||
|
||||
Let's add s390_get_memory_limit(), to query what has been successfully
|
||||
set via s390_set_memory_limit(). Allow setting the limit only once.
|
||||
|
||||
We'll remember the limit in the machine state. Move
|
||||
s390_set_memory_limit() to machine code, merging it into
|
||||
set_memory_limit(), because this really is a machine property.
|
||||
|
||||
Message-ID: <20241219144115.2820241-7-david@redhat.com>
|
||||
Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Reviewed-by: Thomas Huth <thuth@redhat.com>
|
||||
Signed-off-by: David Hildenbrand <david@redhat.com>
|
||||
(cherry picked from commit 27221b69a3ea49339a1f82b9622126f3928e0915)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
hw/s390x/s390-virtio-ccw.c | 17 ++++++++++++-----
|
||||
include/hw/s390x/s390-virtio-ccw.h | 8 ++++++++
|
||||
target/s390x/cpu-sysemu.c | 8 --------
|
||||
target/s390x/cpu.h | 1 -
|
||||
4 files changed, 20 insertions(+), 14 deletions(-)
|
||||
|
||||
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
|
||||
index 248ac28d20..f5f147eb92 100644
|
||||
--- a/hw/s390x/s390-virtio-ccw.c
|
||||
+++ b/hw/s390x/s390-virtio-ccw.c
|
||||
@@ -45,6 +45,7 @@
|
||||
#include "migration/blocker.h"
|
||||
#include "qapi/visitor.h"
|
||||
#include "hw/s390x/cpu-topology.h"
|
||||
+#include "kvm/kvm_s390x.h"
|
||||
#include CONFIG_DEVICES
|
||||
|
||||
static Error *pv_mig_blocker;
|
||||
@@ -121,12 +122,16 @@ static void subsystem_reset(void)
|
||||
}
|
||||
}
|
||||
|
||||
-static void set_memory_limit(uint64_t new_limit)
|
||||
+static void s390_set_memory_limit(S390CcwMachineState *s390ms,
|
||||
+ uint64_t new_limit)
|
||||
{
|
||||
- uint64_t hw_limit;
|
||||
- int ret;
|
||||
+ uint64_t hw_limit = 0;
|
||||
+ int ret = 0;
|
||||
|
||||
- ret = s390_set_memory_limit(new_limit, &hw_limit);
|
||||
+ assert(!s390ms->memory_limit && new_limit);
|
||||
+ if (kvm_enabled()) {
|
||||
+ ret = kvm_s390_set_mem_limit(new_limit, &hw_limit);
|
||||
+ }
|
||||
if (ret == -E2BIG) {
|
||||
error_report("host supports a maximum of %" PRIu64 " GB",
|
||||
hw_limit / GiB);
|
||||
@@ -135,10 +140,12 @@ static void set_memory_limit(uint64_t new_limit)
|
||||
error_report("setting the guest size failed");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
+ s390ms->memory_limit = new_limit;
|
||||
}
|
||||
|
||||
static void s390_memory_init(MachineState *machine)
|
||||
{
|
||||
+ S390CcwMachineState *s390ms = S390_CCW_MACHINE(machine);
|
||||
MemoryRegion *sysmem = get_system_memory();
|
||||
MemoryRegion *ram = machine->ram;
|
||||
uint64_t ram_size = memory_region_size(ram);
|
||||
@@ -154,7 +161,7 @@ static void s390_memory_init(MachineState *machine)
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
- set_memory_limit(ram_size);
|
||||
+ s390_set_memory_limit(s390ms, ram_size);
|
||||
|
||||
/* Map the initial memory. Must happen after setting the memory limit. */
|
||||
memory_region_add_subregion(sysmem, 0, ram);
|
||||
diff --git a/include/hw/s390x/s390-virtio-ccw.h b/include/hw/s390x/s390-virtio-ccw.h
|
||||
index 996864a34e..de04336c5a 100644
|
||||
--- a/include/hw/s390x/s390-virtio-ccw.h
|
||||
+++ b/include/hw/s390x/s390-virtio-ccw.h
|
||||
@@ -29,10 +29,18 @@ struct S390CcwMachineState {
|
||||
bool dea_key_wrap;
|
||||
bool pv;
|
||||
uint8_t loadparm[8];
|
||||
+ uint64_t memory_limit;
|
||||
|
||||
SCLPDevice *sclp;
|
||||
};
|
||||
|
||||
+static inline uint64_t s390_get_memory_limit(S390CcwMachineState *s390ms)
|
||||
+{
|
||||
+ /* We expect to be called only after the limit was set. */
|
||||
+ assert(s390ms->memory_limit);
|
||||
+ return s390ms->memory_limit;
|
||||
+}
|
||||
+
|
||||
#define S390_PTF_REASON_NONE (0x00 << 8)
|
||||
#define S390_PTF_REASON_DONE (0x01 << 8)
|
||||
#define S390_PTF_REASON_BUSY (0x02 << 8)
|
||||
diff --git a/target/s390x/cpu-sysemu.c b/target/s390x/cpu-sysemu.c
|
||||
index 1cd30c1d84..3118a25fee 100644
|
||||
--- a/target/s390x/cpu-sysemu.c
|
||||
+++ b/target/s390x/cpu-sysemu.c
|
||||
@@ -255,14 +255,6 @@ unsigned int s390_cpu_set_state(uint8_t cpu_state, S390CPU *cpu)
|
||||
return s390_count_running_cpus();
|
||||
}
|
||||
|
||||
-int s390_set_memory_limit(uint64_t new_limit, uint64_t *hw_limit)
|
||||
-{
|
||||
- if (kvm_enabled()) {
|
||||
- return kvm_s390_set_mem_limit(new_limit, hw_limit);
|
||||
- }
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
void s390_set_max_pagesize(uint64_t pagesize, Error **errp)
|
||||
{
|
||||
if (kvm_enabled()) {
|
||||
diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
|
||||
index 6a64472403..ecaf3191d2 100644
|
||||
--- a/target/s390x/cpu.h
|
||||
+++ b/target/s390x/cpu.h
|
||||
@@ -881,7 +881,6 @@ static inline void s390_do_cpu_load_normal(CPUState *cs, run_on_cpu_data arg)
|
||||
|
||||
/* cpu.c */
|
||||
void s390_crypto_reset(void);
|
||||
-int s390_set_memory_limit(uint64_t new_limit, uint64_t *hw_limit);
|
||||
void s390_set_max_pagesize(uint64_t pagesize, Error **errp);
|
||||
void s390_cmma_reset(void);
|
||||
void s390_enable_css_support(S390CPU *cpu);
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,256 @@
|
||||
From c60d0770ff3f9124e6e9d7beb03e1ef8067e8e26 Mon Sep 17 00:00:00 2001
|
||||
From: Christoph Schlameuss <cschlame@redhat.com>
|
||||
Date: Thu, 12 Jun 2025 13:25:32 +0200
|
||||
Subject: [PATCH 01/16] s390x/pci: add support for guests that request direct
|
||||
mapping
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Christoph Schlameuss <None>
|
||||
RH-MergeRequest: 376: Draft: KVM: Performance Enhanced Refresh PCI Translation
|
||||
RH-Jira: RHEL-11430
|
||||
RH-Acked-by: Thomas Huth <thuth@redhat.com>
|
||||
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
|
||||
RH-Commit: [1/2] 11d1dd9a5add55ae43d5d922588a33945ecbfe27 (cschlame/qemu-kvm)
|
||||
|
||||
JIRA: https://issues.redhat.com/browse/RHEL-11430
|
||||
Conflicts: hw/s390x/s390-pci-bus.c old s390_pci_device_properties[] still has DEFINE_PROP_END_OF_LIST()
|
||||
hw/s390x/s390-pci-inst.c hw_accel.h is still in sysemu
|
||||
hw/s390x/s390-virtio-ccw.c changes from ccw_machine_9_2_class_options() moved to ccw_rhel_machine_9_6_0_class_options()
|
||||
|
||||
commit dfcee1ea4c52ac60e0a06221eafb7b6253eb10c3
|
||||
Author: Matthew Rosato <mjrosato@linux.ibm.com>
|
||||
Date: Wed Feb 26 16:00:12 2025 -0500
|
||||
|
||||
s390x/pci: add support for guests that request direct mapping
|
||||
|
||||
When receiving a guest mpcifc(4) or mpcifc(6) instruction without the T
|
||||
bit set, treat this as a request to perform direct mapping instead of
|
||||
address translation. In order to facilitate this, pin the entirety of
|
||||
guest memory into the host iommu.
|
||||
|
||||
Pinning for the direct mapping case is handled via vfio and its memory
|
||||
listener. Additionally, ram discard settings are inherited from vfio:
|
||||
coordinated discards (e.g. virtio-mem) are allowed while uncoordinated
|
||||
discards (e.g. virtio-balloon) are disabled.
|
||||
|
||||
Subsequent guest DMA operations are all expected to be of the format
|
||||
guest_phys+sdma, allowing them to be used as lookup into the host
|
||||
iommu table.
|
||||
|
||||
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
|
||||
Reviewed-by: David Hildenbrand <david@redhat.com>
|
||||
Message-ID: <20250226210013.238349-2-mjrosato@linux.ibm.com>
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
|
||||
Signed-off-by: Christoph Schlameuss <cschlame@redhat.com>
|
||||
---
|
||||
hw/s390x/s390-pci-bus.c | 39 +++++++++++++++++++++++++++++++--
|
||||
hw/s390x/s390-pci-inst.c | 13 +++++++++--
|
||||
hw/s390x/s390-pci-vfio.c | 23 +++++++++++++++----
|
||||
hw/s390x/s390-virtio-ccw.c | 5 +++++
|
||||
include/hw/s390x/s390-pci-bus.h | 3 +++
|
||||
5 files changed, 75 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
|
||||
index 3e57d5faca..13bc02d837 100644
|
||||
--- a/hw/s390x/s390-pci-bus.c
|
||||
+++ b/hw/s390x/s390-pci-bus.c
|
||||
@@ -18,6 +18,8 @@
|
||||
#include "hw/s390x/s390-pci-inst.h"
|
||||
#include "hw/s390x/s390-pci-kvm.h"
|
||||
#include "hw/s390x/s390-pci-vfio.h"
|
||||
+#include "hw/s390x/s390-virtio-ccw.h"
|
||||
+#include "hw/boards.h"
|
||||
#include "hw/pci/pci_bus.h"
|
||||
#include "hw/qdev-properties.h"
|
||||
#include "hw/pci/pci_bridge.h"
|
||||
@@ -724,12 +726,42 @@ void s390_pci_iommu_enable(S390PCIIOMMU *iommu)
|
||||
g_free(name);
|
||||
}
|
||||
|
||||
+void s390_pci_iommu_direct_map_enable(S390PCIIOMMU *iommu)
|
||||
+{
|
||||
+ MachineState *ms = MACHINE(qdev_get_machine());
|
||||
+ S390CcwMachineState *s390ms = S390_CCW_MACHINE(ms);
|
||||
+
|
||||
+ /*
|
||||
+ * For direct-mapping we must map the entire guest address space. Rather
|
||||
+ * than using an iommu, create a memory region alias that maps GPA X to
|
||||
+ * IOVA X + SDMA. VFIO will handle pinning via its memory listener.
|
||||
+ */
|
||||
+ g_autofree char *name = g_strdup_printf("iommu-dm-s390-%04x",
|
||||
+ iommu->pbdev->uid);
|
||||
+
|
||||
+ iommu->dm_mr = g_malloc0(sizeof(*iommu->dm_mr));
|
||||
+ memory_region_init_alias(iommu->dm_mr, OBJECT(&iommu->mr), name,
|
||||
+ get_system_memory(), 0,
|
||||
+ s390_get_memory_limit(s390ms));
|
||||
+ iommu->enabled = true;
|
||||
+ memory_region_add_subregion(&iommu->mr, iommu->pbdev->zpci_fn.sdma,
|
||||
+ iommu->dm_mr);
|
||||
+}
|
||||
+
|
||||
void s390_pci_iommu_disable(S390PCIIOMMU *iommu)
|
||||
{
|
||||
iommu->enabled = false;
|
||||
g_hash_table_remove_all(iommu->iotlb);
|
||||
- memory_region_del_subregion(&iommu->mr, MEMORY_REGION(&iommu->iommu_mr));
|
||||
- object_unparent(OBJECT(&iommu->iommu_mr));
|
||||
+ if (iommu->dm_mr) {
|
||||
+ memory_region_del_subregion(&iommu->mr, iommu->dm_mr);
|
||||
+ object_unparent(OBJECT(iommu->dm_mr));
|
||||
+ g_free(iommu->dm_mr);
|
||||
+ iommu->dm_mr = NULL;
|
||||
+ } else {
|
||||
+ memory_region_del_subregion(&iommu->mr,
|
||||
+ MEMORY_REGION(&iommu->iommu_mr));
|
||||
+ object_unparent(OBJECT(&iommu->iommu_mr));
|
||||
+ }
|
||||
}
|
||||
|
||||
static void s390_pci_iommu_free(S390pciState *s, PCIBus *bus, int32_t devfn)
|
||||
@@ -1130,6 +1162,7 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
|
||||
/* Always intercept emulated devices */
|
||||
pbdev->interp = false;
|
||||
pbdev->forwarding_assist = false;
|
||||
+ pbdev->rtr_avail = false;
|
||||
}
|
||||
|
||||
if (s390_pci_msix_init(pbdev) && !pbdev->interp) {
|
||||
@@ -1488,6 +1521,8 @@ static Property s390_pci_device_properties[] = {
|
||||
DEFINE_PROP_BOOL("interpret", S390PCIBusDevice, interp, true),
|
||||
DEFINE_PROP_BOOL("forwarding-assist", S390PCIBusDevice, forwarding_assist,
|
||||
true),
|
||||
+ DEFINE_PROP_BOOL("relaxed-translation", S390PCIBusDevice, rtr_avail,
|
||||
+ true),
|
||||
DEFINE_PROP_END_OF_LIST(),
|
||||
};
|
||||
|
||||
diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c
|
||||
index 30149546c0..803ebcd9b3 100644
|
||||
--- a/hw/s390x/s390-pci-inst.c
|
||||
+++ b/hw/s390x/s390-pci-inst.c
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "exec/memory.h"
|
||||
#include "qemu/error-report.h"
|
||||
#include "sysemu/hw_accel.h"
|
||||
+#include "hw/boards.h"
|
||||
#include "hw/pci/pci_device.h"
|
||||
#include "hw/s390x/s390-pci-inst.h"
|
||||
#include "hw/s390x/s390-pci-bus.h"
|
||||
@@ -1008,17 +1009,25 @@ static int reg_ioat(CPUS390XState *env, S390PCIBusDevice *pbdev, ZpciFib fib,
|
||||
}
|
||||
|
||||
/* currently we only support designation type 1 with translation */
|
||||
- if (!(dt == ZPCI_IOTA_RTTO && t)) {
|
||||
+ if (t && dt != ZPCI_IOTA_RTTO) {
|
||||
error_report("unsupported ioat dt %d t %d", dt, t);
|
||||
s390_program_interrupt(env, PGM_OPERAND, ra);
|
||||
return -EINVAL;
|
||||
+ } else if (!t && !pbdev->rtr_avail) {
|
||||
+ error_report("relaxed translation not allowed");
|
||||
+ s390_program_interrupt(env, PGM_OPERAND, ra);
|
||||
+ return -EINVAL;
|
||||
}
|
||||
|
||||
iommu->pba = pba;
|
||||
iommu->pal = pal;
|
||||
iommu->g_iota = g_iota;
|
||||
|
||||
- s390_pci_iommu_enable(iommu);
|
||||
+ if (t) {
|
||||
+ s390_pci_iommu_enable(iommu);
|
||||
+ } else {
|
||||
+ s390_pci_iommu_direct_map_enable(iommu);
|
||||
+ }
|
||||
|
||||
return 0;
|
||||
}
|
||||
diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
|
||||
index 7dbbc76823..443e222912 100644
|
||||
--- a/hw/s390x/s390-pci-vfio.c
|
||||
+++ b/hw/s390x/s390-pci-vfio.c
|
||||
@@ -131,13 +131,28 @@ static void s390_pci_read_base(S390PCIBusDevice *pbdev,
|
||||
/* Store function type separately for type-specific behavior */
|
||||
pbdev->pft = cap->pft;
|
||||
|
||||
+ /*
|
||||
+ * If the device is a passthrough ISM device, disallow relaxed
|
||||
+ * translation.
|
||||
+ */
|
||||
+ if (pbdev->pft == ZPCI_PFT_ISM) {
|
||||
+ pbdev->rtr_avail = false;
|
||||
+ }
|
||||
+
|
||||
/*
|
||||
* If appropriate, reduce the size of the supported DMA aperture reported
|
||||
- * to the guest based upon the vfio DMA limit.
|
||||
+ * to the guest based upon the vfio DMA limit. This is applicable for
|
||||
+ * devices that are guaranteed to not use relaxed translation. If the
|
||||
+ * device is capable of relaxed translation then we must advertise the
|
||||
+ * full aperture. In this case, if translation is used then we will
|
||||
+ * rely on the vfio DMA limit counting and use RPCIT CC1 / status 16
|
||||
+ * to request that the guest free DMA mappings as necessary.
|
||||
*/
|
||||
- vfio_size = pbdev->iommu->max_dma_limit << TARGET_PAGE_BITS;
|
||||
- if (vfio_size > 0 && vfio_size < cap->end_dma - cap->start_dma + 1) {
|
||||
- pbdev->zpci_fn.edma = cap->start_dma + vfio_size - 1;
|
||||
+ if (!pbdev->rtr_avail) {
|
||||
+ vfio_size = pbdev->iommu->max_dma_limit << TARGET_PAGE_BITS;
|
||||
+ if (vfio_size > 0 && vfio_size < cap->end_dma - cap->start_dma + 1) {
|
||||
+ pbdev->zpci_fn.edma = cap->start_dma + vfio_size - 1;
|
||||
+ }
|
||||
}
|
||||
}
|
||||
|
||||
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
|
||||
index 312e8f18aa..77a1bde71e 100644
|
||||
--- a/hw/s390x/s390-virtio-ccw.c
|
||||
+++ b/hw/s390x/s390-virtio-ccw.c
|
||||
@@ -1348,8 +1348,13 @@ static void ccw_rhel_machine_9_6_0_instance_options(MachineState *machine)
|
||||
|
||||
static void ccw_rhel_machine_9_6_0_class_options(MachineClass *mc)
|
||||
{
|
||||
+ static GlobalProperty compat[] = {
|
||||
+ { TYPE_S390_PCI_DEVICE, "relaxed-translation", "off", },
|
||||
+ };
|
||||
+
|
||||
/* NB: remember to move this line to the *latest* RHEL 9 machine */
|
||||
compat_props_add(mc->compat_props, hw_compat_rhel_9, hw_compat_rhel_9_len);
|
||||
+ compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
|
||||
}
|
||||
DEFINE_CCW_MACHINE_AS_LATEST(9, 6, 0);
|
||||
|
||||
diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h
|
||||
index 2c43ea123f..04944d4fed 100644
|
||||
--- a/include/hw/s390x/s390-pci-bus.h
|
||||
+++ b/include/hw/s390x/s390-pci-bus.h
|
||||
@@ -277,6 +277,7 @@ struct S390PCIIOMMU {
|
||||
AddressSpace as;
|
||||
MemoryRegion mr;
|
||||
IOMMUMemoryRegion iommu_mr;
|
||||
+ MemoryRegion *dm_mr;
|
||||
bool enabled;
|
||||
uint64_t g_iota;
|
||||
uint64_t pba;
|
||||
@@ -362,6 +363,7 @@ struct S390PCIBusDevice {
|
||||
bool interp;
|
||||
bool forwarding_assist;
|
||||
bool aif;
|
||||
+ bool rtr_avail;
|
||||
QTAILQ_ENTRY(S390PCIBusDevice) link;
|
||||
};
|
||||
|
||||
@@ -389,6 +391,7 @@ int pci_chsc_sei_nt2_have_event(void);
|
||||
void s390_pci_sclp_configure(SCCB *sccb);
|
||||
void s390_pci_sclp_deconfigure(SCCB *sccb);
|
||||
void s390_pci_iommu_enable(S390PCIIOMMU *iommu);
|
||||
+void s390_pci_iommu_direct_map_enable(S390PCIIOMMU *iommu);
|
||||
void s390_pci_iommu_disable(S390PCIIOMMU *iommu);
|
||||
void s390_pci_generate_error_event(uint16_t pec, uint32_t fh, uint32_t fid,
|
||||
uint64_t faddr, uint32_t e);
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,72 @@
|
||||
From 13e8ddbd282da692c8199a6cb9ca847334089e29 Mon Sep 17 00:00:00 2001
|
||||
From: Christoph Schlameuss <cschlame@redhat.com>
|
||||
Date: Thu, 12 Jun 2025 11:48:41 +0200
|
||||
Subject: [PATCH 02/16] s390x/pci: indicate QEMU supports relaxed translation
|
||||
for passthrough
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
RH-Author: Christoph Schlameuss <None>
|
||||
RH-MergeRequest: 376: Draft: KVM: Performance Enhanced Refresh PCI Translation
|
||||
RH-Jira: RHEL-11430
|
||||
RH-Acked-by: Thomas Huth <thuth@redhat.com>
|
||||
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
|
||||
RH-Commit: [2/2] afd514268347d0b434a60d7c6c09d20b84e5d902 (cschlame/qemu-kvm)
|
||||
|
||||
JIRA: https://issues.redhat.com/browse/RHEL-11430
|
||||
|
||||
commit d9b5dfc7122559e5b5959ecf534788b90c3dd102
|
||||
Author: Matthew Rosato <mjrosato@linux.ibm.com>
|
||||
Date: Wed Feb 26 16:00:13 2025 -0500
|
||||
|
||||
s390x/pci: indicate QEMU supports relaxed translation for passthrough
|
||||
|
||||
Specifying this bit in the guest CLP response indicates that the guest
|
||||
can optionally choose to skip translation and instead use
|
||||
identity-mapped operations.
|
||||
|
||||
Tested-by: Niklas Schnelle <schnelle@linux.ibm.com>
|
||||
Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
|
||||
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
|
||||
Message-ID: <20250226210013.238349-3-mjrosato@linux.ibm.com>
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
|
||||
Signed-off-by: Christoph Schlameuss <cschlame@redhat.com>
|
||||
---
|
||||
hw/s390x/s390-pci-vfio.c | 5 ++++-
|
||||
include/hw/s390x/s390-pci-clp.h | 1 +
|
||||
2 files changed, 5 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
|
||||
index 443e222912..6236ac7f1e 100644
|
||||
--- a/hw/s390x/s390-pci-vfio.c
|
||||
+++ b/hw/s390x/s390-pci-vfio.c
|
||||
@@ -238,8 +238,11 @@ static void s390_pci_read_group(S390PCIBusDevice *pbdev,
|
||||
pbdev->pci_group = s390_group_create(pbdev->zpci_fn.pfgid, start_gid);
|
||||
|
||||
resgrp = &pbdev->pci_group->zpci_group;
|
||||
+ if (pbdev->rtr_avail) {
|
||||
+ resgrp->fr |= CLP_RSP_QPCIG_MASK_RTR;
|
||||
+ }
|
||||
if (cap->flags & VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH) {
|
||||
- resgrp->fr = 1;
|
||||
+ resgrp->fr |= CLP_RSP_QPCIG_MASK_REFRESH;
|
||||
}
|
||||
resgrp->dasm = cap->dasm;
|
||||
resgrp->msia = cap->msi_addr;
|
||||
diff --git a/include/hw/s390x/s390-pci-clp.h b/include/hw/s390x/s390-pci-clp.h
|
||||
index 03b7f9ba5f..6a635d693b 100644
|
||||
--- a/include/hw/s390x/s390-pci-clp.h
|
||||
+++ b/include/hw/s390x/s390-pci-clp.h
|
||||
@@ -158,6 +158,7 @@ typedef struct ClpRspQueryPciGrp {
|
||||
#define CLP_RSP_QPCIG_MASK_NOI 0xfff
|
||||
uint16_t i;
|
||||
uint8_t version;
|
||||
+#define CLP_RSP_QPCIG_MASK_RTR 0x20
|
||||
#define CLP_RSP_QPCIG_MASK_FRAME 0x2
|
||||
#define CLP_RSP_QPCIG_MASK_REFRESH 0x1
|
||||
uint8_t fr;
|
||||
--
|
||||
2.48.1
|
||||
|
||||
46
SOURCES/kvm-s390x-pv-prepare-for-memory-devices.patch
Normal file
46
SOURCES/kvm-s390x-pv-prepare-for-memory-devices.patch
Normal file
@ -0,0 +1,46 @@
|
||||
From 9d5420c4370b74d60f082f2aa1225b19150ee629 Mon Sep 17 00:00:00 2001
|
||||
From: David Hildenbrand <david@redhat.com>
|
||||
Date: Thu, 19 Dec 2024 15:41:12 +0100
|
||||
Subject: [PATCH 20/26] s390x/pv: prepare for memory devices
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [20/26] cdbe71168b9afa9657b94f1e7500568314c707a8 (thuth/qemu-kvm-cs)
|
||||
|
||||
Let's avoid checking for the maxram_size, and instead rely on the memory
|
||||
limit determined in s390_memory_init(), that might be larger than
|
||||
maxram_size, for example due to alignment purposes.
|
||||
|
||||
This check now correctly mimics what the kernel will check in
|
||||
kvm_s390_pv_set_aside(), whereby a VM <= 2 GiB would end up using
|
||||
a segment type ASCE.
|
||||
|
||||
Message-ID: <20241219144115.2820241-12-david@redhat.com>
|
||||
Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Reviewed-by: Nina Schoetterl-Glausch <nsg@linux.ibm.com>
|
||||
Signed-off-by: David Hildenbrand <david@redhat.com>
|
||||
(cherry picked from commit a056332e732110c8ef0d40ffd49bd03afc2f04ca)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
target/s390x/kvm/pv.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/target/s390x/kvm/pv.c b/target/s390x/kvm/pv.c
|
||||
index 424cce75ca..fa66607e7b 100644
|
||||
--- a/target/s390x/kvm/pv.c
|
||||
+++ b/target/s390x/kvm/pv.c
|
||||
@@ -133,7 +133,7 @@ bool s390_pv_vm_try_disable_async(S390CcwMachineState *ms)
|
||||
* If the feature is not present or if the VM is not larger than 2 GiB,
|
||||
* KVM_PV_ASYNC_CLEANUP_PREPARE fill fail; no point in attempting it.
|
||||
*/
|
||||
- if ((MACHINE(ms)->ram_size <= 2 * GiB) ||
|
||||
+ if (s390_get_memory_limit(ms) <= 2 * GiB ||
|
||||
!kvm_check_extension(kvm_state, KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) {
|
||||
return false;
|
||||
}
|
||||
--
|
||||
2.48.1
|
||||
|
||||
107
SOURCES/kvm-s390x-remember-the-maximum-page-size.patch
Normal file
107
SOURCES/kvm-s390x-remember-the-maximum-page-size.patch
Normal file
@ -0,0 +1,107 @@
|
||||
From 5a311d410bca4a5530a51c0b789ce8525d2d0653 Mon Sep 17 00:00:00 2001
|
||||
From: David Hildenbrand <david@redhat.com>
|
||||
Date: Thu, 19 Dec 2024 15:41:13 +0100
|
||||
Subject: [PATCH 21/26] s390x: remember the maximum page size
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [21/26] 3b97c555b153d42e4fcb27dbb65fbf3edac622a4 (thuth/qemu-kvm-cs)
|
||||
|
||||
Let's remember the value (successfully) set via s390_set_max_pagesize().
|
||||
This will be helpful to reject hotplugged memory devices that would exceed
|
||||
this initially set page size.
|
||||
|
||||
Handle it just like how we handle s390_get_memory_limit(), storing it in
|
||||
the machine, and moving the handling to machine code.
|
||||
|
||||
Message-ID: <20241219144115.2820241-13-david@redhat.com>
|
||||
Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Reviewed-by: Thomas Huth <thuth@redhat.com>
|
||||
Signed-off-by: David Hildenbrand <david@redhat.com>
|
||||
(cherry picked from commit df2ac211a62e6ced7f1495b634fa6f78962f2321)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
hw/s390x/s390-virtio-ccw.c | 12 +++++++++++-
|
||||
include/hw/s390x/s390-virtio-ccw.h | 1 +
|
||||
target/s390x/cpu-sysemu.c | 7 -------
|
||||
target/s390x/cpu.h | 1 -
|
||||
4 files changed, 12 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
|
||||
index 824c73536a..bd05a22b4e 100644
|
||||
--- a/hw/s390x/s390-virtio-ccw.c
|
||||
+++ b/hw/s390x/s390-virtio-ccw.c
|
||||
@@ -143,6 +143,16 @@ static void s390_set_memory_limit(S390CcwMachineState *s390ms,
|
||||
s390ms->memory_limit = new_limit;
|
||||
}
|
||||
|
||||
+static void s390_set_max_pagesize(S390CcwMachineState *s390ms,
|
||||
+ uint64_t pagesize)
|
||||
+{
|
||||
+ assert(!s390ms->max_pagesize && pagesize);
|
||||
+ if (kvm_enabled()) {
|
||||
+ kvm_s390_set_max_pagesize(pagesize, &error_fatal);
|
||||
+ }
|
||||
+ s390ms->max_pagesize = pagesize;
|
||||
+}
|
||||
+
|
||||
static void s390_memory_init(MachineState *machine)
|
||||
{
|
||||
S390CcwMachineState *s390ms = S390_CCW_MACHINE(machine);
|
||||
@@ -191,7 +201,7 @@ static void s390_memory_init(MachineState *machine)
|
||||
* Configure the maximum page size. As no memory devices were created
|
||||
* yet, this is the page size of initial memory only.
|
||||
*/
|
||||
- s390_set_max_pagesize(qemu_maxrampagesize(), &error_fatal);
|
||||
+ s390_set_max_pagesize(s390ms, qemu_maxrampagesize());
|
||||
/* Initialize storage key device */
|
||||
s390_skeys_init();
|
||||
/* Initialize storage attributes device */
|
||||
diff --git a/include/hw/s390x/s390-virtio-ccw.h b/include/hw/s390x/s390-virtio-ccw.h
|
||||
index de04336c5a..599740a998 100644
|
||||
--- a/include/hw/s390x/s390-virtio-ccw.h
|
||||
+++ b/include/hw/s390x/s390-virtio-ccw.h
|
||||
@@ -30,6 +30,7 @@ struct S390CcwMachineState {
|
||||
bool pv;
|
||||
uint8_t loadparm[8];
|
||||
uint64_t memory_limit;
|
||||
+ uint64_t max_pagesize;
|
||||
|
||||
SCLPDevice *sclp;
|
||||
};
|
||||
diff --git a/target/s390x/cpu-sysemu.c b/target/s390x/cpu-sysemu.c
|
||||
index 3118a25fee..706a5c53e2 100644
|
||||
--- a/target/s390x/cpu-sysemu.c
|
||||
+++ b/target/s390x/cpu-sysemu.c
|
||||
@@ -255,13 +255,6 @@ unsigned int s390_cpu_set_state(uint8_t cpu_state, S390CPU *cpu)
|
||||
return s390_count_running_cpus();
|
||||
}
|
||||
|
||||
-void s390_set_max_pagesize(uint64_t pagesize, Error **errp)
|
||||
-{
|
||||
- if (kvm_enabled()) {
|
||||
- kvm_s390_set_max_pagesize(pagesize, errp);
|
||||
- }
|
||||
-}
|
||||
-
|
||||
void s390_cmma_reset(void)
|
||||
{
|
||||
if (kvm_enabled()) {
|
||||
diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
|
||||
index ecaf3191d2..9770a62ac9 100644
|
||||
--- a/target/s390x/cpu.h
|
||||
+++ b/target/s390x/cpu.h
|
||||
@@ -881,7 +881,6 @@ static inline void s390_do_cpu_load_normal(CPUState *cs, run_on_cpu_data arg)
|
||||
|
||||
/* cpu.c */
|
||||
void s390_crypto_reset(void);
|
||||
-void s390_set_max_pagesize(uint64_t pagesize, Error **errp);
|
||||
void s390_cmma_reset(void);
|
||||
void s390_enable_css_support(S390CPU *cpu);
|
||||
void s390_do_cpu_set_diag318(CPUState *cs, run_on_cpu_data arg);
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,113 @@
|
||||
From 2fbdf7e3cf23daea470aaa4a29e16641feb76f3c Mon Sep 17 00:00:00 2001
|
||||
From: David Hildenbrand <david@redhat.com>
|
||||
Date: Thu, 19 Dec 2024 15:41:05 +0100
|
||||
Subject: [PATCH 13/26] s390x: rename s390-virtio-hcall* to s390-hypercall*
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [13/26] 3c1ef3cbb137517b306871f0a88a61a59740af5a (thuth/qemu-kvm-cs)
|
||||
|
||||
Let's make it clearer that we are talking about general
|
||||
QEMU/KVM-specific hypercalls.
|
||||
|
||||
Message-ID: <20241219144115.2820241-5-david@redhat.com>
|
||||
Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Reviewed-by: Thomas Huth <thuth@redhat.com>
|
||||
Signed-off-by: David Hildenbrand <david@redhat.com>
|
||||
(cherry picked from commit 85489fc3652d0c4433c940f1a80a952e8cb5d3cb)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
hw/s390x/meson.build | 2 +-
|
||||
hw/s390x/{s390-virtio-hcall.c => s390-hypercall.c} | 2 +-
|
||||
hw/s390x/{s390-virtio-hcall.h => s390-hypercall.h} | 6 +++---
|
||||
target/s390x/kvm/kvm.c | 2 +-
|
||||
target/s390x/tcg/misc_helper.c | 2 +-
|
||||
5 files changed, 7 insertions(+), 7 deletions(-)
|
||||
rename hw/s390x/{s390-virtio-hcall.c => s390-hypercall.c} (97%)
|
||||
rename hw/s390x/{s390-virtio-hcall.h => s390-hypercall.h} (86%)
|
||||
|
||||
diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build
|
||||
index d6c8c33915..e344a3bd8c 100644
|
||||
--- a/hw/s390x/meson.build
|
||||
+++ b/hw/s390x/meson.build
|
||||
@@ -29,7 +29,7 @@ s390x_ss.add(when: 'CONFIG_TCG', if_true: files(
|
||||
))
|
||||
s390x_ss.add(when: 'CONFIG_S390_CCW_VIRTIO', if_true: files(
|
||||
's390-virtio-ccw.c',
|
||||
- 's390-virtio-hcall.c',
|
||||
+ 's390-hypercall.c',
|
||||
))
|
||||
s390x_ss.add(when: 'CONFIG_TERMINAL3270', if_true: files('3270-ccw.c'))
|
||||
s390x_ss.add(when: 'CONFIG_VFIO', if_true: files('s390-pci-vfio.c'))
|
||||
diff --git a/hw/s390x/s390-virtio-hcall.c b/hw/s390x/s390-hypercall.c
|
||||
similarity index 97%
|
||||
rename from hw/s390x/s390-virtio-hcall.c
|
||||
rename to hw/s390x/s390-hypercall.c
|
||||
index 5fb78a719e..f816c2b1ef 100644
|
||||
--- a/hw/s390x/s390-virtio-hcall.c
|
||||
+++ b/hw/s390x/s390-hypercall.c
|
||||
@@ -12,7 +12,7 @@
|
||||
#include "qemu/osdep.h"
|
||||
#include "cpu.h"
|
||||
#include "hw/boards.h"
|
||||
-#include "hw/s390x/s390-virtio-hcall.h"
|
||||
+#include "hw/s390x/s390-hypercall.h"
|
||||
#include "hw/s390x/ioinst.h"
|
||||
#include "hw/s390x/css.h"
|
||||
#include "virtio-ccw.h"
|
||||
diff --git a/hw/s390x/s390-virtio-hcall.h b/hw/s390x/s390-hypercall.h
|
||||
similarity index 86%
|
||||
rename from hw/s390x/s390-virtio-hcall.h
|
||||
rename to hw/s390x/s390-hypercall.h
|
||||
index dca456b926..2fa81dbfdd 100644
|
||||
--- a/hw/s390x/s390-virtio-hcall.h
|
||||
+++ b/hw/s390x/s390-hypercall.h
|
||||
@@ -9,8 +9,8 @@
|
||||
* directory.
|
||||
*/
|
||||
|
||||
-#ifndef HW_S390_VIRTIO_HCALL_H
|
||||
-#define HW_S390_VIRTIO_HCALL_H
|
||||
+#ifndef HW_S390_HYPERCALL_H
|
||||
+#define HW_S390_HYPERCALL_H
|
||||
|
||||
#include "cpu.h"
|
||||
|
||||
@@ -21,4 +21,4 @@
|
||||
|
||||
void handle_diag_500(S390CPU *cpu, uintptr_t ra);
|
||||
|
||||
-#endif /* HW_S390_VIRTIO_HCALL_H */
|
||||
+#endif /* HW_S390_HYPERCALL_H */
|
||||
diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c
|
||||
index 42d6a54126..afc8d570c9 100644
|
||||
--- a/target/s390x/kvm/kvm.c
|
||||
+++ b/target/s390x/kvm/kvm.c
|
||||
@@ -49,7 +49,7 @@
|
||||
#include "hw/s390x/ebcdic.h"
|
||||
#include "exec/memattrs.h"
|
||||
#include "hw/s390x/s390-virtio-ccw.h"
|
||||
-#include "hw/s390x/s390-virtio-hcall.h"
|
||||
+#include "hw/s390x/s390-hypercall.h"
|
||||
#include "target/s390x/kvm/pv.h"
|
||||
#include CONFIG_DEVICES
|
||||
|
||||
diff --git a/target/s390x/tcg/misc_helper.c b/target/s390x/tcg/misc_helper.c
|
||||
index 2b4310003b..b726a95352 100644
|
||||
--- a/target/s390x/tcg/misc_helper.c
|
||||
+++ b/target/s390x/tcg/misc_helper.c
|
||||
@@ -36,7 +36,7 @@
|
||||
#include "sysemu/cpus.h"
|
||||
#include "sysemu/sysemu.h"
|
||||
#include "hw/s390x/ebcdic.h"
|
||||
-#include "hw/s390x/s390-virtio-hcall.h"
|
||||
+#include "hw/s390x/s390-hypercall.h"
|
||||
#include "hw/s390x/sclp.h"
|
||||
#include "hw/s390x/s390_flic.h"
|
||||
#include "hw/s390x/ioinst.h"
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,99 @@
|
||||
From 86417a068f24964422d4fd5ea301d70a0f8142d2 Mon Sep 17 00:00:00 2001
|
||||
From: David Hildenbrand <david@redhat.com>
|
||||
Date: Thu, 19 Dec 2024 15:41:08 +0100
|
||||
Subject: [PATCH 16/26] s390x/s390-hypercall: introduce DIAG500 STORAGE_LIMIT
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [16/26] c1c341227388735450ddbba0201e7523e0658c07 (thuth/qemu-kvm-cs)
|
||||
|
||||
A guest OS that supports memory hotplug / memory devices must during
|
||||
boot be aware of the maximum possible physical memory address that it might
|
||||
have to handle at a later stage during its runtime.
|
||||
|
||||
For example, the maximum possible memory address might be required to
|
||||
prepare the kernel virtual address space accordingly (e.g., select page
|
||||
table hierarchy depth).
|
||||
|
||||
On s390x there is currently no such mechanism that is compatible with
|
||||
paravirtualized memory devices, because the whole SCLP interface was
|
||||
designed around the idea of "storage increments" and "standby memory".
|
||||
Paravirtualized memory devices we want to support, such as virtio-mem, have
|
||||
no intersection with any of that, but could co-exist with them in the
|
||||
future if ever needed.
|
||||
|
||||
In particular, a guest OS must never detect and use device memory
|
||||
without the help of a proper device driver. Device memory must not be
|
||||
exposed in any firmware-provided memory map (SCLP or diag260 on s390x).
|
||||
For this reason, these memory devices will be placed in memory *above*
|
||||
the "maximum storage increment" exposed via SCLP.
|
||||
|
||||
Let's provide a new diag500 subcode to query the memory limit determined in
|
||||
s390_memory_init().
|
||||
|
||||
Message-ID: <20241219144115.2820241-8-david@redhat.com>
|
||||
Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Reviewed-by: Thomas Huth <thuth@redhat.com>
|
||||
Signed-off-by: David Hildenbrand <david@redhat.com>
|
||||
(cherry picked from commit f7c168657816486527727d860b73747d41f0c5f6)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
hw/s390x/s390-hypercall.c | 12 +++++++++++-
|
||||
hw/s390x/s390-hypercall.h | 1 +
|
||||
2 files changed, 12 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/hw/s390x/s390-hypercall.c b/hw/s390x/s390-hypercall.c
|
||||
index f816c2b1ef..ac1b08b2cd 100644
|
||||
--- a/hw/s390x/s390-hypercall.c
|
||||
+++ b/hw/s390x/s390-hypercall.c
|
||||
@@ -11,7 +11,7 @@
|
||||
|
||||
#include "qemu/osdep.h"
|
||||
#include "cpu.h"
|
||||
-#include "hw/boards.h"
|
||||
+#include "hw/s390x/s390-virtio-ccw.h"
|
||||
#include "hw/s390x/s390-hypercall.h"
|
||||
#include "hw/s390x/ioinst.h"
|
||||
#include "hw/s390x/css.h"
|
||||
@@ -57,6 +57,13 @@ static int handle_virtio_ccw_notify(uint64_t subch_id, uint64_t data)
|
||||
return 0;
|
||||
}
|
||||
|
||||
+static uint64_t handle_storage_limit(void)
|
||||
+{
|
||||
+ S390CcwMachineState *s390ms = S390_CCW_MACHINE(qdev_get_machine());
|
||||
+
|
||||
+ return s390_get_memory_limit(s390ms) - 1;
|
||||
+}
|
||||
+
|
||||
void handle_diag_500(S390CPU *cpu, uintptr_t ra)
|
||||
{
|
||||
CPUS390XState *env = &cpu->env;
|
||||
@@ -69,6 +76,9 @@ void handle_diag_500(S390CPU *cpu, uintptr_t ra)
|
||||
case DIAG500_VIRTIO_CCW_NOTIFY:
|
||||
env->regs[2] = handle_virtio_ccw_notify(env->regs[2], env->regs[3]);
|
||||
break;
|
||||
+ case DIAG500_STORAGE_LIMIT:
|
||||
+ env->regs[2] = handle_storage_limit();
|
||||
+ break;
|
||||
default:
|
||||
s390_program_interrupt(env, PGM_SPECIFICATION, ra);
|
||||
}
|
||||
diff --git a/hw/s390x/s390-hypercall.h b/hw/s390x/s390-hypercall.h
|
||||
index 2fa81dbfdd..4f07209128 100644
|
||||
--- a/hw/s390x/s390-hypercall.h
|
||||
+++ b/hw/s390x/s390-hypercall.h
|
||||
@@ -18,6 +18,7 @@
|
||||
#define DIAG500_VIRTIO_RESET 1 /* legacy */
|
||||
#define DIAG500_VIRTIO_SET_STATUS 2 /* legacy */
|
||||
#define DIAG500_VIRTIO_CCW_NOTIFY 3 /* KVM_S390_VIRTIO_CCW_NOTIFY */
|
||||
+#define DIAG500_STORAGE_LIMIT 4
|
||||
|
||||
void handle_diag_500(S390CPU *cpu, uintptr_t ra);
|
||||
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,56 @@
|
||||
From 53d1b43699c6b30583f41a18a33c28893718aeac Mon Sep 17 00:00:00 2001
|
||||
From: David Hildenbrand <david@redhat.com>
|
||||
Date: Thu, 19 Dec 2024 15:41:10 +0100
|
||||
Subject: [PATCH 18/26] s390x/s390-skeys: prepare for memory devices
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [18/26] 47edda0eeb6d5932f81633f2d9d294b1ca5f413c (thuth/qemu-kvm-cs)
|
||||
|
||||
With memory devices, we will have storage keys for memory that
|
||||
exceeds the initial ram size.
|
||||
|
||||
The TODO already states that current handling is suboptimal,
|
||||
but we won't worry about improving that (TCG-only) thing for now.
|
||||
|
||||
Message-ID: <20241219144115.2820241-10-david@redhat.com>
|
||||
Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Reviewed-by: Thomas Huth <thuth@redhat.com>
|
||||
Signed-off-by: David Hildenbrand <david@redhat.com>
|
||||
(cherry picked from commit d1e3c2ac41b3f73708682e4e8212c32ad35013b9)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
hw/s390x/s390-skeys.c | 6 +++---
|
||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/hw/s390x/s390-skeys.c b/hw/s390x/s390-skeys.c
|
||||
index bf22d6863e..e4297b3b8a 100644
|
||||
--- a/hw/s390x/s390-skeys.c
|
||||
+++ b/hw/s390x/s390-skeys.c
|
||||
@@ -11,7 +11,7 @@
|
||||
|
||||
#include "qemu/osdep.h"
|
||||
#include "qemu/units.h"
|
||||
-#include "hw/boards.h"
|
||||
+#include "hw/s390x/s390-virtio-ccw.h"
|
||||
#include "hw/qdev-properties.h"
|
||||
#include "hw/s390x/storage-keys.h"
|
||||
#include "qapi/error.h"
|
||||
@@ -251,9 +251,9 @@ static bool qemu_s390_enable_skeys(S390SKeysState *ss)
|
||||
* g_once_init_enter() is good enough.
|
||||
*/
|
||||
if (g_once_init_enter(&initialized)) {
|
||||
- MachineState *machine = MACHINE(qdev_get_machine());
|
||||
+ S390CcwMachineState *s390ms = S390_CCW_MACHINE(qdev_get_machine());
|
||||
|
||||
- skeys->key_count = machine->ram_size / TARGET_PAGE_SIZE;
|
||||
+ skeys->key_count = s390_get_memory_limit(s390ms) / TARGET_PAGE_SIZE;
|
||||
skeys->keydata = g_malloc0(skeys->key_count);
|
||||
g_once_init_leave(&initialized, 1);
|
||||
}
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,155 @@
|
||||
From 1195c91d10892a888870248fd881612955b9e1eb Mon Sep 17 00:00:00 2001
|
||||
From: David Hildenbrand <david@redhat.com>
|
||||
Date: Thu, 19 Dec 2024 15:41:09 +0100
|
||||
Subject: [PATCH 17/26] s390x/s390-stattrib-kvm: prepare for memory devices and
|
||||
sparse memory layouts
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [17/26] 799aa7b2b9cc2a948e9f391bc0ecf739254c78b1 (thuth/qemu-kvm-cs)
|
||||
|
||||
With memory devices, we will have storage attributes for memory that
|
||||
exceeds the initial ram size. Further, we can easily have memory holes,
|
||||
for which there (currently) are no storage attributes.
|
||||
|
||||
In particular, with memory holes, KVM_S390_SET_CMMA_BITS will fail to set
|
||||
some storage attributes.
|
||||
|
||||
So let's do it like we handle storage keys migration, relying on
|
||||
guest_phys_blocks_append(). However, in contrast to storage key
|
||||
migration, we will handle it on the migration destination.
|
||||
|
||||
This is a preparation for virtio-mem support. Note that ever since the
|
||||
"early migration" feature was added (x-early-migration), the state
|
||||
of device blocks (plugged/unplugged) is migrated early such that
|
||||
guest_phys_blocks_append() will properly consider all currently plugged
|
||||
memory blocks and skip any unplugged ones.
|
||||
|
||||
In the future, we should try getting rid of the large temporary buffer
|
||||
and also not send any attributes for any memory holes, just so they
|
||||
get ignored on the destination.
|
||||
|
||||
Message-ID: <20241219144115.2820241-9-david@redhat.com>
|
||||
Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Reviewed-by: Thomas Huth <thuth@redhat.com>
|
||||
Signed-off-by: David Hildenbrand <david@redhat.com>
|
||||
(cherry picked from commit 241e6b2d27b090b17cda5b011b2064544b0c458b)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
hw/s390x/s390-stattrib-kvm.c | 67 +++++++++++++++++++++++-------------
|
||||
1 file changed, 43 insertions(+), 24 deletions(-)
|
||||
|
||||
diff --git a/hw/s390x/s390-stattrib-kvm.c b/hw/s390x/s390-stattrib-kvm.c
|
||||
index eeaa811098..33ec91422a 100644
|
||||
--- a/hw/s390x/s390-stattrib-kvm.c
|
||||
+++ b/hw/s390x/s390-stattrib-kvm.c
|
||||
@@ -10,11 +10,12 @@
|
||||
*/
|
||||
|
||||
#include "qemu/osdep.h"
|
||||
-#include "hw/boards.h"
|
||||
+#include "hw/s390x/s390-virtio-ccw.h"
|
||||
#include "migration/qemu-file.h"
|
||||
#include "hw/s390x/storage-attributes.h"
|
||||
#include "qemu/error-report.h"
|
||||
#include "sysemu/kvm.h"
|
||||
+#include "sysemu/memory_mapping.h"
|
||||
#include "exec/ram_addr.h"
|
||||
#include "kvm/kvm_s390x.h"
|
||||
#include "qapi/error.h"
|
||||
@@ -84,8 +85,8 @@ static int kvm_s390_stattrib_set_stattr(S390StAttribState *sa,
|
||||
uint8_t *values)
|
||||
{
|
||||
KVMS390StAttribState *sas = KVM_S390_STATTRIB(sa);
|
||||
- MachineState *machine = MACHINE(qdev_get_machine());
|
||||
- unsigned long max = machine->ram_size / TARGET_PAGE_SIZE;
|
||||
+ S390CcwMachineState *s390ms = S390_CCW_MACHINE(qdev_get_machine());
|
||||
+ unsigned long max = s390_get_memory_limit(s390ms) / TARGET_PAGE_SIZE;
|
||||
|
||||
if (start_gfn + count > max) {
|
||||
error_report("Out of memory bounds when setting storage attributes");
|
||||
@@ -103,39 +104,57 @@ static int kvm_s390_stattrib_set_stattr(S390StAttribState *sa,
|
||||
static void kvm_s390_stattrib_synchronize(S390StAttribState *sa)
|
||||
{
|
||||
KVMS390StAttribState *sas = KVM_S390_STATTRIB(sa);
|
||||
- MachineState *machine = MACHINE(qdev_get_machine());
|
||||
- unsigned long max = machine->ram_size / TARGET_PAGE_SIZE;
|
||||
- /* We do not need to reach the maximum buffer size allowed */
|
||||
- unsigned long cx, len = KVM_S390_SKEYS_MAX / 2;
|
||||
+ S390CcwMachineState *s390ms = S390_CCW_MACHINE(qdev_get_machine());
|
||||
+ unsigned long max = s390_get_memory_limit(s390ms) / TARGET_PAGE_SIZE;
|
||||
+ unsigned long start_gfn, end_gfn, pages;
|
||||
+ GuestPhysBlockList guest_phys_blocks;
|
||||
+ GuestPhysBlock *block;
|
||||
int r;
|
||||
struct kvm_s390_cmma_log clog = {
|
||||
.flags = 0,
|
||||
.mask = ~0ULL,
|
||||
};
|
||||
|
||||
- if (sas->incoming_buffer) {
|
||||
- for (cx = 0; cx + len <= max; cx += len) {
|
||||
- clog.start_gfn = cx;
|
||||
- clog.count = len;
|
||||
- clog.values = (uint64_t)(sas->incoming_buffer + cx);
|
||||
- r = kvm_vm_ioctl(kvm_state, KVM_S390_SET_CMMA_BITS, &clog);
|
||||
- if (r) {
|
||||
- error_report("KVM_S390_SET_CMMA_BITS failed: %s", strerror(-r));
|
||||
- return;
|
||||
- }
|
||||
- }
|
||||
- if (cx < max) {
|
||||
- clog.start_gfn = cx;
|
||||
- clog.count = max - cx;
|
||||
- clog.values = (uint64_t)(sas->incoming_buffer + cx);
|
||||
+ if (!sas->incoming_buffer) {
|
||||
+ return;
|
||||
+ }
|
||||
+ guest_phys_blocks_init(&guest_phys_blocks);
|
||||
+ guest_phys_blocks_append(&guest_phys_blocks);
|
||||
+
|
||||
+ QTAILQ_FOREACH(block, &guest_phys_blocks.head, next) {
|
||||
+ assert(QEMU_IS_ALIGNED(block->target_start, TARGET_PAGE_SIZE));
|
||||
+ assert(QEMU_IS_ALIGNED(block->target_end, TARGET_PAGE_SIZE));
|
||||
+
|
||||
+ start_gfn = block->target_start / TARGET_PAGE_SIZE;
|
||||
+ end_gfn = block->target_end / TARGET_PAGE_SIZE;
|
||||
+
|
||||
+ while (start_gfn < end_gfn) {
|
||||
+ /* Don't exceed the maximum buffer size. */
|
||||
+ pages = MIN(end_gfn - start_gfn, KVM_S390_SKEYS_MAX / 2);
|
||||
+
|
||||
+ /*
|
||||
+ * If we ever get guest physical memory beyond the configured
|
||||
+ * memory limit, something went very wrong.
|
||||
+ */
|
||||
+ assert(start_gfn + pages <= max);
|
||||
+
|
||||
+ clog.start_gfn = start_gfn;
|
||||
+ clog.count = pages;
|
||||
+ clog.values = (uint64_t)(sas->incoming_buffer + start_gfn);
|
||||
r = kvm_vm_ioctl(kvm_state, KVM_S390_SET_CMMA_BITS, &clog);
|
||||
if (r) {
|
||||
error_report("KVM_S390_SET_CMMA_BITS failed: %s", strerror(-r));
|
||||
+ goto out;
|
||||
}
|
||||
+
|
||||
+ start_gfn += pages;
|
||||
}
|
||||
- g_free(sas->incoming_buffer);
|
||||
- sas->incoming_buffer = NULL;
|
||||
}
|
||||
+
|
||||
+out:
|
||||
+ guest_phys_blocks_free(&guest_phys_blocks);
|
||||
+ g_free(sas->incoming_buffer);
|
||||
+ sas->incoming_buffer = NULL;
|
||||
}
|
||||
|
||||
static int kvm_s390_stattrib_set_migrationmode(S390StAttribState *sa, bool val,
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,63 @@
|
||||
From 4ee3076ac566622929f9410636483c4f0b2da967 Mon Sep 17 00:00:00 2001
|
||||
From: David Hildenbrand <david@redhat.com>
|
||||
Date: Thu, 19 Dec 2024 15:41:02 +0100
|
||||
Subject: [PATCH 10/26] s390x/s390-virtio-ccw: don't crash on weird RAM sizes
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [10/26] 55738da52f3cf4746bee2b17780a10720fa05863 (thuth/qemu-kvm-cs)
|
||||
|
||||
KVM is not happy when starting a VM with weird RAM sizes:
|
||||
|
||||
# qemu-system-s390x --enable-kvm --nographic -m 1234K
|
||||
qemu-system-s390x: kvm_set_user_memory_region: KVM_SET_USER_MEMORY_REGION
|
||||
failed, slot=0, start=0x0, size=0x244000: Invalid argument
|
||||
kvm_set_phys_mem: error registering slot: Invalid argument
|
||||
Aborted (core dumped)
|
||||
|
||||
Let's handle that in a better way by rejecting such weird RAM sizes
|
||||
right from the start:
|
||||
|
||||
# qemu-system-s390x --enable-kvm --nographic -m 1234K
|
||||
qemu-system-s390x: ram size must be multiples of 1 MiB
|
||||
|
||||
Message-ID: <20241219144115.2820241-2-david@redhat.com>
|
||||
Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Reviewed-by: Eric Farman <farman@linux.ibm.com>
|
||||
Reviewed-by: Thomas Huth <thuth@redhat.com>
|
||||
Acked-by: Janosch Frank <frankja@linux.ibm.com>
|
||||
Signed-off-by: David Hildenbrand <david@redhat.com>
|
||||
(cherry picked from commit 14e568ab4836347481af2e334009c385f456a734)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
hw/s390x/s390-virtio-ccw.c | 11 +++++++++++
|
||||
1 file changed, 11 insertions(+)
|
||||
|
||||
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
|
||||
index 94cad1705b..82ded9666c 100644
|
||||
--- a/hw/s390x/s390-virtio-ccw.c
|
||||
+++ b/hw/s390x/s390-virtio-ccw.c
|
||||
@@ -180,6 +180,17 @@ static void s390_memory_init(MemoryRegion *ram)
|
||||
{
|
||||
MemoryRegion *sysmem = get_system_memory();
|
||||
|
||||
+ if (!QEMU_IS_ALIGNED(memory_region_size(ram), 1 * MiB)) {
|
||||
+ /*
|
||||
+ * SCLP cannot possibly expose smaller granularity right now and KVM
|
||||
+ * cannot handle smaller granularity. As we don't support NUMA, the
|
||||
+ * region size directly corresponds to machine->ram_size, and the region
|
||||
+ * is a single RAM memory region.
|
||||
+ */
|
||||
+ error_report("ram size must be multiples of 1 MiB");
|
||||
+ exit(EXIT_FAILURE);
|
||||
+ }
|
||||
+
|
||||
/* allocate RAM for core */
|
||||
memory_region_add_subregion(sysmem, 0, ram);
|
||||
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,140 @@
|
||||
From 9ec2d356210f1e66f50519cc4d58633a13db9004 Mon Sep 17 00:00:00 2001
|
||||
From: David Hildenbrand <david@redhat.com>
|
||||
Date: Thu, 19 Dec 2024 15:41:06 +0100
|
||||
Subject: [PATCH 14/26] s390x/s390-virtio-ccw: move setting the maximum guest
|
||||
size from sclp to machine code
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [14/26] a5970c1c6d8d09a473a25a7eee533ec3a6711ec8 (thuth/qemu-kvm-cs)
|
||||
|
||||
Nowadays, it feels more natural to have that code located in
|
||||
s390_memory_init(), where we also have direct access to the machine
|
||||
object.
|
||||
|
||||
While at it, use the actual RAM size, not the maximum RAM size which
|
||||
cannot currently be reached without support for any memory devices.
|
||||
Consequently update s390_pv_vm_try_disable_async() to rely on the RAM size
|
||||
as well, to avoid temporary issues while we further rework that
|
||||
handling.
|
||||
|
||||
set_memory_limit() is temporary, we'll merge it with
|
||||
s390_set_memory_limit() next.
|
||||
|
||||
Message-ID: <20241219144115.2820241-6-david@redhat.com>
|
||||
Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Reviewed-by: Thomas Huth <thuth@redhat.com>
|
||||
Signed-off-by: David Hildenbrand <david@redhat.com>
|
||||
(cherry picked from commit 3c6fb557d295949bea291c3bf88ee9c83392e78c)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
hw/s390x/s390-virtio-ccw.c | 28 ++++++++++++++++++++++++----
|
||||
hw/s390x/sclp.c | 11 -----------
|
||||
target/s390x/kvm/pv.c | 2 +-
|
||||
3 files changed, 25 insertions(+), 16 deletions(-)
|
||||
|
||||
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
|
||||
index d47e99028e..248ac28d20 100644
|
||||
--- a/hw/s390x/s390-virtio-ccw.c
|
||||
+++ b/hw/s390x/s390-virtio-ccw.c
|
||||
@@ -121,11 +121,29 @@ static void subsystem_reset(void)
|
||||
}
|
||||
}
|
||||
|
||||
-static void s390_memory_init(MemoryRegion *ram)
|
||||
+static void set_memory_limit(uint64_t new_limit)
|
||||
+{
|
||||
+ uint64_t hw_limit;
|
||||
+ int ret;
|
||||
+
|
||||
+ ret = s390_set_memory_limit(new_limit, &hw_limit);
|
||||
+ if (ret == -E2BIG) {
|
||||
+ error_report("host supports a maximum of %" PRIu64 " GB",
|
||||
+ hw_limit / GiB);
|
||||
+ exit(EXIT_FAILURE);
|
||||
+ } else if (ret) {
|
||||
+ error_report("setting the guest size failed");
|
||||
+ exit(EXIT_FAILURE);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static void s390_memory_init(MachineState *machine)
|
||||
{
|
||||
MemoryRegion *sysmem = get_system_memory();
|
||||
+ MemoryRegion *ram = machine->ram;
|
||||
+ uint64_t ram_size = memory_region_size(ram);
|
||||
|
||||
- if (!QEMU_IS_ALIGNED(memory_region_size(ram), 1 * MiB)) {
|
||||
+ if (!QEMU_IS_ALIGNED(ram_size, 1 * MiB)) {
|
||||
/*
|
||||
* SCLP cannot possibly expose smaller granularity right now and KVM
|
||||
* cannot handle smaller granularity. As we don't support NUMA, the
|
||||
@@ -136,7 +154,9 @@ static void s390_memory_init(MemoryRegion *ram)
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
- /* allocate RAM for core */
|
||||
+ set_memory_limit(ram_size);
|
||||
+
|
||||
+ /* Map the initial memory. Must happen after setting the memory limit. */
|
||||
memory_region_add_subregion(sysmem, 0, ram);
|
||||
|
||||
/*
|
||||
@@ -211,7 +231,7 @@ static void ccw_init(MachineState *machine)
|
||||
qdev_realize_and_unref(DEVICE(ms->sclp), NULL, &error_fatal);
|
||||
|
||||
/* init memory + setup max page size. Required for the CPU model */
|
||||
- s390_memory_init(machine->ram);
|
||||
+ s390_memory_init(machine);
|
||||
|
||||
/* init CPUs (incl. CPU model) early so s390_has_feature() works */
|
||||
s390_init_cpus(machine);
|
||||
diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c
|
||||
index 8757626b5c..73e88ab4eb 100644
|
||||
--- a/hw/s390x/sclp.c
|
||||
+++ b/hw/s390x/sclp.c
|
||||
@@ -376,10 +376,7 @@ void sclp_service_interrupt(uint32_t sccb)
|
||||
/* qemu object creation and initialization functions */
|
||||
static void sclp_realize(DeviceState *dev, Error **errp)
|
||||
{
|
||||
- MachineState *machine = MACHINE(qdev_get_machine());
|
||||
SCLPDevice *sclp = SCLP(dev);
|
||||
- uint64_t hw_limit;
|
||||
- int ret;
|
||||
|
||||
/*
|
||||
* qdev_device_add searches the sysbus for TYPE_SCLP_EVENTS_BUS. As long
|
||||
@@ -389,14 +386,6 @@ static void sclp_realize(DeviceState *dev, Error **errp)
|
||||
if (!sysbus_realize(SYS_BUS_DEVICE(sclp->event_facility), errp)) {
|
||||
return;
|
||||
}
|
||||
-
|
||||
- ret = s390_set_memory_limit(machine->maxram_size, &hw_limit);
|
||||
- if (ret == -E2BIG) {
|
||||
- error_setg(errp, "host supports a maximum of %" PRIu64 " GB",
|
||||
- hw_limit / GiB);
|
||||
- } else if (ret) {
|
||||
- error_setg(errp, "setting the guest size failed");
|
||||
- }
|
||||
}
|
||||
|
||||
static void sclp_memory_init(SCLPDevice *sclp)
|
||||
diff --git a/target/s390x/kvm/pv.c b/target/s390x/kvm/pv.c
|
||||
index dde836d21a..424cce75ca 100644
|
||||
--- a/target/s390x/kvm/pv.c
|
||||
+++ b/target/s390x/kvm/pv.c
|
||||
@@ -133,7 +133,7 @@ bool s390_pv_vm_try_disable_async(S390CcwMachineState *ms)
|
||||
* If the feature is not present or if the VM is not larger than 2 GiB,
|
||||
* KVM_PV_ASYNC_CLEANUP_PREPARE fill fail; no point in attempting it.
|
||||
*/
|
||||
- if ((MACHINE(ms)->maxram_size <= 2 * GiB) ||
|
||||
+ if ((MACHINE(ms)->ram_size <= 2 * GiB) ||
|
||||
!kvm_check_extension(kvm_state, KVM_CAP_S390_PROTECTED_ASYNC_DISABLE)) {
|
||||
return false;
|
||||
}
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,117 @@
|
||||
From 0e7d7bf86fb242c1ea90bf9648fb061626790eda Mon Sep 17 00:00:00 2001
|
||||
From: David Hildenbrand <david@redhat.com>
|
||||
Date: Thu, 19 Dec 2024 15:41:11 +0100
|
||||
Subject: [PATCH 19/26] s390x/s390-virtio-ccw: prepare for memory devices
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [19/26] 2441c8c5f5a06d5ca93188dd44e8a08f06d1722b (thuth/qemu-kvm-cs)
|
||||
|
||||
Let's prepare our address space for memory devices if enabled via
|
||||
"maxmem" and if we have CONFIG_MEM_DEVICE enabled at all. Note that
|
||||
CONFIG_MEM_DEVICE will be selected automatically once we add support
|
||||
for devices.
|
||||
|
||||
Just like on other architectures, the region container for memory devices
|
||||
is placed directly above our initial memory. For now, we only align the
|
||||
start address of the region up to 1 GiB, but we won't add any additional
|
||||
space to the region for internal alignment purposes; this can be done in
|
||||
the future if really required.
|
||||
|
||||
The RAM size returned via SCLP is not modified, as this only
|
||||
covers initial RAM (and standby memory we don't implement) and not memory
|
||||
devices; clarify that in the docs of read_SCP_info(). Existing OSes without
|
||||
support for memory devices will keep working as is, even when memory
|
||||
devices would be attached to the VM.
|
||||
|
||||
Guest OSes which support memory devices, such as virtio-mem, will
|
||||
consult diag500(), to find out the maximum possible pfn. Guest OSes that
|
||||
don't support memory devices, don't have to be changed and will continue
|
||||
relying on information provided by SCLP.
|
||||
|
||||
There are no remaining maxram_size users in s390x code, and the remaining
|
||||
ram_size users only care about initial RAM:
|
||||
* hw/s390x/ipl.c
|
||||
* hw/s390x/s390-hypercall.c
|
||||
* hw/s390x/sclp.c
|
||||
* target/s390x/kvm/pv.c
|
||||
|
||||
Message-ID: <20241219144115.2820241-11-david@redhat.com>
|
||||
Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Reviewed-by: Thomas Huth <thuth@redhat.com>
|
||||
Signed-off-by: David Hildenbrand <david@redhat.com>
|
||||
(cherry picked from commit 1e86400298cf0fed5f7d49427db477775b859093)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
hw/s390x/s390-virtio-ccw.c | 23 ++++++++++++++++++++++-
|
||||
hw/s390x/sclp.c | 6 +++++-
|
||||
2 files changed, 27 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
|
||||
index f5f147eb92..824c73536a 100644
|
||||
--- a/hw/s390x/s390-virtio-ccw.c
|
||||
+++ b/hw/s390x/s390-virtio-ccw.c
|
||||
@@ -149,6 +149,7 @@ static void s390_memory_init(MachineState *machine)
|
||||
MemoryRegion *sysmem = get_system_memory();
|
||||
MemoryRegion *ram = machine->ram;
|
||||
uint64_t ram_size = memory_region_size(ram);
|
||||
+ uint64_t devmem_base, devmem_size;
|
||||
|
||||
if (!QEMU_IS_ALIGNED(ram_size, 1 * MiB)) {
|
||||
/*
|
||||
@@ -161,11 +162,31 @@ static void s390_memory_init(MachineState *machine)
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
- s390_set_memory_limit(s390ms, ram_size);
|
||||
+ devmem_size = 0;
|
||||
+ devmem_base = ram_size;
|
||||
+#ifdef CONFIG_MEM_DEVICE
|
||||
+ if (machine->ram_size < machine->maxram_size) {
|
||||
+
|
||||
+ /*
|
||||
+ * Make sure memory devices have a sane default alignment, even
|
||||
+ * when weird initial memory sizes are specified.
|
||||
+ */
|
||||
+ devmem_base = QEMU_ALIGN_UP(devmem_base, 1 * GiB);
|
||||
+ devmem_size = machine->maxram_size - machine->ram_size;
|
||||
+ }
|
||||
+#endif
|
||||
+ s390_set_memory_limit(s390ms, devmem_base + devmem_size);
|
||||
|
||||
/* Map the initial memory. Must happen after setting the memory limit. */
|
||||
memory_region_add_subregion(sysmem, 0, ram);
|
||||
|
||||
+ /* Initialize address space for memory devices. */
|
||||
+#ifdef CONFIG_MEM_DEVICE
|
||||
+ if (devmem_size) {
|
||||
+ machine_memory_devices_init(machine, devmem_base, devmem_size);
|
||||
+ }
|
||||
+#endif /* CONFIG_MEM_DEVICE */
|
||||
+
|
||||
/*
|
||||
* Configure the maximum page size. As no memory devices were created
|
||||
* yet, this is the page size of initial memory only.
|
||||
diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c
|
||||
index 73e88ab4eb..5945c9b1d8 100644
|
||||
--- a/hw/s390x/sclp.c
|
||||
+++ b/hw/s390x/sclp.c
|
||||
@@ -161,7 +161,11 @@ static void read_SCP_info(SCLPDevice *sclp, SCCB *sccb)
|
||||
read_info->rnsize2 = cpu_to_be32(rnsize);
|
||||
}
|
||||
|
||||
- /* we don't support standby memory, maxram_size is never exposed */
|
||||
+ /*
|
||||
+ * We don't support standby memory. maxram_size is used for sizing the
|
||||
+ * memory device region, which is not exposed through SCLP but through
|
||||
+ * diag500.
|
||||
+ */
|
||||
rnmax = machine->ram_size >> sclp->increment_size;
|
||||
if (rnmax < 0x10000) {
|
||||
read_info->rnmax = cpu_to_be16(rnmax);
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,163 @@
|
||||
From d2764db41fc6edcead9ad27b8d31e7bff524c0c0 Mon Sep 17 00:00:00 2001
|
||||
From: David Hildenbrand <david@redhat.com>
|
||||
Date: Thu, 19 Dec 2024 15:41:04 +0100
|
||||
Subject: [PATCH 12/26] s390x/s390-virtio-hcall: prepare for more diag500
|
||||
hypercalls
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [12/26] 6573602d71b9e70679a48315f913309be29d6239 (thuth/qemu-kvm-cs)
|
||||
|
||||
Let's generalize, abstracting the virtio bits. diag500 is now a generic
|
||||
hypercall to handle QEMU/KVM specific things. Explicitly specify all
|
||||
already defined subcodes, including legacy ones (so we know what we can
|
||||
use for new hypercalls).
|
||||
|
||||
Move the PGM_SPECIFICATION injection into the renamed function
|
||||
handle_diag_500(), so we can turn it into a void function.
|
||||
|
||||
We'll rename the files separately, so git properly detects the rename.
|
||||
|
||||
Message-ID: <20241219144115.2820241-4-david@redhat.com>
|
||||
Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Reviewed-by: Thomas Huth <thuth@redhat.com>
|
||||
Signed-off-by: David Hildenbrand <david@redhat.com>
|
||||
(cherry picked from commit 6e9cc2da4e8b997fd6ff3249034f436b84fc7974)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
hw/s390x/s390-virtio-hcall.c | 15 ++++++++-------
|
||||
hw/s390x/s390-virtio-hcall.h | 11 ++++++-----
|
||||
target/s390x/kvm/kvm.c | 20 +++-----------------
|
||||
target/s390x/tcg/misc_helper.c | 5 +++--
|
||||
4 files changed, 20 insertions(+), 31 deletions(-)
|
||||
|
||||
diff --git a/hw/s390x/s390-virtio-hcall.c b/hw/s390x/s390-virtio-hcall.c
|
||||
index ca49e3cd22..5fb78a719e 100644
|
||||
--- a/hw/s390x/s390-virtio-hcall.c
|
||||
+++ b/hw/s390x/s390-virtio-hcall.c
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
- * Support for virtio hypercalls on s390
|
||||
+ * Support for QEMU/KVM hypercalls on s390
|
||||
*
|
||||
* Copyright 2012 IBM Corp.
|
||||
* Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
|
||||
@@ -57,18 +57,19 @@ static int handle_virtio_ccw_notify(uint64_t subch_id, uint64_t data)
|
||||
return 0;
|
||||
}
|
||||
|
||||
-int s390_virtio_hypercall(CPUS390XState *env)
|
||||
+void handle_diag_500(S390CPU *cpu, uintptr_t ra)
|
||||
{
|
||||
+ CPUS390XState *env = &cpu->env;
|
||||
const uint64_t subcode = env->regs[1];
|
||||
|
||||
switch (subcode) {
|
||||
- case KVM_S390_VIRTIO_NOTIFY:
|
||||
+ case DIAG500_VIRTIO_NOTIFY:
|
||||
env->regs[2] = handle_virtio_notify(env->regs[2]);
|
||||
- return 0;
|
||||
- case KVM_S390_VIRTIO_CCW_NOTIFY:
|
||||
+ break;
|
||||
+ case DIAG500_VIRTIO_CCW_NOTIFY:
|
||||
env->regs[2] = handle_virtio_ccw_notify(env->regs[2], env->regs[3]);
|
||||
- return 0;
|
||||
+ break;
|
||||
default:
|
||||
- return -EINVAL;
|
||||
+ s390_program_interrupt(env, PGM_SPECIFICATION, ra);
|
||||
}
|
||||
}
|
||||
diff --git a/hw/s390x/s390-virtio-hcall.h b/hw/s390x/s390-virtio-hcall.h
|
||||
index 3d9fe147d2..dca456b926 100644
|
||||
--- a/hw/s390x/s390-virtio-hcall.h
|
||||
+++ b/hw/s390x/s390-virtio-hcall.h
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
- * Support for virtio hypercalls on s390x
|
||||
+ * Support for QEMU/KVM hypercalls on s390x
|
||||
*
|
||||
* Copyright IBM Corp. 2012, 2017
|
||||
* Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
|
||||
@@ -12,12 +12,13 @@
|
||||
#ifndef HW_S390_VIRTIO_HCALL_H
|
||||
#define HW_S390_VIRTIO_HCALL_H
|
||||
|
||||
-#include "standard-headers/asm-s390/virtio-ccw.h"
|
||||
#include "cpu.h"
|
||||
|
||||
-/* The only thing that we need from the old kvm_virtio.h file */
|
||||
-#define KVM_S390_VIRTIO_NOTIFY 0
|
||||
+#define DIAG500_VIRTIO_NOTIFY 0 /* legacy, implemented as a NOP */
|
||||
+#define DIAG500_VIRTIO_RESET 1 /* legacy */
|
||||
+#define DIAG500_VIRTIO_SET_STATUS 2 /* legacy */
|
||||
+#define DIAG500_VIRTIO_CCW_NOTIFY 3 /* KVM_S390_VIRTIO_CCW_NOTIFY */
|
||||
|
||||
-int s390_virtio_hypercall(CPUS390XState *env);
|
||||
+void handle_diag_500(S390CPU *cpu, uintptr_t ra);
|
||||
|
||||
#endif /* HW_S390_VIRTIO_HCALL_H */
|
||||
diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c
|
||||
index 5947dda829..42d6a54126 100644
|
||||
--- a/target/s390x/kvm/kvm.c
|
||||
+++ b/target/s390x/kvm/kvm.c
|
||||
@@ -1492,22 +1492,6 @@ static int handle_e3(S390CPU *cpu, struct kvm_run *run, uint8_t ipbl)
|
||||
return r;
|
||||
}
|
||||
|
||||
-static int handle_hypercall(S390CPU *cpu, struct kvm_run *run)
|
||||
-{
|
||||
- CPUS390XState *env = &cpu->env;
|
||||
- int ret = -EINVAL;
|
||||
-
|
||||
-#ifdef CONFIG_S390_CCW_VIRTIO
|
||||
- ret = s390_virtio_hypercall(env);
|
||||
-#endif /* CONFIG_S390_CCW_VIRTIO */
|
||||
- if (ret == -EINVAL) {
|
||||
- kvm_s390_program_interrupt(cpu, PGM_SPECIFICATION);
|
||||
- return 0;
|
||||
- }
|
||||
-
|
||||
- return ret;
|
||||
-}
|
||||
-
|
||||
static void kvm_handle_diag_288(S390CPU *cpu, struct kvm_run *run)
|
||||
{
|
||||
uint64_t r1, r3;
|
||||
@@ -1603,9 +1587,11 @@ static int handle_diag(S390CPU *cpu, struct kvm_run *run, uint32_t ipb)
|
||||
case DIAG_SET_CONTROL_PROGRAM_CODES:
|
||||
handle_diag_318(cpu, run);
|
||||
break;
|
||||
+#ifdef CONFIG_S390_CCW_VIRTIO
|
||||
case DIAG_KVM_HYPERCALL:
|
||||
- r = handle_hypercall(cpu, run);
|
||||
+ handle_diag_500(cpu, RA_IGNORED);
|
||||
break;
|
||||
+#endif /* CONFIG_S390_CCW_VIRTIO */
|
||||
case DIAG_KVM_BREAKPOINT:
|
||||
r = handle_sw_breakpoint(cpu, run);
|
||||
break;
|
||||
diff --git a/target/s390x/tcg/misc_helper.c b/target/s390x/tcg/misc_helper.c
|
||||
index f44136a568..2b4310003b 100644
|
||||
--- a/target/s390x/tcg/misc_helper.c
|
||||
+++ b/target/s390x/tcg/misc_helper.c
|
||||
@@ -119,10 +119,11 @@ void HELPER(diag)(CPUS390XState *env, uint32_t r1, uint32_t r3, uint32_t num)
|
||||
switch (num) {
|
||||
#ifdef CONFIG_S390_CCW_VIRTIO
|
||||
case 0x500:
|
||||
- /* KVM hypercall */
|
||||
+ /* QEMU/KVM hypercall */
|
||||
bql_lock();
|
||||
- r = s390_virtio_hypercall(env);
|
||||
+ handle_diag_500(env_archcpu(env), GETPC());
|
||||
bql_unlock();
|
||||
+ r = 0;
|
||||
break;
|
||||
#endif /* CONFIG_S390_CCW_VIRTIO */
|
||||
case 0x44:
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,296 @@
|
||||
From 16ccb16d393a3e63936dc993c30c67fdecb1f120 Mon Sep 17 00:00:00 2001
|
||||
From: David Hildenbrand <david@redhat.com>
|
||||
Date: Thu, 19 Dec 2024 15:41:03 +0100
|
||||
Subject: [PATCH 11/26] s390x/s390-virtio-hcall: remove hypercall registration
|
||||
mechanism
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [11/26] 5e8d2720fe9fd6e6e24487d71988821f1cf27f17 (thuth/qemu-kvm-cs)
|
||||
|
||||
Nowadays, we only have a single machine type in QEMU, everything is based
|
||||
on virtio-ccw and the traditional virtio machine no longer exists. No
|
||||
need to dynamically register diag500 handlers. Move the two existing
|
||||
handlers into s390-virtio-hcall.c.
|
||||
|
||||
Message-ID: <20241219144115.2820241-3-david@redhat.com>
|
||||
Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Reviewed-by: Thomas Huth <thuth@redhat.com>
|
||||
Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
|
||||
Signed-off-by: David Hildenbrand <david@redhat.com>
|
||||
(cherry picked from commit 4be0fce498d0a08f18b3a9accdb9ded79484d30a)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
hw/s390x/meson.build | 6 ++--
|
||||
hw/s390x/s390-virtio-ccw.c | 58 ------------------------------
|
||||
hw/s390x/s390-virtio-hcall.c | 65 +++++++++++++++++++++++++---------
|
||||
hw/s390x/s390-virtio-hcall.h | 2 --
|
||||
target/s390x/kvm/kvm.c | 5 ++-
|
||||
target/s390x/tcg/misc_helper.c | 3 ++
|
||||
6 files changed, 60 insertions(+), 79 deletions(-)
|
||||
|
||||
diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build
|
||||
index 482fd13420..d6c8c33915 100644
|
||||
--- a/hw/s390x/meson.build
|
||||
+++ b/hw/s390x/meson.build
|
||||
@@ -12,7 +12,6 @@ s390x_ss.add(files(
|
||||
's390-pci-inst.c',
|
||||
's390-skeys.c',
|
||||
's390-stattrib.c',
|
||||
- 's390-virtio-hcall.c',
|
||||
'sclp.c',
|
||||
'sclpcpu.c',
|
||||
'sclpquiesce.c',
|
||||
@@ -28,7 +27,10 @@ s390x_ss.add(when: 'CONFIG_KVM', if_true: files(
|
||||
s390x_ss.add(when: 'CONFIG_TCG', if_true: files(
|
||||
'tod-tcg.c',
|
||||
))
|
||||
-s390x_ss.add(when: 'CONFIG_S390_CCW_VIRTIO', if_true: files('s390-virtio-ccw.c'))
|
||||
+s390x_ss.add(when: 'CONFIG_S390_CCW_VIRTIO', if_true: files(
|
||||
+ 's390-virtio-ccw.c',
|
||||
+ 's390-virtio-hcall.c',
|
||||
+))
|
||||
s390x_ss.add(when: 'CONFIG_TERMINAL3270', if_true: files('3270-ccw.c'))
|
||||
s390x_ss.add(when: 'CONFIG_VFIO', if_true: files('s390-pci-vfio.c'))
|
||||
|
||||
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
|
||||
index 82ded9666c..d47e99028e 100644
|
||||
--- a/hw/s390x/s390-virtio-ccw.c
|
||||
+++ b/hw/s390x/s390-virtio-ccw.c
|
||||
@@ -16,11 +16,8 @@
|
||||
#include "exec/ram_addr.h"
|
||||
#include "exec/confidential-guest-support.h"
|
||||
#include "hw/boards.h"
|
||||
-#include "hw/s390x/s390-virtio-hcall.h"
|
||||
#include "hw/s390x/sclp.h"
|
||||
#include "hw/s390x/s390_flic.h"
|
||||
-#include "hw/s390x/ioinst.h"
|
||||
-#include "hw/s390x/css.h"
|
||||
#include "virtio-ccw.h"
|
||||
#include "qemu/config-file.h"
|
||||
#include "qemu/ctype.h"
|
||||
@@ -124,58 +121,6 @@ static void subsystem_reset(void)
|
||||
}
|
||||
}
|
||||
|
||||
-static int virtio_ccw_hcall_notify(const uint64_t *args)
|
||||
-{
|
||||
- uint64_t subch_id = args[0];
|
||||
- uint64_t data = args[1];
|
||||
- SubchDev *sch;
|
||||
- VirtIODevice *vdev;
|
||||
- int cssid, ssid, schid, m;
|
||||
- uint16_t vq_idx = data;
|
||||
-
|
||||
- if (ioinst_disassemble_sch_ident(subch_id, &m, &cssid, &ssid, &schid)) {
|
||||
- return -EINVAL;
|
||||
- }
|
||||
- sch = css_find_subch(m, cssid, ssid, schid);
|
||||
- if (!sch || !css_subch_visible(sch)) {
|
||||
- return -EINVAL;
|
||||
- }
|
||||
-
|
||||
- vdev = virtio_ccw_get_vdev(sch);
|
||||
- if (vq_idx >= VIRTIO_QUEUE_MAX || !virtio_queue_get_num(vdev, vq_idx)) {
|
||||
- return -EINVAL;
|
||||
- }
|
||||
-
|
||||
- if (virtio_vdev_has_feature(vdev, VIRTIO_F_NOTIFICATION_DATA)) {
|
||||
- virtio_queue_set_shadow_avail_idx(virtio_get_queue(vdev, vq_idx),
|
||||
- (data >> 16) & 0xFFFF);
|
||||
- }
|
||||
-
|
||||
- virtio_queue_notify(vdev, vq_idx);
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
-static int virtio_ccw_hcall_early_printk(const uint64_t *args)
|
||||
-{
|
||||
- uint64_t mem = args[0];
|
||||
- MachineState *ms = MACHINE(qdev_get_machine());
|
||||
-
|
||||
- if (mem < ms->ram_size) {
|
||||
- /* Early printk */
|
||||
- return 0;
|
||||
- }
|
||||
- return -EINVAL;
|
||||
-}
|
||||
-
|
||||
-static void virtio_ccw_register_hcalls(void)
|
||||
-{
|
||||
- s390_register_virtio_hypercall(KVM_S390_VIRTIO_CCW_NOTIFY,
|
||||
- virtio_ccw_hcall_notify);
|
||||
- /* Tolerate early printk. */
|
||||
- s390_register_virtio_hypercall(KVM_S390_VIRTIO_NOTIFY,
|
||||
- virtio_ccw_hcall_early_printk);
|
||||
-}
|
||||
-
|
||||
static void s390_memory_init(MemoryRegion *ram)
|
||||
{
|
||||
MemoryRegion *sysmem = get_system_memory();
|
||||
@@ -296,9 +241,6 @@ static void ccw_init(MachineState *machine)
|
||||
OBJECT(dev));
|
||||
sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
|
||||
|
||||
- /* register hypercalls */
|
||||
- virtio_ccw_register_hcalls();
|
||||
-
|
||||
s390_enable_css_support(s390_cpu_addr2state(0));
|
||||
|
||||
ret = css_create_css_image(VIRTUAL_CSSID, true);
|
||||
diff --git a/hw/s390x/s390-virtio-hcall.c b/hw/s390x/s390-virtio-hcall.c
|
||||
index ec7cf8beb3..ca49e3cd22 100644
|
||||
--- a/hw/s390x/s390-virtio-hcall.c
|
||||
+++ b/hw/s390x/s390-virtio-hcall.c
|
||||
@@ -11,31 +11,64 @@
|
||||
|
||||
#include "qemu/osdep.h"
|
||||
#include "cpu.h"
|
||||
+#include "hw/boards.h"
|
||||
#include "hw/s390x/s390-virtio-hcall.h"
|
||||
+#include "hw/s390x/ioinst.h"
|
||||
+#include "hw/s390x/css.h"
|
||||
+#include "virtio-ccw.h"
|
||||
|
||||
-#define MAX_DIAG_SUBCODES 255
|
||||
+static int handle_virtio_notify(uint64_t mem)
|
||||
+{
|
||||
+ MachineState *ms = MACHINE(qdev_get_machine());
|
||||
|
||||
-static s390_virtio_fn s390_diag500_table[MAX_DIAG_SUBCODES];
|
||||
+ if (mem < ms->ram_size) {
|
||||
+ /* Early printk */
|
||||
+ return 0;
|
||||
+ }
|
||||
+ return -EINVAL;
|
||||
+}
|
||||
|
||||
-void s390_register_virtio_hypercall(uint64_t code, s390_virtio_fn fn)
|
||||
+static int handle_virtio_ccw_notify(uint64_t subch_id, uint64_t data)
|
||||
{
|
||||
- assert(code < MAX_DIAG_SUBCODES);
|
||||
- assert(!s390_diag500_table[code]);
|
||||
+ SubchDev *sch;
|
||||
+ VirtIODevice *vdev;
|
||||
+ int cssid, ssid, schid, m;
|
||||
+ uint16_t vq_idx = data;
|
||||
+
|
||||
+ if (ioinst_disassemble_sch_ident(subch_id, &m, &cssid, &ssid, &schid)) {
|
||||
+ return -EINVAL;
|
||||
+ }
|
||||
+ sch = css_find_subch(m, cssid, ssid, schid);
|
||||
+ if (!sch || !css_subch_visible(sch)) {
|
||||
+ return -EINVAL;
|
||||
+ }
|
||||
|
||||
- s390_diag500_table[code] = fn;
|
||||
+ vdev = virtio_ccw_get_vdev(sch);
|
||||
+ if (vq_idx >= VIRTIO_QUEUE_MAX || !virtio_queue_get_num(vdev, vq_idx)) {
|
||||
+ return -EINVAL;
|
||||
+ }
|
||||
+
|
||||
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_NOTIFICATION_DATA)) {
|
||||
+ virtio_queue_set_shadow_avail_idx(virtio_get_queue(vdev, vq_idx),
|
||||
+ (data >> 16) & 0xFFFF);
|
||||
+ }
|
||||
+
|
||||
+ virtio_queue_notify(vdev, vq_idx);
|
||||
+ return 0;
|
||||
}
|
||||
|
||||
int s390_virtio_hypercall(CPUS390XState *env)
|
||||
{
|
||||
- s390_virtio_fn fn;
|
||||
-
|
||||
- if (env->regs[1] < MAX_DIAG_SUBCODES) {
|
||||
- fn = s390_diag500_table[env->regs[1]];
|
||||
- if (fn) {
|
||||
- env->regs[2] = fn(&env->regs[2]);
|
||||
- return 0;
|
||||
- }
|
||||
- }
|
||||
+ const uint64_t subcode = env->regs[1];
|
||||
|
||||
- return -EINVAL;
|
||||
+ switch (subcode) {
|
||||
+ case KVM_S390_VIRTIO_NOTIFY:
|
||||
+ env->regs[2] = handle_virtio_notify(env->regs[2]);
|
||||
+ return 0;
|
||||
+ case KVM_S390_VIRTIO_CCW_NOTIFY:
|
||||
+ env->regs[2] = handle_virtio_ccw_notify(env->regs[2], env->regs[3]);
|
||||
+ return 0;
|
||||
+ default:
|
||||
+ return -EINVAL;
|
||||
+ }
|
||||
}
|
||||
diff --git a/hw/s390x/s390-virtio-hcall.h b/hw/s390x/s390-virtio-hcall.h
|
||||
index 3ae6d6ae3a..3d9fe147d2 100644
|
||||
--- a/hw/s390x/s390-virtio-hcall.h
|
||||
+++ b/hw/s390x/s390-virtio-hcall.h
|
||||
@@ -18,8 +18,6 @@
|
||||
/* The only thing that we need from the old kvm_virtio.h file */
|
||||
#define KVM_S390_VIRTIO_NOTIFY 0
|
||||
|
||||
-typedef int (*s390_virtio_fn)(const uint64_t *args);
|
||||
-void s390_register_virtio_hypercall(uint64_t code, s390_virtio_fn fn);
|
||||
int s390_virtio_hypercall(CPUS390XState *env);
|
||||
|
||||
#endif /* HW_S390_VIRTIO_HCALL_H */
|
||||
diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c
|
||||
index 7a0ca5570f..5947dda829 100644
|
||||
--- a/target/s390x/kvm/kvm.c
|
||||
+++ b/target/s390x/kvm/kvm.c
|
||||
@@ -51,6 +51,7 @@
|
||||
#include "hw/s390x/s390-virtio-ccw.h"
|
||||
#include "hw/s390x/s390-virtio-hcall.h"
|
||||
#include "target/s390x/kvm/pv.h"
|
||||
+#include CONFIG_DEVICES
|
||||
|
||||
#define kvm_vm_check_mem_attr(s, attr) \
|
||||
kvm_vm_check_attr(s, KVM_S390_VM_MEM_CTRL, attr)
|
||||
@@ -1494,9 +1495,11 @@ static int handle_e3(S390CPU *cpu, struct kvm_run *run, uint8_t ipbl)
|
||||
static int handle_hypercall(S390CPU *cpu, struct kvm_run *run)
|
||||
{
|
||||
CPUS390XState *env = &cpu->env;
|
||||
- int ret;
|
||||
+ int ret = -EINVAL;
|
||||
|
||||
+#ifdef CONFIG_S390_CCW_VIRTIO
|
||||
ret = s390_virtio_hypercall(env);
|
||||
+#endif /* CONFIG_S390_CCW_VIRTIO */
|
||||
if (ret == -EINVAL) {
|
||||
kvm_s390_program_interrupt(cpu, PGM_SPECIFICATION);
|
||||
return 0;
|
||||
diff --git a/target/s390x/tcg/misc_helper.c b/target/s390x/tcg/misc_helper.c
|
||||
index 303f86d363..f44136a568 100644
|
||||
--- a/target/s390x/tcg/misc_helper.c
|
||||
+++ b/target/s390x/tcg/misc_helper.c
|
||||
@@ -43,6 +43,7 @@
|
||||
#include "hw/s390x/s390-pci-inst.h"
|
||||
#include "hw/boards.h"
|
||||
#include "hw/s390x/tod.h"
|
||||
+#include CONFIG_DEVICES
|
||||
#endif
|
||||
|
||||
/* #define DEBUG_HELPER */
|
||||
@@ -116,12 +117,14 @@ void HELPER(diag)(CPUS390XState *env, uint32_t r1, uint32_t r3, uint32_t num)
|
||||
uint64_t r;
|
||||
|
||||
switch (num) {
|
||||
+#ifdef CONFIG_S390_CCW_VIRTIO
|
||||
case 0x500:
|
||||
/* KVM hypercall */
|
||||
bql_lock();
|
||||
r = s390_virtio_hypercall(env);
|
||||
bql_unlock();
|
||||
break;
|
||||
+#endif /* CONFIG_S390_CCW_VIRTIO */
|
||||
case 0x44:
|
||||
/* yield */
|
||||
r = 0;
|
||||
--
|
||||
2.48.1
|
||||
|
||||
@ -0,0 +1,423 @@
|
||||
From 6b82fca2ecac0c7b30780ebb71ce5bad0421b9b4 Mon Sep 17 00:00:00 2001
|
||||
From: David Hildenbrand <david@redhat.com>
|
||||
Date: Thu, 19 Dec 2024 15:41:14 +0100
|
||||
Subject: [PATCH 22/26] s390x/virtio-ccw: add support for virtio based memory
|
||||
devices
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [22/26] 270a9fbe7e5bacfa6c9377815a01da26c4d26097 (thuth/qemu-kvm-cs)
|
||||
|
||||
Let's implement support for abstract virtio based memory devices, using
|
||||
the virtio-pci implementation as an orientation. Wire them up in the
|
||||
machine hotplug handler, taking care of s390x page size limitations.
|
||||
|
||||
As we support neither virtio-mem nor virtio-pmem yet, the code is
|
||||
effectively unused. We'll implement support for virtio-mem based on this
|
||||
next.
|
||||
|
||||
Note that we won't wire up the virtio-pci variant (should currently be
|
||||
impossible due to lack of support for MSI-X), but we'll add a safety net
|
||||
to reject plugging them in the pre-plug handler.
|
||||
|
||||
Message-ID: <20241219144115.2820241-14-david@redhat.com>
|
||||
Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Signed-off-by: David Hildenbrand <david@redhat.com>
|
||||
(cherry picked from commit 88d86f6f1e36741ba9e1625da19a7ccf1a343d39)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
MAINTAINERS | 3 +
|
||||
hw/s390x/meson.build | 3 +
|
||||
hw/s390x/s390-virtio-ccw.c | 47 +++++++++-
|
||||
hw/s390x/virtio-ccw-md-stubs.c | 24 ++++++
|
||||
hw/s390x/virtio-ccw-md.c | 153 +++++++++++++++++++++++++++++++++
|
||||
hw/s390x/virtio-ccw-md.h | 44 ++++++++++
|
||||
hw/virtio/Kconfig | 1 +
|
||||
7 files changed, 274 insertions(+), 1 deletion(-)
|
||||
create mode 100644 hw/s390x/virtio-ccw-md-stubs.c
|
||||
create mode 100644 hw/s390x/virtio-ccw-md.c
|
||||
create mode 100644 hw/s390x/virtio-ccw-md.h
|
||||
|
||||
diff --git a/MAINTAINERS b/MAINTAINERS
|
||||
index 3584d6a6c6..f21dc3fa75 100644
|
||||
--- a/MAINTAINERS
|
||||
+++ b/MAINTAINERS
|
||||
@@ -2387,6 +2387,9 @@ F: include/hw/virtio/virtio-crypto.h
|
||||
virtio based memory device
|
||||
M: David Hildenbrand <david@redhat.com>
|
||||
S: Supported
|
||||
+F: hw/s390x/virtio-ccw-md.c
|
||||
+F: hw/s390x/virtio-ccw-md.h
|
||||
+F: hw/s390x/virtio-ccw-md-stubs.c
|
||||
F: hw/virtio/virtio-md-pci.c
|
||||
F: include/hw/virtio/virtio-md-pci.h
|
||||
F: stubs/virtio-md-pci.c
|
||||
diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build
|
||||
index e344a3bd8c..4431868408 100644
|
||||
--- a/hw/s390x/meson.build
|
||||
+++ b/hw/s390x/meson.build
|
||||
@@ -50,8 +50,11 @@ endif
|
||||
virtio_ss.add(when: 'CONFIG_VHOST_SCSI', if_true: files('vhost-scsi-ccw.c'))
|
||||
virtio_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: files('vhost-vsock-ccw.c'))
|
||||
virtio_ss.add(when: 'CONFIG_VHOST_USER_FS', if_true: files('vhost-user-fs-ccw.c'))
|
||||
+virtio_ss.add(when: 'CONFIG_VIRTIO_MD', if_true: files('virtio-ccw-md.c'))
|
||||
s390x_ss.add_all(when: 'CONFIG_VIRTIO_CCW', if_true: virtio_ss)
|
||||
|
||||
+s390x_ss.add(when: 'CONFIG_VIRTIO_MD', if_false: files('virtio-ccw-md-stubs.c'))
|
||||
+
|
||||
hw_arch += {'s390x': s390x_ss}
|
||||
|
||||
hw_s390x_modules = {}
|
||||
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
|
||||
index bd05a22b4e..9f4ad01789 100644
|
||||
--- a/hw/s390x/s390-virtio-ccw.c
|
||||
+++ b/hw/s390x/s390-virtio-ccw.c
|
||||
@@ -46,6 +46,8 @@
|
||||
#include "qapi/visitor.h"
|
||||
#include "hw/s390x/cpu-topology.h"
|
||||
#include "kvm/kvm_s390x.h"
|
||||
+#include "hw/virtio/virtio-md-pci.h"
|
||||
+#include "hw/s390x/virtio-ccw-md.h"
|
||||
#include CONFIG_DEVICES
|
||||
|
||||
static Error *pv_mig_blocker;
|
||||
@@ -546,11 +548,39 @@ static void s390_machine_reset(MachineState *machine, ResetType type)
|
||||
s390_ipl_clear_reset_request();
|
||||
}
|
||||
|
||||
+static void s390_machine_device_pre_plug(HotplugHandler *hotplug_dev,
|
||||
+ DeviceState *dev, Error **errp)
|
||||
+{
|
||||
+ if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_CCW)) {
|
||||
+ virtio_ccw_md_pre_plug(VIRTIO_MD_CCW(dev), MACHINE(hotplug_dev), errp);
|
||||
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
|
||||
+ error_setg(errp,
|
||||
+ "PCI-attached virtio based memory devices not supported");
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
static void s390_machine_device_plug(HotplugHandler *hotplug_dev,
|
||||
DeviceState *dev, Error **errp)
|
||||
{
|
||||
+ S390CcwMachineState *s390ms = S390_CCW_MACHINE(hotplug_dev);
|
||||
+
|
||||
if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
|
||||
s390_cpu_plug(hotplug_dev, dev, errp);
|
||||
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_CCW)) {
|
||||
+ /*
|
||||
+ * At this point, the device is realized and has set all memdevs mapped, so
|
||||
+ * qemu_maxrampagesize() will pick up the page sizes of these memdevs
|
||||
+ * as well. Before we plug the device and expose any RAM memory regions
|
||||
+ * to the system, make sure we don't exceed the previously set max page
|
||||
+ * size. While only relevant for KVM, there is not really any use case
|
||||
+ * for this with TCG, so we'll unconditionally reject it.
|
||||
+ */
|
||||
+ if (qemu_maxrampagesize() != s390ms->max_pagesize) {
|
||||
+ error_setg(errp, "Memory device uses a bigger page size than"
|
||||
+ " initial memory");
|
||||
+ return;
|
||||
+ }
|
||||
+ virtio_ccw_md_plug(VIRTIO_MD_CCW(dev), MACHINE(hotplug_dev), errp);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -560,9 +590,20 @@ static void s390_machine_device_unplug_request(HotplugHandler *hotplug_dev,
|
||||
if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
|
||||
error_setg(errp, "CPU hot unplug not supported on this machine");
|
||||
return;
|
||||
+ } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_CCW)) {
|
||||
+ virtio_ccw_md_unplug_request(VIRTIO_MD_CCW(dev), MACHINE(hotplug_dev),
|
||||
+ errp);
|
||||
}
|
||||
}
|
||||
|
||||
+static void s390_machine_device_unplug(HotplugHandler *hotplug_dev,
|
||||
+ DeviceState *dev, Error **errp)
|
||||
+{
|
||||
+ if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_CCW)) {
|
||||
+ virtio_ccw_md_unplug(VIRTIO_MD_CCW(dev), MACHINE(hotplug_dev), errp);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
static CpuInstanceProperties s390_cpu_index_to_props(MachineState *ms,
|
||||
unsigned cpu_index)
|
||||
{
|
||||
@@ -609,7 +650,9 @@ static const CPUArchIdList *s390_possible_cpu_arch_ids(MachineState *ms)
|
||||
static HotplugHandler *s390_get_hotplug_handler(MachineState *machine,
|
||||
DeviceState *dev)
|
||||
{
|
||||
- if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
|
||||
+ if (object_dynamic_cast(OBJECT(dev), TYPE_CPU) ||
|
||||
+ object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_CCW) ||
|
||||
+ object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) {
|
||||
return HOTPLUG_HANDLER(machine);
|
||||
}
|
||||
return NULL;
|
||||
@@ -769,8 +812,10 @@ static void ccw_machine_class_init(ObjectClass *oc, void *data)
|
||||
mc->possible_cpu_arch_ids = s390_possible_cpu_arch_ids;
|
||||
/* it is overridden with 'host' cpu *in kvm_arch_init* */
|
||||
mc->default_cpu_type = S390_CPU_TYPE_NAME("qemu");
|
||||
+ hc->pre_plug = s390_machine_device_pre_plug;
|
||||
hc->plug = s390_machine_device_plug;
|
||||
hc->unplug_request = s390_machine_device_unplug_request;
|
||||
+ hc->unplug = s390_machine_device_unplug;
|
||||
nc->nmi_monitor_handler = s390_nmi;
|
||||
mc->default_ram_id = "s390.ram";
|
||||
mc->default_nic = "virtio-net-ccw";
|
||||
diff --git a/hw/s390x/virtio-ccw-md-stubs.c b/hw/s390x/virtio-ccw-md-stubs.c
|
||||
new file mode 100644
|
||||
index 0000000000..e937865550
|
||||
--- /dev/null
|
||||
+++ b/hw/s390x/virtio-ccw-md-stubs.c
|
||||
@@ -0,0 +1,24 @@
|
||||
+#include "qemu/osdep.h"
|
||||
+#include "qapi/error.h"
|
||||
+#include "hw/s390x/virtio-ccw-md.h"
|
||||
+
|
||||
+void virtio_ccw_md_pre_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp)
|
||||
+{
|
||||
+ error_setg(errp, "virtio based memory devices not supported");
|
||||
+}
|
||||
+
|
||||
+void virtio_ccw_md_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp)
|
||||
+{
|
||||
+ error_setg(errp, "virtio based memory devices not supported");
|
||||
+}
|
||||
+
|
||||
+void virtio_ccw_md_unplug_request(VirtIOMDCcw *vmd, MachineState *ms,
|
||||
+ Error **errp)
|
||||
+{
|
||||
+ error_setg(errp, "virtio based memory devices not supported");
|
||||
+}
|
||||
+
|
||||
+void virtio_ccw_md_unplug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp)
|
||||
+{
|
||||
+ error_setg(errp, "virtio based memory devices not supported");
|
||||
+}
|
||||
diff --git a/hw/s390x/virtio-ccw-md.c b/hw/s390x/virtio-ccw-md.c
|
||||
new file mode 100644
|
||||
index 0000000000..de333282df
|
||||
--- /dev/null
|
||||
+++ b/hw/s390x/virtio-ccw-md.c
|
||||
@@ -0,0 +1,153 @@
|
||||
+/*
|
||||
+ * Virtio CCW support for abstract virtio based memory device
|
||||
+ *
|
||||
+ * Copyright (C) 2024 Red Hat, Inc.
|
||||
+ *
|
||||
+ * Authors:
|
||||
+ * David Hildenbrand <david@redhat.com>
|
||||
+ *
|
||||
+ * This work is licensed under the terms of the GNU GPL, version 2.
|
||||
+ * See the COPYING file in the top-level directory.
|
||||
+ */
|
||||
+
|
||||
+#include "qemu/osdep.h"
|
||||
+#include "hw/s390x/virtio-ccw-md.h"
|
||||
+#include "hw/mem/memory-device.h"
|
||||
+#include "qapi/error.h"
|
||||
+#include "qemu/error-report.h"
|
||||
+
|
||||
+void virtio_ccw_md_pre_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp)
|
||||
+{
|
||||
+ DeviceState *dev = DEVICE(vmd);
|
||||
+ HotplugHandler *bus_handler = qdev_get_bus_hotplug_handler(dev);
|
||||
+ MemoryDeviceState *md = MEMORY_DEVICE(vmd);
|
||||
+ Error *local_err = NULL;
|
||||
+
|
||||
+ if (!bus_handler && dev->hotplugged) {
|
||||
+ /*
|
||||
+ * Without a bus hotplug handler, we cannot control the plug/unplug
|
||||
+ * order. We should never reach this point when hotplugging, but
|
||||
+ * better add a safety net.
|
||||
+ */
|
||||
+ error_setg(errp, "hotplug of virtio based memory devices not supported"
|
||||
+ " on this bus.");
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * First, see if we can plug this memory device at all. If that
|
||||
+ * succeeds, branch off to the actual hotplug handler.
|
||||
+ */
|
||||
+ memory_device_pre_plug(md, ms, &local_err);
|
||||
+ if (!local_err && bus_handler) {
|
||||
+ hotplug_handler_pre_plug(bus_handler, dev, &local_err);
|
||||
+ }
|
||||
+ error_propagate(errp, local_err);
|
||||
+}
|
||||
+
|
||||
+void virtio_ccw_md_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp)
|
||||
+{
|
||||
+ DeviceState *dev = DEVICE(vmd);
|
||||
+ HotplugHandler *bus_handler = qdev_get_bus_hotplug_handler(dev);
|
||||
+ MemoryDeviceState *md = MEMORY_DEVICE(vmd);
|
||||
+ Error *local_err = NULL;
|
||||
+
|
||||
+ /*
|
||||
+ * Plug the memory device first and then branch off to the actual
|
||||
+ * hotplug handler. If that one fails, we can easily undo the memory
|
||||
+ * device bits.
|
||||
+ */
|
||||
+ memory_device_plug(md, ms);
|
||||
+ if (bus_handler) {
|
||||
+ hotplug_handler_plug(bus_handler, dev, &local_err);
|
||||
+ if (local_err) {
|
||||
+ memory_device_unplug(md, ms);
|
||||
+ }
|
||||
+ }
|
||||
+ error_propagate(errp, local_err);
|
||||
+}
|
||||
+
|
||||
+void virtio_ccw_md_unplug_request(VirtIOMDCcw *vmd, MachineState *ms,
|
||||
+ Error **errp)
|
||||
+{
|
||||
+ VirtIOMDCcwClass *vmdc = VIRTIO_MD_CCW_GET_CLASS(vmd);
|
||||
+ DeviceState *dev = DEVICE(vmd);
|
||||
+ HotplugHandler *bus_handler = qdev_get_bus_hotplug_handler(dev);
|
||||
+ HotplugHandlerClass *hdc;
|
||||
+ Error *local_err = NULL;
|
||||
+
|
||||
+ if (!vmdc->unplug_request_check) {
|
||||
+ error_setg(errp,
|
||||
+ "this virtio based memory devices cannot be unplugged");
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ if (!bus_handler) {
|
||||
+ error_setg(errp, "hotunplug of virtio based memory devices not"
|
||||
+ "supported on this bus");
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ vmdc->unplug_request_check(vmd, &local_err);
|
||||
+ if (local_err) {
|
||||
+ error_propagate(errp, local_err);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * Forward the async request or turn it into a sync request (handling it
|
||||
+ * like qdev_unplug()).
|
||||
+ */
|
||||
+ hdc = HOTPLUG_HANDLER_GET_CLASS(bus_handler);
|
||||
+ if (hdc->unplug_request) {
|
||||
+ hotplug_handler_unplug_request(bus_handler, dev, &local_err);
|
||||
+ } else {
|
||||
+ virtio_ccw_md_unplug(vmd, ms, &local_err);
|
||||
+ if (!local_err) {
|
||||
+ object_unparent(OBJECT(dev));
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+void virtio_ccw_md_unplug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp)
|
||||
+{
|
||||
+ DeviceState *dev = DEVICE(vmd);
|
||||
+ HotplugHandler *bus_handler = qdev_get_bus_hotplug_handler(dev);
|
||||
+ MemoryDeviceState *md = MEMORY_DEVICE(vmd);
|
||||
+ Error *local_err = NULL;
|
||||
+
|
||||
+ /* Unplug the memory device while it is still realized. */
|
||||
+ memory_device_unplug(md, ms);
|
||||
+
|
||||
+ if (bus_handler) {
|
||||
+ hotplug_handler_unplug(bus_handler, dev, &local_err);
|
||||
+ if (local_err) {
|
||||
+ /* Not expected to fail ... but still try to recover. */
|
||||
+ memory_device_plug(md, ms);
|
||||
+ error_propagate(errp, local_err);
|
||||
+ return;
|
||||
+ }
|
||||
+ } else {
|
||||
+ /* Very unexpected, but let's just try to do the right thing. */
|
||||
+ warn_report("Unexpected unplug of virtio based memory device");
|
||||
+ qdev_unrealize(dev);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static const TypeInfo virtio_ccw_md_info = {
|
||||
+ .name = TYPE_VIRTIO_MD_CCW,
|
||||
+ .parent = TYPE_VIRTIO_CCW_DEVICE,
|
||||
+ .instance_size = sizeof(VirtIOMDCcw),
|
||||
+ .class_size = sizeof(VirtIOMDCcwClass),
|
||||
+ .abstract = true,
|
||||
+ .interfaces = (InterfaceInfo[]) {
|
||||
+ { TYPE_MEMORY_DEVICE },
|
||||
+ { }
|
||||
+ },
|
||||
+};
|
||||
+
|
||||
+static void virtio_ccw_md_register(void)
|
||||
+{
|
||||
+ type_register_static(&virtio_ccw_md_info);
|
||||
+}
|
||||
+type_init(virtio_ccw_md_register)
|
||||
diff --git a/hw/s390x/virtio-ccw-md.h b/hw/s390x/virtio-ccw-md.h
|
||||
new file mode 100644
|
||||
index 0000000000..39ba864c92
|
||||
--- /dev/null
|
||||
+++ b/hw/s390x/virtio-ccw-md.h
|
||||
@@ -0,0 +1,44 @@
|
||||
+/*
|
||||
+ * Virtio CCW support for abstract virtio based memory device
|
||||
+ *
|
||||
+ * Copyright (C) 2024 Red Hat, Inc.
|
||||
+ *
|
||||
+ * Authors:
|
||||
+ * David Hildenbrand <david@redhat.com>
|
||||
+ *
|
||||
+ * This work is licensed under the terms of the GNU GPL, version 2.
|
||||
+ * See the COPYING file in the top-level directory.
|
||||
+ */
|
||||
+
|
||||
+#ifndef HW_S390X_VIRTIO_CCW_MD_H
|
||||
+#define HW_S390X_VIRTIO_CCW_MD_H
|
||||
+
|
||||
+#include "virtio-ccw.h"
|
||||
+#include "qom/object.h"
|
||||
+
|
||||
+/*
|
||||
+ * virtio-md-ccw: This extends VirtioCcwDevice.
|
||||
+ */
|
||||
+#define TYPE_VIRTIO_MD_CCW "virtio-md-ccw"
|
||||
+
|
||||
+OBJECT_DECLARE_TYPE(VirtIOMDCcw, VirtIOMDCcwClass, VIRTIO_MD_CCW)
|
||||
+
|
||||
+struct VirtIOMDCcwClass {
|
||||
+ /* private */
|
||||
+ VirtIOCCWDeviceClass parent;
|
||||
+
|
||||
+ /* public */
|
||||
+ void (*unplug_request_check)(VirtIOMDCcw *vmd, Error **errp);
|
||||
+};
|
||||
+
|
||||
+struct VirtIOMDCcw {
|
||||
+ VirtioCcwDevice parent_obj;
|
||||
+};
|
||||
+
|
||||
+void virtio_ccw_md_pre_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp);
|
||||
+void virtio_ccw_md_plug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp);
|
||||
+void virtio_ccw_md_unplug_request(VirtIOMDCcw *vmd, MachineState *ms,
|
||||
+ Error **errp);
|
||||
+void virtio_ccw_md_unplug(VirtIOMDCcw *vmd, MachineState *ms, Error **errp);
|
||||
+
|
||||
+#endif /* HW_S390X_VIRTIO_CCW_MD_H */
|
||||
diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig
|
||||
index 0afec2ae92..f4b14e1a44 100644
|
||||
--- a/hw/virtio/Kconfig
|
||||
+++ b/hw/virtio/Kconfig
|
||||
@@ -25,6 +25,7 @@ config VIRTIO_MMIO
|
||||
config VIRTIO_CCW
|
||||
bool
|
||||
select VIRTIO
|
||||
+ select VIRTIO_MD_SUPPORTED
|
||||
|
||||
config VIRTIO_BALLOON
|
||||
bool
|
||||
--
|
||||
2.48.1
|
||||
|
||||
459
SOURCES/kvm-s390x-virtio-mem-support.patch
Normal file
459
SOURCES/kvm-s390x-virtio-mem-support.patch
Normal file
@ -0,0 +1,459 @@
|
||||
From fa68427f55bee8d18d846e03ebf9f1eeb80f274d Mon Sep 17 00:00:00 2001
|
||||
From: David Hildenbrand <david@redhat.com>
|
||||
Date: Thu, 19 Dec 2024 15:41:15 +0100
|
||||
Subject: [PATCH 23/26] s390x: virtio-mem support
|
||||
|
||||
RH-Author: Thomas Huth <thuth@redhat.com>
|
||||
RH-MergeRequest: 351: Enable virtio-mem support on s390x
|
||||
RH-Jira: RHEL-72977
|
||||
RH-Acked-by: David Hildenbrand <david@redhat.com>
|
||||
RH-Acked-by: Juraj Marcin <None>
|
||||
RH-Commit: [23/26] 4c59ba9025ce5ba7686a7f3e01bb70e8c580709f (thuth/qemu-kvm-cs)
|
||||
|
||||
Let's add our virtio-mem-ccw proxy device and wire it up. We should
|
||||
be supporting everything (e.g., device unplug, "dynamic-memslots") that
|
||||
we already support for the virtio-pci variant.
|
||||
|
||||
With a Linux guest that supports virtio-mem (and has automatic memory
|
||||
onlining properly configured) the following example will work:
|
||||
|
||||
1. Start a VM with 4G initial memory and a virtio-mem device with a maximum
|
||||
capacity of 16GB:
|
||||
|
||||
qemu/build/qemu-system-s390x \
|
||||
--enable-kvm \
|
||||
-m 4G,maxmem=20G \
|
||||
-nographic \
|
||||
-smp 8 \
|
||||
-hda Fedora-Server-KVM-40-1.14.s390x.qcow2 \
|
||||
-chardev socket,id=monitor,path=/var/tmp/monitor,server,nowait \
|
||||
-mon chardev=monitor,mode=readline \
|
||||
-object memory-backend-ram,id=mem0,size=16G,reserve=off \
|
||||
-device virtio-mem-ccw,id=vmem0,memdev=mem0,dynamic-memslots=on
|
||||
|
||||
2. Query the current size of the virtio-mem device:
|
||||
|
||||
(qemu) info memory-devices
|
||||
Memory device [virtio-mem]: "vmem0"
|
||||
memaddr: 0x100000000
|
||||
node: 0
|
||||
requested-size: 0
|
||||
size: 0
|
||||
max-size: 17179869184
|
||||
block-size: 1048576
|
||||
memdev: /objects/mem0
|
||||
|
||||
3. Request to grow it to 8GB (hotplug 8GB):
|
||||
|
||||
(qemu) qom-set vmem0 requested-size 8G
|
||||
(qemu) info memory-devices
|
||||
Memory device [virtio-mem]: "vmem0"
|
||||
memaddr: 0x100000000
|
||||
node: 0
|
||||
requested-size: 8589934592
|
||||
size: 8589934592
|
||||
max-size: 17179869184
|
||||
block-size: 1048576
|
||||
memdev: /objects/mem0
|
||||
|
||||
4. Request to grow to 16GB (hotplug another 8GB):
|
||||
|
||||
(qemu) qom-set vmem0 requested-size 16G
|
||||
(qemu) info memory-devices
|
||||
Memory device [virtio-mem]: "vmem0"
|
||||
memaddr: 0x100000000
|
||||
node: 0
|
||||
requested-size: 17179869184
|
||||
size: 17179869184
|
||||
max-size: 17179869184
|
||||
block-size: 1048576
|
||||
memdev: /objects/mem0
|
||||
|
||||
5. Try to hotunplug all memory again, shrinking to 0GB:
|
||||
|
||||
(qemu) qom-set vmem0 requested-size 0G
|
||||
(qemu) info memory-devices
|
||||
Memory device [virtio-mem]: "vmem0"
|
||||
memaddr: 0x100000000
|
||||
node: 0
|
||||
requested-size: 0
|
||||
size: 0
|
||||
max-size: 17179869184
|
||||
block-size: 1048576
|
||||
memdev: /objects/mem0
|
||||
|
||||
6. If it worked, unplug the device
|
||||
|
||||
(qemu) device_del vmem0
|
||||
(qemu) info memory-devices
|
||||
(qemu) object_del mem0
|
||||
|
||||
7. Hotplug a new device with a smaller capacity and directly size it to 1GB
|
||||
|
||||
(qemu) object_add memory-backend-ram,id=mem0,size=8G,reserve=off
|
||||
(qemu) device_add virtio-mem-ccw,id=vmem0,memdev=mem0,\
|
||||
dynamic-memslots=on,requested-size=1G
|
||||
(qemu) info memory-devices
|
||||
Memory device [virtio-mem]: "vmem0"
|
||||
memaddr: 0x100000000
|
||||
node: 0
|
||||
requested-size: 1073741824
|
||||
size: 1073741824
|
||||
max-size: 8589934592
|
||||
block-size: 1048576
|
||||
memdev: /objects/mem0
|
||||
|
||||
Trying to use a virtio-mem device backed by hugetlb in a !hugetlb VM
|
||||
correctly results in the error:
|
||||
... Memory device uses a bigger page size than initial memory
|
||||
|
||||
Note that the virtio-mem driver in Linux will support 1 MiB (pageblock)
|
||||
granularity.
|
||||
|
||||
Message-ID: <20241219144115.2820241-15-david@redhat.com>
|
||||
Acked-by: Michael S. Tsirkin <mst@redhat.com>
|
||||
Signed-off-by: David Hildenbrand <david@redhat.com>
|
||||
(cherry picked from commit aa910c20ec5f3b10551da19e441b3e2b54406e25)
|
||||
Signed-off-by: Thomas Huth <thuth@redhat.com>
|
||||
---
|
||||
MAINTAINERS | 2 +
|
||||
hw/s390x/Kconfig | 1 +
|
||||
hw/s390x/meson.build | 1 +
|
||||
hw/s390x/virtio-ccw-mem.c | 226 ++++++++++++++++++++++++++++++++++++++
|
||||
hw/s390x/virtio-ccw-mem.h | 34 ++++++
|
||||
hw/virtio/virtio-mem.c | 4 +-
|
||||
6 files changed, 267 insertions(+), 1 deletion(-)
|
||||
create mode 100644 hw/s390x/virtio-ccw-mem.c
|
||||
create mode 100644 hw/s390x/virtio-ccw-mem.h
|
||||
|
||||
diff --git a/MAINTAINERS b/MAINTAINERS
|
||||
index f21dc3fa75..f7b7ceffc4 100644
|
||||
--- a/MAINTAINERS
|
||||
+++ b/MAINTAINERS
|
||||
@@ -2401,6 +2401,8 @@ W: https://virtio-mem.gitlab.io/
|
||||
F: hw/virtio/virtio-mem.c
|
||||
F: hw/virtio/virtio-mem-pci.h
|
||||
F: hw/virtio/virtio-mem-pci.c
|
||||
+F: hw/s390x/virtio-ccw-mem.c
|
||||
+F: hw/s390x/virtio-ccw-mem.h
|
||||
F: include/hw/virtio/virtio-mem.h
|
||||
|
||||
virtio-snd
|
||||
diff --git a/hw/s390x/Kconfig b/hw/s390x/Kconfig
|
||||
index 3bbf4ae56e..5d57daff77 100644
|
||||
--- a/hw/s390x/Kconfig
|
||||
+++ b/hw/s390x/Kconfig
|
||||
@@ -15,3 +15,4 @@ config S390_CCW_VIRTIO
|
||||
select SCLPCONSOLE
|
||||
select VIRTIO_CCW
|
||||
select MSI_NONBROKEN
|
||||
+ select VIRTIO_MEM_SUPPORTED
|
||||
diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build
|
||||
index 4431868408..3bbebfd817 100644
|
||||
--- a/hw/s390x/meson.build
|
||||
+++ b/hw/s390x/meson.build
|
||||
@@ -51,6 +51,7 @@ virtio_ss.add(when: 'CONFIG_VHOST_SCSI', if_true: files('vhost-scsi-ccw.c'))
|
||||
virtio_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: files('vhost-vsock-ccw.c'))
|
||||
virtio_ss.add(when: 'CONFIG_VHOST_USER_FS', if_true: files('vhost-user-fs-ccw.c'))
|
||||
virtio_ss.add(when: 'CONFIG_VIRTIO_MD', if_true: files('virtio-ccw-md.c'))
|
||||
+virtio_ss.add(when: 'CONFIG_VIRTIO_MEM', if_true: files('virtio-ccw-mem.c'))
|
||||
s390x_ss.add_all(when: 'CONFIG_VIRTIO_CCW', if_true: virtio_ss)
|
||||
|
||||
s390x_ss.add(when: 'CONFIG_VIRTIO_MD', if_false: files('virtio-ccw-md-stubs.c'))
|
||||
diff --git a/hw/s390x/virtio-ccw-mem.c b/hw/s390x/virtio-ccw-mem.c
|
||||
new file mode 100644
|
||||
index 0000000000..bee0d560cb
|
||||
--- /dev/null
|
||||
+++ b/hw/s390x/virtio-ccw-mem.c
|
||||
@@ -0,0 +1,226 @@
|
||||
+/*
|
||||
+ * virtio-mem CCW implementation
|
||||
+ *
|
||||
+ * Copyright (C) 2024 Red Hat, Inc.
|
||||
+ *
|
||||
+ * Authors:
|
||||
+ * David Hildenbrand <david@redhat.com>
|
||||
+ *
|
||||
+ * This work is licensed under the terms of the GNU GPL, version 2.
|
||||
+ * See the COPYING file in the top-level directory.
|
||||
+ */
|
||||
+
|
||||
+#include "qemu/osdep.h"
|
||||
+#include "hw/qdev-properties.h"
|
||||
+#include "qapi/error.h"
|
||||
+#include "qemu/module.h"
|
||||
+#include "virtio-ccw-mem.h"
|
||||
+#include "hw/mem/memory-device.h"
|
||||
+#include "qapi/qapi-events-machine.h"
|
||||
+#include "qapi/qapi-events-misc.h"
|
||||
+
|
||||
+static void virtio_ccw_mem_realize(VirtioCcwDevice *ccw_dev, Error **errp)
|
||||
+{
|
||||
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(ccw_dev);
|
||||
+ DeviceState *vdev = DEVICE(&dev->vdev);
|
||||
+
|
||||
+ qdev_realize(vdev, BUS(&ccw_dev->bus), errp);
|
||||
+}
|
||||
+
|
||||
+static void virtio_ccw_mem_set_addr(MemoryDeviceState *md, uint64_t addr,
|
||||
+ Error **errp)
|
||||
+{
|
||||
+ object_property_set_uint(OBJECT(md), VIRTIO_MEM_ADDR_PROP, addr, errp);
|
||||
+}
|
||||
+
|
||||
+static uint64_t virtio_ccw_mem_get_addr(const MemoryDeviceState *md)
|
||||
+{
|
||||
+ return object_property_get_uint(OBJECT(md), VIRTIO_MEM_ADDR_PROP,
|
||||
+ &error_abort);
|
||||
+}
|
||||
+
|
||||
+static MemoryRegion *virtio_ccw_mem_get_memory_region(MemoryDeviceState *md,
|
||||
+ Error **errp)
|
||||
+{
|
||||
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(md);
|
||||
+ VirtIOMEM *vmem = &dev->vdev;
|
||||
+ VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem);
|
||||
+
|
||||
+ return vmc->get_memory_region(vmem, errp);
|
||||
+}
|
||||
+
|
||||
+static void virtio_ccw_mem_decide_memslots(MemoryDeviceState *md,
|
||||
+ unsigned int limit)
|
||||
+{
|
||||
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(md);
|
||||
+ VirtIOMEM *vmem = VIRTIO_MEM(&dev->vdev);
|
||||
+ VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem);
|
||||
+
|
||||
+ vmc->decide_memslots(vmem, limit);
|
||||
+}
|
||||
+
|
||||
+static unsigned int virtio_ccw_mem_get_memslots(MemoryDeviceState *md)
|
||||
+{
|
||||
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(md);
|
||||
+ VirtIOMEM *vmem = VIRTIO_MEM(&dev->vdev);
|
||||
+ VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem);
|
||||
+
|
||||
+ return vmc->get_memslots(vmem);
|
||||
+}
|
||||
+
|
||||
+static uint64_t virtio_ccw_mem_get_plugged_size(const MemoryDeviceState *md,
|
||||
+ Error **errp)
|
||||
+{
|
||||
+ return object_property_get_uint(OBJECT(md), VIRTIO_MEM_SIZE_PROP,
|
||||
+ errp);
|
||||
+}
|
||||
+
|
||||
+static void virtio_ccw_mem_fill_device_info(const MemoryDeviceState *md,
|
||||
+ MemoryDeviceInfo *info)
|
||||
+{
|
||||
+ VirtioMEMDeviceInfo *vi = g_new0(VirtioMEMDeviceInfo, 1);
|
||||
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(md);
|
||||
+ VirtIOMEM *vmem = &dev->vdev;
|
||||
+ VirtIOMEMClass *vpc = VIRTIO_MEM_GET_CLASS(vmem);
|
||||
+ DeviceState *vdev = DEVICE(md);
|
||||
+
|
||||
+ if (vdev->id) {
|
||||
+ vi->id = g_strdup(vdev->id);
|
||||
+ }
|
||||
+
|
||||
+ /* let the real device handle everything else */
|
||||
+ vpc->fill_device_info(vmem, vi);
|
||||
+
|
||||
+ info->u.virtio_mem.data = vi;
|
||||
+ info->type = MEMORY_DEVICE_INFO_KIND_VIRTIO_MEM;
|
||||
+}
|
||||
+
|
||||
+static uint64_t virtio_ccw_mem_get_min_alignment(const MemoryDeviceState *md)
|
||||
+{
|
||||
+ return object_property_get_uint(OBJECT(md), VIRTIO_MEM_BLOCK_SIZE_PROP,
|
||||
+ &error_abort);
|
||||
+}
|
||||
+
|
||||
+static void virtio_ccw_mem_size_change_notify(Notifier *notifier, void *data)
|
||||
+{
|
||||
+ VirtIOMEMCcw *dev = container_of(notifier, VirtIOMEMCcw,
|
||||
+ size_change_notifier);
|
||||
+ DeviceState *vdev = DEVICE(dev);
|
||||
+ char *qom_path = object_get_canonical_path(OBJECT(dev));
|
||||
+ const uint64_t * const size_p = data;
|
||||
+
|
||||
+ qapi_event_send_memory_device_size_change(vdev->id, *size_p, qom_path);
|
||||
+ g_free(qom_path);
|
||||
+}
|
||||
+
|
||||
+static void virtio_ccw_mem_unplug_request_check(VirtIOMDCcw *vmd, Error **errp)
|
||||
+{
|
||||
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(vmd);
|
||||
+ VirtIOMEM *vmem = &dev->vdev;
|
||||
+ VirtIOMEMClass *vpc = VIRTIO_MEM_GET_CLASS(vmem);
|
||||
+
|
||||
+ vpc->unplug_request_check(vmem, errp);
|
||||
+}
|
||||
+
|
||||
+static void virtio_ccw_mem_get_requested_size(Object *obj, Visitor *v,
|
||||
+ const char *name, void *opaque,
|
||||
+ Error **errp)
|
||||
+{
|
||||
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(obj);
|
||||
+
|
||||
+ object_property_get(OBJECT(&dev->vdev), name, v, errp);
|
||||
+}
|
||||
+
|
||||
+static void virtio_ccw_mem_set_requested_size(Object *obj, Visitor *v,
|
||||
+ const char *name, void *opaque,
|
||||
+ Error **errp)
|
||||
+{
|
||||
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(obj);
|
||||
+ DeviceState *vdev = DEVICE(obj);
|
||||
+
|
||||
+ /*
|
||||
+ * If we passed virtio_ccw_mem_unplug_request_check(), making sure that
|
||||
+ * the requested size is 0, don't allow modifying the requested size
|
||||
+ * anymore, otherwise the VM might end up hotplugging memory before
|
||||
+ * handling the unplug request.
|
||||
+ */
|
||||
+ if (vdev->pending_deleted_event) {
|
||||
+ error_setg(errp, "'%s' cannot be changed if the device is in the"
|
||||
+ " process of unplug", name);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ object_property_set(OBJECT(&dev->vdev), name, v, errp);
|
||||
+}
|
||||
+
|
||||
+static Property virtio_ccw_mem_properties[] = {
|
||||
+ DEFINE_PROP_BIT("ioeventfd", VirtioCcwDevice, flags,
|
||||
+ VIRTIO_CCW_FLAG_USE_IOEVENTFD_BIT, true),
|
||||
+ DEFINE_PROP_UINT32("max_revision", VirtioCcwDevice, max_rev,
|
||||
+ VIRTIO_CCW_MAX_REV),
|
||||
+ DEFINE_PROP_END_OF_LIST(),
|
||||
+};
|
||||
+
|
||||
+static void virtio_ccw_mem_class_init(ObjectClass *klass, void *data)
|
||||
+{
|
||||
+ DeviceClass *dc = DEVICE_CLASS(klass);
|
||||
+ VirtIOCCWDeviceClass *k = VIRTIO_CCW_DEVICE_CLASS(klass);
|
||||
+ MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass);
|
||||
+ VirtIOMDCcwClass *vmdc = VIRTIO_MD_CCW_CLASS(klass);
|
||||
+
|
||||
+ k->realize = virtio_ccw_mem_realize;
|
||||
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
|
||||
+ device_class_set_props(dc, virtio_ccw_mem_properties);
|
||||
+
|
||||
+ mdc->get_addr = virtio_ccw_mem_get_addr;
|
||||
+ mdc->set_addr = virtio_ccw_mem_set_addr;
|
||||
+ mdc->get_plugged_size = virtio_ccw_mem_get_plugged_size;
|
||||
+ mdc->get_memory_region = virtio_ccw_mem_get_memory_region;
|
||||
+ mdc->decide_memslots = virtio_ccw_mem_decide_memslots;
|
||||
+ mdc->get_memslots = virtio_ccw_mem_get_memslots;
|
||||
+ mdc->fill_device_info = virtio_ccw_mem_fill_device_info;
|
||||
+ mdc->get_min_alignment = virtio_ccw_mem_get_min_alignment;
|
||||
+
|
||||
+ vmdc->unplug_request_check = virtio_ccw_mem_unplug_request_check;
|
||||
+}
|
||||
+
|
||||
+static void virtio_ccw_mem_instance_init(Object *obj)
|
||||
+{
|
||||
+ VirtIOMEMCcw *dev = VIRTIO_MEM_CCW(obj);
|
||||
+ VirtIOMEMClass *vmc;
|
||||
+ VirtIOMEM *vmem;
|
||||
+
|
||||
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
|
||||
+ TYPE_VIRTIO_MEM);
|
||||
+
|
||||
+ dev->size_change_notifier.notify = virtio_ccw_mem_size_change_notify;
|
||||
+ vmem = &dev->vdev;
|
||||
+ vmc = VIRTIO_MEM_GET_CLASS(vmem);
|
||||
+ /*
|
||||
+ * We never remove the notifier again, as we expect both devices to
|
||||
+ * disappear at the same time.
|
||||
+ */
|
||||
+ vmc->add_size_change_notifier(vmem, &dev->size_change_notifier);
|
||||
+
|
||||
+ object_property_add_alias(obj, VIRTIO_MEM_BLOCK_SIZE_PROP,
|
||||
+ OBJECT(&dev->vdev), VIRTIO_MEM_BLOCK_SIZE_PROP);
|
||||
+ object_property_add_alias(obj, VIRTIO_MEM_SIZE_PROP, OBJECT(&dev->vdev),
|
||||
+ VIRTIO_MEM_SIZE_PROP);
|
||||
+ object_property_add(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP, "size",
|
||||
+ virtio_ccw_mem_get_requested_size,
|
||||
+ virtio_ccw_mem_set_requested_size, NULL, NULL);
|
||||
+}
|
||||
+
|
||||
+static const TypeInfo virtio_ccw_mem = {
|
||||
+ .name = TYPE_VIRTIO_MEM_CCW,
|
||||
+ .parent = TYPE_VIRTIO_MD_CCW,
|
||||
+ .instance_size = sizeof(VirtIOMEMCcw),
|
||||
+ .instance_init = virtio_ccw_mem_instance_init,
|
||||
+ .class_init = virtio_ccw_mem_class_init,
|
||||
+};
|
||||
+
|
||||
+static void virtio_ccw_mem_register_types(void)
|
||||
+{
|
||||
+ type_register_static(&virtio_ccw_mem);
|
||||
+}
|
||||
+type_init(virtio_ccw_mem_register_types)
|
||||
diff --git a/hw/s390x/virtio-ccw-mem.h b/hw/s390x/virtio-ccw-mem.h
|
||||
new file mode 100644
|
||||
index 0000000000..738ab2c744
|
||||
--- /dev/null
|
||||
+++ b/hw/s390x/virtio-ccw-mem.h
|
||||
@@ -0,0 +1,34 @@
|
||||
+/*
|
||||
+ * Virtio MEM CCW device
|
||||
+ *
|
||||
+ * Copyright (C) 2024 Red Hat, Inc.
|
||||
+ *
|
||||
+ * Authors:
|
||||
+ * David Hildenbrand <david@redhat.com>
|
||||
+ *
|
||||
+ * This work is licensed under the terms of the GNU GPL, version 2.
|
||||
+ * See the COPYING file in the top-level directory.
|
||||
+ */
|
||||
+
|
||||
+#ifndef HW_S390X_VIRTIO_CCW_MEM_H
|
||||
+#define HW_S390X_VIRTIO_CCW_MEM_H
|
||||
+
|
||||
+#include "virtio-ccw-md.h"
|
||||
+#include "hw/virtio/virtio-mem.h"
|
||||
+#include "qom/object.h"
|
||||
+
|
||||
+typedef struct VirtIOMEMCcw VirtIOMEMCcw;
|
||||
+
|
||||
+/*
|
||||
+ * virtio-mem-ccw: This extends VirtIOMDCcw
|
||||
+ */
|
||||
+#define TYPE_VIRTIO_MEM_CCW "virtio-mem-ccw"
|
||||
+DECLARE_INSTANCE_CHECKER(VirtIOMEMCcw, VIRTIO_MEM_CCW, TYPE_VIRTIO_MEM_CCW)
|
||||
+
|
||||
+struct VirtIOMEMCcw {
|
||||
+ VirtIOMDCcw parent_obj;
|
||||
+ VirtIOMEM vdev;
|
||||
+ Notifier size_change_notifier;
|
||||
+};
|
||||
+
|
||||
+#endif /* HW_S390X_VIRTIO_CCW_MEM_H */
|
||||
diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
|
||||
index 00da98b6e1..c9f8a23bbc 100644
|
||||
--- a/hw/virtio/virtio-mem.c
|
||||
+++ b/hw/virtio/virtio-mem.c
|
||||
@@ -61,6 +61,8 @@ static uint32_t virtio_mem_default_thp_size(void)
|
||||
} else if (qemu_real_host_page_size() == 64 * KiB) {
|
||||
default_thp_size = 512 * MiB;
|
||||
}
|
||||
+#elif defined(__s390x__)
|
||||
+ default_thp_size = 1 * MiB;
|
||||
#endif
|
||||
|
||||
return default_thp_size;
|
||||
@@ -161,7 +163,7 @@ static bool virtio_mem_has_shared_zeropage(RAMBlock *rb)
|
||||
* necessary (as the section size can change). But it's more likely that the
|
||||
* section size will rather get smaller and not bigger over time.
|
||||
*/
|
||||
-#if defined(TARGET_X86_64) || defined(TARGET_I386)
|
||||
+#if defined(TARGET_X86_64) || defined(TARGET_I386) || defined(TARGET_S390X)
|
||||
#define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB))
|
||||
#elif defined(TARGET_ARM)
|
||||
#define VIRTIO_MEM_USABLE_EXTENT (2 * (512 * MiB))
|
||||
--
|
||||
2.48.1
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user