qemu-kvm/kvm-s390x-pci-add-support-for-guests-that-request-direct.patch
Jon Maloy 0dc0c63dbf * Mon Jun 16 2025 Jon Maloy <jmaloy@redhat.com> - 9.1.0-24
- kvm-s390x-pci-add-support-for-guests-that-request-direct.patch [RHEL-11430]
- kvm-s390x-pci-indicate-QEMU-supports-relaxed-translation.patch [RHEL-11430]
- kvm-block-Expand-block-status-mode-from-bool-to-flags.patch [RHEL-82906 RHEL-83015]
- kvm-file-posix-gluster-Handle-zero-block-status-hint-bet.patch [RHEL-82906 RHEL-83015]
- kvm-block-Let-bdrv_co_is_zero_fast-consolidate-adjacent-.patch [RHEL-82906 RHEL-83015]
- kvm-block-Add-new-bdrv_co_is_all_zeroes-function.patch [RHEL-82906 RHEL-83015]
- kvm-iotests-Improve-iotest-194-to-mirror-data.patch [RHEL-82906 RHEL-83015]
- kvm-mirror-Minor-refactoring.patch [RHEL-82906 RHEL-83015]
- kvm-mirror-Pass-full-sync-mode-rather-than-bool-to-inter.patch [RHEL-82906 RHEL-83015]
- kvm-mirror-Allow-QMP-override-to-declare-target-already-.patch [RHEL-82906 RHEL-83015]
- kvm-mirror-Drop-redundant-zero_target-parameter.patch [RHEL-82906 RHEL-83015]
- kvm-mirror-Skip-pre-zeroing-destination-if-it-is-already.patch [RHEL-82906 RHEL-83015]
- kvm-mirror-Skip-writing-zeroes-when-target-is-already-ze.patch [RHEL-82906 RHEL-83015]
- kvm-iotests-common.rc-add-disk_usage-function.patch [RHEL-82906 RHEL-83015]
- kvm-tests-Add-iotest-mirror-sparse-for-recent-patches.patch [RHEL-82906 RHEL-83015]
- kvm-mirror-Reduce-I-O-when-destination-is-detect-zeroes-.patch [RHEL-82906 RHEL-83015]
- Resolves: RHEL-11430
  ([IBM 9.7 FEAT] KVM: Performance Enhanced Refresh PCI Translation - qemu part)
- Resolves: RHEL-82906
  (--migrate-disks-detect-zeroes doesn't take effect for disk migration [rhel-9.7])
- Resolves: RHEL-83015
  (Disk size of target raw image is full allocated when doing mirror with default discard value [rhel-9.7])
2025-06-16 17:59:51 -04:00

257 lines
9.8 KiB
Diff

From c60d0770ff3f9124e6e9d7beb03e1ef8067e8e26 Mon Sep 17 00:00:00 2001
From: Christoph Schlameuss <cschlame@redhat.com>
Date: Thu, 12 Jun 2025 13:25:32 +0200
Subject: [PATCH 01/16] s390x/pci: add support for guests that request direct
mapping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Christoph Schlameuss <None>
RH-MergeRequest: 376: Draft: KVM: Performance Enhanced Refresh PCI Translation
RH-Jira: RHEL-11430
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Commit: [1/2] 11d1dd9a5add55ae43d5d922588a33945ecbfe27 (cschlame/qemu-kvm)
JIRA: https://issues.redhat.com/browse/RHEL-11430
Conflicts: hw/s390x/s390-pci-bus.c: old s390_pci_device_properties[] still has DEFINE_PROP_END_OF_LIST()
hw/s390x/s390-pci-inst.c: hw_accel.h is still in sysemu
hw/s390x/s390-virtio-ccw.c: changes from ccw_machine_9_2_class_options() moved to ccw_rhel_machine_9_6_0_class_options()
commit dfcee1ea4c52ac60e0a06221eafb7b6253eb10c3
Author: Matthew Rosato <mjrosato@linux.ibm.com>
Date: Wed Feb 26 16:00:12 2025 -0500
s390x/pci: add support for guests that request direct mapping
When receiving a guest mpcifc(4) or mpcifc(6) instruction without the T
bit set, treat this as a request to perform direct mapping instead of
address translation. In order to facilitate this, pin the entirety of
guest memory into the host iommu.
Pinning for the direct mapping case is handled via vfio and its memory
listener. Additionally, ram discard settings are inherited from vfio:
coordinated discards (e.g. virtio-mem) are allowed while uncoordinated
discards (e.g. virtio-balloon) are disabled.
Subsequent guest DMA operations are all expected to be of the format
guest_phys+sdma, allowing them to be used as a lookup into the host
iommu table.
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Message-ID: <20250226210013.238349-2-mjrosato@linux.ibm.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Christoph Schlameuss <cschlame@redhat.com>
---
hw/s390x/s390-pci-bus.c | 39 +++++++++++++++++++++++++++++++--
hw/s390x/s390-pci-inst.c | 13 +++++++++--
hw/s390x/s390-pci-vfio.c | 23 +++++++++++++++----
hw/s390x/s390-virtio-ccw.c | 5 +++++
include/hw/s390x/s390-pci-bus.h | 3 +++
5 files changed, 75 insertions(+), 8 deletions(-)
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index 3e57d5faca..13bc02d837 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -18,6 +18,8 @@
#include "hw/s390x/s390-pci-inst.h"
#include "hw/s390x/s390-pci-kvm.h"
#include "hw/s390x/s390-pci-vfio.h"
+#include "hw/s390x/s390-virtio-ccw.h"
+#include "hw/boards.h"
#include "hw/pci/pci_bus.h"
#include "hw/qdev-properties.h"
#include "hw/pci/pci_bridge.h"
@@ -724,12 +726,42 @@ void s390_pci_iommu_enable(S390PCIIOMMU *iommu)
g_free(name);
}
+void s390_pci_iommu_direct_map_enable(S390PCIIOMMU *iommu)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ S390CcwMachineState *s390ms = S390_CCW_MACHINE(ms);
+
+ /*
+ * For direct-mapping we must map the entire guest address space. Rather
+ * than using an iommu, create a memory region alias that maps GPA X to
+ * IOVA X + SDMA. VFIO will handle pinning via its memory listener.
+ */
+ g_autofree char *name = g_strdup_printf("iommu-dm-s390-%04x",
+ iommu->pbdev->uid);
+
+ iommu->dm_mr = g_malloc0(sizeof(*iommu->dm_mr));
+ memory_region_init_alias(iommu->dm_mr, OBJECT(&iommu->mr), name,
+ get_system_memory(), 0,
+ s390_get_memory_limit(s390ms));
+ iommu->enabled = true;
+ memory_region_add_subregion(&iommu->mr, iommu->pbdev->zpci_fn.sdma,
+ iommu->dm_mr);
+}
+
void s390_pci_iommu_disable(S390PCIIOMMU *iommu)
{
iommu->enabled = false;
g_hash_table_remove_all(iommu->iotlb);
- memory_region_del_subregion(&iommu->mr, MEMORY_REGION(&iommu->iommu_mr));
- object_unparent(OBJECT(&iommu->iommu_mr));
+ if (iommu->dm_mr) {
+ memory_region_del_subregion(&iommu->mr, iommu->dm_mr);
+ object_unparent(OBJECT(iommu->dm_mr));
+ g_free(iommu->dm_mr);
+ iommu->dm_mr = NULL;
+ } else {
+ memory_region_del_subregion(&iommu->mr,
+ MEMORY_REGION(&iommu->iommu_mr));
+ object_unparent(OBJECT(&iommu->iommu_mr));
+ }
}
static void s390_pci_iommu_free(S390pciState *s, PCIBus *bus, int32_t devfn)
@@ -1130,6 +1162,7 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
/* Always intercept emulated devices */
pbdev->interp = false;
pbdev->forwarding_assist = false;
+ pbdev->rtr_avail = false;
}
if (s390_pci_msix_init(pbdev) && !pbdev->interp) {
@@ -1488,6 +1521,8 @@ static Property s390_pci_device_properties[] = {
DEFINE_PROP_BOOL("interpret", S390PCIBusDevice, interp, true),
DEFINE_PROP_BOOL("forwarding-assist", S390PCIBusDevice, forwarding_assist,
true),
+ DEFINE_PROP_BOOL("relaxed-translation", S390PCIBusDevice, rtr_avail,
+ true),
DEFINE_PROP_END_OF_LIST(),
};
diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c
index 30149546c0..803ebcd9b3 100644
--- a/hw/s390x/s390-pci-inst.c
+++ b/hw/s390x/s390-pci-inst.c
@@ -16,6 +16,7 @@
#include "exec/memory.h"
#include "qemu/error-report.h"
#include "sysemu/hw_accel.h"
+#include "hw/boards.h"
#include "hw/pci/pci_device.h"
#include "hw/s390x/s390-pci-inst.h"
#include "hw/s390x/s390-pci-bus.h"
@@ -1008,17 +1009,25 @@ static int reg_ioat(CPUS390XState *env, S390PCIBusDevice *pbdev, ZpciFib fib,
}
/* currently we only support designation type 1 with translation */
- if (!(dt == ZPCI_IOTA_RTTO && t)) {
+ if (t && dt != ZPCI_IOTA_RTTO) {
error_report("unsupported ioat dt %d t %d", dt, t);
s390_program_interrupt(env, PGM_OPERAND, ra);
return -EINVAL;
+ } else if (!t && !pbdev->rtr_avail) {
+ error_report("relaxed translation not allowed");
+ s390_program_interrupt(env, PGM_OPERAND, ra);
+ return -EINVAL;
}
iommu->pba = pba;
iommu->pal = pal;
iommu->g_iota = g_iota;
- s390_pci_iommu_enable(iommu);
+ if (t) {
+ s390_pci_iommu_enable(iommu);
+ } else {
+ s390_pci_iommu_direct_map_enable(iommu);
+ }
return 0;
}
diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
index 7dbbc76823..443e222912 100644
--- a/hw/s390x/s390-pci-vfio.c
+++ b/hw/s390x/s390-pci-vfio.c
@@ -131,13 +131,28 @@ static void s390_pci_read_base(S390PCIBusDevice *pbdev,
/* Store function type separately for type-specific behavior */
pbdev->pft = cap->pft;
+ /*
+ * If the device is a passthrough ISM device, disallow relaxed
+ * translation.
+ */
+ if (pbdev->pft == ZPCI_PFT_ISM) {
+ pbdev->rtr_avail = false;
+ }
+
/*
* If appropriate, reduce the size of the supported DMA aperture reported
- * to the guest based upon the vfio DMA limit.
+ * to the guest based upon the vfio DMA limit. This is applicable for
+ * devices that are guaranteed to not use relaxed translation. If the
+ * device is capable of relaxed translation then we must advertise the
+ * full aperture. In this case, if translation is used then we will
+ * rely on the vfio DMA limit counting and use RPCIT CC1 / status 16
+ * to request that the guest free DMA mappings as necessary.
*/
- vfio_size = pbdev->iommu->max_dma_limit << TARGET_PAGE_BITS;
- if (vfio_size > 0 && vfio_size < cap->end_dma - cap->start_dma + 1) {
- pbdev->zpci_fn.edma = cap->start_dma + vfio_size - 1;
+ if (!pbdev->rtr_avail) {
+ vfio_size = pbdev->iommu->max_dma_limit << TARGET_PAGE_BITS;
+ if (vfio_size > 0 && vfio_size < cap->end_dma - cap->start_dma + 1) {
+ pbdev->zpci_fn.edma = cap->start_dma + vfio_size - 1;
+ }
}
}
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index 312e8f18aa..77a1bde71e 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -1348,8 +1348,13 @@ static void ccw_rhel_machine_9_6_0_instance_options(MachineState *machine)
static void ccw_rhel_machine_9_6_0_class_options(MachineClass *mc)
{
+ static GlobalProperty compat[] = {
+ { TYPE_S390_PCI_DEVICE, "relaxed-translation", "off", },
+ };
+
/* NB: remember to move this line to the *latest* RHEL 9 machine */
compat_props_add(mc->compat_props, hw_compat_rhel_9, hw_compat_rhel_9_len);
+ compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
}
DEFINE_CCW_MACHINE_AS_LATEST(9, 6, 0);
diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h
index 2c43ea123f..04944d4fed 100644
--- a/include/hw/s390x/s390-pci-bus.h
+++ b/include/hw/s390x/s390-pci-bus.h
@@ -277,6 +277,7 @@ struct S390PCIIOMMU {
AddressSpace as;
MemoryRegion mr;
IOMMUMemoryRegion iommu_mr;
+ MemoryRegion *dm_mr;
bool enabled;
uint64_t g_iota;
uint64_t pba;
@@ -362,6 +363,7 @@ struct S390PCIBusDevice {
bool interp;
bool forwarding_assist;
bool aif;
+ bool rtr_avail;
QTAILQ_ENTRY(S390PCIBusDevice) link;
};
@@ -389,6 +391,7 @@ int pci_chsc_sei_nt2_have_event(void);
void s390_pci_sclp_configure(SCCB *sccb);
void s390_pci_sclp_deconfigure(SCCB *sccb);
void s390_pci_iommu_enable(S390PCIIOMMU *iommu);
+void s390_pci_iommu_direct_map_enable(S390PCIIOMMU *iommu);
void s390_pci_iommu_disable(S390PCIIOMMU *iommu);
void s390_pci_generate_error_event(uint16_t pec, uint32_t fh, uint32_t fid,
uint64_t faddr, uint32_t e);
--
2.48.1