* Wed Jan 24 2024 Miroslav Rezanina <mrezanin@redhat.com> - 8.2.0-3

- kvm-hw-arm-virt-Add-properties-to-disable-high-memory-re.patch [RHEL-19738]
- kvm-vfio-Introduce-base-object-for-VFIOContainer-and-tar.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Introduce-a-empty-VFIOIOMMUOps.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Switch-to-dma_map-unmap-API.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-common-Introduce-vfio_container_init-destroy-he.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-common-Move-giommu_list-in-base-container.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Move-space-field-to-base-container.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Switch-to-IOMMU-BE-set_dirty_page_tra.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Move-per-container-device-list-in-bas.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Convert-functions-to-base-container.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Move-pgsizes-and-dma_max_mappings-to-.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Move-vrdl_list-to-base-container.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Move-listener-to-base-container.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Move-dirty_pgsizes-and-max_dirty_bitm.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Move-iova_ranges-to-base-container.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Implement-attach-detach_device.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-spapr-Introduce-spapr-backend-and-target-interf.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-spapr-switch-to-spapr-IOMMU-BE-add-del_section_.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-spapr-Move-prereg_listener-into-spapr-container.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-spapr-Move-hostwin_list-into-spapr-container.patch [RHEL-19302 RHEL-21057]
- kvm-backends-iommufd-Introduce-the-iommufd-object.patch [RHEL-19302 RHEL-21057]
- kvm-util-char_dev-Add-open_cdev.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-common-return-early-if-space-isn-t-empty.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-iommufd-Implement-the-iommufd-backend.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-iommufd-Relax-assert-check-for-iommufd-backend.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-iommufd-Add-support-for-iova_ranges-and-pgsizes.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-pci-Extract-out-a-helper-vfio_pci_get_pci_hot_r.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-pci-Introduce-a-vfio-pci-hot-reset-interface.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-iommufd-Enable-pci-hot-reset-through-iommufd-cd.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-pci-Allow-the-selection-of-a-given-iommu-backen.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-pci-Make-vfio-cdev-pre-openable-by-passing-a-fi.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-platform-Allow-the-selection-of-a-given-iommu-b.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-platform-Make-vfio-cdev-pre-openable-by-passing.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-ap-Allow-the-selection-of-a-given-iommu-backend.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-ap-Make-vfio-cdev-pre-openable-by-passing-a-fil.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-ccw-Allow-the-selection-of-a-given-iommu-backen.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-ccw-Make-vfio-cdev-pre-openable-by-passing-a-fi.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-Make-VFIOContainerBase-poiner-parameter-const-i.patch [RHEL-19302 RHEL-21057]
- kvm-hw-arm-Activate-IOMMUFD-for-virt-machines.patch [RHEL-19302 RHEL-21057]
- kvm-kconfig-Activate-IOMMUFD-for-s390x-machines.patch [RHEL-19302 RHEL-21057]
- kvm-hw-i386-Activate-IOMMUFD-for-q35-machines.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-pci-Move-VFIODevice-initializations-in-vfio_ins.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-platform-Move-VFIODevice-initializations-in-vfi.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-ap-Move-VFIODevice-initializations-in-vfio_ap_i.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-ccw-Move-VFIODevice-initializations-in-vfio_ccw.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-Introduce-a-helper-function-to-initialize-VFIOD.patch [RHEL-19302 RHEL-21057]
- kvm-docs-devel-Add-VFIO-iommufd-backend-documentation.patch [RHEL-19302 RHEL-21057]
- kvm-hw-ppc-Kconfig-Imply-VFIO_PCI.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-spapr-Extend-VFIOIOMMUOps-with-a-release-handle.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Introduce-vfio_legacy_setup-for-furth.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Initialize-VFIOIOMMUOps-under-vfio_in.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Introduce-a-VFIOIOMMU-QOM-interface.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Introduce-a-VFIOIOMMU-legacy-QOM-inte.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Intoduce-a-new-VFIOIOMMUClass-setup-h.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-spapr-Introduce-a-sPAPR-VFIOIOMMU-QOM-interface.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-iommufd-Introduce-a-VFIOIOMMU-iommufd-QOM-inter.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-spapr-Only-compile-sPAPR-IOMMU-support-when-nee.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-iommufd-Remove-CONFIG_IOMMUFD-usage.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Replace-basename-with-g_path_get_base.patch [RHEL-19302 RHEL-21057]
- kvm-hw-vfio-fix-iteration-over-global-VFIODevice-list.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-iommufd-Remove-the-use-of-stat-to-check-file-ex.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-container-Rename-vfio_init_container-to-vfio_se.patch [RHEL-19302 RHEL-21057]
- kvm-vfio-migration-Add-helper-function-to-set-state-or-r.patch [RHEL-19302 RHEL-21057]
- kvm-backends-iommufd-Remove-check-on-number-of-backend-u.patch [RHEL-19302 RHEL-21057]
- kvm-backends-iommufd-Remove-mutex.patch [RHEL-19302 RHEL-21057]
- kvm-Compile-IOMMUFD-object-on-aarch64.patch [RHEL-19302 RHEL-21057]
- kvm-Compile-IOMMUFD-on-s390x.patch [RHEL-19302 RHEL-21057]
- kvm-Compile-IOMMUFD-on-x86_64.patch [RHEL-19302 RHEL-21057]
- kvm-target-s390x-kvm-pv-Provide-some-more-useful-informa.patch [RHEL-18212]
- kvm-nbd-server-avoid-per-NBDRequest-nbd_client_get-put.patch [RHEL-15965]
- kvm-nbd-server-only-traverse-NBDExport-clients-from-main.patch [RHEL-15965]
- kvm-nbd-server-introduce-NBDClient-lock-to-protect-field.patch [RHEL-15965]
- kvm-block-file-posix-set-up-Linux-AIO-and-io_uring-in-th.patch [RHEL-15965]
- kvm-virtio-blk-add-lock-to-protect-s-rq.patch [RHEL-15965]
- kvm-virtio-blk-don-t-lock-AioContext-in-the-completion-c.patch [RHEL-15965]
- kvm-virtio-blk-don-t-lock-AioContext-in-the-submission-c.patch [RHEL-15965]
- kvm-scsi-only-access-SCSIDevice-requests-from-one-thread.patch [RHEL-15965]
- kvm-virtio-scsi-don-t-lock-AioContext-around-virtio_queu.patch [RHEL-15965]
- kvm-scsi-don-t-lock-AioContext-in-I-O-code-path.patch [RHEL-15965]
- kvm-dma-helpers-don-t-lock-AioContext-in-dma_blk_cb.patch [RHEL-15965]
- kvm-virtio-scsi-replace-AioContext-lock-with-tmf_bh_lock.patch [RHEL-15965]
- kvm-scsi-assert-that-callbacks-run-in-the-correct-AioCon.patch [RHEL-15965]
- kvm-tests-remove-aio_context_acquire-tests.patch [RHEL-15965]
- kvm-aio-make-aio_context_acquire-aio_context_release-a-n.patch [RHEL-15965]
- kvm-graph-lock-remove-AioContext-locking.patch [RHEL-15965]
- kvm-block-remove-AioContext-locking.patch [RHEL-15965]
- kvm-block-remove-bdrv_co_lock.patch [RHEL-15965]
- kvm-scsi-remove-AioContext-locking.patch [RHEL-15965]
- kvm-aio-wait-draw-equivalence-between-AIO_WAIT_WHILE-and.patch [RHEL-15965]
- kvm-aio-remove-aio_context_acquire-aio_context_release-A.patch [RHEL-15965]
- kvm-docs-remove-AioContext-lock-from-IOThread-docs.patch [RHEL-15965]
- kvm-scsi-remove-outdated-AioContext-lock-comment.patch [RHEL-15965]
- kvm-job-remove-outdated-AioContext-locking-comments.patch [RHEL-15965]
- kvm-block-remove-outdated-AioContext-locking-comments.patch [RHEL-15965]
- kvm-block-coroutine-wrapper-use-qemu_get_current_aio_con.patch [RHEL-15965]
- kvm-s390x-pci-avoid-double-enable-disable-of-aif.patch [RHEL-21169]
- kvm-s390x-pci-refresh-fh-before-disabling-aif.patch [RHEL-21169]
- kvm-s390x-pci-drive-ISM-reset-from-subsystem-reset.patch [RHEL-21169]
- kvm-include-ui-rect.h-fix-qemu_rect_init-mis-assignment.patch [RHEL-21570]
- kvm-virtio-gpu-block-migration-of-VMs-with-blob-true.patch [RHEL-7565]
- kvm-spec-Enable-zstd.patch [RHEL-7361]
- Resolves: RHEL-19738
  (Enable properties allowing to disable high memory regions)
- Resolves: RHEL-19302
  (NVIDIA:Grace-Hopper Backport QEMU IOMMUFD Backend)
- Resolves: RHEL-21057
  (Request backport of 9353b6da430f90e47f352dbf6dc31120c8914da6)
- Resolves: RHEL-18212
  ([RHEL9][Secure-execution][s390x] The error message is not clear when boot up a SE guest with wrong encryption)
- Resolves: RHEL-15965
  ( [qemu-kvm] Remove AioContext lock (no response with QMP command block_resize))
- Resolves: RHEL-21169
  ([s390x] VM fails to start with ISM passed through QEMU 8.2)
- Resolves: RHEL-21570
  (Critical performance degradation for input devices in virtio vnc session)
- Resolves: RHEL-7565
  (qemu crashed when migrate guest with blob resources enabled)
- Resolves: RHEL-7361
  ([qemu-kvm] Enable zstd support for qcow2 files)
This commit is contained in:
Miroslav Rezanina 2024-01-24 04:26:42 -05:00
parent 25859dae3b
commit 9ef7cdf7ca
101 changed files with 20222 additions and 1 deletions

View File

@ -0,0 +1,37 @@
From 363d6aedc82314a70bdfbe9fa23b7e8fdda50138 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 11 Jan 2024 12:26:19 -0500
Subject: [PATCH 066/101] Compile IOMMUFD object on aarch64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [65/67] 9358030fdd499c5fe122dee3bb4f114966fac9c2 (eauger1/centos-qemu-kvm)
Upstream: RHEL only
Compiles the IOMMUFD object on aarch64 to be able to use
the IOMMUFD VFIO backend.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
configs/devices/aarch64-softmmu/aarch64-rh-devices.mak | 1 +
1 file changed, 1 insertion(+)
diff --git a/configs/devices/aarch64-softmmu/aarch64-rh-devices.mak b/configs/devices/aarch64-softmmu/aarch64-rh-devices.mak
index aec1831199..b0191d3c69 100644
--- a/configs/devices/aarch64-softmmu/aarch64-rh-devices.mak
+++ b/configs/devices/aarch64-softmmu/aarch64-rh-devices.mak
@@ -39,3 +39,4 @@ CONFIG_PXB=y
CONFIG_VHOST_VSOCK=y
CONFIG_VHOST_USER_VSOCK=y
CONFIG_VHOST_USER_FS=y
+CONFIG_IOMMUFD=y
--
2.39.3

View File

@ -0,0 +1,37 @@
From c1e9ddf8d0ea6d358fcaa5cacd3a91920f36e73b Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 11 Jan 2024 12:33:17 -0500
Subject: [PATCH 067/101] Compile IOMMUFD on s390x
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [66/67] d3004aafca2bb76d817ac99c3d65973b8fbd4557 (eauger1/centos-qemu-kvm)
Upstream: RHEL only
Compiles the IOMMUFD object on s390x to be able to use
the IOMMUFD VFIO backend.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
configs/devices/s390x-softmmu/s390x-rh-devices.mak | 1 +
1 file changed, 1 insertion(+)
diff --git a/configs/devices/s390x-softmmu/s390x-rh-devices.mak b/configs/devices/s390x-softmmu/s390x-rh-devices.mak
index 69a799adbd..24cf6dbd03 100644
--- a/configs/devices/s390x-softmmu/s390x-rh-devices.mak
+++ b/configs/devices/s390x-softmmu/s390x-rh-devices.mak
@@ -16,3 +16,4 @@ CONFIG_WDT_DIAG288=y
CONFIG_VHOST_VSOCK=y
CONFIG_VHOST_USER_VSOCK=y
CONFIG_VHOST_USER_FS=y
+CONFIG_IOMMUFD=y
--
2.39.3

View File

@ -0,0 +1,37 @@
From be2c3d9bbee1bdec061c901f507bc999fa40a53e Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 11 Jan 2024 12:34:44 -0500
Subject: [PATCH 068/101] Compile IOMMUFD on x86_64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [67/67] 411d48a5cc7ce1f05be793fd9a89c143ce34c91a (eauger1/centos-qemu-kvm)
Upstream: RHEL only
Compiles the IOMMUFD object on s390x to be able to use
the IOMMUFD VFIO backend.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
configs/devices/x86_64-softmmu/x86_64-rh-devices.mak | 1 +
1 file changed, 1 insertion(+)
diff --git a/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak b/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak
index ce5be73633..ba41108e0c 100644
--- a/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak
+++ b/configs/devices/x86_64-softmmu/x86_64-rh-devices.mak
@@ -108,3 +108,4 @@ CONFIG_SGX=y
CONFIG_VHOST_VSOCK=y
CONFIG_VHOST_USER_VSOCK=y
CONFIG_VHOST_USER_FS=y
+CONFIG_IOMMUFD=y
--
2.39.3

View File

@ -0,0 +1,60 @@
From 6b5cfed21e20b372090046a934387255ff4bda58 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:20:01 -0500
Subject: [PATCH 084/101] aio: make aio_context_acquire()/aio_context_release()
a no-op
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [15/26] 723dcada900aaf08862e8221921be22506b561a8 (kmwolf/centos-qemu-kvm)
aio_context_acquire()/aio_context_release() has been replaced by
fine-grained locking to protect state shared by multiple threads. The
AioContext lock still plays the role of balancing locking in
AIO_WAIT_WHILE() and many functions in QEMU either require that the
AioContext lock is held or not held for this reason. In other words, the
AioContext lock is purely there for consistency with itself and serves
no real purpose anymore.
Stop actually acquiring/releasing the lock in
aio_context_acquire()/aio_context_release() so that subsequent patches
can remove callers across the codebase incrementally.
I have performed "make check" and qemu-iotests stress tests across
x86-64, ppc64le, and aarch64 to confirm that there are no failures as a
result of eliminating the lock.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231205182011.1976568-5-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
util/async.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/util/async.c b/util/async.c
index 8f90ddc304..04ee83d220 100644
--- a/util/async.c
+++ b/util/async.c
@@ -725,12 +725,12 @@ void aio_context_unref(AioContext *ctx)
void aio_context_acquire(AioContext *ctx)
{
- qemu_rec_mutex_lock(&ctx->lock);
+ /* TODO remove this function */
}
void aio_context_release(AioContext *ctx)
{
- qemu_rec_mutex_unlock(&ctx->lock);
+ /* TODO remove this function */
}
QEMU_DEFINE_STATIC_CO_TLS(AioContext *, my_aiocontext)
--
2.39.3

View File

@ -0,0 +1,102 @@
From 14913d8970090c8914dc19dad14f3b9f91985ec3 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:20:07 -0500
Subject: [PATCH 090/101] aio: remove
aio_context_acquire()/aio_context_release() API
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [21/26] 4b6d4afcac79d3248a6722b063b5fc777dc418df (kmwolf/centos-qemu-kvm)
Delete these functions because nothing calls these functions anymore.
I introduced these APIs in commit 98563fc3ec44 ("aio: add
aio_context_acquire() and aio_context_release()") in 2014. It's with a
sigh of relief that I delete these APIs almost 10 years later.
Thanks to Paolo Bonzini's vision for multi-queue QEMU, we got an
understanding of where the code needed to go in order to remove the
limitations that the original dataplane and the IOThread/AioContext
approach that followed it.
Emanuele Giuseppe Esposito had the splendid determination to convert
large parts of the codebase so that they no longer needed the AioContext
lock. This was a painstaking process, both in the actual code changes
required and the iterations of code review that Emanuele eked out of
Kevin and me over many months.
Kevin Wolf tackled multitudes of graph locking conversions to protect
in-flight I/O from run-time changes to the block graph as well as the
clang Thread Safety Analysis annotations that allow the compiler to
check whether the graph lock is being used correctly.
And me, well, I'm just here to add some pizzazz to the QEMU multi-queue
block layer :). Thank you to everyone who helped with this effort,
including Eric Blake, code reviewer extraordinaire, and others who I've
forgotten to mention.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-11-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
include/block/aio.h | 17 -----------------
util/async.c | 10 ----------
2 files changed, 27 deletions(-)
diff --git a/include/block/aio.h b/include/block/aio.h
index f08b358077..af05512a7d 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -278,23 +278,6 @@ void aio_context_ref(AioContext *ctx);
*/
void aio_context_unref(AioContext *ctx);
-/* Take ownership of the AioContext. If the AioContext will be shared between
- * threads, and a thread does not want to be interrupted, it will have to
- * take ownership around calls to aio_poll(). Otherwise, aio_poll()
- * automatically takes care of calling aio_context_acquire and
- * aio_context_release.
- *
- * Note that this is separate from bdrv_drained_begin/bdrv_drained_end. A
- * thread still has to call those to avoid being interrupted by the guest.
- *
- * Bottom halves, timers and callbacks can be created or removed without
- * acquiring the AioContext.
- */
-void aio_context_acquire(AioContext *ctx);
-
-/* Relinquish ownership of the AioContext. */
-void aio_context_release(AioContext *ctx);
-
/**
* aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that will
* run only once and as soon as possible.
diff --git a/util/async.c b/util/async.c
index dfd44ef612..460529057c 100644
--- a/util/async.c
+++ b/util/async.c
@@ -719,16 +719,6 @@ void aio_context_unref(AioContext *ctx)
g_source_unref(&ctx->source);
}
-void aio_context_acquire(AioContext *ctx)
-{
- /* TODO remove this function */
-}
-
-void aio_context_release(AioContext *ctx)
-{
- /* TODO remove this function */
-}
-
QEMU_DEFINE_STATIC_CO_TLS(AioContext *, my_aiocontext)
AioContext *qemu_get_current_aio_context(void)
--
2.39.3

View File

@ -0,0 +1,81 @@
From e1e2f3972065c4b5d6fcf37e0e1c4fb92a0d5260 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:20:06 -0500
Subject: [PATCH 089/101] aio-wait: draw equivalence between AIO_WAIT_WHILE()
and AIO_WAIT_WHILE_UNLOCKED()
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [20/26] 20e49777869714c99769263103f1b0c2c370cfcd (kmwolf/centos-qemu-kvm)
Now that the AioContext lock no longer exists, AIO_WAIT_WHILE() and
AIO_WAIT_WHILE_UNLOCKED() are equivalent.
A future patch will get rid of AIO_WAIT_WHILE_UNLOCKED().
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-10-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
include/block/aio-wait.h | 16 ++++------------
1 file changed, 4 insertions(+), 12 deletions(-)
diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h
index 5449b6d742..157f105916 100644
--- a/include/block/aio-wait.h
+++ b/include/block/aio-wait.h
@@ -63,9 +63,6 @@ extern AioWait global_aio_wait;
* @ctx: the aio context, or NULL if multiple aio contexts (for which the
* caller does not hold a lock) are involved in the polling condition.
* @cond: wait while this conditional expression is true
- * @unlock: whether to unlock and then lock again @ctx. This applies
- * only when waiting for another AioContext from the main loop.
- * Otherwise it's ignored.
*
* Wait while a condition is true. Use this to implement synchronous
* operations that require event loop activity.
@@ -78,7 +75,7 @@ extern AioWait global_aio_wait;
* wait on conditions between two IOThreads since that could lead to deadlock,
* go via the main loop instead.
*/
-#define AIO_WAIT_WHILE_INTERNAL(ctx, cond, unlock) ({ \
+#define AIO_WAIT_WHILE_INTERNAL(ctx, cond) ({ \
bool waited_ = false; \
AioWait *wait_ = &global_aio_wait; \
AioContext *ctx_ = (ctx); \
@@ -95,13 +92,7 @@ extern AioWait global_aio_wait;
assert(qemu_get_current_aio_context() == \
qemu_get_aio_context()); \
while ((cond)) { \
- if (unlock && ctx_) { \
- aio_context_release(ctx_); \
- } \
aio_poll(qemu_get_aio_context(), true); \
- if (unlock && ctx_) { \
- aio_context_acquire(ctx_); \
- } \
waited_ = true; \
} \
} \
@@ -109,10 +100,11 @@ extern AioWait global_aio_wait;
waited_; })
#define AIO_WAIT_WHILE(ctx, cond) \
- AIO_WAIT_WHILE_INTERNAL(ctx, cond, true)
+ AIO_WAIT_WHILE_INTERNAL(ctx, cond)
+/* TODO replace this with AIO_WAIT_WHILE() in a future patch */
#define AIO_WAIT_WHILE_UNLOCKED(ctx, cond) \
- AIO_WAIT_WHILE_INTERNAL(ctx, cond, false)
+ AIO_WAIT_WHILE_INTERNAL(ctx, cond)
/**
* aio_wait_kick:
--
2.39.3

View File

@ -0,0 +1,476 @@
From 0d8255c98b3ef6f603ff0279592d3e91de26de0e Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Tue, 21 Nov 2023 16:44:00 +0800
Subject: [PATCH 021/101] backends/iommufd: Introduce the iommufd object
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [20/67] 8a56344ab4a2126f248bfa492ccddd19265f39be (eauger1/centos-qemu-kvm)
Introduce an iommufd object which allows the interaction
with the host /dev/iommu device.
The /dev/iommu can have been already pre-opened outside of qemu,
in which case the fd can be passed directly along with the
iommufd object:
This allows the iommufd object to be shared accross several
subsystems (VFIO, VDPA, ...). For example, libvirt would open
the /dev/iommu once.
If no fd is passed along with the iommufd object, the /dev/iommu
is opened by the qemu code.
Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 6e6d8ac62b5b38dc9d4b69ffdf073f0a0b43b7be)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
MAINTAINERS | 8 ++
backends/Kconfig | 4 +
backends/iommufd.c | 245 +++++++++++++++++++++++++++++++++++++++
backends/meson.build | 1 +
backends/trace-events | 10 ++
include/sysemu/iommufd.h | 38 ++++++
qapi/qom.json | 19 +++
qemu-options.hx | 12 ++
8 files changed, 337 insertions(+)
create mode 100644 backends/iommufd.c
create mode 100644 include/sysemu/iommufd.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 695e0bd34f..a5a446914a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2167,6 +2167,14 @@ F: hw/vfio/ap.c
F: docs/system/s390x/vfio-ap.rst
L: qemu-s390x@nongnu.org
+iommufd
+M: Yi Liu <yi.l.liu@intel.com>
+M: Eric Auger <eric.auger@redhat.com>
+M: Zhenzhong Duan <zhenzhong.duan@intel.com>
+S: Supported
+F: backends/iommufd.c
+F: include/sysemu/iommufd.h
+
vhost
M: Michael S. Tsirkin <mst@redhat.com>
S: Supported
diff --git a/backends/Kconfig b/backends/Kconfig
index f35abc1609..2cb23f62fa 100644
--- a/backends/Kconfig
+++ b/backends/Kconfig
@@ -1 +1,5 @@
source tpm/Kconfig
+
+config IOMMUFD
+ bool
+ depends on VFIO
diff --git a/backends/iommufd.c b/backends/iommufd.c
new file mode 100644
index 0000000000..ba58a0eb0d
--- /dev/null
+++ b/backends/iommufd.c
@@ -0,0 +1,245 @@
+/*
+ * iommufd container backend
+ *
+ * Copyright (C) 2023 Intel Corporation.
+ * Copyright Red Hat, Inc. 2023
+ *
+ * Authors: Yi Liu <yi.l.liu@intel.com>
+ * Eric Auger <eric.auger@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/iommufd.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/module.h"
+#include "qom/object_interfaces.h"
+#include "qemu/error-report.h"
+#include "monitor/monitor.h"
+#include "trace.h"
+#include <sys/ioctl.h>
+#include <linux/iommufd.h>
+
+static void iommufd_backend_init(Object *obj)
+{
+ IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
+
+ be->fd = -1;
+ be->users = 0;
+ be->owned = true;
+ qemu_mutex_init(&be->lock);
+}
+
+static void iommufd_backend_finalize(Object *obj)
+{
+ IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
+
+ if (be->owned) {
+ close(be->fd);
+ be->fd = -1;
+ }
+}
+
+static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp)
+{
+ IOMMUFDBackend *be = IOMMUFD_BACKEND(obj);
+ int fd = -1;
+
+ fd = monitor_fd_param(monitor_cur(), str, errp);
+ if (fd == -1) {
+ error_prepend(errp, "Could not parse remote object fd %s:", str);
+ return;
+ }
+ qemu_mutex_lock(&be->lock);
+ be->fd = fd;
+ be->owned = false;
+ qemu_mutex_unlock(&be->lock);
+ trace_iommu_backend_set_fd(be->fd);
+}
+
+static bool iommufd_backend_can_be_deleted(UserCreatable *uc)
+{
+ IOMMUFDBackend *be = IOMMUFD_BACKEND(uc);
+
+ return !be->users;
+}
+
+static void iommufd_backend_class_init(ObjectClass *oc, void *data)
+{
+ UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);
+
+ ucc->can_be_deleted = iommufd_backend_can_be_deleted;
+
+ object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd);
+}
+
+int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
+{
+ int fd, ret = 0;
+
+ qemu_mutex_lock(&be->lock);
+ if (be->users == UINT32_MAX) {
+ error_setg(errp, "too many connections");
+ ret = -E2BIG;
+ goto out;
+ }
+ if (be->owned && !be->users) {
+ fd = qemu_open_old("/dev/iommu", O_RDWR);
+ if (fd < 0) {
+ error_setg_errno(errp, errno, "/dev/iommu opening failed");
+ ret = fd;
+ goto out;
+ }
+ be->fd = fd;
+ }
+ be->users++;
+out:
+ trace_iommufd_backend_connect(be->fd, be->owned,
+ be->users, ret);
+ qemu_mutex_unlock(&be->lock);
+ return ret;
+}
+
+void iommufd_backend_disconnect(IOMMUFDBackend *be)
+{
+ qemu_mutex_lock(&be->lock);
+ if (!be->users) {
+ goto out;
+ }
+ be->users--;
+ if (!be->users && be->owned) {
+ close(be->fd);
+ be->fd = -1;
+ }
+out:
+ trace_iommufd_backend_disconnect(be->fd, be->users);
+ qemu_mutex_unlock(&be->lock);
+}
+
+int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
+ Error **errp)
+{
+ int ret, fd = be->fd;
+ struct iommu_ioas_alloc alloc_data = {
+ .size = sizeof(alloc_data),
+ .flags = 0,
+ };
+
+ ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data);
+ if (ret) {
+ error_setg_errno(errp, errno, "Failed to allocate ioas");
+ return ret;
+ }
+
+ *ioas_id = alloc_data.out_ioas_id;
+ trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret);
+
+ return ret;
+}
+
+void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id)
+{
+ int ret, fd = be->fd;
+ struct iommu_destroy des = {
+ .size = sizeof(des),
+ .id = id,
+ };
+
+ ret = ioctl(fd, IOMMU_DESTROY, &des);
+ trace_iommufd_backend_free_id(fd, id, ret);
+ if (ret) {
+ error_report("Failed to free id: %u %m", id);
+ }
+}
+
+int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
+ ram_addr_t size, void *vaddr, bool readonly)
+{
+ int ret, fd = be->fd;
+ struct iommu_ioas_map map = {
+ .size = sizeof(map),
+ .flags = IOMMU_IOAS_MAP_READABLE |
+ IOMMU_IOAS_MAP_FIXED_IOVA,
+ .ioas_id = ioas_id,
+ .__reserved = 0,
+ .user_va = (uintptr_t)vaddr,
+ .iova = iova,
+ .length = size,
+ };
+
+ if (!readonly) {
+ map.flags |= IOMMU_IOAS_MAP_WRITEABLE;
+ }
+
+ ret = ioctl(fd, IOMMU_IOAS_MAP, &map);
+ trace_iommufd_backend_map_dma(fd, ioas_id, iova, size,
+ vaddr, readonly, ret);
+ if (ret) {
+ ret = -errno;
+
+ /* TODO: Not support mapping hardware PCI BAR region for now. */
+ if (errno == EFAULT) {
+ warn_report("IOMMU_IOAS_MAP failed: %m, PCI BAR?");
+ } else {
+ error_report("IOMMU_IOAS_MAP failed: %m");
+ }
+ }
+ return ret;
+}
+
+int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
+ hwaddr iova, ram_addr_t size)
+{
+ int ret, fd = be->fd;
+ struct iommu_ioas_unmap unmap = {
+ .size = sizeof(unmap),
+ .ioas_id = ioas_id,
+ .iova = iova,
+ .length = size,
+ };
+
+ ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap);
+ /*
+ * IOMMUFD takes mapping as some kind of object, unmapping
+ * nonexistent mapping is treated as deleting a nonexistent
+ * object and return ENOENT. This is different from legacy
+ * backend which allows it. vIOMMU may trigger a lot of
+ * redundant unmapping, to avoid flush the log, treat them
+ * as succeess for IOMMUFD just like legacy backend.
+ */
+ if (ret && errno == ENOENT) {
+ trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, ret);
+ ret = 0;
+ } else {
+ trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret);
+ }
+
+ if (ret) {
+ ret = -errno;
+ error_report("IOMMU_IOAS_UNMAP failed: %m");
+ }
+ return ret;
+}
+
+static const TypeInfo iommufd_backend_info = {
+ .name = TYPE_IOMMUFD_BACKEND,
+ .parent = TYPE_OBJECT,
+ .instance_size = sizeof(IOMMUFDBackend),
+ .instance_init = iommufd_backend_init,
+ .instance_finalize = iommufd_backend_finalize,
+ .class_size = sizeof(IOMMUFDBackendClass),
+ .class_init = iommufd_backend_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_USER_CREATABLE },
+ { }
+ }
+};
+
+static void register_types(void)
+{
+ type_register_static(&iommufd_backend_info);
+}
+
+type_init(register_types);
diff --git a/backends/meson.build b/backends/meson.build
index 914c7c4afb..9a5cea480d 100644
--- a/backends/meson.build
+++ b/backends/meson.build
@@ -20,6 +20,7 @@ if have_vhost_user
system_ss.add(when: 'CONFIG_VIRTIO', if_true: files('vhost-user.c'))
endif
system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost.c'))
+system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c'))
if have_vhost_user_crypto
system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost-user.c'))
endif
diff --git a/backends/trace-events b/backends/trace-events
index 652eb76a57..d45c6e31a6 100644
--- a/backends/trace-events
+++ b/backends/trace-events
@@ -5,3 +5,13 @@ dbus_vmstate_pre_save(void)
dbus_vmstate_post_load(int version_id) "version_id: %d"
dbus_vmstate_loading(const char *id) "id: %s"
dbus_vmstate_saving(const char *id) "id: %s"
+
+# iommufd.c
+iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d owned=%d users=%d (%d)"
+iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d"
+iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d"
+iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)"
+iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
+iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
+iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)"
+iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)"
diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
new file mode 100644
index 0000000000..9c5524b0ed
--- /dev/null
+++ b/include/sysemu/iommufd.h
@@ -0,0 +1,38 @@
+#ifndef SYSEMU_IOMMUFD_H
+#define SYSEMU_IOMMUFD_H
+
+#include "qom/object.h"
+#include "qemu/thread.h"
+#include "exec/hwaddr.h"
+#include "exec/cpu-common.h"
+
+#define TYPE_IOMMUFD_BACKEND "iommufd"
+OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND)
+
+struct IOMMUFDBackendClass {
+ ObjectClass parent_class;
+};
+
+struct IOMMUFDBackend {
+ Object parent;
+
+ /*< protected >*/
+ int fd; /* /dev/iommu file descriptor */
+ bool owned; /* is the /dev/iommu opened internally */
+ QemuMutex lock;
+ uint32_t users;
+
+ /*< public >*/
+};
+
+int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp);
+void iommufd_backend_disconnect(IOMMUFDBackend *be);
+
+int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
+ Error **errp);
+void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id);
+int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
+ ram_addr_t size, void *vaddr, bool readonly);
+int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
+ hwaddr iova, ram_addr_t size);
+#endif
diff --git a/qapi/qom.json b/qapi/qom.json
index c53ef978ff..95516ba325 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -794,6 +794,23 @@
{ 'struct': 'VfioUserServerProperties',
'data': { 'socket': 'SocketAddress', 'device': 'str' } }
+##
+# @IOMMUFDProperties:
+#
+# Properties for iommufd objects.
+#
+# @fd: file descriptor name previously passed via 'getfd' command,
+# which represents a pre-opened /dev/iommu. This allows the
+# iommufd object to be shared accross several subsystems
+# (VFIO, VDPA, ...), and the file descriptor to be shared
+# with other process, e.g. DPDK. (default: QEMU opens
+# /dev/iommu by itself)
+#
+# Since: 9.0
+##
+{ 'struct': 'IOMMUFDProperties',
+ 'data': { '*fd': 'str' } }
+
##
# @RngProperties:
#
@@ -934,6 +951,7 @@
'input-barrier',
{ 'name': 'input-linux',
'if': 'CONFIG_LINUX' },
+ 'iommufd',
'iothread',
'main-loop',
{ 'name': 'memory-backend-epc',
@@ -1003,6 +1021,7 @@
'input-barrier': 'InputBarrierProperties',
'input-linux': { 'type': 'InputLinuxProperties',
'if': 'CONFIG_LINUX' },
+ 'iommufd': 'IOMMUFDProperties',
'iothread': 'IothreadProperties',
'main-loop': 'MainLoopProperties',
'memory-backend-epc': { 'type': 'MemoryBackendEpcProperties',
diff --git a/qemu-options.hx b/qemu-options.hx
index 557118cb1f..0814f43066 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -5224,6 +5224,18 @@ SRST
The ``share`` boolean option is on by default with memfd.
+ ``-object iommufd,id=id[,fd=fd]``
+ Creates an iommufd backend which allows control of DMA mapping
+ through the ``/dev/iommu`` device.
+
+ The ``id`` parameter is a unique ID which frontends (such as
+ vfio-pci of vdpa) will use to connect with the iommufd backend.
+
+ The ``fd`` parameter is an optional pre-opened file descriptor
+ resulting from ``/dev/iommu`` opening. Usually the iommufd is shared
+ across all subsystems, bringing the benefit of centralized
+ reference counting.
+
``-object rng-builtin,id=id``
Creates a random number generator backend which obtains entropy
from QEMU builtin functions. The ``id`` parameter is a unique ID
--
2.39.3

View File

@ -0,0 +1,47 @@
From da9a24793e876f6f2727d57f939d882be26a47b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Fri, 22 Dec 2023 08:55:23 +0100
Subject: [PATCH 064/101] backends/iommufd: Remove check on number of backend
users
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [63/67] ac4d4589d1f2de5ac3f0adfd8d1f27dbf6bbfdee (eauger1/centos-qemu-kvm)
QOM already has a ref count on objects and it will assert much
earlier, when INT_MAX is reached.
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit c2ab3a6f7411c895e538e8350fee8948ac07c1a0)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
backends/iommufd.c | 5 -----
1 file changed, 5 deletions(-)
diff --git a/backends/iommufd.c b/backends/iommufd.c
index ba58a0eb0d..393c0d9a37 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -80,11 +80,6 @@ int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
int fd, ret = 0;
qemu_mutex_lock(&be->lock);
- if (be->users == UINT32_MAX) {
- error_setg(errp, "too many connections");
- ret = -E2BIG;
- goto out;
- }
if (be->owned && !be->users) {
fd = qemu_open_old("/dev/iommu", O_RDWR);
if (fd < 0) {
--
2.39.3

View File

@ -0,0 +1,112 @@
From 92aff3cc1a412de01e9563802fa48848eae5283f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Thu, 21 Dec 2023 16:58:41 +0100
Subject: [PATCH 065/101] backends/iommufd: Remove mutex
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [64/67] 65518432b18f18ceadafe1b0698cdaa962e84f61 (eauger1/centos-qemu-kvm)
Coverity reports a concurrent data access violation because be->users
is being accessed in iommufd_backend_can_be_deleted() without holding
the mutex.
However, these routines are called from the QEMU main thread when a
device is created. In this case, the code paths should be protected by
the BQL lock and it should be safe to drop the IOMMUFD backend mutex.
Simply remove it.
Fixes: CID 1531550
Fixes: CID 1531549
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 19368b1905b4b917e915526fcbd5bfa3f7439451)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
backends/iommufd.c | 7 -------
include/sysemu/iommufd.h | 2 --
2 files changed, 9 deletions(-)
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 393c0d9a37..1ef683c7b0 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -29,7 +29,6 @@ static void iommufd_backend_init(Object *obj)
be->fd = -1;
be->users = 0;
be->owned = true;
- qemu_mutex_init(&be->lock);
}
static void iommufd_backend_finalize(Object *obj)
@@ -52,10 +51,8 @@ static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp)
error_prepend(errp, "Could not parse remote object fd %s:", str);
return;
}
- qemu_mutex_lock(&be->lock);
be->fd = fd;
be->owned = false;
- qemu_mutex_unlock(&be->lock);
trace_iommu_backend_set_fd(be->fd);
}
@@ -79,7 +76,6 @@ int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
{
int fd, ret = 0;
- qemu_mutex_lock(&be->lock);
if (be->owned && !be->users) {
fd = qemu_open_old("/dev/iommu", O_RDWR);
if (fd < 0) {
@@ -93,13 +89,11 @@ int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
out:
trace_iommufd_backend_connect(be->fd, be->owned,
be->users, ret);
- qemu_mutex_unlock(&be->lock);
return ret;
}
void iommufd_backend_disconnect(IOMMUFDBackend *be)
{
- qemu_mutex_lock(&be->lock);
if (!be->users) {
goto out;
}
@@ -110,7 +104,6 @@ void iommufd_backend_disconnect(IOMMUFDBackend *be)
}
out:
trace_iommufd_backend_disconnect(be->fd, be->users);
- qemu_mutex_unlock(&be->lock);
}
int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 9c5524b0ed..9af27ebd6c 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -2,7 +2,6 @@
#define SYSEMU_IOMMUFD_H
#include "qom/object.h"
-#include "qemu/thread.h"
#include "exec/hwaddr.h"
#include "exec/cpu-common.h"
@@ -19,7 +18,6 @@ struct IOMMUFDBackend {
/*< protected >*/
int fd; /* /dev/iommu file descriptor */
bool owned; /* is the /dev/iommu opened internally */
- QemuMutex lock;
uint32_t users;
/*< public >*/
--
2.39.3

View File

@ -0,0 +1,69 @@
From b1a68aebadecd7d339cf5eaffeda15099c998528 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 12 Sep 2023 19:10:37 -0400
Subject: [PATCH 095/101] block-coroutine-wrapper: use
qemu_get_current_aio_context()
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [26/26] cde767bcdc626e90721792e3889952057a548ac5 (kmwolf/centos-qemu-kvm)
Use qemu_get_current_aio_context() in mixed wrappers and coroutine
wrappers so that code runs in the caller's AioContext instead of moving
to the BlockDriverState's AioContext. This change is necessary for the
multi-queue block layer where any thread can call into the block layer.
Most wrappers are IO_CODE where it's safe to use the current AioContext
nowadays. BlockDrivers and the core block layer use their own locks and
no longer depend on the AioContext lock for thread-safety.
The bdrv_create() wrapper invokes GLOBAL_STATE code. Using the current
AioContext is safe because this code is only called with the BQL held
from the main loop thread.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230912231037.826804-6-stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
scripts/block-coroutine-wrapper.py | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/scripts/block-coroutine-wrapper.py b/scripts/block-coroutine-wrapper.py
index c9c09fcacd..dbbde99e39 100644
--- a/scripts/block-coroutine-wrapper.py
+++ b/scripts/block-coroutine-wrapper.py
@@ -92,8 +92,6 @@ def __init__(self, wrapper_type: str, return_type: str, name: str,
f"{self.name}")
self.target_name = f'{subsystem}_{subname}'
- self.ctx = self.gen_ctx()
-
self.get_result = 's->ret = '
self.ret = 'return s.ret;'
self.co_ret = 'return '
@@ -167,7 +165,7 @@ def create_mixed_wrapper(func: FuncDecl) -> str:
{func.co_ret}{name}({ func.gen_list('{name}') });
}} else {{
{struct_name} s = {{
- .poll_state.ctx = {func.ctx},
+ .poll_state.ctx = qemu_get_current_aio_context(),
.poll_state.in_progress = true,
{ func.gen_block(' .{name} = {name},') }
@@ -191,7 +189,7 @@ def create_co_wrapper(func: FuncDecl) -> str:
{func.return_type} {func.name}({ func.gen_list('{decl}') })
{{
{struct_name} s = {{
- .poll_state.ctx = {func.ctx},
+ .poll_state.ctx = qemu_get_current_aio_context(),
.poll_state.in_progress = true,
{ func.gen_block(' .{name} = {name},') }
--
2.39.3

View File

@ -0,0 +1,217 @@
From 25cce5df341861e8ba8ec57722558e2dee3ce56a Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 14 Sep 2023 10:00:58 -0400
Subject: [PATCH 073/101] block/file-posix: set up Linux AIO and io_uring in
the current thread
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [4/26] 74c7daf805daefe706378308c3afeb28d861164b (kmwolf/centos-qemu-kvm)
The file-posix block driver currently only sets up Linux AIO and
io_uring in the BDS's AioContext. In the multi-queue block layer we must
be able to submit I/O requests in AioContexts that do not have Linux AIO
and io_uring set up yet since any thread can call into the block driver.
Set up Linux AIO and io_uring for the current AioContext during request
submission. We lose the ability to return an error from
.bdrv_file_open() when Linux AIO and io_uring setup fails (e.g. due to
resource limits). Instead the user only gets warnings and we fall back
to aio=threads. This is still better than a fatal error after startup.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230914140101.1065008-2-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block/file-posix.c | 103 ++++++++++++++++++++++-----------------------
1 file changed, 51 insertions(+), 52 deletions(-)
diff --git a/block/file-posix.c b/block/file-posix.c
index b862406c71..35684f7e21 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -712,17 +712,11 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
#ifdef CONFIG_LINUX_AIO
/* Currently Linux does AIO only for files opened with O_DIRECT */
- if (s->use_linux_aio) {
- if (!(s->open_flags & O_DIRECT)) {
- error_setg(errp, "aio=native was specified, but it requires "
- "cache.direct=on, which was not specified.");
- ret = -EINVAL;
- goto fail;
- }
- if (!aio_setup_linux_aio(bdrv_get_aio_context(bs), errp)) {
- error_prepend(errp, "Unable to use native AIO: ");
- goto fail;
- }
+ if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) {
+ error_setg(errp, "aio=native was specified, but it requires "
+ "cache.direct=on, which was not specified.");
+ ret = -EINVAL;
+ goto fail;
}
#else
if (s->use_linux_aio) {
@@ -733,14 +727,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
}
#endif /* !defined(CONFIG_LINUX_AIO) */
-#ifdef CONFIG_LINUX_IO_URING
- if (s->use_linux_io_uring) {
- if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) {
- error_prepend(errp, "Unable to use io_uring: ");
- goto fail;
- }
- }
-#else
+#ifndef CONFIG_LINUX_IO_URING
if (s->use_linux_io_uring) {
error_setg(errp, "aio=io_uring was specified, but is not supported "
"in this build.");
@@ -2444,6 +2431,48 @@ static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
return true;
}
+#ifdef CONFIG_LINUX_IO_URING
+static inline bool raw_check_linux_io_uring(BDRVRawState *s)
+{
+ Error *local_err = NULL;
+ AioContext *ctx;
+
+ if (!s->use_linux_io_uring) {
+ return false;
+ }
+
+ ctx = qemu_get_current_aio_context();
+ if (unlikely(!aio_setup_linux_io_uring(ctx, &local_err))) {
+ error_reportf_err(local_err, "Unable to use linux io_uring, "
+ "falling back to thread pool: ");
+ s->use_linux_io_uring = false;
+ return false;
+ }
+ return true;
+}
+#endif
+
+#ifdef CONFIG_LINUX_AIO
+static inline bool raw_check_linux_aio(BDRVRawState *s)
+{
+ Error *local_err = NULL;
+ AioContext *ctx;
+
+ if (!s->use_linux_aio) {
+ return false;
+ }
+
+ ctx = qemu_get_current_aio_context();
+ if (unlikely(!aio_setup_linux_aio(ctx, &local_err))) {
+ error_reportf_err(local_err, "Unable to use Linux AIO, "
+ "falling back to thread pool: ");
+ s->use_linux_aio = false;
+ return false;
+ }
+ return true;
+}
+#endif
+
static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
uint64_t bytes, QEMUIOVector *qiov, int type)
{
@@ -2474,13 +2503,13 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
type |= QEMU_AIO_MISALIGNED;
#ifdef CONFIG_LINUX_IO_URING
- } else if (s->use_linux_io_uring) {
+ } else if (raw_check_linux_io_uring(s)) {
assert(qiov->size == bytes);
ret = luring_co_submit(bs, s->fd, offset, qiov, type);
goto out;
#endif
#ifdef CONFIG_LINUX_AIO
- } else if (s->use_linux_aio) {
+ } else if (raw_check_linux_aio(s)) {
assert(qiov->size == bytes);
ret = laio_co_submit(s->fd, offset, qiov, type,
s->aio_max_batch);
@@ -2567,39 +2596,13 @@ static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
};
#ifdef CONFIG_LINUX_IO_URING
- if (s->use_linux_io_uring) {
+ if (raw_check_linux_io_uring(s)) {
return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH);
}
#endif
return raw_thread_pool_submit(handle_aiocb_flush, &acb);
}
-static void raw_aio_attach_aio_context(BlockDriverState *bs,
- AioContext *new_context)
-{
- BDRVRawState __attribute__((unused)) *s = bs->opaque;
-#ifdef CONFIG_LINUX_AIO
- if (s->use_linux_aio) {
- Error *local_err = NULL;
- if (!aio_setup_linux_aio(new_context, &local_err)) {
- error_reportf_err(local_err, "Unable to use native AIO, "
- "falling back to thread pool: ");
- s->use_linux_aio = false;
- }
- }
-#endif
-#ifdef CONFIG_LINUX_IO_URING
- if (s->use_linux_io_uring) {
- Error *local_err = NULL;
- if (!aio_setup_linux_io_uring(new_context, &local_err)) {
- error_reportf_err(local_err, "Unable to use linux io_uring, "
- "falling back to thread pool: ");
- s->use_linux_io_uring = false;
- }
- }
-#endif
-}
-
static void raw_close(BlockDriverState *bs)
{
BDRVRawState *s = bs->opaque;
@@ -3896,7 +3899,6 @@ BlockDriver bdrv_file = {
.bdrv_co_copy_range_from = raw_co_copy_range_from,
.bdrv_co_copy_range_to = raw_co_copy_range_to,
.bdrv_refresh_limits = raw_refresh_limits,
- .bdrv_attach_aio_context = raw_aio_attach_aio_context,
.bdrv_co_truncate = raw_co_truncate,
.bdrv_co_getlength = raw_co_getlength,
@@ -4266,7 +4268,6 @@ static BlockDriver bdrv_host_device = {
.bdrv_co_copy_range_from = raw_co_copy_range_from,
.bdrv_co_copy_range_to = raw_co_copy_range_to,
.bdrv_refresh_limits = raw_refresh_limits,
- .bdrv_attach_aio_context = raw_aio_attach_aio_context,
.bdrv_co_truncate = raw_co_truncate,
.bdrv_co_getlength = raw_co_getlength,
@@ -4402,7 +4403,6 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_co_pwritev = raw_co_pwritev,
.bdrv_co_flush_to_disk = raw_co_flush_to_disk,
.bdrv_refresh_limits = cdrom_refresh_limits,
- .bdrv_attach_aio_context = raw_aio_attach_aio_context,
.bdrv_co_truncate = raw_co_truncate,
.bdrv_co_getlength = raw_co_getlength,
@@ -4528,7 +4528,6 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_co_pwritev = raw_co_pwritev,
.bdrv_co_flush_to_disk = raw_co_flush_to_disk,
.bdrv_refresh_limits = cdrom_refresh_limits,
- .bdrv_attach_aio_context = raw_aio_attach_aio_context,
.bdrv_co_truncate = raw_co_truncate,
.bdrv_co_getlength = raw_co_getlength,
--
2.39.3

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,97 @@
From d0514c7d5d6cc1aa140119c95d5ea2c1591b01e9 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:20:04 -0500
Subject: [PATCH 087/101] block: remove bdrv_co_lock()
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [18/26] a303f861ea5e84d8e89fd51e530fd0cb2da17b89 (kmwolf/centos-qemu-kvm)
The bdrv_co_lock() and bdrv_co_unlock() functions are already no-ops.
Remove them.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20231205182011.1976568-8-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block.c | 10 ----------
blockdev.c | 5 -----
include/block/block-global-state.h | 14 --------------
3 files changed, 29 deletions(-)
diff --git a/block.c b/block.c
index 91ace5d2d5..434b7f4d72 100644
--- a/block.c
+++ b/block.c
@@ -7431,16 +7431,6 @@ void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx)
bdrv_dec_in_flight(bs);
}
-void coroutine_fn bdrv_co_lock(BlockDriverState *bs)
-{
- /* TODO removed in next patch */
-}
-
-void coroutine_fn bdrv_co_unlock(BlockDriverState *bs)
-{
- /* TODO removed in next patch */
-}
-
static void bdrv_do_remove_aio_context_notifier(BdrvAioNotifier *ban)
{
GLOBAL_STATE_CODE();
diff --git a/blockdev.c b/blockdev.c
index 5d8b3a23eb..3a5e7222ec 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2264,18 +2264,13 @@ void coroutine_fn qmp_block_resize(const char *device, const char *node_name,
return;
}
- bdrv_co_lock(bs);
bdrv_drained_begin(bs);
- bdrv_co_unlock(bs);
old_ctx = bdrv_co_enter(bs);
blk_co_truncate(blk, size, false, PREALLOC_MODE_OFF, 0, errp);
bdrv_co_leave(bs, old_ctx);
- bdrv_co_lock(bs);
bdrv_drained_end(bs);
- bdrv_co_unlock(bs);
-
blk_co_unref(blk);
}
diff --git a/include/block/block-global-state.h b/include/block/block-global-state.h
index 0327f1c605..4ec0b217f0 100644
--- a/include/block/block-global-state.h
+++ b/include/block/block-global-state.h
@@ -267,20 +267,6 @@ int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag);
int bdrv_debug_resume(BlockDriverState *bs, const char *tag);
bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag);
-/**
- * Locks the AioContext of @bs if it's not the current AioContext. This avoids
- * double locking which could lead to deadlocks: This is a coroutine_fn, so we
- * know we already own the lock of the current AioContext.
- *
- * May only be called in the main thread.
- */
-void coroutine_fn bdrv_co_lock(BlockDriverState *bs);
-
-/**
- * Unlocks the AioContext of @bs if it's not the current AioContext.
- */
-void coroutine_fn bdrv_co_unlock(BlockDriverState *bs);
-
bool bdrv_child_change_aio_context(BdrvChild *c, AioContext *ctx,
GHashTable *visited, Transaction *tran,
Error **errp);
--
2.39.3

View File

@ -0,0 +1,411 @@
From dc4eb64185957a01948217814478abc450ce5f26 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:20:11 -0500
Subject: [PATCH 094/101] block: remove outdated AioContext locking comments
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [25/26] 395e18fb40d28d4bc961acee1a00da7f60748076 (kmwolf/centos-qemu-kvm)
The AioContext lock no longer exists.
There is one noteworthy change:
- * More specifically, these functions use BDRV_POLL_WHILE(bs), which
- * requires the caller to be either in the main thread and hold
- * the BlockdriverState (bs) AioContext lock, or directly in the
- * home thread that runs the bs AioContext. Calling them from
- * another thread in another AioContext would cause deadlocks.
+ * More specifically, these functions use BDRV_POLL_WHILE(bs), which requires
+ * the caller to be either in the main thread or directly in the home thread
+ * that runs the bs AioContext. Calling them from another thread in another
+ * AioContext would cause deadlocks.
I am not sure whether deadlocks are still possible. Maybe they have just
moved to the fine-grained locks that have replaced the AioContext. Since
I am not sure if the deadlocks are gone, I have kept the substance
unchanged and just removed mention of the AioContext.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-15-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block.c | 73 ++++++----------------------
block/block-backend.c | 8 ---
block/export/vhost-user-blk-server.c | 4 --
include/block/block-common.h | 3 --
include/block/block-io.h | 9 ++--
include/block/block_int-common.h | 2 -
tests/qemu-iotests/202 | 2 +-
tests/qemu-iotests/203 | 3 +-
8 files changed, 22 insertions(+), 82 deletions(-)
diff --git a/block.c b/block.c
index 434b7f4d72..a097772238 100644
--- a/block.c
+++ b/block.c
@@ -1616,11 +1616,6 @@ out:
g_free(gen_node_name);
}
-/*
- * The caller must always hold @bs AioContext lock, because this function calls
- * bdrv_refresh_total_sectors() which polls when called from non-coroutine
- * context.
- */
static int no_coroutine_fn GRAPH_UNLOCKED
bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name,
QDict *options, int open_flags, Error **errp)
@@ -2901,7 +2896,7 @@ uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm)
* Replaces the node that a BdrvChild points to without updating permissions.
*
* If @new_bs is non-NULL, the parent of @child must already be drained through
- * @child and the caller must hold the AioContext lock for @new_bs.
+ * @child.
*/
static void GRAPH_WRLOCK
bdrv_replace_child_noperm(BdrvChild *child, BlockDriverState *new_bs)
@@ -3041,9 +3036,8 @@ static TransactionActionDrv bdrv_attach_child_common_drv = {
*
* Returns new created child.
*
- * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
- * @child_bs can move to a different AioContext in this function. Callers must
- * make sure that their AioContext locking is still correct after this.
+ * Both @parent_bs and @child_bs can move to a different AioContext in this
+ * function.
*/
static BdrvChild * GRAPH_WRLOCK
bdrv_attach_child_common(BlockDriverState *child_bs,
@@ -3142,9 +3136,8 @@ bdrv_attach_child_common(BlockDriverState *child_bs,
/*
* Function doesn't update permissions, caller is responsible for this.
*
- * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
- * @child_bs can move to a different AioContext in this function. Callers must
- * make sure that their AioContext locking is still correct after this.
+ * Both @parent_bs and @child_bs can move to a different AioContext in this
+ * function.
*
* After calling this function, the transaction @tran may only be completed
* while holding a writer lock for the graph.
@@ -3184,9 +3177,6 @@ bdrv_attach_child_noperm(BlockDriverState *parent_bs,
*
* On failure NULL is returned, errp is set and the reference to
* child_bs is also dropped.
- *
- * The caller must hold the AioContext lock @child_bs, but not that of @ctx
- * (unless @child_bs is already in @ctx).
*/
BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
const char *child_name,
@@ -3226,9 +3216,6 @@ out:
*
* On failure NULL is returned, errp is set and the reference to
* child_bs is also dropped.
- *
- * If @parent_bs and @child_bs are in different AioContexts, the caller must
- * hold the AioContext lock for @child_bs, but not for @parent_bs.
*/
BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
BlockDriverState *child_bs,
@@ -3418,9 +3405,8 @@ static BdrvChildRole bdrv_backing_role(BlockDriverState *bs)
*
* Function doesn't update permissions, caller is responsible for this.
*
- * The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
- * @child_bs can move to a different AioContext in this function. Callers must
- * make sure that their AioContext locking is still correct after this.
+ * Both @parent_bs and @child_bs can move to a different AioContext in this
+ * function.
*
* After calling this function, the transaction @tran may only be completed
* while holding a writer lock for the graph.
@@ -3513,9 +3499,8 @@ out:
}
/*
- * The caller must hold the AioContext lock for @backing_hd. Both @bs and
- * @backing_hd can move to a different AioContext in this function. Callers must
- * make sure that their AioContext locking is still correct after this.
+ * Both @bs and @backing_hd can move to a different AioContext in this
+ * function.
*
* If a backing child is already present (i.e. we're detaching a node), that
* child node must be drained.
@@ -3574,8 +3559,6 @@ int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
* itself, all options starting with "${bdref_key}." are considered part of the
* BlockdevRef.
*
- * The caller must hold the main AioContext lock.
- *
* TODO Can this be unified with bdrv_open_image()?
*/
int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
@@ -3745,9 +3728,7 @@ done:
*
* The BlockdevRef will be removed from the options QDict.
*
- * The caller must hold the lock of the main AioContext and no other AioContext.
- * @parent can move to a different AioContext in this function. Callers must
- * make sure that their AioContext locking is still correct after this.
+ * @parent can move to a different AioContext in this function.
*/
BdrvChild *bdrv_open_child(const char *filename,
QDict *options, const char *bdref_key,
@@ -3778,9 +3759,7 @@ BdrvChild *bdrv_open_child(const char *filename,
/*
* Wrapper on bdrv_open_child() for most popular case: open primary child of bs.
*
- * The caller must hold the lock of the main AioContext and no other AioContext.
- * @parent can move to a different AioContext in this function. Callers must
- * make sure that their AioContext locking is still correct after this.
+ * @parent can move to a different AioContext in this function.
*/
int bdrv_open_file_child(const char *filename,
QDict *options, const char *bdref_key,
@@ -3923,8 +3902,6 @@ out:
* The reference parameter may be used to specify an existing block device which
* should be opened. If specified, neither options nor a filename may be given,
* nor can an existing BDS be reused (that is, *pbs has to be NULL).
- *
- * The caller must always hold the main AioContext lock.
*/
static BlockDriverState * no_coroutine_fn
bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
@@ -4217,7 +4194,6 @@ close_and_fail:
return NULL;
}
-/* The caller must always hold the main AioContext lock. */
BlockDriverState *bdrv_open(const char *filename, const char *reference,
QDict *options, int flags, Error **errp)
{
@@ -4665,10 +4641,7 @@ int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
*
* Return 0 on success, otherwise return < 0 and set @errp.
*
- * The caller must hold the AioContext lock of @reopen_state->bs.
* @reopen_state->bs can move to a different AioContext in this function.
- * Callers must make sure that their AioContext locking is still correct after
- * this.
*/
static int GRAPH_UNLOCKED
bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
@@ -4801,8 +4774,6 @@ out_rdlock:
* It is the responsibility of the caller to then call the abort() or
* commit() for any other BDS that have been left in a prepare() state
*
- * The caller must hold the AioContext lock of @reopen_state->bs.
- *
* After calling this function, the transaction @change_child_tran may only be
* completed while holding a writer lock for the graph.
*/
@@ -5437,8 +5408,6 @@ int bdrv_drop_filter(BlockDriverState *bs, Error **errp)
* child.
*
* This function does not create any image files.
- *
- * The caller must hold the AioContext lock for @bs_top.
*/
int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
Error **errp)
@@ -5545,9 +5514,8 @@ static void bdrv_delete(BlockDriverState *bs)
* after the call (even on failure), so if the caller intends to reuse the
* dictionary, it needs to use qobject_ref() before calling bdrv_open.
*
- * The caller holds the AioContext lock for @bs. It must make sure that @bs
- * stays in the same AioContext, i.e. @options must not refer to nodes in a
- * different AioContext.
+ * The caller must make sure that @bs stays in the same AioContext, i.e.
+ * @options must not refer to nodes in a different AioContext.
*/
BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *options,
int flags, Error **errp)
@@ -7565,10 +7533,6 @@ static TransactionActionDrv set_aio_context = {
*
* Must be called from the main AioContext.
*
- * The caller must own the AioContext lock for the old AioContext of bs, but it
- * must not own the AioContext lock for new_context (unless new_context is the
- * same as the current context of bs).
- *
* @visited will accumulate all visited BdrvChild objects. The caller is
* responsible for freeing the list afterwards.
*/
@@ -7621,13 +7585,6 @@ static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx,
*
* If ignore_child is not NULL, that child (and its subgraph) will not
* be touched.
- *
- * This function still requires the caller to take the bs current
- * AioContext lock, otherwise draining will fail since AIO_WAIT_WHILE
- * assumes the lock is always held if bs is in another AioContext.
- * For the same reason, it temporarily also holds the new AioContext, since
- * bdrv_drained_end calls BDRV_POLL_WHILE that assumes the lock is taken too.
- * Therefore the new AioContext lock must not be taken by the caller.
*/
int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx,
BdrvChild *ignore_child, Error **errp)
@@ -7653,8 +7610,8 @@ int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx,
/*
* Linear phase: go through all callbacks collected in the transaction.
- * Run all callbacks collected in the recursion to switch all nodes
- * AioContext lock (transaction commit), or undo all changes done in the
+ * Run all callbacks collected in the recursion to switch every node's
+ * AioContext (transaction commit), or undo all changes done in the
* recursion (transaction abort).
*/
diff --git a/block/block-backend.c b/block/block-backend.c
index f412bed274..209eb07528 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -390,8 +390,6 @@ BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm)
* Both sets of permissions can be changed later using blk_set_perm().
*
* Return the new BlockBackend on success, null on failure.
- *
- * Callers must hold the AioContext lock of @bs.
*/
BlockBackend *blk_new_with_bs(BlockDriverState *bs, uint64_t perm,
uint64_t shared_perm, Error **errp)
@@ -416,8 +414,6 @@ BlockBackend *blk_new_with_bs(BlockDriverState *bs, uint64_t perm,
* Just as with bdrv_open(), after having called this function the reference to
* @options belongs to the block layer (even on failure).
*
- * Called without holding an AioContext lock.
- *
* TODO: Remove @filename and @flags; it should be possible to specify a whole
* BDS tree just by specifying the @options QDict (or @reference,
* alternatively). At the time of adding this function, this is not possible,
@@ -872,8 +868,6 @@ BlockBackend *blk_by_public(BlockBackendPublic *public)
/*
* Disassociates the currently associated BlockDriverState from @blk.
- *
- * The caller must hold the AioContext lock for the BlockBackend.
*/
void blk_remove_bs(BlockBackend *blk)
{
@@ -915,8 +909,6 @@ void blk_remove_bs(BlockBackend *blk)
/*
* Associates a new BlockDriverState with @blk.
- *
- * Callers must hold the AioContext lock of @bs.
*/
int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
{
diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
index 16f48388d3..50c358e8cd 100644
--- a/block/export/vhost-user-blk-server.c
+++ b/block/export/vhost-user-blk-server.c
@@ -278,7 +278,6 @@ static void vu_blk_exp_resize(void *opaque)
vu_config_change_msg(&vexp->vu_server.vu_dev);
}
-/* Called with vexp->export.ctx acquired */
static void vu_blk_drained_begin(void *opaque)
{
VuBlkExport *vexp = opaque;
@@ -287,7 +286,6 @@ static void vu_blk_drained_begin(void *opaque)
vhost_user_server_detach_aio_context(&vexp->vu_server);
}
-/* Called with vexp->export.blk AioContext acquired */
static void vu_blk_drained_end(void *opaque)
{
VuBlkExport *vexp = opaque;
@@ -300,8 +298,6 @@ static void vu_blk_drained_end(void *opaque)
* Ensures that bdrv_drained_begin() waits until in-flight requests complete
* and the server->co_trip coroutine has terminated. It will be restarted in
* vhost_user_server_attach_aio_context().
- *
- * Called with vexp->export.ctx acquired.
*/
static bool vu_blk_drained_poll(void *opaque)
{
diff --git a/include/block/block-common.h b/include/block/block-common.h
index d7599564db..a846023a09 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -70,9 +70,6 @@
* automatically takes the graph rdlock when calling the wrapped function. In
* the same way, no_co_wrapper_bdrv_wrlock functions automatically take the
* graph wrlock.
- *
- * If the first parameter of the function is a BlockDriverState, BdrvChild or
- * BlockBackend pointer, the AioContext lock for it is taken in the wrapper.
*/
#define no_co_wrapper
#define no_co_wrapper_bdrv_rdlock
diff --git a/include/block/block-io.h b/include/block/block-io.h
index 8eb39a858b..b49e0537dd 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -332,11 +332,10 @@ bdrv_co_copy_range(BdrvChild *src, int64_t src_offset,
* "I/O or GS" API functions. These functions can run without
* the BQL, but only in one specific iothread/main loop.
*
- * More specifically, these functions use BDRV_POLL_WHILE(bs), which
- * requires the caller to be either in the main thread and hold
- * the BlockdriverState (bs) AioContext lock, or directly in the
- * home thread that runs the bs AioContext. Calling them from
- * another thread in another AioContext would cause deadlocks.
+ * More specifically, these functions use BDRV_POLL_WHILE(bs), which requires
+ * the caller to be either in the main thread or directly in the home thread
+ * that runs the bs AioContext. Calling them from another thread in another
+ * AioContext would cause deadlocks.
*
* Therefore, these functions are not proper I/O, because they
* can't run in *any* iothreads, but only in a specific one.
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 4e31d161c5..151279d481 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -1192,8 +1192,6 @@ struct BlockDriverState {
/* The error object in use for blocking operations on backing_hd */
Error *backing_blocker;
- /* Protected by AioContext lock */
-
/*
* If we are reading a disk image, give its size in sectors.
* Generally read-only; it is written to by load_snapshot and
diff --git a/tests/qemu-iotests/202 b/tests/qemu-iotests/202
index b784dcd791..13304242e5 100755
--- a/tests/qemu-iotests/202
+++ b/tests/qemu-iotests/202
@@ -21,7 +21,7 @@
# Check that QMP 'transaction' blockdev-snapshot-sync with multiple drives on a
# single IOThread completes successfully. This particular command triggered a
# hang due to recursive AioContext locking and BDRV_POLL_WHILE(). Protect
-# against regressions.
+# against regressions even though the AioContext lock no longer exists.
import iotests
diff --git a/tests/qemu-iotests/203 b/tests/qemu-iotests/203
index ab80fd0e44..1ba878522b 100755
--- a/tests/qemu-iotests/203
+++ b/tests/qemu-iotests/203
@@ -21,7 +21,8 @@
# Check that QMP 'migrate' with multiple drives on a single IOThread completes
# successfully. This particular command triggered a hang in the source QEMU
# process due to recursive AioContext locking in bdrv_invalidate_all() and
-# BDRV_POLL_WHILE().
+# BDRV_POLL_WHILE(). Protect against regressions even though the AioContext
+# lock no longer exists.
import iotests
--
2.39.3

View File

@ -0,0 +1,75 @@
From ac9dc8ea241ef6d3a0447d696620d4d4053b71bf Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon, 4 Dec 2023 11:42:59 -0500
Subject: [PATCH 080/101] dma-helpers: don't lock AioContext in dma_blk_cb()
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [11/26] a8580463ba6aee4ca248c0b947b9e72bd9e87aab (kmwolf/centos-qemu-kvm)
Commit abfcd2760b3e ("dma-helpers: prevent dma_blk_cb() vs
dma_aio_cancel() race") acquired the AioContext lock inside dma_blk_cb()
to avoid a race with scsi_device_purge_requests() running in the main
loop thread.
The SCSI code no longer calls dma_aio_cancel() from the main loop thread
while I/O is running in the IOThread AioContext. Therefore it is no
longer necessary to take this lock to protect DMAAIOCB fields. The
->cb() function also does not require the lock because blk_aio_*() and
friends do not need the AioContext lock.
Both hw/ide/core.c and hw/ide/macio.c also call dma_blk_io() but don't
rely on it taking the AioContext lock, so this change is safe.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231204164259.1515217-5-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
system/dma-helpers.c | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/system/dma-helpers.c b/system/dma-helpers.c
index 36211acc7e..528117f256 100644
--- a/system/dma-helpers.c
+++ b/system/dma-helpers.c
@@ -119,13 +119,12 @@ static void dma_blk_cb(void *opaque, int ret)
trace_dma_blk_cb(dbs, ret);
- aio_context_acquire(ctx);
dbs->acb = NULL;
dbs->offset += dbs->iov.size;
if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) {
dma_complete(dbs, ret);
- goto out;
+ return;
}
dma_blk_unmap(dbs);
@@ -168,7 +167,7 @@ static void dma_blk_cb(void *opaque, int ret)
trace_dma_map_wait(dbs);
dbs->bh = aio_bh_new(ctx, reschedule_dma, dbs);
cpu_register_map_client(dbs->bh);
- goto out;
+ return;
}
if (!QEMU_IS_ALIGNED(dbs->iov.size, dbs->align)) {
@@ -179,8 +178,6 @@ static void dma_blk_cb(void *opaque, int ret)
dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
dma_blk_cb, dbs, dbs->io_func_opaque);
assert(dbs->acb);
-out:
- aio_context_release(ctx);
}
static void dma_aio_cancel(BlockAIOCB *acb)
--
2.39.3

View File

@ -0,0 +1,228 @@
From 71aa0219f7c84cbf175eb2a091d48d5fd5daa40b Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:26 +0800
Subject: [PATCH 047/101] docs/devel: Add VFIO iommufd backend documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [46/67] 6cf49d00e87788f894d690a985bb6798eae24505 (eauger1/centos-qemu-kvm)
Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 98dad2b01931f6064c6c4b48ca3c2a1d9f542cd8)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
MAINTAINERS | 1 +
docs/devel/index-internals.rst | 1 +
docs/devel/vfio-iommufd.rst | 166 +++++++++++++++++++++++++++++++++
3 files changed, 168 insertions(+)
create mode 100644 docs/devel/vfio-iommufd.rst
diff --git a/MAINTAINERS b/MAINTAINERS
index ca70bb4e64..0ddb20a35f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2176,6 +2176,7 @@ F: backends/iommufd.c
F: include/sysemu/iommufd.h
F: include/qemu/chardev_open.h
F: util/chardev_open.c
+F: docs/devel/vfio-iommufd.rst
vhost
M: Michael S. Tsirkin <mst@redhat.com>
diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst
index 6f81df92bc..3def4a138b 100644
--- a/docs/devel/index-internals.rst
+++ b/docs/devel/index-internals.rst
@@ -18,5 +18,6 @@ Details about QEMU's various subsystems including how to add features to them.
s390-dasd-ipl
tracing
vfio-migration
+ vfio-iommufd
writing-monitor-commands
virtio-backends
diff --git a/docs/devel/vfio-iommufd.rst b/docs/devel/vfio-iommufd.rst
new file mode 100644
index 0000000000..3d1c11f175
--- /dev/null
+++ b/docs/devel/vfio-iommufd.rst
@@ -0,0 +1,166 @@
+===============================
+IOMMUFD BACKEND usage with VFIO
+===============================
+
+(Same meaning for backend/container/BE)
+
+With the introduction of iommufd, the Linux kernel provides a generic
+interface for user space drivers to propagate their DMA mappings to kernel
+for assigned devices. While the legacy kernel interface is group-centric,
+the new iommufd interface is device-centric, relying on device fd and iommufd.
+
+To support both interfaces in the QEMU VFIO device, introduce a base container
+to abstract the common part of VFIO legacy and iommufd container. So that the
+generic VFIO code can use either container.
+
+The base container implements generic functions such as memory_listener and
+address space management whereas the derived container implements callbacks
+specific to either legacy or iommufd. Each container has its own way to setup
+secure context and dma management interface. The below diagram shows how it
+looks like with both containers.
+
+::
+
+ VFIO AddressSpace/Memory
+ +-------+ +----------+ +-----+ +-----+
+ | pci | | platform | | ap | | ccw |
+ +---+---+ +----+-----+ +--+--+ +--+--+ +----------------------+
+ | | | | | AddressSpace |
+ | | | | +------------+---------+
+ +---V-----------V-----------V--------V----+ /
+ | VFIOAddressSpace | <------------+
+ | | | MemoryListener
+ | VFIOContainerBase list |
+ +-------+----------------------------+----+
+ | |
+ | |
+ +-------V------+ +--------V----------+
+ | iommufd | | vfio legacy |
+ | container | | container |
+ +-------+------+ +--------+----------+
+ | |
+ | /dev/iommu | /dev/vfio/vfio
+ | /dev/vfio/devices/vfioX | /dev/vfio/$group_id
+ Userspace | |
+ ============+============================+===========================
+ Kernel | device fd |
+ +---------------+ | group/container fd
+ | (BIND_IOMMUFD | | (SET_CONTAINER/SET_IOMMU)
+ | ATTACH_IOAS) | | device fd
+ | | |
+ | +-------V------------V-----------------+
+ iommufd | | vfio |
+ (map/unmap | +---------+--------------------+-------+
+ ioas_copy) | | | map/unmap
+ | | |
+ +------V------+ +-----V------+ +------V--------+
+ | iommfd core | | device | | vfio iommu |
+ +-------------+ +------------+ +---------------+
+
+* Secure Context setup
+
+ - iommufd BE: uses device fd and iommufd to setup secure context
+ (bind_iommufd, attach_ioas)
+ - vfio legacy BE: uses group fd and container fd to setup secure context
+ (set_container, set_iommu)
+
+* Device access
+
+ - iommufd BE: device fd is opened through ``/dev/vfio/devices/vfioX``
+ - vfio legacy BE: device fd is retrieved from group fd ioctl
+
+* DMA Mapping flow
+
+ 1. VFIOAddressSpace receives MemoryRegion add/del via MemoryListener
+ 2. VFIO populates DMA map/unmap via the container BEs
+ * iommufd BE: uses iommufd
+ * vfio legacy BE: uses container fd
+
+Example configuration
+=====================
+
+Step 1: configure the host device
+---------------------------------
+
+It's exactly same as the VFIO device with legacy VFIO container.
+
+Step 2: configure QEMU
+----------------------
+
+Interactions with the ``/dev/iommu`` are abstracted by a new iommufd
+object (compiled in with the ``CONFIG_IOMMUFD`` option).
+
+Any QEMU device (e.g. VFIO device) wishing to use ``/dev/iommu`` must
+be linked with an iommufd object. It gets a new optional property
+named iommufd which allows to pass an iommufd object. Take ``vfio-pci``
+device for example:
+
+.. code-block:: bash
+
+ -object iommufd,id=iommufd0
+ -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0
+
+Note the ``/dev/iommu`` and VFIO cdev can be externally opened by a
+management layer. In such a case the fd is passed, the fd supports a
+string naming the fd or a number, for example:
+
+.. code-block:: bash
+
+ -object iommufd,id=iommufd0,fd=22
+ -device vfio-pci,iommufd=iommufd0,fd=23
+
+If the ``fd`` property is not passed, the fd is opened by QEMU.
+
+If no ``iommufd`` object is passed to the ``vfio-pci`` device, iommufd
+is not used and the user gets the behavior based on the legacy VFIO
+container:
+
+.. code-block:: bash
+
+ -device vfio-pci,host=0000:02:00.0
+
+Supported platform
+==================
+
+Supports x86, ARM and s390x currently.
+
+Caveats
+=======
+
+Dirty page sync
+---------------
+
+Dirty page sync with iommufd backend is unsupported yet, live migration is
+disabled by default. But it can be force enabled like below, low efficient
+though.
+
+.. code-block:: bash
+
+ -object iommufd,id=iommufd0
+ -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0,enable-migration=on
+
+P2P DMA
+-------
+
+PCI p2p DMA is unsupported as IOMMUFD doesn't support mapping hardware PCI
+BAR region yet. Below warning shows for assigned PCI device, it's not a bug.
+
+.. code-block:: none
+
+ qemu-system-x86_64: warning: IOMMU_IOAS_MAP failed: Bad address, PCI BAR?
+ qemu-system-x86_64: vfio_container_dma_map(0x560cb6cb1620, 0xe000000021000, 0x3000, 0x7f32ed55c000) = -14 (Bad address)
+
+FD passing with mdev
+--------------------
+
+``vfio-pci`` device checks sysfsdev property to decide if backend is a mdev.
+If FD passing is used, there is no way to know that and the mdev is treated
+like a real PCI device. There is an error as below if user wants to enable
+RAM discarding for mdev.
+
+.. code-block:: none
+
+ qemu-system-x86_64: -device vfio-pci,iommufd=iommufd0,x-balloon-allowed=on,fd=9: vfio VFIO_FD9: x-balloon-allowed only potentially compatible with mdev devices
+
+``vfio-ap`` and ``vfio-ccw`` devices don't have same issue as their backend
+devices are always mdev and RAM discarding is force enabled.
--
2.39.3

View File

@ -0,0 +1,98 @@
From fc69df3a70bed5722643cc16828ca20beae3a20d Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:20:08 -0500
Subject: [PATCH 091/101] docs: remove AioContext lock from IOThread docs
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [22/26] ab89cda483e74ded983d26e1c6e50217405e0a55 (kmwolf/centos-qemu-kvm)
Encourage the use of locking primitives and stop mentioning the
AioContext lock since it is being removed.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-12-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
docs/devel/multiple-iothreads.txt | 47 +++++++++++--------------------
1 file changed, 16 insertions(+), 31 deletions(-)
diff --git a/docs/devel/multiple-iothreads.txt b/docs/devel/multiple-iothreads.txt
index a3e949f6b3..4865196bde 100644
--- a/docs/devel/multiple-iothreads.txt
+++ b/docs/devel/multiple-iothreads.txt
@@ -88,27 +88,18 @@ loop, depending on which AioContext instance the caller passes in.
How to synchronize with an IOThread
-----------------------------------
-AioContext is not thread-safe so some rules must be followed when using file
-descriptors, event notifiers, timers, or BHs across threads:
+Variables that can be accessed by multiple threads require some form of
+synchronization such as qemu_mutex_lock(), rcu_read_lock(), etc.
-1. AioContext functions can always be called safely. They handle their
-own locking internally.
-
-2. Other threads wishing to access the AioContext must use
-aio_context_acquire()/aio_context_release() for mutual exclusion. Once the
-context is acquired no other thread can access it or run event loop iterations
-in this AioContext.
-
-Legacy code sometimes nests aio_context_acquire()/aio_context_release() calls.
-Do not use nesting anymore, it is incompatible with the BDRV_POLL_WHILE() macro
-used in the block layer and can lead to hangs.
-
-There is currently no lock ordering rule if a thread needs to acquire multiple
-AioContexts simultaneously. Therefore, it is only safe for code holding the
-QEMU global mutex to acquire other AioContexts.
+AioContext functions like aio_set_fd_handler(), aio_set_event_notifier(),
+aio_bh_new(), and aio_timer_new() are thread-safe. They can be used to trigger
+activity in an IOThread.
Side note: the best way to schedule a function call across threads is to call
-aio_bh_schedule_oneshot(). No acquire/release or locking is needed.
+aio_bh_schedule_oneshot().
+
+The main loop thread can wait synchronously for a condition using
+AIO_WAIT_WHILE().
AioContext and the block layer
------------------------------
@@ -124,22 +115,16 @@ Block layer code must therefore expect to run in an IOThread and avoid using
old APIs that implicitly use the main loop. See the "How to program for
IOThreads" above for information on how to do that.
-If main loop code such as a QMP function wishes to access a BlockDriverState
-it must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure
-that callbacks in the IOThread do not run in parallel.
-
Code running in the monitor typically needs to ensure that past
requests from the guest are completed. When a block device is running
in an IOThread, the IOThread can also process requests from the guest
(via ioeventfd). To achieve both objects, wrap the code between
bdrv_drained_begin() and bdrv_drained_end(), thus creating a "drained
-section". The functions must be called between aio_context_acquire()
-and aio_context_release(). You can freely release and re-acquire the
-AioContext within a drained section.
-
-Long-running jobs (usually in the form of coroutines) are best scheduled in
-the BlockDriverState's AioContext to avoid the need to acquire/release around
-each bdrv_*() call. The functions bdrv_add/remove_aio_context_notifier,
-or alternatively blk_add/remove_aio_context_notifier if you use BlockBackends,
-can be used to get a notification whenever bdrv_try_change_aio_context() moves a
+section".
+
+Long-running jobs (usually in the form of coroutines) are often scheduled in
+the BlockDriverState's AioContext. The functions
+bdrv_add/remove_aio_context_notifier, or alternatively
+blk_add/remove_aio_context_notifier if you use BlockBackends, can be used to
+get a notification whenever bdrv_try_change_aio_context() moves a
BlockDriverState to a different AioContext.
--
2.39.3

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,42 @@
From ceaee9c4372bbdc4196cb6808515047388f7aa26 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Tue, 21 Nov 2023 16:44:18 +0800
Subject: [PATCH 039/101] hw/arm: Activate IOMMUFD for virt machines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [38/67] 0a059ae661616e95eb8455e17f35774495cae8e7 (eauger1/centos-qemu-kvm)
Signed-off-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 0970238343af45a8b547695bfc22f18d4eb7da7e)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/arm/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig
index 3ada335a24..660f49db49 100644
--- a/hw/arm/Kconfig
+++ b/hw/arm/Kconfig
@@ -8,6 +8,7 @@ config ARM_VIRT
imply TPM_TIS_SYSBUS
imply TPM_TIS_I2C
imply NVDIMM
+ imply IOMMUFD
select ARM_GIC
select ACPI
select ARM_SMMUV3
--
2.39.3

View File

@ -0,0 +1,88 @@
From e670722b9a6460d41497688d820d5a9a9b51d8e9 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gshan@redhat.com>
Date: Tue, 9 Jan 2024 11:36:42 +1000
Subject: [PATCH 001/101] hw/arm/virt: Add properties to disable high memory
regions
RH-Author: Gavin Shan <gshan@redhat.com>
RH-MergeRequest: 210: hw/arm/virt: Add properties to disable high memory regions
RH-Jira: RHEL-19738
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Eric Auger <eric.auger@redhat.com>
RH-Commit: [1/1] 4097ba5133a67126e30b84202cb40df4e019c5f4
Upstream: RHEL-only
Brew: https://brewweb.engineering.redhat.com/brew/taskinfo?taskID=57927352
There are 3 high memory regions for GICv3 or GICv4 redistributor, PCI
ECAM and PCI MMIO. Each of them has a property introduced by upstream
commit 6a48c64eec ("hw/arm/virt: Add properties to disable high memory
regions") so that the corresponding high memory region can be disabled.
It's notable that another property ("compact-highmem") introduced by
upstream commit f40408a9fe ("hw/arm/virt: Add 'compact-highmem' property")
so that the compact high memory region layout during assignment can be
disabled, compatible to the old machine types. However, we don't have
the compatible issue since the compact high memory region layout is
always kept as disabled until RHEL9.2.0 machine type and onwards.
Expose those 3 properties: "highmem-redists", "highmem-ecam" and
"highmem-mmio". The property "compact-highmem" is kept as hidden.
Signed-off-by: Gavin Shan <gshan@redhat.com>
---
hw/arm/virt.c | 24 +++++++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 5cab00b4cd..60f117f0d2 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -2456,6 +2456,7 @@ static void virt_set_compact_highmem(Object *obj, bool value, Error **errp)
vms->highmem_compact = value;
}
+#endif /* disabled for RHEL */
static bool virt_get_highmem_redists(Object *obj, Error **errp)
{
@@ -2498,7 +2499,6 @@ static void virt_set_highmem_mmio(Object *obj, bool value, Error **errp)
vms->highmem_mmio = value;
}
-#endif /* disabled for RHEL */
static bool virt_get_its(Object *obj, Error **errp)
{
@@ -3521,6 +3521,28 @@ static void rhel_machine_class_init(ObjectClass *oc, void *data)
"Set on/off to enable/disable using "
"physical address space above 32 bits");
+ object_class_property_add_bool(oc, "highmem-redists",
+ virt_get_highmem_redists,
+ virt_set_highmem_redists);
+ object_class_property_set_description(oc, "highmem-redists",
+ "Set on/off to enable/disable high "
+ "memory region for GICv3 or GICv4 "
+ "redistributor");
+
+ object_class_property_add_bool(oc, "highmem-ecam",
+ virt_get_highmem_ecam,
+ virt_set_highmem_ecam);
+ object_class_property_set_description(oc, "highmem-ecam",
+ "Set on/off to enable/disable high "
+ "memory region for PCI ECAM");
+
+ object_class_property_add_bool(oc, "highmem-mmio",
+ virt_get_highmem_mmio,
+ virt_set_highmem_mmio);
+ object_class_property_set_description(oc, "highmem-mmio",
+ "Set on/off to enable/disable high "
+ "memory region for PCI MMIO");
+
object_class_property_add_str(oc, "gic-version", virt_get_gic_version,
virt_set_gic_version);
object_class_property_set_description(oc, "gic-version",
--
2.39.3

View File

@ -0,0 +1,41 @@
From 7a6be312c11911bdd2ce82566be22a3e014947c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Tue, 21 Nov 2023 16:44:20 +0800
Subject: [PATCH 041/101] hw/i386: Activate IOMMUFD for q35 machines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [40/67] b15764ab24fd57389a8d219736613484acd7d29e (eauger1/centos-qemu-kvm)
Signed-off-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 64ad06f6eba66c514477f490bcba409439a480d8)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/i386/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
index 55850791df..a1846be6f7 100644
--- a/hw/i386/Kconfig
+++ b/hw/i386/Kconfig
@@ -95,6 +95,7 @@ config Q35
imply E1000E_PCI_EXPRESS
imply VMPORT
imply VMMOUSE
+ imply IOMMUFD
select PC_PCI
select PC_ACPI
select PCI_EXPRESS_Q35
--
2.39.3

View File

@ -0,0 +1,116 @@
From 84f378c41832602dcf9bad6167b1f532c7c53e37 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Tue, 21 Nov 2023 15:03:55 +0100
Subject: [PATCH 048/101] hw/ppc/Kconfig: Imply VFIO_PCI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [47/67] c1a40cdab9bf62b16cb428d57a20b3e0eaa6de38 (eauger1/centos-qemu-kvm)
When the legacy and iommufd backends were introduced, a set of common
vfio-pci routines were exported in pci.c for both backends to use :
vfio_pci_pre_reset
vfio_pci_get_pci_hot_reset_info
vfio_pci_host_match
vfio_pci_post_reset
This introduced a build failure on PPC when --without-default-devices
is use because VFIO is always selected in ppc/Kconfig but VFIO_PCI is
not.
Use an 'imply VFIO_PCI' in ppc/Kconfig and bypass compilation of the
VFIO EEH hooks routines defined in hw/ppc/spapr_pci_vfio.c with
CONFIG_VFIO_PCI.
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 4278df9d1d2383b738338c857406357660f11e42)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/ppc/Kconfig | 2 +-
hw/ppc/spapr_pci_vfio.c | 36 ++++++++++++++++++++++++++++++++++++
2 files changed, 37 insertions(+), 1 deletion(-)
diff --git a/hw/ppc/Kconfig b/hw/ppc/Kconfig
index 56f0475a8e..44263a58c4 100644
--- a/hw/ppc/Kconfig
+++ b/hw/ppc/Kconfig
@@ -3,11 +3,11 @@ config PSERIES
imply PCI_DEVICES
imply TEST_DEVICES
imply VIRTIO_VGA
+ imply VFIO_PCI if LINUX # needed by spapr_pci_vfio.c
select NVDIMM
select DIMM
select PCI
select SPAPR_VSCSI
- select VFIO if LINUX # needed by spapr_pci_vfio.c
select XICS
select XIVE
select MSI_NONBROKEN
diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
index d1d07bec46..76b2a3487b 100644
--- a/hw/ppc/spapr_pci_vfio.c
+++ b/hw/ppc/spapr_pci_vfio.c
@@ -26,10 +26,12 @@
#include "hw/pci/pci_device.h"
#include "hw/vfio/vfio-common.h"
#include "qemu/error-report.h"
+#include CONFIG_DEVICES /* CONFIG_VFIO_PCI */
/*
* Interfaces for IBM EEH (Enhanced Error Handling)
*/
+#ifdef CONFIG_VFIO_PCI
static bool vfio_eeh_container_ok(VFIOContainer *container)
{
/*
@@ -314,3 +316,37 @@ int spapr_phb_vfio_eeh_configure(SpaprPhbState *sphb)
return RTAS_OUT_SUCCESS;
}
+
+#else
+
+bool spapr_phb_eeh_available(SpaprPhbState *sphb)
+{
+ return false;
+}
+
+void spapr_phb_vfio_reset(DeviceState *qdev)
+{
+}
+
+int spapr_phb_vfio_eeh_set_option(SpaprPhbState *sphb,
+ unsigned int addr, int option)
+{
+ return RTAS_OUT_NOT_SUPPORTED;
+}
+
+int spapr_phb_vfio_eeh_get_state(SpaprPhbState *sphb, int *state)
+{
+ return RTAS_OUT_NOT_SUPPORTED;
+}
+
+int spapr_phb_vfio_eeh_reset(SpaprPhbState *sphb, int option)
+{
+ return RTAS_OUT_NOT_SUPPORTED;
+}
+
+int spapr_phb_vfio_eeh_configure(SpaprPhbState *sphb)
+{
+ return RTAS_OUT_NOT_SUPPORTED;
+}
+
+#endif /* CONFIG_VFIO_PCI */
--
2.39.3

View File

@ -0,0 +1,73 @@
From 8f27893a37e55a31180bb66cd9eae7199911881b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Volker=20R=C3=BCmelin?= <vr_qemu@t-online.de>
Date: Fri, 29 Dec 2023 21:38:54 +0100
Subject: [PATCH 060/101] hw/vfio: fix iteration over global VFIODevice list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [59/67] f926e1233c8c5ad418e8794b1a103371c9dc5eb0 (eauger1/centos-qemu-kvm)
Commit 3d779abafe ("vfio/common: Introduce a global VFIODevice list")
introduced a global VFIODevice list, but forgot to update the list
element field name when iterating over the new list. Change the code
to use the correct list element field.
Fixes: 3d779abafe ("vfio/common: Introduce a global VFIODevice list")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2061
Signed-off-by: Volker Rümelin <vr_qemu@t-online.de>
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
(cherry picked from commit 9353b6da430f90e47f352dbf6dc31120c8914da6)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 0d4d8b8416..0b3352f2a9 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -73,7 +73,7 @@ bool vfio_mig_active(void)
return false;
}
- QLIST_FOREACH(vbasedev, &vfio_device_list, next) {
+ QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
if (vbasedev->migration_blocker) {
return false;
}
@@ -94,7 +94,7 @@ static bool vfio_multiple_devices_migration_is_supported(void)
unsigned int device_num = 0;
bool all_support_p2p = true;
- QLIST_FOREACH(vbasedev, &vfio_device_list, next) {
+ QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
if (vbasedev->migration) {
device_num++;
@@ -1366,13 +1366,13 @@ void vfio_reset_handler(void *opaque)
{
VFIODevice *vbasedev;
- QLIST_FOREACH(vbasedev, &vfio_device_list, next) {
+ QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
if (vbasedev->dev->realized) {
vbasedev->ops->vfio_compute_needs_reset(vbasedev);
}
}
- QLIST_FOREACH(vbasedev, &vfio_device_list, next) {
+ QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
if (vbasedev->dev->realized && vbasedev->needs_reset) {
vbasedev->ops->vfio_hot_reset_multi(vbasedev);
}
--
2.39.3

View File

@ -0,0 +1,54 @@
From 51b8f29cddb73eb02f91af5f52a205fdd3af6583 Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Wed, 17 Jan 2024 21:08:59 +0100
Subject: [PATCH 099/101] include/ui/rect.h: fix qemu_rect_init()
mis-assignment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 216: Fix regression in QEMU's virtio-gpu VNC sessions
RH-Jira: RHEL-21570
RH-Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Commit: [1/1] a9d487be04e2c1847b80c479b5cc790af81e3428 (thuth/qemu-kvm-cs9)
JIRA: https://issues.redhat.com/browse/RHEL-21570
commit 9d5b42beb6978dc6219d5dc029c9d453c6b8d503
Author: Elen Avan <elen.avan@bk.ru>
Date: Fri Dec 22 22:17:21 2023 +0300
include/ui/rect.h: fix qemu_rect_init() mis-assignment
Signed-off-by: Elen Avan <elen.avan@bk.ru>
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2051
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2050
Fixes: a200d53b1fde "virtio-gpu: replace PIXMAN for region/rect test"
Cc: qemu-stable@nongnu.org
Reviewed-by: Michael Tokarev <mjt@tls.msk.ru>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
include/ui/rect.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/ui/rect.h b/include/ui/rect.h
index 94898f92d0..68f05d78a8 100644
--- a/include/ui/rect.h
+++ b/include/ui/rect.h
@@ -19,7 +19,7 @@ static inline void qemu_rect_init(QemuRect *rect,
uint16_t width, uint16_t height)
{
rect->x = x;
- rect->y = x;
+ rect->y = y;
rect->width = width;
rect->height = height;
}
--
2.39.3

View File

@ -0,0 +1,105 @@
From 4ab25b33831fa207500179bd30f29388d81e4cce Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:20:10 -0500
Subject: [PATCH 093/101] job: remove outdated AioContext locking comments
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [24/26] 15ff2928be82d6905c22619458487fbb72d6044a (kmwolf/centos-qemu-kvm)
The AioContext lock no longer exists.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-14-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
include/qemu/job.h | 20 --------------------
1 file changed, 20 deletions(-)
diff --git a/include/qemu/job.h b/include/qemu/job.h
index e502787dd8..9ea98b5927 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -67,8 +67,6 @@ typedef struct Job {
/**
* The completion function that will be called when the job completes.
- * Called with AioContext lock held, since many callback implementations
- * use bdrv_* functions that require to hold the lock.
*/
BlockCompletionFunc *cb;
@@ -264,9 +262,6 @@ struct JobDriver {
*
* This callback will not be invoked if the job has already failed.
* If it fails, abort and then clean will be called.
- *
- * Called with AioContext lock held, since many callbacs implementations
- * use bdrv_* functions that require to hold the lock.
*/
int (*prepare)(Job *job);
@@ -277,9 +272,6 @@ struct JobDriver {
*
* All jobs will complete with a call to either .commit() or .abort() but
* never both.
- *
- * Called with AioContext lock held, since many callback implementations
- * use bdrv_* functions that require to hold the lock.
*/
void (*commit)(Job *job);
@@ -290,9 +282,6 @@ struct JobDriver {
*
* All jobs will complete with a call to either .commit() or .abort() but
* never both.
- *
- * Called with AioContext lock held, since many callback implementations
- * use bdrv_* functions that require to hold the lock.
*/
void (*abort)(Job *job);
@@ -301,9 +290,6 @@ struct JobDriver {
* .commit() or .abort(). Regardless of which callback is invoked after
* completion, .clean() will always be called, even if the job does not
* belong to a transaction group.
- *
- * Called with AioContext lock held, since many callbacs implementations
- * use bdrv_* functions that require to hold the lock.
*/
void (*clean)(Job *job);
@@ -318,17 +304,12 @@ struct JobDriver {
* READY).
* (If the callback is NULL, the job is assumed to terminate
* without I/O.)
- *
- * Called with AioContext lock held, since many callback implementations
- * use bdrv_* functions that require to hold the lock.
*/
bool (*cancel)(Job *job, bool force);
/**
* Called when the job is freed.
- * Called with AioContext lock held, since many callback implementations
- * use bdrv_* functions that require to hold the lock.
*/
void (*free)(Job *job);
};
@@ -424,7 +405,6 @@ void job_ref_locked(Job *job);
* Release a reference that was previously acquired with job_ref_locked() or
* job_create(). If it's the last reference to the object, it will be freed.
*
- * Takes AioContext lock internally to invoke a job->driver callback.
* Called with job lock held.
*/
void job_unref_locked(Job *job);
--
2.39.3

View File

@ -0,0 +1,42 @@
From 9c2eb4ab03903bc084c53ac29b60b8d2121c9fed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Tue, 21 Nov 2023 16:44:19 +0800
Subject: [PATCH 040/101] kconfig: Activate IOMMUFD for s390x machines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [39/67] cf0ebe770b8db5916dd35247618c0a325dc1eaab (eauger1/centos-qemu-kvm)
Signed-off-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 73e2df669335047b542b67d37ade060a6ae40dd8)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/s390x/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/hw/s390x/Kconfig b/hw/s390x/Kconfig
index 4c068d7960..26ad104485 100644
--- a/hw/s390x/Kconfig
+++ b/hw/s390x/Kconfig
@@ -6,6 +6,7 @@ config S390_CCW_VIRTIO
imply VFIO_CCW
imply WDT_DIAG288
imply PCIE_DEVICES
+ imply IOMMUFD
select PCI_EXPRESS
select S390_FLIC
select S390_FLIC_KVM if KVM
--
2.39.3

View File

@ -0,0 +1,53 @@
From cd7788a857a6099206c4063e3ef69cb9e4aebcbc Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 21 Dec 2023 14:24:50 -0500
Subject: [PATCH 070/101] nbd/server: avoid per-NBDRequest nbd_client_get/put()
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [1/26] 5acb090ac4adf4260cd9e9c5605a27012b2a33aa (kmwolf/centos-qemu-kvm)
nbd_trip() processes a single NBD request from start to finish and holds
an NBDClient reference throughout. NBDRequest does not outlive the scope
of nbd_trip(). Therefore it is unnecessary to ref/unref NBDClient for
each NBDRequest.
Removing these nbd_client_get()/nbd_client_put() calls will make
thread-safety easier in the commits that follow.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20231221192452.1785567-5-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
nbd/server.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/nbd/server.c b/nbd/server.c
index 895cf0a752..0b09ccc8dc 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -1557,7 +1557,6 @@ static NBDRequestData *nbd_request_get(NBDClient *client)
client->nb_requests++;
req = g_new0(NBDRequestData, 1);
- nbd_client_get(client);
req->client = client;
return req;
}
@@ -1578,8 +1577,6 @@ static void nbd_request_put(NBDRequestData *req)
}
nbd_client_receive_next_request(client);
-
- nbd_client_put(client);
}
static void blk_aio_attached(AioContext *ctx, void *opaque)
--
2.39.3

View File

@ -0,0 +1,373 @@
From bb0a6afff7f23a3ddb460dc1b2e70c06565f8a3f Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 21 Dec 2023 14:24:52 -0500
Subject: [PATCH 072/101] nbd/server: introduce NBDClient->lock to protect
fields
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [3/26] 49b64adaaf8b1c30f339d1ecc8ea89fb9db63f1c (kmwolf/centos-qemu-kvm)
NBDClient has a number of fields that are accessed by both the export
AioContext and the main loop thread. When the AioContext lock is removed
these fields will need another form of protection.
Add NBDClient->lock and protect fields that are accessed by both
threads. Also add assertions where possible and otherwise add doc
comments stating assumptions about which thread and lock holding.
Note this patch moves the client->recv_coroutine assertion from
nbd_co_receive_request() to nbd_trip() where client->lock is held.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20231221192452.1785567-7-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
nbd/server.c | 144 +++++++++++++++++++++++++++++++++++++++------------
1 file changed, 111 insertions(+), 33 deletions(-)
diff --git a/nbd/server.c b/nbd/server.c
index e91e2e0903..941832f178 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -125,23 +125,25 @@ struct NBDClient {
int refcount; /* atomic */
void (*close_fn)(NBDClient *client, bool negotiated);
+ QemuMutex lock;
+
NBDExport *exp;
QCryptoTLSCreds *tlscreds;
char *tlsauthz;
QIOChannelSocket *sioc; /* The underlying data channel */
QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */
- Coroutine *recv_coroutine;
+ Coroutine *recv_coroutine; /* protected by lock */
CoMutex send_lock;
Coroutine *send_coroutine;
- bool read_yielding;
- bool quiescing;
+ bool read_yielding; /* protected by lock */
+ bool quiescing; /* protected by lock */
QTAILQ_ENTRY(NBDClient) next;
- int nb_requests;
- bool closing;
+ int nb_requests; /* protected by lock */
+ bool closing; /* protected by lock */
uint32_t check_align; /* If non-zero, check for aligned client requests */
@@ -1415,11 +1417,18 @@ nbd_read_eof(NBDClient *client, void *buffer, size_t size, Error **errp)
len = qio_channel_readv(client->ioc, &iov, 1, errp);
if (len == QIO_CHANNEL_ERR_BLOCK) {
- client->read_yielding = true;
+ WITH_QEMU_LOCK_GUARD(&client->lock) {
+ client->read_yielding = true;
+
+ /* Prompt main loop thread to re-run nbd_drained_poll() */
+ aio_wait_kick();
+ }
qio_channel_yield(client->ioc, G_IO_IN);
- client->read_yielding = false;
- if (client->quiescing) {
- return -EAGAIN;
+ WITH_QEMU_LOCK_GUARD(&client->lock) {
+ client->read_yielding = false;
+ if (client->quiescing) {
+ return -EAGAIN;
+ }
}
continue;
} else if (len < 0) {
@@ -1528,6 +1537,7 @@ void nbd_client_put(NBDClient *client)
blk_exp_unref(&client->exp->common);
}
g_free(client->contexts.bitmaps);
+ qemu_mutex_destroy(&client->lock);
g_free(client);
}
}
@@ -1561,11 +1571,13 @@ static void client_close(NBDClient *client, bool negotiated)
{
assert(qemu_in_main_thread());
- if (client->closing) {
- return;
- }
+ WITH_QEMU_LOCK_GUARD(&client->lock) {
+ if (client->closing) {
+ return;
+ }
- client->closing = true;
+ client->closing = true;
+ }
/* Force requests to finish. They will drop their own references,
* then we'll close the socket and free the NBDClient.
@@ -1579,6 +1591,7 @@ static void client_close(NBDClient *client, bool negotiated)
}
}
+/* Runs in export AioContext with client->lock held */
static NBDRequestData *nbd_request_get(NBDClient *client)
{
NBDRequestData *req;
@@ -1591,6 +1604,7 @@ static NBDRequestData *nbd_request_get(NBDClient *client)
return req;
}
+/* Runs in export AioContext with client->lock held */
static void nbd_request_put(NBDRequestData *req)
{
NBDClient *client = req->client;
@@ -1614,14 +1628,18 @@ static void blk_aio_attached(AioContext *ctx, void *opaque)
NBDExport *exp = opaque;
NBDClient *client;
+ assert(qemu_in_main_thread());
+
trace_nbd_blk_aio_attached(exp->name, ctx);
exp->common.ctx = ctx;
QTAILQ_FOREACH(client, &exp->clients, next) {
- assert(client->nb_requests == 0);
- assert(client->recv_coroutine == NULL);
- assert(client->send_coroutine == NULL);
+ WITH_QEMU_LOCK_GUARD(&client->lock) {
+ assert(client->nb_requests == 0);
+ assert(client->recv_coroutine == NULL);
+ assert(client->send_coroutine == NULL);
+ }
}
}
@@ -1629,6 +1647,8 @@ static void blk_aio_detach(void *opaque)
{
NBDExport *exp = opaque;
+ assert(qemu_in_main_thread());
+
trace_nbd_blk_aio_detach(exp->name, exp->common.ctx);
exp->common.ctx = NULL;
@@ -1639,8 +1659,12 @@ static void nbd_drained_begin(void *opaque)
NBDExport *exp = opaque;
NBDClient *client;
+ assert(qemu_in_main_thread());
+
QTAILQ_FOREACH(client, &exp->clients, next) {
- client->quiescing = true;
+ WITH_QEMU_LOCK_GUARD(&client->lock) {
+ client->quiescing = true;
+ }
}
}
@@ -1649,28 +1673,48 @@ static void nbd_drained_end(void *opaque)
NBDExport *exp = opaque;
NBDClient *client;
+ assert(qemu_in_main_thread());
+
QTAILQ_FOREACH(client, &exp->clients, next) {
- client->quiescing = false;
- nbd_client_receive_next_request(client);
+ WITH_QEMU_LOCK_GUARD(&client->lock) {
+ client->quiescing = false;
+ nbd_client_receive_next_request(client);
+ }
}
}
+/* Runs in export AioContext */
+static void nbd_wake_read_bh(void *opaque)
+{
+ NBDClient *client = opaque;
+ qio_channel_wake_read(client->ioc);
+}
+
static bool nbd_drained_poll(void *opaque)
{
NBDExport *exp = opaque;
NBDClient *client;
+ assert(qemu_in_main_thread());
+
QTAILQ_FOREACH(client, &exp->clients, next) {
- if (client->nb_requests != 0) {
- /*
- * If there's a coroutine waiting for a request on nbd_read_eof()
- * enter it here so we don't depend on the client to wake it up.
- */
- if (client->recv_coroutine != NULL && client->read_yielding) {
- qio_channel_wake_read(client->ioc);
- }
+ WITH_QEMU_LOCK_GUARD(&client->lock) {
+ if (client->nb_requests != 0) {
+ /*
+ * If there's a coroutine waiting for a request on nbd_read_eof()
+ * enter it here so we don't depend on the client to wake it up.
+ *
+ * Schedule a BH in the export AioContext to avoid missing the
+ * wake up due to the race between qio_channel_wake_read() and
+ * qio_channel_yield().
+ */
+ if (client->recv_coroutine != NULL && client->read_yielding) {
+ aio_bh_schedule_oneshot(nbd_export_aio_context(client->exp),
+ nbd_wake_read_bh, client);
+ }
- return true;
+ return true;
+ }
}
}
@@ -1681,6 +1725,8 @@ static void nbd_eject_notifier(Notifier *n, void *data)
{
NBDExport *exp = container_of(n, NBDExport, eject_notifier);
+ assert(qemu_in_main_thread());
+
blk_exp_request_shutdown(&exp->common);
}
@@ -2566,7 +2612,6 @@ static int coroutine_fn nbd_co_receive_request(NBDRequestData *req,
int ret;
g_assert(qemu_in_coroutine());
- assert(client->recv_coroutine == qemu_coroutine_self());
ret = nbd_receive_request(client, request, errp);
if (ret < 0) {
return ret;
@@ -2975,6 +3020,9 @@ static coroutine_fn void nbd_trip(void *opaque)
*/
trace_nbd_trip();
+
+ qemu_mutex_lock(&client->lock);
+
if (client->closing) {
goto done;
}
@@ -2990,7 +3038,21 @@ static coroutine_fn void nbd_trip(void *opaque)
}
req = nbd_request_get(client);
- ret = nbd_co_receive_request(req, &request, &local_err);
+
+ /*
+ * nbd_co_receive_request() returns -EAGAIN when nbd_drained_begin() has
+ * set client->quiescing but by the time we get back nbd_drained_end() may
+ * have already cleared client->quiescing. In that case we try again
+ * because nothing else will spawn an nbd_trip() coroutine until we set
+ * client->recv_coroutine = NULL further down.
+ */
+ do {
+ assert(client->recv_coroutine == qemu_coroutine_self());
+ qemu_mutex_unlock(&client->lock);
+ ret = nbd_co_receive_request(req, &request, &local_err);
+ qemu_mutex_lock(&client->lock);
+ } while (ret == -EAGAIN && !client->quiescing);
+
client->recv_coroutine = NULL;
if (client->closing) {
@@ -3002,15 +3064,16 @@ static coroutine_fn void nbd_trip(void *opaque)
}
if (ret == -EAGAIN) {
- assert(client->quiescing);
goto done;
}
nbd_client_receive_next_request(client);
+
if (ret == -EIO) {
goto disconnect;
}
+ qemu_mutex_unlock(&client->lock);
qio_channel_set_cork(client->ioc, true);
if (ret < 0) {
@@ -3030,6 +3093,10 @@ static coroutine_fn void nbd_trip(void *opaque)
g_free(request.contexts->bitmaps);
g_free(request.contexts);
}
+
+ qio_channel_set_cork(client->ioc, false);
+ qemu_mutex_lock(&client->lock);
+
if (ret < 0) {
error_prepend(&local_err, "Failed to send reply: ");
goto disconnect;
@@ -3044,11 +3111,13 @@ static coroutine_fn void nbd_trip(void *opaque)
goto disconnect;
}
- qio_channel_set_cork(client->ioc, false);
done:
if (req) {
nbd_request_put(req);
}
+
+ qemu_mutex_unlock(&client->lock);
+
if (!nbd_client_put_nonzero(client)) {
aio_co_reschedule_self(qemu_get_aio_context());
nbd_client_put(client);
@@ -3059,13 +3128,19 @@ disconnect:
if (local_err) {
error_reportf_err(local_err, "Disconnect client, due to: ");
}
+
nbd_request_put(req);
+ qemu_mutex_unlock(&client->lock);
aio_co_reschedule_self(qemu_get_aio_context());
client_close(client, true);
nbd_client_put(client);
}
+/*
+ * Runs in export AioContext and main loop thread. Caller must hold
+ * client->lock.
+ */
static void nbd_client_receive_next_request(NBDClient *client)
{
if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS &&
@@ -3091,7 +3166,9 @@ static coroutine_fn void nbd_co_client_start(void *opaque)
return;
}
- nbd_client_receive_next_request(client);
+ WITH_QEMU_LOCK_GUARD(&client->lock) {
+ nbd_client_receive_next_request(client);
+ }
}
/*
@@ -3108,6 +3185,7 @@ void nbd_client_new(QIOChannelSocket *sioc,
Coroutine *co;
client = g_new0(NBDClient, 1);
+ qemu_mutex_init(&client->lock);
client->refcount = 1;
client->tlscreds = tlscreds;
if (tlscreds) {
--
2.39.3

View File

@ -0,0 +1,176 @@
From 8b60d72532b6511b41d82d591fb4f509314ef15f Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 21 Dec 2023 14:24:51 -0500
Subject: [PATCH 071/101] nbd/server: only traverse NBDExport->clients from
main loop thread
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [2/26] e7794a3a5c363c7508ee505c4ba03d9ef8862ca9 (kmwolf/centos-qemu-kvm)
The NBD clients list is currently accessed from both the export
AioContext and the main loop thread. When the AioContext lock is removed
there will be nothing protecting the clients list.
Adding a lock around the clients list is tricky because NBDClient
structs are refcounted and may be freed from the export AioContext or
the main loop thread. nbd_export_request_shutdown() -> client_close() ->
nbd_client_put() is also tricky because the list lock would be held
while indirectly dropping references to NDBClients.
A simpler approach is to only allow nbd_client_put() and client_close()
calls from the main loop thread. Then the NBD clients list is only
accessed from the main loop thread and no fancy locking is needed.
nbd_trip() just needs to reschedule itself in the main loop AioContext
before calling nbd_client_put() and client_close(). This costs more CPU
cycles per NBD request so add nbd_client_put_nonzero() to optimize the
common case where more references to NBDClient remain.
Note that nbd_client_get() can still be called from either thread, so
make NBDClient->refcount atomic.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20231221192452.1785567-6-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
nbd/server.c | 61 +++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 51 insertions(+), 10 deletions(-)
diff --git a/nbd/server.c b/nbd/server.c
index 0b09ccc8dc..e91e2e0903 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -122,7 +122,7 @@ struct NBDMetaContexts {
};
struct NBDClient {
- int refcount;
+ int refcount; /* atomic */
void (*close_fn)(NBDClient *client, bool negotiated);
NBDExport *exp;
@@ -1501,14 +1501,17 @@ static int coroutine_fn nbd_receive_request(NBDClient *client, NBDRequest *reque
#define MAX_NBD_REQUESTS 16
+/* Runs in export AioContext and main loop thread */
void nbd_client_get(NBDClient *client)
{
- client->refcount++;
+ qatomic_inc(&client->refcount);
}
void nbd_client_put(NBDClient *client)
{
- if (--client->refcount == 0) {
+ assert(qemu_in_main_thread());
+
+ if (qatomic_fetch_dec(&client->refcount) == 1) {
/* The last reference should be dropped by client->close,
* which is called by client_close.
*/
@@ -1529,8 +1532,35 @@ void nbd_client_put(NBDClient *client)
}
}
+/*
+ * Tries to release the reference to @client, but only if other references
+ * remain. This is an optimization for the common case where we want to avoid
+ * the expense of scheduling nbd_client_put() in the main loop thread.
+ *
+ * Returns true upon success or false if the reference was not released because
+ * it is the last reference.
+ */
+static bool nbd_client_put_nonzero(NBDClient *client)
+{
+ int old = qatomic_read(&client->refcount);
+ int expected;
+
+ do {
+ if (old == 1) {
+ return false;
+ }
+
+ expected = old;
+ old = qatomic_cmpxchg(&client->refcount, expected, expected - 1);
+ } while (old != expected);
+
+ return true;
+}
+
static void client_close(NBDClient *client, bool negotiated)
{
+ assert(qemu_in_main_thread());
+
if (client->closing) {
return;
}
@@ -2933,15 +2963,20 @@ static coroutine_fn int nbd_handle_request(NBDClient *client,
static coroutine_fn void nbd_trip(void *opaque)
{
NBDClient *client = opaque;
- NBDRequestData *req;
+ NBDRequestData *req = NULL;
NBDRequest request = { 0 }; /* GCC thinks it can be used uninitialized */
int ret;
Error *local_err = NULL;
+ /*
+ * Note that nbd_client_put() and client_close() must be called from the
+ * main loop thread. Use aio_co_reschedule_self() to switch AioContext
+ * before calling these functions.
+ */
+
trace_nbd_trip();
if (client->closing) {
- nbd_client_put(client);
- return;
+ goto done;
}
if (client->quiescing) {
@@ -2949,10 +2984,9 @@ static coroutine_fn void nbd_trip(void *opaque)
* We're switching between AIO contexts. Don't attempt to receive a new
* request and kick the main context which may be waiting for us.
*/
- nbd_client_put(client);
client->recv_coroutine = NULL;
aio_wait_kick();
- return;
+ goto done;
}
req = nbd_request_get(client);
@@ -3012,8 +3046,13 @@ static coroutine_fn void nbd_trip(void *opaque)
qio_channel_set_cork(client->ioc, false);
done:
- nbd_request_put(req);
- nbd_client_put(client);
+ if (req) {
+ nbd_request_put(req);
+ }
+ if (!nbd_client_put_nonzero(client)) {
+ aio_co_reschedule_self(qemu_get_aio_context());
+ nbd_client_put(client);
+ }
return;
disconnect:
@@ -3021,6 +3060,8 @@ disconnect:
error_reportf_err(local_err, "Disconnect client, due to: ");
}
nbd_request_put(req);
+
+ aio_co_reschedule_self(qemu_get_aio_context());
client_close(client, true);
nbd_client_put(client);
}
--
2.39.3

View File

@ -0,0 +1,106 @@
From 64b0180f5a52668f8ac4c444ba369231dbc4d5b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Mon, 22 Jan 2024 09:25:53 +0100
Subject: [PATCH 096/101] s390x/pci: avoid double enable/disable of aif
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Cédric Le Goater <clg@redhat.com>
RH-MergeRequest: 215: s390x: Fix reset ordering of passthrough ISM devices
RH-Jira: RHEL-21169
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
RH-Commit: [1/3] ebdf8a474ea21486f5ec051683f17bae6d20f675 (clegoate/qemu-kvm-c9s)
JIRA: https://issues.redhat.com/browse/RHEL-21169
commit 07b2c8e034d80ff92e202405c494d2ff80fcf848
Author: Matthew Rosato <mjrosato@linux.ibm.com>
Date: Thu Jan 18 13:51:49 2024 -0500
s390x/pci: avoid double enable/disable of aif
Use a flag to keep track of whether AIF is currently enabled. This can be
used to avoid enabling/disabling AIF multiple times as well as to determine
whether or not it should be disabled during reset processing.
Fixes: d0bc7091c2 ("s390x/pci: enable adapter event notification for interpreted devices")
Reported-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
Message-ID: <20240118185151.265329-2-mjrosato@linux.ibm.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
---
hw/s390x/s390-pci-kvm.c | 25 +++++++++++++++++++++++--
include/hw/s390x/s390-pci-bus.h | 1 +
2 files changed, 24 insertions(+), 2 deletions(-)
diff --git a/hw/s390x/s390-pci-kvm.c b/hw/s390x/s390-pci-kvm.c
index ff41e4106d..1ee510436c 100644
--- a/hw/s390x/s390-pci-kvm.c
+++ b/hw/s390x/s390-pci-kvm.c
@@ -27,6 +27,7 @@ bool s390_pci_kvm_interp_allowed(void)
int s390_pci_kvm_aif_enable(S390PCIBusDevice *pbdev, ZpciFib *fib, bool assist)
{
+ int rc;
struct kvm_s390_zpci_op args = {
.fh = pbdev->fh,
.op = KVM_S390_ZPCIOP_REG_AEN,
@@ -38,15 +39,35 @@ int s390_pci_kvm_aif_enable(S390PCIBusDevice *pbdev, ZpciFib *fib, bool assist)
.u.reg_aen.flags = (assist) ? 0 : KVM_S390_ZPCIOP_REGAEN_HOST
};
- return kvm_vm_ioctl(kvm_state, KVM_S390_ZPCI_OP, &args);
+ if (pbdev->aif) {
+ return -EINVAL;
+ }
+
+ rc = kvm_vm_ioctl(kvm_state, KVM_S390_ZPCI_OP, &args);
+ if (rc == 0) {
+ pbdev->aif = true;
+ }
+
+ return rc;
}
int s390_pci_kvm_aif_disable(S390PCIBusDevice *pbdev)
{
+ int rc;
+
struct kvm_s390_zpci_op args = {
.fh = pbdev->fh,
.op = KVM_S390_ZPCIOP_DEREG_AEN
};
- return kvm_vm_ioctl(kvm_state, KVM_S390_ZPCI_OP, &args);
+ if (!pbdev->aif) {
+ return -EINVAL;
+ }
+
+ rc = kvm_vm_ioctl(kvm_state, KVM_S390_ZPCI_OP, &args);
+ if (rc == 0) {
+ pbdev->aif = false;
+ }
+
+ return rc;
}
diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h
index b1bdbeaeb5..435e788867 100644
--- a/include/hw/s390x/s390-pci-bus.h
+++ b/include/hw/s390x/s390-pci-bus.h
@@ -361,6 +361,7 @@ struct S390PCIBusDevice {
bool unplug_requested;
bool interp;
bool forwarding_assist;
+ bool aif;
QTAILQ_ENTRY(S390PCIBusDevice) link;
};
--
2.39.3

View File

@ -0,0 +1,137 @@
From c885b17e09ab19a3e8d3b2e1765963811af6f764 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Mon, 22 Jan 2024 09:25:53 +0100
Subject: [PATCH 098/101] s390x/pci: drive ISM reset from subsystem reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Cédric Le Goater <clg@redhat.com>
RH-MergeRequest: 215: s390x: Fix reset ordering of passthrough ISM devices
RH-Jira: RHEL-21169
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
RH-Commit: [3/3] 426cf156a2c67e6dcd7483a769fa3741e2700504 (clegoate/qemu-kvm-c9s)
JIRA: https://issues.redhat.com/browse/RHEL-21169
commit 68c691ca99a2538d6a53a70ce8a9ce06ee307ff1
Author: Matthew Rosato <mjrosato@linux.ibm.com>
Date: Thu Jan 18 13:51:51 2024 -0500
s390x/pci: drive ISM reset from subsystem reset
ISM devices are sensitive to manipulation of the IOMMU, so the ISM device
needs to be reset before the vfio-pci device is reset (triggering a full
UNMAP). In order to ensure this occurs, trigger ISM device resets from
subsystem_reset before triggering the PCI bus reset (which will also
trigger vfio-pci reset). This only needs to be done for ISM devices
which were enabled for use by the guest.
Further, ensure that AIF is disabled as part of the reset event.
Fixes: ef1535901a ("s390x: do a subsystem reset before the unprotect on reboot")
Fixes: 03451953c7 ("s390x/pci: reset ISM passthrough devices on shutdown and system reset")
Reported-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
Message-ID: <20240118185151.265329-4-mjrosato@linux.ibm.com>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
---
hw/s390x/s390-pci-bus.c | 26 +++++++++++++++++---------
hw/s390x/s390-virtio-ccw.c | 8 ++++++++
include/hw/s390x/s390-pci-bus.h | 1 +
3 files changed, 26 insertions(+), 9 deletions(-)
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index 347580ebac..3e57d5faca 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -151,20 +151,12 @@ static void s390_pci_shutdown_notifier(Notifier *n, void *opaque)
pci_device_reset(pbdev->pdev);
}
-static void s390_pci_reset_cb(void *opaque)
-{
- S390PCIBusDevice *pbdev = opaque;
-
- pci_device_reset(pbdev->pdev);
-}
-
static void s390_pci_perform_unplug(S390PCIBusDevice *pbdev)
{
HotplugHandler *hotplug_ctrl;
if (pbdev->pft == ZPCI_PFT_ISM) {
notifier_remove(&pbdev->shutdown_notifier);
- qemu_unregister_reset(s390_pci_reset_cb, pbdev);
}
/* Unplug the PCI device */
@@ -1132,7 +1124,6 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
if (pbdev->pft == ZPCI_PFT_ISM) {
pbdev->shutdown_notifier.notify = s390_pci_shutdown_notifier;
qemu_register_shutdown_notifier(&pbdev->shutdown_notifier);
- qemu_register_reset(s390_pci_reset_cb, pbdev);
}
} else {
pbdev->fh |= FH_SHM_EMUL;
@@ -1279,6 +1270,23 @@ static void s390_pci_enumerate_bridge(PCIBus *bus, PCIDevice *pdev,
pci_default_write_config(pdev, PCI_SUBORDINATE_BUS, s->bus_no, 1);
}
+void s390_pci_ism_reset(void)
+{
+ S390pciState *s = s390_get_phb();
+
+ S390PCIBusDevice *pbdev, *next;
+
+ /* Trigger reset event for each passthrough ISM device currently in-use */
+ QTAILQ_FOREACH_SAFE(pbdev, &s->zpci_devs, link, next) {
+ if (pbdev->interp && pbdev->pft == ZPCI_PFT_ISM &&
+ pbdev->fh & FH_MASK_ENABLE) {
+ s390_pci_kvm_aif_disable(pbdev);
+
+ pci_device_reset(pbdev->pdev);
+ }
+ }
+}
+
static void s390_pcihost_reset(DeviceState *dev)
{
S390pciState *s = S390_PCI_HOST_BRIDGE(dev);
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index e26ce26f5a..24f4773179 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -118,6 +118,14 @@ static void subsystem_reset(void)
DeviceState *dev;
int i;
+ /*
+ * ISM firmware is sensitive to unexpected changes to the IOMMU, which can
+ * occur during reset of the vfio-pci device (unmap of entire aperture).
+ * Ensure any passthrough ISM devices are reset now, while CPUs are paused
+ * but before vfio-pci cleanup occurs.
+ */
+ s390_pci_ism_reset();
+
for (i = 0; i < ARRAY_SIZE(reset_dev_types); i++) {
dev = DEVICE(object_resolve_path_type("", reset_dev_types[i], NULL));
if (dev) {
diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h
index 435e788867..2c43ea123f 100644
--- a/include/hw/s390x/s390-pci-bus.h
+++ b/include/hw/s390x/s390-pci-bus.h
@@ -401,5 +401,6 @@ S390PCIBusDevice *s390_pci_find_dev_by_target(S390pciState *s,
const char *target);
S390PCIBusDevice *s390_pci_find_next_avail_dev(S390pciState *s,
S390PCIBusDevice *pbdev);
+void s390_pci_ism_reset(void);
#endif
--
2.39.3

View File

@ -0,0 +1,71 @@
From 49078bdfd4c116da3e920632ec6f7041f1b38015 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Mon, 22 Jan 2024 09:25:53 +0100
Subject: [PATCH 097/101] s390x/pci: refresh fh before disabling aif
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Cédric Le Goater <clg@redhat.com>
RH-MergeRequest: 215: s390x: Fix reset ordering of passthrough ISM devices
RH-Jira: RHEL-21169
RH-Acked-by: Thomas Huth <thuth@redhat.com>
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
RH-Commit: [2/3] 3523067909c41818dfc769abdb93930833416c11 (clegoate/qemu-kvm-c9s)
JIRA: https://issues.redhat.com/browse/RHEL-21169
commit 30e35258e25c75c9d799c34fd89afcafffb37084
Author: Matthew Rosato <mjrosato@linux.ibm.com>
Date: Thu Jan 18 13:51:50 2024 -0500
s390x/pci: refresh fh before disabling aif
Typically we refresh the host fh during CLP enable, however it's possible
that the device goes through multiple reset events before the guest
performs another CLP enable. Let's handle this for now by refreshing the
host handle from vfio before disabling aif.
Fixes: 03451953c7 ("s390x/pci: reset ISM passthrough devices on shutdown and system reset")
Reported-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
Message-ID: <20240118185151.265329-3-mjrosato@linux.ibm.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
---
hw/s390x/s390-pci-kvm.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/hw/s390x/s390-pci-kvm.c b/hw/s390x/s390-pci-kvm.c
index 1ee510436c..9eef4fc3ec 100644
--- a/hw/s390x/s390-pci-kvm.c
+++ b/hw/s390x/s390-pci-kvm.c
@@ -18,6 +18,7 @@
#include "hw/s390x/s390-pci-bus.h"
#include "hw/s390x/s390-pci-kvm.h"
#include "hw/s390x/s390-pci-inst.h"
+#include "hw/s390x/s390-pci-vfio.h"
#include "cpu_models.h"
bool s390_pci_kvm_interp_allowed(void)
@@ -64,6 +65,14 @@ int s390_pci_kvm_aif_disable(S390PCIBusDevice *pbdev)
return -EINVAL;
}
+ /*
+ * The device may have already been reset but we still want to relinquish
+ * the guest ISC, so always be sure to use an up-to-date host fh.
+ */
+ if (!s390_pci_get_host_fh(pbdev, &args.fh)) {
+ return -EPERM;
+ }
+
rc = kvm_vm_ioctl(kvm_state, KVM_S390_ZPCI_OP, &args);
if (rc == 0) {
pbdev->aif = false;
--
2.39.3

View File

@ -0,0 +1,88 @@
From cd08d22a0da022d99fe6cfddb7de680abf66c8be Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:19:59 -0500
Subject: [PATCH 082/101] scsi: assert that callbacks run in the correct
AioContext
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [13/26] d2fd5065c3b72d9d2f4e37efee39fe12eba0f0a9 (kmwolf/centos-qemu-kvm)
Since the removal of AioContext locking, the correctness of the code
relies on running requests from a single AioContext at any given time.
Add assertions that verify that callbacks are invoked in the correct
AioContext.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20231205182011.1976568-3-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/scsi/scsi-disk.c | 14 ++++++++++++++
system/dma-helpers.c | 3 +++
2 files changed, 17 insertions(+)
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 2c1bbb3530..a5048e0aaf 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -273,6 +273,10 @@ static void scsi_aio_complete(void *opaque, int ret)
SCSIDiskReq *r = (SCSIDiskReq *)opaque;
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+ /* The request must only run in the BlockBackend's AioContext */
+ assert(blk_get_aio_context(s->qdev.conf.blk) ==
+ qemu_get_current_aio_context());
+
assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;
@@ -370,8 +374,13 @@ static void scsi_dma_complete(void *opaque, int ret)
static void scsi_read_complete_noio(SCSIDiskReq *r, int ret)
{
+ SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
uint32_t n;
+ /* The request must only run in the BlockBackend's AioContext */
+ assert(blk_get_aio_context(s->qdev.conf.blk) ==
+ qemu_get_current_aio_context());
+
assert(r->req.aiocb == NULL);
if (scsi_disk_req_check_error(r, ret, false)) {
goto done;
@@ -496,8 +505,13 @@ static void scsi_read_data(SCSIRequest *req)
static void scsi_write_complete_noio(SCSIDiskReq *r, int ret)
{
+ SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
uint32_t n;
+ /* The request must only run in the BlockBackend's AioContext */
+ assert(blk_get_aio_context(s->qdev.conf.blk) ==
+ qemu_get_current_aio_context());
+
assert (r->req.aiocb == NULL);
if (scsi_disk_req_check_error(r, ret, false)) {
goto done;
diff --git a/system/dma-helpers.c b/system/dma-helpers.c
index 528117f256..9b221cf94e 100644
--- a/system/dma-helpers.c
+++ b/system/dma-helpers.c
@@ -119,6 +119,9 @@ static void dma_blk_cb(void *opaque, int ret)
trace_dma_blk_cb(dbs, ret);
+ /* DMAAIOCB is not thread-safe and must be accessed only from dbs->ctx */
+ assert(ctx == qemu_get_current_aio_context());
+
dbs->acb = NULL;
dbs->offset += dbs->iov.size;
--
2.39.3

View File

@ -0,0 +1,245 @@
From d1d384bd24a7aeb527f4abd8a0958146544ef9bb Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon, 4 Dec 2023 11:42:58 -0500
Subject: [PATCH 079/101] scsi: don't lock AioContext in I/O code path
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [10/26] b5814cec94af5c254e300646d8783672b085bac3 (kmwolf/centos-qemu-kvm)
blk_aio_*() doesn't require the AioContext lock and the SCSI subsystem's
internal state also does not anymore.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Acked-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231204164259.1515217-4-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/scsi/scsi-disk.c | 23 -----------------------
hw/scsi/scsi-generic.c | 20 +++-----------------
2 files changed, 3 insertions(+), 40 deletions(-)
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 6691f5edb8..2c1bbb3530 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -273,8 +273,6 @@ static void scsi_aio_complete(void *opaque, int ret)
SCSIDiskReq *r = (SCSIDiskReq *)opaque;
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
- aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
-
assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;
@@ -286,7 +284,6 @@ static void scsi_aio_complete(void *opaque, int ret)
scsi_req_complete(&r->req, GOOD);
done:
- aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
scsi_req_unref(&r->req);
}
@@ -394,8 +391,6 @@ static void scsi_read_complete(void *opaque, int ret)
SCSIDiskReq *r = (SCSIDiskReq *)opaque;
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
- aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
-
assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;
@@ -406,7 +401,6 @@ static void scsi_read_complete(void *opaque, int ret)
trace_scsi_disk_read_complete(r->req.tag, r->qiov.size);
}
scsi_read_complete_noio(r, ret);
- aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
}
/* Actually issue a read to the block device. */
@@ -448,8 +442,6 @@ static void scsi_do_read_cb(void *opaque, int ret)
SCSIDiskReq *r = (SCSIDiskReq *)opaque;
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
- aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
-
assert (r->req.aiocb != NULL);
r->req.aiocb = NULL;
@@ -459,7 +451,6 @@ static void scsi_do_read_cb(void *opaque, int ret)
block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
}
scsi_do_read(opaque, ret);
- aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
}
/* Read more data from scsi device into buffer. */
@@ -533,8 +524,6 @@ static void scsi_write_complete(void * opaque, int ret)
SCSIDiskReq *r = (SCSIDiskReq *)opaque;
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
- aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
-
assert (r->req.aiocb != NULL);
r->req.aiocb = NULL;
@@ -544,7 +533,6 @@ static void scsi_write_complete(void * opaque, int ret)
block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
}
scsi_write_complete_noio(r, ret);
- aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
}
static void scsi_write_data(SCSIRequest *req)
@@ -1742,8 +1730,6 @@ static void scsi_unmap_complete(void *opaque, int ret)
SCSIDiskReq *r = data->r;
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
- aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
-
assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;
@@ -1754,7 +1740,6 @@ static void scsi_unmap_complete(void *opaque, int ret)
block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
scsi_unmap_complete_noio(data, ret);
}
- aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
}
static void scsi_disk_emulate_unmap(SCSIDiskReq *r, uint8_t *inbuf)
@@ -1822,8 +1807,6 @@ static void scsi_write_same_complete(void *opaque, int ret)
SCSIDiskReq *r = data->r;
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
- aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
-
assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;
@@ -1847,7 +1830,6 @@ static void scsi_write_same_complete(void *opaque, int ret)
data->sector << BDRV_SECTOR_BITS,
&data->qiov, 0,
scsi_write_same_complete, data);
- aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
return;
}
@@ -1857,7 +1839,6 @@ done:
scsi_req_unref(&r->req);
qemu_vfree(data->iov.iov_base);
g_free(data);
- aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
}
static void scsi_disk_emulate_write_same(SCSIDiskReq *r, uint8_t *inbuf)
@@ -2810,7 +2791,6 @@ static void scsi_block_sgio_complete(void *opaque, int ret)
{
SCSIBlockReq *req = (SCSIBlockReq *)opaque;
SCSIDiskReq *r = &req->req;
- SCSIDevice *s = r->req.dev;
sg_io_hdr_t *io_hdr = &req->io_header;
if (ret == 0) {
@@ -2827,13 +2807,10 @@ static void scsi_block_sgio_complete(void *opaque, int ret)
}
if (ret > 0) {
- aio_context_acquire(blk_get_aio_context(s->conf.blk));
if (scsi_handle_rw_error(r, ret, true)) {
- aio_context_release(blk_get_aio_context(s->conf.blk));
scsi_req_unref(&r->req);
return;
}
- aio_context_release(blk_get_aio_context(s->conf.blk));
/* Ignore error. */
ret = 0;
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index 2417f0ad84..b7b04e1d63 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -109,15 +109,11 @@ done:
static void scsi_command_complete(void *opaque, int ret)
{
SCSIGenericReq *r = (SCSIGenericReq *)opaque;
- SCSIDevice *s = r->req.dev;
-
- aio_context_acquire(blk_get_aio_context(s->conf.blk));
assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;
scsi_command_complete_noio(r, ret);
- aio_context_release(blk_get_aio_context(s->conf.blk));
}
static int execute_command(BlockBackend *blk,
@@ -274,14 +270,12 @@ static void scsi_read_complete(void * opaque, int ret)
SCSIDevice *s = r->req.dev;
int len;
- aio_context_acquire(blk_get_aio_context(s->conf.blk));
-
assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;
if (ret || r->req.io_canceled) {
scsi_command_complete_noio(r, ret);
- goto done;
+ return;
}
len = r->io_header.dxfer_len - r->io_header.resid;
@@ -320,7 +314,7 @@ static void scsi_read_complete(void * opaque, int ret)
r->io_header.status != GOOD ||
len == 0) {
scsi_command_complete_noio(r, 0);
- goto done;
+ return;
}
/* Snoop READ CAPACITY output to set the blocksize. */
@@ -356,9 +350,6 @@ static void scsi_read_complete(void * opaque, int ret)
req_complete:
scsi_req_data(&r->req, len);
scsi_req_unref(&r->req);
-
-done:
- aio_context_release(blk_get_aio_context(s->conf.blk));
}
/* Read more data from scsi device into buffer. */
@@ -391,14 +382,12 @@ static void scsi_write_complete(void * opaque, int ret)
trace_scsi_generic_write_complete(ret);
- aio_context_acquire(blk_get_aio_context(s->conf.blk));
-
assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;
if (ret || r->req.io_canceled) {
scsi_command_complete_noio(r, ret);
- goto done;
+ return;
}
if (r->req.cmd.buf[0] == MODE_SELECT && r->req.cmd.buf[4] == 12 &&
@@ -408,9 +397,6 @@ static void scsi_write_complete(void * opaque, int ret)
}
scsi_command_complete_noio(r, ret);
-
-done:
- aio_context_release(blk_get_aio_context(s->conf.blk));
}
/* Write data to a scsi device. Returns nonzero on failure.
--
2.39.3

View File

@ -0,0 +1,307 @@
From 42dd1357310bd1a68d6cacaa53cd5b1d1b02880d Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon, 4 Dec 2023 11:42:56 -0500
Subject: [PATCH 077/101] scsi: only access SCSIDevice->requests from one
thread
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [8/26] 9df662e82a63e93d184b5763bebbe7e43bc9dabe (kmwolf/centos-qemu-kvm)
Stop depending on the AioContext lock and instead access
SCSIDevice->requests from only one thread at a time:
- When the VM is running only the BlockBackend's AioContext may access
the requests list.
- When the VM is stopped only the main loop may access the requests
list.
These constraints protect the requests list without the need for locking
in the I/O code path.
Note that multiple IOThreads are not supported yet because the code
assumes all SCSIRequests are executed from a single AioContext. Leave
that as future work.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231204164259.1515217-2-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/scsi/scsi-bus.c | 181 ++++++++++++++++++++++++++++-------------
include/hw/scsi/scsi.h | 7 +-
2 files changed, 131 insertions(+), 57 deletions(-)
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index fc4b77fdb0..b649cdf555 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -85,6 +85,89 @@ SCSIDevice *scsi_device_get(SCSIBus *bus, int channel, int id, int lun)
return d;
}
+/*
+ * Invoke @fn() for each enqueued request in device @s. Must be called from the
+ * main loop thread while the guest is stopped. This is only suitable for
+ * vmstate ->put(), use scsi_device_for_each_req_async() for other cases.
+ */
+static void scsi_device_for_each_req_sync(SCSIDevice *s,
+ void (*fn)(SCSIRequest *, void *),
+ void *opaque)
+{
+ SCSIRequest *req;
+ SCSIRequest *next_req;
+
+ assert(!runstate_is_running());
+ assert(qemu_in_main_thread());
+
+ QTAILQ_FOREACH_SAFE(req, &s->requests, next, next_req) {
+ fn(req, opaque);
+ }
+}
+
+typedef struct {
+ SCSIDevice *s;
+ void (*fn)(SCSIRequest *, void *);
+ void *fn_opaque;
+} SCSIDeviceForEachReqAsyncData;
+
+static void scsi_device_for_each_req_async_bh(void *opaque)
+{
+ g_autofree SCSIDeviceForEachReqAsyncData *data = opaque;
+ SCSIDevice *s = data->s;
+ AioContext *ctx;
+ SCSIRequest *req;
+ SCSIRequest *next;
+
+ /*
+ * If the AioContext changed before this BH was called then reschedule into
+ * the new AioContext before accessing ->requests. This can happen when
+ * scsi_device_for_each_req_async() is called and then the AioContext is
+ * changed before BHs are run.
+ */
+ ctx = blk_get_aio_context(s->conf.blk);
+ if (ctx != qemu_get_current_aio_context()) {
+ aio_bh_schedule_oneshot(ctx, scsi_device_for_each_req_async_bh,
+ g_steal_pointer(&data));
+ return;
+ }
+
+ QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
+ data->fn(req, data->fn_opaque);
+ }
+
+ /* Drop the reference taken by scsi_device_for_each_req_async() */
+ object_unref(OBJECT(s));
+}
+
+/*
+ * Schedule @fn() to be invoked for each enqueued request in device @s. @fn()
+ * runs in the AioContext that is executing the request.
+ */
+static void scsi_device_for_each_req_async(SCSIDevice *s,
+ void (*fn)(SCSIRequest *, void *),
+ void *opaque)
+{
+ assert(qemu_in_main_thread());
+
+ SCSIDeviceForEachReqAsyncData *data =
+ g_new(SCSIDeviceForEachReqAsyncData, 1);
+
+ data->s = s;
+ data->fn = fn;
+ data->fn_opaque = opaque;
+
+ /*
+ * Hold a reference to the SCSIDevice until
+ * scsi_device_for_each_req_async_bh() finishes.
+ */
+ object_ref(OBJECT(s));
+
+ aio_bh_schedule_oneshot(blk_get_aio_context(s->conf.blk),
+ scsi_device_for_each_req_async_bh,
+ data);
+}
+
static void scsi_device_realize(SCSIDevice *s, Error **errp)
{
SCSIDeviceClass *sc = SCSI_DEVICE_GET_CLASS(s);
@@ -144,20 +227,18 @@ void scsi_bus_init_named(SCSIBus *bus, size_t bus_size, DeviceState *host,
qbus_set_bus_hotplug_handler(BUS(bus));
}
-static void scsi_dma_restart_bh(void *opaque)
+void scsi_req_retry(SCSIRequest *req)
{
- SCSIDevice *s = opaque;
- SCSIRequest *req, *next;
-
- qemu_bh_delete(s->bh);
- s->bh = NULL;
+ req->retry = true;
+}
- aio_context_acquire(blk_get_aio_context(s->conf.blk));
- QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
- scsi_req_ref(req);
- if (req->retry) {
- req->retry = false;
- switch (req->cmd.mode) {
+/* Called in the AioContext that is executing the request */
+static void scsi_dma_restart_req(SCSIRequest *req, void *opaque)
+{
+ scsi_req_ref(req);
+ if (req->retry) {
+ req->retry = false;
+ switch (req->cmd.mode) {
case SCSI_XFER_FROM_DEV:
case SCSI_XFER_TO_DEV:
scsi_req_continue(req);
@@ -166,37 +247,22 @@ static void scsi_dma_restart_bh(void *opaque)
scsi_req_dequeue(req);
scsi_req_enqueue(req);
break;
- }
}
- scsi_req_unref(req);
}
- aio_context_release(blk_get_aio_context(s->conf.blk));
- /* Drop the reference that was acquired in scsi_dma_restart_cb */
- object_unref(OBJECT(s));
-}
-
-void scsi_req_retry(SCSIRequest *req)
-{
- /* No need to save a reference, because scsi_dma_restart_bh just
- * looks at the request list. */
- req->retry = true;
+ scsi_req_unref(req);
}
static void scsi_dma_restart_cb(void *opaque, bool running, RunState state)
{
SCSIDevice *s = opaque;
+ assert(qemu_in_main_thread());
+
if (!running) {
return;
}
- if (!s->bh) {
- AioContext *ctx = blk_get_aio_context(s->conf.blk);
- /* The reference is dropped in scsi_dma_restart_bh.*/
- object_ref(OBJECT(s));
- s->bh = aio_bh_new_guarded(ctx, scsi_dma_restart_bh, s,
- &DEVICE(s)->mem_reentrancy_guard);
- qemu_bh_schedule(s->bh);
- }
+
+ scsi_device_for_each_req_async(s, scsi_dma_restart_req, NULL);
}
static bool scsi_bus_is_address_free(SCSIBus *bus,
@@ -1657,15 +1723,16 @@ void scsi_device_set_ua(SCSIDevice *sdev, SCSISense sense)
}
}
+static void scsi_device_purge_one_req(SCSIRequest *req, void *opaque)
+{
+ scsi_req_cancel_async(req, NULL);
+}
+
void scsi_device_purge_requests(SCSIDevice *sdev, SCSISense sense)
{
- SCSIRequest *req;
+ scsi_device_for_each_req_async(sdev, scsi_device_purge_one_req, NULL);
aio_context_acquire(blk_get_aio_context(sdev->conf.blk));
- while (!QTAILQ_EMPTY(&sdev->requests)) {
- req = QTAILQ_FIRST(&sdev->requests);
- scsi_req_cancel_async(req, NULL);
- }
blk_drain(sdev->conf.blk);
aio_context_release(blk_get_aio_context(sdev->conf.blk));
scsi_device_set_ua(sdev, sense);
@@ -1737,31 +1804,33 @@ static char *scsibus_get_fw_dev_path(DeviceState *dev)
/* SCSI request list. For simplicity, pv points to the whole device */
+static void put_scsi_req(SCSIRequest *req, void *opaque)
+{
+ QEMUFile *f = opaque;
+
+ assert(!req->io_canceled);
+ assert(req->status == -1 && req->host_status == -1);
+ assert(req->enqueued);
+
+ qemu_put_sbyte(f, req->retry ? 1 : 2);
+ qemu_put_buffer(f, req->cmd.buf, sizeof(req->cmd.buf));
+ qemu_put_be32s(f, &req->tag);
+ qemu_put_be32s(f, &req->lun);
+ if (req->bus->info->save_request) {
+ req->bus->info->save_request(f, req);
+ }
+ if (req->ops->save_request) {
+ req->ops->save_request(f, req);
+ }
+}
+
static int put_scsi_requests(QEMUFile *f, void *pv, size_t size,
const VMStateField *field, JSONWriter *vmdesc)
{
SCSIDevice *s = pv;
- SCSIBus *bus = DO_UPCAST(SCSIBus, qbus, s->qdev.parent_bus);
- SCSIRequest *req;
- QTAILQ_FOREACH(req, &s->requests, next) {
- assert(!req->io_canceled);
- assert(req->status == -1 && req->host_status == -1);
- assert(req->enqueued);
-
- qemu_put_sbyte(f, req->retry ? 1 : 2);
- qemu_put_buffer(f, req->cmd.buf, sizeof(req->cmd.buf));
- qemu_put_be32s(f, &req->tag);
- qemu_put_be32s(f, &req->lun);
- if (bus->info->save_request) {
- bus->info->save_request(f, req);
- }
- if (req->ops->save_request) {
- req->ops->save_request(f, req);
- }
- }
+ scsi_device_for_each_req_sync(s, put_scsi_req, f);
qemu_put_sbyte(f, 0);
-
return 0;
}
diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
index 3692ca82f3..10c4e8288d 100644
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@@ -69,14 +69,19 @@ struct SCSIDevice
{
DeviceState qdev;
VMChangeStateEntry *vmsentry;
- QEMUBH *bh;
uint32_t id;
BlockConf conf;
SCSISense unit_attention;
bool sense_is_ua;
uint8_t sense[SCSI_SENSE_BUF_SIZE];
uint32_t sense_len;
+
+ /*
+ * The requests list is only accessed from the AioContext that executes
+ * requests or from the main loop when IOThread processing is stopped.
+ */
QTAILQ_HEAD(, SCSIRequest) requests;
+
uint32_t channel;
uint32_t lun;
int blocksize;
--
2.39.3

View File

@ -0,0 +1,280 @@
From 61d605433a5edfcc7fe836fd399106ed1e1907bb Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:20:05 -0500
Subject: [PATCH 088/101] scsi: remove AioContext locking
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [19/26] 12a8e26670074a17dd2b0cfac06e0aea03b3068f (kmwolf/centos-qemu-kvm)
The AioContext lock no longer has any effect. Remove it.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-9-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/scsi/scsi-bus.c | 2 --
hw/scsi/scsi-disk.c | 31 +++++--------------------------
hw/scsi/virtio-scsi.c | 18 ------------------
include/hw/virtio/virtio-scsi.h | 14 --------------
4 files changed, 5 insertions(+), 60 deletions(-)
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index b649cdf555..5b08cbf60a 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -1732,9 +1732,7 @@ void scsi_device_purge_requests(SCSIDevice *sdev, SCSISense sense)
{
scsi_device_for_each_req_async(sdev, scsi_device_purge_one_req, NULL);
- aio_context_acquire(blk_get_aio_context(sdev->conf.blk));
blk_drain(sdev->conf.blk);
- aio_context_release(blk_get_aio_context(sdev->conf.blk));
scsi_device_set_ua(sdev, sense);
}
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index a5048e0aaf..61be3d395a 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -2339,14 +2339,10 @@ static void scsi_disk_reset(DeviceState *dev)
{
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev.qdev, dev);
uint64_t nb_sectors;
- AioContext *ctx;
scsi_device_purge_requests(&s->qdev, SENSE_CODE(RESET));
- ctx = blk_get_aio_context(s->qdev.conf.blk);
- aio_context_acquire(ctx);
blk_get_geometry(s->qdev.conf.blk, &nb_sectors);
- aio_context_release(ctx);
nb_sectors /= s->qdev.blocksize / BDRV_SECTOR_SIZE;
if (nb_sectors) {
@@ -2545,15 +2541,13 @@ static void scsi_unrealize(SCSIDevice *dev)
static void scsi_hd_realize(SCSIDevice *dev, Error **errp)
{
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, dev);
- AioContext *ctx = NULL;
+
/* can happen for devices without drive. The error message for missing
* backend will be issued in scsi_realize
*/
if (s->qdev.conf.blk) {
- ctx = blk_get_aio_context(s->qdev.conf.blk);
- aio_context_acquire(ctx);
if (!blkconf_blocksizes(&s->qdev.conf, errp)) {
- goto out;
+ return;
}
}
s->qdev.blocksize = s->qdev.conf.logical_block_size;
@@ -2562,16 +2556,11 @@ static void scsi_hd_realize(SCSIDevice *dev, Error **errp)
s->product = g_strdup("QEMU HARDDISK");
}
scsi_realize(&s->qdev, errp);
-out:
- if (ctx) {
- aio_context_release(ctx);
- }
}
static void scsi_cd_realize(SCSIDevice *dev, Error **errp)
{
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, dev);
- AioContext *ctx;
int ret;
uint32_t blocksize = 2048;
@@ -2587,8 +2576,6 @@ static void scsi_cd_realize(SCSIDevice *dev, Error **errp)
blocksize = dev->conf.physical_block_size;
}
- ctx = blk_get_aio_context(dev->conf.blk);
- aio_context_acquire(ctx);
s->qdev.blocksize = blocksize;
s->qdev.type = TYPE_ROM;
s->features |= 1 << SCSI_DISK_F_REMOVABLE;
@@ -2596,7 +2583,6 @@ static void scsi_cd_realize(SCSIDevice *dev, Error **errp)
s->product = g_strdup("QEMU CD-ROM");
}
scsi_realize(&s->qdev, errp);
- aio_context_release(ctx);
}
@@ -2727,7 +2713,6 @@ static int get_device_type(SCSIDiskState *s)
static void scsi_block_realize(SCSIDevice *dev, Error **errp)
{
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, dev);
- AioContext *ctx;
int sg_version;
int rc;
@@ -2742,9 +2727,6 @@ static void scsi_block_realize(SCSIDevice *dev, Error **errp)
"be removed in a future version");
}
- ctx = blk_get_aio_context(s->qdev.conf.blk);
- aio_context_acquire(ctx);
-
/* check we are using a driver managing SG_IO (version 3 and after) */
rc = blk_ioctl(s->qdev.conf.blk, SG_GET_VERSION_NUM, &sg_version);
if (rc < 0) {
@@ -2752,18 +2734,18 @@ static void scsi_block_realize(SCSIDevice *dev, Error **errp)
if (rc != -EPERM) {
error_append_hint(errp, "Is this a SCSI device?\n");
}
- goto out;
+ return;
}
if (sg_version < 30000) {
error_setg(errp, "scsi generic interface too old");
- goto out;
+ return;
}
/* get device type from INQUIRY data */
rc = get_device_type(s);
if (rc < 0) {
error_setg(errp, "INQUIRY failed");
- goto out;
+ return;
}
/* Make a guess for the block size, we'll fix it when the guest sends.
@@ -2783,9 +2765,6 @@ static void scsi_block_realize(SCSIDevice *dev, Error **errp)
scsi_realize(&s->qdev, errp);
scsi_generic_read_device_inquiry(&s->qdev);
-
-out:
- aio_context_release(ctx);
}
typedef struct SCSIBlockReq {
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 4f8d35facc..ca365a70e9 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -642,9 +642,7 @@ static void virtio_scsi_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
return;
}
- virtio_scsi_acquire(s);
virtio_scsi_handle_ctrl_vq(s, vq);
- virtio_scsi_release(s);
}
static void virtio_scsi_complete_cmd_req(VirtIOSCSIReq *req)
@@ -882,9 +880,7 @@ static void virtio_scsi_handle_cmd(VirtIODevice *vdev, VirtQueue *vq)
return;
}
- virtio_scsi_acquire(s);
virtio_scsi_handle_cmd_vq(s, vq);
- virtio_scsi_release(s);
}
static void virtio_scsi_get_config(VirtIODevice *vdev,
@@ -1031,9 +1027,7 @@ static void virtio_scsi_handle_event(VirtIODevice *vdev, VirtQueue *vq)
return;
}
- virtio_scsi_acquire(s);
virtio_scsi_handle_event_vq(s, vq);
- virtio_scsi_release(s);
}
static void virtio_scsi_change(SCSIBus *bus, SCSIDevice *dev, SCSISense sense)
@@ -1052,9 +1046,7 @@ static void virtio_scsi_change(SCSIBus *bus, SCSIDevice *dev, SCSISense sense)
},
};
- virtio_scsi_acquire(s);
virtio_scsi_push_event(s, &info);
- virtio_scsi_release(s);
}
}
@@ -1071,17 +1063,13 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
VirtIODevice *vdev = VIRTIO_DEVICE(hotplug_dev);
VirtIOSCSI *s = VIRTIO_SCSI(vdev);
SCSIDevice *sd = SCSI_DEVICE(dev);
- AioContext *old_context;
int ret;
if (s->ctx && !s->dataplane_fenced) {
if (blk_op_is_blocked(sd->conf.blk, BLOCK_OP_TYPE_DATAPLANE, errp)) {
return;
}
- old_context = blk_get_aio_context(sd->conf.blk);
- aio_context_acquire(old_context);
ret = blk_set_aio_context(sd->conf.blk, s->ctx, errp);
- aio_context_release(old_context);
if (ret < 0) {
return;
}
@@ -1097,10 +1085,8 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
},
};
- virtio_scsi_acquire(s);
virtio_scsi_push_event(s, &info);
scsi_bus_set_ua(&s->bus, SENSE_CODE(REPORTED_LUNS_CHANGED));
- virtio_scsi_release(s);
}
}
@@ -1122,17 +1108,13 @@ static void virtio_scsi_hotunplug(HotplugHandler *hotplug_dev, DeviceState *dev,
qdev_simple_device_unplug_cb(hotplug_dev, dev, errp);
if (s->ctx) {
- virtio_scsi_acquire(s);
/* If other users keep the BlockBackend in the iothread, that's ok */
blk_set_aio_context(sd->conf.blk, qemu_get_aio_context(), NULL);
- virtio_scsi_release(s);
}
if (virtio_vdev_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) {
- virtio_scsi_acquire(s);
virtio_scsi_push_event(s, &info);
scsi_bus_set_ua(&s->bus, SENSE_CODE(REPORTED_LUNS_CHANGED));
- virtio_scsi_release(s);
}
}
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
index da8cb928d9..7f0573b1bf 100644
--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h
@@ -101,20 +101,6 @@ struct VirtIOSCSI {
uint32_t host_features;
};
-static inline void virtio_scsi_acquire(VirtIOSCSI *s)
-{
- if (s->ctx) {
- aio_context_acquire(s->ctx);
- }
-}
-
-static inline void virtio_scsi_release(VirtIOSCSI *s)
-{
- if (s->ctx) {
- aio_context_release(s->ctx);
- }
-}
-
void virtio_scsi_common_realize(DeviceState *dev,
VirtIOHandleOutput ctrl,
VirtIOHandleOutput evt,
--
2.39.3

View File

@ -0,0 +1,41 @@
From 9f5c6dbe907fe6227006ab51179eaa50a63559cb Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:20:09 -0500
Subject: [PATCH 092/101] scsi: remove outdated AioContext lock comment
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [23/26] 96e2e7d2e6a160ce4d695060f902d21030b3b1d8 (kmwolf/centos-qemu-kvm)
The SCSI subsystem no longer uses the AioContext lock. Request
processing runs exclusively in the BlockBackend's AioContext since
"scsi: only access SCSIDevice->requests from one thread" and hence the
lock is unnecessary.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20231205182011.1976568-13-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/scsi/scsi-disk.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 61be3d395a..2e7e1e9a1c 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -355,7 +355,6 @@ done:
scsi_req_unref(&r->req);
}
-/* Called with AioContext lock held */
static void scsi_dma_complete(void *opaque, int ret)
{
SCSIDiskReq *r = (SCSIDiskReq *)opaque;
--
2.39.3

View File

@ -0,0 +1,205 @@
From cc8d794932e26df7c7f3c8cc0c1f42da8d52f12b Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Mon, 15 Jan 2024 10:26:52 +0100
Subject: [PATCH 069/101] target/s390x/kvm/pv: Provide some more useful
information if decryption fails
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Thomas Huth <thuth@redhat.com>
RH-MergeRequest: 213: s390x: Provide some more useful information if decryption of a PV image fails
RH-Jira: RHEL-18212
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
RH-Commit: [1/1] 4ffb61869f7df33e23d3e0ebf8c29e386e3f6cbc (thuth/qemu-kvm-cs9)
JIRA: https://issues.redhat.com/browse/RHEL-18212
commit 7af51621b16ae86646cc2dc9dee30de8176ff761
Author: Thomas Huth <thuth@redhat.com>
Date: Wed Jan 10 15:29:16 2024 +0100
target/s390x/kvm/pv: Provide some more useful information if decryption fails
It's a common scenario to copy guest images from one host to another
to run the guest on the other machine. This (of course) does not work
with "secure execution" guests since they are encrypted with one certain
host key. However, if you still (accidentally) do it, you only get a
very user-unfriendly error message that looks like this:
qemu-system-s390x: KVM PV command 2 (KVM_PV_SET_SEC_PARMS) failed:
header rc 108 rrc 5 IOCTL rc: -22
Let's provide at least a somewhat nicer hint to the users so that they
are able to figure out what might have gone wrong.
Buglink: https://issues.redhat.com/browse/RHEL-18212
Message-ID: <20240110142916.850605-1-thuth@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
hw/s390x/ipl.c | 5 ++---
hw/s390x/ipl.h | 2 +-
hw/s390x/s390-virtio-ccw.c | 5 ++++-
target/s390x/kvm/pv.c | 25 ++++++++++++++++++++-----
target/s390x/kvm/pv.h | 5 +++--
5 files changed, 30 insertions(+), 12 deletions(-)
diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c
index 515dcf51b5..b23a6a0ef3 100644
--- a/hw/s390x/ipl.c
+++ b/hw/s390x/ipl.c
@@ -703,7 +703,7 @@ static void s390_ipl_prepare_qipl(S390CPU *cpu)
cpu_physical_memory_unmap(addr, len, 1, len);
}
-int s390_ipl_prepare_pv_header(void)
+int s390_ipl_prepare_pv_header(Error **errp)
{
IplParameterBlock *ipib = s390_ipl_get_iplb_pv();
IPLBlockPV *ipib_pv = &ipib->pv;
@@ -712,8 +712,7 @@ int s390_ipl_prepare_pv_header(void)
cpu_physical_memory_read(ipib_pv->pv_header_addr, hdr,
ipib_pv->pv_header_len);
- rc = s390_pv_set_sec_parms((uintptr_t)hdr,
- ipib_pv->pv_header_len);
+ rc = s390_pv_set_sec_parms((uintptr_t)hdr, ipib_pv->pv_header_len, errp);
g_free(hdr);
return rc;
}
diff --git a/hw/s390x/ipl.h b/hw/s390x/ipl.h
index 7fc86e7905..57cd125769 100644
--- a/hw/s390x/ipl.h
+++ b/hw/s390x/ipl.h
@@ -107,7 +107,7 @@ typedef union IplParameterBlock IplParameterBlock;
int s390_ipl_set_loadparm(uint8_t *loadparm);
void s390_ipl_update_diag308(IplParameterBlock *iplb);
-int s390_ipl_prepare_pv_header(void);
+int s390_ipl_prepare_pv_header(Error **errp);
int s390_ipl_pv_unpack(void);
void s390_ipl_prepare_cpu(S390CPU *cpu);
IplParameterBlock *s390_ipl_get_iplb(void);
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index 984891b82a..e26ce26f5a 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -391,7 +391,7 @@ static int s390_machine_protect(S390CcwMachineState *ms)
}
/* Set SE header and unpack */
- rc = s390_ipl_prepare_pv_header();
+ rc = s390_ipl_prepare_pv_header(&local_err);
if (rc) {
goto out_err;
}
@@ -410,6 +410,9 @@ static int s390_machine_protect(S390CcwMachineState *ms)
return rc;
out_err:
+ if (local_err) {
+ error_report_err(local_err);
+ }
s390_machine_unprotect(ms);
return rc;
}
diff --git a/target/s390x/kvm/pv.c b/target/s390x/kvm/pv.c
index 6a69be7e5c..7ca7faec73 100644
--- a/target/s390x/kvm/pv.c
+++ b/target/s390x/kvm/pv.c
@@ -29,7 +29,8 @@ static bool info_valid;
static struct kvm_s390_pv_info_vm info_vm;
static struct kvm_s390_pv_info_dump info_dump;
-static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, void *data)
+static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, void *data,
+ int *pvrc)
{
struct kvm_pv_cmd pv_cmd = {
.cmd = cmd,
@@ -46,6 +47,9 @@ static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, void *data)
"IOCTL rc: %d", cmd, cmdname, pv_cmd.rc, pv_cmd.rrc,
rc);
}
+ if (pvrc) {
+ *pvrc = pv_cmd.rc;
+ }
return rc;
}
@@ -53,12 +57,13 @@ static int __s390_pv_cmd(uint32_t cmd, const char *cmdname, void *data)
* This macro lets us pass the command as a string to the function so
* we can print it on an error.
*/
-#define s390_pv_cmd(cmd, data) __s390_pv_cmd(cmd, #cmd, data)
+#define s390_pv_cmd(cmd, data) __s390_pv_cmd(cmd, #cmd, data, NULL)
+#define s390_pv_cmd_pvrc(cmd, data, pvrc) __s390_pv_cmd(cmd, #cmd, data, pvrc)
#define s390_pv_cmd_exit(cmd, data) \
{ \
int rc; \
\
- rc = __s390_pv_cmd(cmd, #cmd, data);\
+ rc = __s390_pv_cmd(cmd, #cmd, data, NULL); \
if (rc) { \
exit(1); \
} \
@@ -142,14 +147,24 @@ bool s390_pv_vm_try_disable_async(S390CcwMachineState *ms)
return true;
}
-int s390_pv_set_sec_parms(uint64_t origin, uint64_t length)
+int s390_pv_set_sec_parms(uint64_t origin, uint64_t length, Error **errp)
{
+ int ret, pvrc;
struct kvm_s390_pv_sec_parm args = {
.origin = origin,
.length = length,
};
- return s390_pv_cmd(KVM_PV_SET_SEC_PARMS, &args);
+ ret = s390_pv_cmd_pvrc(KVM_PV_SET_SEC_PARMS, &args, &pvrc);
+ if (ret) {
+ error_setg(errp, "Failed to set secure execution parameters");
+ if (pvrc == 0x108) {
+ error_append_hint(errp, "Please check whether the image is "
+ "correctly encrypted for this host\n");
+ }
+ }
+
+ return ret;
}
/*
diff --git a/target/s390x/kvm/pv.h b/target/s390x/kvm/pv.h
index 7b935e2246..5877d28ff1 100644
--- a/target/s390x/kvm/pv.h
+++ b/target/s390x/kvm/pv.h
@@ -42,7 +42,7 @@ int s390_pv_query_info(void);
int s390_pv_vm_enable(void);
void s390_pv_vm_disable(void);
bool s390_pv_vm_try_disable_async(S390CcwMachineState *ms);
-int s390_pv_set_sec_parms(uint64_t origin, uint64_t length);
+int s390_pv_set_sec_parms(uint64_t origin, uint64_t length, Error **errp);
int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak);
void s390_pv_prep_reset(void);
int s390_pv_verify(void);
@@ -62,7 +62,8 @@ static inline int s390_pv_query_info(void) { return 0; }
static inline int s390_pv_vm_enable(void) { return 0; }
static inline void s390_pv_vm_disable(void) {}
static inline bool s390_pv_vm_try_disable_async(S390CcwMachineState *ms) { return false; }
-static inline int s390_pv_set_sec_parms(uint64_t origin, uint64_t length) { return 0; }
+static inline int s390_pv_set_sec_parms(uint64_t origin, uint64_t length,
+ Error **errp) { return 0; }
static inline int s390_pv_unpack(uint64_t addr, uint64_t size, uint64_t tweak) { return 0; }
static inline void s390_pv_prep_reset(void) {}
static inline int s390_pv_verify(void) { return 0; }
--
2.39.3

View File

@ -0,0 +1,125 @@
From 420bf75353286324822c3bbca3b52a7a56ed668c Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:20:00 -0500
Subject: [PATCH 083/101] tests: remove aio_context_acquire() tests
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [14/26] f6421037c1523bc957f3be0f4ad05571ae012dba (kmwolf/centos-qemu-kvm)
The aio_context_acquire() API is being removed. Drop the test case that
calls the API.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231205182011.1976568-4-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
tests/unit/test-aio.c | 67 +------------------------------------------
1 file changed, 1 insertion(+), 66 deletions(-)
diff --git a/tests/unit/test-aio.c b/tests/unit/test-aio.c
index 337b6e4ea7..e77d86be87 100644
--- a/tests/unit/test-aio.c
+++ b/tests/unit/test-aio.c
@@ -100,76 +100,12 @@ static void event_ready_cb(EventNotifier *e)
/* Tests using aio_*. */
-typedef struct {
- QemuMutex start_lock;
- EventNotifier notifier;
- bool thread_acquired;
-} AcquireTestData;
-
-static void *test_acquire_thread(void *opaque)
-{
- AcquireTestData *data = opaque;
-
- /* Wait for other thread to let us start */
- qemu_mutex_lock(&data->start_lock);
- qemu_mutex_unlock(&data->start_lock);
-
- /* event_notifier_set might be called either before or after
- * the main thread's call to poll(). The test case's outcome
- * should be the same in either case.
- */
- event_notifier_set(&data->notifier);
- aio_context_acquire(ctx);
- aio_context_release(ctx);
-
- data->thread_acquired = true; /* success, we got here */
-
- return NULL;
-}
-
static void set_event_notifier(AioContext *nctx, EventNotifier *notifier,
EventNotifierHandler *handler)
{
aio_set_event_notifier(nctx, notifier, handler, NULL, NULL);
}
-static void dummy_notifier_read(EventNotifier *n)
-{
- event_notifier_test_and_clear(n);
-}
-
-static void test_acquire(void)
-{
- QemuThread thread;
- AcquireTestData data;
-
- /* Dummy event notifier ensures aio_poll() will block */
- event_notifier_init(&data.notifier, false);
- set_event_notifier(ctx, &data.notifier, dummy_notifier_read);
- g_assert(!aio_poll(ctx, false)); /* consume aio_notify() */
-
- qemu_mutex_init(&data.start_lock);
- qemu_mutex_lock(&data.start_lock);
- data.thread_acquired = false;
-
- qemu_thread_create(&thread, "test_acquire_thread",
- test_acquire_thread,
- &data, QEMU_THREAD_JOINABLE);
-
- /* Block in aio_poll(), let other thread kick us and acquire context */
- aio_context_acquire(ctx);
- qemu_mutex_unlock(&data.start_lock); /* let the thread run */
- g_assert(aio_poll(ctx, true));
- g_assert(!data.thread_acquired);
- aio_context_release(ctx);
-
- qemu_thread_join(&thread);
- set_event_notifier(ctx, &data.notifier, NULL);
- event_notifier_cleanup(&data.notifier);
-
- g_assert(data.thread_acquired);
-}
-
static void test_bh_schedule(void)
{
BHTestData data = { .n = 0 };
@@ -879,7 +815,7 @@ static void test_worker_thread_co_enter(void)
qemu_thread_get_self(&this_thread);
co = qemu_coroutine_create(co_check_current_thread, &this_thread);
- qemu_thread_create(&worker_thread, "test_acquire_thread",
+ qemu_thread_create(&worker_thread, "test_aio_co_enter",
test_aio_co_enter,
co, QEMU_THREAD_JOINABLE);
@@ -899,7 +835,6 @@ int main(int argc, char **argv)
while (g_main_context_iteration(NULL, false));
g_test_init(&argc, &argv, NULL);
- g_test_add_func("/aio/acquire", test_acquire);
g_test_add_func("/aio/bh/schedule", test_bh_schedule);
g_test_add_func("/aio/bh/schedule10", test_bh_schedule10);
g_test_add_func("/aio/bh/cancel", test_bh_cancel);
--
2.39.3

View File

@ -0,0 +1,175 @@
From de167878ec4ca159cc6def5134c91c5fe9b5ab96 Mon Sep 17 00:00:00 2001
From: Yi Liu <yi.l.liu@intel.com>
Date: Tue, 21 Nov 2023 16:44:01 +0800
Subject: [PATCH 022/101] util/char_dev: Add open_cdev()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [21/67] 72bf9ec3ccc9959626235bd270ec84caa4cee435 (eauger1/centos-qemu-kvm)
/dev/vfio/devices/vfioX may not exist. In that case it is still possible
to open /dev/char/$major:$minor instead. Add helper function to abstract
the cdev open.
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit d6b5c4c1b516a8176b74ec35a0af8cf89b04b6c1)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
MAINTAINERS | 2 +
include/qemu/chardev_open.h | 16 ++++++++
util/chardev_open.c | 81 +++++++++++++++++++++++++++++++++++++
util/meson.build | 1 +
4 files changed, 100 insertions(+)
create mode 100644 include/qemu/chardev_open.h
create mode 100644 util/chardev_open.c
diff --git a/MAINTAINERS b/MAINTAINERS
index a5a446914a..ca70bb4e64 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2174,6 +2174,8 @@ M: Zhenzhong Duan <zhenzhong.duan@intel.com>
S: Supported
F: backends/iommufd.c
F: include/sysemu/iommufd.h
+F: include/qemu/chardev_open.h
+F: util/chardev_open.c
vhost
M: Michael S. Tsirkin <mst@redhat.com>
diff --git a/include/qemu/chardev_open.h b/include/qemu/chardev_open.h
new file mode 100644
index 0000000000..64e8fcfdcb
--- /dev/null
+++ b/include/qemu/chardev_open.h
@@ -0,0 +1,16 @@
+/*
+ * QEMU Chardev Helper
+ *
+ * Copyright (C) 2023 Intel Corporation.
+ *
+ * Authors: Yi Liu <yi.l.liu@intel.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_CHARDEV_OPEN_H
+#define QEMU_CHARDEV_OPEN_H
+
+int open_cdev(const char *devpath, dev_t cdev);
+#endif
diff --git a/util/chardev_open.c b/util/chardev_open.c
new file mode 100644
index 0000000000..f776429788
--- /dev/null
+++ b/util/chardev_open.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019, Mellanox Technologies. All rights reserved.
+ * Copyright (C) 2023 Intel Corporation.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors: Yi Liu <yi.l.liu@intel.com>
+ *
+ * Copied from
+ * https://github.com/linux-rdma/rdma-core/blob/master/util/open_cdev.c
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/chardev_open.h"
+
+static int open_cdev_internal(const char *path, dev_t cdev)
+{
+ struct stat st;
+ int fd;
+
+ fd = qemu_open_old(path, O_RDWR);
+ if (fd == -1) {
+ return -1;
+ }
+ if (fstat(fd, &st) || !S_ISCHR(st.st_mode) ||
+ (cdev != 0 && st.st_rdev != cdev)) {
+ close(fd);
+ return -1;
+ }
+ return fd;
+}
+
+static int open_cdev_robust(dev_t cdev)
+{
+ g_autofree char *devpath = NULL;
+
+ /*
+ * This assumes that udev is being used and is creating the /dev/char/
+ * symlinks.
+ */
+ devpath = g_strdup_printf("/dev/char/%u:%u", major(cdev), minor(cdev));
+ return open_cdev_internal(devpath, cdev);
+}
+
+int open_cdev(const char *devpath, dev_t cdev)
+{
+ int fd;
+
+ fd = open_cdev_internal(devpath, cdev);
+ if (fd == -1 && cdev != 0) {
+ return open_cdev_robust(cdev);
+ }
+ return fd;
+}
diff --git a/util/meson.build b/util/meson.build
index c2322ef6e7..174c133368 100644
--- a/util/meson.build
+++ b/util/meson.build
@@ -108,6 +108,7 @@ if have_block
util_ss.add(files('filemonitor-stub.c'))
endif
util_ss.add(when: 'CONFIG_LINUX', if_true: files('vfio-helpers.c'))
+ util_ss.add(when: 'CONFIG_LINUX', if_true: files('chardev_open.c'))
endif
if cpu == 'aarch64'
--
2.39.3

View File

@ -0,0 +1,154 @@
From f554328f6f4702743af71befcb83c25c36e4fa4d Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:25 +0800
Subject: [PATCH 046/101] vfio: Introduce a helper function to initialize
VFIODevice
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [45/67] 73225f394540bf5aeb70c0bdb89771f19a6d286d (eauger1/centos-qemu-kvm)
Introduce a helper function to replace the common code to initialize
VFIODevice in pci, platform, ap and ccw VFIO device.
No functional change intended.
Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 6106a329141af7d47bdc3346ce9820d4714e0e5d)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/ap.c | 8 ++------
hw/vfio/ccw.c | 8 ++------
hw/vfio/helpers.c | 11 +++++++++++
hw/vfio/pci.c | 6 ++----
hw/vfio/platform.c | 6 ++----
include/hw/vfio/vfio-common.h | 2 ++
6 files changed, 21 insertions(+), 20 deletions(-)
diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index 95fe7cd98b..e157aa1ff7 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -226,18 +226,14 @@ static void vfio_ap_instance_init(Object *obj)
VFIOAPDevice *vapdev = VFIO_AP_DEVICE(obj);
VFIODevice *vbasedev = &vapdev->vdev;
- vbasedev->type = VFIO_DEVICE_TYPE_AP;
- vbasedev->ops = &vfio_ap_ops;
- vbasedev->dev = DEVICE(vapdev);
- vbasedev->fd = -1;
-
/*
* vfio-ap devices operate in a way compatible with discarding of
* memory in RAM blocks, as no pages are pinned in the host.
* This needs to be set before vfio_get_device() for vfio common to
* handle ram_block_discard_disable().
*/
- vbasedev->ram_block_discard_allowed = true;
+ vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops,
+ DEVICE(vapdev), true);
}
#ifdef CONFIG_IOMMUFD
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 6305a4c1b8..90e4a53437 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -683,11 +683,6 @@ static void vfio_ccw_instance_init(Object *obj)
VFIOCCWDevice *vcdev = VFIO_CCW(obj);
VFIODevice *vbasedev = &vcdev->vdev;
- vbasedev->type = VFIO_DEVICE_TYPE_CCW;
- vbasedev->ops = &vfio_ccw_ops;
- vbasedev->dev = DEVICE(vcdev);
- vbasedev->fd = -1;
-
/*
* All vfio-ccw devices are believed to operate in a way compatible with
* discarding of memory in RAM blocks, ie. pages pinned in the host are
@@ -696,7 +691,8 @@ static void vfio_ccw_instance_init(Object *obj)
* needs to be set before vfio_get_device() for vfio common to handle
* ram_block_discard_disable().
*/
- vbasedev->ram_block_discard_allowed = true;
+ vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_CCW, &vfio_ccw_ops,
+ DEVICE(vcdev), true);
}
#ifdef CONFIG_IOMMUFD
diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c
index 3592c3d54e..6789870802 100644
--- a/hw/vfio/helpers.c
+++ b/hw/vfio/helpers.c
@@ -652,3 +652,14 @@ void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp)
}
vbasedev->fd = fd;
}
+
+void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
+ DeviceState *dev, bool ram_discard)
+{
+ vbasedev->type = type;
+ vbasedev->ops = ops;
+ vbasedev->dev = dev;
+ vbasedev->fd = -1;
+
+ vbasedev->ram_block_discard_allowed = ram_discard;
+}
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 3f5900cc46..83c3238608 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3353,10 +3353,8 @@ static void vfio_instance_init(Object *obj)
vdev->host.slot = ~0U;
vdev->host.function = ~0U;
- vbasedev->type = VFIO_DEVICE_TYPE_PCI;
- vbasedev->ops = &vfio_pci_ops;
- vbasedev->dev = DEVICE(vdev);
- vbasedev->fd = -1;
+ vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops,
+ DEVICE(vdev), false);
vdev->nv_gpudirect_clique = 0xFF;
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index 506eb8193f..a8d9b7da63 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -657,10 +657,8 @@ static void vfio_platform_instance_init(Object *obj)
VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(obj);
VFIODevice *vbasedev = &vdev->vbasedev;
- vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM;
- vbasedev->ops = &vfio_platform_ops;
- vbasedev->dev = DEVICE(vdev);
- vbasedev->fd = -1;
+ vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PLATFORM, &vfio_platform_ops,
+ DEVICE(vdev), false);
}
#ifdef CONFIG_IOMMUFD
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index efcba19f66..b8aa8a5495 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -257,4 +257,6 @@ int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova,
/* Returns 0 on success, or a negative errno. */
int vfio_device_get_name(VFIODevice *vbasedev, Error **errp);
void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp);
+void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
+ DeviceState *dev, bool ram_discard);
#endif /* HW_VFIO_VFIO_COMMON_H */
--
2.39.3

View File

@ -0,0 +1,129 @@
From 7f392385d1b865904eae4b6681e3e7a87eb3af3d Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Thu, 2 Nov 2023 15:12:27 +0800
Subject: [PATCH 002/101] vfio: Introduce base object for VFIOContainer and
targeted interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [1/67] e63af50c2cb94f286b2d91f58c2d19dd862e019d (eauger1/centos-qemu-kvm)
Introduce a dumb VFIOContainerBase object and its targeted interface.
This is willingly not a QOM object because we don't want it to be
visible from the user interface. The VFIOContainerBase will be
smoothly populated in subsequent patches as well as interfaces.
No functional change intended.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit f61dddd73232e3d82d560d1e1bca120446021f2f)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
include/hw/vfio/vfio-common.h | 8 ++---
include/hw/vfio/vfio-container-base.h | 50 +++++++++++++++++++++++++++
2 files changed, 52 insertions(+), 6 deletions(-)
create mode 100644 include/hw/vfio/vfio-container-base.h
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index a4a22accb9..586d153c12 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -30,6 +30,7 @@
#include <linux/vfio.h>
#endif
#include "sysemu/sysemu.h"
+#include "hw/vfio/vfio-container-base.h"
#define VFIO_MSG_PREFIX "vfio %s: "
@@ -81,6 +82,7 @@ typedef struct VFIOAddressSpace {
struct VFIOGroup;
typedef struct VFIOContainer {
+ VFIOContainerBase bcontainer;
VFIOAddressSpace *space;
int fd; /* /dev/vfio/vfio, empowered by the attached groups */
MemoryListener listener;
@@ -201,12 +203,6 @@ typedef struct VFIODisplay {
} dmabuf;
} VFIODisplay;
-typedef struct {
- unsigned long *bitmap;
- hwaddr size;
- hwaddr pages;
-} VFIOBitmap;
-
VFIOAddressSpace *vfio_get_address_space(AddressSpace *as);
void vfio_put_address_space(VFIOAddressSpace *space);
bool vfio_devices_all_running_and_saving(VFIOContainer *container);
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
new file mode 100644
index 0000000000..1d6daaea5d
--- /dev/null
+++ b/include/hw/vfio/vfio-container-base.h
@@ -0,0 +1,50 @@
+/*
+ * VFIO BASE CONTAINER
+ *
+ * Copyright (C) 2023 Intel Corporation.
+ * Copyright Red Hat, Inc. 2023
+ *
+ * Authors: Yi Liu <yi.l.liu@intel.com>
+ * Eric Auger <eric.auger@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_VFIO_VFIO_CONTAINER_BASE_H
+#define HW_VFIO_VFIO_CONTAINER_BASE_H
+
+#include "exec/memory.h"
+
+typedef struct VFIODevice VFIODevice;
+typedef struct VFIOIOMMUOps VFIOIOMMUOps;
+
+typedef struct {
+ unsigned long *bitmap;
+ hwaddr size;
+ hwaddr pages;
+} VFIOBitmap;
+
+/*
+ * This is the base object for vfio container backends
+ */
+typedef struct VFIOContainerBase {
+ const VFIOIOMMUOps *ops;
+} VFIOContainerBase;
+
+struct VFIOIOMMUOps {
+ /* basic feature */
+ int (*dma_map)(VFIOContainerBase *bcontainer,
+ hwaddr iova, ram_addr_t size,
+ void *vaddr, bool readonly);
+ int (*dma_unmap)(VFIOContainerBase *bcontainer,
+ hwaddr iova, ram_addr_t size,
+ IOMMUTLBEntry *iotlb);
+ int (*attach_device)(const char *name, VFIODevice *vbasedev,
+ AddressSpace *as, Error **errp);
+ void (*detach_device)(VFIODevice *vbasedev);
+ /* migration feature */
+ int (*set_dirty_page_tracking)(VFIOContainerBase *bcontainer, bool start);
+ int (*query_dirty_bitmap)(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap,
+ hwaddr iova, hwaddr size);
+};
+#endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */
--
2.39.3

View File

@ -0,0 +1,276 @@
From 84b15fad1af781d06d0206d362de0801d7a18d0b Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:17 +0800
Subject: [PATCH 038/101] vfio: Make VFIOContainerBase poiner parameter const
in VFIOIOMMUOps callbacks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [37/67] 95eb9edc7fcfefbd4b075f6f04941ed4a19ff87d (eauger1/centos-qemu-kvm)
Some of the callbacks in VFIOIOMMUOps pass VFIOContainerBase poiner,
those callbacks only need read access to the sub object of VFIOContainerBase.
So make VFIOContainerBase, VFIOContainer and VFIOIOMMUFDContainer as const
in these callbacks.
Local functions called by those callbacks also need same changes to avoid
build error.
Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 4517c33c31d392f08fa96a9db911da1e3507be94)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 9 +++----
hw/vfio/container-base.c | 2 +-
hw/vfio/container.c | 34 ++++++++++++++-------------
hw/vfio/iommufd.c | 8 +++----
include/hw/vfio/vfio-common.h | 12 ++++++----
include/hw/vfio/vfio-container-base.h | 12 ++++++----
6 files changed, 42 insertions(+), 35 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 6569732b7a..08a3e57672 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -204,7 +204,7 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer)
return true;
}
-bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer)
+bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer)
{
VFIODevice *vbasedev;
@@ -221,7 +221,8 @@ bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer)
* Check if all VFIO devices are running and migration is active, which is
* essentially equivalent to the migration being in pre-copy phase.
*/
-bool vfio_devices_all_running_and_mig_active(VFIOContainerBase *bcontainer)
+bool
+vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer)
{
VFIODevice *vbasedev;
@@ -1139,7 +1140,7 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
return 0;
}
-int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer,
+int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
VFIOBitmap *vbmap, hwaddr iova,
hwaddr size)
{
@@ -1162,7 +1163,7 @@ int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer,
return 0;
}
-int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova,
+int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova,
uint64_t size, ram_addr_t ram_addr)
{
bool all_device_dirty_tracking =
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index eee2dcfe76..1ffd25bbfa 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -63,7 +63,7 @@ int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
return bcontainer->ops->set_dirty_page_tracking(bcontainer, start);
}
-int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer,
+int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
VFIOBitmap *vbmap,
hwaddr iova, hwaddr size)
{
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 1dbf9b9a17..b22feb8ded 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -61,11 +61,11 @@ static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
}
}
-static int vfio_dma_unmap_bitmap(VFIOContainer *container,
+static int vfio_dma_unmap_bitmap(const VFIOContainer *container,
hwaddr iova, ram_addr_t size,
IOMMUTLBEntry *iotlb)
{
- VFIOContainerBase *bcontainer = &container->bcontainer;
+ const VFIOContainerBase *bcontainer = &container->bcontainer;
struct vfio_iommu_type1_dma_unmap *unmap;
struct vfio_bitmap *bitmap;
VFIOBitmap vbmap;
@@ -117,11 +117,12 @@ unmap_exit:
/*
* DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
*/
-static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova,
- ram_addr_t size, IOMMUTLBEntry *iotlb)
+static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
+ hwaddr iova, ram_addr_t size,
+ IOMMUTLBEntry *iotlb)
{
- VFIOContainer *container = container_of(bcontainer, VFIOContainer,
- bcontainer);
+ const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
+ bcontainer);
struct vfio_iommu_type1_dma_unmap unmap = {
.argsz = sizeof(unmap),
.flags = 0,
@@ -174,11 +175,11 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova,
return 0;
}
-static int vfio_legacy_dma_map(VFIOContainerBase *bcontainer, hwaddr iova,
+static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
ram_addr_t size, void *vaddr, bool readonly)
{
- VFIOContainer *container = container_of(bcontainer, VFIOContainer,
- bcontainer);
+ const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
+ bcontainer);
struct vfio_iommu_type1_dma_map map = {
.argsz = sizeof(map),
.flags = VFIO_DMA_MAP_FLAG_READ,
@@ -207,11 +208,12 @@ static int vfio_legacy_dma_map(VFIOContainerBase *bcontainer, hwaddr iova,
return -errno;
}
-static int vfio_legacy_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
- bool start)
+static int
+vfio_legacy_set_dirty_page_tracking(const VFIOContainerBase *bcontainer,
+ bool start)
{
- VFIOContainer *container = container_of(bcontainer, VFIOContainer,
- bcontainer);
+ const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
+ bcontainer);
int ret;
struct vfio_iommu_type1_dirty_bitmap dirty = {
.argsz = sizeof(dirty),
@@ -233,12 +235,12 @@ static int vfio_legacy_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
return ret;
}
-static int vfio_legacy_query_dirty_bitmap(VFIOContainerBase *bcontainer,
+static int vfio_legacy_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
VFIOBitmap *vbmap,
hwaddr iova, hwaddr size)
{
- VFIOContainer *container = container_of(bcontainer, VFIOContainer,
- bcontainer);
+ const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
+ bcontainer);
struct vfio_iommu_type1_dirty_bitmap *dbitmap;
struct vfio_iommu_type1_dirty_bitmap_get *range;
int ret;
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 5accd26484..87a561c545 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -26,10 +26,10 @@
#include "qemu/chardev_open.h"
#include "pci.h"
-static int iommufd_cdev_map(VFIOContainerBase *bcontainer, hwaddr iova,
+static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova,
ram_addr_t size, void *vaddr, bool readonly)
{
- VFIOIOMMUFDContainer *container =
+ const VFIOIOMMUFDContainer *container =
container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);
return iommufd_backend_map_dma(container->be,
@@ -37,11 +37,11 @@ static int iommufd_cdev_map(VFIOContainerBase *bcontainer, hwaddr iova,
iova, size, vaddr, readonly);
}
-static int iommufd_cdev_unmap(VFIOContainerBase *bcontainer,
+static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
IOMMUTLBEntry *iotlb)
{
- VFIOIOMMUFDContainer *container =
+ const VFIOIOMMUFDContainer *container =
container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);
/* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 697bf24a35..efcba19f66 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -244,13 +244,15 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp);
void vfio_migration_exit(VFIODevice *vbasedev);
int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size);
-bool vfio_devices_all_running_and_mig_active(VFIOContainerBase *bcontainer);
-bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer);
-int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer,
+bool
+vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer);
+bool
+vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer);
+int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
VFIOBitmap *vbmap, hwaddr iova,
hwaddr size);
-int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova,
- uint64_t size, ram_addr_t ram_addr);
+int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova,
+ uint64_t size, ram_addr_t ram_addr);
/* Returns 0 on success, or a negative errno. */
int vfio_device_get_name(VFIODevice *vbasedev, Error **errp);
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 45bb19c767..2ae297ccda 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -82,7 +82,7 @@ void vfio_container_del_section_window(VFIOContainerBase *bcontainer,
MemoryRegionSection *section);
int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
bool start);
-int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer,
+int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
VFIOBitmap *vbmap,
hwaddr iova, hwaddr size);
@@ -93,18 +93,20 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer);
struct VFIOIOMMUOps {
/* basic feature */
- int (*dma_map)(VFIOContainerBase *bcontainer,
+ int (*dma_map)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
void *vaddr, bool readonly);
- int (*dma_unmap)(VFIOContainerBase *bcontainer,
+ int (*dma_unmap)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
IOMMUTLBEntry *iotlb);
int (*attach_device)(const char *name, VFIODevice *vbasedev,
AddressSpace *as, Error **errp);
void (*detach_device)(VFIODevice *vbasedev);
/* migration feature */
- int (*set_dirty_page_tracking)(VFIOContainerBase *bcontainer, bool start);
- int (*query_dirty_bitmap)(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap,
+ int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer,
+ bool start);
+ int (*query_dirty_bitmap)(const VFIOContainerBase *bcontainer,
+ VFIOBitmap *vbmap,
hwaddr iova, hwaddr size);
/* PCI specific */
int (*pci_hot_reset)(VFIODevice *vbasedev, bool single);
--
2.39.3

View File

@ -0,0 +1,75 @@
From 57bdfc821d6f4b4f9c6b1ff05bf0114e5cabc77e Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:13 +0800
Subject: [PATCH 034/101] vfio/ap: Allow the selection of a given iommu backend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [33/67] a12bb86e5b627ccf246fb9ce60820595589ff8e5 (eauger1/centos-qemu-kvm)
Now we support two types of iommu backends, let's add the capability
to select one of them. This depends on whether an iommufd object has
been linked with the vfio-ap device:
if the user wants to use the legacy backend, it shall not
link the vfio-ap device with any iommufd object:
-device vfio-ap,sysfsdev=/sys/bus/mdev/devices/XXX
This is called the legacy mode/backend.
If the user wants to use the iommufd backend (/dev/iommu) it
shall pass an iommufd object id in the vfio-ap device options:
-object iommufd,id=iommufd0
-device vfio-ap,sysfsdev=/sys/bus/mdev/devices/XXX,iommufd=iommufd0
Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 336f308958d598f3db351bb7d94cc57b4b2d448d)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/ap.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index bbf69ff55a..80629609ae 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -11,10 +11,12 @@
*/
#include "qemu/osdep.h"
+#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
#include <linux/vfio.h>
#include <sys/ioctl.h>
#include "qapi/error.h"
#include "hw/vfio/vfio-common.h"
+#include "sysemu/iommufd.h"
#include "hw/s390x/ap-device.h"
#include "qemu/error-report.h"
#include "qemu/event_notifier.h"
@@ -204,6 +206,10 @@ static void vfio_ap_unrealize(DeviceState *dev)
static Property vfio_ap_properties[] = {
DEFINE_PROP_STRING("sysfsdev", VFIOAPDevice, vdev.sysfsdev),
+#ifdef CONFIG_IOMMUFD
+ DEFINE_PROP_LINK("iommufd", VFIOAPDevice, vdev.iommufd,
+ TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
+#endif
DEFINE_PROP_END_OF_LIST(),
};
--
2.39.3

View File

@ -0,0 +1,87 @@
From db09b7c60c01ee75d602261ee959a96fa0d89d68 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:14 +0800
Subject: [PATCH 035/101] vfio/ap: Make vfio cdev pre-openable by passing a
file handle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [34/67] aaafa6088a9b0302d53aa539f67792d02ea0f663 (eauger1/centos-qemu-kvm)
This gives management tools like libvirt a chance to open the vfio
cdev with privilege and pass FD to qemu. This way qemu never needs
to have privilege to open a VFIO or iommu cdev node.
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 5e7ba401b71d18544a3e44b2a58b9e63fd5148d5)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/ap.c | 23 ++++++++++++++++++++++-
1 file changed, 22 insertions(+), 1 deletion(-)
diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index 80629609ae..f180e4a32a 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -160,7 +160,10 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp)
VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev);
VFIODevice *vbasedev = &vapdev->vdev;
- vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
+ if (vfio_device_get_name(vbasedev, errp) < 0) {
+ return;
+ }
+
vbasedev->ops = &vfio_ap_ops;
vbasedev->type = VFIO_DEVICE_TYPE_AP;
vbasedev->dev = dev;
@@ -230,11 +233,28 @@ static const VMStateDescription vfio_ap_vmstate = {
.unmigratable = 1,
};
+static void vfio_ap_instance_init(Object *obj)
+{
+ VFIOAPDevice *vapdev = VFIO_AP_DEVICE(obj);
+
+ vapdev->vdev.fd = -1;
+}
+
+#ifdef CONFIG_IOMMUFD
+static void vfio_ap_set_fd(Object *obj, const char *str, Error **errp)
+{
+ vfio_device_set_fd(&VFIO_AP_DEVICE(obj)->vdev, str, errp);
+}
+#endif
+
static void vfio_ap_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
device_class_set_props(dc, vfio_ap_properties);
+#ifdef CONFIG_IOMMUFD
+ object_class_property_add_str(klass, "fd", NULL, vfio_ap_set_fd);
+#endif
dc->vmsd = &vfio_ap_vmstate;
dc->desc = "VFIO-based AP device assignment";
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
@@ -249,6 +269,7 @@ static const TypeInfo vfio_ap_info = {
.name = TYPE_VFIO_AP_DEVICE,
.parent = TYPE_AP_DEVICE,
.instance_size = sizeof(VFIOAPDevice),
+ .instance_init = vfio_ap_instance_init,
.class_init = vfio_ap_class_init,
};
--
2.39.3

View File

@ -0,0 +1,81 @@
From b8630ecb698e31311089ba4e224d5e2c08c8e665 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:23 +0800
Subject: [PATCH 044/101] vfio/ap: Move VFIODevice initializations in
vfio_ap_instance_init
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [43/67] 95a527f649b28c5c78903e99735107667e8468b1 (eauger1/centos-qemu-kvm)
Some of the VFIODevice initializations is in vfio_ap_realize,
move all of them in vfio_ap_instance_init.
No functional change intended.
Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit cbbcc2f1706aa1a08637142744d2f5f6515ac93f)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/ap.c | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index f180e4a32a..95fe7cd98b 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -164,18 +164,6 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp)
return;
}
- vbasedev->ops = &vfio_ap_ops;
- vbasedev->type = VFIO_DEVICE_TYPE_AP;
- vbasedev->dev = dev;
-
- /*
- * vfio-ap devices operate in a way compatible with discarding of
- * memory in RAM blocks, as no pages are pinned in the host.
- * This needs to be set before vfio_get_device() for vfio common to
- * handle ram_block_discard_disable().
- */
- vapdev->vdev.ram_block_discard_allowed = true;
-
ret = vfio_attach_device(vbasedev->name, vbasedev,
&address_space_memory, errp);
if (ret) {
@@ -236,8 +224,20 @@ static const VMStateDescription vfio_ap_vmstate = {
static void vfio_ap_instance_init(Object *obj)
{
VFIOAPDevice *vapdev = VFIO_AP_DEVICE(obj);
+ VFIODevice *vbasedev = &vapdev->vdev;
- vapdev->vdev.fd = -1;
+ vbasedev->type = VFIO_DEVICE_TYPE_AP;
+ vbasedev->ops = &vfio_ap_ops;
+ vbasedev->dev = DEVICE(vapdev);
+ vbasedev->fd = -1;
+
+ /*
+ * vfio-ap devices operate in a way compatible with discarding of
+ * memory in RAM blocks, as no pages are pinned in the host.
+ * This needs to be set before vfio_get_device() for vfio common to
+ * handle ram_block_discard_disable().
+ */
+ vbasedev->ram_block_discard_allowed = true;
}
#ifdef CONFIG_IOMMUFD
--
2.39.3

View File

@ -0,0 +1,79 @@
From 732115c80eb0dd672925a0737e09643d8a889abd Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:15 +0800
Subject: [PATCH 036/101] vfio/ccw: Allow the selection of a given iommu
backend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [35/67] 1701de023a9f3b3f0420689bf851e11aee88800d (eauger1/centos-qemu-kvm)
Now we support two types of iommu backends, let's add the capability
to select one of them. This depends on whether an iommufd object has
been linked with the vfio-ccw device:
If the user wants to use the legacy backend, it shall not
link the vfio-ccw device with any iommufd object:
-device vfio-ccw,sysfsdev=/sys/bus/mdev/devices/XXX
This is called the legacy mode/backend.
If the user wants to use the iommufd backend (/dev/iommu) it
shall pass an iommufd object id in the vfio-ccw device options:
-object iommufd,id=iommufd0
-device vfio-ccw,sysfsdev=/sys/bus/mdev/devices/XXX,iommufd=iommufd0
Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit e70f971a6c1230138843d7ab82267e4a5aaf6bda)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/ccw.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index d857bb8d0f..d2d58bb677 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -15,12 +15,14 @@
*/
#include "qemu/osdep.h"
+#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
#include <linux/vfio.h>
#include <linux/vfio_ccw.h>
#include <sys/ioctl.h>
#include "qapi/error.h"
#include "hw/vfio/vfio-common.h"
+#include "sysemu/iommufd.h"
#include "hw/s390x/s390-ccw.h"
#include "hw/s390x/vfio-ccw.h"
#include "hw/qdev-properties.h"
@@ -677,6 +679,10 @@ static void vfio_ccw_unrealize(DeviceState *dev)
static Property vfio_ccw_properties[] = {
DEFINE_PROP_STRING("sysfsdev", VFIOCCWDevice, vdev.sysfsdev),
DEFINE_PROP_BOOL("force-orb-pfch", VFIOCCWDevice, force_orb_pfch, false),
+#ifdef CONFIG_IOMMUFD
+ DEFINE_PROP_LINK("iommufd", VFIOCCWDevice, vdev.iommufd,
+ TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
+#endif
DEFINE_PROP_END_OF_LIST(),
};
--
2.39.3

View File

@ -0,0 +1,93 @@
From 0ff08afdec19f4decaf750fa7d158e0ea498ff28 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:16 +0800
Subject: [PATCH 037/101] vfio/ccw: Make vfio cdev pre-openable by passing a
file handle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [36/67] cc0d8f51cffa5d5a7aebc2334b908b9877179ae7 (eauger1/centos-qemu-kvm)
This gives management tools like libvirt a chance to open the vfio
cdev with privilege and pass FD to qemu. This way qemu never needs
to have privilege to open a VFIO or iommu cdev node.
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 909a6254edaa8d0b0e3f1c0a623862e73d1842e9)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/ccw.c | 25 ++++++++++++++++++++++---
1 file changed, 22 insertions(+), 3 deletions(-)
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index d2d58bb677..2afdf17dbe 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -590,11 +590,12 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
}
}
+ if (vfio_device_get_name(vbasedev, errp) < 0) {
+ return;
+ }
+
vbasedev->ops = &vfio_ccw_ops;
vbasedev->type = VFIO_DEVICE_TYPE_CCW;
- vbasedev->name = g_strdup_printf("%x.%x.%04x", vcdev->cdev.hostid.cssid,
- vcdev->cdev.hostid.ssid,
- vcdev->cdev.hostid.devid);
vbasedev->dev = dev;
/*
@@ -691,12 +692,29 @@ static const VMStateDescription vfio_ccw_vmstate = {
.unmigratable = 1,
};
+static void vfio_ccw_instance_init(Object *obj)
+{
+ VFIOCCWDevice *vcdev = VFIO_CCW(obj);
+
+ vcdev->vdev.fd = -1;
+}
+
+#ifdef CONFIG_IOMMUFD
+static void vfio_ccw_set_fd(Object *obj, const char *str, Error **errp)
+{
+ vfio_device_set_fd(&VFIO_CCW(obj)->vdev, str, errp);
+}
+#endif
+
static void vfio_ccw_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
S390CCWDeviceClass *cdc = S390_CCW_DEVICE_CLASS(klass);
device_class_set_props(dc, vfio_ccw_properties);
+#ifdef CONFIG_IOMMUFD
+ object_class_property_add_str(klass, "fd", NULL, vfio_ccw_set_fd);
+#endif
dc->vmsd = &vfio_ccw_vmstate;
dc->desc = "VFIO-based subchannel assignment";
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
@@ -714,6 +732,7 @@ static const TypeInfo vfio_ccw_info = {
.name = TYPE_VFIO_CCW,
.parent = TYPE_S390_CCW,
.instance_size = sizeof(VFIOCCWDevice),
+ .instance_init = vfio_ccw_instance_init,
.class_init = vfio_ccw_class_init,
};
--
2.39.3

View File

@ -0,0 +1,85 @@
From 2ef1c050722115247962e3cd4d8fcf73727e597e Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:24 +0800
Subject: [PATCH 045/101] vfio/ccw: Move VFIODevice initializations in
vfio_ccw_instance_init
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [44/67] 3345ed58f491aba8fd51bcc172af267ae53e6c8c (eauger1/centos-qemu-kvm)
Some of the VFIODevice initializations is in vfio_ccw_realize,
move all of them in vfio_ccw_instance_init.
No functional change intended.
Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit c12b55ad6f9d3b4792b590e9211bd7319e4a2d70)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/ccw.c | 30 +++++++++++++++---------------
1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 2afdf17dbe..6305a4c1b8 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -594,20 +594,6 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
return;
}
- vbasedev->ops = &vfio_ccw_ops;
- vbasedev->type = VFIO_DEVICE_TYPE_CCW;
- vbasedev->dev = dev;
-
- /*
- * All vfio-ccw devices are believed to operate in a way compatible with
- * discarding of memory in RAM blocks, ie. pages pinned in the host are
- * in the current working set of the guest driver and therefore never
- * overlap e.g., with pages available to the guest balloon driver. This
- * needs to be set before vfio_get_device() for vfio common to handle
- * ram_block_discard_disable().
- */
- vbasedev->ram_block_discard_allowed = true;
-
ret = vfio_attach_device(cdev->mdevid, vbasedev,
&address_space_memory, errp);
if (ret) {
@@ -695,8 +681,22 @@ static const VMStateDescription vfio_ccw_vmstate = {
static void vfio_ccw_instance_init(Object *obj)
{
VFIOCCWDevice *vcdev = VFIO_CCW(obj);
+ VFIODevice *vbasedev = &vcdev->vdev;
+
+ vbasedev->type = VFIO_DEVICE_TYPE_CCW;
+ vbasedev->ops = &vfio_ccw_ops;
+ vbasedev->dev = DEVICE(vcdev);
+ vbasedev->fd = -1;
- vcdev->vdev.fd = -1;
+ /*
+ * All vfio-ccw devices are believed to operate in a way compatible with
+ * discarding of memory in RAM blocks, ie. pages pinned in the host are
+ * in the current working set of the guest driver and therefore never
+ * overlap e.g., with pages available to the guest balloon driver. This
+ * needs to be set before vfio_get_device() for vfio common to handle
+ * ram_block_discard_disable().
+ */
+ vbasedev->ram_block_discard_allowed = true;
}
#ifdef CONFIG_IOMMUFD
--
2.39.3

View File

@ -0,0 +1,98 @@
From 7de36998dd6177380e46b8c5f3a91c3fad75483c Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Thu, 2 Nov 2023 15:12:30 +0800
Subject: [PATCH 005/101] vfio/common: Introduce vfio_container_init/destroy
helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [4/67] 8287f687ef19cd84afede1e8f3b16ac3caf29a1d (eauger1/centos-qemu-kvm)
This adds two helper functions vfio_container_init/destroy which will be
used by both legacy and iommufd containers to do base container specific
initialization and release.
No functional change intended.
Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit ed2f7f80170251e7cdd2965a13ee97527d1fbec8)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/container-base.c | 9 +++++++++
hw/vfio/container.c | 4 +++-
include/hw/vfio/vfio-container-base.h | 4 ++++
3 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 55d3a35fa4..e929435751 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -30,3 +30,12 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
g_assert(bcontainer->ops->dma_unmap);
return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb);
}
+
+void vfio_container_init(VFIOContainerBase *bcontainer, const VFIOIOMMUOps *ops)
+{
+ bcontainer->ops = ops;
+}
+
+void vfio_container_destroy(VFIOContainerBase *bcontainer)
+{
+}
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index c04df26323..32a0251dd1 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -559,7 +559,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
QLIST_INIT(&container->giommu_list);
QLIST_INIT(&container->vrdl_list);
bcontainer = &container->bcontainer;
- bcontainer->ops = &vfio_legacy_ops;
+ vfio_container_init(bcontainer, &vfio_legacy_ops);
ret = vfio_init_container(container, group->fd, errp);
if (ret) {
@@ -661,6 +661,7 @@ put_space_exit:
static void vfio_disconnect_container(VFIOGroup *group)
{
VFIOContainer *container = group->container;
+ VFIOContainerBase *bcontainer = &container->bcontainer;
QLIST_REMOVE(group, container_next);
group->container = NULL;
@@ -695,6 +696,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
QLIST_REMOVE(giommu, giommu_next);
g_free(giommu);
}
+ vfio_container_destroy(bcontainer);
trace_vfio_disconnect_container(container->fd);
close(container->fd);
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 56b033f59f..577f52ccbc 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -38,6 +38,10 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
IOMMUTLBEntry *iotlb);
+void vfio_container_init(VFIOContainerBase *bcontainer,
+ const VFIOIOMMUOps *ops);
+void vfio_container_destroy(VFIOContainerBase *bcontainer);
+
struct VFIOIOMMUOps {
/* basic feature */
int (*dma_map)(VFIOContainerBase *bcontainer,
--
2.39.3

View File

@ -0,0 +1,221 @@
From 36f4005c3dbb4c8b63a975494c75281de51c25f9 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 2 Nov 2023 15:12:31 +0800
Subject: [PATCH 006/101] vfio/common: Move giommu_list in base container
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [5/67] ba5898e96c16c7f6e8108ae461b454d3c8c35404 (eauger1/centos-qemu-kvm)
Move the giommu_list field in the base container and store
the base container in the VFIOGuestIOMMU.
No functional change intended.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit dddf83ab99eb832c449249397a1c302c6ed746bf)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 17 +++++++++++------
hw/vfio/container-base.c | 9 +++++++++
hw/vfio/container.c | 8 --------
include/hw/vfio/vfio-common.h | 9 ---------
include/hw/vfio/vfio-container-base.h | 9 +++++++++
5 files changed, 29 insertions(+), 23 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index e610771888..43580bcc43 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -292,7 +292,7 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
- VFIOContainerBase *bcontainer = &giommu->container->bcontainer;
+ VFIOContainerBase *bcontainer = giommu->bcontainer;
hwaddr iova = iotlb->iova + giommu->iommu_offset;
void *vaddr;
int ret;
@@ -569,6 +569,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
MemoryRegionSection *section)
{
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+ VFIOContainerBase *bcontainer = &container->bcontainer;
hwaddr iova, end;
Int128 llend, llsize;
void *vaddr;
@@ -612,7 +613,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
giommu->iommu_mr = iommu_mr;
giommu->iommu_offset = section->offset_within_address_space -
section->offset_within_region;
- giommu->container = container;
+ giommu->bcontainer = bcontainer;
llend = int128_add(int128_make64(section->offset_within_region),
section->size);
llend = int128_sub(llend, int128_one());
@@ -647,7 +648,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
g_free(giommu);
goto fail;
}
- QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
+ QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next);
memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);
return;
@@ -732,6 +733,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
MemoryRegionSection *section)
{
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+ VFIOContainerBase *bcontainer = &container->bcontainer;
hwaddr iova, end;
Int128 llend, llsize;
int ret;
@@ -744,7 +746,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
if (memory_region_is_iommu(section->mr)) {
VFIOGuestIOMMU *giommu;
- QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
+ QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
giommu->n.start == section->offset_within_region) {
memory_region_unregister_iommu_notifier(section->mr,
@@ -1206,7 +1208,9 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
vfio_giommu_dirty_notifier *gdn = container_of(n,
vfio_giommu_dirty_notifier, n);
VFIOGuestIOMMU *giommu = gdn->giommu;
- VFIOContainer *container = giommu->container;
+ VFIOContainerBase *bcontainer = giommu->bcontainer;
+ VFIOContainer *container = container_of(bcontainer, VFIOContainer,
+ bcontainer);
hwaddr iova = iotlb->iova + giommu->iommu_offset;
ram_addr_t translated_addr;
int ret = -EINVAL;
@@ -1284,12 +1288,13 @@ static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
static int vfio_sync_dirty_bitmap(VFIOContainer *container,
MemoryRegionSection *section)
{
+ VFIOContainerBase *bcontainer = &container->bcontainer;
ram_addr_t ram_addr;
if (memory_region_is_iommu(section->mr)) {
VFIOGuestIOMMU *giommu;
- QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
+ QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) {
if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
giommu->n.start == section->offset_within_region) {
Int128 llend;
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index e929435751..20bcb9669a 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -34,8 +34,17 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
void vfio_container_init(VFIOContainerBase *bcontainer, const VFIOIOMMUOps *ops)
{
bcontainer->ops = ops;
+ QLIST_INIT(&bcontainer->giommu_list);
}
void vfio_container_destroy(VFIOContainerBase *bcontainer)
{
+ VFIOGuestIOMMU *giommu, *tmp;
+
+ QLIST_FOREACH_SAFE(giommu, &bcontainer->giommu_list, giommu_next, tmp) {
+ memory_region_unregister_iommu_notifier(
+ MEMORY_REGION(giommu->iommu_mr), &giommu->n);
+ QLIST_REMOVE(giommu, giommu_next);
+ g_free(giommu);
+ }
}
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 32a0251dd1..133d3c8f5c 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -556,7 +556,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
container->dirty_pages_supported = false;
container->dma_max_mappings = 0;
container->iova_ranges = NULL;
- QLIST_INIT(&container->giommu_list);
QLIST_INIT(&container->vrdl_list);
bcontainer = &container->bcontainer;
vfio_container_init(bcontainer, &vfio_legacy_ops);
@@ -686,16 +685,9 @@ static void vfio_disconnect_container(VFIOGroup *group)
if (QLIST_EMPTY(&container->group_list)) {
VFIOAddressSpace *space = container->space;
- VFIOGuestIOMMU *giommu, *tmp;
QLIST_REMOVE(container, next);
- QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
- memory_region_unregister_iommu_notifier(
- MEMORY_REGION(giommu->iommu_mr), &giommu->n);
- QLIST_REMOVE(giommu, giommu_next);
- g_free(giommu);
- }
vfio_container_destroy(bcontainer);
trace_vfio_disconnect_container(container->fd);
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 24a26345e5..6be082b8f2 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -95,7 +95,6 @@ typedef struct VFIOContainer {
uint64_t max_dirty_bitmap_size;
unsigned long pgsizes;
unsigned int dma_max_mappings;
- QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
QLIST_HEAD(, VFIOGroup) group_list;
QLIST_HEAD(, VFIORamDiscardListener) vrdl_list;
@@ -104,14 +103,6 @@ typedef struct VFIOContainer {
GList *iova_ranges;
} VFIOContainer;
-typedef struct VFIOGuestIOMMU {
- VFIOContainer *container;
- IOMMUMemoryRegion *iommu_mr;
- hwaddr iommu_offset;
- IOMMUNotifier n;
- QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
-} VFIOGuestIOMMU;
-
typedef struct VFIORamDiscardListener {
VFIOContainer *container;
MemoryRegion *mr;
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 577f52ccbc..a11aec5755 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -29,8 +29,17 @@ typedef struct {
*/
typedef struct VFIOContainerBase {
const VFIOIOMMUOps *ops;
+ QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
} VFIOContainerBase;
+typedef struct VFIOGuestIOMMU {
+ VFIOContainerBase *bcontainer;
+ IOMMUMemoryRegion *iommu_mr;
+ hwaddr iommu_offset;
+ IOMMUNotifier n;
+ QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
+} VFIOGuestIOMMU;
+
int vfio_container_dma_map(VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
void *vaddr, bool readonly);
--
2.39.3

View File

@ -0,0 +1,55 @@
From e9476ee64edd81fafd409fb3ceaad80668446bff Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:02 +0800
Subject: [PATCH 023/101] vfio/common: return early if space isn't empty
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [22/67] 239c21ae7cddc8efabc041b9c7774f15b4964631 (eauger1/centos-qemu-kvm)
This is a trivial optimization. If there is active container in space,
vfio_reset_handler will never be unregistered. So revert the check of
space->containers and return early.
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 1eae5b7bd3ddd03b5591e9122b011c6520064a5a)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 572ae7c934..934f4f5446 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1462,10 +1462,13 @@ VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
void vfio_put_address_space(VFIOAddressSpace *space)
{
- if (QLIST_EMPTY(&space->containers)) {
- QLIST_REMOVE(space, list);
- g_free(space);
+ if (!QLIST_EMPTY(&space->containers)) {
+ return;
}
+
+ QLIST_REMOVE(space, list);
+ g_free(space);
+
if (QLIST_EMPTY(&vfio_address_spaces)) {
qemu_unregister_reset(vfio_reset_handler, NULL);
}
--
2.39.3

View File

@ -0,0 +1,257 @@
From facad966c42b1ec38b12e45f2b84bd059542b60c Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 2 Nov 2023 15:12:35 +0800
Subject: [PATCH 010/101] vfio/container: Convert functions to base container
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [9/67] a0002d6e9cb0ca76e3e2f25208ecba22dd9f9a88 (eauger1/centos-qemu-kvm)
In the prospect to get rid of VFIOContainer refs
in common.c lets convert misc functions to use the base
container object instead:
vfio_devices_all_dirty_tracking
vfio_devices_all_device_dirty_tracking
vfio_devices_all_running_and_mig_active
vfio_devices_query_dirty_bitmap
vfio_get_dirty_bitmap
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit e1cac6b203f45b5322e831e8d50edfdf18609b09)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 42 +++++++++++++++--------------------
hw/vfio/container.c | 6 ++---
hw/vfio/trace-events | 2 +-
include/hw/vfio/vfio-common.h | 9 ++++----
4 files changed, 26 insertions(+), 33 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 9415395ed9..cf6618f6ed 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -177,9 +177,8 @@ bool vfio_device_state_is_precopy(VFIODevice *vbasedev)
migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P;
}
-static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
+static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer)
{
- VFIOContainerBase *bcontainer = &container->bcontainer;
VFIODevice *vbasedev;
MigrationState *ms = migrate_get_current();
@@ -204,9 +203,8 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
return true;
}
-bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container)
+bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer)
{
- VFIOContainerBase *bcontainer = &container->bcontainer;
VFIODevice *vbasedev;
QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
@@ -222,9 +220,8 @@ bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container)
* Check if all VFIO devices are running and migration is active, which is
* essentially equivalent to the migration being in pre-copy phase.
*/
-bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
+bool vfio_devices_all_running_and_mig_active(VFIOContainerBase *bcontainer)
{
- VFIOContainerBase *bcontainer = &container->bcontainer;
VFIODevice *vbasedev;
if (!migration_is_active(migrate_get_current())) {
@@ -1082,7 +1079,7 @@ static void vfio_listener_log_global_start(MemoryListener *listener)
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
int ret;
- if (vfio_devices_all_device_dirty_tracking(container)) {
+ if (vfio_devices_all_device_dirty_tracking(&container->bcontainer)) {
ret = vfio_devices_dma_logging_start(container);
} else {
ret = vfio_container_set_dirty_page_tracking(&container->bcontainer,
@@ -1101,7 +1098,7 @@ static void vfio_listener_log_global_stop(MemoryListener *listener)
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
int ret = 0;
- if (vfio_devices_all_device_dirty_tracking(container)) {
+ if (vfio_devices_all_device_dirty_tracking(&container->bcontainer)) {
vfio_devices_dma_logging_stop(container);
} else {
ret = vfio_container_set_dirty_page_tracking(&container->bcontainer,
@@ -1141,11 +1138,10 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
return 0;
}
-int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
+int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer,
VFIOBitmap *vbmap, hwaddr iova,
hwaddr size)
{
- VFIOContainerBase *bcontainer = &container->bcontainer;
VFIODevice *vbasedev;
int ret;
@@ -1165,17 +1161,16 @@ int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
return 0;
}
-int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
+int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova,
uint64_t size, ram_addr_t ram_addr)
{
bool all_device_dirty_tracking =
- vfio_devices_all_device_dirty_tracking(container);
+ vfio_devices_all_device_dirty_tracking(bcontainer);
uint64_t dirty_pages;
VFIOBitmap vbmap;
int ret;
- if (!container->bcontainer.dirty_pages_supported &&
- !all_device_dirty_tracking) {
+ if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) {
cpu_physical_memory_set_dirty_range(ram_addr, size,
tcg_enabled() ? DIRTY_CLIENTS_ALL :
DIRTY_CLIENTS_NOCODE);
@@ -1188,10 +1183,9 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
}
if (all_device_dirty_tracking) {
- ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size);
+ ret = vfio_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size);
} else {
- ret = vfio_container_query_dirty_bitmap(&container->bcontainer, &vbmap,
- iova, size);
+ ret = vfio_container_query_dirty_bitmap(bcontainer, &vbmap, iova, size);
}
if (ret) {
@@ -1201,8 +1195,7 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr,
vbmap.pages);
- trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size,
- ram_addr, dirty_pages);
+ trace_vfio_get_dirty_bitmap(iova, size, vbmap.size, ram_addr, dirty_pages);
out:
g_free(vbmap.bitmap);
@@ -1236,8 +1229,8 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
rcu_read_lock();
if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
- ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
- translated_addr);
+ ret = vfio_get_dirty_bitmap(&container->bcontainer, iova,
+ iotlb->addr_mask + 1, translated_addr);
if (ret) {
error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%s)",
@@ -1266,7 +1259,8 @@ static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
* Sync the whole mapped region (spanning multiple individual mappings)
* in one go.
*/
- return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr);
+ return vfio_get_dirty_bitmap(&vrdl->container->bcontainer, iova, size,
+ ram_addr);
}
static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
@@ -1335,7 +1329,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
ram_addr = memory_region_get_ram_addr(section->mr) +
section->offset_within_region;
- return vfio_get_dirty_bitmap(container,
+ return vfio_get_dirty_bitmap(&container->bcontainer,
REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
int128_get64(section->size), ram_addr);
}
@@ -1350,7 +1344,7 @@ static void vfio_listener_log_sync(MemoryListener *listener,
return;
}
- if (vfio_devices_all_dirty_tracking(container)) {
+ if (vfio_devices_all_dirty_tracking(&container->bcontainer)) {
ret = vfio_sync_dirty_bitmap(container, section);
if (ret) {
error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret,
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 63a906de93..7bd81eab09 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -129,8 +129,8 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova,
bool need_dirty_sync = false;
int ret;
- if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
- if (!vfio_devices_all_device_dirty_tracking(container) &&
+ if (iotlb && vfio_devices_all_running_and_mig_active(bcontainer)) {
+ if (!vfio_devices_all_device_dirty_tracking(bcontainer) &&
container->bcontainer.dirty_pages_supported) {
return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
}
@@ -162,7 +162,7 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova,
}
if (need_dirty_sync) {
- ret = vfio_get_dirty_bitmap(container, iova, size,
+ ret = vfio_get_dirty_bitmap(bcontainer, iova, size,
iotlb->translated_addr);
if (ret) {
return ret;
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 9f7fedee98..08a1f9dfa4 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -117,7 +117,7 @@ vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Devic
vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x"
vfio_legacy_dma_unmap_overflow_workaround(void) ""
-vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64
+vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64
vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
# platform.c
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 9740cf9fbc..bc67e1316c 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -186,7 +186,6 @@ typedef struct VFIODisplay {
VFIOAddressSpace *vfio_get_address_space(AddressSpace *as);
void vfio_put_address_space(VFIOAddressSpace *space);
-bool vfio_devices_all_running_and_saving(VFIOContainer *container);
/* SPAPR specific */
int vfio_container_add_section_window(VFIOContainer *container,
@@ -260,11 +259,11 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp);
void vfio_migration_exit(VFIODevice *vbasedev);
int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size);
-bool vfio_devices_all_running_and_mig_active(VFIOContainer *container);
-bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container);
-int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
+bool vfio_devices_all_running_and_mig_active(VFIOContainerBase *bcontainer);
+bool vfio_devices_all_device_dirty_tracking(VFIOContainerBase *bcontainer);
+int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer,
VFIOBitmap *vbmap, hwaddr iova,
hwaddr size);
-int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
+int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova,
uint64_t size, ram_addr_t ram_addr);
#endif /* HW_VFIO_VFIO_COMMON_H */
--
2.39.3

View File

@ -0,0 +1,97 @@
From a5d19bfbfddb36fa6d68ca6282a5acd9b245d48a Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 2 Nov 2023 15:12:41 +0800
Subject: [PATCH 016/101] vfio/container: Implement attach/detach_device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [15/67] e233c90e4af2061dc0612bc1b1d17be1a47daeae (eauger1/centos-qemu-kvm)
No functional change intended.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 1eb31f13b24c49884d8256f96a6664df2dd0824d)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 16 ++++++++++++++++
hw/vfio/container.c | 12 +++++-------
2 files changed, 21 insertions(+), 7 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 8ef2e7967d..483ba82089 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1498,3 +1498,19 @@ retry:
return info;
}
+
+int vfio_attach_device(char *name, VFIODevice *vbasedev,
+ AddressSpace *as, Error **errp)
+{
+ const VFIOIOMMUOps *ops = &vfio_legacy_ops;
+
+ return ops->attach_device(name, vbasedev, as, errp);
+}
+
+void vfio_detach_device(VFIODevice *vbasedev)
+{
+ if (!vbasedev->bcontainer) {
+ return;
+ }
+ vbasedev->bcontainer->ops->detach_device(vbasedev);
+}
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 721c0d7375..6bacf38222 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -873,8 +873,8 @@ static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp)
* @name and @vbasedev->name are likely to be different depending
* on the type of the device, hence the need for passing @name
*/
-int vfio_attach_device(char *name, VFIODevice *vbasedev,
- AddressSpace *as, Error **errp)
+static int vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
+ AddressSpace *as, Error **errp)
{
int groupid = vfio_device_groupid(vbasedev, errp);
VFIODevice *vbasedev_iter;
@@ -914,14 +914,10 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev,
return ret;
}
-void vfio_detach_device(VFIODevice *vbasedev)
+static void vfio_legacy_detach_device(VFIODevice *vbasedev)
{
VFIOGroup *group = vbasedev->group;
- if (!vbasedev->bcontainer) {
- return;
- }
-
QLIST_REMOVE(vbasedev, global_next);
QLIST_REMOVE(vbasedev, container_next);
vbasedev->bcontainer = NULL;
@@ -933,6 +929,8 @@ void vfio_detach_device(VFIODevice *vbasedev)
const VFIOIOMMUOps vfio_legacy_ops = {
.dma_map = vfio_legacy_dma_map,
.dma_unmap = vfio_legacy_dma_unmap,
+ .attach_device = vfio_legacy_attach_device,
+ .detach_device = vfio_legacy_detach_device,
.set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking,
.query_dirty_bitmap = vfio_legacy_query_dirty_bitmap,
};
--
2.39.3

View File

@ -0,0 +1,65 @@
From c3c9f366c356032fa57ff7cc664732ba87ceb3fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Tue, 19 Dec 2023 07:58:18 +0100
Subject: [PATCH 051/101] vfio/container: Initialize VFIOIOMMUOps under
vfio_init_container()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [50/67] f325136391b22babadb1be3394c527deecdcd3ca (eauger1/centos-qemu-kvm)
vfio_init_container() already defines the IOMMU type of the container.
Do the same for the VFIOIOMMUOps struct. This prepares ground for the
following patches that will deduce the associated VFIOIOMMUOps struct
from the IOMMU type.
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Tested-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit bffe92af0e7571868d47a1d1cd2205e13054d492)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/container.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index afcfe80488..f4a0434a52 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -370,7 +370,7 @@ static int vfio_get_iommu_type(VFIOContainer *container,
}
static int vfio_init_container(VFIOContainer *container, int group_fd,
- Error **errp)
+ VFIOAddressSpace *space, Error **errp)
{
int iommu_type, ret;
@@ -401,6 +401,7 @@ static int vfio_init_container(VFIOContainer *container, int group_fd,
}
container->iommu_type = iommu_type;
+ vfio_container_init(&container->bcontainer, space, &vfio_legacy_ops);
return 0;
}
@@ -583,9 +584,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
container = g_malloc0(sizeof(*container));
container->fd = fd;
bcontainer = &container->bcontainer;
- vfio_container_init(bcontainer, space, &vfio_legacy_ops);
- ret = vfio_init_container(container, group->fd, errp);
+ ret = vfio_init_container(container, group->fd, space, errp);
if (ret) {
goto free_container_exit;
}
--
2.39.3

View File

@ -0,0 +1,55 @@
From 29f13011e62f5370ef7fb3248dc85c90ae5bb042 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Tue, 19 Dec 2023 07:58:21 +0100
Subject: [PATCH 054/101] vfio/container: Intoduce a new VFIOIOMMUClass::setup
handler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [53/67] 8641161afc33d68795bcf51a47e89061b34d50a8 (eauger1/centos-qemu-kvm)
This will help in converting the sPAPR IOMMU backend to a QOM interface.
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Tested-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 61d893f2cdb34a2b0255f9b5fbba6b49b94ff730)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/container.c | 1 +
include/hw/vfio/vfio-container-base.h | 1 +
2 files changed, 2 insertions(+)
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 220e838a91..c22bdd3216 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1129,6 +1129,7 @@ static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data)
{
VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
+ vioc->setup = vfio_legacy_setup;
vioc->dma_map = vfio_legacy_dma_map;
vioc->dma_unmap = vfio_legacy_dma_unmap;
vioc->attach_device = vfio_legacy_attach_device;
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index c60370fc5e..ce8b1fba88 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -109,6 +109,7 @@ struct VFIOIOMMUClass {
InterfaceClass parent_class;
/* basic feature */
+ int (*setup)(VFIOContainerBase *bcontainer, Error **errp);
int (*dma_map)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
void *vaddr, bool readonly);
--
2.39.3

View File

@ -0,0 +1,143 @@
From 5b63e4595e106196ef922b7f762c8f4150d73979 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Tue, 19 Dec 2023 07:58:19 +0100
Subject: [PATCH 052/101] vfio/container: Introduce a VFIOIOMMU QOM interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [51/67] 7c06e2165efe94dcd203d44e422a7aa9fac9816c (eauger1/centos-qemu-kvm)
VFIOContainerBase was not introduced as an abstract QOM object because
it felt unnecessary to expose all the IOMMU backends to the QEMU
machine and human interface. However, we can still abstract the IOMMU
backend handlers using a QOM interface class. This provides more
flexibility when referencing the various implementations.
Simply transform the VFIOIOMMUOps struct in an InterfaceClass and do
some initial name replacements. Next changes will start converting
VFIOIOMMUOps.
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Tested-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit fdaa774e67435a328c0e28006c4d749f2198294a)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 2 +-
hw/vfio/container-base.c | 12 +++++++++++-
hw/vfio/pci.c | 2 +-
include/hw/vfio/vfio-container-base.h | 23 +++++++++++++++++++----
4 files changed, 32 insertions(+), 7 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 08a3e57672..49dab41566 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1503,7 +1503,7 @@ retry:
int vfio_attach_device(char *name, VFIODevice *vbasedev,
AddressSpace *as, Error **errp)
{
- const VFIOIOMMUOps *ops = &vfio_legacy_ops;
+ const VFIOIOMMUClass *ops = &vfio_legacy_ops;
#ifdef CONFIG_IOMMUFD
if (vbasedev->iommufd) {
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 1ffd25bbfa..913ae49077 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -72,7 +72,7 @@ int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
}
void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space,
- const VFIOIOMMUOps *ops)
+ const VFIOIOMMUClass *ops)
{
bcontainer->ops = ops;
bcontainer->space = space;
@@ -99,3 +99,13 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer)
g_list_free_full(bcontainer->iova_ranges, g_free);
}
+
+static const TypeInfo types[] = {
+ {
+ .name = TYPE_VFIO_IOMMU,
+ .parent = TYPE_INTERFACE,
+ .class_size = sizeof(VFIOIOMMUClass),
+ },
+};
+
+DEFINE_TYPES(types)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 83c3238608..adb7c09367 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2491,7 +2491,7 @@ int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev,
static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
{
VFIODevice *vbasedev = &vdev->vbasedev;
- const VFIOIOMMUOps *ops = vbasedev->bcontainer->ops;
+ const VFIOIOMMUClass *ops = vbasedev->bcontainer->ops;
return ops->pci_hot_reset(vbasedev, single);
}
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 5c9594b6c7..d6147b4aee 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -16,7 +16,8 @@
#include "exec/memory.h"
typedef struct VFIODevice VFIODevice;
-typedef struct VFIOIOMMUOps VFIOIOMMUOps;
+typedef struct VFIOIOMMUClass VFIOIOMMUClass;
+#define VFIOIOMMUOps VFIOIOMMUClass /* To remove */
typedef struct {
unsigned long *bitmap;
@@ -34,7 +35,7 @@ typedef struct VFIOAddressSpace {
* This is the base object for vfio container backends
*/
typedef struct VFIOContainerBase {
- const VFIOIOMMUOps *ops;
+ const VFIOIOMMUClass *ops;
VFIOAddressSpace *space;
MemoryListener listener;
Error *error;
@@ -88,10 +89,24 @@ int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
void vfio_container_init(VFIOContainerBase *bcontainer,
VFIOAddressSpace *space,
- const VFIOIOMMUOps *ops);
+ const VFIOIOMMUClass *ops);
void vfio_container_destroy(VFIOContainerBase *bcontainer);
-struct VFIOIOMMUOps {
+
+#define TYPE_VFIO_IOMMU "vfio-iommu"
+
+/*
+ * VFIOContainerBase is not an abstract QOM object because it felt
+ * unnecessary to expose all the IOMMU backends to the QEMU machine
+ * and human interface. However, we can still abstract the IOMMU
+ * backend handlers using a QOM interface class. This provides more
+ * flexibility when referencing the various implementations.
+ */
+DECLARE_CLASS_CHECKERS(VFIOIOMMUClass, VFIO_IOMMU, TYPE_VFIO_IOMMU)
+
+struct VFIOIOMMUClass {
+ InterfaceClass parent_class;
+
/* basic feature */
int (*dma_map)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
--
2.39.3

View File

@ -0,0 +1,168 @@
From 58927bf236541b9423f855eca1970f7a3cf864a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Tue, 19 Dec 2023 07:58:20 +0100
Subject: [PATCH 053/101] vfio/container: Introduce a VFIOIOMMU legacy QOM
interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [52/67] a81f39d13305e84699313e17ae64d10ff4b09067 (eauger1/centos-qemu-kvm)
Convert the legacy VFIOIOMMUOps struct to the new VFIOIOMMU QOM
interface. The set of of operations for this backend can be referenced
with a literal typename instead of a C struct. This will simplify
support of multiple backends.
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Tested-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 9812feefab3a4ff95a6cfd73aecb120b406bc98c)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 6 ++-
hw/vfio/container.c | 58 ++++++++++++++++++++++-----
include/hw/vfio/vfio-common.h | 1 -
include/hw/vfio/vfio-container-base.h | 1 +
4 files changed, 55 insertions(+), 11 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 49dab41566..2329d0efc8 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1503,13 +1503,17 @@ retry:
int vfio_attach_device(char *name, VFIODevice *vbasedev,
AddressSpace *as, Error **errp)
{
- const VFIOIOMMUClass *ops = &vfio_legacy_ops;
+ const VFIOIOMMUClass *ops =
+ VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
#ifdef CONFIG_IOMMUFD
if (vbasedev->iommufd) {
ops = &vfio_iommufd_ops;
}
#endif
+
+ assert(ops);
+
return ops->attach_device(name, vbasedev, as, errp);
}
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index f4a0434a52..220e838a91 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -369,10 +369,30 @@ static int vfio_get_iommu_type(VFIOContainer *container,
return -EINVAL;
}
+/*
+ * vfio_get_iommu_ops - get a VFIOIOMMUClass associated with a type
+ */
+static const VFIOIOMMUClass *vfio_get_iommu_class(int iommu_type, Error **errp)
+{
+ ObjectClass *klass = NULL;
+
+ switch (iommu_type) {
+ case VFIO_TYPE1v2_IOMMU:
+ case VFIO_TYPE1_IOMMU:
+ klass = object_class_by_name(TYPE_VFIO_IOMMU_LEGACY);
+ break;
+ default:
+ g_assert_not_reached();
+ };
+
+ return VFIO_IOMMU_CLASS(klass);
+}
+
static int vfio_init_container(VFIOContainer *container, int group_fd,
VFIOAddressSpace *space, Error **errp)
{
int iommu_type, ret;
+ const VFIOIOMMUClass *vioc;
iommu_type = vfio_get_iommu_type(container, errp);
if (iommu_type < 0) {
@@ -401,7 +421,14 @@ static int vfio_init_container(VFIOContainer *container, int group_fd,
}
container->iommu_type = iommu_type;
- vfio_container_init(&container->bcontainer, space, &vfio_legacy_ops);
+
+ vioc = vfio_get_iommu_class(iommu_type, errp);
+ if (!vioc) {
+ error_setg(errp, "No available IOMMU models");
+ return -EINVAL;
+ }
+
+ vfio_container_init(&container->bcontainer, space, vioc);
return 0;
}
@@ -1098,12 +1125,25 @@ out_single:
return ret;
}
-const VFIOIOMMUOps vfio_legacy_ops = {
- .dma_map = vfio_legacy_dma_map,
- .dma_unmap = vfio_legacy_dma_unmap,
- .attach_device = vfio_legacy_attach_device,
- .detach_device = vfio_legacy_detach_device,
- .set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking,
- .query_dirty_bitmap = vfio_legacy_query_dirty_bitmap,
- .pci_hot_reset = vfio_legacy_pci_hot_reset,
+static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data)
+{
+ VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
+
+ vioc->dma_map = vfio_legacy_dma_map;
+ vioc->dma_unmap = vfio_legacy_dma_unmap;
+ vioc->attach_device = vfio_legacy_attach_device;
+ vioc->detach_device = vfio_legacy_detach_device;
+ vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking;
+ vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap;
+ vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
};
+
+static const TypeInfo types[] = {
+ {
+ .name = TYPE_VFIO_IOMMU_LEGACY,
+ .parent = TYPE_VFIO_IOMMU,
+ .class_init = vfio_iommu_legacy_class_init,
+ },
+};
+
+DEFINE_TYPES(types)
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b8aa8a5495..14c497b6b0 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -210,7 +210,6 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList;
extern VFIOGroupList vfio_group_list;
extern VFIODeviceList vfio_device_list;
-extern const VFIOIOMMUOps vfio_legacy_ops;
extern const VFIOIOMMUOps vfio_iommufd_ops;
extern const MemoryListener vfio_memory_listener;
extern int vfio_kvm_device_fd;
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index d6147b4aee..c60370fc5e 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -94,6 +94,7 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer);
#define TYPE_VFIO_IOMMU "vfio-iommu"
+#define TYPE_VFIO_IOMMU_LEGACY TYPE_VFIO_IOMMU "-legacy"
/*
* VFIOContainerBase is not an abstract QOM object because it felt
--
2.39.3

View File

@ -0,0 +1,71 @@
From e56f961fbe95a53a52c5eca00b4fca17d825e860 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Thu, 2 Nov 2023 15:12:28 +0800
Subject: [PATCH 003/101] vfio/container: Introduce a empty VFIOIOMMUOps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [2/67] 0de0afffce42fa4a17f6d33a10b6162cdfbe8150 (eauger1/centos-qemu-kvm)
This empty VFIOIOMMUOps named vfio_legacy_ops will hold all general
IOMMU ops of legacy container.
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit d24668579184f4098779983724ec74cd3db62e10)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/container.c | 5 +++++
include/hw/vfio/vfio-common.h | 2 +-
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 242010036a..4bc43ddfa4 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -472,6 +472,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
Error **errp)
{
VFIOContainer *container;
+ VFIOContainerBase *bcontainer;
int ret, fd;
VFIOAddressSpace *space;
@@ -552,6 +553,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
container->iova_ranges = NULL;
QLIST_INIT(&container->giommu_list);
QLIST_INIT(&container->vrdl_list);
+ bcontainer = &container->bcontainer;
+ bcontainer->ops = &vfio_legacy_ops;
ret = vfio_init_container(container, group->fd, errp);
if (ret) {
@@ -933,3 +936,5 @@ void vfio_detach_device(VFIODevice *vbasedev)
vfio_put_base_device(vbasedev);
vfio_put_group(group);
}
+
+const VFIOIOMMUOps vfio_legacy_ops;
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 586d153c12..678161f207 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -255,7 +255,7 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList;
extern VFIOGroupList vfio_group_list;
extern VFIODeviceList vfio_device_list;
-
+extern const VFIOIOMMUOps vfio_legacy_ops;
extern const MemoryListener vfio_memory_listener;
extern int vfio_kvm_device_fd;
--
2.39.3

View File

@ -0,0 +1,118 @@
From 6c7546756e979e4f5ba29ae51a21c63fa90492cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Tue, 19 Dec 2023 07:58:17 +0100
Subject: [PATCH 050/101] vfio/container: Introduce vfio_legacy_setup() for
further cleanups
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [49/67] 3a621ba2605c98b7fbf7fd9f93a207f728f1202e (eauger1/centos-qemu-kvm)
This will help subsequent patches to unify the initialization of type1
and sPAPR IOMMU backends.
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Tested-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit d3764db87531cd53849ccee9b2f72aede90ccf5b)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/container.c | 63 +++++++++++++++++++++++++--------------------
1 file changed, 35 insertions(+), 28 deletions(-)
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 1e77a2929e..afcfe80488 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -474,6 +474,35 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container,
}
}
+static int vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp)
+{
+ VFIOContainer *container = container_of(bcontainer, VFIOContainer,
+ bcontainer);
+ g_autofree struct vfio_iommu_type1_info *info = NULL;
+ int ret;
+
+ ret = vfio_get_iommu_info(container, &info);
+ if (ret) {
+ error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
+ return ret;
+ }
+
+ if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
+ bcontainer->pgsizes = info->iova_pgsizes;
+ } else {
+ bcontainer->pgsizes = qemu_real_host_page_size();
+ }
+
+ if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) {
+ bcontainer->dma_max_mappings = 65535;
+ }
+
+ vfio_get_info_iova_range(info, bcontainer);
+
+ vfio_get_iommu_info_migration(container, info);
+ return 0;
+}
+
static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
Error **errp)
{
@@ -570,40 +599,18 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
switch (container->iommu_type) {
case VFIO_TYPE1v2_IOMMU:
case VFIO_TYPE1_IOMMU:
- {
- struct vfio_iommu_type1_info *info;
-
- ret = vfio_get_iommu_info(container, &info);
- if (ret) {
- error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
- goto enable_discards_exit;
- }
-
- if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
- bcontainer->pgsizes = info->iova_pgsizes;
- } else {
- bcontainer->pgsizes = qemu_real_host_page_size();
- }
-
- if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) {
- bcontainer->dma_max_mappings = 65535;
- }
-
- vfio_get_info_iova_range(info, bcontainer);
-
- vfio_get_iommu_info_migration(container, info);
- g_free(info);
+ ret = vfio_legacy_setup(bcontainer, errp);
break;
- }
case VFIO_SPAPR_TCE_v2_IOMMU:
case VFIO_SPAPR_TCE_IOMMU:
- {
ret = vfio_spapr_container_init(container, errp);
- if (ret) {
- goto enable_discards_exit;
- }
break;
+ default:
+ g_assert_not_reached();
}
+
+ if (ret) {
+ goto enable_discards_exit;
}
vfio_kvm_device_add_group(group);
--
2.39.3

View File

@ -0,0 +1,102 @@
From 6a597d7c82a4538fa1f928db7e600ec2e5a44361 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 2 Nov 2023 15:12:39 +0800
Subject: [PATCH 014/101] vfio/container: Move dirty_pgsizes and
max_dirty_bitmap_size to base container
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [13/67] b9fe57174368e36788b017cc2ad13b748592cfc2 (eauger1/centos-qemu-kvm)
No functional change intended.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 4d6b95010c59127ac4f7230d6ee88b5d0e99738c)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/container.c | 9 +++++----
include/hw/vfio/vfio-common.h | 2 --
include/hw/vfio/vfio-container-base.h | 2 ++
3 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 5c1dee8c9f..c8088a8174 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -64,6 +64,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
hwaddr iova, ram_addr_t size,
IOMMUTLBEntry *iotlb)
{
+ VFIOContainerBase *bcontainer = &container->bcontainer;
struct vfio_iommu_type1_dma_unmap *unmap;
struct vfio_bitmap *bitmap;
VFIOBitmap vbmap;
@@ -91,7 +92,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
bitmap->size = vbmap.size;
bitmap->data = (__u64 *)vbmap.bitmap;
- if (vbmap.size > container->max_dirty_bitmap_size) {
+ if (vbmap.size > bcontainer->max_dirty_bitmap_size) {
error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
ret = -E2BIG;
goto unmap_exit;
@@ -131,7 +132,7 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova,
if (iotlb && vfio_devices_all_running_and_mig_active(bcontainer)) {
if (!vfio_devices_all_device_dirty_tracking(bcontainer) &&
- container->bcontainer.dirty_pages_supported) {
+ bcontainer->dirty_pages_supported) {
return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
}
@@ -469,8 +470,8 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container,
*/
if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
bcontainer->dirty_pages_supported = true;
- container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
- container->dirty_pgsizes = cap_mig->pgsize_bitmap;
+ bcontainer->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
+ bcontainer->dirty_pgsizes = cap_mig->pgsize_bitmap;
}
}
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 922022cbc6..b1c9fe711b 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -80,8 +80,6 @@ typedef struct VFIOContainer {
int fd; /* /dev/vfio/vfio, empowered by the attached groups */
MemoryListener prereg_listener;
unsigned iommu_type;
- uint64_t dirty_pgsizes;
- uint64_t max_dirty_bitmap_size;
QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
QLIST_HEAD(, VFIOGroup) group_list;
GList *iova_ranges;
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 95f8d319e0..80e4a993c5 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -39,6 +39,8 @@ typedef struct VFIOContainerBase {
MemoryListener listener;
Error *error;
bool initialized;
+ uint64_t dirty_pgsizes;
+ uint64_t max_dirty_bitmap_size;
unsigned long pgsizes;
unsigned int dma_max_mappings;
bool dirty_pages_supported;
--
2.39.3

View File

@ -0,0 +1,168 @@
From 882143ef30da4182f049eb8192e0fac317c372b3 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Thu, 2 Nov 2023 15:12:40 +0800
Subject: [PATCH 015/101] vfio/container: Move iova_ranges to base container
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [14/67] 49f2e3c484b4c0c63be9aa4eb1bf08804dcb1ec3 (eauger1/centos-qemu-kvm)
Meanwhile remove the helper function vfio_free_container as it
only calls g_free now.
No functional change intended.
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit f79baf8c9575ac3193ca86ec508791c86d96b13e)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 5 +++--
hw/vfio/container-base.c | 3 +++
hw/vfio/container.c | 19 ++++++-------------
include/hw/vfio/vfio-common.h | 1 -
include/hw/vfio/vfio-container-base.h | 1 +
5 files changed, 13 insertions(+), 16 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index be623e544b..8ef2e7967d 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -637,9 +637,10 @@ static void vfio_listener_region_add(MemoryListener *listener,
goto fail;
}
- if (container->iova_ranges) {
+ if (bcontainer->iova_ranges) {
ret = memory_region_iommu_set_iova_ranges(giommu->iommu_mr,
- container->iova_ranges, &err);
+ bcontainer->iova_ranges,
+ &err);
if (ret) {
g_free(giommu);
goto fail;
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 7f508669f5..0177f43741 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -54,6 +54,7 @@ void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space,
bcontainer->error = NULL;
bcontainer->dirty_pages_supported = false;
bcontainer->dma_max_mappings = 0;
+ bcontainer->iova_ranges = NULL;
QLIST_INIT(&bcontainer->giommu_list);
QLIST_INIT(&bcontainer->vrdl_list);
}
@@ -70,4 +71,6 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer)
QLIST_REMOVE(giommu, giommu_next);
g_free(giommu);
}
+
+ g_list_free_full(bcontainer->iova_ranges, g_free);
}
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index c8088a8174..721c0d7375 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -308,7 +308,7 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
}
static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info,
- VFIOContainer *container)
+ VFIOContainerBase *bcontainer)
{
struct vfio_info_cap_header *hdr;
struct vfio_iommu_type1_info_cap_iova_range *cap;
@@ -326,8 +326,8 @@ static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info,
range_set_bounds(range, cap->iova_ranges[i].start,
cap->iova_ranges[i].end);
- container->iova_ranges =
- range_list_insert(container->iova_ranges, range);
+ bcontainer->iova_ranges =
+ range_list_insert(bcontainer->iova_ranges, range);
}
return true;
@@ -475,12 +475,6 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container,
}
}
-static void vfio_free_container(VFIOContainer *container)
-{
- g_list_free_full(container->iova_ranges, g_free);
- g_free(container);
-}
-
static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
Error **errp)
{
@@ -560,7 +554,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
container = g_malloc0(sizeof(*container));
container->fd = fd;
- container->iova_ranges = NULL;
bcontainer = &container->bcontainer;
vfio_container_init(bcontainer, space, &vfio_legacy_ops);
@@ -597,7 +590,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
bcontainer->dma_max_mappings = 65535;
}
- vfio_get_info_iova_range(info, container);
+ vfio_get_info_iova_range(info, bcontainer);
vfio_get_iommu_info_migration(container, info);
g_free(info);
@@ -649,7 +642,7 @@ enable_discards_exit:
vfio_ram_block_discard_disable(container, false);
free_container_exit:
- vfio_free_container(container);
+ g_free(container);
close_fd_exit:
close(fd);
@@ -693,7 +686,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
trace_vfio_disconnect_container(container->fd);
close(container->fd);
- vfio_free_container(container);
+ g_free(container);
vfio_put_address_space(space);
}
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b1c9fe711b..b9e5a0e64b 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -82,7 +82,6 @@ typedef struct VFIOContainer {
unsigned iommu_type;
QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
QLIST_HEAD(, VFIOGroup) group_list;
- GList *iova_ranges;
} VFIOContainer;
typedef struct VFIOHostDMAWindow {
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 80e4a993c5..9658ffb526 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -48,6 +48,7 @@ typedef struct VFIOContainerBase {
QLIST_HEAD(, VFIORamDiscardListener) vrdl_list;
QLIST_ENTRY(VFIOContainerBase) next;
QLIST_HEAD(, VFIODevice) device_list;
+ GList *iova_ranges;
} VFIOContainerBase;
typedef struct VFIOGuestIOMMU {
--
2.39.3

View File

@ -0,0 +1,522 @@
From 36bc7782bb02f81368e3e43a3947d16ad362e137 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 2 Nov 2023 15:12:38 +0800
Subject: [PATCH 013/101] vfio/container: Move listener to base container
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [12/67] f469ab126c6366170aa2520f9b4d9969d3ae0a04 (eauger1/centos-qemu-kvm)
Move listener to base container. Also error and initialized fields
are moved at the same time.
No functional change intended.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit c7b313d300f161c650d011a5c9da469bcd5d34e4)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 110 +++++++++++++-------------
hw/vfio/container-base.c | 1 +
hw/vfio/container.c | 19 +++--
hw/vfio/spapr.c | 11 +--
include/hw/vfio/vfio-common.h | 3 -
include/hw/vfio/vfio-container-base.h | 3 +
6 files changed, 74 insertions(+), 73 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index f15665789f..be623e544b 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -541,7 +541,7 @@ static bool vfio_listener_valid_section(MemoryRegionSection *section,
return true;
}
-static bool vfio_get_section_iova_range(VFIOContainer *container,
+static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer,
MemoryRegionSection *section,
hwaddr *out_iova, hwaddr *out_end,
Int128 *out_llend)
@@ -569,8 +569,10 @@ static bool vfio_get_section_iova_range(VFIOContainer *container,
static void vfio_listener_region_add(MemoryListener *listener,
MemoryRegionSection *section)
{
- VFIOContainer *container = container_of(listener, VFIOContainer, listener);
- VFIOContainerBase *bcontainer = &container->bcontainer;
+ VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
+ listener);
+ VFIOContainer *container = container_of(bcontainer, VFIOContainer,
+ bcontainer);
hwaddr iova, end;
Int128 llend, llsize;
void *vaddr;
@@ -581,7 +583,8 @@ static void vfio_listener_region_add(MemoryListener *listener,
return;
}
- if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
+ if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
+ &llend)) {
if (memory_region_is_ram_device(section->mr)) {
trace_vfio_listener_region_add_no_dma_map(
memory_region_name(section->mr),
@@ -688,13 +691,12 @@ static void vfio_listener_region_add(MemoryListener *listener,
}
}
- ret = vfio_container_dma_map(&container->bcontainer,
- iova, int128_get64(llsize), vaddr,
- section->readonly);
+ ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize),
+ vaddr, section->readonly);
if (ret) {
error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx", %p) = %d (%s)",
- container, iova, int128_get64(llsize), vaddr, ret,
+ bcontainer, iova, int128_get64(llsize), vaddr, ret,
strerror(-ret));
if (memory_region_is_ram_device(section->mr)) {
/* Allow unexpected mappings not to be fatal for RAM devices */
@@ -716,9 +718,9 @@ fail:
* can gracefully fail. Runtime, there's not much we can do other
* than throw a hardware error.
*/
- if (!container->initialized) {
- if (!container->error) {
- error_propagate_prepend(&container->error, err,
+ if (!bcontainer->initialized) {
+ if (!bcontainer->error) {
+ error_propagate_prepend(&bcontainer->error, err,
"Region %s: ",
memory_region_name(section->mr));
} else {
@@ -733,8 +735,10 @@ fail:
static void vfio_listener_region_del(MemoryListener *listener,
MemoryRegionSection *section)
{
- VFIOContainer *container = container_of(listener, VFIOContainer, listener);
- VFIOContainerBase *bcontainer = &container->bcontainer;
+ VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
+ listener);
+ VFIOContainer *container = container_of(bcontainer, VFIOContainer,
+ bcontainer);
hwaddr iova, end;
Int128 llend, llsize;
int ret;
@@ -767,7 +771,8 @@ static void vfio_listener_region_del(MemoryListener *listener,
*/
}
- if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
+ if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end,
+ &llend)) {
return;
}
@@ -790,22 +795,22 @@ static void vfio_listener_region_del(MemoryListener *listener,
if (int128_eq(llsize, int128_2_64())) {
/* The unmap ioctl doesn't accept a full 64-bit span. */
llsize = int128_rshift(llsize, 1);
- ret = vfio_container_dma_unmap(&container->bcontainer, iova,
+ ret = vfio_container_dma_unmap(bcontainer, iova,
int128_get64(llsize), NULL);
if (ret) {
error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%s)",
- container, iova, int128_get64(llsize), ret,
+ bcontainer, iova, int128_get64(llsize), ret,
strerror(-ret));
}
iova += int128_get64(llsize);
}
- ret = vfio_container_dma_unmap(&container->bcontainer, iova,
+ ret = vfio_container_dma_unmap(bcontainer, iova,
int128_get64(llsize), NULL);
if (ret) {
error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%s)",
- container, iova, int128_get64(llsize), ret,
+ bcontainer, iova, int128_get64(llsize), ret,
strerror(-ret));
}
}
@@ -825,16 +830,15 @@ typedef struct VFIODirtyRanges {
} VFIODirtyRanges;
typedef struct VFIODirtyRangesListener {
- VFIOContainer *container;
+ VFIOContainerBase *bcontainer;
VFIODirtyRanges ranges;
MemoryListener listener;
} VFIODirtyRangesListener;
static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
- VFIOContainer *container)
+ VFIOContainerBase *bcontainer)
{
VFIOPCIDevice *pcidev;
- VFIOContainerBase *bcontainer = &container->bcontainer;
VFIODevice *vbasedev;
Object *owner;
@@ -863,7 +867,7 @@ static void vfio_dirty_tracking_update(MemoryListener *listener,
hwaddr iova, end, *min, *max;
if (!vfio_listener_valid_section(section, "tracking_update") ||
- !vfio_get_section_iova_range(dirty->container, section,
+ !vfio_get_section_iova_range(dirty->bcontainer, section,
&iova, &end, NULL)) {
return;
}
@@ -887,7 +891,7 @@ static void vfio_dirty_tracking_update(MemoryListener *listener,
* The alternative would be an IOVATree but that has a much bigger runtime
* overhead and unnecessary complexity.
*/
- if (vfio_section_is_vfio_pci(section, dirty->container) &&
+ if (vfio_section_is_vfio_pci(section, dirty->bcontainer) &&
iova >= UINT32_MAX) {
min = &range->minpci64;
max = &range->maxpci64;
@@ -911,7 +915,7 @@ static const MemoryListener vfio_dirty_tracking_listener = {
.region_add = vfio_dirty_tracking_update,
};
-static void vfio_dirty_tracking_init(VFIOContainer *container,
+static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer,
VFIODirtyRanges *ranges)
{
VFIODirtyRangesListener dirty;
@@ -921,10 +925,10 @@ static void vfio_dirty_tracking_init(VFIOContainer *container,
dirty.ranges.min64 = UINT64_MAX;
dirty.ranges.minpci64 = UINT64_MAX;
dirty.listener = vfio_dirty_tracking_listener;
- dirty.container = container;
+ dirty.bcontainer = bcontainer;
memory_listener_register(&dirty.listener,
- container->bcontainer.space->as);
+ bcontainer->space->as);
*ranges = dirty.ranges;
@@ -936,12 +940,11 @@ static void vfio_dirty_tracking_init(VFIOContainer *container,
memory_listener_unregister(&dirty.listener);
}
-static void vfio_devices_dma_logging_stop(VFIOContainer *container)
+static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer)
{
uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
sizeof(uint64_t))] = {};
struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
- VFIOContainerBase *bcontainer = &container->bcontainer;
VFIODevice *vbasedev;
feature->argsz = sizeof(buf);
@@ -962,7 +965,7 @@ static void vfio_devices_dma_logging_stop(VFIOContainer *container)
}
static struct vfio_device_feature *
-vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
+vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer,
VFIODirtyRanges *tracking)
{
struct vfio_device_feature *feature;
@@ -1035,16 +1038,15 @@ static void vfio_device_feature_dma_logging_start_destroy(
g_free(feature);
}
-static int vfio_devices_dma_logging_start(VFIOContainer *container)
+static int vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer)
{
struct vfio_device_feature *feature;
VFIODirtyRanges ranges;
- VFIOContainerBase *bcontainer = &container->bcontainer;
VFIODevice *vbasedev;
int ret = 0;
- vfio_dirty_tracking_init(container, &ranges);
- feature = vfio_device_feature_dma_logging_start_create(container,
+ vfio_dirty_tracking_init(bcontainer, &ranges);
+ feature = vfio_device_feature_dma_logging_start_create(bcontainer,
&ranges);
if (!feature) {
return -errno;
@@ -1067,7 +1069,7 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container)
out:
if (ret) {
- vfio_devices_dma_logging_stop(container);
+ vfio_devices_dma_logging_stop(bcontainer);
}
vfio_device_feature_dma_logging_start_destroy(feature);
@@ -1077,14 +1079,14 @@ out:
static void vfio_listener_log_global_start(MemoryListener *listener)
{
- VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+ VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
+ listener);
int ret;
- if (vfio_devices_all_device_dirty_tracking(&container->bcontainer)) {
- ret = vfio_devices_dma_logging_start(container);
+ if (vfio_devices_all_device_dirty_tracking(bcontainer)) {
+ ret = vfio_devices_dma_logging_start(bcontainer);
} else {
- ret = vfio_container_set_dirty_page_tracking(&container->bcontainer,
- true);
+ ret = vfio_container_set_dirty_page_tracking(bcontainer, true);
}
if (ret) {
@@ -1096,14 +1098,14 @@ static void vfio_listener_log_global_start(MemoryListener *listener)
static void vfio_listener_log_global_stop(MemoryListener *listener)
{
- VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+ VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
+ listener);
int ret = 0;
- if (vfio_devices_all_device_dirty_tracking(&container->bcontainer)) {
- vfio_devices_dma_logging_stop(container);
+ if (vfio_devices_all_device_dirty_tracking(bcontainer)) {
+ vfio_devices_dma_logging_stop(bcontainer);
} else {
- ret = vfio_container_set_dirty_page_tracking(&container->bcontainer,
- false);
+ ret = vfio_container_set_dirty_page_tracking(bcontainer, false);
}
if (ret) {
@@ -1214,8 +1216,6 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
vfio_giommu_dirty_notifier, n);
VFIOGuestIOMMU *giommu = gdn->giommu;
VFIOContainerBase *bcontainer = giommu->bcontainer;
- VFIOContainer *container = container_of(bcontainer, VFIOContainer,
- bcontainer);
hwaddr iova = iotlb->iova + giommu->iommu_offset;
ram_addr_t translated_addr;
int ret = -EINVAL;
@@ -1230,12 +1230,12 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
rcu_read_lock();
if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
- ret = vfio_get_dirty_bitmap(&container->bcontainer, iova,
- iotlb->addr_mask + 1, translated_addr);
+ ret = vfio_get_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1,
+ translated_addr);
if (ret) {
error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%s)",
- container, iova, iotlb->addr_mask + 1, ret,
+ bcontainer, iova, iotlb->addr_mask + 1, ret,
strerror(-ret));
}
}
@@ -1291,10 +1291,9 @@ vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer,
&vrdl);
}
-static int vfio_sync_dirty_bitmap(VFIOContainer *container,
+static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer,
MemoryRegionSection *section)
{
- VFIOContainerBase *bcontainer = &container->bcontainer;
ram_addr_t ram_addr;
if (memory_region_is_iommu(section->mr)) {
@@ -1330,7 +1329,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
ram_addr = memory_region_get_ram_addr(section->mr) +
section->offset_within_region;
- return vfio_get_dirty_bitmap(&container->bcontainer,
+ return vfio_get_dirty_bitmap(bcontainer,
REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
int128_get64(section->size), ram_addr);
}
@@ -1338,15 +1337,16 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
static void vfio_listener_log_sync(MemoryListener *listener,
MemoryRegionSection *section)
{
- VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+ VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
+ listener);
int ret;
if (vfio_listener_skipped_section(section)) {
return;
}
- if (vfio_devices_all_dirty_tracking(&container->bcontainer)) {
- ret = vfio_sync_dirty_bitmap(container, section);
+ if (vfio_devices_all_dirty_tracking(bcontainer)) {
+ ret = vfio_sync_dirty_bitmap(bcontainer, section);
if (ret) {
error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret,
strerror(-ret));
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 584eee4ba1..7f508669f5 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -51,6 +51,7 @@ void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space,
{
bcontainer->ops = ops;
bcontainer->space = space;
+ bcontainer->error = NULL;
bcontainer->dirty_pages_supported = false;
bcontainer->dma_max_mappings = 0;
QLIST_INIT(&bcontainer->giommu_list);
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 6ba2e2f8c4..5c1dee8c9f 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -453,6 +453,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container,
{
struct vfio_info_cap_header *hdr;
struct vfio_iommu_type1_info_cap_migration *cap_mig;
+ VFIOContainerBase *bcontainer = &container->bcontainer;
hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
if (!hdr) {
@@ -467,7 +468,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container,
* qemu_real_host_page_size to mark those dirty.
*/
if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
- container->bcontainer.dirty_pages_supported = true;
+ bcontainer->dirty_pages_supported = true;
container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
container->dirty_pgsizes = cap_mig->pgsize_bitmap;
}
@@ -558,7 +559,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
container = g_malloc0(sizeof(*container));
container->fd = fd;
- container->error = NULL;
container->iova_ranges = NULL;
bcontainer = &container->bcontainer;
vfio_container_init(bcontainer, space, &vfio_legacy_ops);
@@ -621,25 +621,24 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
group->container = container;
QLIST_INSERT_HEAD(&container->group_list, group, container_next);
- container->listener = vfio_memory_listener;
-
- memory_listener_register(&container->listener, bcontainer->space->as);
+ bcontainer->listener = vfio_memory_listener;
+ memory_listener_register(&bcontainer->listener, bcontainer->space->as);
- if (container->error) {
+ if (bcontainer->error) {
ret = -1;
- error_propagate_prepend(errp, container->error,
+ error_propagate_prepend(errp, bcontainer->error,
"memory listener initialization failed: ");
goto listener_release_exit;
}
- container->initialized = true;
+ bcontainer->initialized = true;
return 0;
listener_release_exit:
QLIST_REMOVE(group, container_next);
QLIST_REMOVE(bcontainer, next);
vfio_kvm_device_del_group(group);
- memory_listener_unregister(&container->listener);
+ memory_listener_unregister(&bcontainer->listener);
if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
vfio_spapr_container_deinit(container);
@@ -674,7 +673,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
* group.
*/
if (QLIST_EMPTY(&container->group_list)) {
- memory_listener_unregister(&container->listener);
+ memory_listener_unregister(&bcontainer->listener);
if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
vfio_spapr_container_deinit(container);
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 4f76bdd3ca..7a50975f25 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -46,6 +46,7 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener,
{
VFIOContainer *container = container_of(listener, VFIOContainer,
prereg_listener);
+ VFIOContainerBase *bcontainer = &container->bcontainer;
const hwaddr gpa = section->offset_within_address_space;
hwaddr end;
int ret;
@@ -88,9 +89,9 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener,
* can gracefully fail. Runtime, there's not much we can do other
* than throw a hardware error.
*/
- if (!container->initialized) {
- if (!container->error) {
- error_setg_errno(&container->error, -ret,
+ if (!bcontainer->initialized) {
+ if (!bcontainer->error) {
+ error_setg_errno(&bcontainer->error, -ret,
"Memory registering failed");
}
} else {
@@ -445,9 +446,9 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
memory_listener_register(&container->prereg_listener,
&address_space_memory);
- if (container->error) {
+ if (bcontainer->error) {
ret = -1;
- error_propagate_prepend(errp, container->error,
+ error_propagate_prepend(errp, bcontainer->error,
"RAM memory listener initialization failed: ");
goto listener_unregister_exit;
}
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 8a607a4c17..922022cbc6 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -78,11 +78,8 @@ struct VFIOGroup;
typedef struct VFIOContainer {
VFIOContainerBase bcontainer;
int fd; /* /dev/vfio/vfio, empowered by the attached groups */
- MemoryListener listener;
MemoryListener prereg_listener;
unsigned iommu_type;
- Error *error;
- bool initialized;
uint64_t dirty_pgsizes;
uint64_t max_dirty_bitmap_size;
QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 8e05b5ac5a..95f8d319e0 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -36,6 +36,9 @@ typedef struct VFIOAddressSpace {
typedef struct VFIOContainerBase {
const VFIOIOMMUOps *ops;
VFIOAddressSpace *space;
+ MemoryListener listener;
+ Error *error;
+ bool initialized;
unsigned long pgsizes;
unsigned int dma_max_mappings;
bool dirty_pages_supported;
--
2.39.3

View File

@ -0,0 +1,230 @@
From 0b3fbb6bf5c5bccec184829ff9454fd637c512b9 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Thu, 2 Nov 2023 15:12:34 +0800
Subject: [PATCH 009/101] vfio/container: Move per container device list in
base container
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [8/67] d546cc25f4424b2d42356765c860fdaf4a3ba652 (eauger1/centos-qemu-kvm)
VFIO Device is also changed to point to base container instead of
legacy container.
No functional change intended.
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 3e6015d1117579324b456aa169dfca06da9922cf)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 23 +++++++++++++++--------
hw/vfio/container.c | 12 ++++++------
include/hw/vfio/vfio-common.h | 3 +--
include/hw/vfio/vfio-container-base.h | 1 +
4 files changed, 23 insertions(+), 16 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index b1a875ca93..9415395ed9 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -145,7 +145,7 @@ void vfio_unblock_multiple_devices_migration(void)
bool vfio_viommu_preset(VFIODevice *vbasedev)
{
- return vbasedev->container->bcontainer.space->as != &address_space_memory;
+ return vbasedev->bcontainer->space->as != &address_space_memory;
}
static void vfio_set_migration_error(int err)
@@ -179,6 +179,7 @@ bool vfio_device_state_is_precopy(VFIODevice *vbasedev)
static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
{
+ VFIOContainerBase *bcontainer = &container->bcontainer;
VFIODevice *vbasedev;
MigrationState *ms = migrate_get_current();
@@ -187,7 +188,7 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
return false;
}
- QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
+ QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
VFIOMigration *migration = vbasedev->migration;
if (!migration) {
@@ -205,9 +206,10 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container)
{
+ VFIOContainerBase *bcontainer = &container->bcontainer;
VFIODevice *vbasedev;
- QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
+ QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
if (!vbasedev->dirty_pages_supported) {
return false;
}
@@ -222,13 +224,14 @@ bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container)
*/
bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
{
+ VFIOContainerBase *bcontainer = &container->bcontainer;
VFIODevice *vbasedev;
if (!migration_is_active(migrate_get_current())) {
return false;
}
- QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
+ QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
VFIOMigration *migration = vbasedev->migration;
if (!migration) {
@@ -833,12 +836,13 @@ static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
VFIOContainer *container)
{
VFIOPCIDevice *pcidev;
+ VFIOContainerBase *bcontainer = &container->bcontainer;
VFIODevice *vbasedev;
Object *owner;
owner = memory_region_owner(section->mr);
- QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
+ QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
continue;
}
@@ -939,13 +943,14 @@ static void vfio_devices_dma_logging_stop(VFIOContainer *container)
uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
sizeof(uint64_t))] = {};
struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
+ VFIOContainerBase *bcontainer = &container->bcontainer;
VFIODevice *vbasedev;
feature->argsz = sizeof(buf);
feature->flags = VFIO_DEVICE_FEATURE_SET |
VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
- QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
+ QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
if (!vbasedev->dirty_tracking) {
continue;
}
@@ -1036,6 +1041,7 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container)
{
struct vfio_device_feature *feature;
VFIODirtyRanges ranges;
+ VFIOContainerBase *bcontainer = &container->bcontainer;
VFIODevice *vbasedev;
int ret = 0;
@@ -1046,7 +1052,7 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container)
return -errno;
}
- QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
+ QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
if (vbasedev->dirty_tracking) {
continue;
}
@@ -1139,10 +1145,11 @@ int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
VFIOBitmap *vbmap, hwaddr iova,
hwaddr size)
{
+ VFIOContainerBase *bcontainer = &container->bcontainer;
VFIODevice *vbasedev;
int ret;
- QLIST_FOREACH(vbasedev, &container->device_list, container_next) {
+ QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) {
ret = vfio_device_dma_logging_report(vbasedev, iova, size,
vbmap->bitmap);
if (ret) {
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 3ab74e2615..63a906de93 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -888,7 +888,7 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev,
int groupid = vfio_device_groupid(vbasedev, errp);
VFIODevice *vbasedev_iter;
VFIOGroup *group;
- VFIOContainer *container;
+ VFIOContainerBase *bcontainer;
int ret;
if (groupid < 0) {
@@ -915,9 +915,9 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev,
return ret;
}
- container = group->container;
- vbasedev->container = container;
- QLIST_INSERT_HEAD(&container->device_list, vbasedev, container_next);
+ bcontainer = &group->container->bcontainer;
+ vbasedev->bcontainer = bcontainer;
+ QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
return ret;
@@ -927,13 +927,13 @@ void vfio_detach_device(VFIODevice *vbasedev)
{
VFIOGroup *group = vbasedev->group;
- if (!vbasedev->container) {
+ if (!vbasedev->bcontainer) {
return;
}
QLIST_REMOVE(vbasedev, global_next);
QLIST_REMOVE(vbasedev, container_next);
- vbasedev->container = NULL;
+ vbasedev->bcontainer = NULL;
trace_vfio_detach_device(vbasedev->name, group->groupid);
vfio_put_base_device(vbasedev);
vfio_put_group(group);
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 60f2785fe0..9740cf9fbc 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -90,7 +90,6 @@ typedef struct VFIOContainer {
QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
QLIST_HEAD(, VFIOGroup) group_list;
QLIST_HEAD(, VFIORamDiscardListener) vrdl_list;
- QLIST_HEAD(, VFIODevice) device_list;
GList *iova_ranges;
} VFIOContainer;
@@ -118,7 +117,7 @@ typedef struct VFIODevice {
QLIST_ENTRY(VFIODevice) container_next;
QLIST_ENTRY(VFIODevice) global_next;
struct VFIOGroup *group;
- VFIOContainer *container;
+ VFIOContainerBase *bcontainer;
char *sysfsdev;
char *name;
DeviceState *dev;
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index f244f003d0..7090962496 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -39,6 +39,7 @@ typedef struct VFIOContainerBase {
bool dirty_pages_supported;
QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
QLIST_ENTRY(VFIOContainerBase) next;
+ QLIST_HEAD(, VFIODevice) device_list;
} VFIOContainerBase;
typedef struct VFIOGuestIOMMU {
--
2.39.3

View File

@ -0,0 +1,242 @@
From d798939fbbe6c27200c165edd6f3771413821b34 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 2 Nov 2023 15:12:36 +0800
Subject: [PATCH 011/101] vfio/container: Move pgsizes and dma_max_mappings to
base container
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [10/67] e80696175aba159a17ce9a869535db66682deb08 (eauger1/centos-qemu-kvm)
No functional change intended.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 7ab1cb74ffdbf92ef237243b41bde5c7067d5298)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 17 +++++++++--------
hw/vfio/container-base.c | 1 +
hw/vfio/container.c | 11 +++++------
hw/vfio/spapr.c | 10 ++++++----
include/hw/vfio/vfio-common.h | 2 --
include/hw/vfio/vfio-container-base.h | 2 ++
6 files changed, 23 insertions(+), 20 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index cf6618f6ed..1cb53d369e 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -401,6 +401,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
static void vfio_register_ram_discard_listener(VFIOContainer *container,
MemoryRegionSection *section)
{
+ VFIOContainerBase *bcontainer = &container->bcontainer;
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
VFIORamDiscardListener *vrdl;
@@ -419,8 +420,8 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container,
section->mr);
g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
- g_assert(container->pgsizes &&
- vrdl->granularity >= 1ULL << ctz64(container->pgsizes));
+ g_assert(bcontainer->pgsizes &&
+ vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes));
ram_discard_listener_init(&vrdl->listener,
vfio_ram_discard_notify_populate,
@@ -441,7 +442,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container,
* number of sections in the address space we could have over time,
* also consuming DMA mappings.
*/
- if (container->dma_max_mappings) {
+ if (bcontainer->dma_max_mappings) {
unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;
#ifdef CONFIG_KVM
@@ -462,11 +463,11 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container,
}
if (vrdl_mappings + max_memslots - vrdl_count >
- container->dma_max_mappings) {
+ bcontainer->dma_max_mappings) {
warn_report("%s: possibly running out of DMA mappings. E.g., try"
" increasing the 'block-size' of virtio-mem devies."
" Maximum possible DMA mappings: %d, Maximum possible"
- " memslots: %d", __func__, container->dma_max_mappings,
+ " memslots: %d", __func__, bcontainer->dma_max_mappings,
max_memslots);
}
}
@@ -626,7 +627,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
iommu_idx);
ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr,
- container->pgsizes,
+ bcontainer->pgsizes,
&err);
if (ret) {
g_free(giommu);
@@ -675,7 +676,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
llsize = int128_sub(llend, int128_make64(iova));
if (memory_region_is_ram_device(section->mr)) {
- hwaddr pgmask = (1ULL << ctz64(container->pgsizes)) - 1;
+ hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;
if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
trace_vfio_listener_region_add_no_dma_map(
@@ -777,7 +778,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
if (memory_region_is_ram_device(section->mr)) {
hwaddr pgmask;
- pgmask = (1ULL << ctz64(container->pgsizes)) - 1;
+ pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;
try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
} else if (memory_region_has_ram_discard_manager(section->mr)) {
vfio_unregister_ram_discard_listener(container, section);
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 5d654ae172..dcce111349 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -52,6 +52,7 @@ void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space,
bcontainer->ops = ops;
bcontainer->space = space;
bcontainer->dirty_pages_supported = false;
+ bcontainer->dma_max_mappings = 0;
QLIST_INIT(&bcontainer->giommu_list);
}
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 7bd81eab09..c5a6262882 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -154,7 +154,7 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova,
if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
container->iommu_type == VFIO_TYPE1v2_IOMMU) {
trace_vfio_legacy_dma_unmap_overflow_workaround();
- unmap.size -= 1ULL << ctz64(container->pgsizes);
+ unmap.size -= 1ULL << ctz64(bcontainer->pgsizes);
continue;
}
error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
@@ -559,7 +559,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
container = g_malloc0(sizeof(*container));
container->fd = fd;
container->error = NULL;
- container->dma_max_mappings = 0;
container->iova_ranges = NULL;
QLIST_INIT(&container->vrdl_list);
bcontainer = &container->bcontainer;
@@ -589,13 +588,13 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
}
if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
- container->pgsizes = info->iova_pgsizes;
+ bcontainer->pgsizes = info->iova_pgsizes;
} else {
- container->pgsizes = qemu_real_host_page_size();
+ bcontainer->pgsizes = qemu_real_host_page_size();
}
- if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) {
- container->dma_max_mappings = 65535;
+ if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) {
+ bcontainer->dma_max_mappings = 65535;
}
vfio_get_info_iova_range(info, container);
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 83da2f7ec2..4f76bdd3ca 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -226,6 +226,7 @@ static int vfio_spapr_create_window(VFIOContainer *container,
hwaddr *pgsize)
{
int ret = 0;
+ VFIOContainerBase *bcontainer = &container->bcontainer;
IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask;
unsigned entries, bits_total, bits_per_level, max_levels;
@@ -239,13 +240,13 @@ static int vfio_spapr_create_window(VFIOContainer *container,
if (pagesize > rampagesize) {
pagesize = rampagesize;
}
- pgmask = container->pgsizes & (pagesize | (pagesize - 1));
+ pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1));
pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0;
if (!pagesize) {
error_report("Host doesn't support page size 0x%"PRIx64
", the supported mask is 0x%lx",
memory_region_iommu_get_min_page_size(iommu_mr),
- container->pgsizes);
+ bcontainer->pgsizes);
return -EINVAL;
}
@@ -421,6 +422,7 @@ void vfio_container_del_section_window(VFIOContainer *container,
int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
{
+ VFIOContainerBase *bcontainer = &container->bcontainer;
struct vfio_iommu_spapr_tce_info info;
bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
int ret, fd = container->fd;
@@ -461,7 +463,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
}
if (v2) {
- container->pgsizes = info.ddw.pgsizes;
+ bcontainer->pgsizes = info.ddw.pgsizes;
/*
* There is a default window in just created container.
* To make region_add/del simpler, we better remove this
@@ -476,7 +478,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
}
} else {
/* The default table uses 4K pages */
- container->pgsizes = 0x1000;
+ bcontainer->pgsizes = 0x1000;
vfio_host_win_add(container, info.dma32_window_start,
info.dma32_window_start +
info.dma32_window_size - 1,
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index bc67e1316c..d3dc2f9dcb 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -85,8 +85,6 @@ typedef struct VFIOContainer {
bool initialized;
uint64_t dirty_pgsizes;
uint64_t max_dirty_bitmap_size;
- unsigned long pgsizes;
- unsigned int dma_max_mappings;
QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
QLIST_HEAD(, VFIOGroup) group_list;
QLIST_HEAD(, VFIORamDiscardListener) vrdl_list;
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 7090962496..85ec7e1a56 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -36,6 +36,8 @@ typedef struct VFIOAddressSpace {
typedef struct VFIOContainerBase {
const VFIOIOMMUOps *ops;
VFIOAddressSpace *space;
+ unsigned long pgsizes;
+ unsigned int dma_max_mappings;
bool dirty_pages_supported;
QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
QLIST_ENTRY(VFIOContainerBase) next;
--
2.39.3

View File

@ -0,0 +1,265 @@
From 3ba43cbc5b096feed6272e070cf152d5fc74df01 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 2 Nov 2023 15:12:32 +0800
Subject: [PATCH 007/101] vfio/container: Move space field to base container
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [6/67] b0aa17d9ec4588bd64373452a30306e826234d0b (eauger1/centos-qemu-kvm)
Move the space field to the base object. Also the VFIOAddressSpace
now contains a list of base containers.
No functional change intended.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit e5597063386a0c76308ad16da31726d23f489945)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/ppc/spapr_pci_vfio.c | 10 +++++-----
hw/vfio/common.c | 4 ++--
hw/vfio/container-base.c | 6 +++++-
hw/vfio/container.c | 18 ++++++++----------
include/hw/vfio/vfio-common.h | 8 --------
include/hw/vfio/vfio-container-base.h | 9 +++++++++
6 files changed, 29 insertions(+), 26 deletions(-)
diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
index f283f7e38d..d1d07bec46 100644
--- a/hw/ppc/spapr_pci_vfio.c
+++ b/hw/ppc/spapr_pci_vfio.c
@@ -84,27 +84,27 @@ static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
{
VFIOAddressSpace *space = vfio_get_address_space(as);
- VFIOContainer *container = NULL;
+ VFIOContainerBase *bcontainer = NULL;
if (QLIST_EMPTY(&space->containers)) {
/* No containers to act on */
goto out;
}
- container = QLIST_FIRST(&space->containers);
+ bcontainer = QLIST_FIRST(&space->containers);
- if (QLIST_NEXT(container, next)) {
+ if (QLIST_NEXT(bcontainer, next)) {
/*
* We don't yet have logic to synchronize EEH state across
* multiple containers
*/
- container = NULL;
+ bcontainer = NULL;
goto out;
}
out:
vfio_put_address_space(space);
- return container;
+ return container_of(bcontainer, VFIOContainer, bcontainer);
}
static bool vfio_eeh_as_ok(AddressSpace *as)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 43580bcc43..1d8202537e 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -145,7 +145,7 @@ void vfio_unblock_multiple_devices_migration(void)
bool vfio_viommu_preset(VFIODevice *vbasedev)
{
- return vbasedev->container->space->as != &address_space_memory;
+ return vbasedev->container->bcontainer.space->as != &address_space_memory;
}
static void vfio_set_migration_error(int err)
@@ -922,7 +922,7 @@ static void vfio_dirty_tracking_init(VFIOContainer *container,
dirty.container = container;
memory_listener_register(&dirty.listener,
- container->space->as);
+ container->bcontainer.space->as);
*ranges = dirty.ranges;
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 20bcb9669a..3933391e0d 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -31,9 +31,11 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb);
}
-void vfio_container_init(VFIOContainerBase *bcontainer, const VFIOIOMMUOps *ops)
+void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space,
+ const VFIOIOMMUOps *ops)
{
bcontainer->ops = ops;
+ bcontainer->space = space;
QLIST_INIT(&bcontainer->giommu_list);
}
@@ -41,6 +43,8 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer)
{
VFIOGuestIOMMU *giommu, *tmp;
+ QLIST_REMOVE(bcontainer, next);
+
QLIST_FOREACH_SAFE(giommu, &bcontainer->giommu_list, giommu_next, tmp) {
memory_region_unregister_iommu_notifier(
MEMORY_REGION(giommu->iommu_mr), &giommu->n);
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 133d3c8f5c..f12fcb6fe1 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -514,7 +514,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
* details once we know which type of IOMMU we are using.
*/
- QLIST_FOREACH(container, &space->containers, next) {
+ QLIST_FOREACH(bcontainer, &space->containers, next) {
+ container = container_of(bcontainer, VFIOContainer, bcontainer);
if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
ret = vfio_ram_block_discard_disable(container, true);
if (ret) {
@@ -550,7 +551,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
}
container = g_malloc0(sizeof(*container));
- container->space = space;
container->fd = fd;
container->error = NULL;
container->dirty_pages_supported = false;
@@ -558,7 +558,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
container->iova_ranges = NULL;
QLIST_INIT(&container->vrdl_list);
bcontainer = &container->bcontainer;
- vfio_container_init(bcontainer, &vfio_legacy_ops);
+ vfio_container_init(bcontainer, space, &vfio_legacy_ops);
ret = vfio_init_container(container, group->fd, errp);
if (ret) {
@@ -613,14 +613,14 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
vfio_kvm_device_add_group(group);
QLIST_INIT(&container->group_list);
- QLIST_INSERT_HEAD(&space->containers, container, next);
+ QLIST_INSERT_HEAD(&space->containers, bcontainer, next);
group->container = container;
QLIST_INSERT_HEAD(&container->group_list, group, container_next);
container->listener = vfio_memory_listener;
- memory_listener_register(&container->listener, container->space->as);
+ memory_listener_register(&container->listener, bcontainer->space->as);
if (container->error) {
ret = -1;
@@ -634,7 +634,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
return 0;
listener_release_exit:
QLIST_REMOVE(group, container_next);
- QLIST_REMOVE(container, next);
+ QLIST_REMOVE(bcontainer, next);
vfio_kvm_device_del_group(group);
memory_listener_unregister(&container->listener);
if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
@@ -684,9 +684,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
}
if (QLIST_EMPTY(&container->group_list)) {
- VFIOAddressSpace *space = container->space;
-
- QLIST_REMOVE(container, next);
+ VFIOAddressSpace *space = bcontainer->space;
vfio_container_destroy(bcontainer);
@@ -707,7 +705,7 @@ static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
QLIST_FOREACH(group, &vfio_group_list, next) {
if (group->groupid == groupid) {
/* Found it. Now is it already in the right context? */
- if (group->container->space->as == as) {
+ if (group->container->bcontainer.space->as == as) {
return group;
} else {
error_setg(errp, "group %d used in multiple address spaces",
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 6be082b8f2..bd4de6cb3a 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -73,17 +73,10 @@ typedef struct VFIOMigration {
bool initial_data_sent;
} VFIOMigration;
-typedef struct VFIOAddressSpace {
- AddressSpace *as;
- QLIST_HEAD(, VFIOContainer) containers;
- QLIST_ENTRY(VFIOAddressSpace) list;
-} VFIOAddressSpace;
-
struct VFIOGroup;
typedef struct VFIOContainer {
VFIOContainerBase bcontainer;
- VFIOAddressSpace *space;
int fd; /* /dev/vfio/vfio, empowered by the attached groups */
MemoryListener listener;
MemoryListener prereg_listener;
@@ -98,7 +91,6 @@ typedef struct VFIOContainer {
QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
QLIST_HEAD(, VFIOGroup) group_list;
QLIST_HEAD(, VFIORamDiscardListener) vrdl_list;
- QLIST_ENTRY(VFIOContainer) next;
QLIST_HEAD(, VFIODevice) device_list;
GList *iova_ranges;
} VFIOContainer;
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index a11aec5755..c7cc6ec9c5 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -24,12 +24,20 @@ typedef struct {
hwaddr pages;
} VFIOBitmap;
+typedef struct VFIOAddressSpace {
+ AddressSpace *as;
+ QLIST_HEAD(, VFIOContainerBase) containers;
+ QLIST_ENTRY(VFIOAddressSpace) list;
+} VFIOAddressSpace;
+
/*
* This is the base object for vfio container backends
*/
typedef struct VFIOContainerBase {
const VFIOIOMMUOps *ops;
+ VFIOAddressSpace *space;
QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
+ QLIST_ENTRY(VFIOContainerBase) next;
} VFIOContainerBase;
typedef struct VFIOGuestIOMMU {
@@ -48,6 +56,7 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
IOMMUTLBEntry *iotlb);
void vfio_container_init(VFIOContainerBase *bcontainer,
+ VFIOAddressSpace *space,
const VFIOIOMMUOps *ops);
void vfio_container_destroy(VFIOContainerBase *bcontainer);
--
2.39.3

View File

@ -0,0 +1,255 @@
From aadd055dcc06cb964ebfd2868b7e9b207d62ae0e Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Thu, 2 Nov 2023 15:12:37 +0800
Subject: [PATCH 012/101] vfio/container: Move vrdl_list to base container
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [11/67] 42da5389e39291839259f0e4c020c7461b7225cc (eauger1/centos-qemu-kvm)
No functional change intended.
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit dc74a4b0056c0c803d46612a2319294921097974)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 38 +++++++++++++--------------
hw/vfio/container-base.c | 1 +
hw/vfio/container.c | 1 -
include/hw/vfio/vfio-common.h | 11 --------
include/hw/vfio/vfio-container-base.h | 11 ++++++++
5 files changed, 31 insertions(+), 31 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 1cb53d369e..f15665789f 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -351,13 +351,13 @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
{
VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
listener);
+ VFIOContainerBase *bcontainer = vrdl->bcontainer;
const hwaddr size = int128_get64(section->size);
const hwaddr iova = section->offset_within_address_space;
int ret;
/* Unmap with a single call. */
- ret = vfio_container_dma_unmap(&vrdl->container->bcontainer,
- iova, size , NULL);
+ ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL);
if (ret) {
error_report("%s: vfio_container_dma_unmap() failed: %s", __func__,
strerror(-ret));
@@ -369,6 +369,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
{
VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
listener);
+ VFIOContainerBase *bcontainer = vrdl->bcontainer;
const hwaddr end = section->offset_within_region +
int128_get64(section->size);
hwaddr start, next, iova;
@@ -387,8 +388,8 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
section->offset_within_address_space;
vaddr = memory_region_get_ram_ptr(section->mr) + start;
- ret = vfio_container_dma_map(&vrdl->container->bcontainer, iova,
- next - start, vaddr, section->readonly);
+ ret = vfio_container_dma_map(bcontainer, iova, next - start,
+ vaddr, section->readonly);
if (ret) {
/* Rollback */
vfio_ram_discard_notify_discard(rdl, section);
@@ -398,10 +399,9 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
return 0;
}
-static void vfio_register_ram_discard_listener(VFIOContainer *container,
+static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer,
MemoryRegionSection *section)
{
- VFIOContainerBase *bcontainer = &container->bcontainer;
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
VFIORamDiscardListener *vrdl;
@@ -412,7 +412,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container,
g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));
vrdl = g_new0(VFIORamDiscardListener, 1);
- vrdl->container = container;
+ vrdl->bcontainer = bcontainer;
vrdl->mr = section->mr;
vrdl->offset_within_address_space = section->offset_within_address_space;
vrdl->size = int128_get64(section->size);
@@ -427,7 +427,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container,
vfio_ram_discard_notify_populate,
vfio_ram_discard_notify_discard, true);
ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
- QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next);
+ QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next);
/*
* Sanity-check if we have a theoretically problematic setup where we could
@@ -451,7 +451,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container,
}
#endif
- QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
+ QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
hwaddr start, end;
start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
@@ -473,13 +473,13 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container,
}
}
-static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
+static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer,
MemoryRegionSection *section)
{
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
VFIORamDiscardListener *vrdl = NULL;
- QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
+ QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
if (vrdl->mr == section->mr &&
vrdl->offset_within_address_space ==
section->offset_within_address_space) {
@@ -663,7 +663,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
* about changes.
*/
if (memory_region_has_ram_discard_manager(section->mr)) {
- vfio_register_ram_discard_listener(container, section);
+ vfio_register_ram_discard_listener(bcontainer, section);
return;
}
@@ -781,7 +781,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1;
try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
} else if (memory_region_has_ram_discard_manager(section->mr)) {
- vfio_unregister_ram_discard_listener(container, section);
+ vfio_unregister_ram_discard_listener(bcontainer, section);
/* Unregistering will trigger an unmap. */
try_unmap = false;
}
@@ -1260,17 +1260,17 @@ static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
* Sync the whole mapped region (spanning multiple individual mappings)
* in one go.
*/
- return vfio_get_dirty_bitmap(&vrdl->container->bcontainer, iova, size,
- ram_addr);
+ return vfio_get_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr);
}
-static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
- MemoryRegionSection *section)
+static int
+vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer,
+ MemoryRegionSection *section)
{
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
VFIORamDiscardListener *vrdl = NULL;
- QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
+ QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) {
if (vrdl->mr == section->mr &&
vrdl->offset_within_address_space ==
section->offset_within_address_space) {
@@ -1324,7 +1324,7 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
}
return 0;
} else if (memory_region_has_ram_discard_manager(section->mr)) {
- return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
+ return vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section);
}
ram_addr = memory_region_get_ram_addr(section->mr) +
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index dcce111349..584eee4ba1 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -54,6 +54,7 @@ void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space,
bcontainer->dirty_pages_supported = false;
bcontainer->dma_max_mappings = 0;
QLIST_INIT(&bcontainer->giommu_list);
+ QLIST_INIT(&bcontainer->vrdl_list);
}
void vfio_container_destroy(VFIOContainerBase *bcontainer)
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index c5a6262882..6ba2e2f8c4 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -560,7 +560,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
container->fd = fd;
container->error = NULL;
container->iova_ranges = NULL;
- QLIST_INIT(&container->vrdl_list);
bcontainer = &container->bcontainer;
vfio_container_init(bcontainer, space, &vfio_legacy_ops);
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index d3dc2f9dcb..8a607a4c17 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -87,20 +87,9 @@ typedef struct VFIOContainer {
uint64_t max_dirty_bitmap_size;
QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
QLIST_HEAD(, VFIOGroup) group_list;
- QLIST_HEAD(, VFIORamDiscardListener) vrdl_list;
GList *iova_ranges;
} VFIOContainer;
-typedef struct VFIORamDiscardListener {
- VFIOContainer *container;
- MemoryRegion *mr;
- hwaddr offset_within_address_space;
- hwaddr size;
- uint64_t granularity;
- RamDiscardListener listener;
- QLIST_ENTRY(VFIORamDiscardListener) next;
-} VFIORamDiscardListener;
-
typedef struct VFIOHostDMAWindow {
hwaddr min_iova;
hwaddr max_iova;
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 85ec7e1a56..8e05b5ac5a 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -40,6 +40,7 @@ typedef struct VFIOContainerBase {
unsigned int dma_max_mappings;
bool dirty_pages_supported;
QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
+ QLIST_HEAD(, VFIORamDiscardListener) vrdl_list;
QLIST_ENTRY(VFIOContainerBase) next;
QLIST_HEAD(, VFIODevice) device_list;
} VFIOContainerBase;
@@ -52,6 +53,16 @@ typedef struct VFIOGuestIOMMU {
QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
} VFIOGuestIOMMU;
+typedef struct VFIORamDiscardListener {
+ VFIOContainerBase *bcontainer;
+ MemoryRegion *mr;
+ hwaddr offset_within_address_space;
+ hwaddr size;
+ uint64_t granularity;
+ RamDiscardListener listener;
+ QLIST_ENTRY(VFIORamDiscardListener) next;
+} VFIORamDiscardListener;
+
int vfio_container_dma_map(VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
void *vaddr, bool readonly);
--
2.39.3

View File

@ -0,0 +1,66 @@
From edfc1ee2a1854d180ffad92e70212535a2ca668c Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Thu, 21 Dec 2023 10:45:17 +0800
Subject: [PATCH 062/101] vfio/container: Rename vfio_init_container to
vfio_set_iommu
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [61/67] 5e7f956379b54fe6fa7e078ec17e71325aa109af (eauger1/centos-qemu-kvm)
vfio_container_init() and vfio_init_container() names are confusing
especially when we see vfio_init_container() calls vfio_container_init().
vfio_container_init() operates on base container which is consistent
with all routines handling 'VFIOContainerBase *' ops.
vfio_init_container() operates on legacy container and setup IOMMU
context with ioctl(VFIO_SET_IOMMU).
So choose to rename vfio_init_container to vfio_set_iommu to avoid
the confusion.
No functional change intended.
Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 9f734a117cbf63b03577b46c8cad8ad88ec6dced)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/container.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 8d334f52f2..bd25b9fbad 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -392,8 +392,8 @@ static const VFIOIOMMUClass *vfio_get_iommu_class(int iommu_type, Error **errp)
return VFIO_IOMMU_CLASS(klass);
}
-static int vfio_init_container(VFIOContainer *container, int group_fd,
- VFIOAddressSpace *space, Error **errp)
+static int vfio_set_iommu(VFIOContainer *container, int group_fd,
+ VFIOAddressSpace *space, Error **errp)
{
int iommu_type, ret;
const VFIOIOMMUClass *vioc;
@@ -616,7 +616,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
container->fd = fd;
bcontainer = &container->bcontainer;
- ret = vfio_init_container(container, group->fd, space, errp);
+ ret = vfio_set_iommu(container, group->fd, space, errp);
if (ret) {
goto free_container_exit;
}
--
2.39.3

View File

@ -0,0 +1,59 @@
From 8d3857c7877da58ed0c6b62cf2714c4127350522 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Wed, 20 Dec 2023 14:53:02 +0100
Subject: [PATCH 059/101] vfio/container: Replace basename with
g_path_get_basename
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [58/67] 56a90f23dadc89271b1fff014fc64ade87c1a4cb (eauger1/centos-qemu-kvm)
g_path_get_basename() is a portable utility function that has the
advantage of not modifing the string argument. It also fixes a compile
breakage with the Musl C library reported in [1].
[1] https://lore.kernel.org/all/20231212010228.2701544-1-raj.khem@gmail.com/
Reported-by: Khem Raj <raj.khem@gmail.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Zhao Liu <zhao1.liu@intel.com>
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 213ae3ffda463c0503e39e0cf827511b5298c314)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/container.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 688cf23bab..8d334f52f2 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -869,7 +869,8 @@ static void vfio_put_base_device(VFIODevice *vbasedev)
static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp)
{
- char *tmp, group_path[PATH_MAX], *group_name;
+ char *tmp, group_path[PATH_MAX];
+ g_autofree char *group_name = NULL;
int ret, groupid;
ssize_t len;
@@ -885,7 +886,7 @@ static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp)
group_path[len] = 0;
- group_name = basename(group_path);
+ group_name = g_path_get_basename(group_path);
if (sscanf(group_name, "%d", &groupid) != 1) {
error_setg_errno(errp, errno, "failed to read %s", group_path);
return -errno;
--
2.39.3

View File

@ -0,0 +1,235 @@
From a2c8aa64b1b21a3e1d4cf2a4fe7d84dc32f69284 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 2 Nov 2023 15:12:33 +0800
Subject: [PATCH 008/101] vfio/container: Switch to IOMMU BE
set_dirty_page_tracking/query_dirty_bitmap API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [7/67] 88368809c7990e1d9b01406e48694fe3e3fb1397 (eauger1/centos-qemu-kvm)
dirty_pages_supported field is also moved to the base container
No functional change intended.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit bb424490edcef73d07f200d53f69415b203d81df)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 12 ++++++++----
hw/vfio/container-base.c | 16 ++++++++++++++++
hw/vfio/container.c | 21 ++++++++++++++-------
include/hw/vfio/vfio-common.h | 6 ------
include/hw/vfio/vfio-container-base.h | 6 ++++++
5 files changed, 44 insertions(+), 17 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 1d8202537e..b1a875ca93 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1079,7 +1079,8 @@ static void vfio_listener_log_global_start(MemoryListener *listener)
if (vfio_devices_all_device_dirty_tracking(container)) {
ret = vfio_devices_dma_logging_start(container);
} else {
- ret = vfio_set_dirty_page_tracking(container, true);
+ ret = vfio_container_set_dirty_page_tracking(&container->bcontainer,
+ true);
}
if (ret) {
@@ -1097,7 +1098,8 @@ static void vfio_listener_log_global_stop(MemoryListener *listener)
if (vfio_devices_all_device_dirty_tracking(container)) {
vfio_devices_dma_logging_stop(container);
} else {
- ret = vfio_set_dirty_page_tracking(container, false);
+ ret = vfio_container_set_dirty_page_tracking(&container->bcontainer,
+ false);
}
if (ret) {
@@ -1165,7 +1167,8 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
VFIOBitmap vbmap;
int ret;
- if (!container->dirty_pages_supported && !all_device_dirty_tracking) {
+ if (!container->bcontainer.dirty_pages_supported &&
+ !all_device_dirty_tracking) {
cpu_physical_memory_set_dirty_range(ram_addr, size,
tcg_enabled() ? DIRTY_CLIENTS_ALL :
DIRTY_CLIENTS_NOCODE);
@@ -1180,7 +1183,8 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
if (all_device_dirty_tracking) {
ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size);
} else {
- ret = vfio_query_dirty_bitmap(container, &vbmap, iova, size);
+ ret = vfio_container_query_dirty_bitmap(&container->bcontainer, &vbmap,
+ iova, size);
}
if (ret) {
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 3933391e0d..5d654ae172 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -31,11 +31,27 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb);
}
+int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
+ bool start)
+{
+ g_assert(bcontainer->ops->set_dirty_page_tracking);
+ return bcontainer->ops->set_dirty_page_tracking(bcontainer, start);
+}
+
+int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer,
+ VFIOBitmap *vbmap,
+ hwaddr iova, hwaddr size)
+{
+ g_assert(bcontainer->ops->query_dirty_bitmap);
+ return bcontainer->ops->query_dirty_bitmap(bcontainer, vbmap, iova, size);
+}
+
void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space,
const VFIOIOMMUOps *ops)
{
bcontainer->ops = ops;
bcontainer->space = space;
+ bcontainer->dirty_pages_supported = false;
QLIST_INIT(&bcontainer->giommu_list);
}
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index f12fcb6fe1..3ab74e2615 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -131,7 +131,7 @@ static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova,
if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
if (!vfio_devices_all_device_dirty_tracking(container) &&
- container->dirty_pages_supported) {
+ container->bcontainer.dirty_pages_supported) {
return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
}
@@ -205,14 +205,17 @@ static int vfio_legacy_dma_map(VFIOContainerBase *bcontainer, hwaddr iova,
return -errno;
}
-int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
+static int vfio_legacy_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
+ bool start)
{
+ VFIOContainer *container = container_of(bcontainer, VFIOContainer,
+ bcontainer);
int ret;
struct vfio_iommu_type1_dirty_bitmap dirty = {
.argsz = sizeof(dirty),
};
- if (!container->dirty_pages_supported) {
+ if (!bcontainer->dirty_pages_supported) {
return 0;
}
@@ -232,9 +235,12 @@ int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
return ret;
}
-int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap,
- hwaddr iova, hwaddr size)
+static int vfio_legacy_query_dirty_bitmap(VFIOContainerBase *bcontainer,
+ VFIOBitmap *vbmap,
+ hwaddr iova, hwaddr size)
{
+ VFIOContainer *container = container_of(bcontainer, VFIOContainer,
+ bcontainer);
struct vfio_iommu_type1_dirty_bitmap *dbitmap;
struct vfio_iommu_type1_dirty_bitmap_get *range;
int ret;
@@ -461,7 +467,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container,
* qemu_real_host_page_size to mark those dirty.
*/
if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
- container->dirty_pages_supported = true;
+ container->bcontainer.dirty_pages_supported = true;
container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
container->dirty_pgsizes = cap_mig->pgsize_bitmap;
}
@@ -553,7 +559,6 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
container = g_malloc0(sizeof(*container));
container->fd = fd;
container->error = NULL;
- container->dirty_pages_supported = false;
container->dma_max_mappings = 0;
container->iova_ranges = NULL;
QLIST_INIT(&container->vrdl_list);
@@ -937,4 +942,6 @@ void vfio_detach_device(VFIODevice *vbasedev)
const VFIOIOMMUOps vfio_legacy_ops = {
.dma_map = vfio_legacy_dma_map,
.dma_unmap = vfio_legacy_dma_unmap,
+ .set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking,
+ .query_dirty_bitmap = vfio_legacy_query_dirty_bitmap,
};
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index bd4de6cb3a..60f2785fe0 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -83,7 +83,6 @@ typedef struct VFIOContainer {
unsigned iommu_type;
Error *error;
bool initialized;
- bool dirty_pages_supported;
uint64_t dirty_pgsizes;
uint64_t max_dirty_bitmap_size;
unsigned long pgsizes;
@@ -190,11 +189,6 @@ VFIOAddressSpace *vfio_get_address_space(AddressSpace *as);
void vfio_put_address_space(VFIOAddressSpace *space);
bool vfio_devices_all_running_and_saving(VFIOContainer *container);
-/* container->fd */
-int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start);
-int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap,
- hwaddr iova, hwaddr size);
-
/* SPAPR specific */
int vfio_container_add_section_window(VFIOContainer *container,
MemoryRegionSection *section,
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index c7cc6ec9c5..f244f003d0 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -36,6 +36,7 @@ typedef struct VFIOAddressSpace {
typedef struct VFIOContainerBase {
const VFIOIOMMUOps *ops;
VFIOAddressSpace *space;
+ bool dirty_pages_supported;
QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
QLIST_ENTRY(VFIOContainerBase) next;
} VFIOContainerBase;
@@ -54,6 +55,11 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer,
int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
IOMMUTLBEntry *iotlb);
+int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
+ bool start);
+int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer,
+ VFIOBitmap *vbmap,
+ hwaddr iova, hwaddr size);
void vfio_container_init(VFIOContainerBase *bcontainer,
VFIOAddressSpace *space,
--
2.39.3

View File

@ -0,0 +1,303 @@
From 00daef8e3f4f64b1401b2e8945c256d27fbfa960 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Thu, 2 Nov 2023 15:12:29 +0800
Subject: [PATCH 004/101] vfio/container: Switch to dma_map|unmap API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [3/67] 9a20e2f2b277be65463f145df3309271493be6ac (eauger1/centos-qemu-kvm)
No functional change intended.
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit b08501a999e2448f500a46d68da503be55186b04)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 45 +++++++++++++++------------
hw/vfio/container-base.c | 32 +++++++++++++++++++
hw/vfio/container.c | 22 ++++++++-----
hw/vfio/meson.build | 1 +
hw/vfio/trace-events | 2 +-
include/hw/vfio/vfio-common.h | 4 ---
include/hw/vfio/vfio-container-base.h | 7 +++++
7 files changed, 81 insertions(+), 32 deletions(-)
create mode 100644 hw/vfio/container-base.c
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index e70fdf5e0c..e610771888 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -292,7 +292,7 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
- VFIOContainer *container = giommu->container;
+ VFIOContainerBase *bcontainer = &giommu->container->bcontainer;
hwaddr iova = iotlb->iova + giommu->iommu_offset;
void *vaddr;
int ret;
@@ -322,21 +322,22 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
* of vaddr will always be there, even if the memory object is
* destroyed and its backing memory munmap-ed.
*/
- ret = vfio_dma_map(container, iova,
- iotlb->addr_mask + 1, vaddr,
- read_only);
+ ret = vfio_container_dma_map(bcontainer, iova,
+ iotlb->addr_mask + 1, vaddr,
+ read_only);
if (ret) {
- error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
+ error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx", %p) = %d (%s)",
- container, iova,
+ bcontainer, iova,
iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
}
} else {
- ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
+ ret = vfio_container_dma_unmap(bcontainer, iova,
+ iotlb->addr_mask + 1, iotlb);
if (ret) {
- error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
+ error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%s)",
- container, iova,
+ bcontainer, iova,
iotlb->addr_mask + 1, ret, strerror(-ret));
vfio_set_migration_error(ret);
}
@@ -355,9 +356,10 @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
int ret;
/* Unmap with a single call. */
- ret = vfio_dma_unmap(vrdl->container, iova, size , NULL);
+ ret = vfio_container_dma_unmap(&vrdl->container->bcontainer,
+ iova, size , NULL);
if (ret) {
- error_report("%s: vfio_dma_unmap() failed: %s", __func__,
+ error_report("%s: vfio_container_dma_unmap() failed: %s", __func__,
strerror(-ret));
}
}
@@ -385,8 +387,8 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
section->offset_within_address_space;
vaddr = memory_region_get_ram_ptr(section->mr) + start;
- ret = vfio_dma_map(vrdl->container, iova, next - start,
- vaddr, section->readonly);
+ ret = vfio_container_dma_map(&vrdl->container->bcontainer, iova,
+ next - start, vaddr, section->readonly);
if (ret) {
/* Rollback */
vfio_ram_discard_notify_discard(rdl, section);
@@ -684,10 +686,11 @@ static void vfio_listener_region_add(MemoryListener *listener,
}
}
- ret = vfio_dma_map(container, iova, int128_get64(llsize),
- vaddr, section->readonly);
+ ret = vfio_container_dma_map(&container->bcontainer,
+ iova, int128_get64(llsize), vaddr,
+ section->readonly);
if (ret) {
- error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
+ error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx", %p) = %d (%s)",
container, iova, int128_get64(llsize), vaddr, ret,
strerror(-ret));
@@ -784,18 +787,20 @@ static void vfio_listener_region_del(MemoryListener *listener,
if (int128_eq(llsize, int128_2_64())) {
/* The unmap ioctl doesn't accept a full 64-bit span. */
llsize = int128_rshift(llsize, 1);
- ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
+ ret = vfio_container_dma_unmap(&container->bcontainer, iova,
+ int128_get64(llsize), NULL);
if (ret) {
- error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
+ error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%s)",
container, iova, int128_get64(llsize), ret,
strerror(-ret));
}
iova += int128_get64(llsize);
}
- ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
+ ret = vfio_container_dma_unmap(&container->bcontainer, iova,
+ int128_get64(llsize), NULL);
if (ret) {
- error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
+ error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%s)",
container, iova, int128_get64(llsize), ret,
strerror(-ret));
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
new file mode 100644
index 0000000000..55d3a35fa4
--- /dev/null
+++ b/hw/vfio/container-base.c
@@ -0,0 +1,32 @@
+/*
+ * VFIO BASE CONTAINER
+ *
+ * Copyright (C) 2023 Intel Corporation.
+ * Copyright Red Hat, Inc. 2023
+ *
+ * Authors: Yi Liu <yi.l.liu@intel.com>
+ * Eric Auger <eric.auger@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "hw/vfio/vfio-container-base.h"
+
+int vfio_container_dma_map(VFIOContainerBase *bcontainer,
+ hwaddr iova, ram_addr_t size,
+ void *vaddr, bool readonly)
+{
+ g_assert(bcontainer->ops->dma_map);
+ return bcontainer->ops->dma_map(bcontainer, iova, size, vaddr, readonly);
+}
+
+int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
+ hwaddr iova, ram_addr_t size,
+ IOMMUTLBEntry *iotlb)
+{
+ g_assert(bcontainer->ops->dma_unmap);
+ return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb);
+}
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 4bc43ddfa4..c04df26323 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -115,9 +115,11 @@ unmap_exit:
/*
* DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
*/
-int vfio_dma_unmap(VFIOContainer *container, hwaddr iova,
- ram_addr_t size, IOMMUTLBEntry *iotlb)
+static int vfio_legacy_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova,
+ ram_addr_t size, IOMMUTLBEntry *iotlb)
{
+ VFIOContainer *container = container_of(bcontainer, VFIOContainer,
+ bcontainer);
struct vfio_iommu_type1_dma_unmap unmap = {
.argsz = sizeof(unmap),
.flags = 0,
@@ -151,7 +153,7 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova,
*/
if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
container->iommu_type == VFIO_TYPE1v2_IOMMU) {
- trace_vfio_dma_unmap_overflow_workaround();
+ trace_vfio_legacy_dma_unmap_overflow_workaround();
unmap.size -= 1ULL << ctz64(container->pgsizes);
continue;
}
@@ -170,9 +172,11 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova,
return 0;
}
-int vfio_dma_map(VFIOContainer *container, hwaddr iova,
- ram_addr_t size, void *vaddr, bool readonly)
+static int vfio_legacy_dma_map(VFIOContainerBase *bcontainer, hwaddr iova,
+ ram_addr_t size, void *vaddr, bool readonly)
{
+ VFIOContainer *container = container_of(bcontainer, VFIOContainer,
+ bcontainer);
struct vfio_iommu_type1_dma_map map = {
.argsz = sizeof(map),
.flags = VFIO_DMA_MAP_FLAG_READ,
@@ -191,7 +195,8 @@ int vfio_dma_map(VFIOContainer *container, hwaddr iova,
* the VGA ROM space.
*/
if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
- (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
+ (errno == EBUSY &&
+ vfio_legacy_dma_unmap(bcontainer, iova, size, NULL) == 0 &&
ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
return 0;
}
@@ -937,4 +942,7 @@ void vfio_detach_device(VFIODevice *vbasedev)
vfio_put_group(group);
}
-const VFIOIOMMUOps vfio_legacy_ops;
+const VFIOIOMMUOps vfio_legacy_ops = {
+ .dma_map = vfio_legacy_dma_map,
+ .dma_unmap = vfio_legacy_dma_unmap,
+};
diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
index 2a6912c940..eb6ce6229d 100644
--- a/hw/vfio/meson.build
+++ b/hw/vfio/meson.build
@@ -2,6 +2,7 @@ vfio_ss = ss.source_set()
vfio_ss.add(files(
'helpers.c',
'common.c',
+ 'container-base.c',
'container.c',
'spapr.c',
'migration.c',
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 0eb2387cf2..9f7fedee98 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -116,7 +116,7 @@ vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Re
vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries"
vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x"
-vfio_dma_unmap_overflow_workaround(void) ""
+vfio_legacy_dma_unmap_overflow_workaround(void) ""
vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64
vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 678161f207..24a26345e5 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -208,10 +208,6 @@ void vfio_put_address_space(VFIOAddressSpace *space);
bool vfio_devices_all_running_and_saving(VFIOContainer *container);
/* container->fd */
-int vfio_dma_unmap(VFIOContainer *container, hwaddr iova,
- ram_addr_t size, IOMMUTLBEntry *iotlb);
-int vfio_dma_map(VFIOContainer *container, hwaddr iova,
- ram_addr_t size, void *vaddr, bool readonly);
int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start);
int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap,
hwaddr iova, hwaddr size);
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 1d6daaea5d..56b033f59f 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -31,6 +31,13 @@ typedef struct VFIOContainerBase {
const VFIOIOMMUOps *ops;
} VFIOContainerBase;
+int vfio_container_dma_map(VFIOContainerBase *bcontainer,
+ hwaddr iova, ram_addr_t size,
+ void *vaddr, bool readonly);
+int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
+ hwaddr iova, ram_addr_t size,
+ IOMMUTLBEntry *iotlb);
+
struct VFIOIOMMUOps {
/* basic feature */
int (*dma_map)(VFIOContainerBase *bcontainer,
--
2.39.3

View File

@ -0,0 +1,115 @@
From 49435d4d592bc890f56b69c2290f890c87b5a103 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:05 +0800
Subject: [PATCH 026/101] vfio/iommufd: Add support for iova_ranges and pgsizes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [25/67] 578af0547d97276ccd4936b574c12118fc70d468 (eauger1/centos-qemu-kvm)
Some vIOMMU such as virtio-iommu use IOVA ranges from host side to
setup reserved ranges for passthrough device, so that guest will not
use an IOVA range beyond host support.
Use an uAPI of IOMMUFD to get IOVA ranges of host side and pass to
vIOMMU just like the legacy backend, if this fails, fallback to
64bit IOVA range.
Also use out_iova_alignment returned from uAPI as pgsizes instead of
qemu_real_host_page_size() as a fallback.
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 714e9affa8ae1d84007c8afde7bb10fef9cb883d)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/iommufd.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 55 insertions(+), 1 deletion(-)
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 6d31aeac7b..01b448e840 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -261,6 +261,53 @@ static int iommufd_cdev_ram_block_discard_disable(bool state)
return ram_block_uncoordinated_discard_disable(state);
}
+static int iommufd_cdev_get_info_iova_range(VFIOIOMMUFDContainer *container,
+ uint32_t ioas_id, Error **errp)
+{
+ VFIOContainerBase *bcontainer = &container->bcontainer;
+ struct iommu_ioas_iova_ranges *info;
+ struct iommu_iova_range *iova_ranges;
+ int ret, sz, fd = container->be->fd;
+
+ info = g_malloc0(sizeof(*info));
+ info->size = sizeof(*info);
+ info->ioas_id = ioas_id;
+
+ ret = ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info);
+ if (ret && errno != EMSGSIZE) {
+ goto error;
+ }
+
+ sz = info->num_iovas * sizeof(struct iommu_iova_range);
+ info = g_realloc(info, sizeof(*info) + sz);
+ info->allowed_iovas = (uintptr_t)(info + 1);
+
+ ret = ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info);
+ if (ret) {
+ goto error;
+ }
+
+ iova_ranges = (struct iommu_iova_range *)(uintptr_t)info->allowed_iovas;
+
+ for (int i = 0; i < info->num_iovas; i++) {
+ Range *range = g_new(Range, 1);
+
+ range_set_bounds(range, iova_ranges[i].start, iova_ranges[i].last);
+ bcontainer->iova_ranges =
+ range_list_insert(bcontainer->iova_ranges, range);
+ }
+ bcontainer->pgsizes = info->out_iova_alignment;
+
+ g_free(info);
+ return 0;
+
+error:
+ ret = -errno;
+ g_free(info);
+ error_setg_errno(errp, errno, "Cannot get IOVA ranges");
+ return ret;
+}
+
static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
AddressSpace *as, Error **errp)
{
@@ -335,7 +382,14 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
goto err_discard_disable;
}
- bcontainer->pgsizes = qemu_real_host_page_size();
+ ret = iommufd_cdev_get_info_iova_range(container, ioas_id, &err);
+ if (ret) {
+ error_append_hint(&err,
+ "Fallback to default 64bit IOVA range and 4K page size\n");
+ warn_report_err(err);
+ err = NULL;
+ bcontainer->pgsizes = qemu_real_host_page_size();
+ }
bcontainer->listener = vfio_memory_listener;
memory_listener_register(&bcontainer->listener, bcontainer->space->as);
--
2.39.3

View File

@ -0,0 +1,215 @@
From e94700896dd8fcea149d9719eccde6f485440be2 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:08 +0800
Subject: [PATCH 029/101] vfio/iommufd: Enable pci hot reset through iommufd
cdev interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [28/67] ca1ae970138ee4a6f4b3b49817e775f3159f4c97 (eauger1/centos-qemu-kvm)
Implement the newly introduced pci_hot_reset callback named
iommufd_cdev_pci_hot_reset to do iommufd specific check and
reset operation.
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 96d6f85ff012abd7aaa35b1a2bc48b8640c898d9)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/iommufd.c | 150 +++++++++++++++++++++++++++++++++++++++++++
hw/vfio/trace-events | 1 +
2 files changed, 151 insertions(+)
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 01b448e840..6e53e013ef 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -24,6 +24,7 @@
#include "sysemu/reset.h"
#include "qemu/cutils.h"
#include "qemu/chardev_open.h"
+#include "pci.h"
static int iommufd_cdev_map(VFIOContainerBase *bcontainer, hwaddr iova,
ram_addr_t size, void *vaddr, bool readonly)
@@ -468,9 +469,158 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev)
close(vbasedev->fd);
}
+static VFIODevice *iommufd_cdev_pci_find_by_devid(__u32 devid)
+{
+ VFIODevice *vbasedev_iter;
+
+ QLIST_FOREACH(vbasedev_iter, &vfio_device_list, global_next) {
+ if (vbasedev_iter->bcontainer->ops != &vfio_iommufd_ops) {
+ continue;
+ }
+ if (devid == vbasedev_iter->devid) {
+ return vbasedev_iter;
+ }
+ }
+ return NULL;
+}
+
+static VFIOPCIDevice *
+iommufd_cdev_dep_get_realized_vpdev(struct vfio_pci_dependent_device *dep_dev,
+ VFIODevice *reset_dev)
+{
+ VFIODevice *vbasedev_tmp;
+
+ if (dep_dev->devid == reset_dev->devid ||
+ dep_dev->devid == VFIO_PCI_DEVID_OWNED) {
+ return NULL;
+ }
+
+ vbasedev_tmp = iommufd_cdev_pci_find_by_devid(dep_dev->devid);
+ if (!vbasedev_tmp || !vbasedev_tmp->dev->realized ||
+ vbasedev_tmp->type != VFIO_DEVICE_TYPE_PCI) {
+ return NULL;
+ }
+
+ return container_of(vbasedev_tmp, VFIOPCIDevice, vbasedev);
+}
+
+static int iommufd_cdev_pci_hot_reset(VFIODevice *vbasedev, bool single)
+{
+ VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+ struct vfio_pci_hot_reset_info *info = NULL;
+ struct vfio_pci_dependent_device *devices;
+ struct vfio_pci_hot_reset *reset;
+ int ret, i;
+ bool multi = false;
+
+ trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
+
+ if (!single) {
+ vfio_pci_pre_reset(vdev);
+ }
+ vdev->vbasedev.needs_reset = false;
+
+ ret = vfio_pci_get_pci_hot_reset_info(vdev, &info);
+
+ if (ret) {
+ goto out_single;
+ }
+
+ assert(info->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID);
+
+ devices = &info->devices[0];
+
+ if (!(info->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED)) {
+ if (!vdev->has_pm_reset) {
+ for (i = 0; i < info->count; i++) {
+ if (devices[i].devid == VFIO_PCI_DEVID_NOT_OWNED) {
+ error_report("vfio: Cannot reset device %s, "
+ "depends on device %04x:%02x:%02x.%x "
+ "which is not owned.",
+ vdev->vbasedev.name, devices[i].segment,
+ devices[i].bus, PCI_SLOT(devices[i].devfn),
+ PCI_FUNC(devices[i].devfn));
+ }
+ }
+ }
+ ret = -EPERM;
+ goto out_single;
+ }
+
+ trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
+
+ for (i = 0; i < info->count; i++) {
+ VFIOPCIDevice *tmp;
+
+ trace_iommufd_cdev_pci_hot_reset_dep_devices(devices[i].segment,
+ devices[i].bus,
+ PCI_SLOT(devices[i].devfn),
+ PCI_FUNC(devices[i].devfn),
+ devices[i].devid);
+
+ /*
+ * If a VFIO cdev device is resettable, all the dependent devices
+ * are either bound to same iommufd or within same iommu_groups as
+ * one of the iommufd bound devices.
+ */
+ assert(devices[i].devid != VFIO_PCI_DEVID_NOT_OWNED);
+
+ tmp = iommufd_cdev_dep_get_realized_vpdev(&devices[i], &vdev->vbasedev);
+ if (!tmp) {
+ continue;
+ }
+
+ if (single) {
+ ret = -EINVAL;
+ goto out_single;
+ }
+ vfio_pci_pre_reset(tmp);
+ tmp->vbasedev.needs_reset = false;
+ multi = true;
+ }
+
+ if (!single && !multi) {
+ ret = -EINVAL;
+ goto out_single;
+ }
+
+ /* Use zero length array for hot reset with iommufd backend */
+ reset = g_malloc0(sizeof(*reset));
+ reset->argsz = sizeof(*reset);
+
+ /* Bus reset! */
+ ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
+ g_free(reset);
+ if (ret) {
+ ret = -errno;
+ }
+
+ trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
+ ret ? strerror(errno) : "Success");
+
+ /* Re-enable INTx on affected devices */
+ for (i = 0; i < info->count; i++) {
+ VFIOPCIDevice *tmp;
+
+ tmp = iommufd_cdev_dep_get_realized_vpdev(&devices[i], &vdev->vbasedev);
+ if (!tmp) {
+ continue;
+ }
+ vfio_pci_post_reset(tmp);
+ }
+out_single:
+ if (!single) {
+ vfio_pci_post_reset(vdev);
+ }
+ g_free(info);
+
+ return ret;
+}
+
const VFIOIOMMUOps vfio_iommufd_ops = {
.dma_map = iommufd_cdev_map,
.dma_unmap = iommufd_cdev_unmap,
.attach_device = iommufd_cdev_attach,
.detach_device = iommufd_cdev_detach,
+ .pci_hot_reset = iommufd_cdev_pci_hot_reset,
};
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 3340c93af0..8fdde54456 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -174,3 +174,4 @@ iommufd_cdev_detach_ioas_hwpt(int iommufd, const char *name) " [iommufd=%d] Succ
iommufd_cdev_fail_attach_existing_container(const char *msg) " %s"
iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD container with ioasid=%d"
iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d"
+iommufd_cdev_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int dev_id) "\t%04x:%02x:%02x.%x devid %d"
--
2.39.3

View File

@ -0,0 +1,561 @@
From f018d0b686406256c2b5e823e4227316ee1394e9 Mon Sep 17 00:00:00 2001
From: Yi Liu <yi.l.liu@intel.com>
Date: Tue, 21 Nov 2023 16:44:03 +0800
Subject: [PATCH 024/101] vfio/iommufd: Implement the iommufd backend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [23/67] d11046654117a690542a1e2b48b9d1994f778b2d (eauger1/centos-qemu-kvm)
The iommufd backend is implemented based on the new /dev/iommu user API.
This backend obviously depends on CONFIG_IOMMUFD.
So far, the iommufd backend doesn't support dirty page sync yet.
Co-authored-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 5ee3dc7af7859e7b8aa34c10c21778101c15e812)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 6 +
hw/vfio/iommufd.c | 422 ++++++++++++++++++++++++++++++++++
hw/vfio/meson.build | 3 +
hw/vfio/trace-events | 10 +
include/hw/vfio/vfio-common.h | 11 +
5 files changed, 452 insertions(+)
create mode 100644 hw/vfio/iommufd.c
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 934f4f5446..6569732b7a 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -19,6 +19,7 @@
*/
#include "qemu/osdep.h"
+#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
#include <sys/ioctl.h>
#ifdef CONFIG_KVM
#include <linux/kvm.h>
@@ -1503,6 +1504,11 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev,
{
const VFIOIOMMUOps *ops = &vfio_legacy_ops;
+#ifdef CONFIG_IOMMUFD
+ if (vbasedev->iommufd) {
+ ops = &vfio_iommufd_ops;
+ }
+#endif
return ops->attach_device(name, vbasedev, as, errp);
}
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
new file mode 100644
index 0000000000..6d31aeac7b
--- /dev/null
+++ b/hw/vfio/iommufd.c
@@ -0,0 +1,422 @@
+/*
+ * iommufd container backend
+ *
+ * Copyright (C) 2023 Intel Corporation.
+ * Copyright Red Hat, Inc. 2023
+ *
+ * Authors: Yi Liu <yi.l.liu@intel.com>
+ * Eric Auger <eric.auger@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include <sys/ioctl.h>
+#include <linux/vfio.h>
+#include <linux/iommufd.h>
+
+#include "hw/vfio/vfio-common.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+#include "qapi/error.h"
+#include "sysemu/iommufd.h"
+#include "hw/qdev-core.h"
+#include "sysemu/reset.h"
+#include "qemu/cutils.h"
+#include "qemu/chardev_open.h"
+
+static int iommufd_cdev_map(VFIOContainerBase *bcontainer, hwaddr iova,
+ ram_addr_t size, void *vaddr, bool readonly)
+{
+ VFIOIOMMUFDContainer *container =
+ container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);
+
+ return iommufd_backend_map_dma(container->be,
+ container->ioas_id,
+ iova, size, vaddr, readonly);
+}
+
+static int iommufd_cdev_unmap(VFIOContainerBase *bcontainer,
+ hwaddr iova, ram_addr_t size,
+ IOMMUTLBEntry *iotlb)
+{
+ VFIOIOMMUFDContainer *container =
+ container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);
+
+ /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */
+ return iommufd_backend_unmap_dma(container->be,
+ container->ioas_id, iova, size);
+}
+
+static int iommufd_cdev_kvm_device_add(VFIODevice *vbasedev, Error **errp)
+{
+ return vfio_kvm_device_add_fd(vbasedev->fd, errp);
+}
+
+static void iommufd_cdev_kvm_device_del(VFIODevice *vbasedev)
+{
+ Error *err = NULL;
+
+ if (vfio_kvm_device_del_fd(vbasedev->fd, &err)) {
+ error_report_err(err);
+ }
+}
+
+static int iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp)
+{
+ IOMMUFDBackend *iommufd = vbasedev->iommufd;
+ struct vfio_device_bind_iommufd bind = {
+ .argsz = sizeof(bind),
+ .flags = 0,
+ };
+ int ret;
+
+ ret = iommufd_backend_connect(iommufd, errp);
+ if (ret) {
+ return ret;
+ }
+
+ /*
+ * Add device to kvm-vfio to be prepared for the tracking
+ * in KVM. Especially for some emulated devices, it requires
+ * to have kvm information in the device open.
+ */
+ ret = iommufd_cdev_kvm_device_add(vbasedev, errp);
+ if (ret) {
+ goto err_kvm_device_add;
+ }
+
+ /* Bind device to iommufd */
+ bind.iommufd = iommufd->fd;
+ ret = ioctl(vbasedev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind);
+ if (ret) {
+ error_setg_errno(errp, errno, "error bind device fd=%d to iommufd=%d",
+ vbasedev->fd, bind.iommufd);
+ goto err_bind;
+ }
+
+ vbasedev->devid = bind.out_devid;
+ trace_iommufd_cdev_connect_and_bind(bind.iommufd, vbasedev->name,
+ vbasedev->fd, vbasedev->devid);
+ return ret;
+err_bind:
+ iommufd_cdev_kvm_device_del(vbasedev);
+err_kvm_device_add:
+ iommufd_backend_disconnect(iommufd);
+ return ret;
+}
+
+static void iommufd_cdev_unbind_and_disconnect(VFIODevice *vbasedev)
+{
+ /* Unbind is automatically conducted when device fd is closed */
+ iommufd_cdev_kvm_device_del(vbasedev);
+ iommufd_backend_disconnect(vbasedev->iommufd);
+}
+
+static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp)
+{
+ long int ret = -ENOTTY;
+ char *path, *vfio_dev_path = NULL, *vfio_path = NULL;
+ DIR *dir = NULL;
+ struct dirent *dent;
+ gchar *contents;
+ struct stat st;
+ gsize length;
+ int major, minor;
+ dev_t vfio_devt;
+
+ path = g_strdup_printf("%s/vfio-dev", sysfs_path);
+ if (stat(path, &st) < 0) {
+ error_setg_errno(errp, errno, "no such host device");
+ goto out_free_path;
+ }
+
+ dir = opendir(path);
+ if (!dir) {
+ error_setg_errno(errp, errno, "couldn't open directory %s", path);
+ goto out_free_path;
+ }
+
+ while ((dent = readdir(dir))) {
+ if (!strncmp(dent->d_name, "vfio", 4)) {
+ vfio_dev_path = g_strdup_printf("%s/%s/dev", path, dent->d_name);
+ break;
+ }
+ }
+
+ if (!vfio_dev_path) {
+ error_setg(errp, "failed to find vfio-dev/vfioX/dev");
+ goto out_close_dir;
+ }
+
+ if (!g_file_get_contents(vfio_dev_path, &contents, &length, NULL)) {
+ error_setg(errp, "failed to load \"%s\"", vfio_dev_path);
+ goto out_free_dev_path;
+ }
+
+ if (sscanf(contents, "%d:%d", &major, &minor) != 2) {
+ error_setg(errp, "failed to get major:minor for \"%s\"", vfio_dev_path);
+ goto out_free_dev_path;
+ }
+ g_free(contents);
+ vfio_devt = makedev(major, minor);
+
+ vfio_path = g_strdup_printf("/dev/vfio/devices/%s", dent->d_name);
+ ret = open_cdev(vfio_path, vfio_devt);
+ if (ret < 0) {
+ error_setg(errp, "Failed to open %s", vfio_path);
+ }
+
+ trace_iommufd_cdev_getfd(vfio_path, ret);
+ g_free(vfio_path);
+
+out_free_dev_path:
+ g_free(vfio_dev_path);
+out_close_dir:
+ closedir(dir);
+out_free_path:
+ if (*errp) {
+ error_prepend(errp, VFIO_MSG_PREFIX, path);
+ }
+ g_free(path);
+
+ return ret;
+}
+
+static int iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id,
+ Error **errp)
+{
+ int ret, iommufd = vbasedev->iommufd->fd;
+ struct vfio_device_attach_iommufd_pt attach_data = {
+ .argsz = sizeof(attach_data),
+ .flags = 0,
+ .pt_id = id,
+ };
+
+ /* Attach device to an IOAS or hwpt within iommufd */
+ ret = ioctl(vbasedev->fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_data);
+ if (ret) {
+ error_setg_errno(errp, errno,
+ "[iommufd=%d] error attach %s (%d) to id=%d",
+ iommufd, vbasedev->name, vbasedev->fd, id);
+ } else {
+ trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name,
+ vbasedev->fd, id);
+ }
+ return ret;
+}
+
+static int iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp)
+{
+ int ret, iommufd = vbasedev->iommufd->fd;
+ struct vfio_device_detach_iommufd_pt detach_data = {
+ .argsz = sizeof(detach_data),
+ .flags = 0,
+ };
+
+ ret = ioctl(vbasedev->fd, VFIO_DEVICE_DETACH_IOMMUFD_PT, &detach_data);
+ if (ret) {
+ error_setg_errno(errp, errno, "detach %s failed", vbasedev->name);
+ } else {
+ trace_iommufd_cdev_detach_ioas_hwpt(iommufd, vbasedev->name);
+ }
+ return ret;
+}
+
+static int iommufd_cdev_attach_container(VFIODevice *vbasedev,
+ VFIOIOMMUFDContainer *container,
+ Error **errp)
+{
+ return iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp);
+}
+
+static void iommufd_cdev_detach_container(VFIODevice *vbasedev,
+ VFIOIOMMUFDContainer *container)
+{
+ Error *err = NULL;
+
+ if (iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) {
+ error_report_err(err);
+ }
+}
+
+static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container)
+{
+ VFIOContainerBase *bcontainer = &container->bcontainer;
+
+ if (!QLIST_EMPTY(&bcontainer->device_list)) {
+ return;
+ }
+ memory_listener_unregister(&bcontainer->listener);
+ vfio_container_destroy(bcontainer);
+ iommufd_backend_free_id(container->be, container->ioas_id);
+ g_free(container);
+}
+
+static int iommufd_cdev_ram_block_discard_disable(bool state)
+{
+ /*
+ * We support coordinated discarding of RAM via the RamDiscardManager.
+ */
+ return ram_block_uncoordinated_discard_disable(state);
+}
+
+static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
+ AddressSpace *as, Error **errp)
+{
+ VFIOContainerBase *bcontainer;
+ VFIOIOMMUFDContainer *container;
+ VFIOAddressSpace *space;
+ struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
+ int ret, devfd;
+ uint32_t ioas_id;
+ Error *err = NULL;
+
+ devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp);
+ if (devfd < 0) {
+ return devfd;
+ }
+ vbasedev->fd = devfd;
+
+ ret = iommufd_cdev_connect_and_bind(vbasedev, errp);
+ if (ret) {
+ goto err_connect_bind;
+ }
+
+ space = vfio_get_address_space(as);
+
+ /* try to attach to an existing container in this space */
+ QLIST_FOREACH(bcontainer, &space->containers, next) {
+ container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);
+ if (bcontainer->ops != &vfio_iommufd_ops ||
+ vbasedev->iommufd != container->be) {
+ continue;
+ }
+ if (iommufd_cdev_attach_container(vbasedev, container, &err)) {
+ const char *msg = error_get_pretty(err);
+
+ trace_iommufd_cdev_fail_attach_existing_container(msg);
+ error_free(err);
+ err = NULL;
+ } else {
+ ret = iommufd_cdev_ram_block_discard_disable(true);
+ if (ret) {
+ error_setg(errp,
+ "Cannot set discarding of RAM broken (%d)", ret);
+ goto err_discard_disable;
+ }
+ goto found_container;
+ }
+ }
+
+ /* Need to allocate a new dedicated container */
+ ret = iommufd_backend_alloc_ioas(vbasedev->iommufd, &ioas_id, errp);
+ if (ret < 0) {
+ goto err_alloc_ioas;
+ }
+
+ trace_iommufd_cdev_alloc_ioas(vbasedev->iommufd->fd, ioas_id);
+
+ container = g_malloc0(sizeof(*container));
+ container->be = vbasedev->iommufd;
+ container->ioas_id = ioas_id;
+
+ bcontainer = &container->bcontainer;
+ vfio_container_init(bcontainer, space, &vfio_iommufd_ops);
+ QLIST_INSERT_HEAD(&space->containers, bcontainer, next);
+
+ ret = iommufd_cdev_attach_container(vbasedev, container, errp);
+ if (ret) {
+ goto err_attach_container;
+ }
+
+ ret = iommufd_cdev_ram_block_discard_disable(true);
+ if (ret) {
+ goto err_discard_disable;
+ }
+
+ bcontainer->pgsizes = qemu_real_host_page_size();
+
+ bcontainer->listener = vfio_memory_listener;
+ memory_listener_register(&bcontainer->listener, bcontainer->space->as);
+
+ if (bcontainer->error) {
+ ret = -1;
+ error_propagate_prepend(errp, bcontainer->error,
+ "memory listener initialization failed: ");
+ goto err_listener_register;
+ }
+
+ bcontainer->initialized = true;
+
+found_container:
+ ret = ioctl(devfd, VFIO_DEVICE_GET_INFO, &dev_info);
+ if (ret) {
+ error_setg_errno(errp, errno, "error getting device info");
+ goto err_listener_register;
+ }
+
+ /*
+ * TODO: examine RAM_BLOCK_DISCARD stuff, should we do group level
+ * for discarding incompatibility check as well?
+ */
+ if (vbasedev->ram_block_discard_allowed) {
+ iommufd_cdev_ram_block_discard_disable(false);
+ }
+
+ vbasedev->group = 0;
+ vbasedev->num_irqs = dev_info.num_irqs;
+ vbasedev->num_regions = dev_info.num_regions;
+ vbasedev->flags = dev_info.flags;
+ vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
+ vbasedev->bcontainer = bcontainer;
+ QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
+ QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
+
+ trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs,
+ vbasedev->num_regions, vbasedev->flags);
+ return 0;
+
+err_listener_register:
+ iommufd_cdev_ram_block_discard_disable(false);
+err_discard_disable:
+ iommufd_cdev_detach_container(vbasedev, container);
+err_attach_container:
+ iommufd_cdev_container_destroy(container);
+err_alloc_ioas:
+ vfio_put_address_space(space);
+ iommufd_cdev_unbind_and_disconnect(vbasedev);
+err_connect_bind:
+ close(vbasedev->fd);
+ return ret;
+}
+
+static void iommufd_cdev_detach(VFIODevice *vbasedev)
+{
+ VFIOContainerBase *bcontainer = vbasedev->bcontainer;
+ VFIOAddressSpace *space = bcontainer->space;
+ VFIOIOMMUFDContainer *container = container_of(bcontainer,
+ VFIOIOMMUFDContainer,
+ bcontainer);
+ QLIST_REMOVE(vbasedev, global_next);
+ QLIST_REMOVE(vbasedev, container_next);
+ vbasedev->bcontainer = NULL;
+
+ if (!vbasedev->ram_block_discard_allowed) {
+ iommufd_cdev_ram_block_discard_disable(false);
+ }
+
+ iommufd_cdev_detach_container(vbasedev, container);
+ iommufd_cdev_container_destroy(container);
+ vfio_put_address_space(space);
+
+ iommufd_cdev_unbind_and_disconnect(vbasedev);
+ close(vbasedev->fd);
+}
+
+const VFIOIOMMUOps vfio_iommufd_ops = {
+ .dma_map = iommufd_cdev_map,
+ .dma_unmap = iommufd_cdev_unmap,
+ .attach_device = iommufd_cdev_attach,
+ .detach_device = iommufd_cdev_detach,
+};
diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
index eb6ce6229d..e5d98b6adc 100644
--- a/hw/vfio/meson.build
+++ b/hw/vfio/meson.build
@@ -7,6 +7,9 @@ vfio_ss.add(files(
'spapr.c',
'migration.c',
))
+vfio_ss.add(when: 'CONFIG_IOMMUFD', if_true: files(
+ 'iommufd.c',
+))
vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files(
'display.c',
'pci-quirks.c',
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 08a1f9dfa4..3340c93af0 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -164,3 +164,13 @@ vfio_state_pending_estimate(const char *name, uint64_t precopy, uint64_t postcop
vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s"
vfio_vmstate_change_prepare(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s"
+
+#iommufd.c
+
+iommufd_cdev_connect_and_bind(int iommufd, const char *name, int devfd, int devid) " [iommufd=%d] Successfully bound device %s (fd=%d): output devid=%d"
+iommufd_cdev_getfd(const char *dev, int devfd) " %s (fd=%d)"
+iommufd_cdev_attach_ioas_hwpt(int iommufd, const char *name, int devfd, int id) " [iommufd=%d] Successfully attached device %s (%d) to id=%d"
+iommufd_cdev_detach_ioas_hwpt(int iommufd, const char *name) " [iommufd=%d] Successfully detached %s"
+iommufd_cdev_fail_attach_existing_container(const char *msg) " %s"
+iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD container with ioasid=%d"
+iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d"
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 24ecc0e7ee..3dac5c167e 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -89,6 +89,14 @@ typedef struct VFIOHostDMAWindow {
QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next;
} VFIOHostDMAWindow;
+typedef struct IOMMUFDBackend IOMMUFDBackend;
+
+typedef struct VFIOIOMMUFDContainer {
+ VFIOContainerBase bcontainer;
+ IOMMUFDBackend *be;
+ uint32_t ioas_id;
+} VFIOIOMMUFDContainer;
+
typedef struct VFIODeviceOps VFIODeviceOps;
typedef struct VFIODevice {
@@ -116,6 +124,8 @@ typedef struct VFIODevice {
OnOffAuto pre_copy_dirty_page_tracking;
bool dirty_pages_supported;
bool dirty_tracking;
+ int devid;
+ IOMMUFDBackend *iommufd;
} VFIODevice;
struct VFIODeviceOps {
@@ -201,6 +211,7 @@ typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList;
extern VFIOGroupList vfio_group_list;
extern VFIODeviceList vfio_device_list;
extern const VFIOIOMMUOps vfio_legacy_ops;
+extern const VFIOIOMMUOps vfio_iommufd_ops;
extern const MemoryListener vfio_memory_listener;
extern int vfio_kvm_device_fd;
--
2.39.3

View File

@ -0,0 +1,155 @@
From f98defd6fe081bc44f5bd823d187d7d3b12832ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Tue, 19 Dec 2023 07:58:23 +0100
Subject: [PATCH 056/101] vfio/iommufd: Introduce a VFIOIOMMU iommufd QOM
interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [55/67] 789ecf74ace326b0df5d494fd558d7d0b6294a85 (eauger1/centos-qemu-kvm)
As previously done for the sPAPR and legacy IOMMU backends, convert
the VFIOIOMMUOps struct to a QOM interface. The set of of operations
for this backend can be referenced with a literal typename instead of
a C struct.
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Tested-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit ce5f6d49f5845c3b9955cc377a5223c3f8d7ba1e)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 2 +-
hw/vfio/iommufd.c | 35 ++++++++++++++++++++-------
include/hw/vfio/vfio-common.h | 1 -
include/hw/vfio/vfio-container-base.h | 2 +-
4 files changed, 28 insertions(+), 12 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 2329d0efc8..89ff1c7aed 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1508,7 +1508,7 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev,
#ifdef CONFIG_IOMMUFD
if (vbasedev->iommufd) {
- ops = &vfio_iommufd_ops;
+ ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
}
#endif
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 87a561c545..d4c586e842 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -319,6 +319,8 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
int ret, devfd;
uint32_t ioas_id;
Error *err = NULL;
+ const VFIOIOMMUClass *iommufd_vioc =
+ VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
if (vbasedev->fd < 0) {
devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp);
@@ -340,7 +342,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
/* try to attach to an existing container in this space */
QLIST_FOREACH(bcontainer, &space->containers, next) {
container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);
- if (bcontainer->ops != &vfio_iommufd_ops ||
+ if (bcontainer->ops != iommufd_vioc ||
vbasedev->iommufd != container->be) {
continue;
}
@@ -374,7 +376,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
container->ioas_id = ioas_id;
bcontainer = &container->bcontainer;
- vfio_container_init(bcontainer, space, &vfio_iommufd_ops);
+ vfio_container_init(bcontainer, space, iommufd_vioc);
QLIST_INSERT_HEAD(&space->containers, bcontainer, next);
ret = iommufd_cdev_attach_container(vbasedev, container, errp);
@@ -476,9 +478,11 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev)
static VFIODevice *iommufd_cdev_pci_find_by_devid(__u32 devid)
{
VFIODevice *vbasedev_iter;
+ const VFIOIOMMUClass *iommufd_vioc =
+ VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
QLIST_FOREACH(vbasedev_iter, &vfio_device_list, global_next) {
- if (vbasedev_iter->bcontainer->ops != &vfio_iommufd_ops) {
+ if (vbasedev_iter->bcontainer->ops != iommufd_vioc) {
continue;
}
if (devid == vbasedev_iter->devid) {
@@ -621,10 +625,23 @@ out_single:
return ret;
}
-const VFIOIOMMUOps vfio_iommufd_ops = {
- .dma_map = iommufd_cdev_map,
- .dma_unmap = iommufd_cdev_unmap,
- .attach_device = iommufd_cdev_attach,
- .detach_device = iommufd_cdev_detach,
- .pci_hot_reset = iommufd_cdev_pci_hot_reset,
+static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data)
+{
+ VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
+
+ vioc->dma_map = iommufd_cdev_map;
+ vioc->dma_unmap = iommufd_cdev_unmap;
+ vioc->attach_device = iommufd_cdev_attach;
+ vioc->detach_device = iommufd_cdev_detach;
+ vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset;
};
+
+static const TypeInfo types[] = {
+ {
+ .name = TYPE_VFIO_IOMMU_IOMMUFD,
+ .parent = TYPE_VFIO_IOMMU,
+ .class_init = vfio_iommu_iommufd_class_init,
+ },
+};
+
+DEFINE_TYPES(types)
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 14c497b6b0..9b7ef7d02b 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -210,7 +210,6 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList;
extern VFIOGroupList vfio_group_list;
extern VFIODeviceList vfio_device_list;
-extern const VFIOIOMMUOps vfio_iommufd_ops;
extern const MemoryListener vfio_memory_listener;
extern int vfio_kvm_device_fd;
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 9e21d7811f..b2813b0c11 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -17,7 +17,6 @@
typedef struct VFIODevice VFIODevice;
typedef struct VFIOIOMMUClass VFIOIOMMUClass;
-#define VFIOIOMMUOps VFIOIOMMUClass /* To remove */
typedef struct {
unsigned long *bitmap;
@@ -96,6 +95,7 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer);
#define TYPE_VFIO_IOMMU "vfio-iommu"
#define TYPE_VFIO_IOMMU_LEGACY TYPE_VFIO_IOMMU "-legacy"
#define TYPE_VFIO_IOMMU_SPAPR TYPE_VFIO_IOMMU "-spapr"
+#define TYPE_VFIO_IOMMU_IOMMUFD TYPE_VFIO_IOMMU "-iommufd"
/*
* VFIOContainerBase is not an abstract QOM object because it felt
--
2.39.3

View File

@ -0,0 +1,71 @@
From 5a49c5bb690d55fc88b6fb12f059ae932de0a716 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:04 +0800
Subject: [PATCH 025/101] vfio/iommufd: Relax assert check for iommufd backend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [24/67] 2c9e41e9ca0b67ebf807d1643a98866a0cb75768 (eauger1/centos-qemu-kvm)
Currently iommufd doesn't support dirty page sync yet,
but it will not block us doing live migration if VFIO
migration is force enabled.
So in this case we allow set_dirty_page_tracking to be NULL.
Note we don't need same change for query_dirty_bitmap because
when dirty page sync isn't supported, query_dirty_bitmap will
never be called.
Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 36e84d0c17102fa1c887d8c650a13ec08fca0ec0)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/container-base.c | 4 ++++
hw/vfio/container.c | 4 ----
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 71f7274973..eee2dcfe76 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -55,6 +55,10 @@ void vfio_container_del_section_window(VFIOContainerBase *bcontainer,
int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
bool start)
{
+ if (!bcontainer->dirty_pages_supported) {
+ return 0;
+ }
+
g_assert(bcontainer->ops->set_dirty_page_tracking);
return bcontainer->ops->set_dirty_page_tracking(bcontainer, start);
}
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 6bacf38222..ed2d721b2b 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -216,10 +216,6 @@ static int vfio_legacy_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
.argsz = sizeof(dirty),
};
- if (!bcontainer->dirty_pages_supported) {
- return 0;
- }
-
if (start) {
dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
} else {
--
2.39.3

View File

@ -0,0 +1,55 @@
From 5549bf1b2e07213c23e280a43ab2ab67d5b7304a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Tue, 19 Dec 2023 07:58:25 +0100
Subject: [PATCH 058/101] vfio/iommufd: Remove CONFIG_IOMMUFD usage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [57/67] 3a6a45d379241d9412e0b8bcfeb9be0b4add59a5 (eauger1/centos-qemu-kvm)
Availability of the IOMMUFD backend can now be fully determined at
runtime and the ifdef check was a build time protection (for PPC not
supporting it mostly).
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Tested-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit c1139fa4feba8c320e4bd0a4e34af55caa5ffbb9)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 89ff1c7aed..0d4d8b8416 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -19,7 +19,6 @@
*/
#include "qemu/osdep.h"
-#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
#include <sys/ioctl.h>
#ifdef CONFIG_KVM
#include <linux/kvm.h>
@@ -1506,11 +1505,9 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev,
const VFIOIOMMUClass *ops =
VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
-#ifdef CONFIG_IOMMUFD
if (vbasedev->iommufd) {
ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD));
}
-#endif
assert(ops);
--
2.39.3

View File

@ -0,0 +1,56 @@
From 6b36dc2a305af856af03aad2e315eea96a349153 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Thu, 21 Dec 2023 09:09:57 +0100
Subject: [PATCH 061/101] vfio/iommufd: Remove the use of stat() to check file
existence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [60/67] 485770e45c1a6399780939bfb8b01b615d9213c6 (eauger1/centos-qemu-kvm)
Using stat() before opening a file or a directory can lead to a
time-of-check to time-of-use (TOCTOU) filesystem race, which is
reported by coverity as a Security best practices violations. The
sequence could be replaced by open and fdopendir but it doesn't add
much in this case. Simply use opendir to avoid the race.
Fixes: CID 1531551
Signed-off-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Zhenzhong Duan <Zhenzhong.duan@intel.com>
(cherry picked from commit 6ba254801f6bc7f3ef68a6414f1b107237c7eb26)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/iommufd.c | 6 ------
1 file changed, 6 deletions(-)
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index d4c586e842..9bfddc1360 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -121,17 +121,11 @@ static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp)
DIR *dir = NULL;
struct dirent *dent;
gchar *contents;
- struct stat st;
gsize length;
int major, minor;
dev_t vfio_devt;
path = g_strdup_printf("%s/vfio-dev", sysfs_path);
- if (stat(path, &st) < 0) {
- error_setg_errno(errp, errno, "no such host device");
- goto out_free_path;
- }
-
dir = opendir(path);
if (!dir) {
error_setg_errno(errp, errno, "couldn't open directory %s", path);
--
2.39.3

View File

@ -0,0 +1,115 @@
From 0c0435e7210b99a6bf7b8f8205f7af8277b7525b Mon Sep 17 00:00:00 2001
From: Avihai Horon <avihaih@nvidia.com>
Date: Sun, 31 Dec 2023 12:48:18 +0200
Subject: [PATCH 063/101] vfio/migration: Add helper function to set state or
reset device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [62/67] 1a63eea289561a05a6a8527c2a9da0289a7836d9 (eauger1/centos-qemu-kvm)
There are several places where failure in setting the device state leads
to a device reset, which is done by setting ERROR as the recover state.
Add a helper function that sets the device state and resets the device
in case of failure. This will make the code cleaner and remove duplicate
comments.
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
(cherry picked from commit c817e5a377a334241eed149e35760aca58bdeb34)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/migration.c | 41 +++++++++++++++++------------------------
1 file changed, 17 insertions(+), 24 deletions(-)
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 28d422b39f..70e6b1a709 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -163,6 +163,19 @@ reset_device:
return ret;
}
+/*
+ * Some device state transitions require resetting the device if they fail.
+ * This function sets the device in new_state and resets the device if that
+ * fails. Reset is done by using ERROR as the recover state.
+ */
+static int
+vfio_migration_set_state_or_reset(VFIODevice *vbasedev,
+ enum vfio_device_mig_state new_state)
+{
+ return vfio_migration_set_state(vbasedev, new_state,
+ VFIO_DEVICE_STATE_ERROR);
+}
+
static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
uint64_t data_size)
{
@@ -422,12 +435,7 @@ static void vfio_save_cleanup(void *opaque)
* after migration has completed, so it won't increase downtime.
*/
if (migration->device_state == VFIO_DEVICE_STATE_STOP_COPY) {
- /*
- * If setting the device in STOP state fails, the device should be
- * reset. To do so, use ERROR state as a recover state.
- */
- vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP,
- VFIO_DEVICE_STATE_ERROR);
+ vfio_migration_set_state_or_reset(vbasedev, VFIO_DEVICE_STATE_STOP);
}
g_free(migration->data_buffer);
@@ -699,12 +707,7 @@ static void vfio_vmstate_change_prepare(void *opaque, bool running,
VFIO_DEVICE_STATE_PRE_COPY_P2P :
VFIO_DEVICE_STATE_RUNNING_P2P;
- /*
- * If setting the device in new_state fails, the device should be reset.
- * To do so, use ERROR state as a recover state.
- */
- ret = vfio_migration_set_state(vbasedev, new_state,
- VFIO_DEVICE_STATE_ERROR);
+ ret = vfio_migration_set_state_or_reset(vbasedev, new_state);
if (ret) {
/*
* Migration should be aborted in this case, but vm_state_notify()
@@ -736,12 +739,7 @@ static void vfio_vmstate_change(void *opaque, bool running, RunState state)
VFIO_DEVICE_STATE_STOP;
}
- /*
- * If setting the device in new_state fails, the device should be reset.
- * To do so, use ERROR state as a recover state.
- */
- ret = vfio_migration_set_state(vbasedev, new_state,
- VFIO_DEVICE_STATE_ERROR);
+ ret = vfio_migration_set_state_or_reset(vbasedev, new_state);
if (ret) {
/*
* Migration should be aborted in this case, but vm_state_notify()
@@ -770,12 +768,7 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data)
case MIGRATION_STATUS_CANCELLING:
case MIGRATION_STATUS_CANCELLED:
case MIGRATION_STATUS_FAILED:
- /*
- * If setting the device in RUNNING state fails, the device should
- * be reset. To do so, use ERROR state as a recover state.
- */
- vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING,
- VFIO_DEVICE_STATE_ERROR);
+ vfio_migration_set_state_or_reset(vbasedev, VFIO_DEVICE_STATE_RUNNING);
}
}
--
2.39.3

View File

@ -0,0 +1,81 @@
From 7788fdc2375e01ead0c8a705c3b3d7467dd93d67 Mon Sep 17 00:00:00 2001
From: Eric Auger <eric.auger@redhat.com>
Date: Tue, 21 Nov 2023 16:44:09 +0800
Subject: [PATCH 030/101] vfio/pci: Allow the selection of a given iommu
backend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [29/67] 363c62607a11093ea0062489e11a708117d8ffb9 (eauger1/centos-qemu-kvm)
Now we support two types of iommu backends, let's add the capability
to select one of them. This depends on whether an iommufd object has
been linked with the vfio-pci device:
If the user wants to use the legacy backend, it shall not
link the vfio-pci device with any iommufd object:
-device vfio-pci,host=0000:02:00.0
This is called the legacy mode/backend.
If the user wants to use the iommufd backend (/dev/iommu) it
shall pass an iommufd object id in the vfio-pci device options:
-object iommufd,id=iommufd0
-device vfio-pci,host=0000:02:00.0,iommufd=iommufd0
Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit ee42b261b0a2e465ae003ddcaf1caf117c201f74)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/pci.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 83b2561908..39e6a6678e 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -19,6 +19,7 @@
*/
#include "qemu/osdep.h"
+#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
#include <linux/vfio.h>
#include <sys/ioctl.h>
@@ -42,6 +43,7 @@
#include "qapi/error.h"
#include "migration/blocker.h"
#include "migration/qemu-file.h"
+#include "sysemu/iommufd.h"
#define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
@@ -3415,6 +3417,10 @@ static Property vfio_pci_dev_properties[] = {
* DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
* DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name),
*/
+#ifdef CONFIG_IOMMUFD
+ DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
+ TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
+#endif
DEFINE_PROP_END_OF_LIST(),
};
--
2.39.3

View File

@ -0,0 +1,139 @@
From fe5ecedd452754eeb238b23eb0544ed3c5086157 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:06 +0800
Subject: [PATCH 027/101] vfio/pci: Extract out a helper
vfio_pci_get_pci_hot_reset_info
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [26/67] 730b7f1496f4f21310fa13c79cb87f8d5e2ad2a8 (eauger1/centos-qemu-kvm)
This helper will be used by both legacy and iommufd backends.
No functional changes intended.
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 4d36ec23a75eb387492f4d68ff1b8eeee5d68142)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/pci.c | 54 +++++++++++++++++++++++++++++++++++----------------
hw/vfio/pci.h | 3 +++
2 files changed, 40 insertions(+), 17 deletions(-)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index ec98080f28..b482e5479f 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2448,22 +2448,13 @@ static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
return (strcmp(tmp, name) == 0);
}
-static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
+int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev,
+ struct vfio_pci_hot_reset_info **info_p)
{
- VFIOGroup *group;
struct vfio_pci_hot_reset_info *info;
- struct vfio_pci_dependent_device *devices;
- struct vfio_pci_hot_reset *reset;
- int32_t *fds;
- int ret, i, count;
- bool multi = false;
+ int ret, count;
- trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
-
- if (!single) {
- vfio_pci_pre_reset(vdev);
- }
- vdev->vbasedev.needs_reset = false;
+ assert(info_p && !*info_p);
info = g_malloc0(sizeof(*info));
info->argsz = sizeof(*info);
@@ -2471,24 +2462,53 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
if (ret && errno != ENOSPC) {
ret = -errno;
+ g_free(info);
if (!vdev->has_pm_reset) {
error_report("vfio: Cannot reset device %s, "
"no available reset mechanism.", vdev->vbasedev.name);
}
- goto out_single;
+ return ret;
}
count = info->count;
- info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
- info->argsz = sizeof(*info) + (count * sizeof(*devices));
- devices = &info->devices[0];
+ info = g_realloc(info, sizeof(*info) + (count * sizeof(info->devices[0])));
+ info->argsz = sizeof(*info) + (count * sizeof(info->devices[0]));
ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
if (ret) {
ret = -errno;
+ g_free(info);
error_report("vfio: hot reset info failed: %m");
+ return ret;
+ }
+
+ *info_p = info;
+ return 0;
+}
+
+static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
+{
+ VFIOGroup *group;
+ struct vfio_pci_hot_reset_info *info = NULL;
+ struct vfio_pci_dependent_device *devices;
+ struct vfio_pci_hot_reset *reset;
+ int32_t *fds;
+ int ret, i, count;
+ bool multi = false;
+
+ trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
+
+ if (!single) {
+ vfio_pci_pre_reset(vdev);
+ }
+ vdev->vbasedev.needs_reset = false;
+
+ ret = vfio_pci_get_pci_hot_reset_info(vdev, &info);
+
+ if (ret) {
goto out_single;
}
+ devices = &info->devices[0];
trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index eb74d9de2d..3568a6135d 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -219,6 +219,9 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr);
extern const PropertyInfo qdev_prop_nv_gpudirect_clique;
+int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev,
+ struct vfio_pci_hot_reset_info **info_p);
+
int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp);
int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
--
2.39.3

View File

@ -0,0 +1,466 @@
From acc3e5306e184567006bc45e7f36f2473e75d08a Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:07 +0800
Subject: [PATCH 028/101] vfio/pci: Introduce a vfio pci hot reset interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [27/67] 192088dbf2cf88663acd2416f69b7eeb175b2525 (eauger1/centos-qemu-kvm)
Legacy vfio pci and iommufd cdev have different process to hot reset
vfio device, expand current code to abstract out pci_hot_reset callback
for legacy vfio, this same interface will also be used by iommufd
cdev vfio device.
Rename vfio_pci_hot_reset to vfio_legacy_pci_hot_reset and move it
into container.c.
vfio_pci_[pre/post]_reset and vfio_pci_host_match are exported so
they could be called in legacy and iommufd pci_hot_reset callback.
Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit c328e7e8ad1c969dbcbe90ee76afcd3cfec5e945)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/container.c | 170 ++++++++++++++++++++++++++
hw/vfio/pci.c | 168 +------------------------
hw/vfio/pci.h | 3 +
include/hw/vfio/vfio-container-base.h | 3 +
4 files changed, 182 insertions(+), 162 deletions(-)
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index ed2d721b2b..1dbf9b9a17 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -33,6 +33,7 @@
#include "trace.h"
#include "qapi/error.h"
#include "migration/migration.h"
+#include "pci.h"
VFIOGroupList vfio_group_list =
QLIST_HEAD_INITIALIZER(vfio_group_list);
@@ -922,6 +923,174 @@ static void vfio_legacy_detach_device(VFIODevice *vbasedev)
vfio_put_group(group);
}
+static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single)
+{
+ VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+ VFIOGroup *group;
+ struct vfio_pci_hot_reset_info *info = NULL;
+ struct vfio_pci_dependent_device *devices;
+ struct vfio_pci_hot_reset *reset;
+ int32_t *fds;
+ int ret, i, count;
+ bool multi = false;
+
+ trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
+
+ if (!single) {
+ vfio_pci_pre_reset(vdev);
+ }
+ vdev->vbasedev.needs_reset = false;
+
+ ret = vfio_pci_get_pci_hot_reset_info(vdev, &info);
+
+ if (ret) {
+ goto out_single;
+ }
+ devices = &info->devices[0];
+
+ trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
+
+ /* Verify that we have all the groups required */
+ for (i = 0; i < info->count; i++) {
+ PCIHostDeviceAddress host;
+ VFIOPCIDevice *tmp;
+ VFIODevice *vbasedev_iter;
+
+ host.domain = devices[i].segment;
+ host.bus = devices[i].bus;
+ host.slot = PCI_SLOT(devices[i].devfn);
+ host.function = PCI_FUNC(devices[i].devfn);
+
+ trace_vfio_pci_hot_reset_dep_devices(host.domain,
+ host.bus, host.slot, host.function, devices[i].group_id);
+
+ if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
+ continue;
+ }
+
+ QLIST_FOREACH(group, &vfio_group_list, next) {
+ if (group->groupid == devices[i].group_id) {
+ break;
+ }
+ }
+
+ if (!group) {
+ if (!vdev->has_pm_reset) {
+ error_report("vfio: Cannot reset device %s, "
+ "depends on group %d which is not owned.",
+ vdev->vbasedev.name, devices[i].group_id);
+ }
+ ret = -EPERM;
+ goto out;
+ }
+
+ /* Prep dependent devices for reset and clear our marker. */
+ QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
+ if (!vbasedev_iter->dev->realized ||
+ vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
+ continue;
+ }
+ tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
+ if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
+ if (single) {
+ ret = -EINVAL;
+ goto out_single;
+ }
+ vfio_pci_pre_reset(tmp);
+ tmp->vbasedev.needs_reset = false;
+ multi = true;
+ break;
+ }
+ }
+ }
+
+ if (!single && !multi) {
+ ret = -EINVAL;
+ goto out_single;
+ }
+
+ /* Determine how many group fds need to be passed */
+ count = 0;
+ QLIST_FOREACH(group, &vfio_group_list, next) {
+ for (i = 0; i < info->count; i++) {
+ if (group->groupid == devices[i].group_id) {
+ count++;
+ break;
+ }
+ }
+ }
+
+ reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
+ reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
+ fds = &reset->group_fds[0];
+
+ /* Fill in group fds */
+ QLIST_FOREACH(group, &vfio_group_list, next) {
+ for (i = 0; i < info->count; i++) {
+ if (group->groupid == devices[i].group_id) {
+ fds[reset->count++] = group->fd;
+ break;
+ }
+ }
+ }
+
+ /* Bus reset! */
+ ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
+ g_free(reset);
+ if (ret) {
+ ret = -errno;
+ }
+
+ trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
+ ret ? strerror(errno) : "Success");
+
+out:
+ /* Re-enable INTx on affected devices */
+ for (i = 0; i < info->count; i++) {
+ PCIHostDeviceAddress host;
+ VFIOPCIDevice *tmp;
+ VFIODevice *vbasedev_iter;
+
+ host.domain = devices[i].segment;
+ host.bus = devices[i].bus;
+ host.slot = PCI_SLOT(devices[i].devfn);
+ host.function = PCI_FUNC(devices[i].devfn);
+
+ if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
+ continue;
+ }
+
+ QLIST_FOREACH(group, &vfio_group_list, next) {
+ if (group->groupid == devices[i].group_id) {
+ break;
+ }
+ }
+
+ if (!group) {
+ break;
+ }
+
+ QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
+ if (!vbasedev_iter->dev->realized ||
+ vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
+ continue;
+ }
+ tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
+ if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
+ vfio_pci_post_reset(tmp);
+ break;
+ }
+ }
+ }
+out_single:
+ if (!single) {
+ vfio_pci_post_reset(vdev);
+ }
+ g_free(info);
+
+ return ret;
+}
+
const VFIOIOMMUOps vfio_legacy_ops = {
.dma_map = vfio_legacy_dma_map,
.dma_unmap = vfio_legacy_dma_unmap,
@@ -929,4 +1098,5 @@ const VFIOIOMMUOps vfio_legacy_ops = {
.detach_device = vfio_legacy_detach_device,
.set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking,
.query_dirty_bitmap = vfio_legacy_query_dirty_bitmap,
+ .pci_hot_reset = vfio_legacy_pci_hot_reset,
};
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index b482e5479f..83b2561908 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2377,7 +2377,7 @@ static int vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
return 0;
}
-static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
+void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
{
PCIDevice *pdev = &vdev->pdev;
uint16_t cmd;
@@ -2414,7 +2414,7 @@ static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
}
-static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
+void vfio_pci_post_reset(VFIOPCIDevice *vdev)
{
Error *err = NULL;
int nr;
@@ -2438,7 +2438,7 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
vfio_quirk_reset(vdev);
}
-static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
+bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
{
char tmp[13];
@@ -2488,166 +2488,10 @@ int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev,
static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
{
- VFIOGroup *group;
- struct vfio_pci_hot_reset_info *info = NULL;
- struct vfio_pci_dependent_device *devices;
- struct vfio_pci_hot_reset *reset;
- int32_t *fds;
- int ret, i, count;
- bool multi = false;
-
- trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
-
- if (!single) {
- vfio_pci_pre_reset(vdev);
- }
- vdev->vbasedev.needs_reset = false;
-
- ret = vfio_pci_get_pci_hot_reset_info(vdev, &info);
-
- if (ret) {
- goto out_single;
- }
- devices = &info->devices[0];
-
- trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
-
- /* Verify that we have all the groups required */
- for (i = 0; i < info->count; i++) {
- PCIHostDeviceAddress host;
- VFIOPCIDevice *tmp;
- VFIODevice *vbasedev_iter;
-
- host.domain = devices[i].segment;
- host.bus = devices[i].bus;
- host.slot = PCI_SLOT(devices[i].devfn);
- host.function = PCI_FUNC(devices[i].devfn);
-
- trace_vfio_pci_hot_reset_dep_devices(host.domain,
- host.bus, host.slot, host.function, devices[i].group_id);
-
- if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
- continue;
- }
-
- QLIST_FOREACH(group, &vfio_group_list, next) {
- if (group->groupid == devices[i].group_id) {
- break;
- }
- }
-
- if (!group) {
- if (!vdev->has_pm_reset) {
- error_report("vfio: Cannot reset device %s, "
- "depends on group %d which is not owned.",
- vdev->vbasedev.name, devices[i].group_id);
- }
- ret = -EPERM;
- goto out;
- }
-
- /* Prep dependent devices for reset and clear our marker. */
- QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
- if (!vbasedev_iter->dev->realized ||
- vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
- continue;
- }
- tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
- if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
- if (single) {
- ret = -EINVAL;
- goto out_single;
- }
- vfio_pci_pre_reset(tmp);
- tmp->vbasedev.needs_reset = false;
- multi = true;
- break;
- }
- }
- }
-
- if (!single && !multi) {
- ret = -EINVAL;
- goto out_single;
- }
-
- /* Determine how many group fds need to be passed */
- count = 0;
- QLIST_FOREACH(group, &vfio_group_list, next) {
- for (i = 0; i < info->count; i++) {
- if (group->groupid == devices[i].group_id) {
- count++;
- break;
- }
- }
- }
-
- reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
- reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
- fds = &reset->group_fds[0];
-
- /* Fill in group fds */
- QLIST_FOREACH(group, &vfio_group_list, next) {
- for (i = 0; i < info->count; i++) {
- if (group->groupid == devices[i].group_id) {
- fds[reset->count++] = group->fd;
- break;
- }
- }
- }
-
- /* Bus reset! */
- ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
- g_free(reset);
-
- trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
- ret ? strerror(errno) : "Success");
-
-out:
- /* Re-enable INTx on affected devices */
- for (i = 0; i < info->count; i++) {
- PCIHostDeviceAddress host;
- VFIOPCIDevice *tmp;
- VFIODevice *vbasedev_iter;
-
- host.domain = devices[i].segment;
- host.bus = devices[i].bus;
- host.slot = PCI_SLOT(devices[i].devfn);
- host.function = PCI_FUNC(devices[i].devfn);
-
- if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
- continue;
- }
-
- QLIST_FOREACH(group, &vfio_group_list, next) {
- if (group->groupid == devices[i].group_id) {
- break;
- }
- }
-
- if (!group) {
- break;
- }
-
- QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
- if (!vbasedev_iter->dev->realized ||
- vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
- continue;
- }
- tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
- if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
- vfio_pci_post_reset(tmp);
- break;
- }
- }
- }
-out_single:
- if (!single) {
- vfio_pci_post_reset(vdev);
- }
- g_free(info);
+ VFIODevice *vbasedev = &vdev->vbasedev;
+ const VFIOIOMMUOps *ops = vbasedev->bcontainer->ops;
- return ret;
+ return ops->pci_hot_reset(vbasedev, single);
}
/*
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 3568a6135d..b7de39c010 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -219,6 +219,9 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr);
extern const PropertyInfo qdev_prop_nv_gpudirect_clique;
+void vfio_pci_pre_reset(VFIOPCIDevice *vdev);
+void vfio_pci_post_reset(VFIOPCIDevice *vdev);
+bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name);
int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev,
struct vfio_pci_hot_reset_info **info_p);
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 4b6f017c6f..45bb19c767 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -106,6 +106,9 @@ struct VFIOIOMMUOps {
int (*set_dirty_page_tracking)(VFIOContainerBase *bcontainer, bool start);
int (*query_dirty_bitmap)(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap,
hwaddr iova, hwaddr size);
+ /* PCI specific */
+ int (*pci_hot_reset)(VFIODevice *vbasedev, bool single);
+
/* SPAPR specific */
int (*add_window)(VFIOContainerBase *bcontainer,
MemoryRegionSection *section,
--
2.39.3

View File

@ -0,0 +1,237 @@
From 965a44793806fef2094906947bd3b428638bf89a Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:10 +0800
Subject: [PATCH 031/101] vfio/pci: Make vfio cdev pre-openable by passing a
file handle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [30/67] a14b824b700e8fb36633cd159bcc422d992a316f (eauger1/centos-qemu-kvm)
Conflicts: contextual conflict in hw/vfio/pci.c due to
RHEL-only f73562144e492 vfio: cap number of devices that can be assigned
This gives management tools like libvirt a chance to open the vfio
cdev with privilege and pass FD to qemu. This way qemu never needs
to have privilege to open a VFIO or iommu cdev node.
Together with the earlier support of pre-opening /dev/iommu device,
now we have full support of passing a vfio device to unprivileged
qemu by management tool. This mode is no more considered for the
legacy backend. So let's remove the "TODO" comment.
Add helper functions vfio_device_set_fd() and vfio_device_get_name()
to set fd and get device name, they will also be used by other vfio
devices.
There is no easy way to check if a device is mdev with FD passing,
so fail the x-balloon-allowed check unconditionally in this case.
There is also no easy way to get BDF as name with FD passing, so
we fake a name by VFIO_FD[fd].
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit da3e04b26fd8d15b344944504d5ffa9c5f20b54b)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/helpers.c | 43 +++++++++++++++++++++++++++++++++++
hw/vfio/iommufd.c | 12 ++++++----
hw/vfio/pci.c | 28 +++++++++++++----------
include/hw/vfio/vfio-common.h | 4 ++++
4 files changed, 71 insertions(+), 16 deletions(-)
diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c
index 168847e7c5..3592c3d54e 100644
--- a/hw/vfio/helpers.c
+++ b/hw/vfio/helpers.c
@@ -27,6 +27,7 @@
#include "trace.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
+#include "monitor/monitor.h"
/*
* Common VFIO interrupt disable
@@ -609,3 +610,45 @@ bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
return ret;
}
+
+int vfio_device_get_name(VFIODevice *vbasedev, Error **errp)
+{
+ struct stat st;
+
+ if (vbasedev->fd < 0) {
+ if (stat(vbasedev->sysfsdev, &st) < 0) {
+ error_setg_errno(errp, errno, "no such host device");
+ error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev);
+ return -errno;
+ }
+ /* User may specify a name, e.g: VFIO platform device */
+ if (!vbasedev->name) {
+ vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
+ }
+ } else {
+ if (!vbasedev->iommufd) {
+ error_setg(errp, "Use FD passing only with iommufd backend");
+ return -EINVAL;
+ }
+ /*
+ * Give a name with fd so any function printing out vbasedev->name
+ * will not break.
+ */
+ if (!vbasedev->name) {
+ vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd);
+ }
+ }
+
+ return 0;
+}
+
+void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp)
+{
+ int fd = monitor_fd_param(monitor_cur(), str, errp);
+
+ if (fd < 0) {
+ error_prepend(errp, "Could not parse remote object fd %s:", str);
+ return;
+ }
+ vbasedev->fd = fd;
+}
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 6e53e013ef..5accd26484 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -320,11 +320,15 @@ static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev,
uint32_t ioas_id;
Error *err = NULL;
- devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp);
- if (devfd < 0) {
- return devfd;
+ if (vbasedev->fd < 0) {
+ devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp);
+ if (devfd < 0) {
+ return devfd;
+ }
+ vbasedev->fd = devfd;
+ } else {
+ devfd = vbasedev->fd;
}
- vbasedev->fd = devfd;
ret = iommufd_cdev_connect_and_bind(vbasedev, errp);
if (ret) {
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 39e6a6678e..3412a63bb1 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2949,7 +2949,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
VFIOGroup *group;
char *tmp, *subsys;
Error *err = NULL;
- struct stat st;
int ret, i = 0;
bool is_mdev;
char uuid[UUID_STR_LEN];
@@ -2976,11 +2975,14 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
return;
}
- if (!vbasedev->sysfsdev) {
+ if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
if (!(~vdev->host.domain || ~vdev->host.bus ||
~vdev->host.slot || ~vdev->host.function)) {
error_setg(errp, "No provided host device");
error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F "
+#ifdef CONFIG_IOMMUFD
+ "or -device vfio-pci,fd=DEVICE_FD "
+#endif
"or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
return;
}
@@ -2990,13 +2992,9 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
vdev->host.slot, vdev->host.function);
}
- if (stat(vbasedev->sysfsdev, &st) < 0) {
- error_setg_errno(errp, errno, "no such host device");
- error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev);
+ if (vfio_device_get_name(vbasedev, errp) < 0) {
return;
}
-
- vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
vbasedev->ops = &vfio_pci_ops;
vbasedev->type = VFIO_DEVICE_TYPE_PCI;
vbasedev->dev = DEVICE(vdev);
@@ -3356,6 +3354,7 @@ static void vfio_instance_init(Object *obj)
vdev->host.bus = ~0U;
vdev->host.slot = ~0U;
vdev->host.function = ~0U;
+ vdev->vbasedev.fd = -1;
vdev->nv_gpudirect_clique = 0xFF;
@@ -3412,11 +3411,6 @@ static Property vfio_pci_dev_properties[] = {
qdev_prop_nv_gpudirect_clique, uint8_t),
DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
OFF_AUTOPCIBAR_OFF),
- /*
- * TODO - support passed fds... is this necessary?
- * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
- * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name),
- */
#ifdef CONFIG_IOMMUFD
DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
@@ -3424,6 +3418,13 @@ static Property vfio_pci_dev_properties[] = {
DEFINE_PROP_END_OF_LIST(),
};
+#ifdef CONFIG_IOMMUFD
+static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp)
+{
+ vfio_device_set_fd(&VFIO_PCI(obj)->vbasedev, str, errp);
+}
+#endif
+
static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
@@ -3431,6 +3432,9 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
dc->reset = vfio_pci_reset;
device_class_set_props(dc, vfio_pci_dev_properties);
+#ifdef CONFIG_IOMMUFD
+ object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd);
+#endif
dc->desc = "VFIO-based PCI device assignment";
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
pdc->realize = vfio_realize;
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 3dac5c167e..697bf24a35 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -251,4 +251,8 @@ int vfio_devices_query_dirty_bitmap(VFIOContainerBase *bcontainer,
hwaddr size);
int vfio_get_dirty_bitmap(VFIOContainerBase *bcontainer, uint64_t iova,
uint64_t size, ram_addr_t ram_addr);
+
+/* Returns 0 on success, or a negative errno. */
+int vfio_device_get_name(VFIODevice *vbasedev, Error **errp);
+void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp);
#endif /* HW_VFIO_VFIO_COMMON_H */
--
2.39.3

View File

@ -0,0 +1,70 @@
From 942bd7251d166f558e0e6acf7ba853e940e2fb52 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:21 +0800
Subject: [PATCH 042/101] vfio/pci: Move VFIODevice initializations in
vfio_instance_init
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [41/67] 67392d7a92a6ec2155697a355c88d295338a0785 (eauger1/centos-qemu-kvm)
Some of the VFIODevice initializations is in vfio_realize,
move all of them in vfio_instance_init.
No functional change intended.
Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit dd2fcb1716be9b89c726b3446f38446bb99d6b3a)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/pci.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 3412a63bb1..3f5900cc46 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2995,9 +2995,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
if (vfio_device_get_name(vbasedev, errp) < 0) {
return;
}
- vbasedev->ops = &vfio_pci_ops;
- vbasedev->type = VFIO_DEVICE_TYPE_PCI;
- vbasedev->dev = DEVICE(vdev);
/*
* Mediated devices *might* operate compatibly with discarding of RAM, but
@@ -3346,6 +3343,7 @@ static void vfio_instance_init(Object *obj)
{
PCIDevice *pci_dev = PCI_DEVICE(obj);
VFIOPCIDevice *vdev = VFIO_PCI(obj);
+ VFIODevice *vbasedev = &vdev->vbasedev;
device_add_bootindex_property(obj, &vdev->bootindex,
"bootindex", NULL,
@@ -3354,7 +3352,11 @@ static void vfio_instance_init(Object *obj)
vdev->host.bus = ~0U;
vdev->host.slot = ~0U;
vdev->host.function = ~0U;
- vdev->vbasedev.fd = -1;
+
+ vbasedev->type = VFIO_DEVICE_TYPE_PCI;
+ vbasedev->ops = &vfio_pci_ops;
+ vbasedev->dev = DEVICE(vdev);
+ vbasedev->fd = -1;
vdev->nv_gpudirect_clique = 0xFF;
--
2.39.3

View File

@ -0,0 +1,77 @@
From ede579d6d5fe5be9235d6a218efdb237192aee0e Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:11 +0800
Subject: [PATCH 032/101] vfio/platform: Allow the selection of a given iommu
backend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [31/67] aba1dc16cada602edd7be1a28b0f57991131e6f7 (eauger1/centos-qemu-kvm)
Now we support two types of iommu backends, let's add the capability
to select one of them. This depends on whether an iommufd object has
been linked with the vfio-platform device:
If the user wants to use the legacy backend, it shall not
link the vfio-platform device with any iommufd object:
-device vfio-platform,host=XXX
This is called the legacy mode/backend.
If the user wants to use the iommufd backend (/dev/iommu) it
shall pass an iommufd object id in the vfio-platform device options:
-object iommufd,id=iommufd0
-device vfio-platform,host=XXX,iommufd=iommufd0
Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit a6c50e1c3f8d0eb77edaea392e61508bb3c516f8)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/platform.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index 8e3d4ac458..98ae4bc655 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -15,11 +15,13 @@
*/
#include "qemu/osdep.h"
+#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
#include "qapi/error.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#include "hw/vfio/vfio-platform.h"
+#include "sysemu/iommufd.h"
#include "migration/vmstate.h"
#include "qemu/error-report.h"
#include "qemu/lockable.h"
@@ -649,6 +651,10 @@ static Property vfio_platform_dev_properties[] = {
DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice,
mmap_timeout, 1100),
DEFINE_PROP_BOOL("x-irqfd", VFIOPlatformDevice, irqfd_allowed, true),
+#ifdef CONFIG_IOMMUFD
+ DEFINE_PROP_LINK("iommufd", VFIOPlatformDevice, vbasedev.iommufd,
+ TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
+#endif
DEFINE_PROP_END_OF_LIST(),
};
--
2.39.3

View File

@ -0,0 +1,108 @@
From 22664f4115d9b297ef4276e48f8ba0bc195ec99e Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:12 +0800
Subject: [PATCH 033/101] vfio/platform: Make vfio cdev pre-openable by passing
a file handle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [32/67] 069867dce64b826e92dc2051405a4ded5261981f (eauger1/centos-qemu-kvm)
This gives management tools like libvirt a chance to open the vfio
cdev with privilege and pass FD to qemu. This way qemu never needs
to have privilege to open a VFIO or iommu cdev node.
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 3016e60f8f715d2058a48e4956be994482c5e218)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/platform.c | 32 ++++++++++++++++++++++++--------
1 file changed, 24 insertions(+), 8 deletions(-)
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index 98ae4bc655..a97d9c6234 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -531,14 +531,13 @@ static VFIODeviceOps vfio_platform_ops = {
*/
static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
{
- struct stat st;
int ret;
- /* @sysfsdev takes precedence over @host */
- if (vbasedev->sysfsdev) {
+ /* @fd takes precedence over @sysfsdev which takes precedence over @host */
+ if (vbasedev->fd < 0 && vbasedev->sysfsdev) {
g_free(vbasedev->name);
vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
- } else {
+ } else if (vbasedev->fd < 0) {
if (!vbasedev->name || strchr(vbasedev->name, '/')) {
error_setg(errp, "wrong host device name");
return -EINVAL;
@@ -548,10 +547,9 @@ static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
vbasedev->name);
}
- if (stat(vbasedev->sysfsdev, &st) < 0) {
- error_setg_errno(errp, errno,
- "failed to get the sysfs host device file status");
- return -errno;
+ ret = vfio_device_get_name(vbasedev, errp);
+ if (ret) {
+ return ret;
}
ret = vfio_attach_device(vbasedev->name, vbasedev,
@@ -658,6 +656,20 @@ static Property vfio_platform_dev_properties[] = {
DEFINE_PROP_END_OF_LIST(),
};
+static void vfio_platform_instance_init(Object *obj)
+{
+ VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(obj);
+
+ vdev->vbasedev.fd = -1;
+}
+
+#ifdef CONFIG_IOMMUFD
+static void vfio_platform_set_fd(Object *obj, const char *str, Error **errp)
+{
+ vfio_device_set_fd(&VFIO_PLATFORM_DEVICE(obj)->vbasedev, str, errp);
+}
+#endif
+
static void vfio_platform_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
@@ -665,6 +677,9 @@ static void vfio_platform_class_init(ObjectClass *klass, void *data)
dc->realize = vfio_platform_realize;
device_class_set_props(dc, vfio_platform_dev_properties);
+#ifdef CONFIG_IOMMUFD
+ object_class_property_add_str(klass, "fd", NULL, vfio_platform_set_fd);
+#endif
dc->vmsd = &vfio_platform_vmstate;
dc->desc = "VFIO-based platform device assignment";
sbc->connect_irq_notifier = vfio_start_irqfd_injection;
@@ -677,6 +692,7 @@ static const TypeInfo vfio_platform_dev_info = {
.name = TYPE_VFIO_PLATFORM,
.parent = TYPE_SYS_BUS_DEVICE,
.instance_size = sizeof(VFIOPlatformDevice),
+ .instance_init = vfio_platform_instance_init,
.class_init = vfio_platform_class_init,
.class_size = sizeof(VFIOPlatformDeviceClass),
};
--
2.39.3

View File

@ -0,0 +1,64 @@
From 2417020283532030f424fe07dfeb7477e6489640 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Tue, 21 Nov 2023 16:44:22 +0800
Subject: [PATCH 043/101] vfio/platform: Move VFIODevice initializations in
vfio_platform_instance_init
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [42/67] 53a459b6246d7d7bdc7a62ac92f02f1e775a54a6 (eauger1/centos-qemu-kvm)
Some of the VFIODevice initializations is in vfio_platform_realize,
move all of them in vfio_platform_instance_init.
No functional change intended.
Suggested-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit a0cf44c8d618578843a65ea7f6d3db8ce52185bc)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/platform.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index a97d9c6234..506eb8193f 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -581,10 +581,6 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp)
VFIODevice *vbasedev = &vdev->vbasedev;
int i, ret;
- vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM;
- vbasedev->dev = dev;
- vbasedev->ops = &vfio_platform_ops;
-
qemu_mutex_init(&vdev->intp_mutex);
trace_vfio_platform_realize(vbasedev->sysfsdev ?
@@ -659,8 +655,12 @@ static Property vfio_platform_dev_properties[] = {
static void vfio_platform_instance_init(Object *obj)
{
VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(obj);
+ VFIODevice *vbasedev = &vdev->vbasedev;
- vdev->vbasedev.fd = -1;
+ vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM;
+ vbasedev->ops = &vfio_platform_ops;
+ vbasedev->dev = DEVICE(vdev);
+ vbasedev->fd = -1;
}
#ifdef CONFIG_IOMMUFD
--
2.39.3

View File

@ -0,0 +1,129 @@
From e75ec2aca351daabe597ca6322c1589885f30d7a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Tue, 19 Dec 2023 07:58:16 +0100
Subject: [PATCH 049/101] vfio/spapr: Extend VFIOIOMMUOps with a release
handler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [48/67] 1c4d22a6f69324805d050767fcf178d8566f2030 (eauger1/centos-qemu-kvm)
This allows to abstract a bit more the sPAPR IOMMU support in the
legacy IOMMU backend.
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Tested-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 001a013ea3f125d2ec0e709b5765754149d8d968)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/container.c | 10 +++-----
hw/vfio/spapr.c | 35 +++++++++++++++------------
include/hw/vfio/vfio-container-base.h | 1 +
3 files changed, 24 insertions(+), 22 deletions(-)
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index b22feb8ded..1e77a2929e 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -632,9 +632,8 @@ listener_release_exit:
QLIST_REMOVE(bcontainer, next);
vfio_kvm_device_del_group(group);
memory_listener_unregister(&bcontainer->listener);
- if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
- container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
- vfio_spapr_container_deinit(container);
+ if (bcontainer->ops->release) {
+ bcontainer->ops->release(bcontainer);
}
enable_discards_exit:
@@ -667,9 +666,8 @@ static void vfio_disconnect_container(VFIOGroup *group)
*/
if (QLIST_EMPTY(&container->group_list)) {
memory_listener_unregister(&bcontainer->listener);
- if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU ||
- container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
- vfio_spapr_container_deinit(container);
+ if (bcontainer->ops->release) {
+ bcontainer->ops->release(bcontainer);
}
}
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 5c6426e697..44617dfc6b 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -440,6 +440,24 @@ vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer,
}
}
+static void vfio_spapr_container_release(VFIOContainerBase *bcontainer)
+{
+ VFIOContainer *container = container_of(bcontainer, VFIOContainer,
+ bcontainer);
+ VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
+ container);
+ VFIOHostDMAWindow *hostwin, *next;
+
+ if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
+ memory_listener_unregister(&scontainer->prereg_listener);
+ }
+ QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next,
+ next) {
+ QLIST_REMOVE(hostwin, hostwin_next);
+ g_free(hostwin);
+ }
+}
+
static VFIOIOMMUOps vfio_iommu_spapr_ops;
static void setup_spapr_ops(VFIOContainerBase *bcontainer)
@@ -447,6 +465,7 @@ static void setup_spapr_ops(VFIOContainerBase *bcontainer)
vfio_iommu_spapr_ops = *bcontainer->ops;
vfio_iommu_spapr_ops.add_window = vfio_spapr_container_add_section_window;
vfio_iommu_spapr_ops.del_window = vfio_spapr_container_del_section_window;
+ vfio_iommu_spapr_ops.release = vfio_spapr_container_release;
bcontainer->ops = &vfio_iommu_spapr_ops;
}
@@ -527,19 +546,3 @@ listener_unregister_exit:
}
return ret;
}
-
-void vfio_spapr_container_deinit(VFIOContainer *container)
-{
- VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
- container);
- VFIOHostDMAWindow *hostwin, *next;
-
- if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
- memory_listener_unregister(&scontainer->prereg_listener);
- }
- QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next,
- next) {
- QLIST_REMOVE(hostwin, hostwin_next);
- g_free(hostwin);
- }
-}
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 2ae297ccda..5c9594b6c7 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -117,5 +117,6 @@ struct VFIOIOMMUOps {
Error **errp);
void (*del_window)(VFIOContainerBase *bcontainer,
MemoryRegionSection *section);
+ void (*release)(VFIOContainerBase *bcontainer);
};
#endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */
--
2.39.3

View File

@ -0,0 +1,150 @@
From 645ed97633935712edcc2c56f252738b38f15e3a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Tue, 19 Dec 2023 07:58:22 +0100
Subject: [PATCH 055/101] vfio/spapr: Introduce a sPAPR VFIOIOMMU QOM interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [54/67] 2ceac3c07d71790dc3852fbbbd4084a7affb9373 (eauger1/centos-qemu-kvm)
Move vfio_spapr_container_setup() to a VFIOIOMMUClass::setup handler
and convert the sPAPR VFIOIOMMUOps struct to a QOM interface. The
sPAPR QOM interface inherits from the legacy QOM interface because
because both have the same basic needs. The sPAPR interface is then
extended with the handlers specific to the sPAPR IOMMU.
This allows reuse and provides better abstraction of the backends. It
will be useful to avoid compiling the sPAPR IOMMU backend on targets
not supporting it.
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Tested-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit f221f641a2fe69c2ca3857759551470664b0bec8)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/container.c | 18 +++++--------
hw/vfio/spapr.c | 39 ++++++++++++++++-----------
include/hw/vfio/vfio-container-base.h | 1 +
3 files changed, 31 insertions(+), 27 deletions(-)
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index c22bdd3216..688cf23bab 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -381,6 +381,10 @@ static const VFIOIOMMUClass *vfio_get_iommu_class(int iommu_type, Error **errp)
case VFIO_TYPE1_IOMMU:
klass = object_class_by_name(TYPE_VFIO_IOMMU_LEGACY);
break;
+ case VFIO_SPAPR_TCE_v2_IOMMU:
+ case VFIO_SPAPR_TCE_IOMMU:
+ klass = object_class_by_name(TYPE_VFIO_IOMMU_SPAPR);
+ break;
default:
g_assert_not_reached();
};
@@ -623,19 +627,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
goto free_container_exit;
}
- switch (container->iommu_type) {
- case VFIO_TYPE1v2_IOMMU:
- case VFIO_TYPE1_IOMMU:
- ret = vfio_legacy_setup(bcontainer, errp);
- break;
- case VFIO_SPAPR_TCE_v2_IOMMU:
- case VFIO_SPAPR_TCE_IOMMU:
- ret = vfio_spapr_container_init(container, errp);
- break;
- default:
- g_assert_not_reached();
- }
+ assert(bcontainer->ops->setup);
+ ret = bcontainer->ops->setup(bcontainer, errp);
if (ret) {
goto enable_discards_exit;
}
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 44617dfc6b..0d949bb728 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -458,20 +458,11 @@ static void vfio_spapr_container_release(VFIOContainerBase *bcontainer)
}
}
-static VFIOIOMMUOps vfio_iommu_spapr_ops;
-
-static void setup_spapr_ops(VFIOContainerBase *bcontainer)
-{
- vfio_iommu_spapr_ops = *bcontainer->ops;
- vfio_iommu_spapr_ops.add_window = vfio_spapr_container_add_section_window;
- vfio_iommu_spapr_ops.del_window = vfio_spapr_container_del_section_window;
- vfio_iommu_spapr_ops.release = vfio_spapr_container_release;
- bcontainer->ops = &vfio_iommu_spapr_ops;
-}
-
-int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
+static int vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
+ Error **errp)
{
- VFIOContainerBase *bcontainer = &container->bcontainer;
+ VFIOContainer *container = container_of(bcontainer, VFIOContainer,
+ bcontainer);
VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
container);
struct vfio_iommu_spapr_tce_info info;
@@ -536,8 +527,6 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
0x1000);
}
- setup_spapr_ops(bcontainer);
-
return 0;
listener_unregister_exit:
@@ -546,3 +535,23 @@ listener_unregister_exit:
}
return ret;
}
+
+static void vfio_iommu_spapr_class_init(ObjectClass *klass, void *data)
+{
+ VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
+
+ vioc->add_window = vfio_spapr_container_add_section_window;
+ vioc->del_window = vfio_spapr_container_del_section_window;
+ vioc->release = vfio_spapr_container_release;
+ vioc->setup = vfio_spapr_container_setup;
+};
+
+static const TypeInfo types[] = {
+ {
+ .name = TYPE_VFIO_IOMMU_SPAPR,
+ .parent = TYPE_VFIO_IOMMU_LEGACY,
+ .class_init = vfio_iommu_spapr_class_init,
+ },
+};
+
+DEFINE_TYPES(types)
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index ce8b1fba88..9e21d7811f 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -95,6 +95,7 @@ void vfio_container_destroy(VFIOContainerBase *bcontainer);
#define TYPE_VFIO_IOMMU "vfio-iommu"
#define TYPE_VFIO_IOMMU_LEGACY TYPE_VFIO_IOMMU "-legacy"
+#define TYPE_VFIO_IOMMU_SPAPR TYPE_VFIO_IOMMU "-spapr"
/*
* VFIOContainerBase is not an abstract QOM object because it felt
--
2.39.3

View File

@ -0,0 +1,91 @@
From ff0c13c22878eed0f3879c0805bef5b9f9d83e04 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Thu, 2 Nov 2023 15:12:42 +0800
Subject: [PATCH 017/101] vfio/spapr: Introduce spapr backend and target
interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [16/67] e35cda157a2a1afeded3305622c861abd07edb51 (eauger1/centos-qemu-kvm)
Introduce an empty spapr backend which will hold spapr specific
content, currently only prereg_listener and hostwin_list.
Also introduce two spapr specific callbacks add/del_window into
VFIOIOMMUOps. Instantiate a spapr ops with a helper setup_spapr_ops
and assign it to bcontainer->ops.
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 9b7d38bf5a2c1054bfe6de08806954cdc45d8d98)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/spapr.c | 14 ++++++++++++++
include/hw/vfio/vfio-container-base.h | 6 ++++++
2 files changed, 20 insertions(+)
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 7a50975f25..e1a6b35563 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -24,6 +24,10 @@
#include "qapi/error.h"
#include "trace.h"
+typedef struct VFIOSpaprContainer {
+ VFIOContainer container;
+} VFIOSpaprContainer;
+
static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
{
if (memory_region_is_iommu(section->mr)) {
@@ -421,6 +425,14 @@ void vfio_container_del_section_window(VFIOContainer *container,
}
}
+static VFIOIOMMUOps vfio_iommu_spapr_ops;
+
+static void setup_spapr_ops(VFIOContainerBase *bcontainer)
+{
+ vfio_iommu_spapr_ops = *bcontainer->ops;
+ bcontainer->ops = &vfio_iommu_spapr_ops;
+}
+
int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
{
VFIOContainerBase *bcontainer = &container->bcontainer;
@@ -486,6 +498,8 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
0x1000);
}
+ setup_spapr_ops(bcontainer);
+
return 0;
listener_unregister_exit:
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index 9658ffb526..f62a14ac73 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -101,5 +101,11 @@ struct VFIOIOMMUOps {
int (*set_dirty_page_tracking)(VFIOContainerBase *bcontainer, bool start);
int (*query_dirty_bitmap)(VFIOContainerBase *bcontainer, VFIOBitmap *vbmap,
hwaddr iova, hwaddr size);
+ /* SPAPR specific */
+ int (*add_window)(VFIOContainerBase *bcontainer,
+ MemoryRegionSection *section,
+ Error **errp);
+ void (*del_window)(VFIOContainerBase *bcontainer,
+ MemoryRegionSection *section);
};
#endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */
--
2.39.3

View File

@ -0,0 +1,188 @@
From 3e9e7b57b15ac328f5d663b4e04df546d49f5fa6 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Thu, 2 Nov 2023 15:12:45 +0800
Subject: [PATCH 020/101] vfio/spapr: Move hostwin_list into spapr container
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [19/67] 87cfeaa32ad32a260a89b2bb1866d59e20c0fe30 (eauger1/centos-qemu-kvm)
No functional changes intended.
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit dbb9d0c9691d145338686d3e0920da047f2ab3da)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/spapr.c | 36 +++++++++++++++++++----------------
include/hw/vfio/vfio-common.h | 1 -
2 files changed, 20 insertions(+), 17 deletions(-)
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 68c3dd6c75..5c6426e697 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -27,6 +27,7 @@
typedef struct VFIOSpaprContainer {
VFIOContainer container;
MemoryListener prereg_listener;
+ QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
} VFIOSpaprContainer;
static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
@@ -154,12 +155,12 @@ static const MemoryListener vfio_prereg_listener = {
.region_del = vfio_prereg_listener_region_del,
};
-static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova,
+static void vfio_host_win_add(VFIOSpaprContainer *scontainer, hwaddr min_iova,
hwaddr max_iova, uint64_t iova_pgsizes)
{
VFIOHostDMAWindow *hostwin;
- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+ QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
if (ranges_overlap(hostwin->min_iova,
hostwin->max_iova - hostwin->min_iova + 1,
min_iova,
@@ -173,15 +174,15 @@ static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova,
hostwin->min_iova = min_iova;
hostwin->max_iova = max_iova;
hostwin->iova_pgsizes = iova_pgsizes;
- QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
+ QLIST_INSERT_HEAD(&scontainer->hostwin_list, hostwin, hostwin_next);
}
-static int vfio_host_win_del(VFIOContainer *container,
+static int vfio_host_win_del(VFIOSpaprContainer *scontainer,
hwaddr min_iova, hwaddr max_iova)
{
VFIOHostDMAWindow *hostwin;
- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+ QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
QLIST_REMOVE(hostwin, hostwin_next);
g_free(hostwin);
@@ -192,7 +193,7 @@ static int vfio_host_win_del(VFIOContainer *container,
return -1;
}
-static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container,
+static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container,
hwaddr iova, hwaddr end)
{
VFIOHostDMAWindow *hostwin;
@@ -329,6 +330,8 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
{
VFIOContainer *container = container_of(bcontainer, VFIOContainer,
bcontainer);
+ VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
+ container);
VFIOHostDMAWindow *hostwin;
hwaddr pgsize = 0;
int ret;
@@ -344,7 +347,7 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
iova = section->offset_within_address_space;
end = iova + int128_get64(section->size) - 1;
- if (!vfio_find_hostwin(container, iova, end)) {
+ if (!vfio_find_hostwin(scontainer, iova, end)) {
error_setg(errp, "Container %p can't map guest IOVA region"
" 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container,
iova, end);
@@ -358,7 +361,7 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
}
/* For now intersections are not allowed, we may relax this later */
- QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+ QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
if (ranges_overlap(hostwin->min_iova,
hostwin->max_iova - hostwin->min_iova + 1,
section->offset_within_address_space,
@@ -380,7 +383,7 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
return ret;
}
- vfio_host_win_add(container, section->offset_within_address_space,
+ vfio_host_win_add(scontainer, section->offset_within_address_space,
section->offset_within_address_space +
int128_get64(section->size) - 1, pgsize);
#ifdef CONFIG_KVM
@@ -419,6 +422,8 @@ vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer,
{
VFIOContainer *container = container_of(bcontainer, VFIOContainer,
bcontainer);
+ VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
+ container);
if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
return;
@@ -426,7 +431,7 @@ vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer,
vfio_spapr_remove_window(container,
section->offset_within_address_space);
- if (vfio_host_win_del(container,
+ if (vfio_host_win_del(scontainer,
section->offset_within_address_space,
section->offset_within_address_space +
int128_get64(section->size) - 1) < 0) {
@@ -454,7 +459,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
int ret, fd = container->fd;
- QLIST_INIT(&container->hostwin_list);
+ QLIST_INIT(&scontainer->hostwin_list);
/*
* The host kernel code implementing VFIO_IOMMU_DISABLE is called
@@ -506,7 +511,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
} else {
/* The default table uses 4K pages */
bcontainer->pgsizes = 0x1000;
- vfio_host_win_add(container, info.dma32_window_start,
+ vfio_host_win_add(scontainer, info.dma32_window_start,
info.dma32_window_start +
info.dma32_window_size - 1,
0x1000);
@@ -525,15 +530,14 @@ listener_unregister_exit:
void vfio_spapr_container_deinit(VFIOContainer *container)
{
+ VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
+ container);
VFIOHostDMAWindow *hostwin, *next;
if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
- VFIOSpaprContainer *scontainer = container_of(container,
- VFIOSpaprContainer,
- container);
memory_listener_unregister(&scontainer->prereg_listener);
}
- QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
+ QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next,
next) {
QLIST_REMOVE(hostwin, hostwin_next);
g_free(hostwin);
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index ed6148c058..24ecc0e7ee 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -79,7 +79,6 @@ typedef struct VFIOContainer {
VFIOContainerBase bcontainer;
int fd; /* /dev/vfio/vfio, empowered by the attached groups */
unsigned iommu_type;
- QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
QLIST_HEAD(, VFIOGroup) group_list;
} VFIOContainer;
--
2.39.3

View File

@ -0,0 +1,120 @@
From 17e6dad3e43e173147c0ca33f6f1f4f317a77d0b Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Thu, 2 Nov 2023 15:12:44 +0800
Subject: [PATCH 019/101] vfio/spapr: Move prereg_listener into spapr container
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [18/67] dbea1b0b759e91b953271da92bba4ca6853bec82 (eauger1/centos-qemu-kvm)
No functional changes intended.
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 6ad359ec29af7f21dcb206c8edb26905a4925f80)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/spapr.c | 24 ++++++++++++++++--------
include/hw/vfio/vfio-common.h | 1 -
2 files changed, 16 insertions(+), 9 deletions(-)
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 5be1911aad..68c3dd6c75 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -26,6 +26,7 @@
typedef struct VFIOSpaprContainer {
VFIOContainer container;
+ MemoryListener prereg_listener;
} VFIOSpaprContainer;
static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
@@ -48,8 +49,9 @@ static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
static void vfio_prereg_listener_region_add(MemoryListener *listener,
MemoryRegionSection *section)
{
- VFIOContainer *container = container_of(listener, VFIOContainer,
- prereg_listener);
+ VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
+ prereg_listener);
+ VFIOContainer *container = &scontainer->container;
VFIOContainerBase *bcontainer = &container->bcontainer;
const hwaddr gpa = section->offset_within_address_space;
hwaddr end;
@@ -107,8 +109,9 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener,
static void vfio_prereg_listener_region_del(MemoryListener *listener,
MemoryRegionSection *section)
{
- VFIOContainer *container = container_of(listener, VFIOContainer,
- prereg_listener);
+ VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
+ prereg_listener);
+ VFIOContainer *container = &scontainer->container;
const hwaddr gpa = section->offset_within_address_space;
hwaddr end;
int ret;
@@ -445,6 +448,8 @@ static void setup_spapr_ops(VFIOContainerBase *bcontainer)
int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
{
VFIOContainerBase *bcontainer = &container->bcontainer;
+ VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
+ container);
struct vfio_iommu_spapr_tce_info info;
bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
int ret, fd = container->fd;
@@ -463,9 +468,9 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
return -errno;
}
} else {
- container->prereg_listener = vfio_prereg_listener;
+ scontainer->prereg_listener = vfio_prereg_listener;
- memory_listener_register(&container->prereg_listener,
+ memory_listener_register(&scontainer->prereg_listener,
&address_space_memory);
if (bcontainer->error) {
ret = -1;
@@ -513,7 +518,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
listener_unregister_exit:
if (v2) {
- memory_listener_unregister(&container->prereg_listener);
+ memory_listener_unregister(&scontainer->prereg_listener);
}
return ret;
}
@@ -523,7 +528,10 @@ void vfio_spapr_container_deinit(VFIOContainer *container)
VFIOHostDMAWindow *hostwin, *next;
if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
- memory_listener_unregister(&container->prereg_listener);
+ VFIOSpaprContainer *scontainer = container_of(container,
+ VFIOSpaprContainer,
+ container);
+ memory_listener_unregister(&scontainer->prereg_listener);
}
QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
next) {
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 055f679363..ed6148c058 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -78,7 +78,6 @@ struct VFIOGroup;
typedef struct VFIOContainer {
VFIOContainerBase bcontainer;
int fd; /* /dev/vfio/vfio, empowered by the attached groups */
- MemoryListener prereg_listener;
unsigned iommu_type;
QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
QLIST_HEAD(, VFIOGroup) group_list;
--
2.39.3

View File

@ -0,0 +1,46 @@
From 5d485eb1442a81b51688124ce30024e96490acbf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
Date: Tue, 19 Dec 2023 07:58:24 +0100
Subject: [PATCH 057/101] vfio/spapr: Only compile sPAPR IOMMU support when
needed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [56/67] 4dc0cbde470f877a8aac2bf6fab6923f2f919285 (eauger1/centos-qemu-kvm)
sPAPR IOMMU support is only needed for pseries machines. Compile out
support when CONFIG_PSERIES is not set. This saves ~7K of text.
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Tested-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 10164df6ed3d41cbf67105dcd954a663ef4cc3e9)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/meson.build | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
index e5d98b6adc..bb98493b53 100644
--- a/hw/vfio/meson.build
+++ b/hw/vfio/meson.build
@@ -4,9 +4,9 @@ vfio_ss.add(files(
'common.c',
'container-base.c',
'container.c',
- 'spapr.c',
'migration.c',
))
+vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c'))
vfio_ss.add(when: 'CONFIG_IOMMUFD', if_true: files(
'iommufd.c',
))
--
2.39.3

View File

@ -0,0 +1,184 @@
From 3b7f044f15b4a9daf4ad7eda58777aba6dbe3fc0 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@intel.com>
Date: Thu, 2 Nov 2023 15:12:43 +0800
Subject: [PATCH 018/101] vfio/spapr: switch to spapr IOMMU BE
add/del_section_window
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Eric Auger <eric.auger@redhat.com>
RH-MergeRequest: 211: IOMMUFD backend backport
RH-Jira: RHEL-19302 RHEL-21057
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
RH-Acked-by: Sebastian Ott <sebott@redhat.com>
RH-Commit: [17/67] a0d9f1f2d4d2592f3d9fc2ee5b2c38236a986e38 (eauger1/centos-qemu-kvm)
No functional change intended.
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com
Signed-off-by: Cédric Le Goater <clg@redhat.com>
(cherry picked from commit 233309e8e4c158af6c6b126d5ad021bae40a918a)
Signed-off-by: Eric Auger <eric.auger@redhat.com>
---
hw/vfio/common.c | 8 ++------
hw/vfio/container-base.c | 21 +++++++++++++++++++++
hw/vfio/spapr.c | 19 ++++++++++++++-----
include/hw/vfio/vfio-common.h | 5 -----
include/hw/vfio/vfio-container-base.h | 5 +++++
5 files changed, 42 insertions(+), 16 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 483ba82089..572ae7c934 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -571,8 +571,6 @@ static void vfio_listener_region_add(MemoryListener *listener,
{
VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
listener);
- VFIOContainer *container = container_of(bcontainer, VFIOContainer,
- bcontainer);
hwaddr iova, end;
Int128 llend, llsize;
void *vaddr;
@@ -595,7 +593,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
return;
}
- if (vfio_container_add_section_window(container, section, &err)) {
+ if (vfio_container_add_section_window(bcontainer, section, &err)) {
goto fail;
}
@@ -738,8 +736,6 @@ static void vfio_listener_region_del(MemoryListener *listener,
{
VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase,
listener);
- VFIOContainer *container = container_of(bcontainer, VFIOContainer,
- bcontainer);
hwaddr iova, end;
Int128 llend, llsize;
int ret;
@@ -818,7 +814,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
memory_region_unref(section->mr);
- vfio_container_del_section_window(container, section);
+ vfio_container_del_section_window(bcontainer, section);
}
typedef struct VFIODirtyRanges {
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 0177f43741..71f7274973 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -31,6 +31,27 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb);
}
+int vfio_container_add_section_window(VFIOContainerBase *bcontainer,
+ MemoryRegionSection *section,
+ Error **errp)
+{
+ if (!bcontainer->ops->add_window) {
+ return 0;
+ }
+
+ return bcontainer->ops->add_window(bcontainer, section, errp);
+}
+
+void vfio_container_del_section_window(VFIOContainerBase *bcontainer,
+ MemoryRegionSection *section)
+{
+ if (!bcontainer->ops->del_window) {
+ return;
+ }
+
+ return bcontainer->ops->del_window(bcontainer, section);
+}
+
int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
bool start)
{
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index e1a6b35563..5be1911aad 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -319,10 +319,13 @@ static int vfio_spapr_create_window(VFIOContainer *container,
return 0;
}
-int vfio_container_add_section_window(VFIOContainer *container,
- MemoryRegionSection *section,
- Error **errp)
+static int
+vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
+ MemoryRegionSection *section,
+ Error **errp)
{
+ VFIOContainer *container = container_of(bcontainer, VFIOContainer,
+ bcontainer);
VFIOHostDMAWindow *hostwin;
hwaddr pgsize = 0;
int ret;
@@ -407,9 +410,13 @@ int vfio_container_add_section_window(VFIOContainer *container,
return 0;
}
-void vfio_container_del_section_window(VFIOContainer *container,
- MemoryRegionSection *section)
+static void
+vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer,
+ MemoryRegionSection *section)
{
+ VFIOContainer *container = container_of(bcontainer, VFIOContainer,
+ bcontainer);
+
if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
return;
}
@@ -430,6 +437,8 @@ static VFIOIOMMUOps vfio_iommu_spapr_ops;
static void setup_spapr_ops(VFIOContainerBase *bcontainer)
{
vfio_iommu_spapr_ops = *bcontainer->ops;
+ vfio_iommu_spapr_ops.add_window = vfio_spapr_container_add_section_window;
+ vfio_iommu_spapr_ops.del_window = vfio_spapr_container_del_section_window;
bcontainer->ops = &vfio_iommu_spapr_ops;
}
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b9e5a0e64b..055f679363 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -169,11 +169,6 @@ VFIOAddressSpace *vfio_get_address_space(AddressSpace *as);
void vfio_put_address_space(VFIOAddressSpace *space);
/* SPAPR specific */
-int vfio_container_add_section_window(VFIOContainer *container,
- MemoryRegionSection *section,
- Error **errp);
-void vfio_container_del_section_window(VFIOContainer *container,
- MemoryRegionSection *section);
int vfio_spapr_container_init(VFIOContainer *container, Error **errp);
void vfio_spapr_container_deinit(VFIOContainer *container);
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
index f62a14ac73..4b6f017c6f 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -75,6 +75,11 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer,
int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
IOMMUTLBEntry *iotlb);
+int vfio_container_add_section_window(VFIOContainerBase *bcontainer,
+ MemoryRegionSection *section,
+ Error **errp);
+void vfio_container_del_section_window(VFIOContainerBase *bcontainer,
+ MemoryRegionSection *section);
int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
bool start);
int vfio_container_query_dirty_bitmap(VFIOContainerBase *bcontainer,
--
2.39.3

View File

@ -0,0 +1,177 @@
From d54e88103aa76f3bf755b3f4308d8ab60367c6ef Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 14 Sep 2023 10:00:59 -0400
Subject: [PATCH 074/101] virtio-blk: add lock to protect s->rq
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [5/26] 17dcd5ba18c03e5633a014d8d62d34d8dd7b43bf (kmwolf/centos-qemu-kvm)
s->rq is accessed from IO_CODE and GLOBAL_STATE_CODE. Introduce a lock
to protect s->rq and eliminate reliance on the AioContext lock.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230914140101.1065008-3-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/block/virtio-blk.c | 67 +++++++++++++++++++++++-----------
include/hw/virtio/virtio-blk.h | 3 +-
2 files changed, 47 insertions(+), 23 deletions(-)
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index a1f8e15522..ee38e089bc 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -82,8 +82,11 @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
/* Break the link as the next request is going to be parsed from the
* ring again. Otherwise we may end up doing a double completion! */
req->mr_next = NULL;
- req->next = s->rq;
- s->rq = req;
+
+ WITH_QEMU_LOCK_GUARD(&s->rq_lock) {
+ req->next = s->rq;
+ s->rq = req;
+ }
} else if (action == BLOCK_ERROR_ACTION_REPORT) {
virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
if (acct_failed) {
@@ -1183,10 +1186,13 @@ static void virtio_blk_dma_restart_bh(void *opaque)
{
VirtIOBlock *s = opaque;
- VirtIOBlockReq *req = s->rq;
+ VirtIOBlockReq *req;
MultiReqBuffer mrb = {};
- s->rq = NULL;
+ WITH_QEMU_LOCK_GUARD(&s->rq_lock) {
+ req = s->rq;
+ s->rq = NULL;
+ }
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
while (req) {
@@ -1238,22 +1244,29 @@ static void virtio_blk_reset(VirtIODevice *vdev)
AioContext *ctx;
VirtIOBlockReq *req;
+ /* Dataplane has stopped... */
+ assert(!s->dataplane_started);
+
+ /* ...but requests may still be in flight. */
ctx = blk_get_aio_context(s->blk);
aio_context_acquire(ctx);
blk_drain(s->blk);
+ aio_context_release(ctx);
/* We drop queued requests after blk_drain() because blk_drain() itself can
* produce them. */
- while (s->rq) {
- req = s->rq;
- s->rq = req->next;
- virtqueue_detach_element(req->vq, &req->elem, 0);
- virtio_blk_free_request(req);
- }
+ WITH_QEMU_LOCK_GUARD(&s->rq_lock) {
+ while (s->rq) {
+ req = s->rq;
+ s->rq = req->next;
- aio_context_release(ctx);
+ /* No other threads can access req->vq here */
+ virtqueue_detach_element(req->vq, &req->elem, 0);
+
+ virtio_blk_free_request(req);
+ }
+ }
- assert(!s->dataplane_started);
blk_set_enable_write_cache(s->blk, s->original_wce);
}
@@ -1443,18 +1456,22 @@ static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status)
static void virtio_blk_save_device(VirtIODevice *vdev, QEMUFile *f)
{
VirtIOBlock *s = VIRTIO_BLK(vdev);
- VirtIOBlockReq *req = s->rq;
- while (req) {
- qemu_put_sbyte(f, 1);
+ WITH_QEMU_LOCK_GUARD(&s->rq_lock) {
+ VirtIOBlockReq *req = s->rq;
- if (s->conf.num_queues > 1) {
- qemu_put_be32(f, virtio_get_queue_index(req->vq));
- }
+ while (req) {
+ qemu_put_sbyte(f, 1);
- qemu_put_virtqueue_element(vdev, f, &req->elem);
- req = req->next;
+ if (s->conf.num_queues > 1) {
+ qemu_put_be32(f, virtio_get_queue_index(req->vq));
+ }
+
+ qemu_put_virtqueue_element(vdev, f, &req->elem);
+ req = req->next;
+ }
}
+
qemu_put_sbyte(f, 0);
}
@@ -1480,8 +1497,11 @@ static int virtio_blk_load_device(VirtIODevice *vdev, QEMUFile *f,
req = qemu_get_virtqueue_element(vdev, f, sizeof(VirtIOBlockReq));
virtio_blk_init_request(s, virtio_get_queue(vdev, vq_idx), req);
- req->next = s->rq;
- s->rq = req;
+
+ WITH_QEMU_LOCK_GUARD(&s->rq_lock) {
+ req->next = s->rq;
+ s->rq = req;
+ }
}
return 0;
@@ -1628,6 +1648,8 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
s->host_features);
virtio_init(vdev, VIRTIO_ID_BLOCK, s->config_size);
+ qemu_mutex_init(&s->rq_lock);
+
s->blk = conf->conf.blk;
s->rq = NULL;
s->sector_mask = (s->conf.conf.logical_block_size / BDRV_SECTOR_SIZE) - 1;
@@ -1679,6 +1701,7 @@ static void virtio_blk_device_unrealize(DeviceState *dev)
virtio_del_queue(vdev, i);
}
qemu_coroutine_dec_pool_size(conf->num_queues * conf->queue_size / 2);
+ qemu_mutex_destroy(&s->rq_lock);
blk_ram_registrar_destroy(&s->blk_ram_registrar);
qemu_del_vm_change_state_handler(s->change);
blockdev_mark_auto_del(s->blk);
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index dafec432ce..9881009c22 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -54,7 +54,8 @@ struct VirtIOBlockReq;
struct VirtIOBlock {
VirtIODevice parent_obj;
BlockBackend *blk;
- void *rq;
+ QemuMutex rq_lock;
+ void *rq; /* protected by rq_lock */
VirtIOBlkConf conf;
unsigned short sector_mask;
bool original_wce;
--
2.39.3

View File

@ -0,0 +1,167 @@
From a2069ff76637365cacf5b96f9427b98a6ca2c9ba Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 14 Sep 2023 10:01:00 -0400
Subject: [PATCH 075/101] virtio-blk: don't lock AioContext in the completion
code path
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [6/26] 3426f62c2156f6967bb4ffbce75a4ff46d3312a3 (kmwolf/centos-qemu-kvm)
Nothing in the completion code path relies on the AioContext lock
anymore. Virtqueues are only accessed from one thread at any moment and
the s->rq global state is protected by its own lock now.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230914140101.1065008-4-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/block/virtio-blk.c | 34 ++++------------------------------
1 file changed, 4 insertions(+), 30 deletions(-)
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index ee38e089bc..f5315df042 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -105,7 +105,6 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
VirtIOBlock *s = next->dev;
VirtIODevice *vdev = VIRTIO_DEVICE(s);
- aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
while (next) {
VirtIOBlockReq *req = next;
next = req->mr_next;
@@ -138,7 +137,6 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
block_acct_done(blk_get_stats(s->blk), &req->acct);
virtio_blk_free_request(req);
}
- aio_context_release(blk_get_aio_context(s->conf.conf.blk));
}
static void virtio_blk_flush_complete(void *opaque, int ret)
@@ -146,19 +144,13 @@ static void virtio_blk_flush_complete(void *opaque, int ret)
VirtIOBlockReq *req = opaque;
VirtIOBlock *s = req->dev;
- aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
- if (ret) {
- if (virtio_blk_handle_rw_error(req, -ret, 0, true)) {
- goto out;
- }
+ if (ret && virtio_blk_handle_rw_error(req, -ret, 0, true)) {
+ return;
}
virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
block_acct_done(blk_get_stats(s->blk), &req->acct);
virtio_blk_free_request(req);
-
-out:
- aio_context_release(blk_get_aio_context(s->conf.conf.blk));
}
static void virtio_blk_discard_write_zeroes_complete(void *opaque, int ret)
@@ -168,11 +160,8 @@ static void virtio_blk_discard_write_zeroes_complete(void *opaque, int ret)
bool is_write_zeroes = (virtio_ldl_p(VIRTIO_DEVICE(s), &req->out.type) &
~VIRTIO_BLK_T_BARRIER) == VIRTIO_BLK_T_WRITE_ZEROES;
- aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
- if (ret) {
- if (virtio_blk_handle_rw_error(req, -ret, false, is_write_zeroes)) {
- goto out;
- }
+ if (ret && virtio_blk_handle_rw_error(req, -ret, false, is_write_zeroes)) {
+ return;
}
virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
@@ -180,9 +169,6 @@ static void virtio_blk_discard_write_zeroes_complete(void *opaque, int ret)
block_acct_done(blk_get_stats(s->blk), &req->acct);
}
virtio_blk_free_request(req);
-
-out:
- aio_context_release(blk_get_aio_context(s->conf.conf.blk));
}
#ifdef __linux__
@@ -229,10 +215,8 @@ static void virtio_blk_ioctl_complete(void *opaque, int status)
virtio_stl_p(vdev, &scsi->data_len, hdr->dxfer_len);
out:
- aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
virtio_blk_req_complete(req, status);
virtio_blk_free_request(req);
- aio_context_release(blk_get_aio_context(s->conf.conf.blk));
g_free(ioctl_req);
}
@@ -672,7 +656,6 @@ static void virtio_blk_zone_report_complete(void *opaque, int ret)
{
ZoneCmdData *data = opaque;
VirtIOBlockReq *req = data->req;
- VirtIOBlock *s = req->dev;
VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
struct iovec *in_iov = data->in_iov;
unsigned in_num = data->in_num;
@@ -763,10 +746,8 @@ static void virtio_blk_zone_report_complete(void *opaque, int ret)
}
out:
- aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
virtio_blk_req_complete(req, err_status);
virtio_blk_free_request(req);
- aio_context_release(blk_get_aio_context(s->conf.conf.blk));
g_free(data->zone_report_data.zones);
g_free(data);
}
@@ -829,10 +810,8 @@ static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
}
- aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
virtio_blk_req_complete(req, err_status);
virtio_blk_free_request(req);
- aio_context_release(blk_get_aio_context(s->conf.conf.blk));
}
static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
@@ -882,7 +861,6 @@ static void virtio_blk_zone_append_complete(void *opaque, int ret)
{
ZoneCmdData *data = opaque;
VirtIOBlockReq *req = data->req;
- VirtIOBlock *s = req->dev;
VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
int64_t append_sector, n;
uint8_t err_status = VIRTIO_BLK_S_OK;
@@ -905,10 +883,8 @@ static void virtio_blk_zone_append_complete(void *opaque, int ret)
trace_virtio_blk_zone_append_complete(vdev, req, append_sector, ret);
out:
- aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
virtio_blk_req_complete(req, err_status);
virtio_blk_free_request(req);
- aio_context_release(blk_get_aio_context(s->conf.conf.blk));
g_free(data);
}
@@ -944,10 +920,8 @@ static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
return 0;
out:
- aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
virtio_blk_req_complete(req, err_status);
virtio_blk_free_request(req);
- aio_context_release(blk_get_aio_context(s->conf.conf.blk));
return err_status;
}
--
2.39.3

View File

@ -0,0 +1,67 @@
From 2816f6ce20c496e21947f215112be34a5cb93606 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 14 Sep 2023 10:01:01 -0400
Subject: [PATCH 076/101] virtio-blk: don't lock AioContext in the submission
code path
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [7/26] e0de2744cb319569ea008334e45ee5fc2ba9b6d7 (kmwolf/centos-qemu-kvm)
There is no need to acquire the AioContext lock around blk_aio_*() or
blk_get_geometry() anymore. I/O plugging (defer_call()) also does not
require the AioContext lock anymore.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-ID: <20230914140101.1065008-5-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/block/virtio-blk.c | 5 -----
1 file changed, 5 deletions(-)
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index f5315df042..e110f9718b 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -1111,7 +1111,6 @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
MultiReqBuffer mrb = {};
bool suppress_notifications = virtio_queue_get_notification(vq);
- aio_context_acquire(blk_get_aio_context(s->blk));
defer_call_begin();
do {
@@ -1137,7 +1136,6 @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
}
defer_call_end();
- aio_context_release(blk_get_aio_context(s->blk));
}
static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
@@ -1168,7 +1166,6 @@ static void virtio_blk_dma_restart_bh(void *opaque)
s->rq = NULL;
}
- aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
while (req) {
VirtIOBlockReq *next = req->next;
if (virtio_blk_handle_request(req, &mrb)) {
@@ -1192,8 +1189,6 @@ static void virtio_blk_dma_restart_bh(void *opaque)
/* Paired with inc in virtio_blk_dma_restart_cb() */
blk_dec_in_flight(s->conf.conf.blk);
-
- aio_context_release(blk_get_aio_context(s->conf.conf.blk));
}
static void virtio_blk_dma_restart_cb(void *opaque, bool running,
--
2.39.3

View File

@ -0,0 +1,87 @@
From 5db0b4131c56d96760b3300298f4bedab99d35cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= <marcandre.lureau@redhat.com>
Date: Wed, 6 Sep 2023 17:00:22 +0400
Subject: [PATCH 100/101] virtio-gpu: block migration of VMs with blob=true
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RH-Author: Marc-André Lureau <marcandre.lureau@redhat.com>
RH-MergeRequest: 217: virtio-gpu: block migration of VMs with blob=true
RH-Jira: RHEL-7565
RH-Commit: [1/1] f978ca697d574b1419eb027a1007c060dfb83298
JIRA: https://issues.redhat.com/browse/RHEL-7565
commit 9c549ab6895a43ad0cb33e684e11cdb0b5400897
Author: Marc-André Lureau <marcandre.lureau@redhat.com>
Date: Wed Sep 6 17:00:22 2023 +0400
virtio-gpu: block migration of VMs with blob=true
"blob" resources don't have an associated pixman image:
#0 pixman_image_get_stride (image=0x0) at ../pixman/pixman-image.c:921
#1 0x0000562327c25236 in virtio_gpu_save (f=0x56232bb13b00, opaque=0x56232b555a60, size=0, field=0x5623289ab6c8 <__compound_literal.3+104>, vmdesc=0x56232ab59fe0) at ../hw/display/virtio-gpu.c:1225
Related to:
https://bugzilla.redhat.com/show_bug.cgi?id=2236353
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Acked-by: Peter Xu <peterx@redhat.com>
[ rhel backport - fix Error* vs Error** argument ]
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
hw/display/virtio-gpu.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index b016d3bac8..1702190ead 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -27,6 +27,7 @@
#include "hw/virtio/virtio-gpu-pixman.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/qdev-properties.h"
+#include "migration/blocker.h"
#include "qemu/log.h"
#include "qemu/module.h"
#include "qapi/error.h"
@@ -41,6 +42,8 @@ virtio_gpu_find_check_resource(VirtIOGPU *g, uint32_t resource_id,
static void virtio_gpu_reset_bh(void *opaque);
+static Error *blob_mig_blocker;
+
void virtio_gpu_update_cursor_data(VirtIOGPU *g,
struct virtio_gpu_scanout *s,
uint32_t resource_id)
@@ -1452,6 +1455,14 @@ void virtio_gpu_device_realize(DeviceState *qdev, Error **errp)
error_setg(errp, "blobs and virgl are not compatible (yet)");
return;
}
+
+ if (!blob_mig_blocker) {
+ error_setg(&blob_mig_blocker,
+ "virtio-gpu blob VMs are currently not migratable.");
+ }
+ if (migrate_add_blocker(&blob_mig_blocker, errp)) {
+ return;
+ }
}
if (!virtio_gpu_base_device_realize(qdev,
@@ -1478,6 +1489,9 @@ static void virtio_gpu_device_unrealize(DeviceState *qdev)
{
VirtIOGPU *g = VIRTIO_GPU(qdev);
+ if (virtio_gpu_blob_enabled(g->parent_obj.conf)) {
+ migrate_del_blocker(&blob_mig_blocker);
+ }
g_clear_pointer(&g->ctrl_bh, qemu_bh_delete);
g_clear_pointer(&g->cursor_bh, qemu_bh_delete);
g_clear_pointer(&g->reset_bh, qemu_bh_delete);
--
2.39.3

View File

@ -0,0 +1,58 @@
From 1ee3f919a51135a0798a14c734ca80d74d30025d Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon, 4 Dec 2023 11:42:57 -0500
Subject: [PATCH 078/101] virtio-scsi: don't lock AioContext around
virtio_queue_aio_attach_host_notifier()
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [9/26] 5e1179e617d05bf765b285ba42393ec1ddbeba28 (kmwolf/centos-qemu-kvm)
virtio_queue_aio_attach_host_notifier() does not require the AioContext
lock. Stop taking the lock and add an explicit smp_wmb() because we were
relying on the implicit barrier in the AioContext lock before.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231204164259.1515217-3-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/scsi/virtio-scsi-dataplane.c | 8 +-------
1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index 1e684beebe..135e23fe54 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -149,23 +149,17 @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev)
memory_region_transaction_commit();
- /*
- * These fields are visible to the IOThread so we rely on implicit barriers
- * in aio_context_acquire() on the write side and aio_notify_accept() on
- * the read side.
- */
s->dataplane_starting = false;
s->dataplane_started = true;
+ smp_wmb(); /* paired with aio_notify_accept() */
if (s->bus.drain_count == 0) {
- aio_context_acquire(s->ctx);
virtio_queue_aio_attach_host_notifier(vs->ctrl_vq, s->ctx);
virtio_queue_aio_attach_host_notifier_no_poll(vs->event_vq, s->ctx);
for (i = 0; i < vs->conf.num_queues; i++) {
virtio_queue_aio_attach_host_notifier(vs->cmd_vqs[i], s->ctx);
}
- aio_context_release(s->ctx);
}
return 0;
--
2.39.3

View File

@ -0,0 +1,173 @@
From c2d7633ead6e19d4b6af5552ca907ae071b8734b Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 5 Dec 2023 13:19:58 -0500
Subject: [PATCH 081/101] virtio-scsi: replace AioContext lock with tmf_bh_lock
RH-Author: Kevin Wolf <kwolf@redhat.com>
RH-MergeRequest: 214: Remove AioContext lock
RH-Jira: RHEL-15965
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
RH-Commit: [12/26] 8fb375bfd72a491d47321c78078577071a4e90fb (kmwolf/centos-qemu-kvm)
Protect the Task Management Function BH state with a lock. The TMF BH
runs in the main loop thread. An IOThread might process a TMF at the
same time as the TMF BH is running. Therefore tmf_bh_list and tmf_bh
must be protected by a lock.
Run TMF request completion in the IOThread using aio_wait_bh_oneshot().
This avoids more locking to protect the virtqueue and SCSI layer state.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-ID: <20231205182011.1976568-2-stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
hw/scsi/virtio-scsi.c | 62 ++++++++++++++++++++++-----------
include/hw/virtio/virtio-scsi.h | 3 +-
2 files changed, 43 insertions(+), 22 deletions(-)
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 9c751bf296..4f8d35facc 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -123,6 +123,30 @@ static void virtio_scsi_complete_req(VirtIOSCSIReq *req)
virtio_scsi_free_req(req);
}
+static void virtio_scsi_complete_req_bh(void *opaque)
+{
+ VirtIOSCSIReq *req = opaque;
+
+ virtio_scsi_complete_req(req);
+}
+
+/*
+ * Called from virtio_scsi_do_one_tmf_bh() in main loop thread. The main loop
+ * thread cannot touch the virtqueue since that could race with an IOThread.
+ */
+static void virtio_scsi_complete_req_from_main_loop(VirtIOSCSIReq *req)
+{
+ VirtIOSCSI *s = req->dev;
+
+ if (!s->ctx || s->ctx == qemu_get_aio_context()) {
+ /* No need to schedule a BH when there is no IOThread */
+ virtio_scsi_complete_req(req);
+ } else {
+ /* Run request completion in the IOThread */
+ aio_wait_bh_oneshot(s->ctx, virtio_scsi_complete_req_bh, req);
+ }
+}
+
static void virtio_scsi_bad_req(VirtIOSCSIReq *req)
{
virtio_error(VIRTIO_DEVICE(req->dev), "wrong size for virtio-scsi headers");
@@ -338,10 +362,7 @@ static void virtio_scsi_do_one_tmf_bh(VirtIOSCSIReq *req)
out:
object_unref(OBJECT(d));
-
- virtio_scsi_acquire(s);
- virtio_scsi_complete_req(req);
- virtio_scsi_release(s);
+ virtio_scsi_complete_req_from_main_loop(req);
}
/* Some TMFs must be processed from the main loop thread */
@@ -354,18 +375,16 @@ static void virtio_scsi_do_tmf_bh(void *opaque)
GLOBAL_STATE_CODE();
- virtio_scsi_acquire(s);
+ WITH_QEMU_LOCK_GUARD(&s->tmf_bh_lock) {
+ QTAILQ_FOREACH_SAFE(req, &s->tmf_bh_list, next, tmp) {
+ QTAILQ_REMOVE(&s->tmf_bh_list, req, next);
+ QTAILQ_INSERT_TAIL(&reqs, req, next);
+ }
- QTAILQ_FOREACH_SAFE(req, &s->tmf_bh_list, next, tmp) {
- QTAILQ_REMOVE(&s->tmf_bh_list, req, next);
- QTAILQ_INSERT_TAIL(&reqs, req, next);
+ qemu_bh_delete(s->tmf_bh);
+ s->tmf_bh = NULL;
}
- qemu_bh_delete(s->tmf_bh);
- s->tmf_bh = NULL;
-
- virtio_scsi_release(s);
-
QTAILQ_FOREACH_SAFE(req, &reqs, next, tmp) {
QTAILQ_REMOVE(&reqs, req, next);
virtio_scsi_do_one_tmf_bh(req);
@@ -379,8 +398,7 @@ static void virtio_scsi_reset_tmf_bh(VirtIOSCSI *s)
GLOBAL_STATE_CODE();
- virtio_scsi_acquire(s);
-
+ /* Called after ioeventfd has been stopped, so tmf_bh_lock is not needed */
if (s->tmf_bh) {
qemu_bh_delete(s->tmf_bh);
s->tmf_bh = NULL;
@@ -393,19 +411,19 @@ static void virtio_scsi_reset_tmf_bh(VirtIOSCSI *s)
req->resp.tmf.response = VIRTIO_SCSI_S_TARGET_FAILURE;
virtio_scsi_complete_req(req);
}
-
- virtio_scsi_release(s);
}
static void virtio_scsi_defer_tmf_to_bh(VirtIOSCSIReq *req)
{
VirtIOSCSI *s = req->dev;
- QTAILQ_INSERT_TAIL(&s->tmf_bh_list, req, next);
+ WITH_QEMU_LOCK_GUARD(&s->tmf_bh_lock) {
+ QTAILQ_INSERT_TAIL(&s->tmf_bh_list, req, next);
- if (!s->tmf_bh) {
- s->tmf_bh = qemu_bh_new(virtio_scsi_do_tmf_bh, s);
- qemu_bh_schedule(s->tmf_bh);
+ if (!s->tmf_bh) {
+ s->tmf_bh = qemu_bh_new(virtio_scsi_do_tmf_bh, s);
+ qemu_bh_schedule(s->tmf_bh);
+ }
}
}
@@ -1235,6 +1253,7 @@ static void virtio_scsi_device_realize(DeviceState *dev, Error **errp)
Error *err = NULL;
QTAILQ_INIT(&s->tmf_bh_list);
+ qemu_mutex_init(&s->tmf_bh_lock);
virtio_scsi_common_realize(dev,
virtio_scsi_handle_ctrl,
@@ -1277,6 +1296,7 @@ static void virtio_scsi_device_unrealize(DeviceState *dev)
qbus_set_hotplug_handler(BUS(&s->bus), NULL);
virtio_scsi_common_unrealize(dev);
+ qemu_mutex_destroy(&s->tmf_bh_lock);
}
static Property virtio_scsi_properties[] = {
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
index 779568ab5d..da8cb928d9 100644
--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h
@@ -85,8 +85,9 @@ struct VirtIOSCSI {
/*
* TMFs deferred to main loop BH. These fields are protected by
- * virtio_scsi_acquire().
+ * tmf_bh_lock.
*/
+ QemuMutex tmf_bh_lock;
QEMUBH *tmf_bh;
QTAILQ_HEAD(, VirtIOSCSIReq) tmf_bh_list;
--
2.39.3

Some files were not shown because too many files have changed in this diff Show More