diff --git a/kvm-block-blkio-fix-module_block.py-parsing.patch b/kvm-block-blkio-fix-module_block.py-parsing.patch new file mode 100644 index 0000000..1c89a0b --- /dev/null +++ b/kvm-block-blkio-fix-module_block.py-parsing.patch @@ -0,0 +1,205 @@ +From 545482400ea87d54b1b839587f8aaad41e30692f Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Tue, 4 Jul 2023 14:34:36 +0200 +Subject: [PATCH 36/37] block/blkio: fix module_block.py parsing + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 181: block/blkio: fix module_block.py parsing +RH-Bugzilla: 2213317 +RH-Acked-by: Stefano Garzarella +RH-Acked-by: Kevin Wolf +RH-Commit: [1/2] c85df95824f4889526a73527771dec9efcb06926 (stefanha/centos-stream-qemu-kvm) + +When QEMU is built with --enable-modules, the module_block.py script +parses block/*.c to find block drivers that are built as modules. The +script generates a table of block drivers called block_driver_modules[]. +This table is used for block driver module loading. + +The blkio.c driver uses macros to define its BlockDriver structs. This +was done to avoid code duplication but the module_block.py script is +unable to parse the macro. The result is that libblkio-based block +drivers can be built as modules but will not be found at runtime. + +One fix is to make the module_block.py script or build system fancier so +it can parse C macros (e.g. by parsing the preprocessed source code). I +chose not to do this because it raises the complexity of the build, +making future issues harder to debug. + +Keep things simple: use the macro to avoid duplicating BlockDriver +function pointers but define .format_name and .protocol_name manually +for each BlockDriver. This way the module_block.py is able to parse the +code. + +Also get rid of the block driver name macros (e.g. DRIVER_IO_URING) +because module_block.py cannot parse them either. 
+ +Fixes: fd66dbd424f5 ("blkio: add libblkio block driver") +Reported-by: Qing Wang +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Stefano Garzarella +Message-id: 20230704123436.187761-1-stefanha@redhat.com +Cc: Stefano Garzarella +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit c21eae1ccc782440f320accb6f90c66cb8f45ee9) + +Conflicts: +- Downstream lacks commit 28ff7b4dfbb5 ("block/blkio: convert to + blk_io_plug_call() API") so keep the .bdrv_co_io_unplug callback. + +Signed-off-by: Stefan Hajnoczi +--- + block/blkio.c | 118 ++++++++++++++++++++++++++------------------------ + 1 file changed, 61 insertions(+), 57 deletions(-) + +diff --git a/block/blkio.c b/block/blkio.c +index 6a6f20f923..afcec359f2 100644 +--- a/block/blkio.c ++++ b/block/blkio.c +@@ -21,16 +21,6 @@ + + #include "block/block-io.h" + +-/* +- * Keep the QEMU BlockDriver names identical to the libblkio driver names. +- * Using macros instead of typing out the string literals avoids typos. +- */ +-#define DRIVER_IO_URING "io_uring" +-#define DRIVER_NVME_IO_URING "nvme-io_uring" +-#define DRIVER_VIRTIO_BLK_VFIO_PCI "virtio-blk-vfio-pci" +-#define DRIVER_VIRTIO_BLK_VHOST_USER "virtio-blk-vhost-user" +-#define DRIVER_VIRTIO_BLK_VHOST_VDPA "virtio-blk-vhost-vdpa" +- + /* + * Allocated bounce buffers are kept in a list sorted by buffer address. 
+ */ +@@ -743,15 +733,15 @@ static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags, + return ret; + } + +- if (strcmp(blkio_driver, DRIVER_IO_URING) == 0) { ++ if (strcmp(blkio_driver, "io_uring") == 0) { + ret = blkio_io_uring_open(bs, options, flags, errp); +- } else if (strcmp(blkio_driver, DRIVER_NVME_IO_URING) == 0) { ++ } else if (strcmp(blkio_driver, "nvme-io_uring") == 0) { + ret = blkio_nvme_io_uring(bs, options, flags, errp); +- } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VFIO_PCI) == 0) { ++ } else if (strcmp(blkio_driver, "virtio-blk-vfio-pci") == 0) { + ret = blkio_virtio_blk_common_open(bs, options, flags, errp); +- } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_USER) == 0) { ++ } else if (strcmp(blkio_driver, "virtio-blk-vhost-user") == 0) { + ret = blkio_virtio_blk_common_open(bs, options, flags, errp); +- } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_VDPA) == 0) { ++ } else if (strcmp(blkio_driver, "virtio-blk-vhost-vdpa") == 0) { + ret = blkio_virtio_blk_common_open(bs, options, flags, errp); + } else { + g_assert_not_reached(); +@@ -1027,50 +1017,64 @@ static void blkio_refresh_limits(BlockDriverState *bs, Error **errp) + * - truncate + */ + +-#define BLKIO_DRIVER(name, ...) 
\ +- { \ +- .format_name = name, \ +- .protocol_name = name, \ +- .instance_size = sizeof(BDRVBlkioState), \ +- .bdrv_file_open = blkio_file_open, \ +- .bdrv_close = blkio_close, \ +- .bdrv_co_getlength = blkio_co_getlength, \ +- .bdrv_co_truncate = blkio_truncate, \ +- .bdrv_co_get_info = blkio_co_get_info, \ +- .bdrv_attach_aio_context = blkio_attach_aio_context, \ +- .bdrv_detach_aio_context = blkio_detach_aio_context, \ +- .bdrv_co_pdiscard = blkio_co_pdiscard, \ +- .bdrv_co_preadv = blkio_co_preadv, \ +- .bdrv_co_pwritev = blkio_co_pwritev, \ +- .bdrv_co_flush_to_disk = blkio_co_flush, \ +- .bdrv_co_pwrite_zeroes = blkio_co_pwrite_zeroes, \ +- .bdrv_co_io_unplug = blkio_co_io_unplug, \ +- .bdrv_refresh_limits = blkio_refresh_limits, \ +- .bdrv_register_buf = blkio_register_buf, \ +- .bdrv_unregister_buf = blkio_unregister_buf, \ +- __VA_ARGS__ \ +- } +- +-static BlockDriver bdrv_io_uring = BLKIO_DRIVER( +- DRIVER_IO_URING, +- .bdrv_needs_filename = true, +-); +- +-static BlockDriver bdrv_nvme_io_uring = BLKIO_DRIVER( +- DRIVER_NVME_IO_URING, +-); +- +-static BlockDriver bdrv_virtio_blk_vfio_pci = BLKIO_DRIVER( +- DRIVER_VIRTIO_BLK_VFIO_PCI +-); ++/* ++ * Do not include .format_name and .protocol_name because module_block.py ++ * does not parse macros in the source code. 
++ */ ++#define BLKIO_DRIVER_COMMON \ ++ .instance_size = sizeof(BDRVBlkioState), \ ++ .bdrv_file_open = blkio_file_open, \ ++ .bdrv_close = blkio_close, \ ++ .bdrv_co_getlength = blkio_co_getlength, \ ++ .bdrv_co_truncate = blkio_truncate, \ ++ .bdrv_co_get_info = blkio_co_get_info, \ ++ .bdrv_attach_aio_context = blkio_attach_aio_context, \ ++ .bdrv_detach_aio_context = blkio_detach_aio_context, \ ++ .bdrv_co_pdiscard = blkio_co_pdiscard, \ ++ .bdrv_co_preadv = blkio_co_preadv, \ ++ .bdrv_co_pwritev = blkio_co_pwritev, \ ++ .bdrv_co_flush_to_disk = blkio_co_flush, \ ++ .bdrv_co_pwrite_zeroes = blkio_co_pwrite_zeroes, \ ++ .bdrv_co_io_unplug = blkio_co_io_unplug, \ ++ .bdrv_refresh_limits = blkio_refresh_limits, \ ++ .bdrv_register_buf = blkio_register_buf, \ ++ .bdrv_unregister_buf = blkio_unregister_buf, + +-static BlockDriver bdrv_virtio_blk_vhost_user = BLKIO_DRIVER( +- DRIVER_VIRTIO_BLK_VHOST_USER +-); ++/* ++ * Use the same .format_name and .protocol_name as the libblkio driver name for ++ * consistency. 
++ */ + +-static BlockDriver bdrv_virtio_blk_vhost_vdpa = BLKIO_DRIVER( +- DRIVER_VIRTIO_BLK_VHOST_VDPA +-); ++static BlockDriver bdrv_io_uring = { ++ .format_name = "io_uring", ++ .protocol_name = "io_uring", ++ .bdrv_needs_filename = true, ++ BLKIO_DRIVER_COMMON ++}; ++ ++static BlockDriver bdrv_nvme_io_uring = { ++ .format_name = "nvme-io_uring", ++ .protocol_name = "nvme-io_uring", ++ BLKIO_DRIVER_COMMON ++}; ++ ++static BlockDriver bdrv_virtio_blk_vfio_pci = { ++ .format_name = "virtio-blk-vfio-pci", ++ .protocol_name = "virtio-blk-vfio-pci", ++ BLKIO_DRIVER_COMMON ++}; ++ ++static BlockDriver bdrv_virtio_blk_vhost_user = { ++ .format_name = "virtio-blk-vhost-user", ++ .protocol_name = "virtio-blk-vhost-user", ++ BLKIO_DRIVER_COMMON ++}; ++ ++static BlockDriver bdrv_virtio_blk_vhost_vdpa = { ++ .format_name = "virtio-blk-vhost-vdpa", ++ .protocol_name = "virtio-blk-vhost-vdpa", ++ BLKIO_DRIVER_COMMON ++}; + + static void bdrv_blkio_init(void) + { +-- +2.39.3 + diff --git a/kvm-hw-vfio-pci-quirks-Sanitize-capability-pointer.patch b/kvm-hw-vfio-pci-quirks-Sanitize-capability-pointer.patch new file mode 100644 index 0000000..ffabd75 --- /dev/null +++ b/kvm-hw-vfio-pci-quirks-Sanitize-capability-pointer.patch @@ -0,0 +1,76 @@ +From fcd6219a95851d17fd8bde69d87e78c6533be990 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 24/37] hw/vfio/pci-quirks: Sanitize capability pointer +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [22/28] cb080409c1912f4365f8e31cd23c914b48f91575 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit 0ddcb39c9357 +Author: Alex Williamson +Date: Fri Jun 30 16:36:08 2023 -0600 + + hw/vfio/pci-quirks: Sanitize capability 
pointer + + Coverity reports a tainted scalar when traversing the capabilities + chain (CID 1516589). In practice I've never seen a device with a + chain so broken as to cause an issue, but it's also pretty easy to + sanitize. + + Fixes: f6b30c1984f7 ("hw/vfio/pci-quirks: Support alternate offset for GPUDirect Cliques") + Signed-off-by: Alex Williamson + Reviewed-by: Cédric Le Goater + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/pci-quirks.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c +index 0ed2fcd531..f4ff836805 100644 +--- a/hw/vfio/pci-quirks.c ++++ b/hw/vfio/pci-quirks.c +@@ -1530,6 +1530,12 @@ const PropertyInfo qdev_prop_nv_gpudirect_clique = { + .set = set_nv_gpudirect_clique_id, + }; + ++static bool is_valid_std_cap_offset(uint8_t pos) ++{ ++ return (pos >= PCI_STD_HEADER_SIZEOF && ++ pos <= (PCI_CFG_SPACE_SIZE - PCI_CAP_SIZEOF)); ++} ++ + static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp) + { + PCIDevice *pdev = &vdev->pdev; +@@ -1563,7 +1569,7 @@ static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp) + */ + ret = pread(vdev->vbasedev.fd, &tmp, 1, + vdev->config_offset + PCI_CAPABILITY_LIST); +- if (ret != 1 || !tmp) { ++ if (ret != 1 || !is_valid_std_cap_offset(tmp)) { + error_setg(errp, "NVIDIA GPUDirect Clique ID: error getting cap list"); + return -EINVAL; + } +@@ -1575,7 +1581,7 @@ static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp) + d4_conflict = true; + } + tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT]; +- } while (tmp); ++ } while (is_valid_std_cap_offset(tmp)); + + if (!c8_conflict) { + pos = 0xC8; +-- +2.39.3 + diff --git a/kvm-hw-vfio-pci-quirks-Support-alternate-offset-for-GPUD.patch b/kvm-hw-vfio-pci-quirks-Support-alternate-offset-for-GPUD.patch new file mode 100644 index 0000000..99f5c75 --- /dev/null +++
b/kvm-hw-vfio-pci-quirks-Support-alternate-offset-for-GPUD.patch @@ -0,0 +1,110 @@ +From dd38230a0a375fb8427fa106ff79562e56c51b6c Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 18/37] hw/vfio/pci-quirks: Support alternate offset for + GPUDirect Cliques +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [16/28] 9befb7c9adaeb58e9d0b49686cf54b751c742832 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit f6b30c1984f7 +Author: Alex Williamson +Date: Thu Jun 8 12:05:07 2023 -0600 + + hw/vfio/pci-quirks: Support alternate offset for GPUDirect Cliques + + NVIDIA Turing and newer GPUs implement the MSI-X capability at the offset + previously reserved for use by hypervisors to implement the GPUDirect + Cliques capability. A revised specification provides an alternate + location. Add a config space walk to the quirk to check for conflicts, + allowing us to fall back to the new location or generate an error at the + quirk setup rather than when the real conflicting capability is added + should there be no available location. 
+ + Signed-off-by: Alex Williamson + Reviewed-by: Cédric Le Goater + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/pci-quirks.c | 41 ++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 40 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c +index f0147a050a..0ed2fcd531 100644 +--- a/hw/vfio/pci-quirks.c ++++ b/hw/vfio/pci-quirks.c +@@ -1490,6 +1490,9 @@ void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev) + * +---------------------------------+---------------------------------+ + * + * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf ++ * ++ * Specification for Turning and later GPU architectures: ++ * https://lists.gnu.org/archive/html/qemu-devel/2023-06/pdf142OR4O4c2.pdf + */ + static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v, + const char *name, void *opaque, +@@ -1530,7 +1533,9 @@ const PropertyInfo qdev_prop_nv_gpudirect_clique = { + static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp) + { + PCIDevice *pdev = &vdev->pdev; +- int ret, pos = 0xC8; ++ int ret, pos; ++ bool c8_conflict = false, d4_conflict = false; ++ uint8_t tmp; + + if (vdev->nv_gpudirect_clique == 0xFF) { + return 0; +@@ -1547,6 +1552,40 @@ static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp) + return -EINVAL; + } + ++ /* ++ * Per the updated specification above, it's recommended to use offset ++ * D4h for Turing and later GPU architectures due to a conflict of the ++ * MSI-X capability at C8h. We don't know how to determine the GPU ++ * architecture, instead we walk the capability chain to mark conflicts ++ * and choose one or error based on the result. ++ * ++ * NB. Cap list head in pdev->config is already cleared, read from device. 
++ */ ++ ret = pread(vdev->vbasedev.fd, &tmp, 1, ++ vdev->config_offset + PCI_CAPABILITY_LIST); ++ if (ret != 1 || !tmp) { ++ error_setg(errp, "NVIDIA GPUDirect Clique ID: error getting cap list"); ++ return -EINVAL; ++ } ++ ++ do { ++ if (tmp == 0xC8) { ++ c8_conflict = true; ++ } else if (tmp == 0xD4) { ++ d4_conflict = true; ++ } ++ tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT]; ++ } while (tmp); ++ ++ if (!c8_conflict) { ++ pos = 0xC8; ++ } else if (!d4_conflict) { ++ pos = 0xD4; ++ } else { ++ error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid config space"); ++ return -EINVAL; ++ } ++ + ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp); + if (ret < 0) { + error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: "); +-- +2.39.3 + diff --git a/kvm-migration-Add-switchover-ack-capability.patch b/kvm-migration-Add-switchover-ack-capability.patch new file mode 100644 index 0000000..399c9ed --- /dev/null +++ b/kvm-migration-Add-switchover-ack-capability.patch @@ -0,0 +1,162 @@ +From 8f89d3bc8f226cd038bf88b9fb3ef43b0fb33034 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 10/37] migration: Add switchover ack capability +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [8/28] 2f4ca020783bd617eca13b18289fce764279833b (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit 6574232fff6a +Author: Avihai Horon +Date: Wed Jun 21 14:11:54 2023 +0300 + + migration: Add switchover ack capability + + Migration downtime estimation is calculated based on bandwidth and + remaining migration data. This assumes that loading of migration data in + the destination takes a negligible amount of time and that downtime + depends only on network speed. 
+ + While this may be true for RAM, it's not necessarily true for other + migrated devices. For example, loading the data of a VFIO device in the + destination might require from the device to allocate resources, prepare + internal data structures and so on. These operations can take a + significant amount of time which can increase migration downtime. + + This patch adds a new capability "switchover ack" that prevents the + source from stopping the VM and completing the migration until an ACK + is received from the destination that it's OK to do so. + + This can be used by migrated devices in various ways to reduce downtime. + For example, a device can send initial precopy metadata to pre-allocate + resources in the destination and use this capability to make sure that + the pre-allocation is completed before the source VM is stopped, so it + will have full effect. + + This new capability relies on the return path capability to communicate + from the destination back to the source. + + The actual implementation of the capability will be added in the + following patches. 
+ + Signed-off-by: Avihai Horon + Reviewed-by: Peter Xu + Acked-by: Markus Armbruster + Tested-by: YangHang Liu + Acked-by: Alex Williamson + Signed-off-by: Cédric Le Goater + +Conflicts: + - qapi/migration.json + re-indent of @switchover-ack to avoid ../qapi/migration.json:482:1: + unexpected de-indent (expected at least 17 spaces) + +Signed-off-by: Cédric Le Goater +--- + migration/options.c | 21 +++++++++++++++++++++ + migration/options.h | 1 + + qapi/migration.json | 14 +++++++++++++- + 3 files changed, 35 insertions(+), 1 deletion(-) + +diff --git a/migration/options.c b/migration/options.c +index a76984276d..c3df6c6dde 100644 +--- a/migration/options.c ++++ b/migration/options.c +@@ -182,6 +182,8 @@ Property migration_properties[] = { + DEFINE_PROP_MIG_CAP("x-zero-copy-send", + MIGRATION_CAPABILITY_ZERO_COPY_SEND), + #endif ++ DEFINE_PROP_MIG_CAP("x-switchover-ack", ++ MIGRATION_CAPABILITY_SWITCHOVER_ACK), + + DEFINE_PROP_END_OF_LIST(), + }; +@@ -305,6 +307,13 @@ bool migrate_return_path(void) + return s->capabilities[MIGRATION_CAPABILITY_RETURN_PATH]; + } + ++bool migrate_switchover_ack(void) ++{ ++ MigrationState *s = migrate_get_current(); ++ ++ return s->capabilities[MIGRATION_CAPABILITY_SWITCHOVER_ACK]; ++} ++ + bool migrate_validate_uuid(void) + { + MigrationState *s = migrate_get_current(); +@@ -532,6 +541,18 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) + } + } + ++ if (new_caps[MIGRATION_CAPABILITY_SWITCHOVER_ACK]) { ++ if (!new_caps[MIGRATION_CAPABILITY_RETURN_PATH]) { ++ error_setg(errp, "Capability 'switchover-ack' requires capability " ++ "'return-path'"); ++ return false; ++ } ++ ++ /* Disable this capability until it's implemented */ ++ error_setg(errp, "'switchover-ack' is not implemented yet"); ++ return false; ++ } ++ + return true; + } + +diff --git a/migration/options.h b/migration/options.h +index 7b0f7245ad..0fc7be6869 100644 +--- a/migration/options.h ++++ b/migration/options.h +@@ -47,6 +47,7 @@ bool 
migrate_postcopy_ram(void); + bool migrate_rdma_pin_all(void); + bool migrate_release_ram(void); + bool migrate_return_path(void); ++bool migrate_switchover_ack(void); + bool migrate_validate_uuid(void); + bool migrate_xbzrle(void); + bool migrate_zero_blocks(void); +diff --git a/qapi/migration.json b/qapi/migration.json +index 2c35b7b9cf..b6a58347cc 100644 +--- a/qapi/migration.json ++++ b/qapi/migration.json +@@ -478,6 +478,18 @@ + # should not affect the correctness of postcopy migration. + # (since 7.1) + # ++# @switchover-ack: If enabled, migration will not stop the source VM ++# and complete the migration until an ACK is received ++# from the destination that it's OK to do so. ++# Exactly when this ACK is sent depends on the ++# migrated devices that use this feature. For ++# example, a device can use it to make sure some of ++# its data is sent and loaded in the destination ++# before doing switchover. This can reduce downtime ++# if devices that support this capability are ++# present. 'return-path' capability must be enabled ++# to use it. (since 8.1) ++# + # Features: + # @unstable: Members @x-colo and @x-ignore-shared are experimental. 
+ # +@@ -492,7 +504,7 @@ + 'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate', + { 'name': 'x-ignore-shared', 'features': [ 'unstable' ] }, + 'validate-uuid', 'background-snapshot', +- 'zero-copy-send', 'postcopy-preempt'] } ++ 'zero-copy-send', 'postcopy-preempt', 'switchover-ack'] } + + ## + # @MigrationCapabilityStatus: +-- +2.39.3 + diff --git a/kvm-migration-Enable-switchover-ack-capability.patch b/kvm-migration-Enable-switchover-ack-capability.patch new file mode 100644 index 0000000..e08e5df --- /dev/null +++ b/kvm-migration-Enable-switchover-ack-capability.patch @@ -0,0 +1,56 @@ +From bbe565f7d3b7fe46971e020e9bd8e79dc9ffa69c Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 12/37] migration: Enable switchover ack capability +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [10/28] c4a7d7d26a97181c9516d133a6610bfa5dcb1d16 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit 538ef4fe2f72 +Author: Avihai Horon +Date: Wed Jun 21 14:11:56 2023 +0300 + + migration: Enable switchover ack capability + + Now that switchover ack logic has been implemented, enable the + capability. 
+ + Signed-off-by: Avihai Horon + Reviewed-by: Juan Quintela + Reviewed-by: Peter Xu + Tested-by: YangHang Liu + Acked-by: Alex Williamson + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + migration/options.c | 4 ---- + 1 file changed, 4 deletions(-) + +diff --git a/migration/options.c b/migration/options.c +index c3df6c6dde..ccd7ef3907 100644 +--- a/migration/options.c ++++ b/migration/options.c +@@ -547,10 +547,6 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) + "'return-path'"); + return false; + } +- +- /* Disable this capability until it's implemented */ +- error_setg(errp, "'switchover-ack' is not implemented yet"); +- return false; + } + + return true; +-- +2.39.3 + diff --git a/kvm-migration-Implement-switchover-ack-logic.patch b/kvm-migration-Implement-switchover-ack-logic.patch new file mode 100644 index 0000000..49b9f12 --- /dev/null +++ b/kvm-migration-Implement-switchover-ack-logic.patch @@ -0,0 +1,339 @@ +From 387c39f198d94f600be525e363edc7ca916dc261 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 11/37] migration: Implement switchover ack logic +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [9/28] 853e1978f3b9f87942863bba894a0ed908bde6b1 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit 1b4adb10f898 +Author: Avihai Horon +Date: Wed Jun 21 14:11:55 2023 +0300 + + migration: Implement switchover ack logic + + Implement switchover ack logic. This prevents the source from stopping + the VM and completing the migration until an ACK is received from the + destination that it's OK to do so. 
+ + To achieve this, a new SaveVMHandlers handler switchover_ack_needed() + and a new return path message MIG_RP_MSG_SWITCHOVER_ACK are added. + + The switchover_ack_needed() handler is called during migration setup in + the destination to check if switchover ack is used by the migrated + device. + + When switchover is approved by all migrated devices in the destination + that support this capability, the MIG_RP_MSG_SWITCHOVER_ACK return path + message is sent to the source to notify it that it's OK to do + switchover. + + Signed-off-by: Avihai Horon + Reviewed-by: Peter Xu + Tested-by: YangHang Liu + Acked-by: Alex Williamson + Signed-off-by: Cédric Le Goater + +Conflicts: + - migration/migration.c + context changes due to commit f4584076fc31 ("migration: switch + from .vm_was_running to .vm_old_state") + +Signed-off-by: Cédric Le Goater +--- + include/migration/register.h | 2 ++ + migration/migration.c | 32 +++++++++++++++++++-- + migration/migration.h | 14 ++++++++++ + migration/savevm.c | 54 ++++++++++++++++++++++++++++++++++++ + migration/savevm.h | 1 + + migration/trace-events | 3 ++ + 6 files changed, 104 insertions(+), 2 deletions(-) + +diff --git a/include/migration/register.h b/include/migration/register.h +index a8dfd8fefd..90914f32f5 100644 +--- a/include/migration/register.h ++++ b/include/migration/register.h +@@ -71,6 +71,8 @@ typedef struct SaveVMHandlers { + int (*load_cleanup)(void *opaque); + /* Called when postcopy migration wants to resume from failure */ + int (*resume_prepare)(MigrationState *s, void *opaque); ++ /* Checks if switchover ack should be used. 
Called only in dest */ ++ bool (*switchover_ack_needed)(void *opaque); + } SaveVMHandlers; + + int register_savevm_live(const char *idstr, +diff --git a/migration/migration.c b/migration/migration.c +index 1ac5f19bc2..9bf1caee6c 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -76,6 +76,7 @@ enum mig_rp_message_type { + MIG_RP_MSG_REQ_PAGES, /* data (start: be64, len: be32) */ + MIG_RP_MSG_RECV_BITMAP, /* send recved_bitmap back to source */ + MIG_RP_MSG_RESUME_ACK, /* tell source that we are ready to resume */ ++ MIG_RP_MSG_SWITCHOVER_ACK, /* Tell source it's OK to do switchover */ + + MIG_RP_MSG_MAX + }; +@@ -756,6 +757,11 @@ bool migration_has_all_channels(void) + return true; + } + ++int migrate_send_rp_switchover_ack(MigrationIncomingState *mis) ++{ ++ return migrate_send_rp_message(mis, MIG_RP_MSG_SWITCHOVER_ACK, 0, NULL); ++} ++ + /* + * Send a 'SHUT' message on the return channel with the given value + * to indicate that we've finished with the RP. Non-0 value indicates +@@ -1415,6 +1421,7 @@ void migrate_init(MigrationState *s) + s->vm_was_running = false; + s->iteration_initial_bytes = 0; + s->threshold_size = 0; ++ s->switchover_acked = false; + } + + int migrate_add_blocker_internal(Error *reason, Error **errp) +@@ -1731,6 +1738,7 @@ static struct rp_cmd_args { + [MIG_RP_MSG_REQ_PAGES_ID] = { .len = -1, .name = "REQ_PAGES_ID" }, + [MIG_RP_MSG_RECV_BITMAP] = { .len = -1, .name = "RECV_BITMAP" }, + [MIG_RP_MSG_RESUME_ACK] = { .len = 4, .name = "RESUME_ACK" }, ++ [MIG_RP_MSG_SWITCHOVER_ACK] = { .len = 0, .name = "SWITCHOVER_ACK" }, + [MIG_RP_MSG_MAX] = { .len = -1, .name = "MAX" }, + }; + +@@ -1969,6 +1977,11 @@ retry: + } + break; + ++ case MIG_RP_MSG_SWITCHOVER_ACK: ++ ms->switchover_acked = true; ++ trace_source_return_path_thread_switchover_acked(); ++ break; ++ + default: + break; + } +@@ -2720,6 +2733,20 @@ static void migration_update_counters(MigrationState *s, + bandwidth, s->threshold_size); + } + ++static bool 
migration_can_switchover(MigrationState *s) ++{ ++ if (!migrate_switchover_ack()) { ++ return true; ++ } ++ ++ /* No reason to wait for switchover ACK if VM is stopped */ ++ if (!runstate_is_running()) { ++ return true; ++ } ++ ++ return s->switchover_acked; ++} ++ + /* Migration thread iteration status */ + typedef enum { + MIG_ITERATE_RESUME, /* Resume current iteration */ +@@ -2735,6 +2762,7 @@ static MigIterateState migration_iteration_run(MigrationState *s) + { + uint64_t must_precopy, can_postcopy; + bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE; ++ bool can_switchover = migration_can_switchover(s); + + qemu_savevm_state_pending_estimate(&must_precopy, &can_postcopy); + uint64_t pending_size = must_precopy + can_postcopy; +@@ -2747,14 +2775,14 @@ static MigIterateState migration_iteration_run(MigrationState *s) + trace_migrate_pending_exact(pending_size, must_precopy, can_postcopy); + } + +- if (!pending_size || pending_size < s->threshold_size) { ++ if ((!pending_size || pending_size < s->threshold_size) && can_switchover) { + trace_migration_thread_low_pending(pending_size); + migration_completion(s); + return MIG_ITERATE_BREAK; + } + + /* Still a significant amount to transfer */ +- if (!in_postcopy && must_precopy <= s->threshold_size && ++ if (!in_postcopy && must_precopy <= s->threshold_size && can_switchover && + qatomic_read(&s->start_postcopy)) { + if (postcopy_start(s)) { + error_report("%s: postcopy failed to start", __func__); +diff --git a/migration/migration.h b/migration/migration.h +index 2b71df8617..e9679f8029 100644 +--- a/migration/migration.h ++++ b/migration/migration.h +@@ -204,6 +204,13 @@ struct MigrationIncomingState { + * contains valid information. + */ + QemuMutex page_request_mutex; ++ ++ /* ++ * Number of devices that have yet to approve switchover. When this reaches ++ * zero an ACK that it's OK to do switchover is sent to the source. No lock ++ * is needed as this field is updated serially. 
++ */ ++ unsigned int switchover_ack_pending_num; + }; + + MigrationIncomingState *migration_incoming_get_current(void); +@@ -421,6 +428,12 @@ struct MigrationState { + + /* QEMU_VM_VMDESCRIPTION content filled for all non-iterable devices. */ + JSONWriter *vmdesc; ++ ++ /* ++ * Indicates whether an ACK from the destination that it's OK to do ++ * switchover has been received. ++ */ ++ bool switchover_acked; + }; + + void migrate_set_state(int *state, int old_state, int new_state); +@@ -461,6 +474,7 @@ int migrate_send_rp_message_req_pages(MigrationIncomingState *mis, + void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis, + char *block_name); + void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value); ++int migrate_send_rp_switchover_ack(MigrationIncomingState *mis); + + void dirty_bitmap_mig_before_vm_start(void); + void dirty_bitmap_mig_cancel_outgoing(void); +diff --git a/migration/savevm.c b/migration/savevm.c +index 211eff3a8b..aff70e6263 100644 +--- a/migration/savevm.c ++++ b/migration/savevm.c +@@ -2358,6 +2358,21 @@ static int loadvm_process_command(QEMUFile *f) + error_report("CMD_OPEN_RETURN_PATH failed"); + return -1; + } ++ ++ /* ++ * Switchover ack is enabled but no device uses it, so send an ACK to ++ * source that it's OK to switchover. Do it here, after return path has ++ * been created. 
++ */ ++ if (migrate_switchover_ack() && !mis->switchover_ack_pending_num) { ++ int ret = migrate_send_rp_switchover_ack(mis); ++ if (ret) { ++ error_report( ++ "Could not send switchover ack RP MSG, err %d (%s)", ret, ++ strerror(-ret)); ++ return ret; ++ } ++ } + break; + + case MIG_CMD_PING: +@@ -2584,6 +2599,23 @@ static int qemu_loadvm_state_header(QEMUFile *f) + return 0; + } + ++static void qemu_loadvm_state_switchover_ack_needed(MigrationIncomingState *mis) ++{ ++ SaveStateEntry *se; ++ ++ QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { ++ if (!se->ops || !se->ops->switchover_ack_needed) { ++ continue; ++ } ++ ++ if (se->ops->switchover_ack_needed(se->opaque)) { ++ mis->switchover_ack_pending_num++; ++ } ++ } ++ ++ trace_loadvm_state_switchover_ack_needed(mis->switchover_ack_pending_num); ++} ++ + static int qemu_loadvm_state_setup(QEMUFile *f) + { + SaveStateEntry *se; +@@ -2787,6 +2819,10 @@ int qemu_loadvm_state(QEMUFile *f) + return -EINVAL; + } + ++ if (migrate_switchover_ack()) { ++ qemu_loadvm_state_switchover_ack_needed(mis); ++ } ++ + cpu_synchronize_all_pre_loadvm(); + + ret = qemu_loadvm_state_main(f, mis); +@@ -2860,6 +2896,24 @@ int qemu_load_device_state(QEMUFile *f) + return 0; + } + ++int qemu_loadvm_approve_switchover(void) ++{ ++ MigrationIncomingState *mis = migration_incoming_get_current(); ++ ++ if (!mis->switchover_ack_pending_num) { ++ return -EINVAL; ++ } ++ ++ mis->switchover_ack_pending_num--; ++ trace_loadvm_approve_switchover(mis->switchover_ack_pending_num); ++ ++ if (mis->switchover_ack_pending_num) { ++ return 0; ++ } ++ ++ return migrate_send_rp_switchover_ack(mis); ++} ++ + bool save_snapshot(const char *name, bool overwrite, const char *vmstate, + bool has_devices, strList *devices, Error **errp) + { +diff --git a/migration/savevm.h b/migration/savevm.h +index fb636735f0..e894bbc143 100644 +--- a/migration/savevm.h ++++ b/migration/savevm.h +@@ -65,6 +65,7 @@ int qemu_loadvm_state(QEMUFile *f); + void 
qemu_loadvm_state_cleanup(void); + int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); + int qemu_load_device_state(QEMUFile *f); ++int qemu_loadvm_approve_switchover(void); + int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, + bool in_postcopy, bool inactivate_disks); + +diff --git a/migration/trace-events b/migration/trace-events +index 92161eeac5..cda807d271 100644 +--- a/migration/trace-events ++++ b/migration/trace-events +@@ -7,6 +7,7 @@ qemu_loadvm_state_section_partend(uint32_t section_id) "%u" + qemu_loadvm_state_post_main(int ret) "%d" + qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u" + qemu_savevm_send_packaged(void) "" ++loadvm_state_switchover_ack_needed(unsigned int switchover_ack_pending_num) "Switchover ack pending num=%u" + loadvm_state_setup(void) "" + loadvm_state_cleanup(void) "" + loadvm_handle_cmd_packaged(unsigned int length) "%u" +@@ -23,6 +24,7 @@ loadvm_postcopy_ram_handle_discard_end(void) "" + loadvm_postcopy_ram_handle_discard_header(const char *ramid, uint16_t len) "%s: %ud" + loadvm_process_command(const char *s, uint16_t len) "com=%s len=%d" + loadvm_process_command_ping(uint32_t val) "0x%x" ++loadvm_approve_switchover(unsigned int switchover_ack_pending_num) "Switchover ack pending num=%u" + postcopy_ram_listen_thread_exit(void) "" + postcopy_ram_listen_thread_start(void) "" + qemu_savevm_send_postcopy_advise(void) "" +@@ -180,6 +182,7 @@ source_return_path_thread_loop_top(void) "" + source_return_path_thread_pong(uint32_t val) "0x%x" + source_return_path_thread_shut(uint32_t val) "0x%x" + source_return_path_thread_resume_ack(uint32_t v) "%"PRIu32 ++source_return_path_thread_switchover_acked(void) "" + migration_thread_low_pending(uint64_t pending) "%" PRIu64 + migrate_transferred(uint64_t tranferred, uint64_t time_spent, uint64_t bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %" 
PRIu64 " max_size %" PRId64 + process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d" +-- +2.39.3 + diff --git a/kvm-migration-Make-all-functions-check-have-the-same-for.patch b/kvm-migration-Make-all-functions-check-have-the-same-for.patch new file mode 100644 index 0000000..f873f3f --- /dev/null +++ b/kvm-migration-Make-all-functions-check-have-the-same-for.patch @@ -0,0 +1,431 @@ +From eaccfc91b34f93dcaf597e6b39f78741da618ff3 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 08/37] migration: Make all functions check have the same + format +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [6/28] 774df2a81502d3eab5d5b8f64fa9b69f8be43669 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit 8f9c532756c5 +Author: Juan Quintela +Date: Wed Mar 1 23:11:08 2023 +0100 + + migration: Make all functions check have the same format + + Signed-off-by: Juan Quintela + Reviewed-by: Vladimir Sementsov-Ogievskiy + +Signed-off-by: Cédric Le Goater +--- + migration/options.c | 153 +++++++++++--------------------------------- + 1 file changed, 39 insertions(+), 114 deletions(-) + +diff --git a/migration/options.c b/migration/options.c +index e51d667e14..bcfe244fa9 100644 +--- a/migration/options.c ++++ b/migration/options.c +@@ -33,27 +33,21 @@ + + bool migrate_auto_converge(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE]; + } + + bool migrate_background_snapshot(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return 
s->capabilities[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]; + } + + bool migrate_block(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_BLOCK]; + } +@@ -61,95 +55,76 @@ bool migrate_block(void) + bool migrate_colo(void) + { + MigrationState *s = migrate_get_current(); ++ + return s->capabilities[MIGRATION_CAPABILITY_X_COLO]; + } + + bool migrate_compress(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_COMPRESS]; + } + + bool migrate_dirty_bitmaps(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_DIRTY_BITMAPS]; + } + + bool migrate_events(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_EVENTS]; + } + + bool migrate_ignore_shared(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_X_IGNORE_SHARED]; + } + + bool migrate_late_block_activate(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE]; + } + + bool migrate_multifd(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_MULTIFD]; + } + + bool migrate_pause_before_switchover(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER]; + } + + bool migrate_postcopy_blocktime(void) + { +- MigrationState *s; +- +- s = 
migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME]; + } + + bool migrate_postcopy_preempt(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT]; + } + + bool migrate_postcopy_ram(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM]; + } +@@ -163,54 +138,42 @@ bool migrate_rdma_pin_all(void) + + bool migrate_release_ram(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_RELEASE_RAM]; + } + + bool migrate_return_path(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_RETURN_PATH]; + } + + bool migrate_validate_uuid(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_VALIDATE_UUID]; + } + + bool migrate_xbzrle(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_XBZRLE]; + } + + bool migrate_zero_blocks(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS]; + } + + bool migrate_zero_copy_send(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_ZERO_COPY_SEND]; + } +@@ -224,9 +187,7 @@ bool migrate_postcopy(void) + + bool migrate_tls(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ 
MigrationState *s = migrate_get_current(); + + return s->parameters.tls_creds && *s->parameters.tls_creds; + } +@@ -491,126 +452,98 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, + + bool migrate_block_incremental(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->parameters.block_incremental; + } + + uint32_t migrate_checkpoint_delay(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->parameters.x_checkpoint_delay; + } + + int migrate_compress_level(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->parameters.compress_level; + } + + int migrate_compress_threads(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->parameters.compress_threads; + } + + int migrate_compress_wait_thread(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->parameters.compress_wait_thread; + } + + uint8_t migrate_cpu_throttle_increment(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->parameters.cpu_throttle_increment; + } + + uint8_t migrate_cpu_throttle_initial(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->parameters.cpu_throttle_initial; + } + + bool migrate_cpu_throttle_tailslow(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->parameters.cpu_throttle_tailslow; + } + + int migrate_decompress_threads(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->parameters.decompress_threads; + } + + uint8_t 
migrate_max_cpu_throttle(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->parameters.max_cpu_throttle; + } + + uint64_t migrate_max_bandwidth(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->parameters.max_bandwidth; + } + + int64_t migrate_max_postcopy_bandwidth(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->parameters.max_postcopy_bandwidth; + } + + int migrate_multifd_channels(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->parameters.multifd_channels; + } + + MultiFDCompression migrate_multifd_compression(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + assert(s->parameters.multifd_compression < MULTIFD_COMPRESSION__MAX); + return s->parameters.multifd_compression; +@@ -618,36 +551,28 @@ MultiFDCompression migrate_multifd_compression(void) + + int migrate_multifd_zlib_level(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->parameters.multifd_zlib_level; + } + + int migrate_multifd_zstd_level(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->parameters.multifd_zstd_level; + } + + uint8_t migrate_throttle_trigger_threshold(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->parameters.throttle_trigger_threshold; + } + + uint64_t migrate_xbzrle_cache_size(void) + { +- MigrationState *s; +- +- s = migrate_get_current(); ++ MigrationState *s = migrate_get_current(); + + return s->parameters.xbzrle_cache_size; + } +-- +2.39.3 + diff --git 
a/kvm-migration-Move-migration_properties-to-options.c.patch b/kvm-migration-Move-migration_properties-to-options.c.patch new file mode 100644 index 0000000..145b510 --- /dev/null +++ b/kvm-migration-Move-migration_properties-to-options.c.patch @@ -0,0 +1,409 @@ +From 0911e025a9dc8a0c85944ac11fb9df72e5ad0677 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 09/37] migration: Move migration_properties to options.c +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [7/28] ff07358afa0c90f13125b177b0e08c74ef1b9905 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit f9436522c8dd +Author: Juan Quintela +Date: Thu Mar 2 12:55:57 2023 +0100 + + migration: Move migration_properties to options.c + + Signed-off-by: Juan Quintela + Reviewed-by: Vladimir Sementsov-Ogievskiy + +Signed-off-by: Cédric Le Goater +--- + migration/migration.c | 157 ------------------------------------------ + migration/options.c | 155 +++++++++++++++++++++++++++++++++++++++++ + migration/options.h | 7 ++ + 3 files changed, 162 insertions(+), 157 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index 08f87f2b0e..1ac5f19bc2 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -52,8 +52,6 @@ + #include "io/channel-tls.h" + #include "migration/colo.h" + #include "hw/boards.h" +-#include "hw/qdev-properties.h" +-#include "hw/qdev-properties-system.h" + #include "monitor/monitor.h" + #include "net/announce.h" + #include "qemu/queue.h" +@@ -65,51 +63,6 @@ + #include "sysemu/qtest.h" + #include "options.h" + +-#define MAX_THROTTLE (128 << 20) /* Migration transfer speed throttling */ +- +-/* Time in milliseconds we are allowed to stop the source, +- 
* for sending the last part */ +-#define DEFAULT_MIGRATE_SET_DOWNTIME 300 +- +-/* Default compression thread count */ +-#define DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT 8 +-/* Default decompression thread count, usually decompression is at +- * least 4 times as fast as compression.*/ +-#define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2 +-/*0: means nocompress, 1: best speed, ... 9: best compress ratio */ +-#define DEFAULT_MIGRATE_COMPRESS_LEVEL 1 +-/* Define default autoconverge cpu throttle migration parameters */ +-#define DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD 50 +-#define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20 +-#define DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT 10 +-#define DEFAULT_MIGRATE_MAX_CPU_THROTTLE 99 +- +-/* Migration XBZRLE default cache size */ +-#define DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE (64 * 1024 * 1024) +- +-/* The delay time (in ms) between two COLO checkpoints */ +-#define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY (200 * 100) +-#define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2 +-#define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE +-/* 0: means nocompress, 1: best speed, ... 9: best compress ratio */ +-#define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1 +-/* 0: means nocompress, 1: best speed, ... 20: best compress ratio */ +-#define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1 +- +-/* Background transfer rate for postcopy, 0 means unlimited, note +- * that page requests can still exceed this limit. +- */ +-#define DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH 0 +- +-/* +- * Parameters for self_announce_delay giving a stream of RARP/ARP +- * packets after migration. 
+- */ +-#define DEFAULT_MIGRATE_ANNOUNCE_INITIAL 50 +-#define DEFAULT_MIGRATE_ANNOUNCE_MAX 550 +-#define DEFAULT_MIGRATE_ANNOUNCE_ROUNDS 5 +-#define DEFAULT_MIGRATE_ANNOUNCE_STEP 100 +- + static NotifierList migration_state_notifiers = + NOTIFIER_LIST_INITIALIZER(migration_state_notifiers); + +@@ -3336,116 +3289,6 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) + s->migration_thread_running = true; + } + +-#define DEFINE_PROP_MIG_CAP(name, x) \ +- DEFINE_PROP_BOOL(name, MigrationState, capabilities[x], false) +- +-static Property migration_properties[] = { +- DEFINE_PROP_BOOL("store-global-state", MigrationState, +- store_global_state, true), +- DEFINE_PROP_BOOL("send-configuration", MigrationState, +- send_configuration, true), +- DEFINE_PROP_BOOL("send-section-footer", MigrationState, +- send_section_footer, true), +- DEFINE_PROP_BOOL("decompress-error-check", MigrationState, +- decompress_error_check, true), +- DEFINE_PROP_UINT8("x-clear-bitmap-shift", MigrationState, +- clear_bitmap_shift, CLEAR_BITMAP_SHIFT_DEFAULT), +- DEFINE_PROP_BOOL("x-preempt-pre-7-2", MigrationState, +- preempt_pre_7_2, false), +- +- /* Migration parameters */ +- DEFINE_PROP_UINT8("x-compress-level", MigrationState, +- parameters.compress_level, +- DEFAULT_MIGRATE_COMPRESS_LEVEL), +- DEFINE_PROP_UINT8("x-compress-threads", MigrationState, +- parameters.compress_threads, +- DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT), +- DEFINE_PROP_BOOL("x-compress-wait-thread", MigrationState, +- parameters.compress_wait_thread, true), +- DEFINE_PROP_UINT8("x-decompress-threads", MigrationState, +- parameters.decompress_threads, +- DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT), +- DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState, +- parameters.throttle_trigger_threshold, +- DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD), +- DEFINE_PROP_UINT8("x-cpu-throttle-initial", MigrationState, +- parameters.cpu_throttle_initial, +- DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL), +- 
DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState, +- parameters.cpu_throttle_increment, +- DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT), +- DEFINE_PROP_BOOL("x-cpu-throttle-tailslow", MigrationState, +- parameters.cpu_throttle_tailslow, false), +- DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState, +- parameters.max_bandwidth, MAX_THROTTLE), +- DEFINE_PROP_UINT64("x-downtime-limit", MigrationState, +- parameters.downtime_limit, +- DEFAULT_MIGRATE_SET_DOWNTIME), +- DEFINE_PROP_UINT32("x-checkpoint-delay", MigrationState, +- parameters.x_checkpoint_delay, +- DEFAULT_MIGRATE_X_CHECKPOINT_DELAY), +- DEFINE_PROP_UINT8("multifd-channels", MigrationState, +- parameters.multifd_channels, +- DEFAULT_MIGRATE_MULTIFD_CHANNELS), +- DEFINE_PROP_MULTIFD_COMPRESSION("multifd-compression", MigrationState, +- parameters.multifd_compression, +- DEFAULT_MIGRATE_MULTIFD_COMPRESSION), +- DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState, +- parameters.multifd_zlib_level, +- DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL), +- DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState, +- parameters.multifd_zstd_level, +- DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL), +- DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState, +- parameters.xbzrle_cache_size, +- DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE), +- DEFINE_PROP_SIZE("max-postcopy-bandwidth", MigrationState, +- parameters.max_postcopy_bandwidth, +- DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH), +- DEFINE_PROP_UINT8("max-cpu-throttle", MigrationState, +- parameters.max_cpu_throttle, +- DEFAULT_MIGRATE_MAX_CPU_THROTTLE), +- DEFINE_PROP_SIZE("announce-initial", MigrationState, +- parameters.announce_initial, +- DEFAULT_MIGRATE_ANNOUNCE_INITIAL), +- DEFINE_PROP_SIZE("announce-max", MigrationState, +- parameters.announce_max, +- DEFAULT_MIGRATE_ANNOUNCE_MAX), +- DEFINE_PROP_SIZE("announce-rounds", MigrationState, +- parameters.announce_rounds, +- DEFAULT_MIGRATE_ANNOUNCE_ROUNDS), +- DEFINE_PROP_SIZE("announce-step", MigrationState, +- parameters.announce_step, +- 
DEFAULT_MIGRATE_ANNOUNCE_STEP), +- DEFINE_PROP_STRING("tls-creds", MigrationState, parameters.tls_creds), +- DEFINE_PROP_STRING("tls-hostname", MigrationState, parameters.tls_hostname), +- DEFINE_PROP_STRING("tls-authz", MigrationState, parameters.tls_authz), +- +- /* Migration capabilities */ +- DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE), +- DEFINE_PROP_MIG_CAP("x-rdma-pin-all", MIGRATION_CAPABILITY_RDMA_PIN_ALL), +- DEFINE_PROP_MIG_CAP("x-auto-converge", MIGRATION_CAPABILITY_AUTO_CONVERGE), +- DEFINE_PROP_MIG_CAP("x-zero-blocks", MIGRATION_CAPABILITY_ZERO_BLOCKS), +- DEFINE_PROP_MIG_CAP("x-compress", MIGRATION_CAPABILITY_COMPRESS), +- DEFINE_PROP_MIG_CAP("x-events", MIGRATION_CAPABILITY_EVENTS), +- DEFINE_PROP_MIG_CAP("x-postcopy-ram", MIGRATION_CAPABILITY_POSTCOPY_RAM), +- DEFINE_PROP_MIG_CAP("x-postcopy-preempt", +- MIGRATION_CAPABILITY_POSTCOPY_PREEMPT), +- DEFINE_PROP_MIG_CAP("x-colo", MIGRATION_CAPABILITY_X_COLO), +- DEFINE_PROP_MIG_CAP("x-release-ram", MIGRATION_CAPABILITY_RELEASE_RAM), +- DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK), +- DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH), +- DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_MULTIFD), +- DEFINE_PROP_MIG_CAP("x-background-snapshot", +- MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT), +-#ifdef CONFIG_LINUX +- DEFINE_PROP_MIG_CAP("x-zero-copy-send", +- MIGRATION_CAPABILITY_ZERO_COPY_SEND), +-#endif +- +- DEFINE_PROP_END_OF_LIST(), +-}; +- + static void migration_class_init(ObjectClass *klass, void *data) + { + DeviceClass *dc = DEVICE_CLASS(klass); +diff --git a/migration/options.c b/migration/options.c +index bcfe244fa9..a76984276d 100644 +--- a/migration/options.c ++++ b/migration/options.c +@@ -31,6 +31,161 @@ + #define MAX_MIGRATE_DOWNTIME_SECONDS 2000 + #define MAX_MIGRATE_DOWNTIME (MAX_MIGRATE_DOWNTIME_SECONDS * 1000) + ++#define MAX_THROTTLE (128 << 20) /* Migration transfer speed throttling */ ++ ++/* Time in milliseconds we are allowed 
to stop the source, ++ * for sending the last part */ ++#define DEFAULT_MIGRATE_SET_DOWNTIME 300 ++ ++/* Default compression thread count */ ++#define DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT 8 ++/* Default decompression thread count, usually decompression is at ++ * least 4 times as fast as compression.*/ ++#define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2 ++/*0: means nocompress, 1: best speed, ... 9: best compress ratio */ ++#define DEFAULT_MIGRATE_COMPRESS_LEVEL 1 ++/* Define default autoconverge cpu throttle migration parameters */ ++#define DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD 50 ++#define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20 ++#define DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT 10 ++#define DEFAULT_MIGRATE_MAX_CPU_THROTTLE 99 ++ ++/* Migration XBZRLE default cache size */ ++#define DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE (64 * 1024 * 1024) ++ ++/* The delay time (in ms) between two COLO checkpoints */ ++#define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY (200 * 100) ++#define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2 ++#define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE ++/* 0: means nocompress, 1: best speed, ... 9: best compress ratio */ ++#define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1 ++/* 0: means nocompress, 1: best speed, ... 20: best compress ratio */ ++#define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1 ++ ++/* Background transfer rate for postcopy, 0 means unlimited, note ++ * that page requests can still exceed this limit. ++ */ ++#define DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH 0 ++ ++/* ++ * Parameters for self_announce_delay giving a stream of RARP/ARP ++ * packets after migration. 
++ */ ++#define DEFAULT_MIGRATE_ANNOUNCE_INITIAL 50 ++#define DEFAULT_MIGRATE_ANNOUNCE_MAX 550 ++#define DEFAULT_MIGRATE_ANNOUNCE_ROUNDS 5 ++#define DEFAULT_MIGRATE_ANNOUNCE_STEP 100 ++ ++#define DEFINE_PROP_MIG_CAP(name, x) \ ++ DEFINE_PROP_BOOL(name, MigrationState, capabilities[x], false) ++ ++Property migration_properties[] = { ++ DEFINE_PROP_BOOL("store-global-state", MigrationState, ++ store_global_state, true), ++ DEFINE_PROP_BOOL("send-configuration", MigrationState, ++ send_configuration, true), ++ DEFINE_PROP_BOOL("send-section-footer", MigrationState, ++ send_section_footer, true), ++ DEFINE_PROP_BOOL("decompress-error-check", MigrationState, ++ decompress_error_check, true), ++ DEFINE_PROP_UINT8("x-clear-bitmap-shift", MigrationState, ++ clear_bitmap_shift, CLEAR_BITMAP_SHIFT_DEFAULT), ++ DEFINE_PROP_BOOL("x-preempt-pre-7-2", MigrationState, ++ preempt_pre_7_2, false), ++ ++ /* Migration parameters */ ++ DEFINE_PROP_UINT8("x-compress-level", MigrationState, ++ parameters.compress_level, ++ DEFAULT_MIGRATE_COMPRESS_LEVEL), ++ DEFINE_PROP_UINT8("x-compress-threads", MigrationState, ++ parameters.compress_threads, ++ DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT), ++ DEFINE_PROP_BOOL("x-compress-wait-thread", MigrationState, ++ parameters.compress_wait_thread, true), ++ DEFINE_PROP_UINT8("x-decompress-threads", MigrationState, ++ parameters.decompress_threads, ++ DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT), ++ DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState, ++ parameters.throttle_trigger_threshold, ++ DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD), ++ DEFINE_PROP_UINT8("x-cpu-throttle-initial", MigrationState, ++ parameters.cpu_throttle_initial, ++ DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL), ++ DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState, ++ parameters.cpu_throttle_increment, ++ DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT), ++ DEFINE_PROP_BOOL("x-cpu-throttle-tailslow", MigrationState, ++ parameters.cpu_throttle_tailslow, false), ++ 
DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState, ++ parameters.max_bandwidth, MAX_THROTTLE), ++ DEFINE_PROP_UINT64("x-downtime-limit", MigrationState, ++ parameters.downtime_limit, ++ DEFAULT_MIGRATE_SET_DOWNTIME), ++ DEFINE_PROP_UINT32("x-checkpoint-delay", MigrationState, ++ parameters.x_checkpoint_delay, ++ DEFAULT_MIGRATE_X_CHECKPOINT_DELAY), ++ DEFINE_PROP_UINT8("multifd-channels", MigrationState, ++ parameters.multifd_channels, ++ DEFAULT_MIGRATE_MULTIFD_CHANNELS), ++ DEFINE_PROP_MULTIFD_COMPRESSION("multifd-compression", MigrationState, ++ parameters.multifd_compression, ++ DEFAULT_MIGRATE_MULTIFD_COMPRESSION), ++ DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState, ++ parameters.multifd_zlib_level, ++ DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL), ++ DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState, ++ parameters.multifd_zstd_level, ++ DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL), ++ DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState, ++ parameters.xbzrle_cache_size, ++ DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE), ++ DEFINE_PROP_SIZE("max-postcopy-bandwidth", MigrationState, ++ parameters.max_postcopy_bandwidth, ++ DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH), ++ DEFINE_PROP_UINT8("max-cpu-throttle", MigrationState, ++ parameters.max_cpu_throttle, ++ DEFAULT_MIGRATE_MAX_CPU_THROTTLE), ++ DEFINE_PROP_SIZE("announce-initial", MigrationState, ++ parameters.announce_initial, ++ DEFAULT_MIGRATE_ANNOUNCE_INITIAL), ++ DEFINE_PROP_SIZE("announce-max", MigrationState, ++ parameters.announce_max, ++ DEFAULT_MIGRATE_ANNOUNCE_MAX), ++ DEFINE_PROP_SIZE("announce-rounds", MigrationState, ++ parameters.announce_rounds, ++ DEFAULT_MIGRATE_ANNOUNCE_ROUNDS), ++ DEFINE_PROP_SIZE("announce-step", MigrationState, ++ parameters.announce_step, ++ DEFAULT_MIGRATE_ANNOUNCE_STEP), ++ DEFINE_PROP_STRING("tls-creds", MigrationState, parameters.tls_creds), ++ DEFINE_PROP_STRING("tls-hostname", MigrationState, parameters.tls_hostname), ++ DEFINE_PROP_STRING("tls-authz", MigrationState, 
parameters.tls_authz), ++ ++ /* Migration capabilities */ ++ DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE), ++ DEFINE_PROP_MIG_CAP("x-rdma-pin-all", MIGRATION_CAPABILITY_RDMA_PIN_ALL), ++ DEFINE_PROP_MIG_CAP("x-auto-converge", MIGRATION_CAPABILITY_AUTO_CONVERGE), ++ DEFINE_PROP_MIG_CAP("x-zero-blocks", MIGRATION_CAPABILITY_ZERO_BLOCKS), ++ DEFINE_PROP_MIG_CAP("x-compress", MIGRATION_CAPABILITY_COMPRESS), ++ DEFINE_PROP_MIG_CAP("x-events", MIGRATION_CAPABILITY_EVENTS), ++ DEFINE_PROP_MIG_CAP("x-postcopy-ram", MIGRATION_CAPABILITY_POSTCOPY_RAM), ++ DEFINE_PROP_MIG_CAP("x-postcopy-preempt", ++ MIGRATION_CAPABILITY_POSTCOPY_PREEMPT), ++ DEFINE_PROP_MIG_CAP("x-colo", MIGRATION_CAPABILITY_X_COLO), ++ DEFINE_PROP_MIG_CAP("x-release-ram", MIGRATION_CAPABILITY_RELEASE_RAM), ++ DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK), ++ DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH), ++ DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_MULTIFD), ++ DEFINE_PROP_MIG_CAP("x-background-snapshot", ++ MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT), ++#ifdef CONFIG_LINUX ++ DEFINE_PROP_MIG_CAP("x-zero-copy-send", ++ MIGRATION_CAPABILITY_ZERO_COPY_SEND), ++#endif ++ ++ DEFINE_PROP_END_OF_LIST(), ++}; ++ + bool migrate_auto_converge(void) + { + MigrationState *s = migrate_get_current(); +diff --git a/migration/options.h b/migration/options.h +index 89067e59a0..7b0f7245ad 100644 +--- a/migration/options.h ++++ b/migration/options.h +@@ -14,6 +14,9 @@ + #ifndef QEMU_MIGRATION_OPTIONS_H + #define QEMU_MIGRATION_OPTIONS_H + ++#include "hw/qdev-properties.h" ++#include "hw/qdev-properties-system.h" ++ + /* constants */ + + /* Amount of time to allocate to each "chunk" of bandwidth-throttled +@@ -21,6 +24,10 @@ + #define BUFFER_DELAY 100 + #define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY) + ++/* migration properties */ ++ ++extern Property migration_properties[]; ++ + /* capabilities */ + + bool migrate_auto_converge(void); +-- +2.39.3 + diff --git 
a/kvm-pc-bios-s390-ccw-Don-t-use-__bss_start-with-the-larl.patch b/kvm-pc-bios-s390-ccw-Don-t-use-__bss_start-with-the-larl.patch new file mode 100644 index 0000000..312af68 --- /dev/null +++ b/kvm-pc-bios-s390-ccw-Don-t-use-__bss_start-with-the-larl.patch @@ -0,0 +1,78 @@ +From 7495a51c586818925470fb247882f5ba0f7b0ffd Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 27 Jun 2023 09:47:03 +0200 +Subject: [PATCH 34/37] pc-bios/s390-ccw: Don't use __bss_start with the "larl" + instruction +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +RH-MergeRequest: 180: Fix misaligned symbol error in the s390-ccw image during qemu-kvm build with binutils 2.40 +RH-Bugzilla: 2220866 +RH-Acked-by: Cédric Le Goater +RH-Acked-by: Miroslav Rezanina +RH-Commit: [4/4] 2483a50c0ed37fa29db649ec44220ac83c215698 (thuth/qemu-kvm-cs9) + +start.S currently cannot be compiled with Clang 16 and binutils 2.40: + + ld: start.o(.text+0x8): misaligned symbol `__bss_start' (0xc1e5) for + relocation R_390_PC32DBL + +According to the built-in linker script of ld, the symbol __bss_start +can actually point *before* the .bss section and does not need to have +any alignment, so in certain situations (like when using the internal +assembler of Clang), the __bss_start symbol can indeed be unaligned +and thus it is not suitable for being used with the "larl" instruction +that needs an address that is at least aligned to halfwords. +The problem went unnoticed so far since binutils <= 2.39 did not +check the alignment, but starting with binutils 2.40, such unaligned +addresses are now refused. + +Fix it by loading the address indirectly instead. 
+ +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2216662 +Reported-by: Miroslav Rezanina +Suggested-by: Andreas Krebbel +Message-Id: <20230629104821.194859-8-thuth@redhat.com> +Reviewed-by: Claudio Imbrenda +Signed-off-by: Thomas Huth +(cherry picked from commit 7cd50cbe4ca3e2860b31b06ec92c17c54bd82d48) +--- + pc-bios/s390-ccw/start.S | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/pc-bios/s390-ccw/start.S b/pc-bios/s390-ccw/start.S +index abd6fe6639..22c1c296df 100644 +--- a/pc-bios/s390-ccw/start.S ++++ b/pc-bios/s390-ccw/start.S +@@ -19,7 +19,8 @@ _start: + larl %r15,stack + STACK_SIZE - STACK_FRAME_SIZE /* Set up stack */ + + /* clear bss */ +- larl %r2,__bss_start ++ larl %r2,bss_start_literal /* __bss_start might be unaligned ... */ ++ lg %r2,0(%r2) /* ... so load it indirectly */ + larl %r3,_end + slgr %r3,%r2 /* get sizeof bss */ + ltgr %r3,%r3 /* bss empty? */ +@@ -45,7 +46,6 @@ done: + memsetxc: + xc 0(1,%r1),0(%r1) + +- + /* + * void disabled_wait(void) + * +@@ -113,6 +113,8 @@ io_new_code: + br %r14 + + .align 8 ++bss_start_literal: ++ .quad __bss_start + disabled_wait_psw: + .quad 0x0002000180000000,0x0000000000000000 + enabled_wait_psw: +-- +2.39.3 + diff --git a/kvm-pc-bios-s390-ccw-Fix-indentation-in-start.S.patch b/kvm-pc-bios-s390-ccw-Fix-indentation-in-start.S.patch new file mode 100644 index 0000000..bd13187 --- /dev/null +++ b/kvm-pc-bios-s390-ccw-Fix-indentation-in-start.S.patch @@ -0,0 +1,218 @@ +From 24bc8fc932ae1c88cc2e97f0f90786a7be411bb2 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 27 Jun 2023 09:47:00 +0200 +Subject: [PATCH 32/37] pc-bios/s390-ccw: Fix indentation in start.S +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth +RH-MergeRequest: 180: Fix misaligned symbol error in the s390-ccw image during qemu-kvm build with binutils 2.40 +RH-Bugzilla: 2220866 +RH-Acked-by: Cédric Le Goater +RH-Acked-by: Miroslav Rezanina 
+RH-Commit: [2/4] cf8fa053602ce1cfac0b6efa67f491688d4f9348 (thuth/qemu-kvm-cs9) + +start.S is currently indented with a mixture of spaces and tabs, which +is quite ugly. QEMU coding style says indentation should be 4 spaces, +and this is also what we are using in the assembler files in the +tests/tcg/s390x/ folder already, so let's adjust start.S accordingly. + +Reviewed-by: Cédric Le Goater +Message-Id: <20230627074703.99608-2-thuth@redhat.com> +Reviewed-by: Claudio Imbrenda +Reviewed-by: Eric Farman +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Thomas Huth +(cherry picked from commit f52420fa4fd9f519dc42c20d2616aba4149adc25) +--- + pc-bios/s390-ccw/start.S | 136 +++++++++++++++++++-------------------- + 1 file changed, 68 insertions(+), 68 deletions(-) + +diff --git a/pc-bios/s390-ccw/start.S b/pc-bios/s390-ccw/start.S +index 6072906df4..d29de09cc6 100644 +--- a/pc-bios/s390-ccw/start.S ++++ b/pc-bios/s390-ccw/start.S +@@ -10,37 +10,37 @@ + * directory. + */ + +- .globl _start ++ .globl _start + _start: + +- larl %r15, stack + 0x8000 /* Set up stack */ ++ larl %r15,stack + 0x8000 /* Set up stack */ + +- /* clear bss */ +- larl %r2, __bss_start +- larl %r3, _end +- slgr %r3, %r2 /* get sizeof bss */ +- ltgr %r3,%r3 /* bss empty? */ +- jz done +- aghi %r3,-1 +- srlg %r4,%r3,8 /* how many 256 byte chunks? */ +- ltgr %r4,%r4 +- lgr %r1,%r2 +- jz remainder ++ /* clear bss */ ++ larl %r2,__bss_start ++ larl %r3,_end ++ slgr %r3,%r2 /* get sizeof bss */ ++ ltgr %r3,%r3 /* bss empty? */ ++ jz done ++ aghi %r3,-1 ++ srlg %r4,%r3,8 /* how many 256 byte chunks? 
*/ ++ ltgr %r4,%r4 ++ lgr %r1,%r2 ++ jz remainder + loop: +- xc 0(256,%r1),0(%r1) +- la %r1,256(%r1) +- brctg %r4,loop ++ xc 0(256,%r1),0(%r1) ++ la %r1,256(%r1) ++ brctg %r4,loop + remainder: +- larl %r2,memsetxc +- ex %r3,0(%r2) ++ larl %r2,memsetxc ++ ex %r3,0(%r2) + done: +- /* set up a pgm exception disabled wait psw */ +- larl %r2, disabled_wait_psw +- mvc 0x01d0(16), 0(%r2) +- j main /* And call C */ ++ /* set up a pgm exception disabled wait psw */ ++ larl %r2,disabled_wait_psw ++ mvc 0x01d0(16),0(%r2) ++ j main /* And call C */ + + memsetxc: +- xc 0(1,%r1),0(%r1) ++ xc 0(1,%r1),0(%r1) + + + /* +@@ -48,11 +48,11 @@ memsetxc: + * + * stops the current guest cpu. + */ +- .globl disabled_wait ++ .globl disabled_wait + disabled_wait: +- larl %r1,disabled_wait_psw +- lpswe 0(%r1) +-1: j 1b ++ larl %r1,disabled_wait_psw ++ lpswe 0(%r1) ++1: j 1b + + + /* +@@ -60,61 +60,61 @@ disabled_wait: + * + * eats one sclp interrupt + */ +- .globl consume_sclp_int ++ .globl consume_sclp_int + consume_sclp_int: +- /* enable service interrupts in cr0 */ +- stctg %c0,%c0,0(%r15) +- oi 6(%r15),0x2 +- lctlg %c0,%c0,0(%r15) +- /* prepare external call handler */ +- larl %r1, external_new_code +- stg %r1, 0x1b8 +- larl %r1, external_new_mask +- mvc 0x1b0(8),0(%r1) +- /* load enabled wait PSW */ +- larl %r1, enabled_wait_psw +- lpswe 0(%r1) ++ /* enable service interrupts in cr0 */ ++ stctg %c0,%c0,0(%r15) ++ oi 6(%r15),0x2 ++ lctlg %c0,%c0,0(%r15) ++ /* prepare external call handler */ ++ larl %r1,external_new_code ++ stg %r1,0x1b8 ++ larl %r1,external_new_mask ++ mvc 0x1b0(8),0(%r1) ++ /* load enabled wait PSW */ ++ larl %r1,enabled_wait_psw ++ lpswe 0(%r1) + + /* + * void consume_io_int(void) + * + * eats one I/O interrupt + */ +- .globl consume_io_int ++ .globl consume_io_int + consume_io_int: +- /* enable I/O interrupts in cr6 */ +- stctg %c6,%c6,0(%r15) +- oi 4(%r15), 0xff +- lctlg %c6,%c6,0(%r15) +- /* prepare i/o call handler */ +- larl %r1, io_new_code +- stg %r1, 0x1f8 +- 
larl %r1, io_new_mask +- mvc 0x1f0(8),0(%r1) +- /* load enabled wait PSW */ +- larl %r1, enabled_wait_psw +- lpswe 0(%r1) ++ /* enable I/O interrupts in cr6 */ ++ stctg %c6,%c6,0(%r15) ++ oi 4(%r15), 0xff ++ lctlg %c6,%c6,0(%r15) ++ /* prepare i/o call handler */ ++ larl %r1,io_new_code ++ stg %r1,0x1f8 ++ larl %r1,io_new_mask ++ mvc 0x1f0(8),0(%r1) ++ /* load enabled wait PSW */ ++ larl %r1,enabled_wait_psw ++ lpswe 0(%r1) + + external_new_code: +- /* disable service interrupts in cr0 */ +- stctg %c0,%c0,0(%r15) +- ni 6(%r15),0xfd +- lctlg %c0,%c0,0(%r15) +- br %r14 ++ /* disable service interrupts in cr0 */ ++ stctg %c0,%c0,0(%r15) ++ ni 6(%r15),0xfd ++ lctlg %c0,%c0,0(%r15) ++ br %r14 + + io_new_code: +- /* disable I/O interrupts in cr6 */ +- stctg %c6,%c6,0(%r15) +- ni 4(%r15), 0x00 +- lctlg %c6,%c6,0(%r15) +- br %r14 ++ /* disable I/O interrupts in cr6 */ ++ stctg %c6,%c6,0(%r15) ++ ni 4(%r15),0x00 ++ lctlg %c6,%c6,0(%r15) ++ br %r14 + +- .align 8 ++ .align 8 + disabled_wait_psw: +- .quad 0x0002000180000000,0x0000000000000000 ++ .quad 0x0002000180000000,0x0000000000000000 + enabled_wait_psw: +- .quad 0x0302000180000000,0x0000000000000000 ++ .quad 0x0302000180000000,0x0000000000000000 + external_new_mask: +- .quad 0x0000000180000000 ++ .quad 0x0000000180000000 + io_new_mask: +- .quad 0x0000000180000000 ++ .quad 0x0000000180000000 +-- +2.39.3 + diff --git a/kvm-pc-bios-s390-ccw-Makefile-Use-z-noexecstack-to-silen.patch b/kvm-pc-bios-s390-ccw-Makefile-Use-z-noexecstack-to-silen.patch new file mode 100644 index 0000000..907fe43 --- /dev/null +++ b/kvm-pc-bios-s390-ccw-Makefile-Use-z-noexecstack-to-silen.patch @@ -0,0 +1,50 @@ +From b5b243cbbb897b236c08699529e13457e1e49924 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Thu, 22 Jun 2023 15:08:22 +0200 +Subject: [PATCH 31/37] pc-bios/s390-ccw/Makefile: Use -z noexecstack to + silence linker warning +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas 
Huth +RH-MergeRequest: 180: Fix misaligned symbol error in the s390-ccw image during qemu-kvm build with binutils 2.40 +RH-Bugzilla: 2220866 +RH-Acked-by: Cédric Le Goater +RH-Acked-by: Miroslav Rezanina +RH-Commit: [1/4] 04f6f83169f1c5545a0e2772b4babfc6a50bd5bf (thuth/qemu-kvm-cs9) + +Recent versions of ld complain when linking the s390-ccw bios: + + /usr/bin/ld: warning: start.o: missing .note.GNU-stack section implies + executable stack + /usr/bin/ld: NOTE: This behaviour is deprecated and will be removed in + a future version of the linker + +We can silence the warning by telling the linker to mark the stack +as not executable. + +Message-Id: <20230622130822.396793-1-thuth@redhat.com> +Acked-by: Christian Borntraeger +Signed-off-by: Thomas Huth +(cherry picked from commit 442ef32ee5b6059a8f247fb2def9d449578d0a89) +--- + pc-bios/s390-ccw/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/pc-bios/s390-ccw/Makefile b/pc-bios/s390-ccw/Makefile +index 10e8f5cb63..2a590af4a9 100644 +--- a/pc-bios/s390-ccw/Makefile ++++ b/pc-bios/s390-ccw/Makefile +@@ -53,7 +53,7 @@ config-cc.mak: Makefile + $(call cc-option,-march=z900,-march=z10)) 3> config-cc.mak + -include config-cc.mak + +-LDFLAGS += -Wl,-pie -nostdlib ++LDFLAGS += -Wl,-pie -nostdlib -z noexecstack + + build-all: s390-ccw.img s390-netboot.img + +-- +2.39.3 + diff --git a/kvm-pc-bios-s390-ccw-Provide-space-for-initial-stack-fra.patch b/kvm-pc-bios-s390-ccw-Provide-space-for-initial-stack-fra.patch new file mode 100644 index 0000000..0c4ce6f --- /dev/null +++ b/kvm-pc-bios-s390-ccw-Provide-space-for-initial-stack-fra.patch @@ -0,0 +1,59 @@ +From 2c52aebf90f28121a3e46a9305304406023b9747 Mon Sep 17 00:00:00 2001 +From: Thomas Huth +Date: Tue, 27 Jun 2023 09:47:01 +0200 +Subject: [PATCH 33/37] pc-bios/s390-ccw: Provide space for initial stack frame + in start.S +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Thomas Huth 
+RH-MergeRequest: 180: Fix misaligned symbol error in the s390-ccw image during qemu-kvm build with binutils 2.40 +RH-Bugzilla: 2220866 +RH-Acked-by: Cédric Le Goater +RH-Acked-by: Miroslav Rezanina +RH-Commit: [3/4] c2f69ce5998861fe20b799bf0113def8cf0cd128 (thuth/qemu-kvm-cs9) + +Providing the space of a stack frame is the duty of the caller, +so we should reserve 160 bytes before jumping into the main function. +Otherwise the main() function might write past the stack array. + +While we're at it, add a proper STACK_SIZE macro for the stack size +instead of using magic numbers (this is also required for the following +patch). + +Reviewed-by: Christian Borntraeger +Reviewed-by: Cédric Le Goater +Message-Id: <20230627074703.99608-3-thuth@redhat.com> +Reviewed-by: Eric Farman +Reviewed-by: Claudio Imbrenda +Reviewed-by: Marc Hartmayer +Reviewed-by: Philippe Mathieu-Daudé +Signed-off-by: Thomas Huth +(cherry picked from commit 74fe98ee7fb3344dbd085d1fa32c0dc2fc2c831f) +--- + pc-bios/s390-ccw/start.S | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/pc-bios/s390-ccw/start.S b/pc-bios/s390-ccw/start.S +index d29de09cc6..abd6fe6639 100644 +--- a/pc-bios/s390-ccw/start.S ++++ b/pc-bios/s390-ccw/start.S +@@ -10,10 +10,13 @@ + * directory. 
+ */ + ++#define STACK_SIZE 0x8000 ++#define STACK_FRAME_SIZE 160 ++ + .globl _start + _start: + +- larl %r15,stack + 0x8000 /* Set up stack */ ++ larl %r15,stack + STACK_SIZE - STACK_FRAME_SIZE /* Set up stack */ + + /* clear bss */ + larl %r2,__bss_start +-- +2.39.3 + diff --git a/kvm-ui-Fix-pixel-colour-channel-order-for-PNG-screenshot.patch b/kvm-ui-Fix-pixel-colour-channel-order-for-PNG-screenshot.patch new file mode 100644 index 0000000..ef99b30 --- /dev/null +++ b/kvm-ui-Fix-pixel-colour-channel-order-for-PNG-screenshot.patch @@ -0,0 +1,88 @@ +From b998f8474846886fa1e0428fe79fe2a79231cc05 Mon Sep 17 00:00:00 2001 +From: Peter Maydell +Date: Fri, 12 May 2023 15:43:38 +0100 +Subject: [PATCH 35/37] ui: Fix pixel colour channel order for PNG screenshots +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Marc-André Lureau +RH-MergeRequest: 183: ui: Fix pixel colour channel order for PNG screenshots +RH-Bugzilla: 2222579 +RH-Acked-by: Miroslav Rezanina +RH-Commit: [1/1] 76acd3c5526639e70bc2998f584503c78fc9bc56 (marcandre.lureau-rh/qemu-kvm-centos) + +When we take a PNG screenshot the ordering of the colour channels in +the data is not correct, resulting in the image having weird +colouring compared to the actual display. (Specifically, on a +little-endian host the blue and red channels are swapped; on +big-endian everything is wrong.) + +This happens because the pixman idea of the pixel data and the libpng +idea differ. PIXMAN_a8r8g8b8 defines that pixels are 32-bit values, +with A in bits 24-31, R in bits 16-23, G in bits 8-15 and B in bits +0-7. 
This means that on little-endian systems the bytes in memory +are + B G R A +and on big-endian systems they are + A R G B + +libpng, on the other hand, thinks of pixels as being a series of +values for each channel, so its format PNG_COLOR_TYPE_RGB_ALPHA +always wants bytes in the order + R G B A + +This isn't the same as the pixman order for either big or little +endian hosts. + +The alpha channel is also unnecessary bulk in the output PNG file, +because there is no alpha information in a screenshot. + +To handle the endianness issue, we already define in ui/qemu-pixman.h +various PIXMAN_BE_* and PIXMAN_LE_* values that give consistent +byte-order pixel channel formats. So we can use PIXMAN_BE_r8g8b8 and +PNG_COLOR_TYPE_RGB, which both have an in-memory byte order of + R G B +and 3 bytes per pixel. + +(PPM format screenshots get this right; they already use the +PIXMAN_BE_r8g8b8 format.) + +Cc: qemu-stable@nongnu.org +Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1622 +Fixes: 9a0a119a382867 ("Added parameter to take screenshot with screendump as PNG") +Signed-off-by: Peter Maydell +Reviewed-by: Marc-André Lureau +Message-id: 20230502135548.2451309-1-peter.maydell@linaro.org + +(cherry picked from commit cd22a0f520f471e3bd33bc19cf3b2fa772cdb2a8) +Signed-off-by: Marc-André Lureau +--- + ui/console.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/ui/console.c b/ui/console.c +index 6e8a3cdc62..e173731e20 100644 +--- a/ui/console.c ++++ b/ui/console.c +@@ -311,7 +311,7 @@ static bool png_save(int fd, pixman_image_t *image, Error **errp) + png_struct *png_ptr; + png_info *info_ptr; + g_autoptr(pixman_image_t) linebuf = +- qemu_pixman_linebuf_create(PIXMAN_a8r8g8b8, width); ++ qemu_pixman_linebuf_create(PIXMAN_BE_r8g8b8, width); + uint8_t *buf = (uint8_t *)pixman_image_get_data(linebuf); + FILE *f = fdopen(fd, "wb"); + int y; +@@ -341,7 +341,7 @@ static bool png_save(int fd, pixman_image_t *image, Error **errp) + png_init_io(png_ptr, 
f); + + png_set_IHDR(png_ptr, info_ptr, width, height, 8, +- PNG_COLOR_TYPE_RGB_ALPHA, PNG_INTERLACE_NONE, ++ PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE, + PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE); + + png_write_info(png_ptr, info_ptr); +-- +2.39.3 + diff --git a/kvm-util-vfio-helpers-Use-g_file_read_link.patch b/kvm-util-vfio-helpers-Use-g_file_read_link.patch new file mode 100644 index 0000000..4e492d9 --- /dev/null +++ b/kvm-util-vfio-helpers-Use-g_file_read_link.patch @@ -0,0 +1,82 @@ +From fb2d40cc84f689e46138a81c57ccd1f234dbbb7c Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 07/37] util/vfio-helpers: Use g_file_read_link() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [5/28] 3545a07c967782dba8dd081415232f91d3f600a9 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit dbdea0dbfe2c +Author: Akihiko Odaki +Date: Tue May 23 11:39:12 2023 +0900 + + util/vfio-helpers: Use g_file_read_link() + + When _FORTIFY_SOURCE=2, glibc version is 2.35, and GCC version is + 12.1.0, the compiler complains as follows: + + In file included from /usr/include/features.h:490, + from /usr/include/bits/libc-header-start.h:33, + from /usr/include/stdint.h:26, + from /usr/lib/gcc/aarch64-unknown-linux-gnu/12.1.0/include/stdint.h:9, + from /home/alarm/q/var/qemu/include/qemu/osdep.h:94, + from ../util/vfio-helpers.c:13: + In function 'readlink', + inlined from 'sysfs_find_group_file' at ../util/vfio-helpers.c:116:9, + inlined from 'qemu_vfio_init_pci' at ../util/vfio-helpers.c:326:18, + inlined from 'qemu_vfio_open_pci' at ../util/vfio-helpers.c:517:9: + /usr/include/bits/unistd.h:119:10: error: argument 2 is null but the corresponding size argument 3 value is 
4095 [-Werror=nonnull] + 119 | return __glibc_fortify (readlink, __len, sizeof (char), + | ^~~~~~~~~~~~~~~ + + This error implies the allocated buffer can be NULL. Use + g_file_read_link(), which allocates buffer automatically to avoid the + error. + + Signed-off-by: Akihiko Odaki + Reviewed-by: Philippe Mathieu-Daudé + Reviewed-by: Cédric Le Goater + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + util/vfio-helpers.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c +index 2d8af38f88..f8bab46c68 100644 +--- a/util/vfio-helpers.c ++++ b/util/vfio-helpers.c +@@ -106,15 +106,17 @@ struct QEMUVFIOState { + */ + static char *sysfs_find_group_file(const char *device, Error **errp) + { ++ g_autoptr(GError) gerr = NULL; + char *sysfs_link; + char *sysfs_group; + char *p; + char *path = NULL; + + sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device); +- sysfs_group = g_malloc0(PATH_MAX); +- if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) { +- error_setg_errno(errp, errno, "Failed to find iommu group sysfs path"); ++ sysfs_group = g_file_read_link(sysfs_link, &gerr); ++ if (gerr) { ++ error_setg(errp, "Failed to find iommu group sysfs path: %s", ++ gerr->message); + goto out; + } + p = strrchr(sysfs_group, '/'); +-- +2.39.3 + diff --git a/kvm-vfio-Fix-null-pointer-dereference-bug-in-vfio_bars_f.patch b/kvm-vfio-Fix-null-pointer-dereference-bug-in-vfio_bars_f.patch new file mode 100644 index 0000000..1e00427 --- /dev/null +++ b/kvm-vfio-Fix-null-pointer-dereference-bug-in-vfio_bars_f.patch @@ -0,0 +1,72 @@ +From 97124d4f2afbc8e65a3ecf76096e6b34a9b71541 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 30/37] vfio: Fix null pointer dereference bug in + vfio_bars_finalize() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + 
+RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [28/28] 4bbdf7f9c5595897244c6cc3d88d487dd5f99bf0 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit 8af87a3ec7e4 +Author: Avihai Horon +Date: Tue Jul 4 16:39:27 2023 +0300 + + vfio: Fix null pointer dereference bug in vfio_bars_finalize() + + vfio_realize() has the following flow: + 1. vfio_bars_prepare() -- sets VFIOBAR->size. + 2. msix_early_setup(). + 3. vfio_bars_register() -- allocates VFIOBAR->mr. + + After vfio_bars_prepare() is called msix_early_setup() can fail. If it + does fail, vfio_bars_register() is never called and VFIOBAR->mr is not + allocated. + + In this case, vfio_bars_finalize() is called as part of the error flow + to free the bars' resources. However, vfio_bars_finalize() calls + object_unparent() for VFIOBAR->mr after checking only VFIOBAR->size, and + thus we get a null pointer dereference. + + Fix it by checking VFIOBAR->mr in vfio_bars_finalize(). 
+ + Fixes: 89d5202edc50 ("vfio/pci: Allow relocating MSI-X MMIO") + Signed-off-by: Avihai Horon + Reviewed-by: Philippe Mathieu-Daudé + Reviewed-by: Cédric Le Goater + Reviewed-by: Alex Williamson + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/pci.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index ba40ca8784..9189459a38 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -1755,9 +1755,11 @@ static void vfio_bars_finalize(VFIOPCIDevice *vdev) + + vfio_bar_quirk_finalize(vdev, i); + vfio_region_finalize(&bar->region); +- if (bar->size) { ++ if (bar->mr) { ++ assert(bar->size); + object_unparent(OBJECT(bar->mr)); + g_free(bar->mr); ++ bar->mr = NULL; + } + } + +-- +2.39.3 + diff --git a/kvm-vfio-Implement-a-common-device-info-helper.patch b/kvm-vfio-Implement-a-common-device-info-helper.patch new file mode 100644 index 0000000..78a554d --- /dev/null +++ b/kvm-vfio-Implement-a-common-device-info-helper.patch @@ -0,0 +1,196 @@ +From f68e8c5d841cd7fc785cc3d15b3c280211bfb4c3 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 17/37] vfio: Implement a common device info helper +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [15/28] 9cfd233ab1b95dc7de776e8ef901823bd37c5a6b (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit 634f38f0f73f +Author: Alex Williamson +Date: Thu Jun 1 08:45:06 2023 -0600 + + vfio: Implement a common device info helper + + A common helper implementing the realloc algorithm for handling + capabilities. 
+ + Reviewed-by: Philippe Mathieu-Daudé + Reviewed-by: Cédric Le Goater + Signed-off-by: Alex Williamson + Reviewed-by: Robin Voetter + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + hw/s390x/s390-pci-vfio.c | 37 ++++------------------------ + hw/vfio/common.c | 46 ++++++++++++++++++++++++++--------- + include/hw/vfio/vfio-common.h | 1 + + 3 files changed, 41 insertions(+), 43 deletions(-) + +diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c +index f51190d466..59a2e03873 100644 +--- a/hw/s390x/s390-pci-vfio.c ++++ b/hw/s390x/s390-pci-vfio.c +@@ -289,38 +289,11 @@ static void s390_pci_read_pfip(S390PCIBusDevice *pbdev, + memcpy(pbdev->zpci_fn.pfip, cap->pfip, CLP_PFIP_NR_SEGMENTS); + } + +-static struct vfio_device_info *get_device_info(S390PCIBusDevice *pbdev, +- uint32_t argsz) ++static struct vfio_device_info *get_device_info(S390PCIBusDevice *pbdev) + { +- struct vfio_device_info *info = g_malloc0(argsz); +- VFIOPCIDevice *vfio_pci; +- int fd; ++ VFIOPCIDevice *vfio_pci = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + +- vfio_pci = container_of(pbdev->pdev, VFIOPCIDevice, pdev); +- fd = vfio_pci->vbasedev.fd; +- +- /* +- * If the specified argsz is not large enough to contain all capabilities +- * it will be updated upon return from the ioctl. Retry until we have +- * a big enough buffer to hold the entire capability chain. On error, +- * just exit and rely on CLP defaults. 
+- */ +-retry: +- info->argsz = argsz; +- +- if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) { +- trace_s390_pci_clp_dev_info(vfio_pci->vbasedev.name); +- g_free(info); +- return NULL; +- } +- +- if (info->argsz > argsz) { +- argsz = info->argsz; +- info = g_realloc(info, argsz); +- goto retry; +- } +- +- return info; ++ return vfio_get_device_info(vfio_pci->vbasedev.fd); + } + + /* +@@ -335,7 +308,7 @@ bool s390_pci_get_host_fh(S390PCIBusDevice *pbdev, uint32_t *fh) + + assert(fh); + +- info = get_device_info(pbdev, sizeof(*info)); ++ info = get_device_info(pbdev); + if (!info) { + return false; + } +@@ -356,7 +329,7 @@ void s390_pci_get_clp_info(S390PCIBusDevice *pbdev) + { + g_autofree struct vfio_device_info *info = NULL; + +- info = get_device_info(pbdev, sizeof(*info)); ++ info = get_device_info(pbdev); + if (!info) { + return; + } +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index b73086e17a..3b4ac53f15 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -2845,11 +2845,35 @@ void vfio_put_group(VFIOGroup *group) + } + } + ++struct vfio_device_info *vfio_get_device_info(int fd) ++{ ++ struct vfio_device_info *info; ++ uint32_t argsz = sizeof(*info); ++ ++ info = g_malloc0(argsz); ++ ++retry: ++ info->argsz = argsz; ++ ++ if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) { ++ g_free(info); ++ return NULL; ++ } ++ ++ if (info->argsz > argsz) { ++ argsz = info->argsz; ++ info = g_realloc(info, argsz); ++ goto retry; ++ } ++ ++ return info; ++} ++ + int vfio_get_device(VFIOGroup *group, const char *name, + VFIODevice *vbasedev, Error **errp) + { +- struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; +- int ret, fd; ++ g_autofree struct vfio_device_info *info = NULL; ++ int fd; + + fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name); + if (fd < 0) { +@@ -2861,11 +2885,11 @@ int vfio_get_device(VFIOGroup *group, const char *name, + return fd; + } + +- ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info); +- if (ret) { ++ info = 
vfio_get_device_info(fd); ++ if (!info) { + error_setg_errno(errp, errno, "error getting device info"); + close(fd); +- return ret; ++ return -1; + } + + /* +@@ -2893,14 +2917,14 @@ int vfio_get_device(VFIOGroup *group, const char *name, + vbasedev->group = group; + QLIST_INSERT_HEAD(&group->device_list, vbasedev, next); + +- vbasedev->num_irqs = dev_info.num_irqs; +- vbasedev->num_regions = dev_info.num_regions; +- vbasedev->flags = dev_info.flags; ++ vbasedev->num_irqs = info->num_irqs; ++ vbasedev->num_regions = info->num_regions; ++ vbasedev->flags = info->flags; ++ ++ trace_vfio_get_device(name, info->flags, info->num_regions, info->num_irqs); + +- trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions, +- dev_info.num_irqs); ++ vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET); + +- vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET); + return 0; + } + +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 3dc5f2104c..6d1b8487c3 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -216,6 +216,7 @@ void vfio_region_finalize(VFIORegion *region); + void vfio_reset_handler(void *opaque); + VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp); + void vfio_put_group(VFIOGroup *group); ++struct vfio_device_info *vfio_get_device_info(int fd); + int vfio_get_device(VFIOGroup *group, const char *name, + VFIODevice *vbasedev, Error **errp); + +-- +2.39.3 + diff --git a/kvm-vfio-migration-Add-VFIO-migration-pre-copy-support.patch b/kvm-vfio-migration-Add-VFIO-migration-pre-copy-support.patch new file mode 100644 index 0000000..b8e72e6 --- /dev/null +++ b/kvm-vfio-migration-Add-VFIO-migration-pre-copy-support.patch @@ -0,0 +1,438 @@ +From 080d28c191b7d951f1f4596dcaa13d590c07d886 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 15/37] vfio/migration: Add VFIO migration pre-copy 
support +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [13/28] 7b2ea1471440d47e5aed1211c96942ca7bface96 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit eda7362af959 +Author: Avihai Horon +Date: Wed Jun 21 14:12:00 2023 +0300 + + vfio/migration: Add VFIO migration pre-copy support + + Pre-copy support allows the VFIO device data to be transferred while the + VM is running. This helps to accommodate VFIO devices that have a large + amount of data that needs to be transferred, and it can reduce migration + downtime. + + Pre-copy support is optional in VFIO migration protocol v2. + Implement pre-copy of VFIO migration protocol v2 and use it for devices + that support it. Full description of it can be found in the following + Linux commit: 4db52602a607 ("vfio: Extend the device migration protocol + with PRE_COPY"). + + Signed-off-by: Avihai Horon + Reviewed-by: Cédric Le Goater + Tested-by: YangHang Liu + Acked-by: Alex Williamson + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + docs/devel/vfio-migration.rst | 35 +++++--- + hw/vfio/common.c | 6 +- + hw/vfio/migration.c | 165 ++++++++++++++++++++++++++++++++-- + hw/vfio/trace-events | 4 +- + include/hw/vfio/vfio-common.h | 2 + + 5 files changed, 190 insertions(+), 22 deletions(-) + +diff --git a/docs/devel/vfio-migration.rst b/docs/devel/vfio-migration.rst +index 1b68ccf115..e896b2a673 100644 +--- a/docs/devel/vfio-migration.rst ++++ b/docs/devel/vfio-migration.rst +@@ -7,12 +7,14 @@ the guest is running on source host and restoring this saved state on the + destination host. This document details how saving and restoring of VFIO + devices is done in QEMU. + +-Migration of VFIO devices currently consists of a single stop-and-copy phase. 
+-During the stop-and-copy phase the guest is stopped and the entire VFIO device +-data is transferred to the destination. +- +-The pre-copy phase of migration is currently not supported for VFIO devices. +-Support for VFIO pre-copy will be added later on. ++Migration of VFIO devices consists of two phases: the optional pre-copy phase, ++and the stop-and-copy phase. The pre-copy phase is iterative and allows to ++accommodate VFIO devices that have a large amount of data that needs to be ++transferred. The iterative pre-copy phase of migration allows for the guest to ++continue whilst the VFIO device state is transferred to the destination, this ++helps to reduce the total downtime of the VM. VFIO devices opt-in to pre-copy ++support by reporting the VFIO_MIGRATION_PRE_COPY flag in the ++VFIO_DEVICE_FEATURE_MIGRATION ioctl. + + Note that currently VFIO migration is supported only for a single device. This + is due to VFIO migration's lack of P2P support. However, P2P support is planned +@@ -29,10 +31,20 @@ VFIO implements the device hooks for the iterative approach as follows: + * A ``load_setup`` function that sets the VFIO device on the destination in + _RESUMING state. + ++* A ``state_pending_estimate`` function that reports an estimate of the ++ remaining pre-copy data that the vendor driver has yet to save for the VFIO ++ device. ++ + * A ``state_pending_exact`` function that reads pending_bytes from the vendor + driver, which indicates the amount of data that the vendor driver has yet to + save for the VFIO device. + ++* An ``is_active_iterate`` function that indicates ``save_live_iterate`` is ++ active only when the VFIO device is in pre-copy states. ++ ++* A ``save_live_iterate`` function that reads the VFIO device's data from the ++ vendor driver during iterative pre-copy phase. ++ + * A ``save_state`` function to save the device config space if it is present. 
+ + * A ``save_live_complete_precopy`` function that sets the VFIO device in +@@ -111,8 +123,10 @@ Flow of state changes during Live migration + =========================================== + + Below is the flow of state change during live migration. +-The values in the brackets represent the VM state, the migration state, and ++The values in the parentheses represent the VM state, the migration state, and + the VFIO device state, respectively. ++The text in the square brackets represents the flow if the VFIO device supports ++pre-copy. + + Live migration save path + ------------------------ +@@ -124,11 +138,12 @@ Live migration save path + | + migrate_init spawns migration_thread + Migration thread then calls each device's .save_setup() +- (RUNNING, _SETUP, _RUNNING) ++ (RUNNING, _SETUP, _RUNNING [_PRE_COPY]) + | +- (RUNNING, _ACTIVE, _RUNNING) +- If device is active, get pending_bytes by .state_pending_exact() ++ (RUNNING, _ACTIVE, _RUNNING [_PRE_COPY]) ++ If device is active, get pending_bytes by .state_pending_{estimate,exact}() + If total pending_bytes >= threshold_size, call .save_live_iterate() ++ [Data of VFIO device for pre-copy phase is copied] + Iterate till total pending bytes converge and are less than threshold + | + On migration completion, vCPU stops and calls .save_live_complete_precopy for +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 78358ede27..b73086e17a 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -492,7 +492,8 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) + } + + if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF && +- migration->device_state == VFIO_DEVICE_STATE_RUNNING) { ++ (migration->device_state == VFIO_DEVICE_STATE_RUNNING || ++ migration->device_state == VFIO_DEVICE_STATE_PRE_COPY)) { + return false; + } + } +@@ -537,7 +538,8 @@ static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) + return false; + } + +- if (migration->device_state == 
VFIO_DEVICE_STATE_RUNNING) { ++ if (migration->device_state == VFIO_DEVICE_STATE_RUNNING || ++ migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) { + continue; + } else { + return false; +diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c +index 8d33414379..d8f6a22ae1 100644 +--- a/hw/vfio/migration.c ++++ b/hw/vfio/migration.c +@@ -68,6 +68,8 @@ static const char *mig_state_to_str(enum vfio_device_mig_state state) + return "STOP_COPY"; + case VFIO_DEVICE_STATE_RESUMING: + return "RESUMING"; ++ case VFIO_DEVICE_STATE_PRE_COPY: ++ return "PRE_COPY"; + default: + return "UNKNOWN STATE"; + } +@@ -241,6 +243,25 @@ static int vfio_query_stop_copy_size(VFIODevice *vbasedev, + return 0; + } + ++static int vfio_query_precopy_size(VFIOMigration *migration) ++{ ++ struct vfio_precopy_info precopy = { ++ .argsz = sizeof(precopy), ++ }; ++ ++ migration->precopy_init_size = 0; ++ migration->precopy_dirty_size = 0; ++ ++ if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &precopy)) { ++ return -errno; ++ } ++ ++ migration->precopy_init_size = precopy.initial_bytes; ++ migration->precopy_dirty_size = precopy.dirty_bytes; ++ ++ return 0; ++} ++ + /* Returns the size of saved data on success and -errno on error */ + static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration) + { +@@ -249,6 +270,14 @@ static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration) + data_size = read(migration->data_fd, migration->data_buffer, + migration->data_buffer_size); + if (data_size < 0) { ++ /* ++ * Pre-copy emptied all the device state for now. For more information, ++ * please refer to the Linux kernel VFIO uAPI. 
++ */ ++ if (errno == ENOMSG) { ++ return 0; ++ } ++ + return -errno; + } + if (data_size == 0) { +@@ -265,6 +294,38 @@ static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration) + return qemu_file_get_error(f) ?: data_size; + } + ++static void vfio_update_estimated_pending_data(VFIOMigration *migration, ++ uint64_t data_size) ++{ ++ if (!data_size) { ++ /* ++ * Pre-copy emptied all the device state for now, update estimated sizes ++ * accordingly. ++ */ ++ migration->precopy_init_size = 0; ++ migration->precopy_dirty_size = 0; ++ ++ return; ++ } ++ ++ if (migration->precopy_init_size) { ++ uint64_t init_size = MIN(migration->precopy_init_size, data_size); ++ ++ migration->precopy_init_size -= init_size; ++ data_size -= init_size; ++ } ++ ++ migration->precopy_dirty_size -= MIN(migration->precopy_dirty_size, ++ data_size); ++} ++ ++static bool vfio_precopy_supported(VFIODevice *vbasedev) ++{ ++ VFIOMigration *migration = vbasedev->migration; ++ ++ return migration->mig_flags & VFIO_MIGRATION_PRE_COPY; ++} ++ + /* ---------------------------------------------------------------------- */ + + static int vfio_save_setup(QEMUFile *f, void *opaque) +@@ -285,6 +346,28 @@ static int vfio_save_setup(QEMUFile *f, void *opaque) + return -ENOMEM; + } + ++ if (vfio_precopy_supported(vbasedev)) { ++ int ret; ++ ++ switch (migration->device_state) { ++ case VFIO_DEVICE_STATE_RUNNING: ++ ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_PRE_COPY, ++ VFIO_DEVICE_STATE_RUNNING); ++ if (ret) { ++ return ret; ++ } ++ ++ vfio_query_precopy_size(migration); ++ ++ break; ++ case VFIO_DEVICE_STATE_STOP: ++ /* vfio_save_complete_precopy() will go to STOP_COPY */ ++ break; ++ default: ++ return -EINVAL; ++ } ++ } ++ + trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size); + + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); +@@ -299,26 +382,42 @@ static void vfio_save_cleanup(void *opaque) + + g_free(migration->data_buffer); + migration->data_buffer = NULL; ++ 
migration->precopy_init_size = 0; ++ migration->precopy_dirty_size = 0; + vfio_migration_cleanup(vbasedev); + trace_vfio_save_cleanup(vbasedev->name); + } + ++static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy, ++ uint64_t *can_postcopy) ++{ ++ VFIODevice *vbasedev = opaque; ++ VFIOMigration *migration = vbasedev->migration; ++ ++ if (migration->device_state != VFIO_DEVICE_STATE_PRE_COPY) { ++ return; ++ } ++ ++ *must_precopy += ++ migration->precopy_init_size + migration->precopy_dirty_size; ++ ++ trace_vfio_state_pending_estimate(vbasedev->name, *must_precopy, ++ *can_postcopy, ++ migration->precopy_init_size, ++ migration->precopy_dirty_size); ++} ++ + /* + * Migration size of VFIO devices can be as little as a few KBs or as big as + * many GBs. This value should be big enough to cover the worst case. + */ + #define VFIO_MIG_STOP_COPY_SIZE (100 * GiB) + +-/* +- * Only exact function is implemented and not estimate function. The reason is +- * that during pre-copy phase of migration the estimate function is called +- * repeatedly while pending RAM size is over the threshold, thus migration +- * can't converge and querying the VFIO device pending data size is useless. 
+- */ + static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy, + uint64_t *can_postcopy) + { + VFIODevice *vbasedev = opaque; ++ VFIOMigration *migration = vbasedev->migration; + uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE; + + /* +@@ -328,8 +427,48 @@ static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy, + vfio_query_stop_copy_size(vbasedev, &stop_copy_size); + *must_precopy += stop_copy_size; + ++ if (migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) { ++ vfio_query_precopy_size(migration); ++ ++ *must_precopy += ++ migration->precopy_init_size + migration->precopy_dirty_size; ++ } ++ + trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy, +- stop_copy_size); ++ stop_copy_size, migration->precopy_init_size, ++ migration->precopy_dirty_size); ++} ++ ++static bool vfio_is_active_iterate(void *opaque) ++{ ++ VFIODevice *vbasedev = opaque; ++ VFIOMigration *migration = vbasedev->migration; ++ ++ return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY; ++} ++ ++static int vfio_save_iterate(QEMUFile *f, void *opaque) ++{ ++ VFIODevice *vbasedev = opaque; ++ VFIOMigration *migration = vbasedev->migration; ++ ssize_t data_size; ++ ++ data_size = vfio_save_block(f, migration); ++ if (data_size < 0) { ++ return data_size; ++ } ++ qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); ++ ++ vfio_update_estimated_pending_data(migration, data_size); ++ ++ trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size, ++ migration->precopy_dirty_size); ++ ++ /* ++ * A VFIO device's pre-copy dirty_bytes is not guaranteed to reach zero. ++ * Return 1 so following handlers will not be potentially blocked. 
++ */ ++ return 1; + } + + static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) +@@ -338,7 +477,7 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) + ssize_t data_size; + int ret; + +- /* We reach here with device state STOP only */ ++ /* We reach here with device state STOP or STOP_COPY only */ + ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY, + VFIO_DEVICE_STATE_STOP); + if (ret) { +@@ -457,7 +596,10 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id) + static const SaveVMHandlers savevm_vfio_handlers = { + .save_setup = vfio_save_setup, + .save_cleanup = vfio_save_cleanup, ++ .state_pending_estimate = vfio_state_pending_estimate, + .state_pending_exact = vfio_state_pending_exact, ++ .is_active_iterate = vfio_is_active_iterate, ++ .save_live_iterate = vfio_save_iterate, + .save_live_complete_precopy = vfio_save_complete_precopy, + .save_state = vfio_save_state, + .load_setup = vfio_load_setup, +@@ -470,13 +612,18 @@ static const SaveVMHandlers savevm_vfio_handlers = { + static void vfio_vmstate_change(void *opaque, bool running, RunState state) + { + VFIODevice *vbasedev = opaque; ++ VFIOMigration *migration = vbasedev->migration; + enum vfio_device_mig_state new_state; + int ret; + + if (running) { + new_state = VFIO_DEVICE_STATE_RUNNING; + } else { +- new_state = VFIO_DEVICE_STATE_STOP; ++ new_state = ++ (migration->device_state == VFIO_DEVICE_STATE_PRE_COPY && ++ (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED)) ? 
++ VFIO_DEVICE_STATE_STOP_COPY : ++ VFIO_DEVICE_STATE_STOP; + } + + /* +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events +index 646e42fd27..4150b59e58 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -162,6 +162,8 @@ vfio_save_block(const char *name, int data_size) " (%s) data_size %d" + vfio_save_cleanup(const char *name) " (%s)" + vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d" + vfio_save_device_config_state(const char *name) " (%s)" ++vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64 + vfio_save_setup(const char *name, uint64_t data_buffer_size) " (%s) data buffer size 0x%"PRIx64 +-vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64 ++vfio_state_pending_estimate(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64 ++vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64 + vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s" +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 5f29dab839..1db901c194 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -67,6 +67,8 @@ typedef struct VFIOMigration { + void *data_buffer; + size_t data_buffer_size; + uint64_t mig_flags; ++ uint64_t precopy_init_size; ++ uint64_t precopy_dirty_size; + } 
VFIOMigration; + + typedef struct VFIOAddressSpace { +-- +2.39.3 + diff --git a/kvm-vfio-migration-Add-support-for-switchover-ack-capabi.patch b/kvm-vfio-migration-Add-support-for-switchover-ack-capabi.patch new file mode 100644 index 0000000..d87680d --- /dev/null +++ b/kvm-vfio-migration-Add-support-for-switchover-ack-capabi.patch @@ -0,0 +1,192 @@ +From 169dc1bb051b3aebc571936d956b49ba0621ae43 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 16/37] vfio/migration: Add support for switchover ack + capability +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [14/28] b3bd2eb2d0ca49ff05a0a82ae5bb956a354aed47 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit 745c42912a04 +Author: Avihai Horon +Date: Wed Jun 21 14:12:01 2023 +0300 + + vfio/migration: Add support for switchover ack capability + + Loading of a VFIO device's data can take a substantial amount of time as + the device may need to allocate resources, prepare internal data + structures, etc. This can increase migration downtime, especially for + VFIO devices with a lot of resources. + + To solve this, VFIO migration uAPI defines "initial bytes" as part of + its precopy data stream. Initial bytes can be used in various ways to + improve VFIO migration performance. For example, it can be used to + transfer device metadata to pre-allocate resources in the destination. + However, for this to work we need to make sure that all initial bytes + are sent and loaded in the destination before the source VM is stopped. 
+ + Use migration switchover ack capability to make sure a VFIO device's + initial bytes are sent and loaded in the destination before the source + stops the VM and attempts to complete the migration. + This can significantly reduce migration downtime for some devices. + + Signed-off-by: Avihai Horon + Reviewed-by: Cédric Le Goater + Tested-by: YangHang Liu + Acked-by: Alex Williamson + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + docs/devel/vfio-migration.rst | 10 +++++++++ + hw/vfio/migration.c | 39 ++++++++++++++++++++++++++++++++++- + include/hw/vfio/vfio-common.h | 1 + + 3 files changed, 49 insertions(+), 1 deletion(-) + +diff --git a/docs/devel/vfio-migration.rst b/docs/devel/vfio-migration.rst +index e896b2a673..b433cb5bb2 100644 +--- a/docs/devel/vfio-migration.rst ++++ b/docs/devel/vfio-migration.rst +@@ -16,6 +16,13 @@ helps to reduce the total downtime of the VM. VFIO devices opt-in to pre-copy + support by reporting the VFIO_MIGRATION_PRE_COPY flag in the + VFIO_DEVICE_FEATURE_MIGRATION ioctl. + ++When pre-copy is supported, it's possible to further reduce downtime by ++enabling "switchover-ack" migration capability. ++VFIO migration uAPI defines "initial bytes" as part of its pre-copy data stream ++and recommends that the initial bytes are sent and loaded in the destination ++before stopping the source VM. Enabling this migration capability will ++guarantee that and thus, can potentially reduce downtime even further. ++ + Note that currently VFIO migration is supported only for a single device. This + is due to VFIO migration's lack of P2P support. However, P2P support is planned + to be added later on. +@@ -45,6 +52,9 @@ VFIO implements the device hooks for the iterative approach as follows: + * A ``save_live_iterate`` function that reads the VFIO device's data from the + vendor driver during iterative pre-copy phase. 
+ ++* A ``switchover_ack_needed`` function that checks if the VFIO device uses ++ "switchover-ack" migration capability when this capability is enabled. ++ + * A ``save_state`` function to save the device config space if it is present. + + * A ``save_live_complete_precopy`` function that sets the VFIO device in +diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c +index d8f6a22ae1..acbf0bb7ab 100644 +--- a/hw/vfio/migration.c ++++ b/hw/vfio/migration.c +@@ -18,6 +18,8 @@ + #include "sysemu/runstate.h" + #include "hw/vfio/vfio-common.h" + #include "migration/migration.h" ++#include "migration/options.h" ++#include "migration/savevm.h" + #include "migration/vmstate.h" + #include "migration/qemu-file.h" + #include "migration/register.h" +@@ -45,6 +47,7 @@ + #define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) + #define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) + #define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) ++#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL) + + /* + * This is an arbitrary size based on migration of mlx5 devices, where typically +@@ -384,6 +387,7 @@ static void vfio_save_cleanup(void *opaque) + migration->data_buffer = NULL; + migration->precopy_init_size = 0; + migration->precopy_dirty_size = 0; ++ migration->initial_data_sent = false; + vfio_migration_cleanup(vbasedev); + trace_vfio_save_cleanup(vbasedev->name); + } +@@ -457,10 +461,17 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque) + if (data_size < 0) { + return data_size; + } +- qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + + vfio_update_estimated_pending_data(migration, data_size); + ++ if (migrate_switchover_ack() && !migration->precopy_init_size && ++ !migration->initial_data_sent) { ++ qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT); ++ migration->initial_data_sent = true; ++ } else { ++ qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); ++ } ++ + trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size, + 
migration->precopy_dirty_size); + +@@ -579,6 +590,24 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id) + } + break; + } ++ case VFIO_MIG_FLAG_DEV_INIT_DATA_SENT: ++ { ++ if (!vfio_precopy_supported(vbasedev) || ++ !migrate_switchover_ack()) { ++ error_report("%s: Received INIT_DATA_SENT but switchover ack " ++ "is not used", vbasedev->name); ++ return -EINVAL; ++ } ++ ++ ret = qemu_loadvm_approve_switchover(); ++ if (ret) { ++ error_report( ++ "%s: qemu_loadvm_approve_switchover failed, err=%d (%s)", ++ vbasedev->name, ret, strerror(-ret)); ++ } ++ ++ return ret; ++ } + default: + error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data); + return -EINVAL; +@@ -593,6 +622,13 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id) + return ret; + } + ++static bool vfio_switchover_ack_needed(void *opaque) ++{ ++ VFIODevice *vbasedev = opaque; ++ ++ return vfio_precopy_supported(vbasedev); ++} ++ + static const SaveVMHandlers savevm_vfio_handlers = { + .save_setup = vfio_save_setup, + .save_cleanup = vfio_save_cleanup, +@@ -605,6 +641,7 @@ static const SaveVMHandlers savevm_vfio_handlers = { + .load_setup = vfio_load_setup, + .load_cleanup = vfio_load_cleanup, + .load_state = vfio_load_state, ++ .switchover_ack_needed = vfio_switchover_ack_needed, + }; + + /* ---------------------------------------------------------------------- */ +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 1db901c194..3dc5f2104c 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -69,6 +69,7 @@ typedef struct VFIOMigration { + uint64_t mig_flags; + uint64_t precopy_init_size; + uint64_t precopy_dirty_size; ++ bool initial_data_sent; + } VFIOMigration; + + typedef struct VFIOAddressSpace { +-- +2.39.3 + diff --git a/kvm-vfio-migration-Change-vIOMMU-blocker-from-global-to-.patch b/kvm-vfio-migration-Change-vIOMMU-blocker-from-global-to-.patch new file mode 100644 index 0000000..dde2e24 
--- /dev/null +++ b/kvm-vfio-migration-Change-vIOMMU-blocker-from-global-to-.patch @@ -0,0 +1,171 @@ +From 35c7d0d3b02d61d6f29afae74bd83edd70a6a1b4 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 26/37] vfio/migration: Change vIOMMU blocker from global to + per device +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [24/28] 8fda1c82a81fadd4f38e6a5e878c9228a81c0f6e (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit 3c26c80a0a26 +Author: Zhenzhong Duan +Date: Mon Jul 3 15:15:07 2023 +0800 + + vfio/migration: Change vIOMMU blocker from global to per device + + Contrary to multiple device blocker which needs to consider already-attached + devices to unblock/block dynamically, the vIOMMU migration blocker is a device + specific config. Meaning it only needs to know whether the device is bypassing + or not the vIOMMU (via machine property, or per pxb-pcie::bypass_iommu), and + does not need the state of currently present devices. For this reason, the + vIOMMU global migration blocker can be consolidated into the per-device + migration blocker, allowing us to remove some unnecessary code. + + This change also makes vfio_mig_active() more accurate as it doesn't check for + global blocker. 
+ + Signed-off-by: Zhenzhong Duan + Reviewed-by: Joao Martins + Reviewed-by: Cédric Le Goater + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/common.c | 51 ++--------------------------------- + hw/vfio/migration.c | 7 ++--- + hw/vfio/pci.c | 1 - + include/hw/vfio/vfio-common.h | 3 +-- + 4 files changed, 7 insertions(+), 55 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 136d8243d6..e815f6ba30 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -362,7 +362,6 @@ bool vfio_mig_active(void) + } + + static Error *multiple_devices_migration_blocker; +-static Error *giommu_migration_blocker; + + static unsigned int vfio_migratable_device_num(void) + { +@@ -420,55 +419,9 @@ void vfio_unblock_multiple_devices_migration(void) + multiple_devices_migration_blocker = NULL; + } + +-static bool vfio_viommu_preset(void) ++bool vfio_viommu_preset(VFIODevice *vbasedev) + { +- VFIOAddressSpace *space; +- +- QLIST_FOREACH(space, &vfio_address_spaces, list) { +- if (space->as != &address_space_memory) { +- return true; +- } +- } +- +- return false; +-} +- +-int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp) +-{ +- int ret; +- +- if (giommu_migration_blocker || +- !vfio_viommu_preset()) { +- return 0; +- } +- +- if (vbasedev->enable_migration == ON_OFF_AUTO_ON) { +- error_setg(errp, +- "Migration is currently not supported with vIOMMU enabled"); +- return -EINVAL; +- } +- +- error_setg(&giommu_migration_blocker, +- "Migration is currently not supported with vIOMMU enabled"); +- ret = migrate_add_blocker(giommu_migration_blocker, errp); +- if (ret < 0) { +- error_free(giommu_migration_blocker); +- giommu_migration_blocker = NULL; +- } +- +- return ret; +-} +- +-void vfio_migration_finalize(void) +-{ +- if (!giommu_migration_blocker || +- vfio_viommu_preset()) { +- return; +- } +- +- migrate_del_blocker(giommu_migration_blocker); +- error_free(giommu_migration_blocker); +- giommu_migration_blocker = NULL; 
++ return vbasedev->group->container->space->as != &address_space_memory; + } + + static void vfio_set_migration_error(int err) +diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c +index 1db7d52ab2..e6e5e85f75 100644 +--- a/hw/vfio/migration.c ++++ b/hw/vfio/migration.c +@@ -878,9 +878,10 @@ int vfio_migration_realize(VFIODevice *vbasedev, Error **errp) + return ret; + } + +- ret = vfio_block_giommu_migration(vbasedev, errp); +- if (ret) { +- return ret; ++ if (vfio_viommu_preset(vbasedev)) { ++ error_setg(&err, "%s: Migration is currently not supported " ++ "with vIOMMU enabled", vbasedev->name); ++ return vfio_block_migration(vbasedev, err, errp); + } + + trace_vfio_migration_realize(vbasedev->name); +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 2d059832a4..922c81872c 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3279,7 +3279,6 @@ static void vfio_instance_finalize(Object *obj) + */ + vfio_put_device(vdev); + vfio_put_group(group); +- vfio_migration_finalize(); + } + + static void vfio_exitfn(PCIDevice *pdev) +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 93429b9abb..45167c8a8a 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -227,7 +227,7 @@ extern VFIOGroupList vfio_group_list; + bool vfio_mig_active(void); + int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp); + void vfio_unblock_multiple_devices_migration(void); +-int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp); ++bool vfio_viommu_preset(VFIODevice *vbasedev); + int64_t vfio_mig_bytes_transferred(void); + void vfio_reset_bytes_transferred(void); + +@@ -254,6 +254,5 @@ int vfio_spapr_remove_window(VFIOContainer *container, + + int vfio_migration_realize(VFIODevice *vbasedev, Error **errp); + void vfio_migration_exit(VFIODevice *vbasedev); +-void vfio_migration_finalize(void); + + #endif /* HW_VFIO_VFIO_COMMON_H */ +-- +2.39.3 + diff --git 
a/kvm-vfio-migration-Free-resources-when-vfio_migration_re.patch b/kvm-vfio-migration-Free-resources-when-vfio_migration_re.patch new file mode 100644 index 0000000..9deaf1a --- /dev/null +++ b/kvm-vfio-migration-Free-resources-when-vfio_migration_re.patch @@ -0,0 +1,145 @@ +From a36fa46369fe9bf2a2174e9ed6ab83042e904066 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 27/37] vfio/migration: Free resources when + vfio_migration_realize fails +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [25/28] b3ab8d3443d4bc12a689dc7d88a94da315814bb7 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit 2b43b2995b02 +Author: Zhenzhong Duan +Date: Mon Jul 3 15:15:08 2023 +0800 + + vfio/migration: Free resources when vfio_migration_realize fails + + When vfio_realize() succeeds, hot unplug will call vfio_exitfn() + to free resources allocated in vfio_realize(); when vfio_realize() + fails, vfio_exitfn() is never called and we need to free resources + in vfio_realize(). + + In the case that vfio_migration_realize() fails, + e.g: with -only-migratable & enable-migration=off, we see below: + + (qemu) device_add vfio-pci,host=81:11.1,id=vfio1,bus=root1,enable-migration=off + 0000:81:11.1: Migration disabled + Error: disallowing migration blocker (--only-migratable) for: 0000:81:11.1: Migration is disabled for VFIO device + + If we hotplug again we should see same log as above, but we see: + (qemu) device_add vfio-pci,host=81:11.1,id=vfio1,bus=root1,enable-migration=off + Error: vfio 0000:81:11.1: device is already attached + + That's because some references to VFIO device isn't released. 
+ For resources allocated in vfio_migration_realize(), free them by + jumping to out_deinit path with calling a new function + vfio_migration_deinit(). For resources allocated in vfio_realize(), + free them by jumping to de-register path in vfio_realize(). + + Signed-off-by: Zhenzhong Duan + Fixes: a22651053b59 ("vfio: Make vfio-pci device migration capable") + Reviewed-by: Cédric Le Goater + Reviewed-by: Joao Martins + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/migration.c | 33 +++++++++++++++++++++++---------- + hw/vfio/pci.c | 1 + + 2 files changed, 24 insertions(+), 10 deletions(-) + +diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c +index e6e5e85f75..e3954570c8 100644 +--- a/hw/vfio/migration.c ++++ b/hw/vfio/migration.c +@@ -802,6 +802,17 @@ static int vfio_migration_init(VFIODevice *vbasedev) + return 0; + } + ++static void vfio_migration_deinit(VFIODevice *vbasedev) ++{ ++ VFIOMigration *migration = vbasedev->migration; ++ ++ remove_migration_state_change_notifier(&migration->migration_state); ++ qemu_del_vm_change_state_handler(migration->vm_state); ++ unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev); ++ vfio_migration_free(vbasedev); ++ vfio_unblock_multiple_devices_migration(); ++} ++ + static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp) + { + int ret; +@@ -866,7 +877,7 @@ int vfio_migration_realize(VFIODevice *vbasedev, Error **errp) + error_setg(&err, + "%s: VFIO device doesn't support device dirty tracking", + vbasedev->name); +- return vfio_block_migration(vbasedev, err, errp); ++ goto add_blocker; + } + + warn_report("%s: VFIO device doesn't support device dirty tracking", +@@ -875,29 +886,31 @@ int vfio_migration_realize(VFIODevice *vbasedev, Error **errp) + + ret = vfio_block_multiple_devices_migration(vbasedev, errp); + if (ret) { +- return ret; ++ goto out_deinit; + } + + if (vfio_viommu_preset(vbasedev)) { + error_setg(&err, "%s: Migration is currently not 
supported " + "with vIOMMU enabled", vbasedev->name); +- return vfio_block_migration(vbasedev, err, errp); ++ goto add_blocker; + } + + trace_vfio_migration_realize(vbasedev->name); + return 0; ++ ++add_blocker: ++ ret = vfio_block_migration(vbasedev, err, errp); ++out_deinit: ++ if (ret) { ++ vfio_migration_deinit(vbasedev); ++ } ++ return ret; + } + + void vfio_migration_exit(VFIODevice *vbasedev) + { + if (vbasedev->migration) { +- VFIOMigration *migration = vbasedev->migration; +- +- remove_migration_state_change_notifier(&migration->migration_state); +- qemu_del_vm_change_state_handler(migration->vm_state); +- unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev); +- vfio_migration_free(vbasedev); +- vfio_unblock_multiple_devices_migration(); ++ vfio_migration_deinit(vbasedev); + } + + if (vbasedev->migration_blocker) { +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 922c81872c..037b7d4176 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3234,6 +3234,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + ret = vfio_migration_realize(vbasedev, errp); + if (ret) { + error_report("%s: Migration disabled", vbasedev->name); ++ goto out_deregister; + } + } + +-- +2.39.3 + diff --git a/kvm-vfio-migration-Make-VFIO-migration-non-experimental.patch b/kvm-vfio-migration-Make-VFIO-migration-non-experimental.patch new file mode 100644 index 0000000..3258541 --- /dev/null +++ b/kvm-vfio-migration-Make-VFIO-migration-non-experimental.patch @@ -0,0 +1,283 @@ +From 747c34c0a3b8048ebdab387d22f2b922c81d572a Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 21/37] vfio/migration: Make VFIO migration non-experimental +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: 
[19/28] 2f457c1c0de95a3fced0270f2edbbc5193cc4de9 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit 8bbcb64a71d8 +Author: Avihai Horon +Date: Wed Jun 28 10:31:12 2023 +0300 + + vfio/migration: Make VFIO migration non-experimental + + The major parts of VFIO migration are supported today in QEMU. This + includes basic VFIO migration, device dirty page tracking and precopy + support. + + Thus, at this point in time, it seems appropriate to make VFIO migration + non-experimental: remove the x prefix from enable_migration property, + change it to ON_OFF_AUTO and let the default value be AUTO. + + In addition, make the following adjustments: + 1. When enable_migration is ON and migration is not supported, fail VFIO + device realization. + 2. When enable_migration is AUTO (i.e., not explicitly enabled), require + device dirty tracking support. This is because device dirty tracking + is currently the only method to do dirty page tracking, which is + essential for migrating in a reasonable downtime. Setting + enable_migration to ON will not require device dirty tracking. + 3. Make migration error and blocker messages more elaborate. + 4. Remove error prints in vfio_migration_query_flags(). + 5. Rename trace_vfio_migration_probe() to + trace_vfio_migration_realize(). 
+ + Signed-off-by: Avihai Horon + Reviewed-by: Joao Martins + Reviewed-by: Cédric Le Goater + Reviewed-by: Alex Williamson + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/common.c | 16 ++++++- + hw/vfio/migration.c | 79 +++++++++++++++++++++++------------ + hw/vfio/pci.c | 4 +- + hw/vfio/trace-events | 2 +- + include/hw/vfio/vfio-common.h | 6 +-- + 5 files changed, 73 insertions(+), 34 deletions(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 3b4ac53f15..136d8243d6 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -381,7 +381,7 @@ static unsigned int vfio_migratable_device_num(void) + return device_num; + } + +-int vfio_block_multiple_devices_migration(Error **errp) ++int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp) + { + int ret; + +@@ -390,6 +390,12 @@ int vfio_block_multiple_devices_migration(Error **errp) + return 0; + } + ++ if (vbasedev->enable_migration == ON_OFF_AUTO_ON) { ++ error_setg(errp, "Migration is currently not supported with multiple " ++ "VFIO devices"); ++ return -EINVAL; ++ } ++ + error_setg(&multiple_devices_migration_blocker, + "Migration is currently not supported with multiple " + "VFIO devices"); +@@ -427,7 +433,7 @@ static bool vfio_viommu_preset(void) + return false; + } + +-int vfio_block_giommu_migration(Error **errp) ++int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp) + { + int ret; + +@@ -436,6 +442,12 @@ int vfio_block_giommu_migration(Error **errp) + return 0; + } + ++ if (vbasedev->enable_migration == ON_OFF_AUTO_ON) { ++ error_setg(errp, ++ "Migration is currently not supported with vIOMMU enabled"); ++ return -EINVAL; ++ } ++ + error_setg(&giommu_migration_blocker, + "Migration is currently not supported with vIOMMU enabled"); + ret = migrate_add_blocker(giommu_migration_blocker, errp); +diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c +index 7cf143926c..1db7d52ab2 100644 +--- a/hw/vfio/migration.c ++++ 
b/hw/vfio/migration.c +@@ -724,14 +724,6 @@ static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags) + feature->argsz = sizeof(buf); + feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION; + if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { +- if (errno == ENOTTY) { +- error_report("%s: VFIO migration is not supported in kernel", +- vbasedev->name); +- } else { +- error_report("%s: Failed to query VFIO migration support, err: %s", +- vbasedev->name, strerror(errno)); +- } +- + return -errno; + } + +@@ -810,6 +802,27 @@ static int vfio_migration_init(VFIODevice *vbasedev) + return 0; + } + ++static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp) ++{ ++ int ret; ++ ++ if (vbasedev->enable_migration == ON_OFF_AUTO_ON) { ++ error_propagate(errp, err); ++ return -EINVAL; ++ } ++ ++ vbasedev->migration_blocker = error_copy(err); ++ error_free(err); ++ ++ ret = migrate_add_blocker(vbasedev->migration_blocker, errp); ++ if (ret < 0) { ++ error_free(vbasedev->migration_blocker); ++ vbasedev->migration_blocker = NULL; ++ } ++ ++ return ret; ++} ++ + /* ---------------------------------------------------------------------- */ + + int64_t vfio_mig_bytes_transferred(void) +@@ -824,40 +837,54 @@ void vfio_reset_bytes_transferred(void) + + int vfio_migration_realize(VFIODevice *vbasedev, Error **errp) + { +- int ret = -ENOTSUP; ++ Error *err = NULL; ++ int ret; + +- if (!vbasedev->enable_migration) { +- goto add_blocker; ++ if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) { ++ error_setg(&err, "%s: Migration is disabled for VFIO device", ++ vbasedev->name); ++ return vfio_block_migration(vbasedev, err, errp); + } + + ret = vfio_migration_init(vbasedev); + if (ret) { +- goto add_blocker; ++ if (ret == -ENOTTY) { ++ error_setg(&err, "%s: VFIO migration is not supported in kernel", ++ vbasedev->name); ++ } else { ++ error_setg(&err, ++ "%s: Migration couldn't be initialized for VFIO device, " ++ "err: %d 
(%s)", ++ vbasedev->name, ret, strerror(-ret)); ++ } ++ ++ return vfio_block_migration(vbasedev, err, errp); ++ } ++ ++ if (!vbasedev->dirty_pages_supported) { ++ if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) { ++ error_setg(&err, ++ "%s: VFIO device doesn't support device dirty tracking", ++ vbasedev->name); ++ return vfio_block_migration(vbasedev, err, errp); ++ } ++ ++ warn_report("%s: VFIO device doesn't support device dirty tracking", ++ vbasedev->name); + } + +- ret = vfio_block_multiple_devices_migration(errp); ++ ret = vfio_block_multiple_devices_migration(vbasedev, errp); + if (ret) { + return ret; + } + +- ret = vfio_block_giommu_migration(errp); ++ ret = vfio_block_giommu_migration(vbasedev, errp); + if (ret) { + return ret; + } + +- trace_vfio_migration_probe(vbasedev->name); ++ trace_vfio_migration_realize(vbasedev->name); + return 0; +- +-add_blocker: +- error_setg(&vbasedev->migration_blocker, +- "VFIO device doesn't support migration"); +- +- ret = migrate_add_blocker(vbasedev->migration_blocker, errp); +- if (ret < 0) { +- error_free(vbasedev->migration_blocker); +- vbasedev->migration_blocker = NULL; +- } +- return ret; + } + + void vfio_migration_exit(VFIODevice *vbasedev) +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 15e7554954..6634945a70 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3371,8 +3371,8 @@ static Property vfio_pci_dev_properties[] = { + VFIO_FEATURE_ENABLE_REQ_BIT, true), + DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features, + VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), +- DEFINE_PROP_BOOL("x-enable-migration", VFIOPCIDevice, +- vbasedev.enable_migration, false), ++ DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice, ++ vbasedev.enable_migration, ON_OFF_AUTO_AUTO), + DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), + DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice, + vbasedev.ram_block_discard_allowed, false), +diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events 
+index 4150b59e58..0391bd583b 100644 +--- a/hw/vfio/trace-events ++++ b/hw/vfio/trace-events +@@ -155,7 +155,7 @@ vfio_load_cleanup(const char *name) " (%s)" + vfio_load_device_config_state(const char *name) " (%s)" + vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64 + vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size 0x%"PRIx64" ret %d" +-vfio_migration_probe(const char *name) " (%s)" ++vfio_migration_realize(const char *name) " (%s)" + vfio_migration_set_state(const char *name, const char *state) " (%s) state %s" + vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s" + vfio_save_block(const char *name, int data_size) " (%s) data_size %d" +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 1d19c6f251..93429b9abb 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -139,7 +139,7 @@ typedef struct VFIODevice { + bool needs_reset; + bool no_mmap; + bool ram_block_discard_allowed; +- bool enable_migration; ++ OnOffAuto enable_migration; + VFIODeviceOps *ops; + unsigned int num_irqs; + unsigned int num_regions; +@@ -225,9 +225,9 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; + extern VFIOGroupList vfio_group_list; + + bool vfio_mig_active(void); +-int vfio_block_multiple_devices_migration(Error **errp); ++int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp); + void vfio_unblock_multiple_devices_migration(void); +-int vfio_block_giommu_migration(Error **errp); ++int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp); + int64_t vfio_mig_bytes_transferred(void); + void vfio_reset_bytes_transferred(void); + +-- +2.39.3 + diff --git a/kvm-vfio-migration-Refactor-vfio_save_block-to-return-sa.patch b/kvm-vfio-migration-Refactor-vfio_save_block-to-return-sa.patch new file mode 100644 index 0000000..3b61c5d --- /dev/null +++ 
b/kvm-vfio-migration-Refactor-vfio_save_block-to-return-sa.patch @@ -0,0 +1,102 @@ +From edcf24a08d66d620a10c746824e31d230c8516ce Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 13/37] vfio/migration: Refactor vfio_save_block() to return + saved data size +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [11/28] b4aed6ddcbde159e98275a0675dcdf45d644673b (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit cf53efbbda2e +Author: Avihai Horon +Date: Wed Jun 21 14:11:58 2023 +0300 + + vfio/migration: Refactor vfio_save_block() to return saved data size + + Refactor vfio_save_block() to return the size of saved data on success + and -errno on error. + + This will be used in next patch to implement VFIO migration pre-copy + support. 
+ + Signed-off-by: Avihai Horon + Reviewed-by: Cédric Le Goater + Reviewed-by: Juan Quintela + Tested-by: YangHang Liu + Acked-by: Alex Williamson + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/migration.c | 17 +++++++++-------- + 1 file changed, 9 insertions(+), 8 deletions(-) + +diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c +index 6b58dddb88..235978fd68 100644 +--- a/hw/vfio/migration.c ++++ b/hw/vfio/migration.c +@@ -241,8 +241,8 @@ static int vfio_query_stop_copy_size(VFIODevice *vbasedev, + return 0; + } + +-/* Returns 1 if end-of-stream is reached, 0 if more data and -errno if error */ +-static int vfio_save_block(QEMUFile *f, VFIOMigration *migration) ++/* Returns the size of saved data on success and -errno on error */ ++static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration) + { + ssize_t data_size; + +@@ -252,7 +252,7 @@ static int vfio_save_block(QEMUFile *f, VFIOMigration *migration) + return -errno; + } + if (data_size == 0) { +- return 1; ++ return 0; + } + + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE); +@@ -262,7 +262,7 @@ static int vfio_save_block(QEMUFile *f, VFIOMigration *migration) + + trace_vfio_save_block(migration->vbasedev->name, data_size); + +- return qemu_file_get_error(f); ++ return qemu_file_get_error(f) ?: data_size; + } + + /* ---------------------------------------------------------------------- */ +@@ -335,6 +335,7 @@ static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy, + static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) + { + VFIODevice *vbasedev = opaque; ++ ssize_t data_size; + int ret; + + /* We reach here with device state STOP only */ +@@ -345,11 +346,11 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) + } + + do { +- ret = vfio_save_block(f, vbasedev->migration); +- if (ret < 0) { +- return ret; ++ data_size = vfio_save_block(f, vbasedev->migration); ++ if (data_size < 0) { ++ return data_size; + } +- } 
while (!ret); ++ } while (data_size); + + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + ret = qemu_file_get_error(f); +-- +2.39.3 + diff --git a/kvm-vfio-migration-Remove-print-of-Migration-disabled.patch b/kvm-vfio-migration-Remove-print-of-Migration-disabled.patch new file mode 100644 index 0000000..ad3c6ca --- /dev/null +++ b/kvm-vfio-migration-Remove-print-of-Migration-disabled.patch @@ -0,0 +1,56 @@ +From 5bb94c4eaeb94f0b41a57660098a4c12a295b725 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 28/37] vfio/migration: Remove print of "Migration disabled" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [26/28] c7ff1f9c90b4cfcb327ef474042ea71ea577a94d (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit 0520d63c7701 +Author: Zhenzhong Duan +Date: Mon Jul 3 15:15:09 2023 +0800 + + vfio/migration: Remove print of "Migration disabled" + + Property enable_migration supports [on/off/auto]. + In ON mode, error pointer is passed to errp and logged. + In OFF mode, we don't need to log "Migration disabled" as it's intentional. + In AUTO mode, we should only ever see errors or warnings if the device + supports migration and an error or incompatibility occurs while further + probing or configuring it. Lack of support for migration shouldn't + generate an error or warning. 
+ + Signed-off-by: Zhenzhong Duan + Reviewed-by: Joao Martins + Reviewed-by: Cédric Le Goater + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/pci.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 037b7d4176..a60b868c38 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3233,7 +3233,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + if (!pdev->failover_pair_id) { + ret = vfio_migration_realize(vbasedev, errp); + if (ret) { +- error_report("%s: Migration disabled", vbasedev->name); + goto out_deregister; + } + } +-- +2.39.3 + diff --git a/kvm-vfio-migration-Reset-bytes_transferred-properly.patch b/kvm-vfio-migration-Reset-bytes_transferred-properly.patch new file mode 100644 index 0000000..2666460 --- /dev/null +++ b/kvm-vfio-migration-Reset-bytes_transferred-properly.patch @@ -0,0 +1,165 @@ +From a63b4010ba4f491c9144afff363bebcf35ecf496 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 20/37] vfio/migration: Reset bytes_transferred properly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [18/28] e9a70faeca4fd5aa7ef36502cf76bf0b62f65057 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit 808642a2f640 +Author: Avihai Horon +Date: Wed Jun 28 10:31:11 2023 +0300 + + vfio/migration: Reset bytes_transferred properly + + Currently, VFIO bytes_transferred is not reset properly: + 1. bytes_transferred is not reset after a VM snapshot (so a migration + following a snapshot will report incorrect value). + 2. bytes_transferred is a single counter for all VFIO devices, however + upon migration failure it is reset multiple times, by each VFIO + device. 
+ + Fix it by introducing a new function vfio_reset_bytes_transferred() and + calling it during migration and snapshot start. + + Remove existing bytes_transferred reset in VFIO migration state + notifier, which is not needed anymore. + + Fixes: 3710586caa5d ("qapi: Add VFIO devices migration stats in Migration stats") + Signed-off-by: Avihai Horon + Reviewed-by: Cédric Le Goater + Reviewed-by: Alex Williamson + Signed-off-by: Cédric Le Goater + +Conflicts: + - migration/migration.c + migration/savevm.c + context changes due to commit aff3f6606d14 ("migration: Rename + ram_counters to mig_stats") + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/migration.c | 6 +++++- + include/hw/vfio/vfio-common.h | 1 + + migration/migration.c | 1 + + migration/migration.h | 1 + + migration/savevm.c | 1 + + migration/target.c | 17 +++++++++++++++-- + 6 files changed, 24 insertions(+), 3 deletions(-) + +diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c +index acbf0bb7ab..7cf143926c 100644 +--- a/hw/vfio/migration.c ++++ b/hw/vfio/migration.c +@@ -697,7 +697,6 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data) + case MIGRATION_STATUS_CANCELLING: + case MIGRATION_STATUS_CANCELLED: + case MIGRATION_STATUS_FAILED: +- bytes_transferred = 0; + /* + * If setting the device in RUNNING state fails, the device should + * be reset. To do so, use ERROR state as a recover state. 
+@@ -818,6 +817,11 @@ int64_t vfio_mig_bytes_transferred(void) + return bytes_transferred; + } + ++void vfio_reset_bytes_transferred(void) ++{ ++ bytes_transferred = 0; ++} ++ + int vfio_migration_realize(VFIODevice *vbasedev, Error **errp) + { + int ret = -ENOTSUP; +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 6d1b8487c3..1d19c6f251 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -229,6 +229,7 @@ int vfio_block_multiple_devices_migration(Error **errp); + void vfio_unblock_multiple_devices_migration(void); + int vfio_block_giommu_migration(Error **errp); + int64_t vfio_mig_bytes_transferred(void); ++void vfio_reset_bytes_transferred(void); + + #ifdef CONFIG_LINUX + int vfio_get_region_info(VFIODevice *vbasedev, int index, +diff --git a/migration/migration.c b/migration/migration.c +index 9bf1caee6c..47ad6c43cb 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -1638,6 +1638,7 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, + */ + memset(&ram_counters, 0, sizeof(ram_counters)); + memset(&compression_counters, 0, sizeof(compression_counters)); ++ reset_vfio_bytes_transferred(); + + return true; + } +diff --git a/migration/migration.h b/migration/migration.h +index e9679f8029..7ccf460aa2 100644 +--- a/migration/migration.h ++++ b/migration/migration.h +@@ -495,6 +495,7 @@ bool migration_rate_limit(void); + void migration_cancel(const Error *error); + + void populate_vfio_info(MigrationInfo *info); ++void reset_vfio_bytes_transferred(void); + void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page); + + #endif +diff --git a/migration/savevm.c b/migration/savevm.c +index aff70e6263..83088fc3f8 100644 +--- a/migration/savevm.c ++++ b/migration/savevm.c +@@ -1620,6 +1620,7 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) + migrate_init(ms); + memset(&ram_counters, 0, sizeof(ram_counters)); + memset(&compression_counters, 0, 
sizeof(compression_counters)); ++ reset_vfio_bytes_transferred(); + ms->to_dst_file = f; + + qemu_mutex_unlock_iothread(); +diff --git a/migration/target.c b/migration/target.c +index 00ca007f97..f39c9a8d88 100644 +--- a/migration/target.c ++++ b/migration/target.c +@@ -14,12 +14,25 @@ + #include "hw/vfio/vfio-common.h" + #endif + ++#ifdef CONFIG_VFIO + void populate_vfio_info(MigrationInfo *info) + { +-#ifdef CONFIG_VFIO + if (vfio_mig_active()) { + info->vfio = g_malloc0(sizeof(*info->vfio)); + info->vfio->transferred = vfio_mig_bytes_transferred(); + } +-#endif + } ++ ++void reset_vfio_bytes_transferred(void) ++{ ++ vfio_reset_bytes_transferred(); ++} ++#else ++void populate_vfio_info(MigrationInfo *info) ++{ ++} ++ ++void reset_vfio_bytes_transferred(void) ++{ ++} ++#endif +-- +2.39.3 + diff --git a/kvm-vfio-migration-Return-bool-type-for-vfio_migration_r.patch b/kvm-vfio-migration-Return-bool-type-for-vfio_migration_r.patch new file mode 100644 index 0000000..efd42a9 --- /dev/null +++ b/kvm-vfio-migration-Return-bool-type-for-vfio_migration_r.patch @@ -0,0 +1,125 @@ +From 223eef8363c9ba58514b2d4f93e5ff015d111ff2 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 29/37] vfio/migration: Return bool type for + vfio_migration_realize() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [27/28] d5aea3ea4c53e4573076cbacbbe3134f9f0f9e53 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit d4a2af747d5a +Author: Zhenzhong Duan +Date: Mon Jul 3 15:15:10 2023 +0800 + + vfio/migration: Return bool type for vfio_migration_realize() + + Make vfio_migration_realize() adhere to the convention of other realize() + callbacks(like qdev_realize) by returning bool 
instead of int. + + Suggested-by: Cédric Le Goater + Suggested-by: Joao Martins + Signed-off-by: Zhenzhong Duan + Reviewed-by: Joao Martins + Reviewed-by: Cédric Le Goater + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/migration.c | 15 ++++++++++----- + hw/vfio/pci.c | 3 +-- + include/hw/vfio/vfio-common.h | 2 +- + 3 files changed, 12 insertions(+), 8 deletions(-) + +diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c +index e3954570c8..2674f4bc47 100644 +--- a/hw/vfio/migration.c ++++ b/hw/vfio/migration.c +@@ -846,7 +846,12 @@ void vfio_reset_bytes_transferred(void) + bytes_transferred = 0; + } + +-int vfio_migration_realize(VFIODevice *vbasedev, Error **errp) ++/* ++ * Return true when either migration initialized or blocker registered. ++ * Currently only return false when adding blocker fails which will ++ * de-register vfio device. ++ */ ++bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp) + { + Error *err = NULL; + int ret; +@@ -854,7 +859,7 @@ int vfio_migration_realize(VFIODevice *vbasedev, Error **errp) + if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) { + error_setg(&err, "%s: Migration is disabled for VFIO device", + vbasedev->name); +- return vfio_block_migration(vbasedev, err, errp); ++ return !vfio_block_migration(vbasedev, err, errp); + } + + ret = vfio_migration_init(vbasedev); +@@ -869,7 +874,7 @@ int vfio_migration_realize(VFIODevice *vbasedev, Error **errp) + vbasedev->name, ret, strerror(-ret)); + } + +- return vfio_block_migration(vbasedev, err, errp); ++ return !vfio_block_migration(vbasedev, err, errp); + } + + if (!vbasedev->dirty_pages_supported) { +@@ -896,7 +901,7 @@ int vfio_migration_realize(VFIODevice *vbasedev, Error **errp) + } + + trace_vfio_migration_realize(vbasedev->name); +- return 0; ++ return true; + + add_blocker: + ret = vfio_block_migration(vbasedev, err, errp); +@@ -904,7 +909,7 @@ out_deinit: + if (ret) { + vfio_migration_deinit(vbasedev); + } +- return ret; ++ 
return !ret; + } + + void vfio_migration_exit(VFIODevice *vbasedev) +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index a60b868c38..ba40ca8784 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3231,8 +3231,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + } + + if (!pdev->failover_pair_id) { +- ret = vfio_migration_realize(vbasedev, errp); +- if (ret) { ++ if (!vfio_migration_realize(vbasedev, errp)) { + goto out_deregister; + } + } +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index 45167c8a8a..da43d27352 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -252,7 +252,7 @@ int vfio_spapr_create_window(VFIOContainer *container, + int vfio_spapr_remove_window(VFIOContainer *container, + hwaddr offset_within_address_space); + +-int vfio_migration_realize(VFIODevice *vbasedev, Error **errp); ++bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp); + void vfio_migration_exit(VFIODevice *vbasedev); + + #endif /* HW_VFIO_VFIO_COMMON_H */ +-- +2.39.3 + diff --git a/kvm-vfio-migration-Skip-log_sync-during-migration-SETUP-.patch b/kvm-vfio-migration-Skip-log_sync-during-migration-SETUP-.patch new file mode 100644 index 0000000..6211db7 --- /dev/null +++ b/kvm-vfio-migration-Skip-log_sync-during-migration-SETUP-.patch @@ -0,0 +1,68 @@ +From 76208f7824d5139ac8d86140b0e01031b67638cc Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:56 +0200 +Subject: [PATCH 04/37] vfio/migration: Skip log_sync during migration SETUP + state +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [2/28] 4c340992b472ac4627b57705f4e971f14bbb0846 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit ff180c6bd7a8 +Author: 
Avihai Horon +Date: Mon Apr 3 16:00:00 2023 +0300 + + vfio/migration: Skip log_sync during migration SETUP state + + Currently, VFIO log_sync can be issued while migration is in SETUP + state. However, doing this log_sync is at best redundant and at worst + can fail. + + Redundant -- all RAM is marked dirty in migration SETUP state and is + transferred only after migration is set to ACTIVE state, so doing + log_sync during migration SETUP is pointless. + + Can fail -- there is a time window, between setting migration state to + SETUP and starting dirty tracking by RAM save_live_setup handler, during + which dirty tracking is still not started. Any VFIO log_sync call that + is issued during this time window will fail. For example, this error can + be triggered by migrating a VM when a GUI is active, which constantly + calls log_sync. + + Fix it by skipping VFIO log_sync while migration is in SETUP state. + + Fixes: 758b96b61d5c ("vfio/migrate: Move switch of dirty tracking into vfio_memory_listener") + Signed-off-by: Avihai Horon + Link: https://lore.kernel.org/r/20230403130000.6422-1-avihaih@nvidia.com + Signed-off-by: Alex Williamson + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/common.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/common.c b/hw/vfio/common.c +index 4d01ea3515..78358ede27 100644 +--- a/hw/vfio/common.c ++++ b/hw/vfio/common.c +@@ -478,7 +478,8 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) + VFIODevice *vbasedev; + MigrationState *ms = migrate_get_current(); + +- if (!migration_is_setup_or_active(ms->state)) { ++ if (ms->state != MIGRATION_STATUS_ACTIVE && ++ ms->state != MIGRATION_STATUS_DEVICE) { + return false; + } + +-- +2.39.3 + diff --git a/kvm-vfio-migration-Store-VFIO-migration-flags-in-VFIOMig.patch b/kvm-vfio-migration-Store-VFIO-migration-flags-in-VFIOMig.patch new file mode 100644 index 0000000..2db8511 --- /dev/null +++ 
b/kvm-vfio-migration-Store-VFIO-migration-flags-in-VFIOMig.patch @@ -0,0 +1,70 @@ +From 77353cdafd08562dff9c99e9f3984d12224bee52 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 14/37] vfio/migration: Store VFIO migration flags in + VFIOMigration +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [12/28] 31a9c39e6ee6338a35dc08c3e7f5c1a204166249 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit 6cd1fe11598a +Author: Avihai Horon +Date: Wed Jun 21 14:11:59 2023 +0300 + + vfio/migration: Store VFIO migration flags in VFIOMigration + + VFIO migration flags are queried once in vfio_migration_init(). Store + them in VFIOMigration so they can be used later to check the device's + migration capabilities without re-querying them. + + This will be used in the next patch to check if the device supports + precopy migration. 
+ + Signed-off-by: Avihai Horon + Reviewed-by: Cédric Le Goater + Tested-by: YangHang Liu + Acked-by: Alex Williamson + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/migration.c | 1 + + include/hw/vfio/vfio-common.h | 1 + + 2 files changed, 2 insertions(+) + +diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c +index 235978fd68..8d33414379 100644 +--- a/hw/vfio/migration.c ++++ b/hw/vfio/migration.c +@@ -603,6 +603,7 @@ static int vfio_migration_init(VFIODevice *vbasedev) + migration->vbasedev = vbasedev; + migration->device_state = VFIO_DEVICE_STATE_RUNNING; + migration->data_fd = -1; ++ migration->mig_flags = mig_flags; + + vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev); + +diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h +index eed244f25f..5f29dab839 100644 +--- a/include/hw/vfio/vfio-common.h ++++ b/include/hw/vfio/vfio-common.h +@@ -66,6 +66,7 @@ typedef struct VFIOMigration { + int data_fd; + void *data_buffer; + size_t data_buffer_size; ++ uint64_t mig_flags; + } VFIOMigration; + + typedef struct VFIOAddressSpace { +-- +2.39.3 + diff --git a/kvm-vfio-pci-Call-vfio_prepare_kvm_msi_virq_batch-in-MSI.patch b/kvm-vfio-pci-Call-vfio_prepare_kvm_msi_virq_batch-in-MSI.patch new file mode 100644 index 0000000..b5d9d37 --- /dev/null +++ b/kvm-vfio-pci-Call-vfio_prepare_kvm_msi_virq_batch-in-MSI.patch @@ -0,0 +1,67 @@ +From b5a69101abac153c9c9be7f539d810e3e4af3bdf Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 19/37] vfio/pci: Call vfio_prepare_kvm_msi_virq_batch() in MSI + retry path +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [17/28] 2067bb58f3a2c1a793e5566cee3c78a8299c9c1c 
(clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit c17408892319 +Author: Shameer Kolothum +Date: Tue Jun 13 15:09:43 2023 +0100 + + vfio/pci: Call vfio_prepare_kvm_msi_virq_batch() in MSI retry path + + When vfio_enable_vectors() returns with less than requested nr_vectors + we retry with what kernel reported back. But the retry path doesn't + call vfio_prepare_kvm_msi_virq_batch() and this results in, + + qemu-system-aarch64: vfio: Error: Failed to enable 4 MSI vectors, retry with 1 + qemu-system-aarch64: ../hw/vfio/pci.c:602: vfio_commit_kvm_msi_virq_batch: Assertion `vdev->defer_kvm_irq_routing' failed + + Fixes: dc580d51f7dd ("vfio: defer to commit kvm irq routing when enable msi/msix") + Reviewed-by: Longpeng + Signed-off-by: Shameer Kolothum + Reviewed-by: Cédric Le Goater + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/pci.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 7c5e2b5996..15e7554954 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -666,6 +666,8 @@ static void vfio_msi_enable(VFIOPCIDevice *vdev) + + vfio_disable_interrupts(vdev); + ++ vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev); ++retry: + /* + * Setting vector notifiers needs to enable route for each vector. 
+ * Deferring to commit the KVM routes once rather than per vector +@@ -673,8 +675,6 @@ static void vfio_msi_enable(VFIOPCIDevice *vdev) + */ + vfio_prepare_kvm_msi_virq_batch(vdev); + +- vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev); +-retry: + vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors); + + for (i = 0; i < vdev->nr_vectors; i++) { +-- +2.39.3 + diff --git a/kvm-vfio-pci-Disable-INTx-in-vfio_realize-error-path.patch b/kvm-vfio-pci-Disable-INTx-in-vfio_realize-error-path.patch new file mode 100644 index 0000000..0aca4ef --- /dev/null +++ b/kvm-vfio-pci-Disable-INTx-in-vfio_realize-error-path.patch @@ -0,0 +1,54 @@ +From 816c20b23546d31316c9ca450db8a6668ac6216c Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 25/37] vfio/pci: Disable INTx in vfio_realize error path +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [23/28] 2fde4bad00c4286e6bbe24947c2bfd6468fc0ff3 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit adee0da0368f +Author: Zhenzhong Duan +Date: Mon Jul 3 15:15:06 2023 +0800 + + vfio/pci: Disable INTx in vfio_realize error path + + When vfio realize fails, INTx isn't disabled if it has been enabled. + This may confuse host side with unhandled interrupt report. 
+ + Fixes: c5478fea27ac ("vfio/pci: Respond to KVM irqchip change notifier") + Signed-off-by: Zhenzhong Duan + Reviewed-by: Joao Martins + Reviewed-by: Cédric Le Goater + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/pci.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 87bd440504..2d059832a4 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3244,6 +3244,9 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + return; + + out_deregister: ++ if (vdev->interrupt == VFIO_INT_INTx) { ++ vfio_intx_disable(vdev); ++ } + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); + if (vdev->irqchip_change_notifier.notify) { + kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); +-- +2.39.3 + diff --git a/kvm-vfio-pci-Fix-a-segfault-in-vfio_realize.patch b/kvm-vfio-pci-Fix-a-segfault-in-vfio_realize.patch new file mode 100644 index 0000000..d05d114 --- /dev/null +++ b/kvm-vfio-pci-Fix-a-segfault-in-vfio_realize.patch @@ -0,0 +1,67 @@ +From 0b1ab3aacc02e70bfe8440236eb9def426bbe10e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 22/37] vfio/pci: Fix a segfault in vfio_realize +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [20/28] 48b9c1efe295c2672693d9c99f6d11738d2b98d1 (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit 357bd7932a13 +Author: Zhenzhong Duan +Date: Thu Jun 29 16:40:38 2023 +0800 + + vfio/pci: Fix a segfault in vfio_realize + + The kvm irqchip notifier is only registered if the device supports + INTx, however it's unconditionally removed in vfio realize error + path. 
If the assigned device does not support INTx, this will cause + QEMU to crash when vfio realize fails. Change it to conditionally + remove the notifier only if the notify hook is setup. + + Before fix: + (qemu) device_add vfio-pci,host=81:11.1,id=vfio1,bus=root1,xres=1 + Connection closed by foreign host. + + After fix: + (qemu) device_add vfio-pci,host=81:11.1,id=vfio1,bus=root1,xres=1 + Error: vfio 0000:81:11.1: xres and yres properties require display=on + (qemu) + + Fixes: c5478fea27ac ("vfio/pci: Respond to KVM irqchip change notifier") + Signed-off-by: Zhenzhong Duan + Reviewed-by: Cédric Le Goater + Reviewed-by: Joao Martins + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/pci.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 6634945a70..d08e6c1a20 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3245,7 +3245,9 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + + out_deregister: + pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); +- kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); ++ if (vdev->irqchip_change_notifier.notify) { ++ kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); ++ } + out_teardown: + vfio_teardown_msi(vdev); + vfio_bars_exit(vdev); +-- +2.39.3 + diff --git a/kvm-vfio-pci-Fix-a-use-after-free-issue.patch b/kvm-vfio-pci-Fix-a-use-after-free-issue.patch new file mode 100644 index 0000000..1fa725f --- /dev/null +++ b/kvm-vfio-pci-Fix-a-use-after-free-issue.patch @@ -0,0 +1,56 @@ +From 2437a06ff137c4bc856df096e42407c1f50b25b0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:56 +0200 +Subject: [PATCH 06/37] vfio/pci: Fix a use-after-free issue +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 
+RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [4/28] eca69a7e0a6fb8c1c70be8b91209a53b040e30ba (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit b83b40b61484 +Author: Zhenzhong Duan +Date: Wed May 17 10:46:51 2023 +0800 + + vfio/pci: Fix a use-after-free issue + + vbasedev->name is freed wrongly which leads to garbage VFIO trace log. + Fix it by allocating a dup of vbasedev->name and then free the dup. + + Fixes: 2dca1b37a760 ("vfio/pci: add support for VF token") + Suggested-by: Alex Williamson + Signed-off-by: Zhenzhong Duan + Reviewed-by: Cédric Le Goater + Reviewed-by: Matthew Rosato + Acked-by: Alex Williamson + Reviewed-by: Philippe Mathieu-Daudé + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/pci.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 6cd3a98c39..7c5e2b5996 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3018,7 +3018,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + qemu_uuid_unparse(&vdev->vf_token, uuid); + name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid); + } else { +- name = vbasedev->name; ++ name = g_strdup(vbasedev->name); + } + + ret = vfio_get_device(group, name, vbasedev, errp); +-- +2.39.3 + diff --git a/kvm-vfio-pci-Free-leaked-timer-in-vfio_realize-error-pat.patch b/kvm-vfio-pci-Free-leaked-timer-in-vfio_realize-error-pat.patch new file mode 100644 index 0000000..3978b96 --- /dev/null +++ b/kvm-vfio-pci-Free-leaked-timer-in-vfio_realize-error-pat.patch @@ -0,0 +1,55 @@ +From 9c5016c9b3f9cf66d1b531de829e8b5010962695 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:57 +0200 +Subject: [PATCH 23/37] vfio/pci: Free leaked timer in vfio_realize error path +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live 
migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [21/28] dbaae4e484de4613f7f7735be519b7357627326e (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit 0cc889c8826c +Author: Zhenzhong Duan +Date: Thu Jun 29 16:40:39 2023 +0800 + + vfio/pci: Free leaked timer in vfio_realize error path + + When vfio_realize fails, the mmap_timer used for INTx optimization + isn't freed. As this timer isn't activated yet, the potential impact + is just a piece of leaked memory. + + Fixes: ea486926b07d ("vfio-pci: Update slow path INTx algorithm timer related") + Signed-off-by: Zhenzhong Duan + Reviewed-by: Cédric Le Goater + Reviewed-by: Joao Martins + Signed-off-by: Cédric Le Goater + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/pci.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index d08e6c1a20..87bd440504 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -3248,6 +3248,9 @@ out_deregister: + if (vdev->irqchip_change_notifier.notify) { + kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); + } ++ if (vdev->intx.mmap_timer) { ++ timer_free(vdev->intx.mmap_timer); ++ } + out_teardown: + vfio_teardown_msi(vdev); + vfio_bars_exit(vdev); +-- +2.39.3 + diff --git a/kvm-vfio-pci-Static-Resizable-BAR-capability.patch b/kvm-vfio-pci-Static-Resizable-BAR-capability.patch new file mode 100644 index 0000000..d937140 --- /dev/null +++ b/kvm-vfio-pci-Static-Resizable-BAR-capability.patch @@ -0,0 +1,141 @@ +From db53345dba5682c3ba0bc3fc596b30a98dadb88f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:56 +0200 +Subject: [PATCH 05/37] vfio/pci: Static Resizable BAR capability +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger 
+RH-Acked-by: Miroslav Rezanina +RH-Commit: [3/28] 42e9f4b517eb919c77c6fdbe771d9d05a91955bd (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit b5048a4cbfa0 +Author: Alex Williamson +Date: Thu May 4 14:42:48 2023 -0600 + + vfio/pci: Static Resizable BAR capability + + The PCI Resizable BAR (ReBAR) capability is currently hidden from the + VM because the protocol for interacting with the capability does not + support a mechanism for the device to reject an advertised supported + BAR size. However, when assigned to a VM, the act of resizing the + BAR requires adjustment of host resources for the device, which + absolutely can fail. Linux does not currently allow us to reserve + resources for the device independent of the current usage. + + The only writable field within the ReBAR capability is the BAR Size + register. The PCIe spec indicates that when written, the device + should immediately begin to operate with the provided BAR size. The + spec however also notes that software must only write values + corresponding to supported sizes as indicated in the capability and + control registers. Writing unsupported sizes produces undefined + results. Therefore, if the hypervisor were to virtualize the + capability and control registers such that the current size is the + only indicated available size, then a write of anything other than + the current size falls into the category of undefined behavior, + where we can essentially expose the modified ReBAR capability as + read-only. + + This may seem pointless, but users have reported that virtualizing + the capability in this way not only allows guest software to expose + related features as available (even if only cosmetic), but in some + scenarios can resolve guest driver issues. Additionally, no + regressions in behavior have been reported for this change. 
+ + A caveat here is that the PCIe spec requires for compatibility that + devices report support for a size in the range of 1MB to 512GB, + therefore if the current BAR size falls outside that range we revert + to hiding the capability. + + Reviewed-by: Cédric Le Goater + Link: https://lore.kernel.org/r/20230505232308.2869912-1-alex.williamson@redhat.com + Signed-off-by: Alex Williamson + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/pci.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 53 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index 579b92a6ed..6cd3a98c39 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2069,6 +2069,54 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp) + return 0; + } + ++static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos) ++{ ++ uint32_t ctrl; ++ int i, nbar; ++ ++ ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL); ++ nbar = (ctrl & PCI_REBAR_CTRL_NBAR_MASK) >> PCI_REBAR_CTRL_NBAR_SHIFT; ++ ++ for (i = 0; i < nbar; i++) { ++ uint32_t cap; ++ int size; ++ ++ ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL + (i * 8)); ++ size = (ctrl & PCI_REBAR_CTRL_BAR_SIZE) >> PCI_REBAR_CTRL_BAR_SHIFT; ++ ++ /* The cap register reports sizes 1MB to 128TB, with 4 reserved bits */ ++ cap = size <= 27 ? 1U << (size + 4) : 0; ++ ++ /* ++ * The PCIe spec (v6.0.1, 7.8.6) requires HW to support at least one ++ * size in the range 1MB to 512GB. We intend to mask all sizes except ++ * the one currently enabled in the size field, therefore if it's ++ * outside the range, hide the whole capability as this virtualization ++ * trick won't work. If >512GB resizable BARs start to appear, we ++ * might need an opt-in or reservation scheme in the kernel. ++ */ ++ if (!(cap & PCI_REBAR_CAP_SIZES)) { ++ return -EINVAL; ++ } ++ ++ /* Hide all sizes reported in the ctrl reg per above requirement. 
*/ ++ ctrl &= (PCI_REBAR_CTRL_BAR_SIZE | ++ PCI_REBAR_CTRL_NBAR_MASK | ++ PCI_REBAR_CTRL_BAR_IDX); ++ ++ /* ++ * The BAR size field is RW, however we've mangled the capability ++ * register such that we only report a single size, ie. the current ++ * BAR size. A write of an unsupported value is undefined, therefore ++ * the register field is essentially RO. ++ */ ++ vfio_add_emulated_long(vdev, pos + PCI_REBAR_CAP + (i * 8), cap, ~0); ++ vfio_add_emulated_long(vdev, pos + PCI_REBAR_CTRL + (i * 8), ctrl, ~0); ++ } ++ ++ return 0; ++} ++ + static void vfio_add_ext_cap(VFIOPCIDevice *vdev) + { + PCIDevice *pdev = &vdev->pdev; +@@ -2142,9 +2190,13 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev) + case 0: /* kernel masked capability */ + case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */ + case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */ +- case PCI_EXT_CAP_ID_REBAR: /* Can't expose read-only */ + trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next); + break; ++ case PCI_EXT_CAP_ID_REBAR: ++ if (!vfio_setup_rebar_ecap(vdev, next)) { ++ pcie_add_capability(pdev, cap_id, cap_ver, next, size); ++ } ++ break; + default: + pcie_add_capability(pdev, cap_id, cap_ver, next, size); + } +-- +2.39.3 + diff --git a/kvm-vfio-pci-add-support-for-VF-token.patch b/kvm-vfio-pci-add-support-for-VF-token.patch new file mode 100644 index 0000000..7b40e5e --- /dev/null +++ b/kvm-vfio-pci-add-support-for-VF-token.patch @@ -0,0 +1,104 @@ +From 3022cc31bca5a5441e285c971eaf72b7643b9be0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= +Date: Wed, 12 Jul 2023 17:46:56 +0200 +Subject: [PATCH 03/37] vfio/pci: add support for VF token +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Cédric Le Goater +RH-MergeRequest: 179: vfio: live migration support +RH-Bugzilla: 2192818 +RH-Acked-by: Eric Auger +RH-Acked-by: Miroslav Rezanina +RH-Commit: [1/28] 
ff24284ede2806e21f4f6709d8abd4c4029b7d5c (clegoate/qemu-kvm-c9s) + +Bugzilla: https://bugzilla.redhat.com/2192818 + +commit 2dca1b37a760 +Author: Minwoo Im +Date: Mon Mar 20 16:35:22 2023 +0900 + + vfio/pci: add support for VF token + + VF token was introduced [1] to kernel vfio-pci along with SR-IOV + support [2]. This patch adds support VF token among PF and VF(s). To + passthu PCIe VF to a VM, kernel >= v5.7 needs this. + + It can be configured with UUID like: + + -device vfio-pci,host=DDDD:BB:DD:F,vf-token=,... + + [1] https://lore.kernel.org/linux-pci/158396393244.5601.10297430724964025753.stgit@gimli.home/ + [2] https://lore.kernel.org/linux-pci/158396044753.5601.14804870681174789709.stgit@gimli.home/ + + Cc: Alex Williamson + Signed-off-by: Minwoo Im + Reviewed-by: Klaus Jensen + Link: https://lore.kernel.org/r/20230320073522epcms2p48f682ecdb73e0ae1a4850ad0712fd780@epcms2p4 + Signed-off-by: Alex Williamson + +Conflicts: + - hw/vfio/pci.c + context changes in vfio_realize () due to redhat commit 267071d16b23 + ("vfio: cap number of devices that can be assigned") + +Signed-off-by: Cédric Le Goater +--- + hw/vfio/pci.c | 13 ++++++++++++- + hw/vfio/pci.h | 1 + + 2 files changed, 13 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c +index a779053be3..579b92a6ed 100644 +--- a/hw/vfio/pci.c ++++ b/hw/vfio/pci.c +@@ -2859,6 +2859,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + int groupid; + int ret, i = 0; + bool is_mdev; ++ char uuid[UUID_FMT_LEN]; ++ char *name; + + if (device_limit && device_limit != vdev->assigned_device_limit) { + error_setg(errp, "Assigned device limit has been redefined. 
" +@@ -2960,7 +2962,15 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) + goto error; + } + +- ret = vfio_get_device(group, vbasedev->name, vbasedev, errp); ++ if (!qemu_uuid_is_null(&vdev->vf_token)) { ++ qemu_uuid_unparse(&vdev->vf_token, uuid); ++ name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid); ++ } else { ++ name = vbasedev->name; ++ } ++ ++ ret = vfio_get_device(group, name, vbasedev, errp); ++ g_free(name); + if (ret) { + vfio_put_group(group); + goto error; +@@ -3292,6 +3302,7 @@ static void vfio_instance_init(Object *obj) + + static Property vfio_pci_dev_properties[] = { + DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), ++ DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token), + DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev), + DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice, + vbasedev.pre_copy_dirty_page_tracking, +diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h +index 45235d38ba..10530743ad 100644 +--- a/hw/vfio/pci.h ++++ b/hw/vfio/pci.h +@@ -137,6 +137,7 @@ struct VFIOPCIDevice { + VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */ + void *igd_opregion; + PCIHostDeviceAddress host; ++ QemuUUID vf_token; + EventNotifier err_notifier; + EventNotifier req_notifier; + int (*resetfn)(struct VFIOPCIDevice *); +-- +2.39.3 + diff --git a/kvm-virtio-iommu-Fix-64kB-host-page-size-VFIO-device-ass.patch b/kvm-virtio-iommu-Fix-64kB-host-page-size-VFIO-device-ass.patch new file mode 100644 index 0000000..acfb3ae --- /dev/null +++ b/kvm-virtio-iommu-Fix-64kB-host-page-size-VFIO-device-ass.patch @@ -0,0 +1,151 @@ +From 08c8af80dbd03b46a6a8397ef0c41cda3e6de22c Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Wed, 5 Jul 2023 18:51:17 +0200 +Subject: [PATCH 01/37] virtio-iommu: Fix 64kB host page size VFIO device + assignment + +RH-Author: Eric Auger +RH-MergeRequest: 182: VIRTIO-IOMMU/VFIO page size related fixes +RH-Bugzilla: 2211609 2211634 +RH-Acked-by: Gavin Shan +RH-Acked-by: Sebastian Ott 
+RH-Commit: [1/2] b48db1c964559505dda4c6c9a3b79d68207b25eb (eauger1/centos-qemu-kvm) + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2211634 + +When running on a 64kB page size host and protecting a VFIO device +with the virtio-iommu, qemu crashes with this kind of message: + +qemu-kvm: virtio-iommu page mask 0xfffffffffffff000 is incompatible +with mask 0x20010000 +qemu: hardware error: vfio: DMA mapping failed, unable to continue + +This is due to the fact the IOMMU MR corresponding to the VFIO device +is enabled very late on domain attach, after the machine init. +The device reports a minimal 64kB page size but it is too late to be +applied. virtio_iommu_set_page_size_mask() fails and this causes +vfio_listener_region_add() to end up with hw_error(); + +To work around this issue, we transiently enable the IOMMU MR on +machine init to collect the page size requirements and then restore +the bypass state. + +Fixes: 90519b9053 ("virtio-iommu: Add bypass mode support to assigned device") +Signed-off-by: Eric Auger + +Message-Id: <20230705165118.28194-2-eric.auger@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. 
Tsirkin +Reviewed-by: Jean-Philippe Brucker +Tested-by: Jean-Philippe Brucker +Reviewed-by: Zhenzhong Duan +(cherry picked from commit 94df5b2180d61fb2ee2b04cc007981e58b6479a9) +Signed-off-by: Eric Auger +--- + hw/virtio/trace-events | 1 + + hw/virtio/virtio-iommu.c | 31 +++++++++++++++++++++++++++++-- + include/hw/virtio/virtio-iommu.h | 2 ++ + 3 files changed, 32 insertions(+), 2 deletions(-) + +diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events +index 8f8d05cf9b..68b752e304 100644 +--- a/hw/virtio/trace-events ++++ b/hw/virtio/trace-events +@@ -131,6 +131,7 @@ virtio_iommu_set_page_size_mask(const char *name, uint64_t old, uint64_t new) "m + virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s" + virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s" + virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" ++virtio_iommu_freeze_granule(uint64_t page_size_mask) "granule set to 0x%"PRIx64 + + # virtio-mem.c + virtio_mem_send_response(uint16_t type) "type=%" PRIu16 +diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c +index 1cd258135d..542679b321 100644 +--- a/hw/virtio/virtio-iommu.c ++++ b/hw/virtio/virtio-iommu.c +@@ -24,6 +24,7 @@ + #include "hw/virtio/virtio.h" + #include "sysemu/kvm.h" + #include "sysemu/reset.h" ++#include "sysemu/sysemu.h" + #include "qapi/error.h" + #include "qemu/error-report.h" + #include "trace.h" +@@ -1106,12 +1107,12 @@ static int virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr, + } + + /* +- * After the machine is finalized, we can't change the mask anymore. If by ++ * Once the granule is frozen we can't change the mask anymore. If by + * chance the hotplugged device supports the same granule, we can still + * accept it. Having a different masks is possible but the guest will use + * sub-optimal block sizes, so warn about it. 
+ */ +- if (phase_check(PHASE_MACHINE_READY)) { ++ if (s->granule_frozen) { + int new_granule = ctz64(new_mask); + int cur_granule = ctz64(cur_mask); + +@@ -1146,6 +1147,28 @@ static void virtio_iommu_system_reset(void *opaque) + + } + ++static void virtio_iommu_freeze_granule(Notifier *notifier, void *data) ++{ ++ VirtIOIOMMU *s = container_of(notifier, VirtIOIOMMU, machine_done); ++ int granule; ++ ++ if (likely(s->config.bypass)) { ++ /* ++ * Transient IOMMU MR enable to collect page_size_mask requirements ++ * through memory_region_iommu_set_page_size_mask() called by ++ * VFIO region_add() callback ++ */ ++ s->config.bypass = false; ++ virtio_iommu_switch_address_space_all(s); ++ /* restore default */ ++ s->config.bypass = true; ++ virtio_iommu_switch_address_space_all(s); ++ } ++ s->granule_frozen = true; ++ granule = ctz64(s->config.page_size_mask); ++ trace_virtio_iommu_freeze_granule(BIT(granule)); ++} ++ + static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) + { + VirtIODevice *vdev = VIRTIO_DEVICE(dev); +@@ -1189,6 +1212,9 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) + error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!"); + } + ++ s->machine_done.notify = virtio_iommu_freeze_granule; ++ qemu_add_machine_init_done_notifier(&s->machine_done); ++ + qemu_register_reset(virtio_iommu_system_reset, s); + } + +@@ -1198,6 +1224,7 @@ static void virtio_iommu_device_unrealize(DeviceState *dev) + VirtIOIOMMU *s = VIRTIO_IOMMU(dev); + + qemu_unregister_reset(virtio_iommu_system_reset, s); ++ qemu_remove_machine_init_done_notifier(&s->machine_done); + + g_hash_table_destroy(s->as_by_busptr); + if (s->domains) { +diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h +index 2ad5ee320b..a93fc5383e 100644 +--- a/include/hw/virtio/virtio-iommu.h ++++ b/include/hw/virtio/virtio-iommu.h +@@ -61,6 +61,8 @@ struct VirtIOIOMMU { + QemuRecMutex mutex; + GTree *endpoints; + bool boot_bypass; 
++ Notifier machine_done; ++ bool granule_frozen; + }; + + #endif +-- +2.39.3 + diff --git a/kvm-virtio-iommu-Rework-the-traces-in-virtio_iommu_set_p.patch b/kvm-virtio-iommu-Rework-the-traces-in-virtio_iommu_set_p.patch new file mode 100644 index 0000000..7934a12 --- /dev/null +++ b/kvm-virtio-iommu-Rework-the-traces-in-virtio_iommu_set_p.patch @@ -0,0 +1,83 @@ +From 643d93343759a350fe0f6327d308bf6a93c79d25 Mon Sep 17 00:00:00 2001 +From: Eric Auger +Date: Wed, 5 Jul 2023 18:51:18 +0200 +Subject: [PATCH 02/37] virtio-iommu: Rework the traces in + virtio_iommu_set_page_size_mask() + +RH-Author: Eric Auger +RH-MergeRequest: 182: VIRTIO-IOMMU/VFIO page size related fixes +RH-Bugzilla: 2211609 2211634 +RH-Acked-by: Gavin Shan +RH-Acked-by: Sebastian Ott +RH-Commit: [2/2] 0af7078dde158f07c83e2b293adc5d9d475688ae (eauger1/centos-qemu-kvm) + +Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2211609 + +The current error messages in virtio_iommu_set_page_size_mask() +sound quite similar for different situations and miss the IOMMU +memory region that causes the issue. + +Clarify them and rework the comment. + +Also remove the trace when the new page_size_mask is not applied as +the current frozen granule is kept. This message is rather confusing +for the end user and anyway the current granule would have been used +by the driver. + +Signed-off-by: Eric Auger +Reviewed-by: Zhenzhong Duan +Message-Id: <20230705165118.28194-3-eric.auger@redhat.com> +Reviewed-by: Michael S. Tsirkin +Signed-off-by: Michael S. 
Tsirkin +Reviewed-by: Jean-Philippe Brucker +Tested-by: Jean-Philippe Brucker +(cherry picked from commit 587a7641d53055054d68d67d94c9408ef808f127) +Signed-off-by: Eric Auger +--- + hw/virtio/virtio-iommu.c | 19 +++++++------------ + 1 file changed, 7 insertions(+), 12 deletions(-) + +diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c +index 542679b321..421e2a944f 100644 +--- a/hw/virtio/virtio-iommu.c ++++ b/hw/virtio/virtio-iommu.c +@@ -1101,29 +1101,24 @@ static int virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr, + new_mask); + + if ((cur_mask & new_mask) == 0) { +- error_setg(errp, "virtio-iommu page mask 0x%"PRIx64 +- " is incompatible with mask 0x%"PRIx64, cur_mask, new_mask); ++ error_setg(errp, "virtio-iommu %s reports a page size mask 0x%"PRIx64 ++ " incompatible with currently supported mask 0x%"PRIx64, ++ mr->parent_obj.name, new_mask, cur_mask); + return -1; + } + + /* + * Once the granule is frozen we can't change the mask anymore. If by + * chance the hotplugged device supports the same granule, we can still +- * accept it. Having a different masks is possible but the guest will use +- * sub-optimal block sizes, so warn about it. ++ * accept it. 
+ */ + if (s->granule_frozen) { +- int new_granule = ctz64(new_mask); + int cur_granule = ctz64(cur_mask); + +- if (new_granule != cur_granule) { +- error_setg(errp, "virtio-iommu page mask 0x%"PRIx64 +- " is incompatible with mask 0x%"PRIx64, cur_mask, +- new_mask); ++ if (!(BIT(cur_granule) & new_mask)) { ++ error_setg(errp, "virtio-iommu %s does not support frozen granule 0x%llx", ++ mr->parent_obj.name, BIT_ULL(cur_granule)); + return -1; +- } else if (new_mask != cur_mask) { +- warn_report("virtio-iommu page mask 0x%"PRIx64 +- " does not match 0x%"PRIx64, cur_mask, new_mask); + } + return 0; + } +-- +2.39.3 + diff --git a/qemu-kvm.spec b/qemu-kvm.spec index ad8ea7e..04fe870 100644 --- a/qemu-kvm.spec +++ b/qemu-kvm.spec @@ -100,7 +100,7 @@ %endif %global target_list %{kvm_target}-softmmu -%global block_drivers_rw_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,compress,virtio-blk-vdpa-blk,virtio-blk-vfio-pci,virtio-blk-vhost-user,io_uring,nvme-io_uring +%global block_drivers_rw_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,compress,virtio-blk-vhost-vdpa,virtio-blk-vfio-pci,virtio-blk-vhost-user,io_uring,nvme-io_uring %global block_drivers_ro_list vdi,vmdk,vhdx,vpc,https %define qemudocdir %{_docdir}/%{name} %global firmwaredirs "%{_datadir}/qemu-firmware:%{_datadir}/ipxe/qemu:%{_datadir}/seavgabios:%{_datadir}/seabios" @@ -149,7 +149,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}:%{version} \ Summary: QEMU is a machine emulator and virtualizer Name: qemu-kvm Version: 8.0.0 -Release: 7%{?rcrel}%{?dist}%{?cc_suffix} +Release: 8%{?rcrel}%{?dist}%{?cc_suffix} # Epoch because we pushed a qemu-1.0 package. 
AIUI this can't ever be dropped # Epoch 15 used for RHEL 8 # Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5) @@ -372,6 +372,80 @@ Patch108: kvm-vhost-fix-vhost_dev_enable_notifiers-error-case.patch Patch109: kvm-kvm-reuse-per-vcpu-stats-fd-to-avoid-vcpu-interrupti.patch # For bz#2128929 - [rhel9.2] hotplug/hotunplug mlx vdpa device to the occupied addr port, then qemu core dump occurs after shutdown guest Patch110: kvm-vhost-vdpa-do-not-cleanup-the-vdpa-vhost-net-structu.patch +# For bz#2211609 - With virtio-iommu and vfio-pci, qemu reports "warning: virtio-iommu page mask 0xfffffffffffff000 does not match 0x40201000" +# For bz#2211634 - [aarch64] With virtio-iommu and vfio-pci, qemu coredump when host using kernel-64k package +Patch111: kvm-virtio-iommu-Fix-64kB-host-page-size-VFIO-device-ass.patch +# For bz#2211609 - With virtio-iommu and vfio-pci, qemu reports "warning: virtio-iommu page mask 0xfffffffffffff000 does not match 0x40201000" +# For bz#2211634 - [aarch64] With virtio-iommu and vfio-pci, qemu coredump when host using kernel-64k package +Patch112: kvm-virtio-iommu-Rework-the-traces-in-virtio_iommu_set_p.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch113: kvm-vfio-pci-add-support-for-VF-token.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch114: kvm-vfio-migration-Skip-log_sync-during-migration-SETUP-.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch115: kvm-vfio-pci-Static-Resizable-BAR-capability.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch116: kvm-vfio-pci-Fix-a-use-after-free-issue.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch117: kvm-util-vfio-helpers-Use-g_file_read_link.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch118: kvm-migration-Make-all-functions-check-have-the-same-for.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch119: kvm-migration-Move-migration_properties-to-options.c.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch120: 
kvm-migration-Add-switchover-ack-capability.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch121: kvm-migration-Implement-switchover-ack-logic.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch122: kvm-migration-Enable-switchover-ack-capability.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch123: kvm-vfio-migration-Refactor-vfio_save_block-to-return-sa.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch124: kvm-vfio-migration-Store-VFIO-migration-flags-in-VFIOMig.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch125: kvm-vfio-migration-Add-VFIO-migration-pre-copy-support.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch126: kvm-vfio-migration-Add-support-for-switchover-ack-capabi.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch127: kvm-vfio-Implement-a-common-device-info-helper.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch128: kvm-hw-vfio-pci-quirks-Support-alternate-offset-for-GPUD.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch129: kvm-vfio-pci-Call-vfio_prepare_kvm_msi_virq_batch-in-MSI.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch130: kvm-vfio-migration-Reset-bytes_transferred-properly.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch131: kvm-vfio-migration-Make-VFIO-migration-non-experimental.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch132: kvm-vfio-pci-Fix-a-segfault-in-vfio_realize.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch133: kvm-vfio-pci-Free-leaked-timer-in-vfio_realize-error-pat.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch134: kvm-hw-vfio-pci-quirks-Sanitize-capability-pointer.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch135: kvm-vfio-pci-Disable-INTx-in-vfio_realize-error-path.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch136: kvm-vfio-migration-Change-vIOMMU-blocker-from-global-to-.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch137: 
kvm-vfio-migration-Free-resources-when-vfio_migration_re.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch138: kvm-vfio-migration-Remove-print-of-Migration-disabled.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch139: kvm-vfio-migration-Return-bool-type-for-vfio_migration_r.patch +# For bz#2192818 - [VFIO LM] Live migration +Patch140: kvm-vfio-Fix-null-pointer-dereference-bug-in-vfio_bars_f.patch +# For bz#2220866 - Misaligned symbol for s390-ccw image during qemu-kvm build +Patch141: kvm-pc-bios-s390-ccw-Makefile-Use-z-noexecstack-to-silen.patch +# For bz#2220866 - Misaligned symbol for s390-ccw image during qemu-kvm build +Patch142: kvm-pc-bios-s390-ccw-Fix-indentation-in-start.S.patch +# For bz#2220866 - Misaligned symbol for s390-ccw image during qemu-kvm build +Patch143: kvm-pc-bios-s390-ccw-Provide-space-for-initial-stack-fra.patch +# For bz#2220866 - Misaligned symbol for s390-ccw image during qemu-kvm build +Patch144: kvm-pc-bios-s390-ccw-Don-t-use-__bss_start-with-the-larl.patch +# For bz#2222579 - PNG screendump doesn't save screen correctly +Patch145: kvm-ui-Fix-pixel-colour-channel-order-for-PNG-screenshot.patch +# For bz#2213317 - Enable libblkio-based block drivers in QEMU +Patch146: kvm-block-blkio-fix-module_block.py-parsing.patch %if %{have_clang} BuildRequires: clang @@ -1412,6 +1486,57 @@ useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \ %endif %changelog +* Mon Jul 17 2023 Miroslav Rezanina - 8.0.0-8 +- kvm-virtio-iommu-Fix-64kB-host-page-size-VFIO-device-ass.patch [bz#2211609 bz#2211634] +- kvm-virtio-iommu-Rework-the-traces-in-virtio_iommu_set_p.patch [bz#2211609 bz#2211634] +- kvm-vfio-pci-add-support-for-VF-token.patch [bz#2192818] +- kvm-vfio-migration-Skip-log_sync-during-migration-SETUP-.patch [bz#2192818] +- kvm-vfio-pci-Static-Resizable-BAR-capability.patch [bz#2192818] +- kvm-vfio-pci-Fix-a-use-after-free-issue.patch [bz#2192818] +- kvm-util-vfio-helpers-Use-g_file_read_link.patch [bz#2192818] +- 
kvm-migration-Make-all-functions-check-have-the-same-for.patch [bz#2192818] +- kvm-migration-Move-migration_properties-to-options.c.patch [bz#2192818] +- kvm-migration-Add-switchover-ack-capability.patch [bz#2192818] +- kvm-migration-Implement-switchover-ack-logic.patch [bz#2192818] +- kvm-migration-Enable-switchover-ack-capability.patch [bz#2192818] +- kvm-vfio-migration-Refactor-vfio_save_block-to-return-sa.patch [bz#2192818] +- kvm-vfio-migration-Store-VFIO-migration-flags-in-VFIOMig.patch [bz#2192818] +- kvm-vfio-migration-Add-VFIO-migration-pre-copy-support.patch [bz#2192818] +- kvm-vfio-migration-Add-support-for-switchover-ack-capabi.patch [bz#2192818] +- kvm-vfio-Implement-a-common-device-info-helper.patch [bz#2192818] +- kvm-hw-vfio-pci-quirks-Support-alternate-offset-for-GPUD.patch [bz#2192818] +- kvm-vfio-pci-Call-vfio_prepare_kvm_msi_virq_batch-in-MSI.patch [bz#2192818] +- kvm-vfio-migration-Reset-bytes_transferred-properly.patch [bz#2192818] +- kvm-vfio-migration-Make-VFIO-migration-non-experimental.patch [bz#2192818] +- kvm-vfio-pci-Fix-a-segfault-in-vfio_realize.patch [bz#2192818] +- kvm-vfio-pci-Free-leaked-timer-in-vfio_realize-error-pat.patch [bz#2192818] +- kvm-hw-vfio-pci-quirks-Sanitize-capability-pointer.patch [bz#2192818] +- kvm-vfio-pci-Disable-INTx-in-vfio_realize-error-path.patch [bz#2192818] +- kvm-vfio-migration-Change-vIOMMU-blocker-from-global-to-.patch [bz#2192818] +- kvm-vfio-migration-Free-resources-when-vfio_migration_re.patch [bz#2192818] +- kvm-vfio-migration-Remove-print-of-Migration-disabled.patch [bz#2192818] +- kvm-vfio-migration-Return-bool-type-for-vfio_migration_r.patch [bz#2192818] +- kvm-vfio-Fix-null-pointer-dereference-bug-in-vfio_bars_f.patch [bz#2192818] +- kvm-pc-bios-s390-ccw-Makefile-Use-z-noexecstack-to-silen.patch [bz#2220866] +- kvm-pc-bios-s390-ccw-Fix-indentation-in-start.S.patch [bz#2220866] +- kvm-pc-bios-s390-ccw-Provide-space-for-initial-stack-fra.patch [bz#2220866] +- 
kvm-pc-bios-s390-ccw-Don-t-use-__bss_start-with-the-larl.patch [bz#2220866] +- kvm-ui-Fix-pixel-colour-channel-order-for-PNG-screenshot.patch [bz#2222579] +- kvm-block-blkio-fix-module_block.py-parsing.patch [bz#2213317] +- kvm-Fix-virtio-blk-vhost-vdpa-typo-in-spec-file.patch [bz#2213317] +- Resolves: bz#2211609 + (With virtio-iommu and vfio-pci, qemu reports "warning: virtio-iommu page mask 0xfffffffffffff000 does not match 0x40201000") +- Resolves: bz#2211634 + ([aarch64] With virtio-iommu and vfio-pci, qemu coredump when host using kernel-64k package) +- Resolves: bz#2192818 + ([VFIO LM] Live migration) +- Resolves: bz#2220866 + (Misaligned symbol for s390-ccw image during qemu-kvm build) +- Resolves: bz#2222579 + (PNG screendump doesn't save screen correctly) +- Resolves: bz#2213317 + (Enable libblkio-based block drivers in QEMU) + * Mon Jul 10 2023 Miroslav Rezanina - 8.0.0-7 - kvm-numa-Validate-cluster-and-NUMA-node-boundary-if-requ.patch [bz#2171363] - kvm-hw-arm-Validate-cluster-and-NUMA-node-boundary.patch [bz#2171363]