From 4de7fac8a36e4e9f4d8801849e2de903947de5a0 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 8 Dec 2025 17:41:34 -0500 Subject: [PATCH] * Mon Dec 08 2025 Jon Maloy - 10.1.0-8 - kvm-file-posix-Handle-suspended-dm-multipath-better-for-.patch [RHEL-133303] - Resolves: RHEL-133303 (The VM hit io error when do S3-PR integration on the pass-through failover multipath device [rhel-9]) --- ...e-suspended-dm-multipath-better-for-.patch | 122 ++++++++++++++++++ qemu-kvm.spec | 9 +- 2 files changed, 130 insertions(+), 1 deletion(-) create mode 100644 kvm-file-posix-Handle-suspended-dm-multipath-better-for-.patch diff --git a/kvm-file-posix-Handle-suspended-dm-multipath-better-for-.patch b/kvm-file-posix-Handle-suspended-dm-multipath-better-for-.patch new file mode 100644 index 0000000..384fa9a --- /dev/null +++ b/kvm-file-posix-Handle-suspended-dm-multipath-better-for-.patch @@ -0,0 +1,122 @@ +From 1005d2951e4b62b07e687d570dfdda6f82461a3f Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Fri, 28 Nov 2025 23:14:40 +0100 +Subject: [PATCH] file-posix: Handle suspended dm-multipath better for SG_IO + +RH-Author: Kevin Wolf +RH-MergeRequest: 438: file-posix: Handle suspended dm-multipath better for SG_IO +RH-Jira: RHEL-133303 +RH-Acked-by: Hanna Czenczek +RH-Acked-by: Stefan Hajnoczi +RH-Commit: [1/1] 575791e00334cf288a89855552967a6eca8c44dd (kmwolf/centos-qemu-kvm) + +When introducing DM_MPATH_PROBE_PATHS, we already anticipated that +dm-multipath devices might be suspended for a short time when the DM +tables are reloaded and that they return -EAGAIN in this case. We then +wait for a millisecond and retry. + +However, meanwhile it has also turned out that libmpathpersist (which is +used by qemu-pr-helper) may need to perform more complex recovery +operations to get reservations back to expected state if a path failure +happened in the middle of a PR operation. In this case, the device is +suspended for a longer time compared to the case we originally expected. + +This patch changes hdev_co_ioctl() to treat -EAGAIN separately so that +it doesn't result in an immediate failure if the device is suspended for +more than 1ms, and moves to incremental backoff to cover both quick and +slow cases without excessive delays. + +Buglink: https://issues.redhat.com/browse/RHEL-121543 +Signed-off-by: Kevin Wolf +Message-ID: <20251128221440.89125-1-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 2c3165a1a61c299b4a3ae30899e1cc738d20e004) +Signed-off-by: Kevin Wolf +--- + block/file-posix.c | 56 ++++++++++++++++++++++++++++------------------ + 1 file changed, 34 insertions(+), 22 deletions(-) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 8c738674ce..ace0e23df2 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -4284,25 +4284,8 @@ hdev_open_Mac_error: + static bool coroutine_fn sgio_path_error(int ret, sg_io_hdr_t *io_hdr) + { + if (ret < 0) { +- switch (ret) { +- case -ENODEV: +- return true; +- case -EAGAIN: +- /* +- * The device is probably suspended. This happens while the dm table +- * is reloaded, e.g. because a path is added or removed. This is an +- * operation that should complete within 1ms, so just wait a bit and +- * retry. +- * +- * If the device was suspended for another reason, we'll wait and +- * retry SG_IO_MAX_RETRIES times. This is a tolerable delay before +- * we return an error and potentially stop the VM. +- */ +- qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000); +- return true; +- default: +- return false; +- } ++ /* Path errors sometimes result in -ENODEV */ ++ return ret == -ENODEV; + } + + if (io_hdr->host_status != SCSI_HOST_OK) { +@@ -4371,6 +4354,7 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) + { + BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; ++ uint64_t eagain_sleep_ns = 1 * SCALE_MS; + int retries = SG_IO_MAX_RETRIES; + int ret; + +@@ -4399,9 +4383,37 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) + }, + }; + +- do { +- ret = raw_thread_pool_submit(handle_aiocb_ioctl, &acb); +- } while (req == SG_IO && retries-- && hdev_co_ioctl_sgio_retry(&acb, ret)); ++retry: ++ ret = raw_thread_pool_submit(handle_aiocb_ioctl, &acb); ++ if (req == SG_IO && s->use_mpath) { ++ if (ret == -EAGAIN && eagain_sleep_ns < NANOSECONDS_PER_SECOND) { ++ /* ++ * If this is a multipath device, it is probably suspended. ++ * ++ * This can happen while the dm table is reloaded, e.g. because a ++ * path is added or removed. This is an operation that should ++ * complete within 1ms, so just wait a bit and retry. ++ * ++ * There are also some cases in which libmpathpersist must recover ++ * from path failure during its operation, which can leave the ++ * device suspended for a bit longer while the library brings back ++ * reservations into the expected state. ++ * ++ * Use increasing delays to cover both cases without waiting ++ * excessively, and stop after a bit more than a second (1023 ms). ++ * This is a tolerable delay before we return an error and ++ * potentially stop the VM. ++ */ ++ qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, eagain_sleep_ns); ++ eagain_sleep_ns *= 2; ++ goto retry; ++ } ++ ++ /* Even for ret == 0, the SG_IO header can contain an error */ ++ if (retries-- && hdev_co_ioctl_sgio_retry(&acb, ret)) { ++ goto retry; ++ } ++ } + + return ret; + } +-- +2.51.1 + diff --git a/qemu-kvm.spec b/qemu-kvm.spec index f83b8f3..1ecbcb3 100644 --- a/qemu-kvm.spec +++ b/qemu-kvm.spec @@ -149,7 +149,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}:%{version} \ Summary: QEMU is a machine emulator and virtualizer Name: qemu-kvm Version: 10.1.0 -Release: 7%{?rcrel}%{?dist}%{?cc_suffix} +Release: 8%{?rcrel}%{?dist}%{?cc_suffix} # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped # Epoch 15 used for RHEL 8 # Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5) @@ -228,6 +228,8 @@ Patch41: kvm-ram-block-attributes-Unify-the-retrieval-of-the-bloc.patch Patch42: kvm-Fix-the-typo-of-vfio-pci-device-s-enable-migration-o.patch # For RHEL-133008 - Assertion failure on drain with iothread and I/O load [rhel-9] Patch43: kvm-block-backend-Fix-race-when-resuming-queued-requests.patch +# For RHEL-133303 - The VM hit io error when do S3-PR integration on the pass-through failover multipath device [rhel-9] +Patch44: kvm-file-posix-Handle-suspended-dm-multipath-better-for-.patch # For RHEL-11424 - [IBM 9.6 FEAT] KVM: Full boot order support - qemu part @@ -1942,6 +1944,11 @@ useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \ %endif %changelog +* Mon Dec 08 2025 Jon Maloy - 10.1.0-8 +- kvm-file-posix-Handle-suspended-dm-multipath-better-for-.patch [RHEL-133303] +- Resolves: RHEL-133303 + (The VM hit io error when do S3-PR integration on the pass-through failover multipath device [rhel-9]) + * Wed Dec 03 2025 Jon Maloy - 10.1.0-7 - kvm-block-backend-Fix-race-when-resuming-queued-requests.patch [RHEL-133008] - Resolves: RHEL-133008