* Mon Dec 08 2025 Jon Maloy <jmaloy@redhat.com> - 10.1.0-8
- kvm-file-posix-Handle-suspended-dm-multipath-better-for-.patch [RHEL-133303] - Resolves: RHEL-133303 (The VM hit io error when do S3-PR integration on the pass-through failover multipath device [rhel-9])
This commit is contained in:
parent
3b087bc2a2
commit
4de7fac8a3
122
kvm-file-posix-Handle-suspended-dm-multipath-better-for-.patch
Normal file
122
kvm-file-posix-Handle-suspended-dm-multipath-better-for-.patch
Normal file
@ -0,0 +1,122 @@
|
||||
From 1005d2951e4b62b07e687d570dfdda6f82461a3f Mon Sep 17 00:00:00 2001
|
||||
From: Kevin Wolf <kwolf@redhat.com>
|
||||
Date: Fri, 28 Nov 2025 23:14:40 +0100
|
||||
Subject: [PATCH] file-posix: Handle suspended dm-multipath better for SG_IO
|
||||
|
||||
RH-Author: Kevin Wolf <kwolf@redhat.com>
|
||||
RH-MergeRequest: 438: file-posix: Handle suspended dm-multipath better for SG_IO
|
||||
RH-Jira: RHEL-133303
|
||||
RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
|
||||
RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||||
RH-Commit: [1/1] 575791e00334cf288a89855552967a6eca8c44dd (kmwolf/centos-qemu-kvm)
|
||||
|
||||
When introducing DM_MPATH_PROBE_PATHS, we already anticipated that
|
||||
dm-multipath devices might be suspended for a short time when the DM
|
||||
tables are reloaded and that they return -EAGAIN in this case. We then
|
||||
wait for a millisecond and retry.
|
||||
|
||||
However, it has since turned out that libmpathpersist (which is
|
||||
used by qemu-pr-helper) may need to perform more complex recovery
|
||||
operations to get reservations back to the expected state if a path failure
|
||||
happened in the middle of a PR operation. In this case, the device is
|
||||
suspended for a longer time compared to the case we originally expected.
|
||||
|
||||
This patch changes hdev_co_ioctl() to treat -EAGAIN separately so that
|
||||
it doesn't result in an immediate failure if the device is suspended for
|
||||
more than 1ms, and moves to incremental backoff to cover both quick and
|
||||
slow cases without excessive delays.
|
||||
|
||||
Buglink: https://issues.redhat.com/browse/RHEL-121543
|
||||
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
|
||||
Message-ID: <20251128221440.89125-1-kwolf@redhat.com>
|
||||
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
|
||||
(cherry picked from commit 2c3165a1a61c299b4a3ae30899e1cc738d20e004)
|
||||
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
|
||||
---
|
||||
block/file-posix.c | 56 ++++++++++++++++++++++++++++------------------
|
||||
1 file changed, 34 insertions(+), 22 deletions(-)
|
||||
|
||||
diff --git a/block/file-posix.c b/block/file-posix.c
|
||||
index 8c738674ce..ace0e23df2 100644
|
||||
--- a/block/file-posix.c
|
||||
+++ b/block/file-posix.c
|
||||
@@ -4284,25 +4284,8 @@ hdev_open_Mac_error:
|
||||
static bool coroutine_fn sgio_path_error(int ret, sg_io_hdr_t *io_hdr)
|
||||
{
|
||||
if (ret < 0) {
|
||||
- switch (ret) {
|
||||
- case -ENODEV:
|
||||
- return true;
|
||||
- case -EAGAIN:
|
||||
- /*
|
||||
- * The device is probably suspended. This happens while the dm table
|
||||
- * is reloaded, e.g. because a path is added or removed. This is an
|
||||
- * operation that should complete within 1ms, so just wait a bit and
|
||||
- * retry.
|
||||
- *
|
||||
- * If the device was suspended for another reason, we'll wait and
|
||||
- * retry SG_IO_MAX_RETRIES times. This is a tolerable delay before
|
||||
- * we return an error and potentially stop the VM.
|
||||
- */
|
||||
- qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
|
||||
- return true;
|
||||
- default:
|
||||
- return false;
|
||||
- }
|
||||
+ /* Path errors sometimes result in -ENODEV */
|
||||
+ return ret == -ENODEV;
|
||||
}
|
||||
|
||||
if (io_hdr->host_status != SCSI_HOST_OK) {
|
||||
@@ -4371,6 +4354,7 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
|
||||
{
|
||||
BDRVRawState *s = bs->opaque;
|
||||
RawPosixAIOData acb;
|
||||
+ uint64_t eagain_sleep_ns = 1 * SCALE_MS;
|
||||
int retries = SG_IO_MAX_RETRIES;
|
||||
int ret;
|
||||
|
||||
@@ -4399,9 +4383,37 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
|
||||
},
|
||||
};
|
||||
|
||||
- do {
|
||||
- ret = raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
|
||||
- } while (req == SG_IO && retries-- && hdev_co_ioctl_sgio_retry(&acb, ret));
|
||||
+retry:
|
||||
+ ret = raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
|
||||
+ if (req == SG_IO && s->use_mpath) {
|
||||
+ if (ret == -EAGAIN && eagain_sleep_ns < NANOSECONDS_PER_SECOND) {
|
||||
+ /*
|
||||
+ * If this is a multipath device, it is probably suspended.
|
||||
+ *
|
||||
+ * This can happen while the dm table is reloaded, e.g. because a
|
||||
+ * path is added or removed. This is an operation that should
|
||||
+ * complete within 1ms, so just wait a bit and retry.
|
||||
+ *
|
||||
+ * There are also some cases in which libmpathpersist must recover
|
||||
+ * from path failure during its operation, which can leave the
|
||||
+ * device suspended for a bit longer while the library brings back
|
||||
+ * reservations into the expected state.
|
||||
+ *
|
||||
+ * Use increasing delays to cover both cases without waiting
|
||||
+ * excessively, and stop after a bit more than a second (1023 ms).
|
||||
+ * This is a tolerable delay before we return an error and
|
||||
+ * potentially stop the VM.
|
||||
+ */
|
||||
+ qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, eagain_sleep_ns);
|
||||
+ eagain_sleep_ns *= 2;
|
||||
+ goto retry;
|
||||
+ }
|
||||
+
|
||||
+ /* Even for ret == 0, the SG_IO header can contain an error */
|
||||
+ if (retries-- && hdev_co_ioctl_sgio_retry(&acb, ret)) {
|
||||
+ goto retry;
|
||||
+ }
|
||||
+ }
|
||||
|
||||
return ret;
|
||||
}
|
||||
--
|
||||
2.51.1
|
||||
|
||||
@ -149,7 +149,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}:%{version} \
|
||||
Summary: QEMU is a machine emulator and virtualizer
|
||||
Name: qemu-kvm
|
||||
Version: 10.1.0
|
||||
Release: 7%{?rcrel}%{?dist}%{?cc_suffix}
|
||||
Release: 8%{?rcrel}%{?dist}%{?cc_suffix}
|
||||
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
|
||||
# Epoch 15 used for RHEL 8
|
||||
# Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5)
|
||||
@ -228,6 +228,8 @@ Patch41: kvm-ram-block-attributes-Unify-the-retrieval-of-the-bloc.patch
|
||||
Patch42: kvm-Fix-the-typo-of-vfio-pci-device-s-enable-migration-o.patch
|
||||
# For RHEL-133008 - Assertion failure on drain with iothread and I/O load [rhel-9]
|
||||
Patch43: kvm-block-backend-Fix-race-when-resuming-queued-requests.patch
|
||||
# For RHEL-133303 - The VM hit io error when do S3-PR integration on the pass-through failover multipath device [rhel-9]
|
||||
Patch44: kvm-file-posix-Handle-suspended-dm-multipath-better-for-.patch
|
||||
|
||||
|
||||
# For RHEL-11424 - [IBM 9.6 FEAT] KVM: Full boot order support - qemu part
|
||||
@ -1942,6 +1944,11 @@ useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \
|
||||
%endif
|
||||
|
||||
%changelog
|
||||
* Mon Dec 08 2025 Jon Maloy <jmaloy@redhat.com> - 10.1.0-8
|
||||
- kvm-file-posix-Handle-suspended-dm-multipath-better-for-.patch [RHEL-133303]
|
||||
- Resolves: RHEL-133303
|
||||
(The VM hit io error when do S3-PR integration on the pass-through failover multipath device [rhel-9])
|
||||
|
||||
* Wed Dec 03 2025 Jon Maloy <jmaloy@redhat.com> - 10.1.0-7
|
||||
- kvm-block-backend-Fix-race-when-resuming-queued-requests.patch [RHEL-133008]
|
||||
- Resolves: RHEL-133008
|
||||
|
||||
Loading…
Reference in New Issue
Block a user