From cd7d34f2bcc0cfe3b70eeb996906f7fcb9f3b2f4 Mon Sep 17 00:00:00 2001 From: Miroslav Rezanina Date: Tue, 28 Apr 2026 16:45:50 +0200 Subject: [PATCH] * Tue Apr 28 2026 Miroslav Rezanina - 10.1.0-17.el10nv.1 - Backport commits of EGM virtualization support [VOYAGER-15] - Resolves: VOYAGER-15 (Backport QEMU support for EGM) --- ...-uefi-add-variable-digest-to-vmstate.patch | 90 ++++ ...-BLOCK_IO_ERROR-with-action-stop-for.patch | 99 ++++ ...d-dirty-bitmap-writes-during-startup.patch | 163 +++++++ ...-Put-all-parameters-into-qemu_laiocb.patch | 128 +++++ ...Resubmit-tails-of-short-reads-writes.patch | 216 +++++++++ ...void-potentially-getting-stuck-after.patch | 168 +++++++ ...uring-Resubmit-tails-of-short-writes.patch | 279 +++++++++++ ...-stats-intervals-for-storage-devices.patch | 457 ++++++++++++++++++ ...-qdev-Free-property-array-on-release.patch | 292 +++++++++++ ...-New-object-to-associate-device-to-E.patch | 239 +++++++++ ...acpi-Populate-DSDT-with-EGM-properti.patch | 240 +++++++++ ...arm-boot-Create-DTB-memory-regions-s.patch | 235 +++++++++ ...acpi-Add-pxb-bridge-above-GPU-in-DSD.patch | 59 +++ ...IOMMU_IOAS_MAP-for-EGM-memory-region.patch | 117 +++++ README.rst | 19 - qemu-kvm.spec | 48 +- rpminspect.yaml | 12 - 17 files changed, 2829 insertions(+), 32 deletions(-) create mode 100644 0313-hw-uefi-add-variable-digest-to-vmstate.patch create mode 100644 0314-block-Never-drop-BLOCK_IO_ERROR-with-action-stop-for.patch create mode 100644 0315-mirror-Fix-missed-dirty-bitmap-writes-during-startup.patch create mode 100644 0316-linux-aio-Put-all-parameters-into-qemu_laiocb.patch create mode 100644 0317-linux-aio-Resubmit-tails-of-short-reads-writes.patch create mode 100644 0318-block-io_uring-avoid-potentially-getting-stuck-after.patch create mode 100644 0319-io-uring-Resubmit-tails-of-short-writes.patch create mode 100644 0320-block-enable-stats-intervals-for-storage-devices.patch create mode 100644 0321-qdev-Free-property-array-on-release.patch create mode 100644 
0322-NVIDIA-SAUCE-qom-New-object-to-associate-device-to-E.patch create mode 100644 0323-NVIDIA-SAUCE-hw-acpi-Populate-DSDT-with-EGM-properti.patch create mode 100644 0324-NVIDIA-SAUCE-hw-arm-boot-Create-DTB-memory-regions-s.patch create mode 100644 0325-NVIDIA-SAUCE-hw-acpi-Add-pxb-bridge-above-GPU-in-DSD.patch create mode 100644 0326-WAR-hw-vfio-Use-IOMMU_IOAS_MAP-for-EGM-memory-region.patch delete mode 100644 README.rst delete mode 100644 rpminspect.yaml diff --git a/0313-hw-uefi-add-variable-digest-to-vmstate.patch b/0313-hw-uefi-add-variable-digest-to-vmstate.patch new file mode 100644 index 0000000..e982484 --- /dev/null +++ b/0313-hw-uefi-add-variable-digest-to-vmstate.patch @@ -0,0 +1,90 @@ +From dd87fdca8abe1ff289eb766e79c8786819efd739 Mon Sep 17 00:00:00 2001 +From: Gerd Hoffmann +Date: Wed, 4 Mar 2026 08:05:34 +0100 +Subject: [PATCH] hw/uefi: add variable digest to vmstate +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Gerd Hoffmann +RH-MergeRequest: 471: hw/uefi: add variable digest to vmstate +RH-Jira: RHEL-153058 +RH-Acked-by: Peter Xu +RH-Acked-by: Miroslav Rezanina +RH-Commit: [1/1] c300b30db6bafaacd11b8383531f04873bc8b428 (kraxel.rh/centos-src-qemu-kvm) + +Add digest to vmstate if needed. Also clear digest before loading +to make sure it is initialized. 
+ +Fixes: db1ecfb473ac ("hw/uefi: add var-service-vars.c") +Signed-off-by: Gerd Hoffmann +Reviewed-by: Philippe Mathieu-Daudé +Message-ID: <20260304075954.584423-1-kraxel@redhat.com> +Signed-off-by: Philippe Mathieu-Daudé +(cherry picked from commit b28c3ad1d63c2fe167b6f93fad1616ecd769e599) + +Resolves: RHEL-153058 + +Patch-name: kvm-hw-uefi-add-variable-digest-to-vmstate.patch +Patch-id: 128 +Patch-present-in-specfile: True +--- + hw/uefi/var-service-vars.c | 36 ++++++++++++++++++++++++++++++++++++ + 1 file changed, 36 insertions(+) + +diff --git a/hw/uefi/var-service-vars.c b/hw/uefi/var-service-vars.c +index 8533533ea5..ed4e0a6494 100644 +--- a/hw/uefi/var-service-vars.c ++++ b/hw/uefi/var-service-vars.c +@@ -37,8 +37,40 @@ const VMStateDescription vmstate_uefi_time = { + }, + }; + ++static int uefi_vars_pre_load(void *opaque) ++{ ++ uefi_variable *var = opaque; ++ ++ /* clear digest which is optional in the live migration data stream */ ++ var->digest = NULL; ++ var->digest_size = 0; ++ return 0; ++} ++ ++static bool uefi_vars_digest_is_needed(void *opaque) ++{ ++ uefi_variable *var = opaque; ++ ++ if ((var->attributes & EFI_VARIABLE_TIME_BASED_AUTHENTICATED_WRITE_ACCESS) && ++ !uefi_vars_is_sb_any(var)) { ++ return true; ++ } ++ return false; ++} ++ ++const VMStateDescription vmstate_uefi_variable_digest = { ++ .name = "uefi-variable-digest", ++ .needed = uefi_vars_digest_is_needed, ++ .fields = (VMStateField[]) { ++ VMSTATE_UINT32(digest_size, uefi_variable), ++ VMSTATE_VBUFFER_ALLOC_UINT32(digest, uefi_variable, 0, NULL, digest_size), ++ VMSTATE_END_OF_LIST() ++ }, ++}; ++ + const VMStateDescription vmstate_uefi_variable = { + .name = "uefi-variable", ++ .pre_load = uefi_vars_pre_load, + .fields = (VMStateField[]) { + VMSTATE_UINT8_ARRAY_V(guid.data, uefi_variable, sizeof(QemuUUID), 0), + VMSTATE_UINT32(name_size, uefi_variable), +@@ -49,6 +81,10 @@ const VMStateDescription vmstate_uefi_variable = { + VMSTATE_STRUCT(time, uefi_variable, 0, vmstate_uefi_time, 
efi_time), + VMSTATE_END_OF_LIST() + }, ++ .subsections = (const VMStateDescription * const []) { ++ &vmstate_uefi_variable_digest, ++ NULL ++ } + }; + + uefi_variable *uefi_vars_find_variable(uefi_vars_state *uv, QemuUUID guid, diff --git a/0314-block-Never-drop-BLOCK_IO_ERROR-with-action-stop-for.patch b/0314-block-Never-drop-BLOCK_IO_ERROR-with-action-stop-for.patch new file mode 100644 index 0000000..5939ee2 --- /dev/null +++ b/0314-block-Never-drop-BLOCK_IO_ERROR-with-action-stop-for.patch @@ -0,0 +1,99 @@ +From 8b4664ca97a4a783fd78b4f08d69c2e02aefc854 Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 4 Mar 2026 13:28:00 +0100 +Subject: [PATCH] block: Never drop BLOCK_IO_ERROR with action=stop for rate + limiting + +RH-Author: Kevin Wolf +RH-MergeRequest: 472: block: Never drop BLOCK_IO_ERROR with action=stop for rate limiting +RH-Jira: RHEL-144004 +RH-Acked-by: Hanna Czenczek +RH-Acked-by: Stefan Hajnoczi +RH-Commit: [1/1] 96b29a65a4a49fb159970892124e5b4bbdcdfeb7 (kmwolf/centos-qemu-kvm) + +Commit 2155d2dd introduced rate limiting for BLOCK_IO_ERROR to emit an +event only once a second. This makes sense for cases in which the guest +keeps running and can submit more requests that would possibly also fail +because there is a problem with the backend. + +However, if the error policy is configured so that the VM is stopped on +errors, this is both unnecessary because stopping the VM means that the +guest can't issue more requests and in fact harmful because stopping the +VM is an important state change that management tools need to keep track +of even if it happens more than once in a given second. If an event is +dropped, the management tool would see a VM randomly going to paused +state without an associated error, so it has a hard time deciding how to +handle the situation. 
+ +This patch disables rate limiting for action=stop by not relying on the +event type alone any more in monitor_qapi_event_queue_no_reenter(), but +checking action for BLOCK_IO_ERROR, too. If the error is reported to the +guest or ignored, the rate limiting stays in place. + +Fixes: 2155d2dd7f73 ('block-backend: per-device throttling of BLOCK_IO_ERROR reports') +Signed-off-by: Kevin Wolf +Message-ID: <20260304122800.51923-1-kwolf@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 544ddbb6373d61292a0e2dc269809cd6bd5edec6) +Signed-off-by: Kevin Wolf + +Patch-name: kvm-block-Never-drop-BLOCK_IO_ERROR-with-action-stop-for.patch +Patch-id: 129 +Patch-present-in-specfile: True +--- + monitor/monitor.c | 21 ++++++++++++++++++++- + qapi/block-core.json | 2 +- + 2 files changed, 21 insertions(+), 2 deletions(-) + +diff --git a/monitor/monitor.c b/monitor/monitor.c +index c5a5d30877..ae7cf64de0 100644 +--- a/monitor/monitor.c ++++ b/monitor/monitor.c +@@ -363,14 +363,33 @@ monitor_qapi_event_queue_no_reenter(QAPIEvent event, QDict *qdict) + { + MonitorQAPIEventConf *evconf; + MonitorQAPIEventState *evstate; ++ bool throttled; + + assert(event < QAPI_EVENT__MAX); + evconf = &monitor_qapi_event_conf[event]; + trace_monitor_protocol_event_queue(event, qdict, evconf->rate); ++ throttled = evconf->rate; ++ ++ /* ++ * Rate limit BLOCK_IO_ERROR only for action != "stop". ++ * ++ * If the VM is stopped after an I/O error, this is important information ++ * for the management tool to keep track of the state of QEMU and we can't ++ * merge any events. At the same time, stopping the VM means that the guest ++ * can't send additional requests and the number of events is already ++ * limited, so we can do without rate limiting. 
++ */ ++ if (event == QAPI_EVENT_BLOCK_IO_ERROR) { ++ QDict *data = qobject_to(QDict, qdict_get(qdict, "data")); ++ const char *action = qdict_get_str(data, "action"); ++ if (!strcmp(action, "stop")) { ++ throttled = false; ++ } ++ } + + QEMU_LOCK_GUARD(&monitor_lock); + +- if (!evconf->rate) { ++ if (!throttled) { + /* Unthrottled event */ + monitor_qapi_event_emit(event, qdict); + } else { +diff --git a/qapi/block-core.json b/qapi/block-core.json +index 2c037183f0..0236936139 100644 +--- a/qapi/block-core.json ++++ b/qapi/block-core.json +@@ -5783,7 +5783,7 @@ + # .. note:: If action is "stop", a `STOP` event will eventually follow + # the `BLOCK_IO_ERROR` event. + # +-# .. note:: This event is rate-limited. ++# .. note:: This event is rate-limited, except if action is "stop". + # + # Since: 0.13 + # diff --git a/0315-mirror-Fix-missed-dirty-bitmap-writes-during-startup.patch b/0315-mirror-Fix-missed-dirty-bitmap-writes-during-startup.patch new file mode 100644 index 0000000..9af089e --- /dev/null +++ b/0315-mirror-Fix-missed-dirty-bitmap-writes-during-startup.patch @@ -0,0 +1,163 @@ +From f7eadda5d2dd54ce2b1095c8634b44d466efbc4a Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Thu, 19 Feb 2026 21:24:46 +0100 +Subject: [PATCH] mirror: Fix missed dirty bitmap writes during startup + +RH-Author: Kevin Wolf +RH-MergeRequest: 474: mirror: Fix missed dirty bitmap writes during startup +RH-Jira: RHEL-155601 +RH-Acked-by: Hanna Czenczek +RH-Acked-by: Stefan Hajnoczi +RH-Commit: [1/1] a888c4b2085d5fc3639bf6721f1f4e974db65dba (kmwolf/centos-qemu-kvm) + +Currently, mirror disables the block layer's dirty bitmap before its own +replacement is working. This means that during startup, there is a +window in which the allocation status of blocks in the source has +already been checked, but new writes coming in aren't tracked yet, +resulting in a corrupted copy: + +1. Dirty bitmap is disabled in mirror_start_job() +2. 
Some requests are started in mirror_top_bs while s->job == NULL +3. mirror_dirty_init() -> bdrv_co_is_allocated_above() runs and because + the request hasn't completed yet, the block isn't allocated +4. The request completes, still sees s->job == NULL and skips the + bitmap, and nothing else will mark it dirty either + +One ingredient is that mirror_top_opaque->job is only set after the +job is fully initialized. For the rationale, see commit 32125b1460 +("mirror: Fix access of uninitialised fields during start"). + +Fix this by giving mirror_top_bs access to dirty_bitmap and enabling it +to track writes from the beginning. Disabling the block layer's tracking +and enabling the mirror_top_bs one happens in a drained section, so +there is no danger of races with in-flight requests any more. All of +this happens well before the block allocation status is checked, so we +can be sure that no writes will be missed. + +Cc: qemu-stable@nongnu.org +Closes: https://gitlab.com/qemu-project/qemu/-/issues/3273 +Fixes: 32125b14606a ('mirror: Fix access of uninitialised fields during start') +Signed-off-by: Kevin Wolf +Message-ID: <20260219202446.312493-1-kwolf@redhat.com> +Reviewed-by: Fiona Ebner +Tested-by: Jean-Louis Dupond +Signed-off-by: Kevin Wolf +(cherry picked from commit 0f51f9c3420b31bb383e456dd7bf24d3056eeb73) +Signed-off-by: Kevin Wolf + +Patch-name: kvm-mirror-Fix-missed-dirty-bitmap-writes-during-startup.patch +Patch-id: 130 +Patch-present-in-specfile: True +--- + block/mirror.c | 52 +++++++++++++++++++++++++++++++------------------- + 1 file changed, 32 insertions(+), 20 deletions(-) + +diff --git a/block/mirror.c b/block/mirror.c +index b344182c74..f01be99b55 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -99,6 +99,7 @@ typedef struct MirrorBlockJob { + + typedef struct MirrorBDSOpaque { + MirrorBlockJob *job; ++ BdrvDirtyBitmap *dirty_bitmap; + bool stop; + bool is_commit; + } MirrorBDSOpaque; +@@ -1672,9 +1673,11 @@ 
bdrv_mirror_top_do_write(BlockDriverState *bs, MirrorMethod method, + abort(); + } + +- if (!copy_to_target && s->job && s->job->dirty_bitmap) { +- qatomic_set(&s->job->actively_synced, false); +- bdrv_set_dirty_bitmap(s->job->dirty_bitmap, offset, bytes); ++ if (!copy_to_target) { ++ if (s->job) { ++ qatomic_set(&s->job->actively_synced, false); ++ } ++ bdrv_set_dirty_bitmap(s->dirty_bitmap, offset, bytes); + } + + if (ret < 0) { +@@ -1901,13 +1904,35 @@ static BlockJob *mirror_start_job( + + bdrv_drained_begin(bs); + ret = bdrv_append(mirror_top_bs, bs, errp); +- bdrv_drained_end(bs); +- + if (ret < 0) { ++ bdrv_drained_end(bs); ++ bdrv_unref(mirror_top_bs); ++ return NULL; ++ } ++ ++ bs_opaque->dirty_bitmap = bdrv_create_dirty_bitmap(mirror_top_bs, ++ granularity, ++ NULL, errp); ++ if (!bs_opaque->dirty_bitmap) { ++ bdrv_drained_end(bs); + bdrv_unref(mirror_top_bs); + return NULL; + } + ++ /* ++ * The mirror job doesn't use the block layer's dirty tracking because it ++ * needs to be able to switch seemlessly between background copy mode (which ++ * does need dirty tracking) and write blocking mode (which doesn't) and ++ * doing that would require draining the node. Instead, mirror_top_bs takes ++ * care of updating the dirty bitmap as appropriate. ++ * ++ * Note that write blocking mode only becomes effective after mirror_run() ++ * sets mirror_top_opaque->job (see should_copy_to_target()). Until then, ++ * we're still in background copy mode irrespective of @copy_mode. 
++ */ ++ bdrv_disable_dirty_bitmap(bs_opaque->dirty_bitmap); ++ bdrv_drained_end(bs); ++ + /* Make sure that the source is not resized while the job is running */ + s = block_job_create(job_id, driver, NULL, mirror_top_bs, + BLK_PERM_CONSISTENT_READ, +@@ -2002,24 +2027,13 @@ static BlockJob *mirror_start_job( + s->base_overlay = bdrv_find_overlay(bs, base); + s->granularity = granularity; + s->buf_size = ROUND_UP(buf_size, granularity); ++ s->dirty_bitmap = bs_opaque->dirty_bitmap; + s->unmap = unmap; + if (auto_complete) { + s->should_complete = true; + } + bdrv_graph_rdunlock_main_loop(); + +- s->dirty_bitmap = bdrv_create_dirty_bitmap(s->mirror_top_bs, granularity, +- NULL, errp); +- if (!s->dirty_bitmap) { +- goto fail; +- } +- +- /* +- * The dirty bitmap is set by bdrv_mirror_top_do_write() when not in active +- * mode. +- */ +- bdrv_disable_dirty_bitmap(s->dirty_bitmap); +- + bdrv_graph_wrlock_drained(); + ret = block_job_add_bdrv(&s->common, "source", bs, 0, + BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE | +@@ -2099,9 +2113,6 @@ fail: + g_free(s->replaces); + blk_unref(s->target); + bs_opaque->job = NULL; +- if (s->dirty_bitmap) { +- bdrv_release_dirty_bitmap(s->dirty_bitmap); +- } + job_early_fail(&s->common.job); + } + +@@ -2115,6 +2126,7 @@ fail: + bdrv_graph_wrunlock(); + bdrv_drained_end(bs); + ++ bdrv_release_dirty_bitmap(bs_opaque->dirty_bitmap); + bdrv_unref(mirror_top_bs); + + return NULL; diff --git a/0316-linux-aio-Put-all-parameters-into-qemu_laiocb.patch b/0316-linux-aio-Put-all-parameters-into-qemu_laiocb.patch new file mode 100644 index 0000000..f0f34f7 --- /dev/null +++ b/0316-linux-aio-Put-all-parameters-into-qemu_laiocb.patch @@ -0,0 +1,128 @@ +From 8b8281105422d9c315fed7911a6dbe174d1ef9ba Mon Sep 17 00:00:00 2001 +From: Hanna Czenczek +Date: Tue, 24 Mar 2026 09:43:34 +0100 +Subject: [PATCH] linux-aio: Put all parameters into qemu_laiocb + +RH-Author: Hanna Czenczek +RH-MergeRequest: 479: linux-aio/io-uring: Resubmit tails of short requests 
+RH-Jira: RHEL-158224 +RH-Acked-by: Kevin Wolf +RH-Acked-by: Stefan Hajnoczi +RH-Commit: [1/4] f449ea0c49a093bbec59b24fb44308e7f58c9ed2 (hreitz/qemu-kvm-c-9-s) + +Put all request parameters into the qemu_laiocb struct, which will allow +re-submitting the tail of short reads/writes. + +Reviewed-by: Kevin Wolf +Signed-off-by: Hanna Czenczek +Message-ID: <20260324084338.37453-2-hreitz@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit cc03b62df47a09c507e199cc043f57bdc941cc67) +Signed-off-by: Hanna Czenczek + +Patch-name: kvm-linux-aio-Put-all-parameters-into-qemu_laiocb.patch +Patch-id: 131 +Patch-present-in-specfile: True +--- + block/linux-aio.c | 34 ++++++++++++++++++++++------------ + 1 file changed, 22 insertions(+), 12 deletions(-) + +diff --git a/block/linux-aio.c b/block/linux-aio.c +index c200e7ad20..c2c5e11946 100644 +--- a/block/linux-aio.c ++++ b/block/linux-aio.c +@@ -41,9 +41,15 @@ struct qemu_laiocb { + LinuxAioState *ctx; + struct iocb iocb; + ssize_t ret; ++ off_t offset; + size_t nbytes; + QEMUIOVector *qiov; +- bool is_read; ++ ++ int fd; ++ int type; ++ BdrvRequestFlags flags; ++ ++ uint64_t dev_max_batch; + QSIMPLEQ_ENTRY(qemu_laiocb) next; + }; + +@@ -87,7 +93,7 @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb) + ret = 0; + } else if (ret >= 0) { + /* Short reads mean EOF, pad with zeros. 
*/ +- if (laiocb->is_read) { ++ if (laiocb->type == QEMU_AIO_READ) { + qemu_iovec_memset(laiocb->qiov, ret, 0, + laiocb->qiov->size - ret); + } else { +@@ -367,23 +373,23 @@ static void laio_deferred_fn(void *opaque) + } + } + +-static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, +- int type, BdrvRequestFlags flags, +- uint64_t dev_max_batch) ++static int laio_do_submit(struct qemu_laiocb *laiocb) + { + LinuxAioState *s = laiocb->ctx; + struct iocb *iocbs = &laiocb->iocb; + QEMUIOVector *qiov = laiocb->qiov; ++ int fd = laiocb->fd; ++ off_t offset = laiocb->offset; + +- switch (type) { ++ switch (laiocb->type) { + case QEMU_AIO_WRITE: + #ifdef HAVE_IO_PREP_PWRITEV2 + { +- int laio_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0; ++ int laio_flags = (laiocb->flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0; + io_prep_pwritev2(iocbs, fd, qiov->iov, qiov->niov, offset, laio_flags); + } + #else +- assert(flags == 0); ++ assert(laiocb->flags == 0); + io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset); + #endif + break; +@@ -399,7 +405,7 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, + /* Currently Linux kernel does not support other operations */ + default: + fprintf(stderr, "%s: invalid AIO request type 0x%x.\n", +- __func__, type); ++ __func__, laiocb->type); + return -EIO; + } + io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e)); +@@ -407,7 +413,7 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, + QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next); + s->io_q.in_queue++; + if (!s->io_q.blocked) { +- if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) { ++ if (s->io_q.in_queue >= laio_max_batch(s, laiocb->dev_max_batch)) { + ioq_submit(s); + } else { + defer_call(laio_deferred_fn, s); +@@ -425,14 +431,18 @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov, + AioContext *ctx = qemu_get_current_aio_context(); + struct qemu_laiocb laiocb = { + .co = 
qemu_coroutine_self(), ++ .offset = offset, + .nbytes = qiov ? qiov->size : 0, + .ctx = aio_get_linux_aio(ctx), + .ret = -EINPROGRESS, +- .is_read = (type == QEMU_AIO_READ), + .qiov = qiov, ++ .fd = fd, ++ .type = type, ++ .flags = flags, ++ .dev_max_batch = dev_max_batch, + }; + +- ret = laio_do_submit(fd, &laiocb, offset, type, flags, dev_max_batch); ++ ret = laio_do_submit(&laiocb); + if (ret < 0) { + return ret; + } diff --git a/0317-linux-aio-Resubmit-tails-of-short-reads-writes.patch b/0317-linux-aio-Resubmit-tails-of-short-reads-writes.patch new file mode 100644 index 0000000..453ec74 --- /dev/null +++ b/0317-linux-aio-Resubmit-tails-of-short-reads-writes.patch @@ -0,0 +1,216 @@ +From 8016d1cb2bf4900d46c290d2a0fb00c84d22ca0f Mon Sep 17 00:00:00 2001 +From: Hanna Czenczek +Date: Tue, 24 Mar 2026 09:43:35 +0100 +Subject: [PATCH] linux-aio: Resubmit tails of short reads/writes + +RH-Author: Hanna Czenczek +RH-MergeRequest: 479: linux-aio/io-uring: Resubmit tails of short requests +RH-Jira: RHEL-158224 +RH-Acked-by: Kevin Wolf +RH-Acked-by: Stefan Hajnoczi +RH-Commit: [2/4] f97271e609a150acc04a15ffc85c40e7bcb00060 (hreitz/qemu-kvm-c-9-s) + +Short reads/writes can happen. 
One way to reproduce them is via our +FUSE export, with the following diff applied (%s/escaped // to apply -- +if you put plain diffs in commit messages, git-am will apply them, and I +would rather avoid breaking FUSE accidentally via this patch): + +escaped diff --git a/block/export/fuse.c b/block/export/fuse.c +escaped index a2a478d293..67dc50a412 100644 +escaped --- a/block/export/fuse.c +escaped +++ b/block/export/fuse.c +@@ -828,7 +828,7 @@ static ssize_t coroutine_fn GRAPH_RDLOCK + fuse_co_init(FuseExport *exp, struct fuse_init_out *out, + const struct fuse_init_in_compat *in) + { +- const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO; ++ const uint32_t supported_flags = FUSE_ASYNC_READ; + + if (in->major != 7) { + error_report("FUSE major version mismatch: We have 7, but kernel has %" +@@ -1060,6 +1060,8 @@ fuse_co_read(FuseExport *exp, void **bufptr, uint64_t offset, uint32_t size) + void *buf; + int ret; + ++ size = MIN(size, 4096); ++ + /* Limited by max_read, should not happen */ + if (size > FUSE_MAX_READ_BYTES) { + return -EINVAL; +@@ -1110,6 +1112,8 @@ fuse_co_write(FuseExport *exp, struct fuse_write_out *out, + int64_t blk_len; + int ret; + ++ size = MIN(size, 4096); ++ + QEMU_BUILD_BUG_ON(FUSE_MAX_WRITE_BYTES > BDRV_REQUEST_MAX_BYTES); + /* Limited by max_write, should not happen */ + if (size > FUSE_MAX_WRITE_BYTES) { + +Then: +$ ./qemu-img create -f raw test.raw 8k +Formatting 'test.raw', fmt=raw size=8192 +$ ./qemu-io -f raw -c 'write -P 42 0 8k' test.raw +wrote 8192/8192 bytes at offset 0 +8 KiB, 1 ops; 00.00 sec (64.804 MiB/sec and 8294.9003 ops/sec) +$ hexdump -C test.raw +00000000 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a |****************| +* +00002000 + +With aio=threads, short I/O works: +$ storage-daemon/qemu-storage-daemon \ + --blockdev file,node-name=test,filename=test.raw \ + --export fuse,id=exp,node-name=test,mountpoint=test.raw,writable=true + +Other shell: +$ ./qemu-io --image-opts -c 'read -P 42 0 8k' \ + 
driver=file,filename=test.raw,cache.direct=on,aio=threads +read 8192/8192 bytes at offset 0 +8 KiB, 1 ops; 00.00 sec (36.563 MiB/sec and 4680.0923 ops/sec) +$ ./qemu-io --image-opts -c 'write -P 23 0 8k' \ + driver=file,filename=test.raw,cache.direct=on,aio=threads +wrote 8192/8192 bytes at offset 0 +8 KiB, 1 ops; 00.00 sec (35.995 MiB/sec and 4607.2970 ops/sec) +$ hexdump -C test.raw +00000000 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 |................| +* +00002000 + +But with aio=native, it does not: +$ ./qemu-io --image-opts -c 'read -P 23 0 8k' \ + driver=file,filename=test.raw,cache.direct=on,aio=native +Pattern verification failed at offset 0, 8192 bytes +read 8192/8192 bytes at offset 0 +8 KiB, 1 ops; 00.00 sec (86.155 MiB/sec and 11027.7900 ops/sec) +$ ./qemu-io --image-opts -c 'write -P 42 0 8k' \ + driver=file,filename=test.raw,cache.direct=on,aio=native +write failed: No space left on device +$ hexdump -C test.raw +00000000 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a |****************| +* +00001000 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 |................| +* +00002000 + +This patch fixes that. 
+ +Reviewed-by: Kevin Wolf +Signed-off-by: Hanna Czenczek +Message-ID: <20260324084338.37453-3-hreitz@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 7eca3d4883be8d328377001a9ea7ae9882b00f3c) +Signed-off-by: Hanna Czenczek + +Patch-name: kvm-linux-aio-Resubmit-tails-of-short-reads-writes.patch +Patch-id: 132 +Patch-present-in-specfile: True +--- + block/linux-aio.c | 56 ++++++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 50 insertions(+), 6 deletions(-) + +diff --git a/block/linux-aio.c b/block/linux-aio.c +index c2c5e11946..84397de54c 100644 +--- a/block/linux-aio.c ++++ b/block/linux-aio.c +@@ -45,6 +45,10 @@ struct qemu_laiocb { + size_t nbytes; + QEMUIOVector *qiov; + ++ /* For handling short reads/writes */ ++ size_t total_done; ++ QEMUIOVector resubmit_qiov; ++ + int fd; + int type; + BdrvRequestFlags flags; +@@ -74,28 +78,61 @@ struct LinuxAioState { + }; + + static void ioq_submit(LinuxAioState *s); ++static int laio_do_submit(struct qemu_laiocb *laiocb); + + static inline ssize_t io_event_ret(struct io_event *ev) + { + return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res); + } + ++/** ++ * Retry tail of short requests. ++ */ ++static int laio_resubmit_short_io(struct qemu_laiocb *laiocb, size_t done) ++{ ++ QEMUIOVector *resubmit_qiov = &laiocb->resubmit_qiov; ++ ++ laiocb->total_done += done; ++ ++ if (!resubmit_qiov->iov) { ++ qemu_iovec_init(resubmit_qiov, laiocb->qiov->niov); ++ } else { ++ qemu_iovec_reset(resubmit_qiov); ++ } ++ qemu_iovec_concat(resubmit_qiov, laiocb->qiov, ++ laiocb->total_done, laiocb->nbytes - laiocb->total_done); ++ ++ return laio_do_submit(laiocb); ++} ++ + /* + * Completes an AIO request. 
+ */ + static void qemu_laio_process_completion(struct qemu_laiocb *laiocb) + { +- int ret; ++ ssize_t ret; + + ret = laiocb->ret; + if (ret != -ECANCELED) { +- if (ret == laiocb->nbytes) { ++ if (ret == laiocb->nbytes - laiocb->total_done) { + ret = 0; ++ } else if (ret > 0 && (laiocb->type == QEMU_AIO_READ || ++ laiocb->type == QEMU_AIO_WRITE)) { ++ ret = laio_resubmit_short_io(laiocb, ret); ++ if (!ret) { ++ return; ++ } + } else if (ret >= 0) { +- /* Short reads mean EOF, pad with zeros. */ ++ /* ++ * For normal reads and writes, we only get here if ret == 0, which ++ * means EOF for reads and ENOSPC for writes. ++ * For zone-append, we get here with any ret >= 0, which we just ++ * treat as ENOSPC, too (safer than resubmitting, probably, but not ++ * 100 % clear). ++ */ + if (laiocb->type == QEMU_AIO_READ) { +- qemu_iovec_memset(laiocb->qiov, ret, 0, +- laiocb->qiov->size - ret); ++ qemu_iovec_memset(laiocb->qiov, laiocb->total_done, 0, ++ laiocb->qiov->size - laiocb->total_done); + } else { + ret = -ENOSPC; + } +@@ -103,6 +140,9 @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb) + } + + laiocb->ret = ret; ++ if (laiocb->resubmit_qiov.iov) { ++ qemu_iovec_destroy(&laiocb->resubmit_qiov); ++ } + + /* + * If the coroutine is already entered it must be in ioq_submit() and +@@ -379,7 +419,11 @@ static int laio_do_submit(struct qemu_laiocb *laiocb) + struct iocb *iocbs = &laiocb->iocb; + QEMUIOVector *qiov = laiocb->qiov; + int fd = laiocb->fd; +- off_t offset = laiocb->offset; ++ off_t offset = laiocb->offset + laiocb->total_done; ++ ++ if (laiocb->resubmit_qiov.iov) { ++ qiov = &laiocb->resubmit_qiov; ++ } + + switch (laiocb->type) { + case QEMU_AIO_WRITE: diff --git a/0318-block-io_uring-avoid-potentially-getting-stuck-after.patch b/0318-block-io_uring-avoid-potentially-getting-stuck-after.patch new file mode 100644 index 0000000..9d070b7 --- /dev/null +++ b/0318-block-io_uring-avoid-potentially-getting-stuck-after.patch @@ -0,0 +1,168 @@ 
+From f15767184fc265d2270ad8f0d7e422759c2de274 Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Tue, 25 Nov 2025 14:31:03 +0100 +Subject: [PATCH] block/io_uring: avoid potentially getting stuck after + resubmit at the end of ioq_submit() + +RH-Author: Hanna Czenczek +RH-MergeRequest: 479: linux-aio/io-uring: Resubmit tails of short requests +RH-Jira: RHEL-158224 +RH-Acked-by: Kevin Wolf +RH-Acked-by: Stefan Hajnoczi +RH-Commit: [3/4] fd599da3ffcd6b37ceff35587ae9dbc1698b0f57 (hreitz/qemu-kvm-c-9-s) + +Note that this issue seems already fixed as a consequence of the large +io_uring rework with 047dabef97 ("block/io_uring: use aio_add_sqe()") +in current master, so this is purely for QEMU stable branches. + +At the end of ioq_submit(), there is an opportunistic call to +luring_process_completions(). This is the single caller of +luring_process_completions() that doesn't use the +luring_process_completions_and_submit() wrapper. + +Other callers use the wrapper, because luring_process_completions() +might require a subsequent call to ioq_submit() after resubmitting a +request. As noted for luring_resubmit(): + +> Resubmit a request by appending it to submit_queue. The caller must ensure +> that ioq_submit() is called later so that submit_queue requests are started. + +So the caller at the end of ioq_submit() violates the contract and can +in fact be problematic if no other requests come in later. In such a +case, the request intended to be resubmitted will never actually be +submitted via io_uring_submit(). + +A reproducer exposing this issue is [0], which is based on user +reports from [1]. Another reproducer is iotest 109 with '-i io_uring'. + +I had the most success to trigger the issue with [0] when using a +BTRFS RAID 1 storage. With tmpfs, it can take quite a few iterations, +but also triggers eventually on my machine. With iotest 109 with '-i +io_uring' the issue triggers reliably on my ext4 file system. 
+ +Have ioq_submit() submit any resubmitted requests after calling +luring_process_completions(). The return value from io_uring_submit() +is checked to be non-negative before the opportunistic processing of +completions and going for the new resubmit logic, to ensure that a +failure of io_uring_submit() is not missed. Also note that the return +value already was not necessarily the total number of submissions, +since the loop might've been iterated more than once even before the +current change. + +Only trigger the resubmission logic if it is actually necessary to +avoid changing behavior more than necessary. For example iotest 109 +would produce more 'mirror ready' events if always resubmitting after +luring_process_completions() at the end of ioq_submit(). + +Note iotest 109 still does not pass as is when run with '-i io_uring', +because of two offset values for BLOCK_JOB_COMPLETED events being zero +instead of non-zero as in the expected output. Note that the two +affected test cases are expected failures and still fail, so they just +fail "faster". The test cases are actually not triggering the resubmit +logic, so the reason seems to be different ordering of requests and +completions of the current aio=io_uring implementation versus +aio=threads. 
+ +[0]: + +> #!/bin/bash -e +> #file=/mnt/btrfs/disk.raw +> file=/tmp/disk.raw +> filesize=256 +> readsize=512 +> rm -f $file +> truncate -s $filesize $file +> ./qemu-system-x86_64 --trace '*uring*' --qmp stdio \ +> --blockdev raw,node-name=node0,file.driver=file,file.cache.direct=off,file.filename=$file,file.aio=io_uring \ +> <<EOF +> {"execute": "qmp_capabilities"} +> {"execute": "human-monitor-command", "arguments": { "command-line": "qemu-io node0 \"read 0 $readsize \"" }} +> {"execute": "quit"} +> EOF + +[1]: https://forum.proxmox.com/threads/170045/ + +Cc: qemu-stable@nongnu.org +Signed-off-by: Fiona Ebner +Reviewed-by: Stefan Hajnoczi +Signed-off-by: Michael Tokarev +(cherry picked from commit 2bb0153cd806b8f6b4f82b353bd0113cd1c488a5) +Signed-off-by: Hanna Czenczek + +Patch-name: kvm-block-io_uring-avoid-potentially-getting-stuck-after.patch +Patch-id: 133 +Patch-present-in-specfile: True +--- + block/io_uring.c | 16 +++++++++++++--- + 1 file changed, 13 insertions(+), 3 deletions(-) + +diff --git a/block/io_uring.c b/block/io_uring.c +index dd4f304910..5dbafc8f7b 100644 +--- a/block/io_uring.c ++++ b/block/io_uring.c +@@ -120,11 +120,14 @@ static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb, + * event loop. When there are no events left to complete the BH is being + * canceled. + * ++ * Returns whether ioq_submit() must be called again afterwards since requests ++ * were resubmitted via luring_resubmit(). 
+ */ +-static void luring_process_completions(LuringState *s) ++static bool luring_process_completions(LuringState *s) + { + struct io_uring_cqe *cqes; + int total_bytes; ++ bool resubmit = false; + + defer_call_begin(); + +@@ -182,6 +185,7 @@ static void luring_process_completions(LuringState *s) + */ + if (ret == -EINTR || ret == -EAGAIN) { + luring_resubmit(s, luringcb); ++ resubmit = true; + continue; + } + } else if (!luringcb->qiov) { +@@ -194,6 +198,7 @@ static void luring_process_completions(LuringState *s) + if (luringcb->is_read) { + if (ret > 0) { + luring_resubmit_short_read(s, luringcb, ret); ++ resubmit = true; + continue; + } else { + /* Pad with zeroes */ +@@ -224,6 +229,8 @@ end: + qemu_bh_cancel(s->completion_bh); + + defer_call_end(); ++ ++ return resubmit; + } + + static int ioq_submit(LuringState *s) +@@ -231,6 +238,7 @@ static int ioq_submit(LuringState *s) + int ret = 0; + LuringAIOCB *luringcb, *luringcb_next; + ++resubmit: + while (s->io_q.in_queue > 0) { + /* + * Try to fetch sqes from the ring for requests waiting in +@@ -260,12 +268,14 @@ static int ioq_submit(LuringState *s) + } + s->io_q.blocked = (s->io_q.in_queue > 0); + +- if (s->io_q.in_flight) { ++ if (ret >= 0 && s->io_q.in_flight) { + /* + * We can try to complete something just right away if there are + * still requests in-flight. 
+ */ +- luring_process_completions(s); ++ if (luring_process_completions(s)) { ++ goto resubmit; ++ } + } + return ret; + } diff --git a/0319-io-uring-Resubmit-tails-of-short-writes.patch b/0319-io-uring-Resubmit-tails-of-short-writes.patch new file mode 100644 index 0000000..1a0eb7c --- /dev/null +++ b/0319-io-uring-Resubmit-tails-of-short-writes.patch @@ -0,0 +1,279 @@ +From 714d3eb4252f38fce4de45f23fc11d07ca362bf2 Mon Sep 17 00:00:00 2001 +From: Hanna Czenczek +Date: Tue, 24 Mar 2026 09:43:36 +0100 +Subject: [PATCH] io-uring: Resubmit tails of short writes + +RH-Author: Hanna Czenczek +RH-MergeRequest: 479: linux-aio/io-uring: Resubmit tails of short requests +RH-Jira: RHEL-158224 +RH-Acked-by: Kevin Wolf +RH-Acked-by: Stefan Hajnoczi +RH-Commit: [4/4] 7ff66622acbb5dbdbacafe71bffd0a277ac919d9 (hreitz/qemu-kvm-c-9-s) + +Short writes can happen, too, not just short reads. The difference to +aio=native is that the kernel will actually retry the tail of short +requests internally already -- so it is harder to reproduce. But if the +tail of a short request returns an error to the kernel, we will see it +in userspace still. 
To reproduce this, apply the following patch on top +of the one shown in HEAD^ (again %s/escaped // to apply): + +escaped diff --git a/block/export/fuse.c b/block/export/fuse.c +escaped index 67dc50a412..2b98489a32 100644 +escaped --- a/block/export/fuse.c +escaped +++ b/block/export/fuse.c +@@ -1059,8 +1059,15 @@ fuse_co_read(FuseExport *exp, void **bufptr, uint64_t offset, uint32_t size) + int64_t blk_len; + void *buf; + int ret; ++ static uint32_t error_size; + +- size = MIN(size, 4096); ++ if (error_size == size) { ++ error_size = 0; ++ return -EIO; ++ } else if (size > 4096) { ++ error_size = size - 4096; ++ size = 4096; ++ } + + /* Limited by max_read, should not happen */ + if (size > FUSE_MAX_READ_BYTES) { +@@ -1111,8 +1118,15 @@ fuse_co_write(FuseExport *exp, struct fuse_write_out *out, + { + int64_t blk_len; + int ret; ++ static uint32_t error_size; + +- size = MIN(size, 4096); ++ if (error_size == size) { ++ error_size = 0; ++ return -EIO; ++ } else if (size > 4096) { ++ error_size = size - 4096; ++ size = 4096; ++ } + + QEMU_BUILD_BUG_ON(FUSE_MAX_WRITE_BYTES > BDRV_REQUEST_MAX_BYTES); + /* Limited by max_write, should not happen */ + +I know this is a bit artificial because to produce this, there must be +an I/O error somewhere anyway, but if it does happen, qemu will +understand it to mean ENOSPC for short writes, which is incorrect. So I +believe we need to resubmit the tail to maybe have it succeed now, or at +least get the correct error code. 
+ +Reproducer as before: +$ ./qemu-img create -f raw test.raw 8k +Formatting 'test.raw', fmt=raw size=8192 +$ ./qemu-io -f raw -c 'write -P 42 0 8k' test.raw +wrote 8192/8192 bytes at offset 0 +8 KiB, 1 ops; 00.00 sec (64.804 MiB/sec and 8294.9003 ops/sec) +$ hexdump -C test.raw +00000000 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a |****************| +* +00002000 +$ storage-daemon/qemu-storage-daemon \ + --blockdev file,node-name=test,filename=test.raw \ + --export fuse,id=exp,node-name=test,mountpoint=test.raw,writable=true + +$ ./qemu-io --image-opts -c 'read -P 23 0 8k' \ + driver=file,filename=test.raw,cache.direct=on,aio=io_uring +read 8192/8192 bytes at offset 0 +8 KiB, 1 ops; 00.00 sec (58.481 MiB/sec and 7485.5342 ops/sec) +$ ./qemu-io --image-opts -c 'write -P 23 0 8k' \ + driver=file,filename=test.raw,cache.direct=on,aio=io_uring +write failed: No space left on device +$ hexdump -C test.raw +00000000 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 |................| +* +00001000 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a |****************| +* +00002000 + +So short reads already work (because there is code for that), but short +writes incorrectly produce ENOSPC. This patch fixes that by +resubmitting not only the tail of short reads but short writes also. + +(And this patch uses the opportunity to make it so qemu_iovec_destroy() +is called only if req->resubmit_qiov.iov is non-NULL. Functionally a +no-op, but this is how the code generally checks whether the +resubmit_qiov has been set up or not.) + +Reviewed-by: Kevin Wolf +Signed-off-by: Hanna Czenczek +Message-ID: <20260324084338.37453-4-hreitz@redhat.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit cf9cdaea6e24d13dfdf8402f6829d2ca4dca864b) + +Conflicts: +- block/io_uring.c, block/trace-events + Missing 047dabef97bd0c4af3c3dc453b19e20345de3602 ("block/io_uring: use + aio_add_sqe()") downstream, which changed quite a few things about the + io-uring code. 
Backporting it seems excessive, though, especially + given that it would pull in even more dependencies (general aio and + aio-posix changes that come right before 047dabef). + +Signed-off-by: Hanna Czenczek + +Patch-name: kvm-io-uring-Resubmit-tails-of-short-writes.patch +Patch-id: 134 +Patch-present-in-specfile: True +--- + block/io_uring.c | 77 +++++++++++++++++++++++++--------------------- + block/trace-events | 2 +- + 2 files changed, 43 insertions(+), 36 deletions(-) + +diff --git a/block/io_uring.c b/block/io_uring.c +index 5dbafc8f7b..582550f8e9 100644 +--- a/block/io_uring.c ++++ b/block/io_uring.c +@@ -31,14 +31,14 @@ typedef struct LuringAIOCB { + struct io_uring_sqe sqeq; + ssize_t ret; + QEMUIOVector *qiov; +- bool is_read; ++ int type; + QSIMPLEQ_ENTRY(LuringAIOCB) next; + + /* +- * Buffered reads may require resubmission, see +- * luring_resubmit_short_read(). ++ * Short reads/writes require resubmission, see ++ * luring_resubmit_short_io(). + */ +- int total_read; ++ int total_done; + QEMUIOVector resubmit_qiov; + } LuringAIOCB; + +@@ -73,22 +73,27 @@ static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb) + } + + /** +- * luring_resubmit_short_read: ++ * luring_resubmit_short_io: + * +- * Short reads are rare but may occur. The remaining read request needs to be +- * resubmitted. ++ * Short reads and writes are rare but may occur. The remaining request needs ++ * to be resubmitted. ++ * ++ * For example, short reads can be reproduced by a FUSE export deliberately ++ * executing short reads. The tail of short writes is generally resubmitted by ++ * io-uring in the kernel, but if that resubmission encounters an I/O error, the ++ * already submitted portion will be returned as a short write. 
+ */ +-static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb, +- int nread) ++static void luring_resubmit_short_io(LuringState *s, LuringAIOCB *luringcb, ++ int ndone) + { + QEMUIOVector *resubmit_qiov; + size_t remaining; + +- trace_luring_resubmit_short_read(s, luringcb, nread); ++ trace_luring_resubmit_short_io(s, luringcb, ndone); + +- /* Update read position */ +- luringcb->total_read += nread; +- remaining = luringcb->qiov->size - luringcb->total_read; ++ /* Update I/O position */ ++ luringcb->total_done += ndone; ++ remaining = luringcb->qiov->size - luringcb->total_done; + + /* Shorten qiov */ + resubmit_qiov = &luringcb->resubmit_qiov; +@@ -97,11 +102,11 @@ static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb, + } else { + qemu_iovec_reset(resubmit_qiov); + } +- qemu_iovec_concat(resubmit_qiov, luringcb->qiov, luringcb->total_read, ++ qemu_iovec_concat(resubmit_qiov, luringcb->qiov, luringcb->total_done, + remaining); + + /* Update sqe */ +- luringcb->sqeq.off += nread; ++ luringcb->sqeq.off += ndone; + luringcb->sqeq.addr = (uintptr_t)luringcb->resubmit_qiov.iov; + luringcb->sqeq.len = luringcb->resubmit_qiov.niov; + +@@ -165,8 +170,8 @@ static bool luring_process_completions(LuringState *s) + s->io_q.in_flight--; + trace_luring_process_completion(s, luringcb, ret); + +- /* total_read is non-zero only for resubmitted read requests */ +- total_bytes = ret + luringcb->total_read; ++ /* total_done is non-zero only for resubmitted requests */ ++ total_bytes = ret + luringcb->total_done; + + if (ret < 0) { + /* +@@ -192,27 +197,29 @@ static bool luring_process_completions(LuringState *s) + goto end; + } else if (total_bytes == luringcb->qiov->size) { + ret = 0; +- /* Only read/write */ ++ } else if (ret > 0 && (luringcb->type == QEMU_AIO_READ || ++ luringcb->type == QEMU_AIO_WRITE)) { ++ luring_resubmit_short_io(s, luringcb, ret); ++ resubmit = true; ++ continue; ++ } else if (luringcb->type == QEMU_AIO_READ) { ++ 
/* Read ret == 0: EOF, pad with zeroes */ ++ qemu_iovec_memset(luringcb->qiov, total_bytes, 0, ++ luringcb->qiov->size - total_bytes); ++ ret = 0; + } else { +- /* Short Read/Write */ +- if (luringcb->is_read) { +- if (ret > 0) { +- luring_resubmit_short_read(s, luringcb, ret); +- resubmit = true; +- continue; +- } else { +- /* Pad with zeroes */ +- qemu_iovec_memset(luringcb->qiov, total_bytes, 0, +- luringcb->qiov->size - total_bytes); +- ret = 0; +- } +- } else { +- ret = -ENOSPC; +- } ++ /* ++ * Normal write ret == 0 means ENOSPC. ++ * For zone-append, we treat any 0 <= ret < qiov->size as ENOSPC, ++ * too, because resubmitting the tail seems a little unsafe. ++ */ ++ ret = -ENOSPC; + } + end: + luringcb->ret = ret; +- qemu_iovec_destroy(&luringcb->resubmit_qiov); ++ if (luringcb->resubmit_qiov.iov) { ++ qemu_iovec_destroy(&luringcb->resubmit_qiov); ++ } + + /* + * If the coroutine is already entered it must be in ioq_submit() +@@ -409,7 +416,7 @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset, + .co = qemu_coroutine_self(), + .ret = -EINPROGRESS, + .qiov = qiov, +- .is_read = (type == QEMU_AIO_READ), ++ .type = type, + }; + trace_luring_co_submit(bs, s, &luringcb, fd, offset, qiov ? 
qiov->size : 0, + type); +diff --git a/block/trace-events b/block/trace-events +index 8e789e1f12..99b8c12bc8 100644 +--- a/block/trace-events ++++ b/block/trace-events +@@ -70,7 +70,7 @@ luring_do_submit_done(void *s, int ret) "LuringState %p submitted to kernel %d" + luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, size_t nbytes, int type) "bs %p s %p luringcb %p fd %d offset %" PRId64 " nbytes %zd type %d" + luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d" + luring_io_uring_submit(void *s, int ret) "LuringState %p ret %d" +-luring_resubmit_short_read(void *s, void *luringcb, int nread) "LuringState %p luringcb %p nread %d" ++luring_resubmit_short_io(void *s, void *luringcb, int ndone) "LuringState %p luringcb %p ndone %d" + + # qcow2.c + qcow2_add_task(void *co, void *bs, void *pool, const char *action, int cluster_type, uint64_t host_offset, uint64_t offset, uint64_t bytes, void *qiov, size_t qiov_offset) "co %p bs %p pool %p: %s: cluster_type %d file_cluster_offset %" PRIu64 " offset %" PRIu64 " bytes %" PRIu64 " qiov %p qiov_offset %zu" diff --git a/0320-block-enable-stats-intervals-for-storage-devices.patch b/0320-block-enable-stats-intervals-for-storage-devices.patch new file mode 100644 index 0000000..895b3f6 --- /dev/null +++ b/0320-block-enable-stats-intervals-for-storage-devices.patch @@ -0,0 +1,457 @@ +From 2ffa7b2725ba0ea2f6d73c08687690332f6b2f06 Mon Sep 17 00:00:00 2001 +From: Chandan Somani +Date: Fri, 3 Oct 2025 14:59:26 -0700 +Subject: [PATCH] block: enable stats-intervals for storage devices + +RH-Author: Chandan Somani +RH-MergeRequest: 467: block: enable stats-intervals for storage devices +RH-Jira: RHEL-114231 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf +RH-Commit: [1/2] 0ca781da8e25bf5d2baf3dcd9f2687b4d21b8613 (csomani1/qemu-kvm) + +This patch allows stats-intervals to be used for storage +devices with the -device option. 
It accepts a list of interval +lengths in JSON format. + +It configures and collects the stats in the BlockBackend layer +through the storage device that consumes the BlockBackend. + +Signed-off-by: Chandan Somani +Message-ID: <20251003220039.1336663-1-csomani@redhat.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf + +Patch-name: kvm-block-enable-stats-intervals-for-storage-devices.patch +Patch-id: 135 +Patch-present-in-specfile: True +--- + block/accounting.c | 17 +++++++++++++++-- + blockdev.c | 3 ++- + hw/block/block.c | 7 +++++-- + include/block/accounting.h | 5 +++-- + include/hw/block/block.h | 7 ++++++- + tests/qemu-iotests/172.out | 38 ++++++++++++++++++++++++++++++++++++++ + 6 files changed, 69 insertions(+), 8 deletions(-) + +diff --git a/block/accounting.c b/block/accounting.c +index 3e46159569..0933c61f3a 100644 +--- a/block/accounting.c ++++ b/block/accounting.c +@@ -28,6 +28,7 @@ + #include "block/block_int.h" + #include "qemu/timer.h" + #include "system/qtest.h" ++#include "qapi/error.h" + + static QEMUClockType clock_type = QEMU_CLOCK_REALTIME; + static const int qtest_latency_ns = NANOSECONDS_PER_SECOND / 1000; +@@ -56,13 +57,25 @@ static bool bool_from_onoffauto(OnOffAuto val, bool def) + } + } + +-void block_acct_setup(BlockAcctStats *stats, enum OnOffAuto account_invalid, +- enum OnOffAuto account_failed) ++bool block_acct_setup(BlockAcctStats *stats, enum OnOffAuto account_invalid, ++ enum OnOffAuto account_failed, uint32_t *stats_intervals, ++ uint32_t num_stats_intervals, Error **errp) + { + stats->account_invalid = bool_from_onoffauto(account_invalid, + stats->account_invalid); + stats->account_failed = bool_from_onoffauto(account_failed, + stats->account_failed); ++ if (stats_intervals) { ++ for (int i = 0; i < num_stats_intervals; i++) { ++ if (stats_intervals[i] <= 0) { ++ error_setg(errp, "Invalid interval length: %u", stats_intervals[i]); ++ return false; ++ } ++ block_acct_add_interval(stats, stats_intervals[i]); ++ } ++ 
g_free(stats_intervals); ++ } ++ return true; + } + + void block_acct_cleanup(BlockAcctStats *stats) +diff --git a/blockdev.c b/blockdev.c +index 76c8dd0573..6e86c6262f 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -617,7 +617,8 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, + + bs->detect_zeroes = detect_zeroes; + +- block_acct_setup(blk_get_stats(blk), account_invalid, account_failed); ++ block_acct_setup(blk_get_stats(blk), account_invalid, account_failed, ++ NULL, 0, NULL); + + if (!parse_stats_intervals(blk_get_stats(blk), interval_list, errp)) { + blk_unref(blk); +diff --git a/hw/block/block.c b/hw/block/block.c +index 2e10611d95..f187fa025d 100644 +--- a/hw/block/block.c ++++ b/hw/block/block.c +@@ -249,8 +249,11 @@ bool blkconf_apply_backend_options(BlockConf *conf, bool readonly, + blk_set_enable_write_cache(blk, wce); + blk_set_on_error(blk, rerror, werror); + +- block_acct_setup(blk_get_stats(blk), conf->account_invalid, +- conf->account_failed); ++ if (!block_acct_setup(blk_get_stats(blk), conf->account_invalid, ++ conf->account_failed, conf->stats_intervals, ++ conf->num_stats_intervals, errp)) { ++ return false; ++ } + return true; + } + +diff --git a/include/block/accounting.h b/include/block/accounting.h +index a59e39f49d..b1cf417b57 100644 +--- a/include/block/accounting.h ++++ b/include/block/accounting.h +@@ -101,8 +101,9 @@ typedef struct BlockAcctCookie { + } BlockAcctCookie; + + void block_acct_init(BlockAcctStats *stats); +-void block_acct_setup(BlockAcctStats *stats, enum OnOffAuto account_invalid, +- enum OnOffAuto account_failed); ++bool block_acct_setup(BlockAcctStats *stats, enum OnOffAuto account_invalid, ++ enum OnOffAuto account_failed, uint32_t *stats_intervals, ++ uint32_t num_stats_intervals, Error **errp); + void block_acct_cleanup(BlockAcctStats *stats); + void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length); + BlockAcctTimedStats *block_acct_interval_next(BlockAcctStats *stats, 
+diff --git a/include/hw/block/block.h b/include/hw/block/block.h +index de3946a5f1..b4d914624e 100644 +--- a/include/hw/block/block.h ++++ b/include/hw/block/block.h +@@ -34,6 +34,8 @@ typedef struct BlockConf { + OnOffAuto account_invalid, account_failed; + BlockdevOnError rerror; + BlockdevOnError werror; ++ uint32_t num_stats_intervals; ++ uint32_t *stats_intervals; + } BlockConf; + + static inline unsigned int get_physical_block_exp(BlockConf *conf) +@@ -66,7 +68,10 @@ static inline unsigned int get_physical_block_exp(BlockConf *conf) + DEFINE_PROP_ON_OFF_AUTO("account-invalid", _state, \ + _conf.account_invalid, ON_OFF_AUTO_AUTO), \ + DEFINE_PROP_ON_OFF_AUTO("account-failed", _state, \ +- _conf.account_failed, ON_OFF_AUTO_AUTO) ++ _conf.account_failed, ON_OFF_AUTO_AUTO), \ ++ DEFINE_PROP_ARRAY("stats-intervals", _state, \ ++ _conf.num_stats_intervals, _conf.stats_intervals, \ ++ qdev_prop_uint32, uint32_t) + + #define DEFINE_BLOCK_PROPERTIES(_state, _conf) \ + DEFINE_PROP_DRIVE("drive", _state, _conf.blk), \ +diff --git a/tests/qemu-iotests/172.out b/tests/qemu-iotests/172.out +index 146fc72388..a023cd407d 100644 +--- a/tests/qemu-iotests/172.out ++++ b/tests/qemu-iotests/172.out +@@ -30,6 +30,7 @@ Testing: + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "288" + + +@@ -59,6 +60,7 @@ Testing: -fda TEST_DIR/t.qcow2 + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + floppy0 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/unattached/device[N] +@@ -95,6 +97,7 @@ Testing: -fdb TEST_DIR/t.qcow2 + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + dev: floppy, id "" + unit = 0 (0x0) +@@ -109,6 +112,7 @@ Testing: -fdb TEST_DIR/t.qcow2 + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "288" + floppy1 (NODE_NAME): 
TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/unattached/device[N] +@@ -149,6 +153,7 @@ Testing: -fda TEST_DIR/t.qcow2 -fdb TEST_DIR/t.qcow2.2 + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + dev: floppy, id "" + unit = 0 (0x0) +@@ -163,6 +168,7 @@ Testing: -fda TEST_DIR/t.qcow2 -fdb TEST_DIR/t.qcow2.2 + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + floppy0 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/unattached/device[N] +@@ -204,6 +210,7 @@ Testing: -fdb + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "288" + dev: floppy, id "" + unit = 0 (0x0) +@@ -218,6 +225,7 @@ Testing: -fdb + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "288" + + +@@ -247,6 +255,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + floppy0 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/unattached/device[N] +@@ -283,6 +292,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2,index=1 + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + dev: floppy, id "" + unit = 0 (0x0) +@@ -297,6 +307,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2,index=1 + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "288" + floppy1 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/unattached/device[N] +@@ -337,6 +348,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=floppy,file=TEST_DIR/t + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + dev: floppy, id "" + unit = 0 (0x0) +@@ -351,6 +363,7 @@ Testing: 
-drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=floppy,file=TEST_DIR/t + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + floppy0 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/unattached/device[N] +@@ -395,6 +408,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0 + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + none0 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/peripheral-anon/device[N] +@@ -431,6 +445,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0,unit=1 + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + none0 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/peripheral-anon/device[N] +@@ -467,6 +482,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qco + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + dev: floppy, id "" + unit = 0 (0x0) +@@ -481,6 +497,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qco + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + none0 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/peripheral-anon/device[N] +@@ -531,6 +548,7 @@ Testing: -fda TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + dev: floppy, id "" + unit = 0 (0x0) +@@ -545,6 +563,7 @@ Testing: -fda TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + floppy0 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: 
/machine/unattached/device[N] +@@ -586,6 +605,7 @@ Testing: -fda TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + dev: floppy, id "" + unit = 0 (0x0) +@@ -600,6 +620,7 @@ Testing: -fda TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + floppy0 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/unattached/device[N] +@@ -641,6 +662,7 @@ Testing: -fdb TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + dev: floppy, id "" + unit = 1 (0x1) +@@ -655,6 +677,7 @@ Testing: -fdb TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + floppy1 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/unattached/device[N] +@@ -696,6 +719,7 @@ Testing: -fdb TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + dev: floppy, id "" + unit = 1 (0x1) +@@ -710,6 +734,7 @@ Testing: -fdb TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.qcow2.2 -device fl + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + floppy1 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/unattached/device[N] +@@ -760,6 +785,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.q + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + dev: floppy, id "" + unit = 0 (0x0) +@@ -774,6 +800,7 @@ 
Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.q + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + floppy0 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/unattached/device[N] +@@ -815,6 +842,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.q + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + dev: floppy, id "" + unit = 0 (0x0) +@@ -829,6 +857,7 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive if=none,file=TEST_DIR/t.q + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + floppy0 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/unattached/device[N] +@@ -876,6 +905,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -global floppy.drive=none0 -device + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + none0 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/peripheral-anon/device[N] +@@ -942,6 +972,7 @@ Testing: -device floppy + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "288" + + Testing: -device floppy,drive-type=120 +@@ -968,6 +999,7 @@ Testing: -device floppy,drive-type=120 + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "120" + + Testing: -device floppy,drive-type=144 +@@ -994,6 +1026,7 @@ Testing: -device floppy,drive-type=144 + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + + Testing: -device floppy,drive-type=288 +@@ -1020,6 +1053,7 @@ Testing: -device floppy,drive-type=288 + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "288" + + +@@ -1049,6 +1083,7 
@@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0,drive-t + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "120" + none0 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/peripheral-anon/device[N] +@@ -1085,6 +1120,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0,drive-t + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "288" + none0 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/peripheral-anon/device[N] +@@ -1124,6 +1160,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0,logical + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + none0 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/peripheral-anon/device[N] +@@ -1160,6 +1197,7 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device floppy,drive=none0,physica + share-rw = false + account-invalid = "auto" + account-failed = "auto" ++ stats-intervals = + drive-type = "144" + none0 (NODE_NAME): TEST_DIR/t.qcow2 (qcow2) + Attached to: /machine/peripheral-anon/device[N] diff --git a/0321-qdev-Free-property-array-on-release.patch b/0321-qdev-Free-property-array-on-release.patch new file mode 100644 index 0000000..eaeb1f5 --- /dev/null +++ b/0321-qdev-Free-property-array-on-release.patch @@ -0,0 +1,292 @@ +From 2e11bce8c250b1af6b0959bf08c53a990b0efa7f Mon Sep 17 00:00:00 2001 +From: Chandan Somani +Date: Thu, 8 Jan 2026 15:03:07 -0800 +Subject: [PATCH] qdev: Free property array on release + +RH-Author: Chandan Somani +RH-MergeRequest: 467: block: enable stats-intervals for storage devices +RH-Jira: RHEL-114231 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Kevin Wolf +RH-Commit: [2/2] f8b9e42543653e5a724175b73f25fa02593ff79d (csomani1/qemu-kvm) + +Before this patch, users of the property array would free the +array 
themselves in their cleanup functions. This causes +inconsistencies where some users leak the array and some free them. + +This patch makes it so that the property array's release function +frees the property array (instead of just its elements). It fixes any +leaks and requires less code. + +DEFINE_PROP_ARRAY leakers that are fixed in this patch: +ebpf-rss_fds in hw/net/virtio-net.c +rnmi_irqvec, rnmi_excpvec in hw/riscv/riscv_hart.c +common.display_modes in hw/display/apple-gfx-mmio.m +common.display_modes in hw/display/apple-gfx-pci.m + +Signed-off-by: Chandan Somani +Reviewed-by: Stefan Hajnoczi +Link: https://lore.kernel.org/r/20260108230311.584141-2-csomani@redhat.com +Signed-off-by: Paolo Bonzini + +Patch-name: kvm-qdev-Free-property-array-on-release.patch +Patch-id: 136 +Patch-present-in-specfile: True +--- + block/accounting.c | 1 - + hw/core/qdev-properties.c | 21 ++++++++++----------- + hw/input/stellaris_gamepad.c | 8 -------- + hw/intc/arm_gicv3_common.c | 8 -------- + hw/intc/rx_icu.c | 8 -------- + hw/misc/arm_sysctl.c | 2 -- + hw/misc/mps2-scc.c | 8 -------- + hw/net/rocker/rocker.c | 1 - + hw/nvram/xlnx-efuse.c | 8 -------- + hw/nvram/xlnx-versal-efuse-ctrl.c | 1 - + hw/virtio/virtio-iommu-pci.c | 8 -------- + 11 files changed, 10 insertions(+), 64 deletions(-) + +diff --git a/block/accounting.c b/block/accounting.c +index 0933c61f3a..5cf51f029b 100644 +--- a/block/accounting.c ++++ b/block/accounting.c +@@ -73,7 +73,6 @@ bool block_acct_setup(BlockAcctStats *stats, enum OnOffAuto account_invalid, + } + block_acct_add_interval(stats, stats_intervals[i]); + } +- g_free(stats_intervals); + } + return true; + } +diff --git a/hw/core/qdev-properties.c b/hw/core/qdev-properties.c +index b7e8a89ba5..0449226e00 100644 +--- a/hw/core/qdev-properties.c ++++ b/hw/core/qdev-properties.c +@@ -671,10 +671,8 @@ static Property array_elem_prop(Object *obj, const Property *parent_prop, + + /* + * Object property release callback for array properties: We call the 
+- * underlying element's property release hook for each element. +- * +- * Note that it is the responsibility of the individual device's deinit +- * to free the array proper. ++ * underlying element's property release hook for each element and free the ++ * property array. + */ + static void release_prop_array(Object *obj, const char *name, void *opaque) + { +@@ -684,15 +682,16 @@ static void release_prop_array(Object *obj, const char *name, void *opaque) + char *elem = *arrayptr; + int i; + +- if (!prop->arrayinfo->release) { +- return; ++ if (prop->arrayinfo->release) { ++ for (i = 0; i < *alenptr; i++) { ++ Property elem_prop = array_elem_prop(obj, prop, name, elem); ++ prop->arrayinfo->release(obj, NULL, &elem_prop); ++ elem += prop->arrayfieldsize; ++ } + } + +- for (i = 0; i < *alenptr; i++) { +- Property elem_prop = array_elem_prop(obj, prop, name, elem); +- prop->arrayinfo->release(obj, NULL, &elem_prop); +- elem += prop->arrayfieldsize; +- } ++ g_clear_pointer(arrayptr, g_free); ++ *alenptr = 0; + } + + /* +diff --git a/hw/input/stellaris_gamepad.c b/hw/input/stellaris_gamepad.c +index fec1161c9c..207064dacb 100644 +--- a/hw/input/stellaris_gamepad.c ++++ b/hw/input/stellaris_gamepad.c +@@ -63,13 +63,6 @@ static void stellaris_gamepad_realize(DeviceState *dev, Error **errp) + qemu_input_handler_register(dev, &stellaris_gamepad_handler); + } + +-static void stellaris_gamepad_finalize(Object *obj) +-{ +- StellarisGamepad *s = STELLARIS_GAMEPAD(obj); +- +- g_free(s->keycodes); +-} +- + static void stellaris_gamepad_reset_enter(Object *obj, ResetType type) + { + StellarisGamepad *s = STELLARIS_GAMEPAD(obj); +@@ -98,7 +91,6 @@ static const TypeInfo stellaris_gamepad_info[] = { + .name = TYPE_STELLARIS_GAMEPAD, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(StellarisGamepad), +- .instance_finalize = stellaris_gamepad_finalize, + .class_init = stellaris_gamepad_class_init, + }, + }; +diff --git a/hw/intc/arm_gicv3_common.c 
b/hw/intc/arm_gicv3_common.c +index e438d8c042..4fc84741a8 100644 +--- a/hw/intc/arm_gicv3_common.c ++++ b/hw/intc/arm_gicv3_common.c +@@ -488,13 +488,6 @@ static void arm_gicv3_common_realize(DeviceState *dev, Error **errp) + s->itslist = g_ptr_array_new(); + } + +-static void arm_gicv3_finalize(Object *obj) +-{ +- GICv3State *s = ARM_GICV3_COMMON(obj); +- +- g_free(s->redist_region_count); +-} +- + static void arm_gicv3_common_reset_hold(Object *obj, ResetType type) + { + GICv3State *s = ARM_GICV3_COMMON(obj); +@@ -643,7 +636,6 @@ static const TypeInfo arm_gicv3_common_type = { + .instance_size = sizeof(GICv3State), + .class_size = sizeof(ARMGICv3CommonClass), + .class_init = arm_gicv3_common_class_init, +- .instance_finalize = arm_gicv3_finalize, + .abstract = true, + .interfaces = (const InterfaceInfo[]) { + { TYPE_ARM_LINUX_BOOT_IF }, +diff --git a/hw/intc/rx_icu.c b/hw/intc/rx_icu.c +index f8615527b7..85da0624f6 100644 +--- a/hw/intc/rx_icu.c ++++ b/hw/intc/rx_icu.c +@@ -334,13 +334,6 @@ static void rxicu_init(Object *obj) + sysbus_init_irq(d, &icu->_swi); + } + +-static void rxicu_fini(Object *obj) +-{ +- RXICUState *icu = RX_ICU(obj); +- g_free(icu->map); +- g_free(icu->init_sense); +-} +- + static const VMStateDescription vmstate_rxicu = { + .name = "rx-icu", + .version_id = 1, +@@ -382,7 +375,6 @@ static const TypeInfo rxicu_info = { + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(RXICUState), + .instance_init = rxicu_init, +- .instance_finalize = rxicu_fini, + .class_init = rxicu_class_init, + }; + +diff --git a/hw/misc/arm_sysctl.c b/hw/misc/arm_sysctl.c +index 0f4e37cd47..e715ff9475 100644 +--- a/hw/misc/arm_sysctl.c ++++ b/hw/misc/arm_sysctl.c +@@ -618,9 +618,7 @@ static void arm_sysctl_finalize(Object *obj) + { + arm_sysctl_state *s = ARM_SYSCTL(obj); + +- g_free(s->db_voltage); + g_free(s->db_clock); +- g_free(s->db_clock_reset); + } + + static const Property arm_sysctl_properties[] = { +diff --git a/hw/misc/mps2-scc.c 
b/hw/misc/mps2-scc.c +index a9a5d4a535..acb0f5773b 100644 +--- a/hw/misc/mps2-scc.c ++++ b/hw/misc/mps2-scc.c +@@ -405,13 +405,6 @@ static void mps2_scc_realize(DeviceState *dev, Error **errp) + s->oscclk = g_new0(uint32_t, s->num_oscclk); + } + +-static void mps2_scc_finalize(Object *obj) +-{ +- MPS2SCC *s = MPS2_SCC(obj); +- +- g_free(s->oscclk_reset); +-} +- + static bool cfg7_needed(void *opaque) + { + MPS2SCC *s = opaque; +@@ -489,7 +482,6 @@ static const TypeInfo mps2_scc_info = { + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(MPS2SCC), + .instance_init = mps2_scc_init, +- .instance_finalize = mps2_scc_finalize, + .class_init = mps2_scc_class_init, + }; + +diff --git a/hw/net/rocker/rocker.c b/hw/net/rocker/rocker.c +index cc49701dd3..cbc7bd3ed9 100644 +--- a/hw/net/rocker/rocker.c ++++ b/hw/net/rocker/rocker.c +@@ -1429,7 +1429,6 @@ static void pci_rocker_uninit(PCIDevice *dev) + world_free(r->worlds[i]); + } + } +- g_free(r->fp_ports_peers); + } + + static void rocker_reset(DeviceState *dev) +diff --git a/hw/nvram/xlnx-efuse.c b/hw/nvram/xlnx-efuse.c +index 4c23f8b931..1875fdb953 100644 +--- a/hw/nvram/xlnx-efuse.c ++++ b/hw/nvram/xlnx-efuse.c +@@ -224,13 +224,6 @@ static void efuse_realize(DeviceState *dev, Error **errp) + } + } + +-static void efuse_finalize(Object *obj) +-{ +- XlnxEFuse *s = XLNX_EFUSE(obj); +- +- g_free(s->ro_bits); +-} +- + static void efuse_prop_set_drive(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) + { +@@ -288,7 +281,6 @@ static const TypeInfo efuse_info = { + .name = TYPE_XLNX_EFUSE, + .parent = TYPE_DEVICE, + .instance_size = sizeof(XlnxEFuse), +- .instance_finalize = efuse_finalize, + .class_init = efuse_class_init, + }; + +diff --git a/hw/nvram/xlnx-versal-efuse-ctrl.c b/hw/nvram/xlnx-versal-efuse-ctrl.c +index 9096219800..ec86a7d6e5 100644 +--- a/hw/nvram/xlnx-versal-efuse-ctrl.c ++++ b/hw/nvram/xlnx-versal-efuse-ctrl.c +@@ -729,7 +729,6 @@ static void efuse_ctrl_finalize(Object *obj) + 
XlnxVersalEFuseCtrl *s = XLNX_VERSAL_EFUSE_CTRL(obj); + + register_finalize_block(s->reg_array); +- g_free(s->extra_pg0_lock_spec); + } + + static const VMStateDescription vmstate_efuse_ctrl = { +diff --git a/hw/virtio/virtio-iommu-pci.c b/hw/virtio/virtio-iommu-pci.c +index 8123c6f83a..9396b3c07e 100644 +--- a/hw/virtio/virtio-iommu-pci.c ++++ b/hw/virtio/virtio-iommu-pci.c +@@ -94,18 +94,10 @@ static void virtio_iommu_pci_instance_init(Object *obj) + TYPE_VIRTIO_IOMMU); + } + +-static void virtio_iommu_pci_instance_finalize(Object *obj) +-{ +- VirtIOIOMMUPCI *dev = VIRTIO_IOMMU_PCI(obj); +- +- g_free(dev->vdev.prop_resv_regions); +-} +- + static const VirtioPCIDeviceTypeInfo virtio_iommu_pci_info = { + .generic_name = TYPE_VIRTIO_IOMMU_PCI, + .instance_size = sizeof(VirtIOIOMMUPCI), + .instance_init = virtio_iommu_pci_instance_init, +- .instance_finalize = virtio_iommu_pci_instance_finalize, + .class_init = virtio_iommu_pci_class_init, + }; + diff --git a/0322-NVIDIA-SAUCE-qom-New-object-to-associate-device-to-E.patch b/0322-NVIDIA-SAUCE-qom-New-object-to-associate-device-to-E.patch new file mode 100644 index 0000000..6f8859a --- /dev/null +++ b/0322-NVIDIA-SAUCE-qom-New-object-to-associate-device-to-E.patch @@ -0,0 +1,239 @@ +From 4ed20edada71321d844ef573bfa9b910101bfb93 Mon Sep 17 00:00:00 2001 +From: Gavin Shan +Date: Fri, 24 Apr 2026 14:14:42 +1000 +Subject: [PATCH] NVIDIA: SAUCE: qom: New object to associate device to EGM + node + +RH-Author: Gavin Shan +RH-MergeRequest: 477: Backport commits of EGM virtualization support +RH-Jira: VOYAGER-15 +RH-Acked-by: Eric Auger +RH-Commit: [1/5] 96591094d207858333b099f24ba29693526698a3 (gwshan/qemu-centos10) + +JIRA: https://redhat.atlassian.net/browse/VOYAGER-15 +UPSTREAM: No, git@github.com:NVIDIA/QEMU.git (branch: nvidia_stable-10.1) + +The Extended GPU Memory (EGM) feature [1] enables the GPU access to +the local or remote system memory across sockets and nodes. 
In +this mode, the physical memory can be allocated for GPU usage from +anywhere in a multi-node system. The feature is being extended to +virtualization. + +The CPU node with the EGM is associated with the GPUs present +on the same socket in a way that the EGM node information such +as its base physical address, length and the proximity domain ID +is populated in the ACPI DSDT entries of those associated GPUs. +This information is needed by the NVIDIA driver in the VM to +discover its local EGM memory. + +The CPU memory being utilized as EGM is exposed as a +memory-backend-file /dev/egmX backed by the nvgrace-egm module. + +To link the GPU devices to the CPU EGM node, a new qom object +acpi-egm-memory is introduced. This helps Qemu populate the DSDT +entries to the appropriate GPU device. + +An admin can provide this association as following. In the example, +the NUMA node 0 has the EGM memory created through the /dev/egm4 +device. This node is linked with the dev0 GPU device using the +acpi-egm-memory object. + +... +-numa node,memdev=m0,cpus=0-3,nodeid=0 \ +-object memory-backend-file,id=m0,mem-path=/dev/egm4,size=84G,share=on,prealloc=on \ +-device vfio-pci-nohotplug,host=0008:01:00.0,bus=pcie.0,rombar=0,id=dev0 \ +-object acpi-egm-memory,id=egm0,pci-dev=dev0,node=0 +... + +Link: https://developer.nvidia.com/blog/nvidia-grace-hopper-superchip-architecture-in-depth/#extended_gpu_memory [1] + +Signed-off-by: Ankit Agrawal +(cherry picked from commit e647ae72002c57383934077d01ca523e2b9bc2e2 https://github.com/nvmochs/QEMU/tree/stable101_smmuv3-accel-07212025_egm) +Signed-off-by: Matthew R. Ochs +(cherry picked from commit 4954345693fdcd40895049138185c9dfb4fd3ccb) +Signed-off-by: Gavin Shan +Conflicts: + hw/acpi/acpi_egm_memory.c + include/hw/acpi/acpi_egm_memory.h + qapi/qom.json + The license is changed from GPL-2.0-only to GPL-2.0-or-later. + Return (not exit) from acpi_egm_memory_set_node() on invalid NUMA node. 
+ Mark object acpi-egm-memory as unstable in qom.json. + Mark AcpiEgmMemoryProperties available since 11.0 instead of 9.0 in qom.json +--- + hw/acpi/acpi_egm_memory.c | 72 +++++++++++++++++++++++++++++++ + hw/acpi/meson.build | 1 + + include/hw/acpi/acpi_egm_memory.h | 22 ++++++++++ + qapi/qom.json | 21 ++++++++- + 4 files changed, 114 insertions(+), 2 deletions(-) + create mode 100644 hw/acpi/acpi_egm_memory.c + create mode 100644 include/hw/acpi/acpi_egm_memory.h + +diff --git a/hw/acpi/acpi_egm_memory.c b/hw/acpi/acpi_egm_memory.c +new file mode 100644 +index 0000000000..969e272254 +--- /dev/null ++++ b/hw/acpi/acpi_egm_memory.c +@@ -0,0 +1,72 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved ++ */ ++ ++#include "qemu/osdep.h" ++#include "hw/acpi/acpi_egm_memory.h" ++#include "hw/boards.h" ++#include "qapi/error.h" ++#include "qemu/error-report.h" ++ ++typedef struct AcpiEgmMemoryClass { ++ ObjectClass parent_class; ++} AcpiEgmMemoryClass; ++ ++OBJECT_DEFINE_TYPE_WITH_INTERFACES(AcpiEgmMemory, acpi_egm_memory, ++ ACPI_EGM_MEMORY, OBJECT, ++ { TYPE_USER_CREATABLE }, ++ { NULL }) ++ ++OBJECT_DECLARE_SIMPLE_TYPE(AcpiEgmMemory, ACPI_EGM_MEMORY) ++ ++static void acpi_egm_memory_init(Object *obj) ++{ ++ AcpiEgmMemory *egm = ACPI_EGM_MEMORY(obj); ++ ++ egm->node = MAX_NODES; ++ egm->pci_dev = NULL; ++} ++ ++static void acpi_egm_memory_finalize(Object *obj) ++{ ++ AcpiEgmMemory *egm = ACPI_EGM_MEMORY(obj); ++ ++ g_free(egm->pci_dev); ++} ++ ++static void acpi_egm_memory_set_pci_device(Object *obj, const char *val, ++ Error **errp) ++{ ++ AcpiEgmMemory *egm = ACPI_EGM_MEMORY(obj); ++ ++ egm->pci_dev = g_strdup(val); ++} ++ ++static void acpi_egm_memory_set_node(Object *obj, Visitor *v, ++ const char *name, void *opaque, ++ Error **errp) ++{ ++ AcpiEgmMemory *egm = ACPI_EGM_MEMORY(obj); ++ uint32_t value; ++ ++ if (!visit_type_uint32(v, name, &value, errp)) { ++ return; ++ } ++ ++ if (value 
>= MAX_NODES) { ++ error_setg(errp, "%s: Invalid NUMA node %d (max %d)", ++ TYPE_ACPI_EGM_MEMORY, value, MAX_NODES - 1); ++ return; ++ } ++ ++ egm->node = value; ++} ++ ++static void acpi_egm_memory_class_init(ObjectClass *oc, const void *data) ++{ ++ object_class_property_add_str(oc, "pci-dev", NULL, ++ acpi_egm_memory_set_pci_device); ++ object_class_property_add(oc, "node", "int", NULL, ++ acpi_egm_memory_set_node, NULL, NULL); ++} +diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build +index 73f02b9691..a5db53a422 100644 +--- a/hw/acpi/meson.build ++++ b/hw/acpi/meson.build +@@ -1,5 +1,6 @@ + acpi_ss = ss.source_set() + acpi_ss.add(files( ++ 'acpi_egm_memory.c', + 'acpi_interface.c', + 'aml-build.c', + 'bios-linker-loader.c', +diff --git a/include/hw/acpi/acpi_egm_memory.h b/include/hw/acpi/acpi_egm_memory.h +new file mode 100644 +index 0000000000..b56a55c066 +--- /dev/null ++++ b/include/hw/acpi/acpi_egm_memory.h +@@ -0,0 +1,22 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved ++ */ ++ ++#ifndef ACPI_EGM_MEMORY_H ++#define ACPI_EGM_MEMORY_H ++ ++#include "qom/object_interfaces.h" ++ ++#define TYPE_ACPI_EGM_MEMORY "acpi-egm-memory" ++ ++typedef struct AcpiEgmMemory { ++ /* private */ ++ Object parent; ++ ++ /* public */ ++ char *pci_dev; ++ uint16_t node; ++} AcpiEgmMemory; ++ ++#endif +diff --git a/qapi/qom.json b/qapi/qom.json +index 830cb2ffe7..9235b0c7ec 100644 +--- a/qapi/qom.json ++++ b/qapi/qom.json +@@ -878,6 +878,21 @@ + { 'struct': 'IOMMUFDProperties', + 'data': { '*fd': 'str' } } + ++## ++# @AcpiEgmMemoryProperties: ++# ++# Properties for acpi-egm-memory objects. 
++# ++# @pci-dev: PCI device ID to be associated with the node ++# ++# @node: NUMA node associated with the PCI device ++# ++# Since: 11.0 ++## ++{ 'struct': 'AcpiEgmMemoryProperties', ++ 'data': { 'pci-dev': 'str', ++ 'node': 'uint32' } } ++ + ## + # @AcpiGenericInitiatorProperties: + # +@@ -1175,13 +1190,14 @@ + # + # Features: + # +-# @unstable: Members @x-remote-object and @x-vfio-user-server are +-# experimental. ++# @unstable: Members @acpi-egm-memory, @x-remote-object and ++# @x-vfio-user-server are experimental. + # + # Since: 6.0 + ## + { 'enum': 'ObjectType', + 'data': [ ++ { 'name': 'acpi-egm-memory', 'features': [ 'unstable' ] }, + 'acpi-generic-initiator', + 'acpi-generic-port', + 'authz-list', +@@ -1261,6 +1277,7 @@ + 'id': 'str' }, + 'discriminator': 'qom-type', + 'data': { ++ 'acpi-egm-memory': 'AcpiEgmMemoryProperties', + 'acpi-generic-initiator': 'AcpiGenericInitiatorProperties', + 'acpi-generic-port': 'AcpiGenericPortProperties', + 'authz-list': 'AuthZListProperties', diff --git a/0323-NVIDIA-SAUCE-hw-acpi-Populate-DSDT-with-EGM-properti.patch b/0323-NVIDIA-SAUCE-hw-acpi-Populate-DSDT-with-EGM-properti.patch new file mode 100644 index 0000000..db44637 --- /dev/null +++ b/0323-NVIDIA-SAUCE-hw-acpi-Populate-DSDT-with-EGM-properti.patch @@ -0,0 +1,240 @@ +From 6ce9e589be6c63ef164919bb58d217c856c75bba Mon Sep 17 00:00:00 2001 +From: Gavin Shan +Date: Fri, 24 Apr 2026 14:14:42 +1000 +Subject: [PATCH] NVIDIA: SAUCE: hw/acpi: Populate DSDT with EGM properties + +RH-Author: Gavin Shan +RH-MergeRequest: 477: Backport commits of EGM virtualization support +RH-Jira: VOYAGER-15 +RH-Acked-by: Eric Auger +RH-Commit: [2/5] 6ebbfde1825b2d7ce5cc08f981fc40c00f8652ae (gwshan/qemu-centos10) + +JIRA: https://redhat.atlassian.net/browse/VOYAGER-15 +UPSTREAM: No, git@github.com:NVIDIA/QEMU.git (branch: nvidia_stable-10.1) + +The Qemu code builds the ACPI DSDT for the VM devices. 
The +Extended GPU Memory (EGM) information such as physical address, +length and proximity domain ID is populated in the DSDT entries +of the GPU devices present in the same socket as the EGM memory. +This is used by the VM NVIDIA driver to determine the EGM properties. +The GPU device is linked with the EGM memory node through the +acpi-egm-memory object. + +While building ACPI tables, go through all of the egm-memory objects. +Find the device and the EGM NUMA node association from the objects. +Patch the DSDT to create the GPU device entries and populate with +the corresponding NUMA node properties with DSDT object. + +Signed-off-by: Ankit Agrawal +(cherry picked from commit a5eae1e4755ba9ffea4b20a3678a3f4eacfd4542 https://github.com/nvmochs/QEMU/tree/stable101_smmuv3-accel-07212025_egm) +Signed-off-by: Matthew R. Ochs +(cherry picked from commit b62790fe50c3582177d7c2ddc53d469951da76bf) +Signed-off-by: Gavin Shan +Conflicts: + hw/acpi/acpi_egm_memory.c + hw/arm/virt-acpi-build.c + hw/pci-host/gpex-acpi.c + include/hw/acpi/acpi_egm_memory.h + include/hw/pci-host/gpex.h + Avoid accessing VirtMachineState in acpi_egm_memory.c by passing + the system memory base address to GPEXConfig::system_memory_base +--- + hw/acpi/acpi_egm_memory.c | 104 ++++++++++++++++++++++++++++++ + hw/arm/virt-acpi-build.c | 1 + + hw/pci-host/gpex-acpi.c | 5 ++ + include/hw/acpi/acpi_egm_memory.h | 2 + + include/hw/pci-host/gpex.h | 1 + + 5 files changed, 113 insertions(+) + +diff --git a/hw/acpi/acpi_egm_memory.c b/hw/acpi/acpi_egm_memory.c +index 969e272254..844ec1a2cd 100644 +--- a/hw/acpi/acpi_egm_memory.c ++++ b/hw/acpi/acpi_egm_memory.c +@@ -5,14 +5,19 @@ + + #include "qemu/osdep.h" + #include "hw/acpi/acpi_egm_memory.h" ++#include "hw/acpi/aml-build.h" + #include "hw/boards.h" ++#include "hw/pci/pci_device.h" + #include "qapi/error.h" + #include "qemu/error-report.h" ++#include "include/hw/boards.h" + + typedef struct AcpiEgmMemoryClass { + ObjectClass parent_class; + } 
AcpiEgmMemoryClass; + ++static int gpu_id; ++ + OBJECT_DEFINE_TYPE_WITH_INTERFACES(AcpiEgmMemory, acpi_egm_memory, + ACPI_EGM_MEMORY, OBJECT, + { TYPE_USER_CREATABLE }, +@@ -70,3 +75,102 @@ static void acpi_egm_memory_class_init(ObjectClass *oc, const void *data) + object_class_property_add(oc, "node", "int", NULL, + acpi_egm_memory_set_node, NULL, NULL); + } ++ ++static void acpi_dsdt_add_gpu(Aml *dev, int32_t devfn, uint64_t egm_mem_base, ++ uint64_t egm_mem_size, int egm_mem_pxm) ++{ ++ Aml *dev_gpu = aml_device("GPU%d", gpu_id++); ++ Aml *pkg = aml_package(3); ++ Aml *pkg1 = aml_package(2); ++ Aml *pkg2 = aml_package(2); ++ Aml *pkg3 = aml_package(2); ++ Aml *dev_pkg = aml_package(2); ++ Aml *UUID; ++ ++ aml_append(dev_gpu, aml_name_decl("_ADR", aml_int(PCI_SLOT(devfn) << 16))); ++ ++ aml_append(pkg1, aml_string("nvidia,egm-base-pa")); ++ aml_append(pkg1, aml_int(egm_mem_base)); ++ ++ aml_append(pkg2, aml_string("nvidia,egm-size")); ++ aml_append(pkg2, aml_int(egm_mem_size)); ++ ++ aml_append(pkg3, aml_string("nvidia,egm-pxm")); ++ aml_append(pkg3, aml_int(egm_mem_pxm)); ++ ++ aml_append(pkg, pkg1); ++ aml_append(pkg, pkg2); ++ aml_append(pkg, pkg3); ++ ++ UUID = aml_touuid("DAFFD814-6EBA-4D8C-8A91-BC9BBF4AA301"); ++ aml_append(dev_pkg, UUID); ++ aml_append(dev_pkg, pkg); ++ ++ aml_append(dev_gpu, aml_name_decl("_DSD", dev_pkg)); ++ aml_append(dev, dev_gpu); ++} ++ ++typedef struct DsdtInfo { ++ Aml *dev; ++ int bus; ++ uint64_t mem_base; ++} DsdtInfo; ++ ++static int build_all_acpi_egm_memory_dsdt(Object *obj, void *opaque) ++{ ++ MachineState *ms = MACHINE(qdev_get_machine()); ++ AcpiEgmMemory *egm; ++ DsdtInfo *info = opaque; ++ PCIDevice *pci_dev; ++ Object *o; ++ uint64_t mem_base; ++ int i; ++ ++ if (!object_dynamic_cast(obj, TYPE_ACPI_EGM_MEMORY)) { ++ return 0; ++ } ++ ++ egm = ACPI_EGM_MEMORY(obj); ++ if (egm->node >= ms->numa_state->num_nodes) { ++ error_printf("%s: Specified node %d is invalid.\n", ++ TYPE_ACPI_EGM_MEMORY, egm->node); ++ exit(1); ++ 
} ++ ++ o = object_resolve_path_type(egm->pci_dev, TYPE_PCI_DEVICE, NULL); ++ if (!o) { ++ error_printf("%s: Specified device must be a PCI device.\n", ++ TYPE_ACPI_EGM_MEMORY); ++ exit(1); ++ } ++ ++ pci_dev = PCI_DEVICE(o); ++ ++ if (info->bus != pci_bus_num(pci_get_bus(pci_dev))) { ++ return 0; ++ } ++ ++ mem_base = info->mem_base; ++ for (i = 0; i < ms->numa_state->num_nodes; ++i) { ++ if (i == egm->node) { ++ acpi_dsdt_add_gpu(info->dev, pci_dev->devfn, mem_base, ++ ms->numa_state->nodes[i].node_mem, i); ++ break; ++ } ++ ++ if (ms->numa_state->nodes[i].node_mem > 0) { ++ mem_base += ms->numa_state->nodes[i].node_mem; ++ } ++ } ++ ++ return 0; ++} ++ ++void build_acpi_egm_memory_dsdt(Aml *dev, int bus, uint64_t mem_base) ++{ ++ DsdtInfo info = {dev, bus, mem_base}; ++ ++ object_child_foreach_recursive(object_get_root(), ++ build_all_acpi_egm_memory_dsdt, ++ &info); ++} +diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c +index 083c97f560..f828325e27 100644 +--- a/hw/arm/virt-acpi-build.c ++++ b/hw/arm/virt-acpi-build.c +@@ -164,6 +164,7 @@ static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, + .irq = irq, + .bus = vms->bus, + .pci_native_hotplug = !acpi_pcihp, ++ .system_memory_base = vms->memmap[VIRT_MEM].base, + }; + + /* +diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c +index d9820f9b41..483bf6bd20 100644 +--- a/hw/pci-host/gpex-acpi.c ++++ b/hw/pci-host/gpex-acpi.c +@@ -7,6 +7,7 @@ + #include "hw/pci/pci_bridge.h" + #include "hw/pci/pcie_host.h" + #include "hw/acpi/cxl.h" ++#include "hw/acpi/acpi_egm_memory.h" + + static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq, + Aml *scope, uint8_t bus_num) +@@ -171,6 +172,8 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) + cfg->preserve_config); + } + ++ build_acpi_egm_memory_dsdt(dev, bus_num, cfg->system_memory_base); ++ + aml_append(scope, dev); + } + } +@@ -246,6 +249,8 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) + 
acpi_dsdt_add_host_bridge_methods(dev, cfg->pci_native_hotplug, + cfg->preserve_config); + ++ build_acpi_egm_memory_dsdt(dev, 0, cfg->system_memory_base); ++ + Aml *dev_res0 = aml_device("%s", "RES0"); + aml_append(dev_res0, aml_name_decl("_HID", aml_string("PNP0C02"))); + crs = aml_resource_template(); +diff --git a/include/hw/acpi/acpi_egm_memory.h b/include/hw/acpi/acpi_egm_memory.h +index b56a55c066..011982c35a 100644 +--- a/include/hw/acpi/acpi_egm_memory.h ++++ b/include/hw/acpi/acpi_egm_memory.h +@@ -19,4 +19,6 @@ typedef struct AcpiEgmMemory { + uint16_t node; + } AcpiEgmMemory; + ++void build_acpi_egm_memory_dsdt(Aml *dev, int bus, uint64_t mem_base); ++ + #endif +diff --git a/include/hw/pci-host/gpex.h b/include/hw/pci-host/gpex.h +index 7eea16e728..2950be4c53 100644 +--- a/include/hw/pci-host/gpex.h ++++ b/include/hw/pci-host/gpex.h +@@ -47,6 +47,7 @@ struct GPEXConfig { + PCIBus *bus; + bool pci_native_hotplug; + bool preserve_config; ++ uint64_t system_memory_base; + }; + + typedef struct GPEXIrq GPEXIrq; diff --git a/0324-NVIDIA-SAUCE-hw-arm-boot-Create-DTB-memory-regions-s.patch b/0324-NVIDIA-SAUCE-hw-arm-boot-Create-DTB-memory-regions-s.patch new file mode 100644 index 0000000..eb90b40 --- /dev/null +++ b/0324-NVIDIA-SAUCE-hw-arm-boot-Create-DTB-memory-regions-s.patch @@ -0,0 +1,235 @@ +From 332983e6e15506ec78562acd8d258f8421cbe212 Mon Sep 17 00:00:00 2001 +From: Gavin Shan +Date: Fri, 24 Apr 2026 14:14:42 +1000 +Subject: [PATCH] NVIDIA: SAUCE: hw/arm/boot: Create DTB memory regions + skipping ECC error pages on EGM + +RH-Author: Gavin Shan +RH-MergeRequest: 477: Backport commits of EGM virtualization support +RH-Jira: VOYAGER-15 +RH-Acked-by: Eric Auger +RH-Commit: [3/5] 679071783b49403599e6249743c4411dbd0d35b5 (gwshan/qemu-centos10) + +JIRA: https://redhat.atlassian.net/browse/VOYAGER-15 +UPSTREAM: No, git@github.com:NVIDIA/QEMU.git (branch: nvidia_stable-10.1) + +The nvgrace-egm module expose the list of pages with uncorrected ECC +errors 
(referred as bad pages) in EGM memory through a EGM_BAD_PAGES_LIST +ioctl on the EGM char device. + +Since these pages should not be accessed by the VM OS, they need to +be kept absent from the VM memory map. This is achieved by leveraging +the memory region in DTBs. + +Fetch the list of the pages by calling the ioctl and sort. The memory +regions are built using this list by terminating a memory region at +the physical address of a bad page. The next region is started from +the next page, essentially skipping over the bad page. + +Also a minor code organization to move the fdt_add_memory_node to +a wrapper that checks if the provided memory region length is +non-zero before adding in DTB. + +Signed-off-by: Ankit Agrawal +(cherry picked from commit dba314bb6eea9988a8d56e6925e103532672f950 https://github.com/nvmochs/QEMU/tree/stable101_smmuv3-accel-07212025_egm) +Signed-off-by: Matthew R. Ochs +(cherry picked from commit 2dd5ad8b77bf80131b1d9ee1b96c215f28d582ec) +Signed-off-by: Gavin Shan +Conflicts: + hw/arm/boot.c + linux-headers/linux/egm.h + Update egm.h to that included in host's RFCv2 series +--- + hw/arm/boot.c | 128 +++++++++++++++++++++++++++++++++++--- + linux-headers/linux/egm.h | 28 +++++++++ + 2 files changed, 147 insertions(+), 9 deletions(-) + create mode 100644 linux-headers/linux/egm.h + +diff --git a/hw/arm/boot.c b/hw/arm/boot.c +index d391cd01bb..91af045385 100644 +--- a/hw/arm/boot.c ++++ b/hw/arm/boot.c +@@ -32,6 +32,14 @@ + #include "qemu/units.h" + #include "qemu/bswap.h" + ++#include ++#include "qapi/error.h" ++#include "qemu/error-report.h" ++ ++#ifdef CONFIG_LINUX ++#include ++#endif ++ + /* Kernel boot protocol is specified in the kernel docs + * Documentation/arm/Booting and Documentation/arm64/booting.txt + * They have different preferred image load offsets from system RAM base. 
+@@ -515,6 +523,102 @@ static void fdt_add_psci_node(void *fdt, ARMCPU *armcpu) + qemu_fdt_setprop_cell(fdt, "/psci", "migrate", migrate_fn); + } + ++static int fdt_add_memory_node_wrapper(void *fdt, uint32_t acells, ++ hwaddr mem_base, uint32_t scells, ++ hwaddr mem_len, int numa_node_id) ++{ ++ int rc; ++ ++ if (!mem_len) { ++ return 0; ++ } ++ ++ rc = fdt_add_memory_node(fdt, acells, mem_base, scells, ++ mem_len, numa_node_id); ++ if (rc < 0) { ++ fprintf(stderr, "couldn't add /memory@%"PRIx64" node\n", ++ mem_base); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int offset_compare(const void *a, const void *b) ++{ ++ struct egm_retired_pages_info *page1 = (struct egm_retired_pages_info *) a; ++ struct egm_retired_pages_info *page2 = (struct egm_retired_pages_info *) b; ++ ++ if (page1->offset > page2->offset) { ++ return 1; ++ } else if (page1->offset < page2->offset) { ++ return -1; ++ } else { ++ return 0; ++ } ++} ++ ++static int add_memory_regions(struct egm_retired_pages_list *info, void *fdt, ++ uint32_t acells, hwaddr mem_base, uint32_t scells, ++ hwaddr mem_len, int numa_node_id) ++{ ++ int index; ++ hwaddr mem_base_curr, mem_last_curr = mem_len; ++ ++ if (!info) { ++ goto no_retired_pages; ++ } ++ ++ qsort(info->retired_pages, info->count, ++ sizeof(struct egm_retired_pages_info), &offset_compare); ++ mem_last_curr = mem_len; ++ for (index = info->count - 1; index >= 0; index--) { ++ mem_base_curr = info->retired_pages[index].offset + ++ info->retired_pages[index].size; ++ if (fdt_add_memory_node_wrapper(fdt, acells, mem_base_curr + mem_base, ++ scells, mem_last_curr - mem_base_curr, ++ numa_node_id)) { ++ return -1; ++ } ++ ++ mem_last_curr = info->retired_pages[index].offset; ++ } ++ ++no_retired_pages: ++ if (fdt_add_memory_node_wrapper(fdt, acells, mem_base, scells, ++ mem_last_curr, numa_node_id)) { ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static void fetch_retired_pages(MemoryRegion *mr, ++ struct egm_retired_pages_list **info) ++{ ++ 
size_t argsz = sizeof(struct egm_retired_pages_list); ++ int fd; ++ ++ *info = g_malloc0(argsz); ++ ++ fd = memory_region_get_fd(mr); ++ ++retry: ++ (*info)->argsz = argsz; ++ ++ if (ioctl(fd, EGM_RETIRED_PAGES_LIST, *info)) { ++ g_free(*info); ++ *info = NULL; ++ return; ++ } ++ ++ if ((*info)->argsz > argsz) { ++ argsz = (*info)->argsz; ++ *info = g_realloc(*info, argsz); ++ goto retry; ++ } ++} ++ + int arm_load_dtb(hwaddr addr, const struct arm_boot_info *binfo, + hwaddr addr_limit, AddressSpace *as, MachineState *ms, + ARMCPU *cpu) +@@ -604,16 +708,22 @@ int arm_load_dtb(hwaddr addr, const struct arm_boot_info *binfo, + mem_base = binfo->loader_start; + for (i = 0; i < ms->numa_state->num_nodes; i++) { + mem_len = ms->numa_state->nodes[i].node_mem; +- if (!mem_len) { +- continue; +- } + +- rc = fdt_add_memory_node(fdt, acells, mem_base, +- scells, mem_len, i); +- if (rc < 0) { +- fprintf(stderr, "couldn't add /memory@%"PRIx64" node\n", +- mem_base); +- goto fail; ++ if (ms->numa_state->nodes[i].node_memdev) { ++ struct egm_retired_pages_list *info = NULL; ++ fetch_retired_pages(&(ms->numa_state->nodes[i].node_memdev->mr), ++ &info); ++ rc = add_memory_regions(info, fdt, acells, mem_base, scells, ++ mem_len, i); ++ g_free(info); ++ ++ if (rc) { ++ goto fail; ++ } ++ } else { ++ if (fdt_add_memory_node_wrapper(fdt, acells, mem_base, ++ scells, mem_len, i)) ++ goto fail; + } + + mem_base += mem_len; +diff --git a/linux-headers/linux/egm.h b/linux-headers/linux/egm.h +new file mode 100644 +index 0000000000..4d3a2304d4 +--- /dev/null ++++ b/linux-headers/linux/egm.h +@@ -0,0 +1,28 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++/* ++ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved ++ */ ++ ++#ifndef _UAPI_LINUX_EGM_H ++#define _UAPI_LINUX_EGM_H ++ ++#include <linux/types.h> ++ ++#define EGM_TYPE ('E') ++ ++struct egm_retired_pages_info { ++ __aligned_u64 offset; ++ __aligned_u64 size; ++}; ++ ++struct egm_retired_pages_list { ++ __u32 argsz; ++ /* out */ ++ __u32 count; ++ /* out */ ++ struct egm_retired_pages_info retired_pages[]; ++}; ++ ++#define EGM_RETIRED_PAGES_LIST _IO(EGM_TYPE, 100) ++ ++#endif /* _UAPI_LINUX_EGM_H */ diff --git a/0325-NVIDIA-SAUCE-hw-acpi-Add-pxb-bridge-above-GPU-in-DSD.patch b/0325-NVIDIA-SAUCE-hw-acpi-Add-pxb-bridge-above-GPU-in-DSD.patch new file mode 100644 index 0000000..88da52c --- /dev/null +++ b/0325-NVIDIA-SAUCE-hw-acpi-Add-pxb-bridge-above-GPU-in-DSD.patch @@ -0,0 +1,59 @@ +From 11bc54f51b82fee3aec8a495fea3eca4aee88ee8 Mon Sep 17 00:00:00 2001 +From: Gavin Shan +Date: Fri, 24 Apr 2026 14:14:42 +1000 +Subject: [PATCH] NVIDIA: SAUCE: hw/acpi: Add pxb bridge above GPU in DSDT + +RH-Author: Gavin Shan +RH-MergeRequest: 477: Backport commits of EGM virtualization support +RH-Jira: VOYAGER-15 +RH-Acked-by: Eric Auger +RH-Commit: [4/5] 33444c059ec1151b6d47631e934ebd075e3f692b (gwshan/qemu-centos10) + +JIRA: https://redhat.atlassian.net/browse/VOYAGER-15 +UPSTREAM: No, git@github.com:NVIDIA/QEMU.git (branch: nvidia_stable-10.1) + +Signed-off-by: Ankit Agrawal +(cherry picked from commit 3f183b46d7e2d184f9def55f09b9495e63a18b5e https://github.com/nvmochs/QEMU/tree/stable101_smmuv3-accel-07212025_egm) +Signed-off-by: Matthew R.
Ochs +(cherry picked from commit 09d8e4815e1758841fc59fcf5ad81c3cd55ca0be) +Signed-off-by: Gavin Shan +--- + hw/pci-host/gpex-acpi.c | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c +index 483bf6bd20..b590008613 100644 +--- a/hw/pci-host/gpex-acpi.c ++++ b/hw/pci-host/gpex-acpi.c +@@ -118,6 +118,8 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) + uint8_t numa_node = pci_bus_numa_node(bus); + uint32_t uid; + bool is_cxl = pci_bus_is_cxl(bus); ++ int devfn; ++ Aml *brg; + + if (!pci_bus_is_root(bus)) { + continue; +@@ -172,7 +174,21 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) + cfg->preserve_config); + } + +- build_acpi_egm_memory_dsdt(dev, bus_num, cfg->system_memory_base); ++ for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) { ++ /* ACPI spec: 1.0b: Table 6-2 _ADR Object Bus Types, PCI type */ ++ int adr = PCI_SLOT(devfn) << 16 | PCI_FUNC(devfn); ++ PCIDevice *pdev = bus->devices[devfn]; ++ ++ if (!pdev) { ++ continue; ++ } ++ ++ brg = aml_device("RP%.02X", devfn); ++ aml_append(brg, aml_name_decl("_ADR", aml_int(adr))); ++ build_acpi_egm_memory_dsdt(brg, bus_num + 1, ++ cfg->system_memory_base); ++ aml_append(dev, brg); ++ } + + aml_append(scope, dev); + } diff --git a/0326-WAR-hw-vfio-Use-IOMMU_IOAS_MAP-for-EGM-memory-region.patch b/0326-WAR-hw-vfio-Use-IOMMU_IOAS_MAP-for-EGM-memory-region.patch new file mode 100644 index 0000000..9b042ff --- /dev/null +++ b/0326-WAR-hw-vfio-Use-IOMMU_IOAS_MAP-for-EGM-memory-region.patch @@ -0,0 +1,117 @@ +From 30fdd1153dff5c752601a008a68823c7d16fff2b Mon Sep 17 00:00:00 2001 +From: Gavin Shan +Date: Fri, 24 Apr 2026 14:14:42 +1000 +Subject: [PATCH] WAR: hw/vfio: Use IOMMU_IOAS_MAP for EGM memory region + +RH-Author: Gavin Shan +RH-MergeRequest: 477: Backport commits of EGM virtualization support +RH-Jira: VOYAGER-15 +RH-Acked-by: Eric Auger +RH-Commit: [5/5] 
abe2753afdf459ceacf1a80941588130b964919b (gwshan/qemu-centos10) + +JIRA: https://redhat.atlassian.net/browse/VOYAGER-15 +UPSTREAM: RHEL-only + +There are two ioctl commands used to map user space regions to IOVA +when iommufd is used: IOMMU_IOAS_MAP and IOMMU_IOAS_MAP_FILE. The +mapping requests for the user space region associated with /dev/egmx +are done by IOMMU_IOAS_MAP_FILE. Unfortunately, that ioctl command +doesn't work for that specific user space region because the file +descriptor for the region doesn't fall into the memory file descriptor +(memfd) category, which is required for IOMMU_IOAS_MAP_FILE to work. + +This leads to GH200 passthrough failure on NVIDIA Grace Hopper machines, +as the error messages emitted by QEMU indicate. + +host$ /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ +-accel kvm -machine virt,gic-version=host,ras=on,highmem-mmio-size=4T \ +-cpu host -smp cpus=32 -m size=8G \ +-object memory-backend-file,id=mem0,mem-path=/dev/egm4,share=on,prealloc=on,size=8G \ + : +qemu-system-aarch64: -device vfio-pci-nohotplug,host=0009:01:00.0, \ +bus=pcie.1.0,rombar=0,id=pt0,iommufd=iommufd0: vfio 0009:01:00.0: \ +memory listener initialization failed: Region mem0: \ +vfio_container_dma_map(0xaaaaf789f430, 0x40000000, 0x200000000, \ +0xfffab3ff0000) = -22 (Invalid argument) + +Enforce the DMA mapping requests on /dev/egmx regions to be covered +by IOMMU_IOAS_MAP so that GH200 can be passed to the guest when a /dev/egmx +region is used as the memory backend.
+ +Signed-off-by: Gavin Shan +--- + hw/vfio/container.c | 54 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 53 insertions(+), 1 deletion(-) + +diff --git a/hw/vfio/container.c b/hw/vfio/container.c +index 1b8569d36a..d26d87fe71 100644 +--- a/hw/vfio/container.c ++++ b/hw/vfio/container.c +@@ -73,6 +73,58 @@ void vfio_address_space_insert(VFIOAddressSpace *space, + bcontainer->space = space; + } + ++static bool is_egm_file(int fd) ++{ ++ DIR *dir; ++ struct dirent *ent; ++ struct stat st; ++ int index; ++ ssize_t len; ++ bool found = false; ++ g_autofree char *filename = g_malloc(PATH_MAX); ++ g_autofree char *link_filename = g_malloc(PATH_MAX); ++ g_autofree char *base_filename = NULL; ++ ++ /* Get the full file name */ ++ snprintf(link_filename, PATH_MAX, "/proc/self/fd/%d", fd); ++ len = readlink(link_filename, filename, PATH_MAX - 1); ++ if (len <= 0) { ++ return false; ++ } ++ ++ /* The full file name should start with "/dev/egm" */ ++ filename[len] = '\0'; ++ if (strncmp(filename, "/dev/egm", 8)) { ++ return false; ++ } ++ ++ /* Get the index */ ++ base_filename = g_path_get_basename(filename); ++ if (sscanf(base_filename, "egm%d", &index) != 1) { ++ return false; ++ } ++ ++ /* Check if the EGM device is associated with any PCI device */ ++ dir = opendir("/sys/bus/pci/devices"); ++ if (dir == NULL) { ++ return false; ++ } ++ ++ while ((ent = readdir(dir)) != NULL) { ++ snprintf(filename, PATH_MAX, "/sys/bus/pci/devices/%s/" ++ "nvgrace_gpu_vfio_pci.egm.%d/egm/egm%d/dev", ++ ent->d_name, index, index); ++ if (stat(filename, &st) == 0) { ++ found = true; ++ break; ++ } ++ } ++ ++ closedir(dir); ++ ++ return found; ++} ++ + int vfio_container_dma_map(VFIOContainer *bcontainer, + hwaddr iova, uint64_t size, + void *vaddr, bool readonly, MemoryRegion *mr) +@@ -81,7 +133,7 @@ int vfio_container_dma_map(VFIOContainer *bcontainer, + RAMBlock *rb = mr->ram_block; + int mfd = rb ? 
qemu_ram_get_fd(rb) : -1; + +- if (mfd >= 0 && vioc->dma_map_file) { ++ if (mfd >= 0 && !is_egm_file(mfd) && vioc->dma_map_file) { + unsigned long start = vaddr - qemu_ram_get_host_addr(rb); + unsigned long offset = qemu_ram_get_fd_offset(rb); + diff --git a/README.rst b/README.rst deleted file mode 100644 index 15f93e6..0000000 --- a/README.rst +++ /dev/null @@ -1,19 +0,0 @@ -=================== -qemu-kvm development -=================== - -qemu-kvm is maintained in a `source tree`_ rather than directly in dist-git. -This provides way to develope using regular source code structure and provides -way to generate SRPM and build using koji service. In addition, local build using -CentOS 9 Stream specific configuration. - -Developers deliver all changes to source-git using merge request. Only maintainers -will be pushing changes sent to source-git to dist-git. - -Each release in dist-git is tagged in the source repository so you can easily -check out the source tree for a build. The tags are in the format -name-version-release, but note release doesn't contain the dist tag since the -source can be built in different build roots (Fedora, CentOS, etc.) - -.. _source tree: https://gitlab.com/redhat/centos-stream/src/qemu-kvm - diff --git a/qemu-kvm.spec b/qemu-kvm.spec index 9129cf4..61c677d 100644 --- a/qemu-kvm.spec +++ b/qemu-kvm.spec @@ -143,7 +143,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}:%{version} \ Summary: QEMU is a machine emulator and virtualizer Name: qemu-kvm Version: 10.1.0 -Release: 13%{?rcrel}%{?dist}%{?cc_suffix}.1 +Release: 17%{?rcrel}%{?dist}%{?cc_suffix}.1 # Epoch because we pushed a qemu-1.0 package. 
AIUI this can't ever be dropped # Epoch 15 used for RHEL 8 # Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5) @@ -478,6 +478,20 @@ Patch309: 0309-scsi-track-SCSI-reservation-state-for-live-migration.patch Patch310: 0310-scsi-save-load-SCSI-reservation-state.patch Patch311: 0311-docs-add-SCSI-migrate-pr-documentation.patch Patch312: 0312-Revert-hw-arm-virt-Use-ACPI-PCI-hotplug-by-default-f.patch +Patch313: 0313-hw-uefi-add-variable-digest-to-vmstate.patch +Patch314: 0314-block-Never-drop-BLOCK_IO_ERROR-with-action-stop-for.patch +Patch315: 0315-mirror-Fix-missed-dirty-bitmap-writes-during-startup.patch +Patch316: 0316-linux-aio-Put-all-parameters-into-qemu_laiocb.patch +Patch317: 0317-linux-aio-Resubmit-tails-of-short-reads-writes.patch +Patch318: 0318-block-io_uring-avoid-potentially-getting-stuck-after.patch +Patch319: 0319-io-uring-Resubmit-tails-of-short-writes.patch +Patch320: 0320-block-enable-stats-intervals-for-storage-devices.patch +Patch321: 0321-qdev-Free-property-array-on-release.patch +Patch322: 0322-NVIDIA-SAUCE-qom-New-object-to-associate-device-to-E.patch +Patch323: 0323-NVIDIA-SAUCE-hw-acpi-Populate-DSDT-with-EGM-properti.patch +Patch324: 0324-NVIDIA-SAUCE-hw-arm-boot-Create-DTB-memory-regions-s.patch +Patch325: 0325-NVIDIA-SAUCE-hw-acpi-Add-pxb-bridge-above-GPU-in-DSD.patch +Patch326: 0326-WAR-hw-vfio-Use-IOMMU_IOAS_MAP-for-EGM-memory-region.patch # For RHEL-112882 - [DEV Task]: Assertion `core->delayed_causes == 0' failed with e1000e NIC @@ -1675,6 +1689,38 @@ useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \ %endif %changelog +* Tue Apr 28 2026 Miroslav Rezanina - 10.1.0-17.el10nv.1 +- Backport commits of EGM virtualization support [VOYAGER-15] +- Resolves: VOYAGER-15 + (Backport QEMU support for EGM) + +* Tue Apr 21 2026 Miroslav Rezanina - 10.1.0-17 +- kvm-block-enable-stats-intervals-for-storage-devices.patch [RHEL-114231] +- kvm-qdev-Free-property-array-on-release.patch [RHEL-114231] +- Resolves: RHEL-114231 + 
(Add stats-intervals support to --blockdev) + +* Mon Mar 30 2026 Miroslav Rezanina - 10.1.0-16 +- kvm-linux-aio-Put-all-parameters-into-qemu_laiocb.patch [RHEL-158224] +- kvm-linux-aio-Resubmit-tails-of-short-reads-writes.patch [RHEL-158224] +- kvm-block-io_uring-avoid-potentially-getting-stuck-after.patch [RHEL-158224] +- kvm-io-uring-Resubmit-tails-of-short-writes.patch [RHEL-158224] +- Resolves: RHEL-158224 + (qemu-kvm: disk writes of fewer bytes than requested is a retry condition, not necessarily an indication of ENOSPC [rhel-10.2]) + +* Thu Mar 26 2026 Miroslav Rezanina - 10.1.0-15 +- kvm-mirror-Fix-missed-dirty-bitmap-writes-during-startup.patch [RHEL-155601] +- Resolves: RHEL-155601 + (Mirror job can miss writes during startup, corrupting the copy [rhel-10.2]) + +* Wed Mar 18 2026 Miroslav Rezanina - 10.1.0-14 +- kvm-hw-uefi-add-variable-digest-to-vmstate.patch [RHEL-153058] +- kvm-block-Never-drop-BLOCK_IO_ERROR-with-action-stop-for.patch [RHEL-144004] +- Resolves: RHEL-153058 + (Qemu crashes with "double free" during restore --reset-nvram with uefi-vars secure boot) +- Resolves: RHEL-144004 + ([rhel-10] Regression in BLOCK_IO_ERROR event delivery with (w|r)error setting of 'stop' or 'enospc' due to event rate limiting) + * Wed Mar 11 2026 Miroslav Rezanina - 10.1.0-13.el10nv.1 - Add DMABUF support [VOYAGER-19 VOYAGER-53] - Accelerated SMMU device for GH GPU passthrough [VOYAGER-5 VOYAGER-16 VOYAGER-17 VOYAGER-48] diff --git a/rpminspect.yaml b/rpminspect.yaml deleted file mode 100644 index 30274b3..0000000 --- a/rpminspect.yaml +++ /dev/null @@ -1,12 +0,0 @@ ---- -elf: - exclude_path: (.*s390-ccw.img.*)|(.*s390-netboot.img.*) -inspections: - badfuncs: off -annocheck: - - hardened: --skip-cf-protection --skip-property-note --skip-stack-clash --ignore-unknown --verbose - - rhel-policy: --skip-cf-protection --skip-property-note --skip-stack-clash --ignore-unknown --verbose - ignore: - - /usr/share/qemu-kvm/s390-ccw.img - - 
/usr/share/qemu-kvm/s390-netboot.img -