147 changed files with 19849 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,5 @@
 SOURCES/qemu-6.2.0.tar.xz
 SOURCES/tests_data_acpi_pc_SSDT.dimmpxm
 SOURCES/tests_data_acpi_q35_FACP.slic
 SOURCES/tests_data_acpi_q35_SSDT.dimmpxm
 SOURCES/tests_data_acpi_virt_SSDT.memhp
--- a/.qemu-kvm.metadata
+++ b/.qemu-kvm.metadata
@ -1 +1,5 @@
 68cd61a466170115b88817e2d52db2cd7a92f43a SOURCES/qemu-6.2.0.tar.xz
 c4b34092bc5af1ba7febfca1477320fb024e8acd SOURCES/tests_data_acpi_pc_SSDT.dimmpxm
 19349e3517143bd1af56a5444e927ba37a111f72 SOURCES/tests_data_acpi_q35_FACP.slic
 4632d10ae8cedad4d5d760ed211f83f0dc81005d SOURCES/tests_data_acpi_q35_SSDT.dimmpxm
 ef12eed43cc357fb134db6fa3c7ffc83e222a97d SOURCES/tests_data_acpi_virt_SSDT.memhp
--- a/SOURCES/kvm-Fix-thread-pool-size-default-value-in-the-man-page.patch
+++ b/SOURCES/kvm-Fix-thread-pool-size-default-value-in-the-man-page.patch
@ -0,0 +1,36 @@
 From a707eff49800045d07afbcd8a74617c50b960151 Mon Sep 17 00:00:00 2001
 From: German Maglione <gmaglione@redhat.com>
 Date: Thu, 10 Oct 2024 13:23:25 +0200
 Subject: [PATCH] Fix thread-pool-size default value in the man page
 RH-Author: German Maglione <None>
 RH-MergeRequest: 417: Fix thread-pool-size default value in the man page
 RH-Jira: RHEL-26197
 RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
 RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
 RH-Commit: [1/1] bdf22ed4600ac7f02a4b08c54f162b1f89c44a99
 The current --thread-pool-size default value is 0, let's reflect it
 in the man page.
 Signed-off-by: German Maglione <gmaglione@redhat.com>
 ---
 docs/tools/virtiofsd.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 diff --git a/docs/tools/virtiofsd.rst b/docs/tools/virtiofsd.rst
 index 07ac0be551..fb3d59c449 100644
 --- a/docs/tools/virtiofsd.rst
 +++ b/docs/tools/virtiofsd.rst
@@ -120,7 +120,7 @@ Options
 .. option:: --thread-pool-size=NUM
   Restrict the number of worker threads per request queue to NUM.  The default
 -  is 64.
 +  is 0.
 .. option:: --cache=none|auto|always
 -- 
 2.45.2
--- a/SOURCES/kvm-KVM-keep-track-of-running-ioctls.patch
+++ b/SOURCES/kvm-KVM-keep-track-of-running-ioctls.patch
@ -0,0 +1,82 @@
 From 9bacf8c4104ff3cff2e0e2c2179ec4fda633167f Mon Sep 17 00:00:00 2001
 From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 Date: Mon, 16 Jan 2023 07:51:08 -0500
 Subject: [PATCH 05/11] KVM: keep track of running ioctls
 RH-Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-MergeRequest: 247: accel: introduce accelerator blocker API
 RH-Bugzilla: 2161188
 RH-Acked-by: David Hildenbrand <david@redhat.com>
 RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [2/3] 357508389e2a0fd996206b406e9e235e50b5f0b6
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2161188
 commit a27dd2de68f37ba96fe164a42121daa5f0750afc
 Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 Date:   Fri Nov 11 10:47:57 2022 -0500
    KVM: keep track of running ioctls
    Using the new accel-blocker API, mark where ioctls are being called
    in KVM. Next, we will implement the critical section that will take
    care of performing memslots modifications atomically, therefore
    preventing any new ioctl from running and allowing the running ones
    to finish.
    Signed-off-by: David Hildenbrand <david@redhat.com>
    Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
    Message-Id: <20221111154758.1372674-3-eesposit@redhat.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 ---
 accel/kvm/kvm-all.c | 7 +++++++
 1 file changed, 7 insertions(+)
 diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
 index 8f2a53438f..221aadfda7 100644
 --- a/accel/kvm/kvm-all.c
 +++ b/accel/kvm/kvm-all.c
@@ -2337,6 +2337,7 @@ static int kvm_init(MachineState *ms)
     assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size);
     s->sigmask_len = 8;
 +    accel_blocker_init();
 #ifdef KVM_CAP_SET_GUEST_DEBUG
     QTAILQ_INIT(&s->kvm_sw_breakpoints);
@@ -3018,7 +3019,9 @@ int kvm_vm_ioctl(KVMState *s, int type, ...)
     va_end(ap);
     trace_kvm_vm_ioctl(type, arg);
 +    accel_ioctl_begin();
     ret = ioctl(s->vmfd, type, arg);
 +    accel_ioctl_end();
     if (ret == -1) {
         ret = -errno;
     }
@@ -3036,7 +3039,9 @@ int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
     va_end(ap);
     trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
 +    accel_cpu_ioctl_begin(cpu);
     ret = ioctl(cpu->kvm_fd, type, arg);
 +    accel_cpu_ioctl_end(cpu);
     if (ret == -1) {
         ret = -errno;
     }
@@ -3054,7 +3059,9 @@ int kvm_device_ioctl(int fd, int type, ...)
     va_end(ap);
     trace_kvm_device_ioctl(fd, type, arg);
 +    accel_ioctl_begin();
     ret = ioctl(fd, type, arg);
 +    accel_ioctl_end();
     if (ret == -1) {
         ret = -errno;
     }
 -- 
 2.37.3
--- a/SOURCES/kvm-MAINTAINERS-split-out-s390x-sections.patch
+++ b/SOURCES/kvm-MAINTAINERS-split-out-s390x-sections.patch
@ -0,0 +1,181 @@
 From 440ee491240f2f02f9a6082d8aad98d88c1039dd Mon Sep 17 00:00:00 2001
 From: Thomas Huth <thuth@redhat.com>
 Date: Mon, 15 Jan 2024 14:00:04 +0100
 Subject: [PATCH 1/5] MAINTAINERS: split out s390x sections
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Thomas Huth <thuth@redhat.com>
 RH-MergeRequest: 348: s390x: Provide some more useful information if decryption of a PV image fails
 RH-Jira: RHEL-18214
 RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
 RH-Acked-by: Cédric Le Goater <clg@redhat.com>
 RH-Commit: [1/5] a71a3c11922481f97c36570e361088d17474e481
 JIRA: https://issues.redhat.com/browse/RHEL-18214
 commit 56e34834029c7c6862cb0095d95ad83c50485f88
 Author: Cornelia Huck <cohuck@redhat.com>
 Date:   Wed Dec 22 11:55:48 2021 +0100
    MAINTAINERS: split out s390x sections
    Split out some more specialized devices etc., so that we can build
    smarter lists of people to be put on cc: in the future.
    Signed-off-by: Cornelia Huck <cohuck@redhat.com>
    Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
    Acked-by: David Hildenbrand <david@redhat.com>
    Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
    Acked-by: Thomas Huth <thuth@redhat.com>
    Acked-by: Halil Pasic <pasic@linux.ibm.com>
    Acked-by: Eric Farman <farman@linux.ibm.com>
    Message-Id: <20211222105548.356852-1-cohuck@redhat.com>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Thomas Huth <thuth@redhat.com>
 ---
 MAINTAINERS | 85 ++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 74 insertions(+), 11 deletions(-)
 diff --git a/MAINTAINERS b/MAINTAINERS
 index 7543eb4d59..b893206fc3 100644
 --- a/MAINTAINERS
 +++ b/MAINTAINERS
@@ -297,7 +297,6 @@ M: David Hildenbrand <david@redhat.com>
 S: Maintained
 F: target/s390x/
 F: target/s390x/tcg
 -F: target/s390x/cpu_models_*.[ch]
 F: hw/s390x/
 F: disas/s390.c
 F: tests/tcg/s390x/
@@ -396,16 +395,10 @@ M: Halil Pasic <pasic@linux.ibm.com>
 M: Christian Borntraeger <borntraeger@de.ibm.com>
 S: Supported
 F: target/s390x/kvm/
 -F: target/s390x/ioinst.[ch]
 F: target/s390x/machine.c
 F: target/s390x/sigp.c
 -F: target/s390x/cpu_features*.[ch]
 -F: target/s390x/cpu_models.[ch]
 F: hw/s390x/pv.c
 F: include/hw/s390x/pv.h
 -F: hw/intc/s390_flic.c
 -F: hw/intc/s390_flic_kvm.c
 -F: include/hw/s390x/s390_flic.h
 F: gdb-xml/s390*.xml
 T: git https://github.com/borntraeger/qemu.git s390-next
 L: qemu-s390x@nongnu.org
@@ -1529,12 +1522,8 @@ S390 Virtio-ccw
 M: Halil Pasic <pasic@linux.ibm.com>
 M: Christian Borntraeger <borntraeger@de.ibm.com>
 S: Supported
 -F: hw/char/sclp*.[hc]
 -F: hw/char/terminal3270.c
 F: hw/s390x/
 F: include/hw/s390x/
 -F: hw/watchdog/wdt_diag288.c
 -F: include/hw/watchdog/wdt_diag288.h
 F: configs/devices/s390x-softmmu/default.mak
 F: tests/avocado/machine_s390_ccw_virtio.py
 T: git https://github.com/borntraeger/qemu.git s390-next
@@ -1559,6 +1548,37 @@ F: hw/s390x/s390-pci*
 F: include/hw/s390x/s390-pci*
 L: qemu-s390x@nongnu.org
 +S390 channel subsystem
 +M: Halil Pasic <pasic@linux.ibm.com>
 +M: Christian Borntraeger <borntraeger@linux.ibm.com>
 +S: Supported
 +F: hw/s390x/ccw-device.[ch]
 +F: hw/s390x/css.c
 +F: hw/s390x/css-bridge.c
 +F: include/hw/s390x/css.h
 +F: include/hw/s390x/css-bridge.h
 +F: include/hw/s390x/ioinst.h
 +F: target/s390x/ioinst.c
 +L: qemu-s390x@nongnu.org
 +
 +S390 CPU models
 +M: David Hildenbrand <david@redhat.com>
 +S: Maintained
 +F: target/s390x/cpu_features*.[ch]
 +F: target/s390x/cpu_models.[ch]
 +L: qemu-s390x@nongnu.org
 +
 +S390 SCLP-backed devices
 +M: Halil Pasic <pasic@linux.ibm.com>
 +M: Christian Borntraeger <borntraeger@linux.ibm.com>
 +S: Supported
 +F: include/hw/s390x/event-facility.h
 +F: include/hw/s390x/sclp.h
 +F: hw/char/sclp*.[hc]
 +F: hw/s390x/event-facility.c
 +F: hw/s390x/sclp*.c
 +L: qemu-s390x@nongnu.org
 +
 X86 Machines
 ------------
 PC
@@ -1956,6 +1976,7 @@ M: Halil Pasic <pasic@linux.ibm.com>
 S: Supported
 F: hw/s390x/virtio-ccw*.[hc]
 F: hw/s390x/vhost-vsock-ccw.c
 +F: hw/s390x/vhost-user-fs-ccw.c
 T: git https://gitlab.com/cohuck/qemu.git s390-next
 T: git https://github.com/borntraeger/qemu.git s390-next
 L: qemu-s390x@nongnu.org
@@ -2294,6 +2315,48 @@ F: hw/timer/mips_gictimer.c
 F: include/hw/intc/mips_gic.h
 F: include/hw/timer/mips_gictimer.h
 +S390 3270 device
 +M: Halil Pasic <pasic@linux.ibm.com>
 +M: Christian Borntraeger <borntraeger@linux.ibm.com>
 +S: Odd fixes
 +F: include/hw/s390x/3270-ccw.h
 +F: hw/char/terminal3270.c
 +F: hw/s390x/3270-ccw.c
 +L: qemu-s390x@nongnu.org
 +
 +S390 diag 288 watchdog
 +M: Halil Pasic <pasic@linux.ibm.com>
 +M: Christian Borntraeger <borntraeger@linux.ibm.com>
 +S: Supported
 +F: hw/watchdog/wdt_diag288.c
 +F: include/hw/watchdog/wdt_diag288.h
 +L: qemu-s390x@nongnu.org
 +
 +S390 storage key device
 +M: Halil Pasic <pasic@linux.ibm.com>
 +M: Christian Borntraeger <borntraeger@linux.ibm.com>
 +S: Supported
 +F: hw/s390x/storage-keys.h
 +F: hw/s390x/s390-skeys*.c
 +L: qemu-s390x@nongnu.org
 +
 +S390 storage attribute device
 +M: Halil Pasic <pasic@linux.ibm.com>
 +M: Christian Borntraeger <borntraeger@linux.ibm.com>
 +S: Supported
 +F: hw/s390x/storage-attributes.h
 +F: hw/s390x/s390-stattrib*.c
 +L: qemu-s390x@nongnu.org
 +
 +S390 floating interrupt controller
 +M: Halil Pasic <pasic@linux.ibm.com>
 +M: Christian Borntraeger <borntraeger@linux.ibm.com>
 +M: David Hildenbrand <david@redhat.com>
 +S: Supported
 +F: hw/intc/s390_flic*.c
 +F: include/hw/s390x/s390_flic.h
 +L: qemu-s390x@nongnu.org
 +
 Subsystems
 ----------
 Overall Audio backends
 -- 
 2.41.0
--- a/SOURCES/kvm-RHEL-Enable-x-not-migrate-acpi-index-for-all-pre-RHE.patch
+++ b/SOURCES/kvm-RHEL-Enable-x-not-migrate-acpi-index-for-all-pre-RHE.patch
@ -0,0 +1,43 @@
 From f1480fe9a4054113ddacd218961e29f31c33d329 Mon Sep 17 00:00:00 2001
 From: Peter Xu <peterx@redhat.com>
 Date: Wed, 6 Sep 2023 16:29:23 -0400
 Subject: [PATCH 2/3] RHEL: Enable "x-not-migrate-acpi-index" for all pre-RHEL8
 guests
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Peter Xu <peterx@redhat.com>
 RH-MergeRequest: 343: acpi: fix acpi_index migration
 RH-Jira: RHEL-20189
 RH-Acked-by: Leonardo Brás <leobras@redhat.com>
 RH-Acked-by: Igor Mammedov <imammedo@redhat.com>
 RH-Acked-by: Prasad Pandit <None>
 RH-Commit: [2/2] 0a26a71236e68dd7feb5d2063254090e3852d6ba
 The acpi index migration is simply broken before for all pre-RHEL8
 branches.  Don't migrate it for all of them.
 Signed-off-by: Peter Xu <peterx@redhat.com>
 ---
 hw/core/machine.c | 4 ++++
 1 file changed, 4 insertions(+)
 diff --git a/hw/core/machine.c b/hw/core/machine.c
 index 2724f6848a..6650a3d7b7 100644
 --- a/hw/core/machine.c
 +++ b/hw/core/machine.c
@@ -44,6 +44,10 @@ GlobalProperty hw_compat_rhel_8_6[] = {
      * we need do disable it downstream on the latest hw_compat_rhel_8.
      */
     { "vhost-vsock-device", "seqpacket", "off" },
 +    /*
 +     * RHEL-2186: all rhel8 machines should not migrate acpi index.
 +     */
 +    { "PIIX4_PM", "x-not-migrate-acpi-index", "on"},
 };
 const size_t hw_compat_rhel_8_6_len = G_N_ELEMENTS(hw_compat_rhel_8_6);
 -- 
 2.41.0
--- a/SOURCES/kvm-accel-introduce-accelerator-blocker-API.patch
+++ b/SOURCES/kvm-accel-introduce-accelerator-blocker-API.patch
@ -0,0 +1,349 @@
 From a5e7bb1f7a88efb5574266a76e80fd7604d19921 Mon Sep 17 00:00:00 2001
 From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 Date: Mon, 16 Jan 2023 07:49:59 -0500
 Subject: [PATCH 04/11] accel: introduce accelerator blocker API
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-MergeRequest: 247: accel: introduce accelerator blocker API
 RH-Bugzilla: 2161188
 RH-Acked-by: David Hildenbrand <david@redhat.com>
 RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [1/3] 9d3d7f9554974a79042c915763288cce07aef135
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2161188
 commit bd688fc93120fb3e28aa70e3dfdf567ccc1e0bc1
 Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 Date:   Fri Nov 11 10:47:56 2022 -0500
    accel: introduce accelerator blocker API
    This API allows the accelerators to prevent vcpus from issuing
    new ioctls while executing a critical section marked with the
    accel_ioctl_inhibit_begin/end functions.
    Note that all functions submitting ioctls must mark where the
    ioctl is being called with accel_{cpu_}ioctl_begin/end().
    This API requires the caller to always hold the BQL.
    API documentation is in sysemu/accel-blocker.h
    Internally, it uses a QemuLockCnt together with a per-CPU QemuLockCnt
    (to minimize cache line bouncing) to prevent new ioctls from
    running once the critical section starts, and a QemuEvent to wait
    until all running ioctls finish.
    Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
    Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
    Message-Id: <20221111154758.1372674-2-eesposit@redhat.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Conflicts:
 	util/meson.build: files are missing in rhel 8.8.0
 	namely int128.c, memalign.c and interval-tree.c
 Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 ---
 accel/accel-blocker.c          | 154 +++++++++++++++++++++++++++++++++
 accel/meson.build              |   2 +-
 hw/core/cpu-common.c           |   2 +
 include/hw/core/cpu.h          |   3 +
 include/sysemu/accel-blocker.h |  56 ++++++++++++
 util/meson.build               |   2 +-
 6 files changed, 217 insertions(+), 2 deletions(-)
 create mode 100644 accel/accel-blocker.c
 create mode 100644 include/sysemu/accel-blocker.h
 diff --git a/accel/accel-blocker.c b/accel/accel-blocker.c
 new file mode 100644
 index 0000000000..1e7f423462
 --- /dev/null
 +++ b/accel/accel-blocker.c
@@ -0,0 +1,154 @@
 +/*
 + * Lock to inhibit accelerator ioctls
 + *
 + * Copyright (c) 2022 Red Hat Inc.
 + *
 + * Author: Emanuele Giuseppe Esposito       <eesposit@redhat.com>
 + *
 + * Permission is hereby granted, free of charge, to any person obtaining a copy
 + * of this software and associated documentation files (the "Software"), to deal
 + * in the Software without restriction, including without limitation the rights
 + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 + * copies of the Software, and to permit persons to whom the Software is
 + * furnished to do so, subject to the following conditions:
 + *
 + * The above copyright notice and this permission notice shall be included in
 + * all copies or substantial portions of the Software.
 + *
 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 + * THE SOFTWARE.
 + */
 +
 +#include "qemu/osdep.h"
 +#include "qemu/thread.h"
 +#include "qemu/main-loop.h"
 +#include "hw/core/cpu.h"
 +#include "sysemu/accel-blocker.h"
 +
 +static QemuLockCnt accel_in_ioctl_lock;
 +static QemuEvent accel_in_ioctl_event;
 +
 +void accel_blocker_init(void)
 +{
 +    qemu_lockcnt_init(&accel_in_ioctl_lock);
 +    qemu_event_init(&accel_in_ioctl_event, false);
 +}
 +
 +void accel_ioctl_begin(void)
 +{
 +    if (likely(qemu_mutex_iothread_locked())) {
 +        return;
 +    }
 +
 +    /* block if lock is taken in accel_ioctl_inhibit_begin() */
 +    qemu_lockcnt_inc(&accel_in_ioctl_lock);
 +}
 +
 +void accel_ioctl_end(void)
 +{
 +    if (likely(qemu_mutex_iothread_locked())) {
 +        return;
 +    }
 +
 +    qemu_lockcnt_dec(&accel_in_ioctl_lock);
 +    /* change event to SET. If event was BUSY, wake up all waiters */
 +    qemu_event_set(&accel_in_ioctl_event);
 +}
 +
 +void accel_cpu_ioctl_begin(CPUState *cpu)
 +{
 +    if (unlikely(qemu_mutex_iothread_locked())) {
 +        return;
 +    }
 +
 +    /* block if lock is taken in accel_ioctl_inhibit_begin() */
 +    qemu_lockcnt_inc(&cpu->in_ioctl_lock);
 +}
 +
 +void accel_cpu_ioctl_end(CPUState *cpu)
 +{
 +    if (unlikely(qemu_mutex_iothread_locked())) {
 +        return;
 +    }
 +
 +    qemu_lockcnt_dec(&cpu->in_ioctl_lock);
 +    /* change event to SET. If event was BUSY, wake up all waiters */
 +    qemu_event_set(&accel_in_ioctl_event);
 +}
 +
 +static bool accel_has_to_wait(void)
 +{
 +    CPUState *cpu;
 +    bool needs_to_wait = false;
 +
 +    CPU_FOREACH(cpu) {
 +        if (qemu_lockcnt_count(&cpu->in_ioctl_lock)) {
 +            /* exit the ioctl, if vcpu is running it */
 +            qemu_cpu_kick(cpu);
 +            needs_to_wait = true;
 +        }
 +    }
 +
 +    return needs_to_wait || qemu_lockcnt_count(&accel_in_ioctl_lock);
 +}
 +
 +void accel_ioctl_inhibit_begin(void)
 +{
 +    CPUState *cpu;
 +
 +    /*
 +     * We allow to inhibit only when holding the BQL, so we can identify
 +     * when an inhibitor wants to issue an ioctl easily.
 +     */
 +    g_assert(qemu_mutex_iothread_locked());
 +
 +    /* Block further invocations of the ioctls outside the BQL.  */
 +    CPU_FOREACH(cpu) {
 +        qemu_lockcnt_lock(&cpu->in_ioctl_lock);
 +    }
 +    qemu_lockcnt_lock(&accel_in_ioctl_lock);
 +
 +    /* Keep waiting while there are running ioctls */
 +    while (true) {
 +
 +        /* Reset event to FREE. */
 +        qemu_event_reset(&accel_in_ioctl_event);
 +
 +        if (accel_has_to_wait()) {
 +            /*
 +             * If event is still FREE, and there are ioctls still in progress,
 +             * wait.
 +             *
 +             *  If an ioctl finishes before qemu_event_wait(), it will change
 +             * the event state to SET. This will prevent qemu_event_wait() from
 +             * blocking, but it's not a problem because if other ioctls are
 +             * still running the loop will iterate once more and reset the event
 +             * status to FREE so that it can wait properly.
 +             *
 +             * If an ioctls finishes while qemu_event_wait() is blocking, then
 +             * it will be waken up, but also here the while loop makes sure
 +             * to re-enter the wait if there are other running ioctls.
 +             */
 +            qemu_event_wait(&accel_in_ioctl_event);
 +        } else {
 +            /* No ioctl is running */
 +            return;
 +        }
 +    }
 +}
 +
 +void accel_ioctl_inhibit_end(void)
 +{
 +    CPUState *cpu;
 +
 +    qemu_lockcnt_unlock(&accel_in_ioctl_lock);
 +    CPU_FOREACH(cpu) {
 +        qemu_lockcnt_unlock(&cpu->in_ioctl_lock);
 +    }
 +}
 +
 diff --git a/accel/meson.build b/accel/meson.build
 index dfd808d2c8..801b4d44e8 100644
 --- a/accel/meson.build
 +++ b/accel/meson.build
@@ -1,4 +1,4 @@
 -specific_ss.add(files('accel-common.c'))
 +specific_ss.add(files('accel-common.c', 'accel-blocker.c'))
 softmmu_ss.add(files('accel-softmmu.c'))
 user_ss.add(files('accel-user.c'))
 diff --git a/hw/core/cpu-common.c b/hw/core/cpu-common.c
 index 9e3241b430..b6e83acf0a 100644
 --- a/hw/core/cpu-common.c
 +++ b/hw/core/cpu-common.c
@@ -238,6 +238,7 @@ static void cpu_common_initfn(Object *obj)
     cpu->nr_threads = 1;
     qemu_mutex_init(&cpu->work_mutex);
 +    qemu_lockcnt_init(&cpu->in_ioctl_lock);
     QSIMPLEQ_INIT(&cpu->work_list);
     QTAILQ_INIT(&cpu->breakpoints);
     QTAILQ_INIT(&cpu->watchpoints);
@@ -249,6 +250,7 @@ static void cpu_common_finalize(Object *obj)
 {
     CPUState *cpu = CPU(obj);
 +    qemu_lockcnt_destroy(&cpu->in_ioctl_lock);
     qemu_mutex_destroy(&cpu->work_mutex);
 }
 diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
 index e948e81f1a..49d9c73f97 100644
 --- a/include/hw/core/cpu.h
 +++ b/include/hw/core/cpu.h
@@ -383,6 +383,9 @@ struct CPUState {
     uint32_t kvm_fetch_index;
     uint64_t dirty_pages;
 +    /* Use by accel-block: CPU is executing an ioctl() */
 +    QemuLockCnt in_ioctl_lock;
 +
     /* Used for events with 'vcpu' and *without* the 'disabled' properties */
     DECLARE_BITMAP(trace_dstate_delayed, CPU_TRACE_DSTATE_MAX_EVENTS);
     DECLARE_BITMAP(trace_dstate, CPU_TRACE_DSTATE_MAX_EVENTS);
 diff --git a/include/sysemu/accel-blocker.h b/include/sysemu/accel-blocker.h
 new file mode 100644
 index 0000000000..72020529ef
 --- /dev/null
 +++ b/include/sysemu/accel-blocker.h
@@ -0,0 +1,56 @@
 +/*
 + * Accelerator blocking API, to prevent new ioctls from starting and wait the
 + * running ones finish.
 + * This mechanism differs from pause/resume_all_vcpus() in that it does not
 + * release the BQL.
 + *
 + *  Copyright (c) 2022 Red Hat Inc.
 + *
 + * Author: Emanuele Giuseppe Esposito       <eesposit@redhat.com>
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or later.
 + * See the COPYING file in the top-level directory.
 + */
 +#ifndef ACCEL_BLOCKER_H
 +#define ACCEL_BLOCKER_H
 +
 +#include "qemu/osdep.h"
 +#include "sysemu/cpus.h"
 +
 +extern void accel_blocker_init(void);
 +
 +/*
 + * accel_{cpu_}ioctl_begin/end:
 + * Mark when ioctl is about to run or just finished.
 + *
 + * accel_{cpu_}ioctl_begin will block after accel_ioctl_inhibit_begin() is
 + * called, preventing new ioctls to run. They will continue only after
 + * accel_ioctl_inhibit_end().
 + */
 +extern void accel_ioctl_begin(void);
 +extern void accel_ioctl_end(void);
 +extern void accel_cpu_ioctl_begin(CPUState *cpu);
 +extern void accel_cpu_ioctl_end(CPUState *cpu);
 +
 +/*
 + * accel_ioctl_inhibit_begin: start critical section
 + *
 + * This function makes sure that:
 + * 1) incoming accel_{cpu_}ioctl_begin() calls block
 + * 2) wait that all ioctls that were already running reach
 + *    accel_{cpu_}ioctl_end(), kicking vcpus if necessary.
 + *
 + * This allows the caller to access shared data or perform operations without
 + * worrying of concurrent vcpus accesses.
 + */
 +extern void accel_ioctl_inhibit_begin(void);
 +
 +/*
 + * accel_ioctl_inhibit_end: end critical section started by
 + * accel_ioctl_inhibit_begin()
 + *
 + * This function allows blocked accel_{cpu_}ioctl_begin() to continue.
 + */
 +extern void accel_ioctl_inhibit_end(void);
 +
 +#endif /* ACCEL_BLOCKER_H */
 diff --git a/util/meson.build b/util/meson.build
 index 05b593055a..b5f153b0e8 100644
 --- a/util/meson.build
 +++ b/util/meson.build
@@ -48,6 +48,7 @@ util_ss.add(files('transactions.c'))
 util_ss.add(when: 'CONFIG_POSIX', if_true: files('drm.c'))
 util_ss.add(files('guest-random.c'))
 util_ss.add(files('yank.c'))
 +util_ss.add(files('lockcnt.c'))
 if have_user
   util_ss.add(files('selfmap.c'))
@@ -69,7 +70,6 @@ if have_block
   util_ss.add(files('hexdump.c'))
   util_ss.add(files('iova-tree.c'))
   util_ss.add(files('iov.c', 'qemu-sockets.c', 'uri.c'))
 -  util_ss.add(files('lockcnt.c'))
   util_ss.add(files('main-loop.c'))
   util_ss.add(files('nvdimm-utils.c'))
   util_ss.add(files('qemu-coroutine.c', 'qemu-coroutine-lock.c', 'qemu-coroutine-io.c'))
 -- 
 2.37.3
--- a/SOURCES/kvm-acpi-fix-acpi_index-migration.patch
+++ b/SOURCES/kvm-acpi-fix-acpi_index-migration.patch
@ -0,0 +1,165 @@
 From 3deffc03c2e9b0053eec5aeb5b5d633dfe29f499 Mon Sep 17 00:00:00 2001
 From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
 Date: Wed, 6 Apr 2022 14:58:12 -0400
 Subject: [PATCH 1/3] acpi: fix acpi_index migration
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Peter Xu <peterx@redhat.com>
 RH-MergeRequest: 343: acpi: fix acpi_index migration
 RH-Jira: RHEL-20189
 RH-Acked-by: Leonardo Brás <leobras@redhat.com>
 RH-Acked-by: Igor Mammedov <imammedo@redhat.com>
 RH-Acked-by: Prasad Pandit <None>
 RH-Commit: [1/2] c5b9cdf5791cd856207b7df7e2ef5df360ec8de4
 vmstate_acpi_pcihp_use_acpi_index() was expecting AcpiPciHpState
 as state but it actually received PIIX4PMState, because
 VMSTATE_PCI_HOTPLUG is a macro and not another struct.
 So it ended up accessing random pointer, which resulted
 in 'false' return value and acpi_index field wasn't ever
 sent.
 However in 7.0 that pointer de-references to value > 0, and
 destination QEMU starts to expect the field which isn't
 sent in migration stream from older QEMU (6.2 and older).
 As result migration fails with:
  qemu-system-x86_64: Missing section footer for 0000:00:01.3/piix4_pm
  qemu-system-x86_64: load of migration failed: Invalid argument
 In addition with QEMU-6.2, destination due to not expected
 state, also never expects the acpi_index field in migration
 stream.
 Q35 is not affected as it always sends/expects the field as
 long as acpi based PCI hotplug is enabled.
 Fix issue by introducing compat knob to never send/expect
 acpi_index in migration stream for 6.2 and older PC machine
 types and always send it for 7.0 and newer PC machine types.
 Diagnosed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 Fixes: b32bd76 ("pci: introduce acpi-index property for PCI device")
 Resolves: https://gitlab.com/qemu-project/qemu/-/issues/932
 Signed-off-by: Igor Mammedov <imammedo@redhat.com>
 Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
 (cherry picked from commit a83c2844903c45aa7d32cdd17305f23ce2c56ab9)
 Signed-off-by: Peter Xu <peterx@redhat.com>
 ---
 hw/acpi/acpi-pci-hotplug-stub.c |  4 ----
 hw/acpi/pcihp.c                 |  6 ------
 hw/acpi/piix4.c                 | 15 ++++++++++++++-
 hw/core/machine.c               |  5 +++++
 include/hw/acpi/pcihp.h         |  2 --
 5 files changed, 19 insertions(+), 13 deletions(-)
 diff --git a/hw/acpi/acpi-pci-hotplug-stub.c b/hw/acpi/acpi-pci-hotplug-stub.c
 index 734e4c5986..a43f6dafc9 100644
 --- a/hw/acpi/acpi-pci-hotplug-stub.c
 +++ b/hw/acpi/acpi-pci-hotplug-stub.c
@@ -41,7 +41,3 @@ void acpi_pcihp_reset(AcpiPciHpState *s, bool acpihp_root_off)
     return;
 }
 -bool vmstate_acpi_pcihp_use_acpi_index(void *opaque, int version_id)
 -{
 -    return false;
 -}
 diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c
 index be0e846b34..ec861661c3 100644
 --- a/hw/acpi/pcihp.c
 +++ b/hw/acpi/pcihp.c
@@ -559,12 +559,6 @@ void acpi_pcihp_init(Object *owner, AcpiPciHpState *s, PCIBus *root_bus,
                                    OBJ_PROP_FLAG_READ);
 }
 -bool vmstate_acpi_pcihp_use_acpi_index(void *opaque, int version_id)
 -{
 -     AcpiPciHpState *s = opaque;
 -     return s->acpi_index;
 -}
 -
 const VMStateDescription vmstate_acpi_pcihp_pci_status = {
     .name = "acpi_pcihp_pci_status",
     .version_id = 1,
 diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c
 index 8d6011c0a3..033e75ce5b 100644
 --- a/hw/acpi/piix4.c
 +++ b/hw/acpi/piix4.c
@@ -82,6 +82,7 @@ struct PIIX4PMState {
     AcpiPciHpState acpi_pci_hotplug;
     bool use_acpi_hotplug_bridge;
     bool use_acpi_root_pci_hotplug;
 +    bool not_migrate_acpi_index;
     uint8_t disable_s3;
     uint8_t disable_s4;
@@ -269,6 +270,16 @@ static bool piix4_vmstate_need_smbus(void *opaque, int version_id)
     return pm_smbus_vmstate_needed();
 }
 +/*
 + * This is a fudge to turn off the acpi_index field,
 + * whose test was always broken on piix4 with 6.2 and older machine types.
 + */
 +static bool vmstate_test_migrate_acpi_index(void *opaque, int version_id)
 +{
 +    PIIX4PMState *s = PIIX4_PM(opaque);
 +    return s->use_acpi_hotplug_bridge && !s->not_migrate_acpi_index;
 +}
 +
 /* qemu-kvm 1.2 uses version 3 but advertised as 2
  * To support incoming qemu-kvm 1.2 migration, change version_id
  * and minimum_version_id to 2 below (which breaks migration from
@@ -299,7 +310,7 @@ static const VMStateDescription vmstate_acpi = {
             struct AcpiPciHpPciStatus),
         VMSTATE_PCI_HOTPLUG(acpi_pci_hotplug, PIIX4PMState,
                             vmstate_test_use_acpi_hotplug_bridge,
 -                            vmstate_acpi_pcihp_use_acpi_index),
 +                            vmstate_test_migrate_acpi_index),
         VMSTATE_END_OF_LIST()
     },
     .subsections = (const VMStateDescription*[]) {
@@ -654,6 +665,8 @@ static Property piix4_pm_properties[] = {
     DEFINE_PROP_BOOL("memory-hotplug-support", PIIX4PMState,
                      acpi_memory_hotplug.is_enabled, true),
     DEFINE_PROP_BOOL("smm-compat", PIIX4PMState, smm_compat, false),
 +    DEFINE_PROP_BOOL("x-not-migrate-acpi-index", PIIX4PMState,
 +                      not_migrate_acpi_index, false),
     DEFINE_PROP_END_OF_LIST(),
 };
 diff --git a/hw/core/machine.c b/hw/core/machine.c
 index 76fcabec7a..2724f6848a 100644
 --- a/hw/core/machine.c
 +++ b/hw/core/machine.c
@@ -331,6 +331,11 @@ GlobalProperty hw_compat_rhel_7_1[] = {
 };
 const size_t hw_compat_rhel_7_1_len = G_N_ELEMENTS(hw_compat_rhel_7_1);
 +GlobalProperty hw_compat_6_2[] = {
 +    { "PIIX4_PM", "x-not-migrate-acpi-index", "on"},
 +};
 +const size_t hw_compat_6_2_len = G_N_ELEMENTS(hw_compat_6_2);
 +
 GlobalProperty hw_compat_6_1[] = {
     { "vhost-user-vsock-device", "seqpacket", "off" },
     { "nvme-ns", "shared", "off" },
 diff --git a/include/hw/acpi/pcihp.h b/include/hw/acpi/pcihp.h
 index af1a169fc3..7e268c2c9c 100644
 --- a/include/hw/acpi/pcihp.h
 +++ b/include/hw/acpi/pcihp.h
@@ -73,8 +73,6 @@ void acpi_pcihp_reset(AcpiPciHpState *s, bool acpihp_root_off);
 extern const VMStateDescription vmstate_acpi_pcihp_pci_status;
 -bool vmstate_acpi_pcihp_use_acpi_index(void *opaque, int version_id);
 -
 #define VMSTATE_PCI_HOTPLUG(pcihp, state, test_pcihp, test_acpi_index) \
         VMSTATE_UINT32_TEST(pcihp.hotplug_select, state, \
                             test_pcihp), \
 -- 
 2.41.0
--- a/SOURCES/kvm-aio-wait-switch-to-smp_mb__after_rmw.patch
+++ b/SOURCES/kvm-aio-wait-switch-to-smp_mb__after_rmw.patch
@ -0,0 +1,50 @@
 From 953c5c0982b61b0a3f8f03452844b5487eb22fc7 Mon Sep 17 00:00:00 2001
 From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 Date: Thu, 9 Mar 2023 08:13:17 -0500
 Subject: [PATCH 06/13] aio-wait: switch to smp_mb__after_rmw()
 RH-Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-MergeRequest: 263: qatomic: add smp_mb__before/after_rmw()
 RH-Bugzilla: 2168472
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Acked-by: Eric Auger <eric.auger@redhat.com>
 RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
 RH-Acked-by: David Hildenbrand <david@redhat.com>
 RH-Commit: [6/10] 9f30f97754139ffd18d36b2350f9ed4e59ac496e
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2168472
 commit b532526a07ef3b903ead2e055fe6cc87b41057a3
 Author: Paolo Bonzini <pbonzini@redhat.com>
 Date:   Fri Mar 3 11:03:52 2023 +0100
    aio-wait: switch to smp_mb__after_rmw()
    The barrier comes after an atomic increment, so it is enough to use
    smp_mb__after_rmw(); this avoids a double barrier on x86 systems.
    Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
    Reviewed-by: David Hildenbrand <david@redhat.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 ---
 include/block/aio-wait.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h
 index 54840f8622..03b6394c78 100644
 --- a/include/block/aio-wait.h
 +++ b/include/block/aio-wait.h
@@ -82,7 +82,7 @@ extern AioWait global_aio_wait;
     /* Increment wait_->num_waiters before evaluating cond. */     \
     qatomic_inc(&wait_->num_waiters);                              \
     /* Paired with smp_mb in aio_wait_kick(). */                   \
 -    smp_mb();                                                      \
 +    smp_mb__after_rmw();                                           \
     if (ctx_ && in_aio_context_home_thread(ctx_)) {                \
         while ((cond)) {                                           \
             aio_poll(ctx_, true);                                  \
 -- 
 2.37.3
--- a/SOURCES/kvm-aio_wait_kick-add-missing-memory-barrier.patch
+++ b/SOURCES/kvm-aio_wait_kick-add-missing-memory-barrier.patch
@ -0,0 +1,86 @@
 From d7eae0ff4c7f7f7bf10f10272adf7c6971c0db9b Mon Sep 17 00:00:00 2001
 From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 Date: Thu, 9 Mar 2023 09:26:35 -0500
 Subject: [PATCH 01/13] aio_wait_kick: add missing memory barrier
 RH-Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-MergeRequest: 263: qatomic: add smp_mb__before/after_rmw()
 RH-Bugzilla: 2168472
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Acked-by: Eric Auger <eric.auger@redhat.com>
 RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
 RH-Acked-by: David Hildenbrand <david@redhat.com>
 RH-Commit: [1/10] eb774aee79864052e14e706d931e52e7bd1162c8
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2168472
 commit 7455ff1aa01564cc175db5b2373e610503ad4411
 Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 Date:   Tue May 24 13:30:54 2022 -0400
    aio_wait_kick: add missing memory barrier
    It seems that aio_wait_kick always required a memory barrier
    or atomic operation in the caller, but nobody actually
    took care of doing it.
    Let's put the barrier in the function instead, and pair it
    with another one in AIO_WAIT_WHILE. Read aio_wait_kick()
    comment for further explanation.
    Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
    Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
    Message-Id: <20220524173054.12651-1-eesposit@redhat.com>
    Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
    Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
    Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 ---
 include/block/aio-wait.h |  2 ++
 util/aio-wait.c          | 16 +++++++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)
 diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h
 index b39eefb38d..54840f8622 100644
 --- a/include/block/aio-wait.h
 +++ b/include/block/aio-wait.h
@@ -81,6 +81,8 @@ extern AioWait global_aio_wait;
     AioContext *ctx_ = (ctx);                                      \
     /* Increment wait_->num_waiters before evaluating cond. */     \
     qatomic_inc(&wait_->num_waiters);                              \
 +    /* Paired with smp_mb in aio_wait_kick(). */                   \
 +    smp_mb();                                                      \
     if (ctx_ && in_aio_context_home_thread(ctx_)) {                \
         while ((cond)) {                                           \
             aio_poll(ctx_, true);                                  \
 diff --git a/util/aio-wait.c b/util/aio-wait.c
 index bdb3d3af22..98c5accd29 100644
 --- a/util/aio-wait.c
 +++ b/util/aio-wait.c
@@ -35,7 +35,21 @@ static void dummy_bh_cb(void *opaque)
 void aio_wait_kick(void)
 {
 -    /* The barrier (or an atomic op) is in the caller.  */
 +    /*
 +     * Paired with smp_mb in AIO_WAIT_WHILE. Here we have:
 +     * write(condition);
 +     * aio_wait_kick() {
 +     *      smp_mb();
 +     *      read(num_waiters);
 +     * }
 +     *
 +     * And in AIO_WAIT_WHILE:
 +     * write(num_waiters);
 +     * smp_mb();
 +     * read(condition);
 +     */
 +    smp_mb();
 +
     if (qatomic_read(&global_aio_wait.num_waiters)) {
         aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
     }
 -- 
 2.37.3
--- a/SOURCES/kvm-apic-disable-reentrancy-detection-for-apic-msi.patch
+++ b/SOURCES/kvm-apic-disable-reentrancy-detection-for-apic-msi.patch
@ -0,0 +1,56 @@
 From 47d027147694fde94dd73305ee53b6a136cbeced Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Tue, 9 May 2023 10:29:03 -0400
 Subject: [PATCH 08/15] apic: disable reentrancy detection for apic-msi
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 277: memory: prevent dma-reentracy issues
 RH-Bugzilla: 1999236
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [8/12] 25c3cf99b00cd9adc10d6e7afa9c3e3b7da08de2 (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1999236
 Upstream: Merged
 CVE: CVE-2021-3750
 commit 50795ee051a342c681a9b45671c552fbd6274db8
 Author: Alexander Bulekov <alxndr@bu.edu>
 Date:   Thu Apr 27 17:10:13 2023 -0400
    apic: disable reentrancy detection for apic-msi
    As the code is designed for re-entrant calls to apic-msi, mark apic-msi
    as reentrancy-safe.
    Signed-off-by: Alexander Bulekov <alxndr@bu.edu>
    Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
    Message-Id: <20230427211013.2994127-9-alxndr@bu.edu>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 hw/intc/apic.c | 7 +++++++
 1 file changed, 7 insertions(+)
 diff --git a/hw/intc/apic.c b/hw/intc/apic.c
 index 3df11c34d6..a7c2b301a8 100644
 --- a/hw/intc/apic.c
 +++ b/hw/intc/apic.c
@@ -883,6 +883,13 @@ static void apic_realize(DeviceState *dev, Error **errp)
     memory_region_init_io(&s->io_memory, OBJECT(s), &apic_io_ops, s, "apic-msi",
                           APIC_SPACE_SIZE);
 +    /*
 +     * apic-msi's apic_mem_write can call into ioapic_eoi_broadcast, which can
 +     * write back to apic-msi. As such mark the apic-msi region re-entrancy
 +     * safe.
 +     */
 +    s->io_memory.disable_reentrancy_guard = true;
 +
     s->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, apic_timer, s);
     local_apics[s->id] = s;
 -- 
 2.37.3
--- a/SOURCES/kvm-async-Add-an-optional-reentrancy-guard-to-the-BH-API.patch
+++ b/SOURCES/kvm-async-Add-an-optional-reentrancy-guard-to-the-BH-API.patch
@ -0,0 +1,235 @@
 From 8996ac4369de7e0cb6f911db6f47c3e4ae88c8aa Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Tue, 9 May 2023 10:29:03 -0400
 Subject: [PATCH 02/15] async: Add an optional reentrancy guard to the BH API
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 277: memory: prevent dma-reentracy issues
 RH-Bugzilla: 1999236
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [2/12] b03f247e242a6cdb3eebec36477234ac77dcd20c (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1999236
 Upstream: Merged
 CVE: CVE-2021-3750
 Conflict: The file block/graph-lock.h, included from include/block/aio.h,
          doesn't exist in this code version. The code compiles without
          issues if this include is just omitted, so we do that.
 commit 9c86c97f12c060bf7484dd931f38634e166a81f0
 Author: Alexander Bulekov <alxndr@bu.edu>
 Date:   Thu Apr 27 17:10:07 2023 -0400
    async: Add an optional reentrancy guard to the BH API
    Devices can pass their MemoryReentrancyGuard (from their DeviceState),
    when creating new BHes. Then, the async API will toggle the guard
    before/after calling the BH call-back. This prevents bh->mmio reentrancy
    issues.
    Signed-off-by: Alexander Bulekov <alxndr@bu.edu>
    Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
    Message-Id: <20230427211013.2994127-3-alxndr@bu.edu>
    [thuth: Fix "line over 90 characters" checkpatch.pl error]
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 docs/devel/multiple-iothreads.txt |  7 +++++++
 include/block/aio.h               | 18 ++++++++++++++++--
 include/qemu/main-loop.h          |  7 +++++--
 tests/unit/ptimer-test-stubs.c    |  3 ++-
 util/async.c                      | 18 +++++++++++++++++-
 util/main-loop.c                  |  6 ++++--
 util/trace-events                 |  1 +
 7 files changed, 52 insertions(+), 8 deletions(-)
 diff --git a/docs/devel/multiple-iothreads.txt b/docs/devel/multiple-iothreads.txt
 index aeb997bed5..a11576bc74 100644
 --- a/docs/devel/multiple-iothreads.txt
 +++ b/docs/devel/multiple-iothreads.txt
@@ -61,6 +61,7 @@ There are several old APIs that use the main loop AioContext:
  * LEGACY qemu_aio_set_event_notifier() - monitor an event notifier
  * LEGACY timer_new_ms() - create a timer
  * LEGACY qemu_bh_new() - create a BH
 + * LEGACY qemu_bh_new_guarded() - create a BH with a device re-entrancy guard
  * LEGACY qemu_aio_wait() - run an event loop iteration
 Since they implicitly work on the main loop they cannot be used in code that
@@ -72,8 +73,14 @@ Instead, use the AioContext functions directly (see include/block/aio.h):
  * aio_set_event_notifier() - monitor an event notifier
  * aio_timer_new() - create a timer
  * aio_bh_new() - create a BH
 + * aio_bh_new_guarded() - create a BH with a device re-entrancy guard
  * aio_poll() - run an event loop iteration
 +The qemu_bh_new_guarded/aio_bh_new_guarded APIs accept a "MemReentrancyGuard"
 +argument, which is used to check for and prevent re-entrancy problems. For
 +BHs associated with devices, the reentrancy-guard is contained in the
 +corresponding DeviceState and named "mem_reentrancy_guard".
 +
 The AioContext can be obtained from the IOThread using
 iothread_get_aio_context() or for the main loop using qemu_get_aio_context().
 Code that takes an AioContext argument works both in IOThreads or the main
 diff --git a/include/block/aio.h b/include/block/aio.h
 index 47fbe9d81f..c7da152985 100644
 --- a/include/block/aio.h
 +++ b/include/block/aio.h
@@ -22,6 +22,8 @@
 #include "qemu/event_notifier.h"
 #include "qemu/thread.h"
 #include "qemu/timer.h"
 +#include "hw/qdev-core.h"
 +
 typedef struct BlockAIOCB BlockAIOCB;
 typedef void BlockCompletionFunc(void *opaque, int ret);
@@ -321,9 +323,11 @@ void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
  * is opaque and must be allocated prior to its use.
  *
  * @name: A human-readable identifier for debugging purposes.
 + * @reentrancy_guard: A guard set when entering a cb to prevent
 + * device-reentrancy issues
  */
 QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
 -                        const char *name);
 +                        const char *name, MemReentrancyGuard *reentrancy_guard);
 /**
  * aio_bh_new: Allocate a new bottom half structure
@@ -332,7 +336,17 @@ QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
  * string.
  */
 #define aio_bh_new(ctx, cb, opaque) \
 -    aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)))
 +    aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), NULL)
 +
 +/**
 + * aio_bh_new_guarded: Allocate a new bottom half structure with a
 + * reentrancy_guard
 + *
 + * A convenience wrapper for aio_bh_new_full() that uses the cb as the name
 + * string.
 + */
 +#define aio_bh_new_guarded(ctx, cb, opaque, guard) \
 +    aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), guard)
 /**
  * aio_notify: Force processing of pending events.
 diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
 index 8dbc6fcb89..85dd5ada9e 100644
 --- a/include/qemu/main-loop.h
 +++ b/include/qemu/main-loop.h
@@ -294,9 +294,12 @@ void qemu_cond_timedwait_iothread(QemuCond *cond, int ms);
 void qemu_fd_register(int fd);
 +#define qemu_bh_new_guarded(cb, opaque, guard) \
 +    qemu_bh_new_full((cb), (opaque), (stringify(cb)), guard)
 #define qemu_bh_new(cb, opaque) \
 -    qemu_bh_new_full((cb), (opaque), (stringify(cb)))
 -QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name);
 +    qemu_bh_new_full((cb), (opaque), (stringify(cb)), NULL)
 +QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name,
 +                         MemReentrancyGuard *reentrancy_guard);
 void qemu_bh_schedule_idle(QEMUBH *bh);
 enum {
 diff --git a/tests/unit/ptimer-test-stubs.c b/tests/unit/ptimer-test-stubs.c
 index 2a3ef58799..a7a2d08e7e 100644
 --- a/tests/unit/ptimer-test-stubs.c
 +++ b/tests/unit/ptimer-test-stubs.c
@@ -108,7 +108,8 @@ int64_t qemu_clock_deadline_ns_all(QEMUClockType type, int attr_mask)
     return deadline;
 }
 -QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name)
 +QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name,
 +                         MemReentrancyGuard *reentrancy_guard)
 {
     QEMUBH *bh = g_new(QEMUBH, 1);
 diff --git a/util/async.c b/util/async.c
 index 2a63bf90f2..1fff02e7fc 100644
 --- a/util/async.c
 +++ b/util/async.c
@@ -62,6 +62,7 @@ struct QEMUBH {
     void *opaque;
     QSLIST_ENTRY(QEMUBH) next;
     unsigned flags;
 +    MemReentrancyGuard *reentrancy_guard;
 };
 /* Called concurrently from any thread */
@@ -127,7 +128,7 @@ void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb,
 }
 QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
 -                        const char *name)
 +                        const char *name, MemReentrancyGuard *reentrancy_guard)
 {
     QEMUBH *bh;
     bh = g_new(QEMUBH, 1);
@@ -136,13 +137,28 @@ QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
         .cb = cb,
         .opaque = opaque,
         .name = name,
 +        .reentrancy_guard = reentrancy_guard,
     };
     return bh;
 }
 void aio_bh_call(QEMUBH *bh)
 {
 +    bool last_engaged_in_io = false;
 +
 +    if (bh->reentrancy_guard) {
 +        last_engaged_in_io = bh->reentrancy_guard->engaged_in_io;
 +        if (bh->reentrancy_guard->engaged_in_io) {
 +            trace_reentrant_aio(bh->ctx, bh->name);
 +        }
 +        bh->reentrancy_guard->engaged_in_io = true;
 +    }
 +
     bh->cb(bh->opaque);
 +
 +    if (bh->reentrancy_guard) {
 +        bh->reentrancy_guard->engaged_in_io = last_engaged_in_io;
 +    }
 }
 /* Multiple occurrences of aio_bh_poll cannot be called concurrently. */
 diff --git a/util/main-loop.c b/util/main-loop.c
 index 06b18b195c..1eacf04691 100644
 --- a/util/main-loop.c
 +++ b/util/main-loop.c
@@ -544,9 +544,11 @@ void main_loop_wait(int nonblocking)
 /* Functions to operate on the main QEMU AioContext.  */
 -QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name)
 +QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name,
 +                         MemReentrancyGuard *reentrancy_guard)
 {
 -    return aio_bh_new_full(qemu_aio_context, cb, opaque, name);
 +    return aio_bh_new_full(qemu_aio_context, cb, opaque, name,
 +                           reentrancy_guard);
 }
 /*
 diff --git a/util/trace-events b/util/trace-events
 index c8f53d7d9f..dc3b1eb3bf 100644
 --- a/util/trace-events
 +++ b/util/trace-events
@@ -11,6 +11,7 @@ poll_remove(void *ctx, void *node, int fd) "ctx %p node %p fd %d"
 # async.c
 aio_co_schedule(void *ctx, void *co) "ctx %p co %p"
 aio_co_schedule_bh_cb(void *ctx, void *co) "ctx %p co %p"
 +reentrant_aio(void *ctx, const char *name) "ctx %p name %s"
 # thread-pool.c
 thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
 -- 
 2.37.3
--- a/SOURCES/kvm-async-avoid-use-after-free-on-re-entrancy-guard.patch
+++ b/SOURCES/kvm-async-avoid-use-after-free-on-re-entrancy-guard.patch
@ -0,0 +1,71 @@
 From d754050d260e2ad890cecd975df6e163c531b40e Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Tue, 9 May 2023 10:29:03 -0400
 Subject: [PATCH 09/15] async: avoid use-after-free on re-entrancy guard
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 277: memory: prevent dma-reentracy issues
 RH-Bugzilla: 1999236
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [9/12] d357650e581c3921bbfe3e2fde5e3f55853b5fab (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1999236
 Upstream: Merged
 CVE: CVE-2021-3750
 commit 7915bd06f25e1803778081161bf6fa10c42dc7cd
 Author: Alexander Bulekov <alxndr@bu.edu>
 Date:   Mon May 1 10:19:56 2023 -0400
    async: avoid use-after-free on re-entrancy guard
    A BH callback can free the BH, causing a use-after-free in aio_bh_call.
    Fix that by keeping a local copy of the re-entrancy guard pointer.
    Buglink: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=58513
    Fixes: 9c86c97f12 ("async: Add an optional reentrancy guard to the BH API")
    Signed-off-by: Alexander Bulekov <alxndr@bu.edu>
    Message-Id: <20230501141956.3444868-1-alxndr@bu.edu>
    Reviewed-by: Thomas Huth <thuth@redhat.com>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 util/async.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)
 diff --git a/util/async.c b/util/async.c
 index 1fff02e7fc..ffe0541c3b 100644
 --- a/util/async.c
 +++ b/util/async.c
@@ -146,18 +146,20 @@ void aio_bh_call(QEMUBH *bh)
 {
     bool last_engaged_in_io = false;
 -    if (bh->reentrancy_guard) {
 -        last_engaged_in_io = bh->reentrancy_guard->engaged_in_io;
 -        if (bh->reentrancy_guard->engaged_in_io) {
 +    /* Make a copy of the guard-pointer as cb may free the bh */
 +    MemReentrancyGuard *reentrancy_guard = bh->reentrancy_guard;
 +    if (reentrancy_guard) {
 +        last_engaged_in_io = reentrancy_guard->engaged_in_io;
 +        if (reentrancy_guard->engaged_in_io) {
             trace_reentrant_aio(bh->ctx, bh->name);
         }
 -        bh->reentrancy_guard->engaged_in_io = true;
 +        reentrancy_guard->engaged_in_io = true;
     }
     bh->cb(bh->opaque);
 -    if (bh->reentrancy_guard) {
 -        bh->reentrancy_guard->engaged_in_io = last_engaged_in_io;
 +    if (reentrancy_guard) {
 +        reentrancy_guard->engaged_in_io = last_engaged_in_io;
     }
 }
 -- 
 2.37.3
--- a/SOURCES/kvm-async-clarify-usage-of-barriers-in-the-polling-case.patch
+++ b/SOURCES/kvm-async-clarify-usage-of-barriers-in-the-polling-case.patch
@ -0,0 +1,66 @@
 From 187eb7a418af93375e42298d06e231e2bec3cf00 Mon Sep 17 00:00:00 2001
 From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 Date: Thu, 9 Mar 2023 08:15:42 -0500
 Subject: [PATCH 10/13] async: clarify usage of barriers in the polling case
 RH-Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-MergeRequest: 263: qatomic: add smp_mb__before/after_rmw()
 RH-Bugzilla: 2168472
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Acked-by: Eric Auger <eric.auger@redhat.com>
 RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
 RH-Acked-by: David Hildenbrand <david@redhat.com>
 RH-Commit: [10/10] 3be07ccc6137a0336becfe63a818d9cbadb38e9c
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2168472
 commit 6229438cca037d42f44a96d38feb15cb102a444f
 Author: Paolo Bonzini <pbonzini@redhat.com>
 Date:   Mon Mar 6 10:43:52 2023 +0100
    async: clarify usage of barriers in the polling case
    Explain that aio_context_notifier_poll() relies on
    aio_notify_accept() to catch all the memory writes that were
    done before ctx->notified was set to true.
    Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
    Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 ---
 util/async.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)
 diff --git a/util/async.c b/util/async.c
 index 795fe699b6..2a63bf90f2 100644
 --- a/util/async.c
 +++ b/util/async.c
@@ -463,8 +463,9 @@ void aio_notify_accept(AioContext *ctx)
     qatomic_set(&ctx->notified, false);
     /*
 -     * Write ctx->notified before reading e.g. bh->flags.  Pairs with smp_wmb
 -     * in aio_notify.
 +     * Order reads of ctx->notified (in aio_context_notifier_poll()) and the
 +     * above clearing of ctx->notified before reads of e.g. bh->flags.  Pairs
 +     * with smp_wmb() in aio_notify.
      */
     smp_mb();
 }
@@ -487,6 +488,11 @@ static bool aio_context_notifier_poll(void *opaque)
     EventNotifier *e = opaque;
     AioContext *ctx = container_of(e, AioContext, notifier);
 +    /*
 +     * No need for load-acquire because we just want to kick the
 +     * event loop.  aio_notify_accept() takes care of synchronizing
 +     * the event loop with the producers.
 +     */
     return qatomic_read(&ctx->notified);
 }
 -- 
 2.37.3
--- a/SOURCES/kvm-async-update-documentation-of-the-memory-barriers.patch
+++ b/SOURCES/kvm-async-update-documentation-of-the-memory-barriers.patch
@ -0,0 +1,111 @@
 From ea3856bb545d19499602830cdc3076d83a981e7a Mon Sep 17 00:00:00 2001
 From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 Date: Thu, 9 Mar 2023 08:15:36 -0500
 Subject: [PATCH 09/13] async: update documentation of the memory barriers
 RH-Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-MergeRequest: 263: qatomic: add smp_mb__before/after_rmw()
 RH-Bugzilla: 2168472
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Acked-by: Eric Auger <eric.auger@redhat.com>
 RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
 RH-Acked-by: David Hildenbrand <david@redhat.com>
 RH-Commit: [9/10] d471da2acf7a107cf75f3327c5e8d7456307160e
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2168472
 commit 8dd48650b43dfde4ebea34191ac267e474bcc29e
 Author: Paolo Bonzini <pbonzini@redhat.com>
 Date:   Mon Mar 6 10:15:06 2023 +0100
    async: update documentation of the memory barriers
    Ever since commit 8c6b0356b539 ("util/async: make bh_aio_poll() O(1)",
    2020-02-22), synchronization between qemu_bh_schedule() and aio_bh_poll()
    is happening when the bottom half is enqueued in the bh_list; not
    when the flags are set.  Update the documentation to match.
    Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 ---
 util/async.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)
 diff --git a/util/async.c b/util/async.c
 index 6f6717a34b..795fe699b6 100644
 --- a/util/async.c
 +++ b/util/async.c
@@ -71,14 +71,21 @@ static void aio_bh_enqueue(QEMUBH *bh, unsigned new_flags)
     unsigned old_flags;
     /*
 -     * The memory barrier implicit in qatomic_fetch_or makes sure that:
 -     * 1. idle & any writes needed by the callback are done before the
 -     *    locations are read in the aio_bh_poll.
 -     * 2. ctx is loaded before the callback has a chance to execute and bh
 -     *    could be freed.
 +     * Synchronizes with atomic_fetch_and() in aio_bh_dequeue(), ensuring that
 +     * insertion starts after BH_PENDING is set.
      */
     old_flags = qatomic_fetch_or(&bh->flags, BH_PENDING | new_flags);
 +
     if (!(old_flags & BH_PENDING)) {
 +        /*
 +         * At this point the bottom half becomes visible to aio_bh_poll().
 +         * This insertion thus synchronizes with QSLIST_MOVE_ATOMIC in
 +         * aio_bh_poll(), ensuring that:
 +         * 1. any writes needed by the callback are visible from the callback
 +         *    after aio_bh_dequeue() returns bh.
 +         * 2. ctx is loaded before the callback has a chance to execute and bh
 +         *    could be freed.
 +         */
         QSLIST_INSERT_HEAD_ATOMIC(&ctx->bh_list, bh, next);
     }
@@ -97,11 +104,8 @@ static QEMUBH *aio_bh_dequeue(BHList *head, unsigned *flags)
     QSLIST_REMOVE_HEAD(head, next);
     /*
 -     * The qatomic_and is paired with aio_bh_enqueue().  The implicit memory
 -     * barrier ensures that the callback sees all writes done by the scheduling
 -     * thread.  It also ensures that the scheduling thread sees the cleared
 -     * flag before bh->cb has run, and thus will call aio_notify again if
 -     * necessary.
 +     * Synchronizes with qatomic_fetch_or() in aio_bh_enqueue(), ensuring that
 +     * the removal finishes before BH_PENDING is reset.
      */
     *flags = qatomic_fetch_and(&bh->flags,
                               ~(BH_PENDING | BH_SCHEDULED | BH_IDLE));
@@ -148,6 +152,7 @@ int aio_bh_poll(AioContext *ctx)
     BHListSlice *s;
     int ret = 0;
 +    /* Synchronizes with QSLIST_INSERT_HEAD_ATOMIC in aio_bh_enqueue().  */
     QSLIST_MOVE_ATOMIC(&slice.bh_list, &ctx->bh_list);
     QSIMPLEQ_INSERT_TAIL(&ctx->bh_slice_list, &slice, next);
@@ -437,15 +442,15 @@ LuringState *aio_get_linux_io_uring(AioContext *ctx)
 void aio_notify(AioContext *ctx)
 {
     /*
 -     * Write e.g. bh->flags before writing ctx->notified.  Pairs with smp_mb in
 -     * aio_notify_accept.
 +     * Write e.g. ctx->bh_list before writing ctx->notified.  Pairs with
 +     * smp_mb() in aio_notify_accept().
      */
     smp_wmb();
     qatomic_set(&ctx->notified, true);
     /*
 -     * Write ctx->notified before reading ctx->notify_me.  Pairs
 -     * with smp_mb in aio_ctx_prepare or aio_poll.
 +     * Write ctx->notified (and also ctx->bh_list) before reading ctx->notify_me.
 +     * Pairs with smp_mb() in aio_ctx_prepare or aio_poll.
      */
     smp_mb();
     if (qatomic_read(&ctx->notify_me)) {
 -- 
 2.37.3
--- a/SOURCES/kvm-bcm2835_property-disable-reentrancy-detection-for-io.patch
+++ b/SOURCES/kvm-bcm2835_property-disable-reentrancy-detection-for-io.patch
@ -0,0 +1,58 @@
 From 7715635d018351e0a5c4c25aec2c71a2fe3b9e69 Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Tue, 9 May 2023 10:29:03 -0400
 Subject: [PATCH 06/15] bcm2835_property: disable reentrancy detection for
 iomem
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 277: memory: prevent dma-reentracy issues
 RH-Bugzilla: 1999236
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [6/12] 4d6187430ca1c4309a36824c0c6815d2a763db1a (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1999236
 Upstream: Merged
 CVE: CVE-2021-3750
 commit 985c4a4e547afb9573b6bd6843d20eb2c3d1d1cd
 Author: Alexander Bulekov <alxndr@bu.edu>
 Date:   Thu Apr 27 17:10:11 2023 -0400
    bcm2835_property: disable reentrancy detection for iomem
    As the code is designed for re-entrant calls from bcm2835_property to
    bcm2835_mbox and back into bcm2835_property, mark iomem as
    reentrancy-safe.
    Signed-off-by: Alexander Bulekov <alxndr@bu.edu>
    Reviewed-by: Thomas Huth <thuth@redhat.com>
    Message-Id: <20230427211013.2994127-7-alxndr@bu.edu>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 hw/misc/bcm2835_property.c | 7 +++++++
 1 file changed, 7 insertions(+)
 diff --git a/hw/misc/bcm2835_property.c b/hw/misc/bcm2835_property.c
 index 73941bdae9..022b5a849c 100644
 --- a/hw/misc/bcm2835_property.c
 +++ b/hw/misc/bcm2835_property.c
@@ -377,6 +377,13 @@ static void bcm2835_property_init(Object *obj)
     memory_region_init_io(&s->iomem, OBJECT(s), &bcm2835_property_ops, s,
                           TYPE_BCM2835_PROPERTY, 0x10);
 +
 +    /*
 +     * bcm2835_property_ops call into bcm2835_mbox, which in-turn reads from
 +     * iomem. As such, mark iomem as re-entracy safe.
 +     */
 +    s->iomem.disable_reentrancy_guard = true;
 +
     sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->iomem);
     sysbus_init_irq(SYS_BUS_DEVICE(s), &s->mbox_irq);
 }
 -- 
 2.37.3
--- a/SOURCES/kvm-block-Collapse-padded-I-O-vecs-exceeding-IOV_MAX.patch
+++ b/SOURCES/kvm-block-Collapse-padded-I-O-vecs-exceeding-IOV_MAX.patch
@ -0,0 +1,359 @@
 From 1f7520baa6f0bf02ccba2ebfe7d1d5bf6520f95a Mon Sep 17 00:00:00 2001
 From: Hanna Czenczek <hreitz@redhat.com>
 Date: Tue, 11 Apr 2023 19:34:16 +0200
 Subject: [PATCH 2/5] block: Collapse padded I/O vecs exceeding IOV_MAX
 RH-Author: Hanna Czenczek <hreitz@redhat.com>
 RH-MergeRequest: 291: block: Split padded I/O vectors exceeding IOV_MAX
 RH-Bugzilla: 2141964
 RH-Acked-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Commit: [2/5] 1d86ce8398e4ab66e308a686f9855c963e52b0a9
 When processing vectored guest requests that are not aligned to the
 storage request alignment, we pad them by adding head and/or tail
 buffers for a read-modify-write cycle.
 The guest can submit I/O vectors up to IOV_MAX (1024) in length, but
 with this padding, the vector can exceed that limit.  As of
 4c002cef0e9abe7135d7916c51abce47f7fc1ee2 ("util/iov: make
 qemu_iovec_init_extended() honest"), we refuse to pad vectors beyond the
 limit, instead returning an error to the guest.
 To the guest, this appears as a random I/O error.  We should not return
 an I/O error to the guest when it issued a perfectly valid request.
 Before 4c002cef0e9abe7135d7916c51abce47f7fc1ee2, we just made the vector
 longer than IOV_MAX, which generally seems to work (because the guest
 assumes a smaller alignment than we really have, file-posix's
 raw_co_prw() will generally see bdrv_qiov_is_aligned() return false, and
 so emulate the request, so that the IOV_MAX does not matter).  However,
 that does not seem exactly great.
 I see two ways to fix this problem:
 1. We split such long requests into two requests.
 2. We join some elements of the vector into new buffers to make it
   shorter.
 I am wary of (1), because it seems like it may have unintended side
 effects.
 (2) on the other hand seems relatively simple to implement, with
 hopefully few side effects, so this patch does that.
 To do this, the use of qemu_iovec_init_extended() in bdrv_pad_request()
 is effectively replaced by the new function bdrv_create_padded_qiov(),
 which not only wraps the request IOV with padding head/tail, but also
 ensures that the resulting vector will not have more than IOV_MAX
 elements.  Putting that functionality into qemu_iovec_init_extended() is
 infeasible because it requires allocating a bounce buffer; doing so
 would require many more parameters (buffer alignment, how to initialize
 the buffer, and out parameters like the buffer, its length, and the
 original elements), which is not reasonable.
 Conversely, it is not difficult to move qemu_iovec_init_extended()'s
 functionality into bdrv_create_padded_qiov() by using public
 qemu_iovec_* functions, so that is what this patch does.
 Because bdrv_pad_request() was the only "serious" user of
 qemu_iovec_init_extended(), the next patch will remove the latter
 function, so the functionality is not implemented twice.
 Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2141964
 Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
 Message-Id: <20230411173418.19549-3-hreitz@redhat.com>
 Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
 (cherry picked from commit 18743311b829cafc1737a5f20bc3248d5f91ee2a)
 Conflicts:
 	block/io.c: Downstream bdrv_pad_request() has no @flags
        parameter.
 Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
 ---
 block/io.c | 166 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 151 insertions(+), 15 deletions(-)
 diff --git a/block/io.c b/block/io.c
 index c3e7301613..0fe8f0dd40 100644
 --- a/block/io.c
 +++ b/block/io.c
@@ -1624,6 +1624,14 @@ out:
  * @merge_reads is true for small requests,
  * if @buf_len == @head + bytes + @tail. In this case it is possible that both
  * head and tail exist but @buf_len == align and @tail_buf == @buf.
 + *
 + * @write is true for write requests, false for read requests.
 + *
 + * If padding makes the vector too long (exceeding IOV_MAX), then we need to
 + * merge existing vector elements into a single one.  @collapse_bounce_buf acts
 + * as the bounce buffer in such cases.  @pre_collapse_qiov has the pre-collapse
 + * I/O vector elements so for read requests, the data can be copied back after
 + * the read is done.
  */
 typedef struct BdrvRequestPadding {
     uint8_t *buf;
@@ -1632,11 +1640,17 @@ typedef struct BdrvRequestPadding {
     size_t head;
     size_t tail;
     bool merge_reads;
 +    bool write;
     QEMUIOVector local_qiov;
 +
 +    uint8_t *collapse_bounce_buf;
 +    size_t collapse_len;
 +    QEMUIOVector pre_collapse_qiov;
 } BdrvRequestPadding;
 static bool bdrv_init_padding(BlockDriverState *bs,
                               int64_t offset, int64_t bytes,
 +                              bool write,
                               BdrvRequestPadding *pad)
 {
     int64_t align = bs->bl.request_alignment;
@@ -1668,6 +1682,8 @@ static bool bdrv_init_padding(BlockDriverState *bs,
         pad->tail_buf = pad->buf + pad->buf_len - align;
     }
 +    pad->write = write;
 +
     return true;
 }
@@ -1733,8 +1749,23 @@ zero_mem:
     return 0;
 }
 -static void bdrv_padding_destroy(BdrvRequestPadding *pad)
 +/**
 + * Free *pad's associated buffers, and perform any necessary finalization steps.
 + */
 +static void bdrv_padding_finalize(BdrvRequestPadding *pad)
 {
 +    if (pad->collapse_bounce_buf) {
 +        if (!pad->write) {
 +            /*
 +             * If padding required elements in the vector to be collapsed into a
 +             * bounce buffer, copy the bounce buffer content back
 +             */
 +            qemu_iovec_from_buf(&pad->pre_collapse_qiov, 0,
 +                                pad->collapse_bounce_buf, pad->collapse_len);
 +        }
 +        qemu_vfree(pad->collapse_bounce_buf);
 +        qemu_iovec_destroy(&pad->pre_collapse_qiov);
 +    }
     if (pad->buf) {
         qemu_vfree(pad->buf);
         qemu_iovec_destroy(&pad->local_qiov);
@@ -1742,6 +1773,101 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
     memset(pad, 0, sizeof(*pad));
 }
 +/*
 + * Create pad->local_qiov by wrapping @iov in the padding head and tail, while
 + * ensuring that the resulting vector will not exceed IOV_MAX elements.
 + *
 + * To ensure this, when necessary, the first two or three elements of @iov are
 + * merged into pad->collapse_bounce_buf and replaced by a reference to that
 + * bounce buffer in pad->local_qiov.
 + *
 + * After performing a read request, the data from the bounce buffer must be
 + * copied back into pad->pre_collapse_qiov (e.g. by bdrv_padding_finalize()).
 + */
 +static int bdrv_create_padded_qiov(BlockDriverState *bs,
 +                                   BdrvRequestPadding *pad,
 +                                   struct iovec *iov, int niov,
 +                                   size_t iov_offset, size_t bytes)
 +{
 +    int padded_niov, surplus_count, collapse_count;
 +
 +    /* Assert this invariant */
 +    assert(niov <= IOV_MAX);
 +
 +    /*
 +     * Cannot pad if resulting length would exceed SIZE_MAX.  Returning an error
 +     * to the guest is not ideal, but there is little else we can do.  At least
 +     * this will practically never happen on 64-bit systems.
 +     */
 +    if (SIZE_MAX - pad->head < bytes ||
 +        SIZE_MAX - pad->head - bytes < pad->tail)
 +    {
 +        return -EINVAL;
 +    }
 +
 +    /* Length of the resulting IOV if we just concatenated everything */
 +    padded_niov = !!pad->head + niov + !!pad->tail;
 +
 +    qemu_iovec_init(&pad->local_qiov, MIN(padded_niov, IOV_MAX));
 +
 +    if (pad->head) {
 +        qemu_iovec_add(&pad->local_qiov, pad->buf, pad->head);
 +    }
 +
 +    /*
 +     * If padded_niov > IOV_MAX, we cannot just concatenate everything.
 +     * Instead, merge the first two or three elements of @iov to reduce the
 +     * number of vector elements as necessary.
 +     */
 +    if (padded_niov > IOV_MAX) {
 +        /*
 +         * Only head and tail can have led to the number of entries exceeding
 +         * IOV_MAX, so we can exceed it by the head and tail at most.  We need
 +         * to reduce the number of elements by `surplus_count`, so we merge that
 +         * many elements plus one into one element.
 +         */
 +        surplus_count = padded_niov - IOV_MAX;
 +        assert(surplus_count <= !!pad->head + !!pad->tail);
 +        collapse_count = surplus_count + 1;
 +
 +        /*
 +         * Move the elements to collapse into `pad->pre_collapse_qiov`, then
 +         * advance `iov` (and associated variables) by those elements.
 +         */
 +        qemu_iovec_init(&pad->pre_collapse_qiov, collapse_count);
 +        qemu_iovec_concat_iov(&pad->pre_collapse_qiov, iov,
 +                              collapse_count, iov_offset, SIZE_MAX);
 +        iov += collapse_count;
 +        iov_offset = 0;
 +        niov -= collapse_count;
 +        bytes -= pad->pre_collapse_qiov.size;
 +
 +        /*
 +         * Construct the bounce buffer to match the length of the to-collapse
 +         * vector elements, and for write requests, initialize it with the data
 +         * from those elements.  Then add it to `pad->local_qiov`.
 +         */
 +        pad->collapse_len = pad->pre_collapse_qiov.size;
 +        pad->collapse_bounce_buf = qemu_blockalign(bs, pad->collapse_len);
 +        if (pad->write) {
 +            qemu_iovec_to_buf(&pad->pre_collapse_qiov, 0,
 +                              pad->collapse_bounce_buf, pad->collapse_len);
 +        }
 +        qemu_iovec_add(&pad->local_qiov,
 +                       pad->collapse_bounce_buf, pad->collapse_len);
 +    }
 +
 +    qemu_iovec_concat_iov(&pad->local_qiov, iov, niov, iov_offset, bytes);
 +
 +    if (pad->tail) {
 +        qemu_iovec_add(&pad->local_qiov,
 +                       pad->buf + pad->buf_len - pad->tail, pad->tail);
 +    }
 +
 +    assert(pad->local_qiov.niov == MIN(padded_niov, IOV_MAX));
 +    return 0;
 +}
 +
 /*
  * bdrv_pad_request
  *
@@ -1749,6 +1875,8 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
  * read of padding, bdrv_padding_rmw_read() should be called separately if
  * needed.
  *
 + * @write is true for write requests, false for read requests.
 + *
  * Request parameters (@qiov, &qiov_offset, &offset, &bytes) are in-out:
  *  - on function start they represent original request
  *  - on failure or when padding is not needed they are unchanged
@@ -1757,25 +1885,33 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
 static int bdrv_pad_request(BlockDriverState *bs,
                             QEMUIOVector **qiov, size_t *qiov_offset,
                             int64_t *offset, int64_t *bytes,
 +                            bool write,
                             BdrvRequestPadding *pad, bool *padded)
 {
     int ret;
 +    struct iovec *sliced_iov;
 +    int sliced_niov;
 +    size_t sliced_head, sliced_tail;
     bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort);
 -    if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
 +    if (!bdrv_init_padding(bs, *offset, *bytes, write, pad)) {
         if (padded) {
             *padded = false;
         }
         return 0;
     }
 -    ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
 -                                   *qiov, *qiov_offset, *bytes,
 -                                   pad->buf + pad->buf_len - pad->tail,
 -                                   pad->tail);
 +    sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
 +                                  &sliced_head, &sliced_tail,
 +                                  &sliced_niov);
 +
 +    /* Guaranteed by bdrv_check_qiov_request() */
 +    assert(*bytes <= SIZE_MAX);
 +    ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
 +                                  sliced_head, *bytes);
     if (ret < 0) {
 -        bdrv_padding_destroy(pad);
 +        bdrv_padding_finalize(pad);
         return ret;
     }
     *bytes += pad->head + pad->tail;
@@ -1836,8 +1972,8 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
         flags |= BDRV_REQ_COPY_ON_READ;
     }
 -    ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
 -                           NULL);
 +    ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, false,
 +                           &pad, NULL);
     if (ret < 0) {
         goto fail;
     }
@@ -1847,7 +1983,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
                               bs->bl.request_alignment,
                               qiov, qiov_offset, flags);
     tracked_request_end(&req);
 -    bdrv_padding_destroy(&pad);
 +    bdrv_padding_finalize(&pad);
 fail:
     bdrv_dec_in_flight(bs);
@@ -2167,7 +2303,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
     bool padding;
     BdrvRequestPadding pad;
 -    padding = bdrv_init_padding(bs, offset, bytes, &pad);
 +    padding = bdrv_init_padding(bs, offset, bytes, true, &pad);
     if (padding) {
         bdrv_make_request_serialising(req, align);
@@ -2214,7 +2350,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
     }
 out:
 -    bdrv_padding_destroy(&pad);
 +    bdrv_padding_finalize(&pad);
     return ret;
 }
@@ -2280,8 +2416,8 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
          * bdrv_co_do_zero_pwritev() does aligning by itself, so, we do
          * alignment only if there is no ZERO flag.
          */
 -        ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
 -                               &padded);
 +        ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, true,
 +                               &pad, &padded);
         if (ret < 0) {
             return ret;
         }
@@ -2310,7 +2446,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
     ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
                                qiov, qiov_offset, flags);
 -    bdrv_padding_destroy(&pad);
 +    bdrv_padding_finalize(&pad);
 out:
     tracked_request_end(&req);
 -- 
 2.39.3
--- a/SOURCES/kvm-block-Fix-pad_request-s-request-restriction.patch
+++ b/SOURCES/kvm-block-Fix-pad_request-s-request-restriction.patch
@ -0,0 +1,75 @@
 From b9866279996ee065cb524bf30bc70e22efbab303 Mon Sep 17 00:00:00 2001
 From: Hanna Czenczek <hreitz@redhat.com>
 Date: Fri, 14 Jul 2023 10:59:38 +0200
 Subject: [PATCH 5/5] block: Fix pad_request's request restriction
 RH-Author: Hanna Czenczek <hreitz@redhat.com>
 RH-MergeRequest: 291: block: Split padded I/O vectors exceeding IOV_MAX
 RH-Bugzilla: 2141964
 RH-Acked-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Commit: [5/5] f9188bd089d6c67185ea1accde20d491a2ed3193
 bdrv_pad_request() relies on requests' lengths not to exceed SIZE_MAX,
 which bdrv_check_qiov_request() does not guarantee.
 bdrv_check_request32() however will guarantee this, and both of
 bdrv_pad_request()'s callers (bdrv_co_preadv_part() and
 bdrv_co_pwritev_part()) already run it before calling
 bdrv_pad_request().  Therefore, bdrv_pad_request() can safely call
 bdrv_check_request32() without expecting error, too.
 In effect, this patch will not change guest-visible behavior.  It is a
 clean-up to tighten a condition to match what is guaranteed by our
 callers, and which exists purely to show clearly why the subsequent
 assertion (`assert(*bytes <= SIZE_MAX)`) is always true.
 Note there is a difference between the interfaces of
 bdrv_check_qiov_request() and bdrv_check_request32(): The former takes
 an errp, the latter does not, so we can no longer just pass
 &error_abort.  Instead, we need to check the returned value.  While we
 do expect success (because the callers have already run this function),
 an assert(ret == 0) is not much simpler than just to return an error if
 it occurs, so let us handle errors by returning them up the stack now.
 Reported-by: Peter Maydell <peter.maydell@linaro.org>
 Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
 Message-id: 20230714085938.202730-1-hreitz@redhat.com
 Fixes: 18743311b829cafc1737a5f20bc3248d5f91ee2a
       ("block: Collapse padded I/O vecs exceeding IOV_MAX")
 Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
 ---
 block/io.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)
 diff --git a/block/io.c b/block/io.c
 index 0fe8f0dd40..8ae57728a6 100644
 --- a/block/io.c
 +++ b/block/io.c
@@ -1893,7 +1893,11 @@ static int bdrv_pad_request(BlockDriverState *bs,
     int sliced_niov;
     size_t sliced_head, sliced_tail;
 -    bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort);
 +    /* Should have been checked by the caller already */
 +    ret = bdrv_check_request32(*offset, *bytes, *qiov, *qiov_offset);
 +    if (ret < 0) {
 +        return ret;
 +    }
     if (!bdrv_init_padding(bs, *offset, *bytes, write, pad)) {
         if (padded) {
@@ -1906,7 +1910,7 @@ static int bdrv_pad_request(BlockDriverState *bs,
                                   &sliced_head, &sliced_tail,
                                   &sliced_niov);
 -    /* Guaranteed by bdrv_check_qiov_request() */
 +    /* Guaranteed by bdrv_check_request32() */
     assert(*bytes <= SIZE_MAX);
     ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
                                   sliced_head, *bytes);
 -- 
 2.39.3
--- a/SOURCES/kvm-block-Parse-filenames-only-when-explicitly-requested.patch
+++ b/SOURCES/kvm-block-Parse-filenames-only-when-explicitly-requested.patch
@ -0,0 +1,260 @@
 From c4ba1f1755031a0ac2f600ed8c17e7dcb6b2b857 Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Wed, 5 Jun 2024 19:56:51 -0400
 Subject: [PATCH 5/5] block: Parse filenames only when explicitly requested
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 5: EMBARGOED CVE-2024-4467 for rhel-8.10.z (PRDSC)
 RH-Jira: RHEL-35616
 RH-CVE: CVE-2024-4467
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Commit: [5/5] a3e197add64fc6950c4ac576e34d833dfae7ee34
 Conflicts: - bdrv_open_child_common(): bdrv_graph_wrlock/unlock()
             don't exist in this code version. We ignore them.
 	     bdrv_open_inherit(): no_coroutine_fn/GRAPH_UNLOCKED
             doesn't exist. We ignore it.
           - Changes to bdrv_open_file_child() didn't apply cleanly,
             but fixing it is straight-forward.
           - GLOBAL_STATE_CODE() not present in this code. Ignoring it.
           - bdrv_open_file_child(): Need to continue setting of
 	     parent->file.
 commit f44c2941d4419e60f16dea3e9adca164e75aa78d
 Author: Kevin Wolf <kwolf@redhat.com>
 Date:   Thu Apr 25 14:56:02 2024 +0200
    block: Parse filenames only when explicitly requested
    When handling image filenames from legacy options such as -drive or from
    tools, these filenames are parsed for protocol prefixes, including for
    the json:{} pseudo-protocol.
    This behaviour is intended for filenames that come directly from the
    command line and for backing files, which may come from the image file
    itself. Higher level management tools generally take care to verify that
    untrusted images don't contain a bad (or any) backing file reference;
    'qemu-img info' is a suitable tool for this.
    However, for other files that can be referenced in images, such as
    qcow2 data files or VMDK extents, the string from the image file is
    usually not verified by management tools - and 'qemu-img info' wouldn't
    be suitable because in contrast to backing files, it already opens these
    other referenced files. So here the string should be interpreted as a
    literal local filename. More complex configurations need to be specified
    explicitly on the command line or in QMP.
    This patch changes bdrv_open_inherit() so that it only parses filenames
    if a new parameter parse_filename is true. It is set for the top level
    in bdrv_open(), for the file child and for the backing file child. All
    other callers pass false and disable filename parsing this way.
    Signed-off-by: Kevin Wolf <kwolf@redhat.com>
    Reviewed-by: Eric Blake <eblake@redhat.com>
    Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
    Reviewed-by: Hanna Czenczek <hreitz@redhat.com>
    Upstream: N/A, embargoed
    Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 block.c | 81 +++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 56 insertions(+), 25 deletions(-)
 diff --git a/block.c b/block.c
 index 889f878565..ddebf50efa 100644
 --- a/block.c
 +++ b/block.c
@@ -82,6 +82,7 @@ static BlockDriverState *bdrv_open_inherit(const char *filename,
                                            BlockDriverState *parent,
                                            const BdrvChildClass *child_class,
                                            BdrvChildRole child_role,
 +                                           bool parse_filename,
                                            Error **errp);
 static bool bdrv_recurse_has_child(BlockDriverState *bs,
@@ -1926,7 +1927,8 @@ static void parse_json_protocol(QDict *options, const char **pfilename,
  * block driver has been specified explicitly.
  */
 static int bdrv_fill_options(QDict **options, const char *filename,
 -                             int *flags, Error **errp)
 +                             int *flags, bool allow_parse_filename,
 +                             Error **errp)
 {
     const char *drvname;
     bool protocol = *flags & BDRV_O_PROTOCOL;
@@ -1966,7 +1968,7 @@ static int bdrv_fill_options(QDict **options, const char *filename,
     if (protocol && filename) {
         if (!qdict_haskey(*options, "filename")) {
             qdict_put_str(*options, "filename", filename);
 -            parse_filename = true;
 +            parse_filename = allow_parse_filename;
         } else {
             error_setg(errp, "Can't specify 'file' and 'filename' options at "
                              "the same time");
@@ -3439,7 +3441,8 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
     }
     backing_hd = bdrv_open_inherit(backing_filename, reference, options, 0, bs,
 -                                   &child_of_bds, bdrv_backing_role(bs), errp);
 +                                   &child_of_bds, bdrv_backing_role(bs), true,
 +                                   errp);
     if (!backing_hd) {
         bs->open_flags |= BDRV_O_NO_BACKING;
         error_prepend(errp, "Could not open backing file: ");
@@ -3472,7 +3475,8 @@ free_exit:
 static BlockDriverState *
 bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key,
                    BlockDriverState *parent, const BdrvChildClass *child_class,
 -                   BdrvChildRole child_role, bool allow_none, Error **errp)
 +                   BdrvChildRole child_role, bool allow_none,
 +                   bool parse_filename, Error **errp)
 {
     BlockDriverState *bs = NULL;
     QDict *image_options;
@@ -3503,7 +3507,8 @@ bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key,
     }
     bs = bdrv_open_inherit(filename, reference, image_options, 0,
 -                           parent, child_class, child_role, errp);
 +                           parent, child_class, child_role, parse_filename,
 +                           errp);
     if (!bs) {
         goto done;
     }
@@ -3513,6 +3518,29 @@ done:
     return bs;
 }
 +static BdrvChild *bdrv_open_child_common(const char *filename,
 +                                         QDict *options, const char *bdref_key,
 +                                         BlockDriverState *parent,
 +                                         const BdrvChildClass *child_class,
 +                                         BdrvChildRole child_role,
 +                                         bool allow_none, bool parse_filename,
 +                                         Error **errp)
 +{
 +    BlockDriverState *bs;
 +    BdrvChild *child;
 +
 +    bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_class,
 +                            child_role, allow_none, parse_filename, errp);
 +    if (bs == NULL) {
 +        return NULL;
 +    }
 +
 +    child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role,
 +                              errp);
 +
 +    return child;
 +}
 +
 /*
  * Opens a disk image whose options are given as BlockdevRef in another block
  * device's options.
@@ -3534,20 +3562,17 @@ BdrvChild *bdrv_open_child(const char *filename,
                            BdrvChildRole child_role,
                            bool allow_none, Error **errp)
 {
 -    BlockDriverState *bs;
 -
 -    bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_class,
 -                            child_role, allow_none, errp);
 -    if (bs == NULL) {
 -        return NULL;
 -    }
 -
 -    return bdrv_attach_child(parent, bs, bdref_key, child_class, child_role,
 -                             errp);
 +    return bdrv_open_child_common(filename, options, bdref_key, parent,
 +                                  child_class, child_role, allow_none, false,
 +                                  errp);
 }
 /*
 - * Wrapper on bdrv_open_child() for most popular case: open primary child of bs.
 + * This does mostly the same as bdrv_open_child(), but for opening the primary
 + * child of a node. A notable difference from bdrv_open_child() is that it
 + * enables filename parsing for protocol names (including json:).
 + *
 + * @parent can move to a different AioContext in this function.
  */
 int bdrv_open_file_child(const char *filename,
                          QDict *options, const char *bdref_key,
@@ -3558,8 +3583,9 @@ int bdrv_open_file_child(const char *filename,
     role = parent->drv->is_filter ?
         (BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY) : BDRV_CHILD_IMAGE;
 -    parent->file = bdrv_open_child(filename, options, bdref_key, parent,
 -                                   &child_of_bds, role, false, errp);
 +    parent->file = bdrv_open_child_common(filename, options, bdref_key, parent,
 +                                          &child_of_bds, role, false, true,
 +                                          errp);
     return parent->file ? 0 : -EINVAL;
 }
@@ -3599,7 +3625,8 @@ BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp)
     }
 -    bs = bdrv_open_inherit(NULL, reference, qdict, 0, NULL, NULL, 0, errp);
 +    bs = bdrv_open_inherit(NULL, reference, qdict, 0, NULL, NULL, 0, false,
 +                           errp);
     obj = NULL;
     qobject_unref(obj);
     visit_free(v);
@@ -3690,6 +3717,7 @@ static BlockDriverState *bdrv_open_inherit(const char *filename,
                                            BlockDriverState *parent,
                                            const BdrvChildClass *child_class,
                                            BdrvChildRole child_role,
 +                                           bool parse_filename,
                                            Error **errp)
 {
     int ret;
@@ -3733,9 +3761,11 @@ static BlockDriverState *bdrv_open_inherit(const char *filename,
     }
     /* json: syntax counts as explicit options, as if in the QDict */
 -    parse_json_protocol(options, &filename, &local_err);
 -    if (local_err) {
 -        goto fail;
 +    if (parse_filename) {
 +        parse_json_protocol(options, &filename, &local_err);
 +        if (local_err) {
 +            goto fail;
 +        }
     }
     bs->explicit_options = qdict_clone_shallow(options);
@@ -3760,7 +3790,8 @@ static BlockDriverState *bdrv_open_inherit(const char *filename,
                                      parent->open_flags, parent->options);
     }
 -    ret = bdrv_fill_options(&options, filename, &flags, &local_err);
 +    ret = bdrv_fill_options(&options, filename, &flags, parse_filename,
 +                            &local_err);
     if (ret < 0) {
         goto fail;
     }
@@ -3829,7 +3860,7 @@ static BlockDriverState *bdrv_open_inherit(const char *filename,
         file_bs = bdrv_open_child_bs(filename, options, "file", bs,
                                      &child_of_bds, BDRV_CHILD_IMAGE,
 -                                     true, &local_err);
 +                                     true, true, &local_err);
         if (local_err) {
             goto fail;
         }
@@ -3974,7 +4005,7 @@ BlockDriverState *bdrv_open(const char *filename, const char *reference,
                             QDict *options, int flags, Error **errp)
 {
     return bdrv_open_inherit(filename, reference, options, flags, NULL,
 -                             NULL, 0, errp);
 +                             NULL, 0, true, errp);
 }
 /* Return true if the NULL-terminated @list contains @str */
 -- 
 2.39.3
--- a/SOURCES/kvm-block-introduce-bdrv_open_file_child-helper.patch
+++ b/SOURCES/kvm-block-introduce-bdrv_open_file_child-helper.patch
@ -0,0 +1,566 @@
 From 996680dd6d5afd51918e600126dbfed4dfe89e05 Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Sun, 9 Jun 2024 23:08:39 -0400
 Subject: [PATCH 4/5] block: introduce bdrv_open_file_child() helper
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 5: EMBARGOED CVE-2024-4467 for rhel-8.10.z (PRDSC)
 RH-Jira: RHEL-35616
 RH-CVE: CVE-2024-4467
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Commit: [4/5] 9f582a9aff740eb9ec6f64bfec94854038d8545f
 Conflicts: - copy-before-write.c::cbw_open() is an older version than
             upstream, but introduction of the new function is
 	     straight-forward.
           - include/block/block-global-state.h doesn't exist in this
             code version. Adding the prototype to
             include/block/block.h instead.
           - struct BlockDriver has no field 'filtered_child_is_backing'
             We remove the corresponding assert() in the new function.
 commit 83930780325b144a5908c45b3957b9b6457b3831
 Author: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
 Date:   Tue Jul 26 23:11:21 2022 +0300
    block: introduce bdrv_open_file_child() helper
    Almost all drivers call bdrv_open_child() similarly. Let's create a
    helper for this.
    The only not updated drivers that call bdrv_open_child() to set
    bs->file are raw-format and snapshot-access:
        raw-format sometimes wants to have a filtered child but
            doesn't set drv->is_filter to true.
        snapshot-access wants only DATA | PRIMARY
    Possibly we should implement a drv->is_filter_func() handler, to consider
    raw-format as a filter when it works as a filter. But that's another story.
    Note also, that we decrease assignments to bs->file in code: it helps
    us restrict modifying this field in further commit.
    Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
    Reviewed-by: Hanna Reitz <hreitz@redhat.com>
    Message-Id: <20220726201134.924743-3-vsementsov@yandex-team.ru>
    Reviewed-by: Kevin Wolf <kwolf@redhat.com>
    Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 block.c                   | 18 ++++++++++++++++++
 block/blkdebug.c          |  9 +++------
 block/blklogwrites.c      |  7 ++-----
 block/blkreplay.c         |  7 ++-----
 block/blkverify.c         |  9 +++------
 block/bochs.c             |  7 +++----
 block/cloop.c             |  7 +++----
 block/copy-before-write.c |  9 ++++-----
 block/copy-on-read.c      |  9 ++++-----
 block/crypto.c            | 11 ++++++-----
 block/dmg.c               |  7 +++----
 block/filter-compress.c   |  8 +++-----
 block/parallels.c         |  7 +++----
 block/preallocate.c       |  9 ++++-----
 block/qcow.c              |  6 ++----
 block/qcow2.c             |  8 ++++----
 block/qed.c               |  8 ++++----
 block/replication.c       |  8 +++-----
 block/throttle.c          |  8 +++-----
 block/vdi.c               |  7 +++----
 block/vhdx.c              |  7 +++----
 block/vmdk.c              |  7 +++----
 block/vpc.c               |  7 +++----
 include/block/block.h     |  3 +++
 24 files changed, 92 insertions(+), 101 deletions(-)
 diff --git a/block.c b/block.c
 index 0ac5b163d2..889f878565 100644
 --- a/block.c
 +++ b/block.c
@@ -3546,6 +3546,24 @@ BdrvChild *bdrv_open_child(const char *filename,
                              errp);
 }
 +/*
 + * Wrapper on bdrv_open_child() for most popular case: open primary child of bs.
 + */
 +int bdrv_open_file_child(const char *filename,
 +                         QDict *options, const char *bdref_key,
 +                         BlockDriverState *parent, Error **errp)
 +{
 +    BdrvChildRole role;
 +
 +    role = parent->drv->is_filter ?
 +        (BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY) : BDRV_CHILD_IMAGE;
 +
 +    parent->file = bdrv_open_child(filename, options, bdref_key, parent,
 +                                   &child_of_bds, role, false, errp);
 +
 +    return parent->file ? 0 : -EINVAL;
 +}
 +
 /*
  * TODO Future callers may need to specify parent/child_class in order for
  * option inheritance to work. Existing callers use it for the root node.
 diff --git a/block/blkdebug.c b/block/blkdebug.c
 index bbf2948703..5fcfc8ac6f 100644
 --- a/block/blkdebug.c
 +++ b/block/blkdebug.c
@@ -503,12 +503,9 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
     }
     /* Open the image file */
 -    bs->file = bdrv_open_child(qemu_opt_get(opts, "x-image"), options, "image",
 -                               bs, &child_of_bds,
 -                               BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
 -                               false, errp);
 -    if (!bs->file) {
 -        ret = -EINVAL;
 +    ret = bdrv_open_file_child(qemu_opt_get(opts, "x-image"), options, "image",
 +                               bs, errp);
 +    if (ret < 0) {
         goto out;
     }
 diff --git a/block/blklogwrites.c b/block/blklogwrites.c
 index f7a251e91f..f66a617eb3 100644
 --- a/block/blklogwrites.c
 +++ b/block/blklogwrites.c
@@ -155,11 +155,8 @@ static int blk_log_writes_open(BlockDriverState *bs, QDict *options, int flags,
     }
     /* Open the file */
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, false,
 -                               errp);
 -    if (!bs->file) {
 -        ret = -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
         goto fail;
     }
 diff --git a/block/blkreplay.c b/block/blkreplay.c
 index dcbe780ddb..76a0b8d12a 100644
 --- a/block/blkreplay.c
 +++ b/block/blkreplay.c
@@ -26,11 +26,8 @@ static int blkreplay_open(BlockDriverState *bs, QDict *options, int flags,
     int ret;
     /* Open the image file */
 -    bs->file = bdrv_open_child(NULL, options, "image", bs, &child_of_bds,
 -                               BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
 -                               false, errp);
 -    if (!bs->file) {
 -        ret = -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "image", bs, errp);
 +    if (ret < 0) {
         goto fail;
     }
 diff --git a/block/blkverify.c b/block/blkverify.c
 index d1facf5ba9..920e891684 100644
 --- a/block/blkverify.c
 +++ b/block/blkverify.c
@@ -121,12 +121,9 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags,
     }
     /* Open the raw file */
 -    bs->file = bdrv_open_child(qemu_opt_get(opts, "x-raw"), options, "raw",
 -                               bs, &child_of_bds,
 -                               BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
 -                               false, errp);
 -    if (!bs->file) {
 -        ret = -EINVAL;
 +    ret = bdrv_open_file_child(qemu_opt_get(opts, "x-raw"), options, "raw",
 +                               bs, errp);
 +    if (ret < 0) {
         goto fail;
     }
 diff --git a/block/bochs.c b/block/bochs.c
 index 4d68658087..b2dc06bbfd 100644
 --- a/block/bochs.c
 +++ b/block/bochs.c
@@ -110,10 +110,9 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags,
         return ret;
     }
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_IMAGE, false, errp);
 -    if (!bs->file) {
 -        return -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
 +        return ret;
     }
     ret = bdrv_pread(bs->file, 0, &bochs, sizeof(bochs));
 diff --git a/block/cloop.c b/block/cloop.c
 index b8c6d0eccd..bee87da173 100644
 --- a/block/cloop.c
 +++ b/block/cloop.c
@@ -71,10 +71,9 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags,
         return ret;
     }
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_IMAGE, false, errp);
 -    if (!bs->file) {
 -        return -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
 +        return ret;
     }
     /* read header */
 diff --git a/block/copy-before-write.c b/block/copy-before-write.c
 index c30a5ff8de..8aa2cb6a85 100644
 --- a/block/copy-before-write.c
 +++ b/block/copy-before-write.c
@@ -150,12 +150,11 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
 {
     BDRVCopyBeforeWriteState *s = bs->opaque;
     BdrvDirtyBitmap *copy_bitmap;
 +    int ret;
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
 -                               false, errp);
 -    if (!bs->file) {
 -        return -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
 +        return ret;
     }
     s->target = bdrv_open_child(NULL, options, "target", bs, &child_of_bds,
 diff --git a/block/copy-on-read.c b/block/copy-on-read.c
 index 1fc7fb3333..815ac1d835 100644
 --- a/block/copy-on-read.c
 +++ b/block/copy-on-read.c
@@ -41,12 +41,11 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags,
     BDRVStateCOR *state = bs->opaque;
     /* Find a bottom node name, if any */
     const char *bottom_node = qdict_get_try_str(options, "bottom");
 +    int ret;
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
 -                               false, errp);
 -    if (!bs->file) {
 -        return -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
 +        return ret;
     }
     bs->supported_read_flags = BDRV_REQ_PREFETCH;
 diff --git a/block/crypto.c b/block/crypto.c
 index c8ba4681e2..abfce39230 100644
 --- a/block/crypto.c
 +++ b/block/crypto.c
@@ -260,15 +260,14 @@ static int block_crypto_open_generic(QCryptoBlockFormat format,
 {
     BlockCrypto *crypto = bs->opaque;
     QemuOpts *opts = NULL;
 -    int ret = -EINVAL;
 +    int ret;
     QCryptoBlockOpenOptions *open_opts = NULL;
     unsigned int cflags = 0;
     QDict *cryptoopts = NULL;
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_IMAGE, false, errp);
 -    if (!bs->file) {
 -        return -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
 +        return ret;
     }
     bs->supported_write_flags = BDRV_REQ_FUA &
@@ -276,6 +275,7 @@ static int block_crypto_open_generic(QCryptoBlockFormat format,
     opts = qemu_opts_create(opts_spec, NULL, 0, &error_abort);
     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
 +        ret = -EINVAL;
         goto cleanup;
     }
@@ -284,6 +284,7 @@ static int block_crypto_open_generic(QCryptoBlockFormat format,
     open_opts = block_crypto_open_opts_init(cryptoopts, errp);
     if (!open_opts) {
 +        ret = -EINVAL;
         goto cleanup;
     }
 diff --git a/block/dmg.c b/block/dmg.c
 index 447901fbb8..38c363dd39 100644
 --- a/block/dmg.c
 +++ b/block/dmg.c
@@ -439,10 +439,9 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
         return ret;
     }
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_IMAGE, false, errp);
 -    if (!bs->file) {
 -        return -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
 +        return ret;
     }
     block_module_load_one("dmg-bz2");
 diff --git a/block/filter-compress.c b/block/filter-compress.c
 index d5be538619..305716c86c 100644
 --- a/block/filter-compress.c
 +++ b/block/filter-compress.c
@@ -30,11 +30,9 @@
 static int compress_open(BlockDriverState *bs, QDict *options, int flags,
                          Error **errp)
 {
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
 -                               false, errp);
 -    if (!bs->file) {
 -        return -EINVAL;
 +    int ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
 +        return ret;
     }
     if (!bs->file->bs->drv || !block_driver_can_compress(bs->file->bs->drv)) {
 diff --git a/block/parallels.c b/block/parallels.c
 index 6ebad2a2bb..ed4debd899 100644
 --- a/block/parallels.c
 +++ b/block/parallels.c
@@ -735,10 +735,9 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
     Error *local_err = NULL;
     char *buf;
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_IMAGE, false, errp);
 -    if (!bs->file) {
 -        return -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
 +        return ret;
     }
     ret = bdrv_pread(bs->file, 0, &ph, sizeof(ph));
 diff --git a/block/preallocate.c b/block/preallocate.c
 index 1d4233f730..332408bdc9 100644
 --- a/block/preallocate.c
 +++ b/block/preallocate.c
@@ -134,6 +134,7 @@ static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
                             Error **errp)
 {
     BDRVPreallocateState *s = bs->opaque;
 +    int ret;
     /*
      * s->data_end and friends should be initialized on permission update.
@@ -141,11 +142,9 @@ static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
      */
     s->file_end = s->zero_start = s->data_end = -EINVAL;
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
 -                               false, errp);
 -    if (!bs->file) {
 -        return -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
 +        return ret;
     }
     if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) {
 diff --git a/block/qcow.c b/block/qcow.c
 index c39940f33e..544a17261f 100644
 --- a/block/qcow.c
 +++ b/block/qcow.c
@@ -120,10 +120,8 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
     qdict_extract_subqdict(options, &encryptopts, "encrypt.");
     encryptfmt = qdict_get_try_str(encryptopts, "format");
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_IMAGE, false, errp);
 -    if (!bs->file) {
 -        ret = -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
         goto fail;
     }
 diff --git a/block/qcow2.c b/block/qcow2.c
 index 6ee1919612..29ea157e6b 100644
 --- a/block/qcow2.c
 +++ b/block/qcow2.c
@@ -1907,11 +1907,11 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
         .errp = errp,
         .ret = -EINPROGRESS
     };
 +    int ret;
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_IMAGE, false, errp);
 -    if (!bs->file) {
 -        return -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
 +        return ret;
     }
     /* Initialise locks */
 diff --git a/block/qed.c b/block/qed.c
 index 558d3646c4..e3b06a3d00 100644
 --- a/block/qed.c
 +++ b/block/qed.c
@@ -558,11 +558,11 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
         .errp = errp,
         .ret = -EINPROGRESS
     };
 +    int ret;
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_IMAGE, false, errp);
 -    if (!bs->file) {
 -        return -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
 +        return ret;
     }
     bdrv_qed_init_state(bs);
 diff --git a/block/replication.c b/block/replication.c
 index 55c8f894aa..2f17397764 100644
 --- a/block/replication.c
 +++ b/block/replication.c
@@ -88,11 +88,9 @@ static int replication_open(BlockDriverState *bs, QDict *options,
     const char *mode;
     const char *top_id;
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
 -                               false, errp);
 -    if (!bs->file) {
 -        return -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
 +        return ret;
     }
     ret = -EINVAL;
 diff --git a/block/throttle.c b/block/throttle.c
 index 6e8d52fa24..4fb5798c27 100644
 --- a/block/throttle.c
 +++ b/block/throttle.c
@@ -78,11 +78,9 @@ static int throttle_open(BlockDriverState *bs, QDict *options,
     char *group;
     int ret;
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
 -                               false, errp);
 -    if (!bs->file) {
 -        return -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
 +        return ret;
     }
     bs->supported_write_flags = bs->file->bs->supported_write_flags |
                                 BDRV_REQ_WRITE_UNCHANGED;
 diff --git a/block/vdi.c b/block/vdi.c
 index bdc58d726e..c50c0ed61f 100644
 --- a/block/vdi.c
 +++ b/block/vdi.c
@@ -376,10 +376,9 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
     int ret;
     QemuUUID uuid_link, uuid_parent;
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_IMAGE, false, errp);
 -    if (!bs->file) {
 -        return -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
 +        return ret;
     }
     logout("\n");
 diff --git a/block/vhdx.c b/block/vhdx.c
 index 356ec4c455..e7d6d7509a 100644
 --- a/block/vhdx.c
 +++ b/block/vhdx.c
@@ -996,10 +996,9 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
     uint64_t signature;
     Error *local_err = NULL;
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_IMAGE, false, errp);
 -    if (!bs->file) {
 -        return -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
 +        return ret;
     }
     s->bat = NULL;
 diff --git a/block/vmdk.c b/block/vmdk.c
 index 0dfab6e941..7d7e56b36c 100644
 --- a/block/vmdk.c
 +++ b/block/vmdk.c
@@ -1262,10 +1262,9 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
     BDRVVmdkState *s = bs->opaque;
     uint32_t magic;
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_IMAGE, false, errp);
 -    if (!bs->file) {
 -        return -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
 +        return ret;
     }
     buf = vmdk_read_desc(bs->file, 0, errp);
 diff --git a/block/vpc.c b/block/vpc.c
 index 297a26262a..430cab1cbb 100644
 --- a/block/vpc.c
 +++ b/block/vpc.c
@@ -232,10 +232,9 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
     int ret;
     int64_t bs_size;
 -    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 -                               BDRV_CHILD_IMAGE, false, errp);
 -    if (!bs->file) {
 -        return -EINVAL;
 +    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 +    if (ret < 0) {
 +        return ret;
     }
     opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
 diff --git a/include/block/block.h b/include/block/block.h
 index e5dd22b034..f885f113ef 100644
 --- a/include/block/block.h
 +++ b/include/block/block.h
@@ -376,6 +376,9 @@ BdrvChild *bdrv_open_child(const char *filename,
                            const BdrvChildClass *child_class,
                            BdrvChildRole child_role,
                            bool allow_none, Error **errp);
 +int bdrv_open_file_child(const char *filename,
 +                         QDict *options, const char *bdref_key,
 +                         BlockDriverState *parent, Error **errp);
 BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp);
 int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
                         Error **errp);
 -- 
 2.39.3
--- a/SOURCES/kvm-block-mirror-Do-not-wait-for-active-writes.patch
+++ b/SOURCES/kvm-block-mirror-Do-not-wait-for-active-writes.patch
@ -0,0 +1,153 @@
 From 192f956f2b0761f270070555f8feb1f0544e5558 Mon Sep 17 00:00:00 2001
 From: Hanna Reitz <hreitz@redhat.com>
 Date: Wed, 9 Nov 2022 17:54:48 +0100
 Subject: [PATCH 01/11] block/mirror: Do not wait for active writes
 RH-Author: Hanna Czenczek <hreitz@redhat.com>
 RH-MergeRequest: 246: block/mirror: Make active mirror progress even under full load
 RH-Bugzilla: 2125119
 RH-Acked-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Acked-by: Stefano Garzarella <sgarzare@redhat.com>
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Commit: [1/3] 652d1e55b954f13eaec2c86f58735d4942837e16
 Waiting for all active writes to settle before daring to create a
 background copying operation means that we will never do background
 operations while the guest does anything (in write-blocking mode), and
 therefore cannot converge.  Yes, we also will not diverge, but actually
 converging would be even nicer.
 It is unclear why we decided to wait for all active writes to settle
 before creating a background operation, but it just does not seem
 necessary.  Active writes will put themselves into the in_flight bitmap
 and thus properly block actually conflicting background requests.
 It is important for active requests to wait on overlapping background
 requests, which we do in active_write_prepare().  However, so far it was
 not documented why it is important.  Add such documentation now, and
 also to the other call of mirror_wait_on_conflicts(), so that it becomes
 more clear why and when requests need to actively wait for other
 requests to settle.
 Another thing to note is that of course we need to ensure that there are
 no active requests when the job completes, but that is done by virtue of
 the BDS being drained anyway, so there cannot be any active requests at
 that point.
 With this change, we will need to explicitly keep track of how many
 bytes are in flight in active requests so that
 job_progress_set_remaining() in mirror_run() can set the correct number
 of remaining bytes.
 Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2123297
 Signed-off-by: Hanna Reitz <hreitz@redhat.com>
 Message-Id: <20221109165452.67927-2-hreitz@redhat.com>
 Reviewed-by: Kevin Wolf <kwolf@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 (cherry picked from commit d69a879bdf1aed586478eaa161ee064fe1b92f1a)
 Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
 ---
 block/mirror.c | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)
 diff --git a/block/mirror.c b/block/mirror.c
 index efec2c7674..282f428cb7 100644
 --- a/block/mirror.c
 +++ b/block/mirror.c
@@ -81,6 +81,7 @@ typedef struct MirrorBlockJob {
     int max_iov;
     bool initial_zeroing_ongoing;
     int in_active_write_counter;
 +    int64_t active_write_bytes_in_flight;
     bool prepared;
     bool in_drain;
 } MirrorBlockJob;
@@ -493,6 +494,13 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
     }
     bdrv_dirty_bitmap_unlock(s->dirty_bitmap);
 +    /*
 +     * Wait for concurrent requests to @offset.  The next loop will limit the
 +     * copied area based on in_flight_bitmap so we only copy an area that does
 +     * not overlap with concurrent in-flight requests.  Still, we would like to
 +     * copy something, so wait until there are at least no more requests to the
 +     * very beginning of the area.
 +     */
     mirror_wait_on_conflicts(NULL, s, offset, 1);
     job_pause_point(&s->common.job);
@@ -993,12 +1001,6 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
         int64_t cnt, delta;
         bool should_complete;
 -        /* Do not start passive operations while there are active
 -         * writes in progress */
 -        while (s->in_active_write_counter) {
 -            mirror_wait_for_any_operation(s, true);
 -        }
 -
         if (s->ret < 0) {
             ret = s->ret;
             goto immediate_exit;
@@ -1015,7 +1017,9 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
         /* cnt is the number of dirty bytes remaining and s->bytes_in_flight is
          * the number of bytes currently being processed; together those are
          * the current remaining operation length */
 -        job_progress_set_remaining(&s->common.job, s->bytes_in_flight + cnt);
 +        job_progress_set_remaining(&s->common.job,
 +                                   s->bytes_in_flight + cnt +
 +                                   s->active_write_bytes_in_flight);
         /* Note that even when no rate limit is applied we need to yield
          * periodically with no pending I/O so that bdrv_drain_all() returns.
@@ -1073,6 +1077,10 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
             s->in_drain = true;
             bdrv_drained_begin(bs);
 +
 +            /* Must be zero because we are drained */
 +            assert(s->in_active_write_counter == 0);
 +
             cnt = bdrv_get_dirty_count(s->dirty_bitmap);
             if (cnt > 0 || mirror_flush(s) < 0) {
                 bdrv_drained_end(bs);
@@ -1306,6 +1314,7 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
     }
     job_progress_increase_remaining(&job->common.job, bytes);
 +    job->active_write_bytes_in_flight += bytes;
     switch (method) {
     case MIRROR_METHOD_COPY:
@@ -1327,6 +1336,7 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
         abort();
     }
 +    job->active_write_bytes_in_flight -= bytes;
     if (ret >= 0) {
         job_progress_update(&job->common.job, bytes);
     } else {
@@ -1375,6 +1385,19 @@ static MirrorOp *coroutine_fn active_write_prepare(MirrorBlockJob *s,
     s->in_active_write_counter++;
 +    /*
 +     * Wait for concurrent requests affecting the area.  If there are already
 +     * running requests that are copying off now-to-be stale data in the area,
 +     * we must wait for them to finish before we begin writing fresh data to the
 +     * target so that the write operations appear in the correct order.
 +     * Note that background requests (see mirror_iteration()) in contrast only
 +     * wait for conflicting requests at the start of the dirty area, and then
 +     * (based on the in_flight_bitmap) truncate the area to copy so it will not
 +     * conflict with any requests beyond that.  For active writes, however, we
 +     * cannot truncate that area.  The request from our parent must be blocked
 +     * until the area is copied in full.  Therefore, we must wait for the whole
 +     * area to become free of concurrent requests.
 +     */
     mirror_wait_on_conflicts(op, s, offset, bytes);
     bitmap_set(s->in_flight_bitmap, start_chunk, end_chunk - start_chunk);
 -- 
 2.37.3
--- a/SOURCES/kvm-block-mirror-Drop-mirror_wait_for_any_operation.patch
+++ b/SOURCES/kvm-block-mirror-Drop-mirror_wait_for_any_operation.patch
@ -0,0 +1,76 @@
 From 57c79ed20cb73aa9aa4dd7487379b85ea3f936f6 Mon Sep 17 00:00:00 2001
 From: Hanna Reitz <hreitz@redhat.com>
 Date: Wed, 9 Nov 2022 17:54:49 +0100
 Subject: [PATCH 02/11] block/mirror: Drop mirror_wait_for_any_operation()
 RH-Author: Hanna Czenczek <hreitz@redhat.com>
 RH-MergeRequest: 246: block/mirror: Make active mirror progress even under full load
 RH-Bugzilla: 2125119
 RH-Acked-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Acked-by: Stefano Garzarella <sgarzare@redhat.com>
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Commit: [2/3] dec37883bcc491441ae08d9592d1ec26a47765c0
 mirror_wait_for_free_in_flight_slot() is the only remaining user of
 mirror_wait_for_any_operation(), so inline the latter into the former.
 Signed-off-by: Hanna Reitz <hreitz@redhat.com>
 Message-Id: <20221109165452.67927-3-hreitz@redhat.com>
 Reviewed-by: Kevin Wolf <kwolf@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 (cherry picked from commit eb994912993077f178ccb43b20e422ecf9ae4ac7)
 Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
 ---
 block/mirror.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)
 diff --git a/block/mirror.c b/block/mirror.c
 index 282f428cb7..6b02555ad7 100644
 --- a/block/mirror.c
 +++ b/block/mirror.c
@@ -304,19 +304,21 @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset,
 }
 static inline void coroutine_fn
 -mirror_wait_for_any_operation(MirrorBlockJob *s, bool active)
 +mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s)
 {
     MirrorOp *op;
     QTAILQ_FOREACH(op, &s->ops_in_flight, next) {
 -        /* Do not wait on pseudo ops, because it may in turn wait on
 +        /*
 +         * Do not wait on pseudo ops, because it may in turn wait on
          * some other operation to start, which may in fact be the
          * caller of this function.  Since there is only one pseudo op
          * at any given time, we will always find some real operation
 -         * to wait on. */
 -        if (!op->is_pseudo_op && op->is_in_flight &&
 -            op->is_active_write == active)
 -        {
 +         * to wait on.
 +         * Also, do not wait on active operations, because they do not
 +         * use up in-flight slots.
 +         */
 +        if (!op->is_pseudo_op && op->is_in_flight && !op->is_active_write) {
             qemu_co_queue_wait(&op->waiting_requests, NULL);
             return;
         }
@@ -324,13 +326,6 @@ mirror_wait_for_any_operation(MirrorBlockJob *s, bool active)
     abort();
 }
 -static inline void coroutine_fn
 -mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s)
 -{
 -    /* Only non-active operations use up in-flight slots */
 -    mirror_wait_for_any_operation(s, false);
 -}
 -
 /* Perform a mirror copy operation.
  *
  * *op->bytes_handled is set to the number of bytes copied after and
 -- 
 2.37.3
--- a/SOURCES/kvm-block-mirror-Fix-NULL-s-job-in-active-writes.patch
+++ b/SOURCES/kvm-block-mirror-Fix-NULL-s-job-in-active-writes.patch
@ -0,0 +1,75 @@
 From b1f5aa5a342a25dc558ee9d435fed0643fe5155f Mon Sep 17 00:00:00 2001
 From: Hanna Reitz <hreitz@redhat.com>
 Date: Wed, 9 Nov 2022 17:54:50 +0100
 Subject: [PATCH 03/11] block/mirror: Fix NULL s->job in active writes
 RH-Author: Hanna Czenczek <hreitz@redhat.com>
 RH-MergeRequest: 246: block/mirror: Make active mirror progress even under full load
 RH-Bugzilla: 2125119
 RH-Acked-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Acked-by: Stefano Garzarella <sgarzare@redhat.com>
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Commit: [3/3] 49d7ebd15667151a6e14228a8260cfdd0aa27a78
 There is a small gap in mirror_start_job() before putting the mirror
 filter node into the block graph (bdrv_append() call) and the actual job
 being created.  Before the job is created, MirrorBDSOpaque.job is NULL.
 It is possible that requests come in when bdrv_drained_end() is called,
 and those requests would see MirrorBDSOpaque.job == NULL.  Have our
 filter node handle that case gracefully.
 Signed-off-by: Hanna Reitz <hreitz@redhat.com>
 Message-Id: <20221109165452.67927-4-hreitz@redhat.com>
 Reviewed-by: Kevin Wolf <kwolf@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 (cherry picked from commit da93d5c84e56e6b4e84aa8e98b6b984c9b6bb528)
 Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
 ---
 block/mirror.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)
 diff --git a/block/mirror.c b/block/mirror.c
 index 6b02555ad7..50289fca49 100644
 --- a/block/mirror.c
 +++ b/block/mirror.c
@@ -1438,11 +1438,13 @@ static int coroutine_fn bdrv_mirror_top_do_write(BlockDriverState *bs,
     MirrorOp *op = NULL;
     MirrorBDSOpaque *s = bs->opaque;
     int ret = 0;
 -    bool copy_to_target;
 +    bool copy_to_target = false;
 -    copy_to_target = s->job->ret >= 0 &&
 -                     !job_is_cancelled(&s->job->common.job) &&
 -                     s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING;
 +    if (s->job) {
 +        copy_to_target = s->job->ret >= 0 &&
 +                         !job_is_cancelled(&s->job->common.job) &&
 +                         s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING;
 +    }
     if (copy_to_target) {
         op = active_write_prepare(s->job, offset, bytes);
@@ -1487,11 +1489,13 @@ static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs,
     QEMUIOVector bounce_qiov;
     void *bounce_buf;
     int ret = 0;
 -    bool copy_to_target;
 +    bool copy_to_target = false;
 -    copy_to_target = s->job->ret >= 0 &&
 -                     !job_is_cancelled(&s->job->common.job) &&
 -                     s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING;
 +    if (s->job) {
 +        copy_to_target = s->job->ret >= 0 &&
 +                         !job_is_cancelled(&s->job->common.job) &&
 +                         s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING;
 +    }
     if (copy_to_target) {
         /* The guest might concurrently modify the data to write; but
 -- 
 2.37.3
--- a/SOURCES/kvm-block-move-bdrv_qiov_is_aligned-to-file-posix.patch
+++ b/SOURCES/kvm-block-move-bdrv_qiov_is_aligned-to-file-posix.patch
@ -0,0 +1,104 @@
 From 636e32b4c570ddb20266b6672311174353644f0e Mon Sep 17 00:00:00 2001
 From: Keith Busch <kbusch@kernel.org>
 Date: Thu, 29 Sep 2022 13:05:22 -0700
 Subject: [PATCH 1/2] block: move bdrv_qiov_is_aligned to file-posix
 RH-Author: Kevin Wolf <kwolf@redhat.com>
 RH-MergeRequest: 411: block: Fix iov_len check in bdrv_qiov_is_aligned()
 RH-Jira: RHEL-60553
 RH-Acked-by: Eric Blake <eblake@redhat.com>
 RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
 RH-Commit: [1/2] 682c1b81b42959d9d91e0f68cd70e9753e53a279
 There is only one user of bdrv_qiov_is_aligned(), so move the alignment
 function there and make it static.
 Signed-off-by: Keith Busch <kbusch@kernel.org>
 Message-Id: <20220929200523.3218710-2-kbusch@meta.com>
 Reviewed-by: Kevin Wolf <kwolf@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 (cherry picked from commit a7c5f67a78569f8c275ea4ea9962e9c79b9d03cb)
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
 block/file-posix.c    | 20 ++++++++++++++++++++
 block/io.c            | 20 --------------------
 include/block/block.h |  1 -
 3 files changed, 20 insertions(+), 21 deletions(-)
 diff --git a/block/file-posix.c b/block/file-posix.c
 index b283093e5b..b404e1544f 100644
 --- a/block/file-posix.c
 +++ b/block/file-posix.c
@@ -2051,6 +2051,26 @@ static int coroutine_fn raw_thread_pool_submit(BlockDriverState *bs,
     return thread_pool_submit_co(pool, func, arg);
 }
 +/*
 + * Check if all memory in this vector is sector aligned.
 + */
 +static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
 +{
 +    int i;
 +    size_t alignment = bdrv_min_mem_align(bs);
 +
 +    for (i = 0; i < qiov->niov; i++) {
 +        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
 +            return false;
 +        }
 +        if (qiov->iov[i].iov_len % alignment) {
 +            return false;
 +        }
 +    }
 +
 +    return true;
 +}
 +
 static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
                                    uint64_t bytes, QEMUIOVector *qiov, int type)
 {
 diff --git a/block/io.c b/block/io.c
 index 8ae57728a6..639e171eff 100644
 --- a/block/io.c
 +++ b/block/io.c
@@ -3375,26 +3375,6 @@ void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
     return mem;
 }
 -/*
 - * Check if all memory in this vector is sector aligned.
 - */
 -bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
 -{
 -    int i;
 -    size_t alignment = bdrv_min_mem_align(bs);
 -
 -    for (i = 0; i < qiov->niov; i++) {
 -        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
 -            return false;
 -        }
 -        if (qiov->iov[i].iov_len % alignment) {
 -            return false;
 -        }
 -    }
 -
 -    return true;
 -}
 -
 void bdrv_io_plug(BlockDriverState *bs)
 {
     BdrvChild *child;
 diff --git a/include/block/block.h b/include/block/block.h
 index f885f113ef..09b374b496 100644
 --- a/include/block/block.h
 +++ b/include/block/block.h
@@ -622,7 +622,6 @@ void *qemu_blockalign(BlockDriverState *bs, size_t size);
 void *qemu_blockalign0(BlockDriverState *bs, size_t size);
 void *qemu_try_blockalign(BlockDriverState *bs, size_t size);
 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size);
 -bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);
 void bdrv_enable_copy_on_read(BlockDriverState *bs);
 void bdrv_disable_copy_on_read(BlockDriverState *bs);
 -- 
 2.45.2
--- a/SOURCES/kvm-block-use-the-request-length-for-iov-alignment.patch
+++ b/SOURCES/kvm-block-use-the-request-length-for-iov-alignment.patch
@ -0,0 +1,48 @@
 From 9009b674a01dc0cd92c319c87714b5aca6e639f8 Mon Sep 17 00:00:00 2001
 From: Keith Busch <kbusch@kernel.org>
 Date: Thu, 29 Sep 2022 13:05:23 -0700
 Subject: [PATCH 2/2] block: use the request length for iov alignment
 RH-Author: Kevin Wolf <kwolf@redhat.com>
 RH-MergeRequest: 411: block: Fix iov_len check in bdrv_qiov_is_aligned()
 RH-Jira: RHEL-60553
 RH-Acked-by: Eric Blake <eblake@redhat.com>
 RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
 RH-Commit: [2/2] 0e01d51cfb21ca43283626c2367e5c5d0d531736
 An iov length needs to be aligned to the logical block size, which may
 be larger than the memory alignment.
 Tested-by: Jens Axboe <axboe@kernel.dk>
 Signed-off-by: Keith Busch <kbusch@kernel.org>
 Message-Id: <20220929200523.3218710-3-kbusch@meta.com>
 Reviewed-by: Kevin Wolf <kwolf@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 (cherry picked from commit 25474d90aa50bd32e0de395a33d8de42dd6f2aef)
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
 block/file-posix.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
 diff --git a/block/file-posix.c b/block/file-posix.c
 index b404e1544f..b84c5725cc 100644
 --- a/block/file-posix.c
 +++ b/block/file-posix.c
@@ -2058,12 +2058,13 @@ static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
 {
     int i;
     size_t alignment = bdrv_min_mem_align(bs);
 +    size_t len = bs->bl.request_alignment;
     for (i = 0; i < qiov->niov; i++) {
         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
             return false;
         }
 -        if (qiov->iov[i].iov_len % alignment) {
 +        if (qiov->iov[i].iov_len % len) {
             return false;
         }
     }
 -- 
 2.45.2
--- a/SOURCES/kvm-checkpatch-add-qemu_bh_new-aio_bh_new-checks.patch
+++ b/SOURCES/kvm-checkpatch-add-qemu_bh_new-aio_bh_new-checks.patch
@ -0,0 +1,56 @@
 From 866a3b56f6a2d43f3cf7b3313fb41808bc5e6e1f Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Tue, 9 May 2023 10:29:03 -0400
 Subject: [PATCH 03/15] checkpatch: add qemu_bh_new/aio_bh_new checks
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 277: memory: prevent dma-reentracy issues
 RH-Bugzilla: 1999236
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [3/12] 620b480b0878c18223f3cc103450bc16aa6d7e21 (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1999236
 Upstream: Merged
 CVE: CVE-2021-3750
 commit ef56ffbdd6b0605dc1e305611287b948c970e236
 Author: Alexander Bulekov <alxndr@bu.edu>
 Date:   Thu Apr 27 17:10:08 2023 -0400
    checkpatch: add qemu_bh_new/aio_bh_new checks
    Advise authors to use the _guarded versions of the APIs, instead.
    Signed-off-by: Alexander Bulekov <alxndr@bu.edu>
    Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
    Message-Id: <20230427211013.2994127-4-alxndr@bu.edu>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 scripts/checkpatch.pl | 8 ++++++++
 1 file changed, 8 insertions(+)
 diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
 index cb8eff233e..b2428e80cc 100755
 --- a/scripts/checkpatch.pl
 +++ b/scripts/checkpatch.pl
@@ -2858,6 +2858,14 @@ sub process {
 		if ($line =~ /\bsignal\s*\(/ && !($line =~ /SIG_(?:IGN|DFL)/)) {
 			ERROR("use sigaction to establish signal handlers; signal is not portable\n" . $herecurr);
 		}
 +# recommend qemu_bh_new_guarded instead of qemu_bh_new
 +        if ($realfile =~ /.*\/hw\/.*/ && $line =~ /\bqemu_bh_new\s*\(/) {
 +			ERROR("use qemu_bh_new_guarded() instead of qemu_bh_new() to avoid reentrancy problems\n" . $herecurr);
 +		}
 +# recommend aio_bh_new_guarded instead of aio_bh_new
 +        if ($realfile =~ /.*\/hw\/.*/ && $line =~ /\baio_bh_new\s*\(/) {
 +			ERROR("use aio_bh_new_guarded() instead of aio_bh_new() to avoid reentrancy problems\n" . $herecurr);
 +		}
 # check for module_init(), use category-specific init macros explicitly please
 		if ($line =~ /^module_init\s*\(/) {
 			ERROR("please use block_init(), type_init() etc. instead of module_init()\n" . $herecurr);
 -- 
 2.37.3
--- a/SOURCES/kvm-dma-helpers-prevent-dma_blk_cb-vs-dma_aio_cancel-rac.patch
+++ b/SOURCES/kvm-dma-helpers-prevent-dma_blk_cb-vs-dma_aio_cancel-rac.patch
@ -0,0 +1,127 @@
 From 103608465b8bd2edf7f9aaef5c3c93309ccf9ec2 Mon Sep 17 00:00:00 2001
 From: Stefan Hajnoczi <stefanha@redhat.com>
 Date: Tue, 21 Feb 2023 16:22:17 -0500
 Subject: [PATCH 12/13] dma-helpers: prevent dma_blk_cb() vs dma_aio_cancel()
 race
 RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
 RH-MergeRequest: 264: scsi: protect req->aiocb with AioContext lock
 RH-Bugzilla: 2090990
 RH-Acked-by: Stefano Garzarella <sgarzare@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Commit: [2/3] 14f5835093ba8c5111f3ada2fe87730371aca733
 dma_blk_cb() only takes the AioContext lock around ->io_func(). That
 means the rest of dma_blk_cb() is not protected. In particular, the
 DMAAIOCB field accesses happen outside the lock.
 There is a race when the main loop thread holds the AioContext lock and
 invokes scsi_device_purge_requests() -> bdrv_aio_cancel() ->
 dma_aio_cancel() while an IOThread executes dma_blk_cb(). The dbs->acb
 field determines how cancellation proceeds. If dma_aio_cancel() sees
 dbs->acb == NULL while dma_blk_cb() is still running, the request can be
 completed twice (-ECANCELED and the actual return value).
 The following assertion can occur with virtio-scsi when an IOThread is
 used:
  ../hw/scsi/scsi-disk.c:368: scsi_dma_complete: Assertion `r->req.aiocb != NULL' failed.
 Fix the race by holding the AioContext across dma_blk_cb(). Now
 dma_aio_cancel() under the AioContext lock will not see
 inconsistent/intermediate states.
 Cc: Paolo Bonzini <pbonzini@redhat.com>
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 Message-Id: <20230221212218.1378734-3-stefanha@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 (cherry picked from commit abfcd2760b3e70727bbc0792221b8b98a733dc32)
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
 hw/scsi/scsi-disk.c   |  4 +---
 softmmu/dma-helpers.c | 12 +++++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)
 diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
 index 179ce22c4a..c8109a673e 100644
 --- a/hw/scsi/scsi-disk.c
 +++ b/hw/scsi/scsi-disk.c
@@ -351,13 +351,12 @@ done:
     scsi_req_unref(&r->req);
 }
 +/* Called with AioContext lock held */
 static void scsi_dma_complete(void *opaque, int ret)
 {
     SCSIDiskReq *r = (SCSIDiskReq *)opaque;
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
 -    aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
 -
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
@@ -367,7 +366,6 @@ static void scsi_dma_complete(void *opaque, int ret)
         block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
     }
     scsi_dma_complete_noio(r, ret);
 -    aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
 }
 static void scsi_read_complete_noio(SCSIDiskReq *r, int ret)
 diff --git a/softmmu/dma-helpers.c b/softmmu/dma-helpers.c
 index 7d766a5e89..42af18719a 100644
 --- a/softmmu/dma-helpers.c
 +++ b/softmmu/dma-helpers.c
@@ -127,17 +127,19 @@ static void dma_complete(DMAAIOCB *dbs, int ret)
 static void dma_blk_cb(void *opaque, int ret)
 {
     DMAAIOCB *dbs = (DMAAIOCB *)opaque;
 +    AioContext *ctx = dbs->ctx;
     dma_addr_t cur_addr, cur_len;
     void *mem;
     trace_dma_blk_cb(dbs, ret);
 +    aio_context_acquire(ctx);
     dbs->acb = NULL;
     dbs->offset += dbs->iov.size;
     if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) {
         dma_complete(dbs, ret);
 -        return;
 +        goto out;
     }
     dma_blk_unmap(dbs);
@@ -177,9 +179,9 @@ static void dma_blk_cb(void *opaque, int ret)
     if (dbs->iov.size == 0) {
         trace_dma_map_wait(dbs);
 -        dbs->bh = aio_bh_new(dbs->ctx, reschedule_dma, dbs);
 +        dbs->bh = aio_bh_new(ctx, reschedule_dma, dbs);
         cpu_register_map_client(dbs->bh);
 -        return;
 +        goto out;
     }
     if (!QEMU_IS_ALIGNED(dbs->iov.size, dbs->align)) {
@@ -187,11 +189,11 @@ static void dma_blk_cb(void *opaque, int ret)
                                 QEMU_ALIGN_DOWN(dbs->iov.size, dbs->align));
     }
 -    aio_context_acquire(dbs->ctx);
     dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
                             dma_blk_cb, dbs, dbs->io_func_opaque);
 -    aio_context_release(dbs->ctx);
     assert(dbs->acb);
 +out:
 +    aio_context_release(ctx);
 }
 static void dma_aio_cancel(BlockAIOCB *acb)
 -- 
 2.37.3
--- a/SOURCES/kvm-dump-Add-arch-cleanup-function.patch
+++ b/SOURCES/kvm-dump-Add-arch-cleanup-function.patch
@ -0,0 +1,69 @@
 From 837e09b1a8a38b53488f59aad090fbe6bb94e257 Mon Sep 17 00:00:00 2001
 From: Thomas Huth <thuth@redhat.com>
 Date: Fri, 17 Nov 2023 11:32:37 +0100
 Subject: [PATCH 2/3] dump: Add arch cleanup function
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Thomas Huth <thuth@redhat.com>
 RH-MergeRequest: 323: Fix problem that secure execution guest might remain in "paused" state after failed dump
 RH-Jira: RHEL-16696
 RH-Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
 RH-Acked-by: Cédric Le Goater <clg@redhat.com>
 RH-Commit: [2/3] b70f406dec88ffd4877f3d5d580fc8f821bdb252
 JIRA: https://issues.redhat.com/browse/RHEL-16696
 commit e72629e5149aba6f44122ea6d2a803ef136a0c6b
 Author: Janosch Frank <frankja@linux.ibm.com>
 Date:   Thu Nov 9 12:04:42 2023 +0000
    dump: Add arch cleanup function
    Some architectures (s390x) need to cleanup after a failed dump to be
    able to continue to run the vm. Add a cleanup function pointer and
    call it if it's set.
    Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
    Reviewed-by: Thomas Huth <thuth@redhat.com>
    Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
    Message-ID: <20231109120443.185979-3-frankja@linux.ibm.com>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Thomas Huth <thuth@redhat.com>
 ---
 dump/dump.c                | 4 ++++
 include/sysemu/dump-arch.h | 1 +
 2 files changed, 5 insertions(+)
 diff --git a/dump/dump.c b/dump/dump.c
 index 5dee060b73..93edb89547 100644
 --- a/dump/dump.c
 +++ b/dump/dump.c
@@ -100,6 +100,10 @@ uint64_t cpu_to_dump64(DumpState *s, uint64_t val)
 static int dump_cleanup(DumpState *s)
 {
 +    if (s->dump_info.arch_cleanup_fn) {
 +        s->dump_info.arch_cleanup_fn(s);
 +    }
 +
     guest_phys_blocks_free(&s->guest_phys_blocks);
     memory_mapping_list_free(&s->list);
     close(s->fd);
 diff --git a/include/sysemu/dump-arch.h b/include/sysemu/dump-arch.h
 index 59bbc9be38..743916e46c 100644
 --- a/include/sysemu/dump-arch.h
 +++ b/include/sysemu/dump-arch.h
@@ -24,6 +24,7 @@ typedef struct ArchDumpInfo {
     void (*arch_sections_add_fn)(DumpState *s);
     uint64_t (*arch_sections_write_hdr_fn)(DumpState *s, uint8_t *buff);
     int (*arch_sections_write_fn)(DumpState *s, uint8_t *buff);
 +    void (*arch_cleanup_fn)(DumpState *s);
 } ArchDumpInfo;
 struct GuestPhysBlockList; /* memory_mapping.h */
 -- 
 2.39.3
--- a/SOURCES/kvm-edu-add-smp_mb__after_rmw.patch
+++ b/SOURCES/kvm-edu-add-smp_mb__after_rmw.patch
@ -0,0 +1,61 @@
 From 7693449b235bbab6d32a1b87fa1d0e101c786f3b Mon Sep 17 00:00:00 2001
 From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 Date: Thu, 9 Mar 2023 08:11:14 -0500
 Subject: [PATCH 05/13] edu: add smp_mb__after_rmw()
 RH-Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-MergeRequest: 263: qatomic: add smp_mb__before/after_rmw()
 RH-Bugzilla: 2168472
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Acked-by: Eric Auger <eric.auger@redhat.com>
 RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
 RH-Acked-by: David Hildenbrand <david@redhat.com>
 RH-Commit: [5/10] 300901290e08b253b1278eedc39cd07c1e202b96
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2168472
 commit 2482aeea4195ad84cf3d4e5b15b28ec5b420ed5a
 Author: Paolo Bonzini <pbonzini@redhat.com>
 Date:   Thu Mar 2 11:16:13 2023 +0100
    edu: add smp_mb__after_rmw()
    Ensure ordering between clearing the COMPUTING flag and checking
    IRQFACT, and between setting the IRQFACT flag and checking
    COMPUTING.  This ensures that no wakeups are lost.
    Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
    Reviewed-by: David Hildenbrand <david@redhat.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 ---
 hw/misc/edu.c | 5 +++++
 1 file changed, 5 insertions(+)
 diff --git a/hw/misc/edu.c b/hw/misc/edu.c
 index e935c418d4..a1f8bc77e7 100644
 --- a/hw/misc/edu.c
 +++ b/hw/misc/edu.c
@@ -267,6 +267,8 @@ static void edu_mmio_write(void *opaque, hwaddr addr, uint64_t val,
     case 0x20:
         if (val & EDU_STATUS_IRQFACT) {
             qatomic_or(&edu->status, EDU_STATUS_IRQFACT);
 +            /* Order check of the COMPUTING flag after setting IRQFACT.  */
 +            smp_mb__after_rmw();
         } else {
             qatomic_and(&edu->status, ~EDU_STATUS_IRQFACT);
         }
@@ -349,6 +351,9 @@ static void *edu_fact_thread(void *opaque)
         qemu_mutex_unlock(&edu->thr_mutex);
         qatomic_and(&edu->status, ~EDU_STATUS_COMPUTING);
 +        /* Clear COMPUTING flag before checking IRQFACT.  */
 +        smp_mb__after_rmw();
 +
         if (qatomic_read(&edu->status) & EDU_STATUS_IRQFACT) {
             qemu_mutex_lock_iothread();
             edu_raise_irq(edu, FACT_IRQ);
 -- 
 2.37.3
--- a/SOURCES/kvm-glib-compat-Introduce-g_memdup2-wrapper.patch
+++ b/SOURCES/kvm-glib-compat-Introduce-g_memdup2-wrapper.patch
@ -0,0 +1,105 @@
 From 939c75ab92ac608893cad0e46f55527950518a57 Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Tue, 5 Mar 2024 11:36:15 -0500
 Subject: [PATCH 1/3] glib-compat: Introduce g_memdup2() wrapper
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 353: ui/clipboard: mark type as not available when there is no data
 RH-Jira: RHEL-19628
 RH-Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
 RH-Acked-by: Gerd Hoffmann <None>
 RH-Commit: [1/2] f401c63303ef558bfcbb36e4c8fcc8bf2b1c3eb4 (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 JIRA: https://issues.redhat.com/browse/RHEL-19628
 CVE: CVE-2023-6683
 Upstream: Merged
 commit 2c674fada72079583a3f2cc1790b16a0259c4fa0
 Author: Philippe Mathieu-Daudé <philmd@linaro.org>
 Date:   Fri Sep 3 19:44:44 2021 +0200
    glib-compat: Introduce g_memdup2() wrapper
    When experimenting raising GLIB_VERSION_MIN_REQUIRED to 2.68
    (Fedora 34 provides GLib 2.68.1) we get:
      hw/virtio/virtio-crypto.c:245:24: error: 'g_memdup' is deprecated: Use 'g_memdup2' instead [-Werror,-Wdeprecated-declarations]
      ...
    g_memdup() has been updated by g_memdup2() to fix eventual security
    issues (size argument is 32-bit and could be truncated / wrapping).
    GLib recommends to copy their static inline version of g_memdup2():
    https://discourse.gnome.org/t/port-your-module-from-g-memdup-to-g-memdup2-now/5538
    Our glib-compat.h provides a comment explaining how to deal with
    these deprecated declarations (see commit e71e8cc0355
    "glib: enforce the minimum required version and warn about old APIs").
    Following this comment suggestion, implement the g_memdup2_qemu()
    wrapper to g_memdup2(), and use the safer equivalent inlined when
    we are using pre-2.68 GLib.
    Reported-by: Eric Blake <eblake@redhat.com>
    Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com>
    Reviewed-by: Eric Blake <eblake@redhat.com>
    Message-Id: <20210903174510.751630-3-philmd@redhat.com>
    Signed-off-by: Laurent Vivier <laurent@vivier.eu>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 include/glib-compat.h | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 diff --git a/include/glib-compat.h b/include/glib-compat.h
 index 9e95c888f5..8d01a8c01f 100644
 --- a/include/glib-compat.h
 +++ b/include/glib-compat.h
@@ -68,6 +68,43 @@
  * without generating warnings.
  */
 +/*
 + * g_memdup2_qemu:
 + * @mem: (nullable): the memory to copy.
 + * @byte_size: the number of bytes to copy.
 + *
 + * Allocates @byte_size bytes of memory, and copies @byte_size bytes into it
 + * from @mem. If @mem is %NULL it returns %NULL.
 + *
 + * This replaces g_memdup(), which was prone to integer overflows when
 + * converting the argument from a #gsize to a #guint.
 + *
 + * This static inline version is a backport of the new public API from
 + * GLib 2.68, kept internal to GLib for backport to older stable releases.
 + * See https://gitlab.gnome.org/GNOME/glib/-/issues/2319.
 + *
 + * Returns: (nullable): a pointer to the newly-allocated copy of the memory,
 + *          or %NULL if @mem is %NULL.
 + */
 +static inline gpointer g_memdup2_qemu(gconstpointer mem, gsize byte_size)
 +{
 +#if GLIB_CHECK_VERSION(2, 68, 0)
 +    return g_memdup2(mem, byte_size);
 +#else
 +    gpointer new_mem;
 +
 +    if (mem && byte_size != 0) {
 +        new_mem = g_malloc(byte_size);
 +        memcpy(new_mem, mem, byte_size);
 +    } else {
 +        new_mem = NULL;
 +    }
 +
 +    return new_mem;
 +#endif
 +}
 +#define g_memdup2(m, s) g_memdup2_qemu(m, s)
 +
 #if defined(G_OS_UNIX)
 /*
  * Note: The fallback implementation is not MT-safe, and it returns a copy of
 -- 
 2.41.0
--- a/SOURCES/kvm-hw-arm-virt-Do-not-load-efi-virtio.rom-for-all-virti.patch
+++ b/SOURCES/kvm-hw-arm-virt-Do-not-load-efi-virtio.rom-for-all-virti.patch
@ -0,0 +1,119 @@
 From 4f6f881de10e31cac4636d5fde4b7ed4c8affadb Mon Sep 17 00:00:00 2001
 From: Eric Auger <eric.auger@redhat.com>
 Date: Thu, 4 Jan 2024 12:02:31 +0100
 Subject: [PATCH 3/3] hw/arm/virt: Do not load efi-virtio.rom for all
 virtio-net-pci variants
 RH-Author: Eric Auger <eric.auger@redhat.com>
 RH-MergeRequest: 344: hw/arm/virt: Do not load efi-virtio.rom for any virtio-net-pci variants
 RH-Jira: RHEL-14870
 RH-Acked-by: Gerd Hoffmann <None>
 RH-Acked-by: Sebastian Ott <None>
 RH-Commit: [1/1] ffeaa78ad0a1cff5b49009dfb32d25e5cadc0e05
 Upstream: RHEL-only
 Brew: http://brewweb.engineering.redhat.com/brew/taskinfo?taskID=5785640
 Currently arm_rhel_compat just sets the romfile to "" for
 virtio-net-pci and not for transitional and non transitional
 variants. However, on aarch64 RHEL, efi-virtio.rom is not
 shipped so transitional and non-transitional variants cannot
 be used and the following error is observed:
 "Could not open option rom 'efi-virtio.rom': No such file or directory"
 In practice, we do not need any rom file for those virtio-net-pci
 variants either because edk2 already brings the full functionality.
 So let's change the applied compat to cover all the variants. While
 at it also change the way arm_rhel_compat is applied. Instead of
 applying it from the latest _virt_options(), which is error prone
 when upgrading the machine type, let's apply it before calling
 *virt_options in the non abstract machine class. That way the setting
 will apply to any machine type without any need to add it in any
 future machine types.
 We don't really care about keeping non void romfiles for transitional and
 non transitional devices on previous machine types because this
 was not working anyway.
 Signed-off-by: Eric Auger <eric.auger@redhat.com>
 ---
 hw/arm/virt.c | 42 ++++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 14 deletions(-)
 diff --git a/hw/arm/virt.c b/hw/arm/virt.c
 index dbf0a6d62f..46c72a9611 100644
 --- a/hw/arm/virt.c
 +++ b/hw/arm/virt.c
@@ -108,11 +108,39 @@
     DEFINE_VIRT_MACHINE_LATEST(major, minor, false)
 #endif /* disabled for RHEL */
 +/*
 + * This variable is for changes to properties that are RHEL specific,
 + * different to the current upstream and to be applied to the latest
 + * machine type. They may be overridden by older machine compats.
 + *
 + * virtio-net-pci variant romfiles are not needed because edk2 does
 + * fully support the pxe boot. Besides virtio romfiles are not shipped
 + * on rhel/aarch64.
 + */
 +GlobalProperty arm_rhel_compat[] = {
 +    {"virtio-net-pci", "romfile", "" },
 +    {"virtio-net-pci-transitional", "romfile", "" },
 +    {"virtio-net-pci-non-transitional", "romfile", "" },
 +};
 +const size_t arm_rhel_compat_len = G_N_ELEMENTS(arm_rhel_compat);
 +
 +/*
 + * This cannot be called from the rhel_virt_class_init() because
 + * TYPE_RHEL_MACHINE is abstract and mc->compat_props g_ptr_array_new()
 + * only is called on virt-rhelm.n.s non abstract class init.
 + */
 +static void arm_rhel_compat_set(MachineClass *mc)
 +{
 +    compat_props_add(mc->compat_props, arm_rhel_compat,
 +                     arm_rhel_compat_len);
 +}
 +
 #define DEFINE_RHEL_MACHINE_LATEST(m, n, s, latest)                     \
     static void rhel##m##n##s##_virt_class_init(ObjectClass *oc,        \
                                                 void *data)             \
     {                                                                   \
         MachineClass *mc = MACHINE_CLASS(oc);                           \
 +        arm_rhel_compat_set(mc);                                        \
         rhel##m##n##s##_virt_options(mc);                               \
         mc->desc = "RHEL " # m "." # n "." # s " ARM Virtual Machine";  \
         if (latest) {                                                   \
@@ -136,19 +164,6 @@
 #define DEFINE_RHEL_MACHINE(major, minor, subminor)             \
     DEFINE_RHEL_MACHINE_LATEST(major, minor, subminor, false)
 -/* This variable is for changes to properties that are RHEL specific,
 - * different to the current upstream and to be applied to the latest
 - * machine type.
 - */
 -GlobalProperty arm_rhel_compat[] = {
 -    {
 -        .driver   = "virtio-net-pci",
 -        .property = "romfile",
 -        .value    = "",
 -    },
 -};
 -const size_t arm_rhel_compat_len = G_N_ELEMENTS(arm_rhel_compat);
 -
 /* Number of external interrupt lines to configure the GIC with */
 #define NUM_IRQS 256
@@ -3240,7 +3255,6 @@ type_init(rhel_machine_init);
 static void rhel860_virt_options(MachineClass *mc)
 {
 -    compat_props_add(mc->compat_props, arm_rhel_compat, arm_rhel_compat_len);
 }
 DEFINE_RHEL_MACHINE_AS_LATEST(8, 6, 0)
 -- 
 2.41.0
--- a/SOURCES/kvm-hw-char-virtio-serial-bus-Protect-from-DMA-re-entran.patch
+++ b/SOURCES/kvm-hw-char-virtio-serial-bus-Protect-from-DMA-re-entran.patch
@ -0,0 +1,61 @@
 From f4623ea611a74c684b0097b98a803cbe7ffb0825 Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Thu, 18 Jul 2024 09:26:55 -0400
 Subject: [PATCH 5/6] hw/char/virtio-serial-bus: Protect from DMA re-entrancy
 bugs
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 380: QEMU: virtio: DMA reentrancy issue leads to double free vulnerability
 RH-Jira: RHEL-32276
 RH-Acked-by: Gerd Hoffmann <None>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [5/6] fc8a445ebf6e763cd1482cd1f7ee23e5b5bbb388 (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 JIRA: https://issues.redhat.com/browse/RHEL-32276
 CVE: CVE-2024-3446
 Upstream: Merged
 commit b4295bff25f7b50de1d9cc94a9c6effd40056bca
 Author: Philippe Mathieu-Daudé <philmd@linaro.org>
 Date:   Thu Apr 4 20:56:35 2024 +0200
    hw/char/virtio-serial-bus: Protect from DMA re-entrancy bugs
    Replace qemu_bh_new_guarded() by virtio_bh_new_guarded()
    so the bus and device use the same guard. Otherwise the
    DMA-reentrancy protection can be bypassed.
    Fixes: CVE-2024-3446
    Cc: qemu-stable@nongnu.org
    Suggested-by: Alexander Bulekov <alxndr@bu.edu>
    Reviewed-by: Gerd Hoffmann <kraxel@redhat.com>
    Acked-by: Michael S. Tsirkin <mst@redhat.com>
    Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
    Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
    Message-Id: <20240409105537.18308-4-philmd@linaro.org>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 hw/char/virtio-serial-bus.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
 diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
 index f18124b155..791b7ac59e 100644
 --- a/hw/char/virtio-serial-bus.c
 +++ b/hw/char/virtio-serial-bus.c
@@ -985,8 +985,7 @@ static void virtser_port_device_realize(DeviceState *dev, Error **errp)
         return;
     }
 -    port->bh = qemu_bh_new_guarded(flush_queued_data_bh, port,
 -                                   &dev->mem_reentrancy_guard);
 +    port->bh = virtio_bh_new_guarded(dev, flush_queued_data_bh, port);
     port->elem = NULL;
 }
 -- 
 2.39.3
--- a/SOURCES/kvm-hw-display-virtio-gpu-Protect-from-DMA-re-entrancy-b.patch
+++ b/SOURCES/kvm-hw-display-virtio-gpu-Protect-from-DMA-re-entrancy-b.patch
@ -0,0 +1,160 @@
 From d37035373a266644b241aab1f041ab09c9185540 Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Thu, 18 Jul 2024 09:29:54 -0400
 Subject: [PATCH 4/6] hw/display/virtio-gpu: Protect from DMA re-entrancy bugs
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 380: QEMU: virtio: DMA reentrancy issue leads to double free vulnerability
 RH-Jira: RHEL-32276
 RH-Acked-by: Gerd Hoffmann <None>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [4/6] e3cd21742228528a1a74ea62d55b5941d3efb261 (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 JIRA: https://issues.redhat.com/browse/RHEL-32276
 CVE: CVE-2024-3446
 Upstream: Merged
 commit ba28e0ff4d95b56dc334aac2730ab3651ffc3132
 Author: Philippe Mathieu-Daudé <philmd@linaro.org>
 Date:   Thu Apr 4 20:56:27 2024 +0200
    hw/display/virtio-gpu: Protect from DMA re-entrancy bugs
    Replace qemu_bh_new_guarded() by virtio_bh_new_guarded()
    so the bus and device use the same guard. Otherwise the
    DMA-reentrancy protection can be bypassed:
      $ cat << EOF | qemu-system-i386 -display none -nodefaults \
                                      -machine q35,accel=qtest \
                                      -m 512M \
                                      -device virtio-gpu \
                                      -qtest stdio
      outl 0xcf8 0x80000820
      outl 0xcfc 0xe0004000
      outl 0xcf8 0x80000804
      outw 0xcfc 0x06
      write 0xe0004030 0x4 0x024000e0
      write 0xe0004028 0x1 0xff
      write 0xe0004020 0x4 0x00009300
      write 0xe000401c 0x1 0x01
      write 0x101 0x1 0x04
      write 0x103 0x1 0x1c
      write 0x9301c8 0x1 0x18
      write 0x105 0x1 0x1c
      write 0x107 0x1 0x1c
      write 0x109 0x1 0x1c
      write 0x10b 0x1 0x00
      write 0x10d 0x1 0x00
      write 0x10f 0x1 0x00
      write 0x111 0x1 0x00
      write 0x113 0x1 0x00
      write 0x115 0x1 0x00
      write 0x117 0x1 0x00
      write 0x119 0x1 0x00
      write 0x11b 0x1 0x00
      write 0x11d 0x1 0x00
      write 0x11f 0x1 0x00
      write 0x121 0x1 0x00
      write 0x123 0x1 0x00
      write 0x125 0x1 0x00
      write 0x127 0x1 0x00
      write 0x129 0x1 0x00
      write 0x12b 0x1 0x00
      write 0x12d 0x1 0x00
      write 0x12f 0x1 0x00
      write 0x131 0x1 0x00
      write 0x133 0x1 0x00
      write 0x135 0x1 0x00
      write 0x137 0x1 0x00
      write 0x139 0x1 0x00
      write 0xe0007003 0x1 0x00
      EOF
      ...
      =================================================================
      ==276099==ERROR: AddressSanitizer: heap-use-after-free on address 0x60d000011178
      at pc 0x562cc3b736c7 bp 0x7ffed49dee60 sp 0x7ffed49dee58
      READ of size 8 at 0x60d000011178 thread T0
          #0 0x562cc3b736c6 in virtio_gpu_ctrl_response hw/display/virtio-gpu.c:180:42
          #1 0x562cc3b7c40b in virtio_gpu_ctrl_response_nodata hw/display/virtio-gpu.c:192:5
          #2 0x562cc3b7c40b in virtio_gpu_simple_process_cmd hw/display/virtio-gpu.c:1015:13
          #3 0x562cc3b82873 in virtio_gpu_process_cmdq hw/display/virtio-gpu.c:1050:9
          #4 0x562cc4a85514 in aio_bh_call util/async.c:169:5
          #5 0x562cc4a85c52 in aio_bh_poll util/async.c:216:13
          #6 0x562cc4a1a79b in aio_dispatch util/aio-posix.c:423:5
          #7 0x562cc4a8a2da in aio_ctx_dispatch util/async.c:358:5
          #8 0x7f36840547a8 in g_main_context_dispatch (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x547a8)
          #9 0x562cc4a8b753 in glib_pollfds_poll util/main-loop.c:290:9
          #10 0x562cc4a8b753 in os_host_main_loop_wait util/main-loop.c:313:5
          #11 0x562cc4a8b753 in main_loop_wait util/main-loop.c:592:11
          #12 0x562cc3938186 in qemu_main_loop system/runstate.c:782:9
          #13 0x562cc43b7af5 in qemu_default_main system/main.c:37:14
          #14 0x7f3683a6c189 in __libc_start_call_main csu/../sysdeps/nptl/libc_start_call_main.h:58:16
          #15 0x7f3683a6c244 in __libc_start_main csu/../csu/libc-start.c:381:3
          #16 0x562cc2a58ac0 in _start (qemu-system-i386+0x231bac0)
      0x60d000011178 is located 56 bytes inside of 136-byte region [0x60d000011140,0x60d0000111c8)
      freed by thread T0 here:
          #0 0x562cc2adb662 in __interceptor_free (qemu-system-i386+0x239e662)
          #1 0x562cc3b86b21 in virtio_gpu_reset hw/display/virtio-gpu.c:1524:9
          #2 0x562cc416e20e in virtio_reset hw/virtio/virtio.c:2145:9
          #3 0x562cc37c5644 in virtio_pci_reset hw/virtio/virtio-pci.c:2249:5
          #4 0x562cc4233758 in memory_region_write_accessor system/memory.c:497:5
          #5 0x562cc4232eea in access_with_adjusted_size system/memory.c:573:18
      previously allocated by thread T0 here:
          #0 0x562cc2adb90e in malloc (qemu-system-i386+0x239e90e)
          #1 0x7f368405a678 in g_malloc (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x5a678)
          #2 0x562cc4163ffc in virtqueue_split_pop hw/virtio/virtio.c:1612:12
          #3 0x562cc4163ffc in virtqueue_pop hw/virtio/virtio.c:1783:16
          #4 0x562cc3b91a95 in virtio_gpu_handle_ctrl hw/display/virtio-gpu.c:1112:15
          #5 0x562cc4a85514 in aio_bh_call util/async.c:169:5
          #6 0x562cc4a85c52 in aio_bh_poll util/async.c:216:13
          #7 0x562cc4a1a79b in aio_dispatch util/aio-posix.c:423:5
      SUMMARY: AddressSanitizer: heap-use-after-free hw/display/virtio-gpu.c:180:42 in virtio_gpu_ctrl_response
    With this change, the same reproducer triggers:
      qemu-system-i386: warning: Blocked re-entrant IO on MemoryRegion: virtio-pci-common-virtio-gpu at addr: 0x6
    Fixes: CVE-2024-3446
    Cc: qemu-stable@nongnu.org
    Reported-by: Alexander Bulekov <alxndr@bu.edu>
    Reported-by: Yongkang Jia <kangel@zju.edu.cn>
    Reported-by: Xiao Lei <nop.leixiao@gmail.com>
    Reported-by: Yiming Tao <taoym@zju.edu.cn>
    Buglink: https://bugs.launchpad.net/qemu/+bug/1888606
    Reviewed-by: Gerd Hoffmann <kraxel@redhat.com>
    Acked-by: Michael S. Tsirkin <mst@redhat.com>
    Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
    Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
    Message-Id: <20240409105537.18308-3-philmd@linaro.org>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 hw/display/virtio-gpu.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)
 diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
 index c28ce1ea72..64fdc18478 100644
 --- a/hw/display/virtio-gpu.c
 +++ b/hw/display/virtio-gpu.c
@@ -1334,10 +1334,8 @@ void virtio_gpu_device_realize(DeviceState *qdev, Error **errp)
     g->ctrl_vq = virtio_get_queue(vdev, 0);
     g->cursor_vq = virtio_get_queue(vdev, 1);
 -    g->ctrl_bh = qemu_bh_new_guarded(virtio_gpu_ctrl_bh, g,
 -                                     &qdev->mem_reentrancy_guard);
 -    g->cursor_bh = qemu_bh_new_guarded(virtio_gpu_cursor_bh, g,
 -                                       &qdev->mem_reentrancy_guard);
 +    g->ctrl_bh = virtio_bh_new_guarded(qdev, virtio_gpu_ctrl_bh, g);
 +    g->cursor_bh = virtio_bh_new_guarded(qdev, virtio_gpu_cursor_bh, g);
     g->reset_bh = qemu_bh_new(virtio_gpu_reset_bh, g);
     qemu_cond_init(&g->reset_cond);
     QTAILQ_INIT(&g->reslist);
 -- 
 2.39.3
--- a/SOURCES/kvm-hw-ide-reset-cancel-async-DMA-operation-before-reset.patch
+++ b/SOURCES/kvm-hw-ide-reset-cancel-async-DMA-operation-before-reset.patch
@ -0,0 +1,128 @@
 From 2308abf0c5da2fe35a0721318c31d22e077663c2 Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Fri, 24 Nov 2023 12:17:11 -0500
 Subject: [PATCH 1/2] hw/ide: reset: cancel async DMA operation before
 resetting state
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 335: hw/ide: reset: cancel async DMA operation before resetting state
 RH-Jira: RHEL-15437
 RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
 RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
 RH-Commit: [1/2] b0f5f7f888559a210f1c6b3c545e337dbbc9cf22 (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 JIRA: https://issues.redhat.com/browse/RHEL-15437
 CVE: CVE-2023-5088
 Upstream: Merged
 commit 7d7512019fc40c577e2bdd61f114f31a9eb84a8e
 Author: Fiona Ebner <f.ebner@proxmox.com>
 Date:   Wed Sep 6 15:09:21 2023 +0200
    hw/ide: reset: cancel async DMA operation before resetting state
    If there is a pending DMA operation during ide_bus_reset(), the fact
    that the IDEState is already reset before the operation is canceled
    can be problematic. In particular, ide_dma_cb() might be called and
    then use the reset IDEState which contains the signature after the
    reset. When used to construct the IO operation this leads to
    ide_get_sector() returning 0 and nsector being 1. This is particularly
    bad, because a write command will thus destroy the first sector which
    often contains a partition table or similar.
    Traces showing the unsolicited write happening with IDEState
    0x5595af6949d0 being used after reset:
    > ahci_port_write ahci(0x5595af6923f0)[0]: port write [reg:PxSCTL] @ 0x2c: 0x00000300
    > ahci_reset_port ahci(0x5595af6923f0)[0]: reset port
    > ide_reset IDEstate 0x5595af6949d0
    > ide_reset IDEstate 0x5595af694da8
    > ide_bus_reset_aio aio_cancel
    > dma_aio_cancel dbs=0x7f64600089a0
    > dma_blk_cb dbs=0x7f64600089a0 ret=0
    > dma_complete dbs=0x7f64600089a0 ret=0 cb=0x5595acd40b30
    > ahci_populate_sglist ahci(0x5595af6923f0)[0]
    > ahci_dma_prepare_buf ahci(0x5595af6923f0)[0]: prepare buf limit=512 prepared=512
    > ide_dma_cb IDEState 0x5595af6949d0; sector_num=0 n=1 cmd=DMA WRITE
    > dma_blk_io dbs=0x7f6420802010 bs=0x5595ae2c6c30 offset=0 to_dev=1
    > dma_blk_cb dbs=0x7f6420802010 ret=0
    > (gdb) p *qiov
    > $11 = {iov = 0x7f647c76d840, niov = 1, {{nalloc = 1, local_iov = {iov_base = 0x0,
    >       iov_len = 512}}, {__pad = "\001\000\000\000\000\000\000\000\000\000\000",
    >       size = 512}}}
    > (gdb) bt
    > #0  blk_aio_pwritev (blk=0x5595ae2c6c30, offset=0, qiov=0x7f6420802070, flags=0,
    >     cb=0x5595ace6f0b0 <dma_blk_cb>, opaque=0x7f6420802010)
    >     at ../block/block-backend.c:1682
    > #1  0x00005595ace6f185 in dma_blk_cb (opaque=0x7f6420802010, ret=<optimized out>)
    >     at ../softmmu/dma-helpers.c:179
    > #2  0x00005595ace6f778 in dma_blk_io (ctx=0x5595ae0609f0,
    >     sg=sg@entry=0x5595af694d00, offset=offset@entry=0, align=align@entry=512,
    >     io_func=io_func@entry=0x5595ace6ee30 <dma_blk_write_io_func>,
    >     io_func_opaque=io_func_opaque@entry=0x5595ae2c6c30,
    >     cb=0x5595acd40b30 <ide_dma_cb>, opaque=0x5595af6949d0,
    >     dir=DMA_DIRECTION_TO_DEVICE) at ../softmmu/dma-helpers.c:244
    > #3  0x00005595ace6f90a in dma_blk_write (blk=0x5595ae2c6c30,
    >     sg=sg@entry=0x5595af694d00, offset=offset@entry=0, align=align@entry=512,
    >     cb=cb@entry=0x5595acd40b30 <ide_dma_cb>, opaque=opaque@entry=0x5595af6949d0)
    >     at ../softmmu/dma-helpers.c:280
    > #4  0x00005595acd40e18 in ide_dma_cb (opaque=0x5595af6949d0, ret=<optimized out>)
    >     at ../hw/ide/core.c:953
    > #5  0x00005595ace6f319 in dma_complete (ret=0, dbs=0x7f64600089a0)
    >     at ../softmmu/dma-helpers.c:107
    > #6  dma_blk_cb (opaque=0x7f64600089a0, ret=0) at ../softmmu/dma-helpers.c:127
    > #7  0x00005595ad12227d in blk_aio_complete (acb=0x7f6460005b10)
    >     at ../block/block-backend.c:1527
    > #8  blk_aio_complete (acb=0x7f6460005b10) at ../block/block-backend.c:1524
    > #9  blk_aio_write_entry (opaque=0x7f6460005b10) at ../block/block-backend.c:1594
    > #10 0x00005595ad258cfb in coroutine_trampoline (i0=<optimized out>,
    >     i1=<optimized out>) at ../util/coroutine-ucontext.c:177
    Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
    Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
    Tested-by: simon.rowe@nutanix.com
    Message-ID: <20230906130922.142845-1-f.ebner@proxmox.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 hw/ide/core.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)
 diff --git a/hw/ide/core.c b/hw/ide/core.c
 index 05a32d0a99..fd50c123e8 100644
 --- a/hw/ide/core.c
 +++ b/hw/ide/core.c
@@ -2456,19 +2456,19 @@ static void ide_dummy_transfer_stop(IDEState *s)
 void ide_bus_reset(IDEBus *bus)
 {
 -    bus->unit = 0;
 -    bus->cmd = 0;
 -    ide_reset(&bus->ifs[0]);
 -    ide_reset(&bus->ifs[1]);
 -    ide_clear_hob(bus);
 -
 -    /* pending async DMA */
 +    /* pending async DMA - needs the IDEState before it is reset */
     if (bus->dma->aiocb) {
         trace_ide_bus_reset_aio();
         blk_aio_cancel(bus->dma->aiocb);
         bus->dma->aiocb = NULL;
     }
 +    bus->unit = 0;
 +    bus->cmd = 0;
 +    ide_reset(&bus->ifs[0]);
 +    ide_reset(&bus->ifs[1]);
 +    ide_clear_hob(bus);
 +
     /* reset dma provider too */
     if (bus->dma->ops->reset) {
         bus->dma->ops->reset(bus->dma);
 -- 
 2.41.0
--- a/SOURCES/kvm-hw-replace-most-qemu_bh_new-calls-with-qemu_bh_new_g.patch
+++ b/SOURCES/kvm-hw-replace-most-qemu_bh_new-calls-with-qemu_bh_new_g.patch
@ -0,0 +1,449 @@
 From 146cfb23b76b898f08690ffc14aab16d22a41404 Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Tue, 9 May 2023 10:29:03 -0400
 Subject: [PATCH 04/15] hw: replace most qemu_bh_new calls with
 qemu_bh_new_guarded
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 277: memory: prevent dma-reentracy issues
 RH-Bugzilla: 1999236
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [4/12] 00c51d30246b3aa529f6043e35ee471660aa1fce (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1999236
 Upstream: Merged
 CVE: CVE-2021-3750
 Conflicts: In hw/nvme/ctrl.c there are no calls to qemu_bh_new() at the two locations
           the replacement is done in the upstream commit. Instead, timer_new_ns() is
           used. We leave these functions unaltered.
 commit f63192b0544af5d3e4d5edfd85ab520fcf671377
 Author: Alexander Bulekov <alxndr@bu.edu>
 Date:   Thu Apr 27 17:10:09 2023 -0400
    hw: replace most qemu_bh_new calls with qemu_bh_new_guarded
    This protects devices from bh->mmio reentrancy issues.
    Thanks: Thomas Huth <thuth@redhat.com> for diagnosing OS X test failure.
    Signed-off-by: Alexander Bulekov <alxndr@bu.edu>
    Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
    Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
    Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
    Reviewed-by: Paul Durrant <paul@xen.org>
    Reviewed-by: Thomas Huth <thuth@redhat.com>
    Message-Id: <20230427211013.2994127-5-alxndr@bu.edu>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 hw/9pfs/xen-9p-backend.c        | 5 ++++-
 hw/block/dataplane/virtio-blk.c | 3 ++-
 hw/block/dataplane/xen-block.c  | 5 +++--
 hw/char/virtio-serial-bus.c     | 3 ++-
 hw/display/qxl.c                | 9 ++++++---
 hw/display/virtio-gpu.c         | 6 ++++--
 hw/ide/ahci.c                   | 3 ++-
 hw/ide/ahci_internal.h          | 1 +
 hw/ide/core.c                   | 4 +++-
 hw/misc/imx_rngc.c              | 6 ++++--
 hw/misc/macio/mac_dbdma.c       | 2 +-
 hw/net/virtio-net.c             | 3 ++-
 hw/scsi/mptsas.c                | 3 ++-
 hw/scsi/scsi-bus.c              | 3 ++-
 hw/scsi/vmw_pvscsi.c            | 3 ++-
 hw/usb/dev-uas.c                | 3 ++-
 hw/usb/hcd-dwc2.c               | 3 ++-
 hw/usb/hcd-ehci.c               | 3 ++-
 hw/usb/hcd-uhci.c               | 2 +-
 hw/usb/host-libusb.c            | 6 ++++--
 hw/usb/redirect.c               | 6 ++++--
 hw/usb/xen-usb.c                | 3 ++-
 hw/virtio/virtio-balloon.c      | 5 +++--
 hw/virtio/virtio-crypto.c       | 3 ++-
 24 files changed, 62 insertions(+), 31 deletions(-)
 diff --git a/hw/9pfs/xen-9p-backend.c b/hw/9pfs/xen-9p-backend.c
 index 65c4979c3c..09f7c13588 100644
 --- a/hw/9pfs/xen-9p-backend.c
 +++ b/hw/9pfs/xen-9p-backend.c
@@ -60,6 +60,7 @@ typedef struct Xen9pfsDev {
     int num_rings;
     Xen9pfsRing *rings;
 +    MemReentrancyGuard mem_reentrancy_guard;
 } Xen9pfsDev;
 static void xen_9pfs_disconnect(struct XenLegacyDevice *xendev);
@@ -441,7 +442,9 @@ static int xen_9pfs_connect(struct XenLegacyDevice *xendev)
         xen_9pdev->rings[i].ring.out = xen_9pdev->rings[i].data +
                                        XEN_FLEX_RING_SIZE(ring_order);
 -        xen_9pdev->rings[i].bh = qemu_bh_new(xen_9pfs_bh, &xen_9pdev->rings[i]);
 +        xen_9pdev->rings[i].bh = qemu_bh_new_guarded(xen_9pfs_bh,
 +                                                     &xen_9pdev->rings[i],
 +                                                     &xen_9pdev->mem_reentrancy_guard);
         xen_9pdev->rings[i].out_cons = 0;
         xen_9pdev->rings[i].out_size = 0;
         xen_9pdev->rings[i].inprogress = false;
 diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
 index ee5a5352dc..5f0de7da1e 100644
 --- a/hw/block/dataplane/virtio-blk.c
 +++ b/hw/block/dataplane/virtio-blk.c
@@ -127,7 +127,8 @@ bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
     } else {
         s->ctx = qemu_get_aio_context();
     }
 -    s->bh = aio_bh_new(s->ctx, notify_guest_bh, s);
 +    s->bh = aio_bh_new_guarded(s->ctx, notify_guest_bh, s,
 +                               &DEVICE(vdev)->mem_reentrancy_guard);
     s->batch_notify_vqs = bitmap_new(conf->num_queues);
     *dataplane = s;
 diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
 index 860787580a..07855feea6 100644
 --- a/hw/block/dataplane/xen-block.c
 +++ b/hw/block/dataplane/xen-block.c
@@ -631,8 +631,9 @@ XenBlockDataPlane *xen_block_dataplane_create(XenDevice *xendev,
     } else {
         dataplane->ctx = qemu_get_aio_context();
     }
 -    dataplane->bh = aio_bh_new(dataplane->ctx, xen_block_dataplane_bh,
 -                               dataplane);
 +    dataplane->bh = aio_bh_new_guarded(dataplane->ctx, xen_block_dataplane_bh,
 +                                       dataplane,
 +                                       &DEVICE(xendev)->mem_reentrancy_guard);
     return dataplane;
 }
 diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
 index f01ec2137c..f18124b155 100644
 --- a/hw/char/virtio-serial-bus.c
 +++ b/hw/char/virtio-serial-bus.c
@@ -985,7 +985,8 @@ static void virtser_port_device_realize(DeviceState *dev, Error **errp)
         return;
     }
 -    port->bh = qemu_bh_new(flush_queued_data_bh, port);
 +    port->bh = qemu_bh_new_guarded(flush_queued_data_bh, port,
 +                                   &dev->mem_reentrancy_guard);
     port->elem = NULL;
 }
 diff --git a/hw/display/qxl.c b/hw/display/qxl.c
 index bcd9e8716a..0f663b9912 100644
 --- a/hw/display/qxl.c
 +++ b/hw/display/qxl.c
@@ -2206,11 +2206,14 @@ static void qxl_realize_common(PCIQXLDevice *qxl, Error **errp)
     qemu_add_vm_change_state_handler(qxl_vm_change_state_handler, qxl);
 -    qxl->update_irq = qemu_bh_new(qxl_update_irq_bh, qxl);
 +    qxl->update_irq = qemu_bh_new_guarded(qxl_update_irq_bh, qxl,
 +                                          &DEVICE(qxl)->mem_reentrancy_guard);
     qxl_reset_state(qxl);
 -    qxl->update_area_bh = qemu_bh_new(qxl_render_update_area_bh, qxl);
 -    qxl->ssd.cursor_bh = qemu_bh_new(qemu_spice_cursor_refresh_bh, &qxl->ssd);
 +    qxl->update_area_bh = qemu_bh_new_guarded(qxl_render_update_area_bh, qxl,
 +                                              &DEVICE(qxl)->mem_reentrancy_guard);
 +    qxl->ssd.cursor_bh = qemu_bh_new_guarded(qemu_spice_cursor_refresh_bh, &qxl->ssd,
 +                                             &DEVICE(qxl)->mem_reentrancy_guard);
 }
 static void qxl_realize_primary(PCIDevice *dev, Error **errp)
 diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
 index d78b9700c7..ecf9079145 100644
 --- a/hw/display/virtio-gpu.c
 +++ b/hw/display/virtio-gpu.c
@@ -1332,8 +1332,10 @@ void virtio_gpu_device_realize(DeviceState *qdev, Error **errp)
     g->ctrl_vq = virtio_get_queue(vdev, 0);
     g->cursor_vq = virtio_get_queue(vdev, 1);
 -    g->ctrl_bh = qemu_bh_new(virtio_gpu_ctrl_bh, g);
 -    g->cursor_bh = qemu_bh_new(virtio_gpu_cursor_bh, g);
 +    g->ctrl_bh = qemu_bh_new_guarded(virtio_gpu_ctrl_bh, g,
 +                                     &qdev->mem_reentrancy_guard);
 +    g->cursor_bh = qemu_bh_new_guarded(virtio_gpu_cursor_bh, g,
 +                                       &qdev->mem_reentrancy_guard);
     QTAILQ_INIT(&g->reslist);
     QTAILQ_INIT(&g->cmdq);
     QTAILQ_INIT(&g->fenceq);
 diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
 index a94c6e26fb..7488b28065 100644
 --- a/hw/ide/ahci.c
 +++ b/hw/ide/ahci.c
@@ -1504,7 +1504,8 @@ static void ahci_cmd_done(const IDEDMA *dma)
     ahci_write_fis_d2h(ad);
     if (ad->port_regs.cmd_issue && !ad->check_bh) {
 -        ad->check_bh = qemu_bh_new(ahci_check_cmd_bh, ad);
 +        ad->check_bh = qemu_bh_new_guarded(ahci_check_cmd_bh, ad,
 +                                           &ad->mem_reentrancy_guard);
         qemu_bh_schedule(ad->check_bh);
     }
 }
 diff --git a/hw/ide/ahci_internal.h b/hw/ide/ahci_internal.h
 index 109de9e2d1..a7768dd69e 100644
 --- a/hw/ide/ahci_internal.h
 +++ b/hw/ide/ahci_internal.h
@@ -321,6 +321,7 @@ struct AHCIDevice {
     bool init_d2h_sent;
     AHCICmdHdr *cur_cmd;
     NCQTransferState ncq_tfs[AHCI_MAX_CMDS];
 +    MemReentrancyGuard mem_reentrancy_guard;
 };
 struct AHCIPCIState {
 diff --git a/hw/ide/core.c b/hw/ide/core.c
 index 15138225be..05a32d0a99 100644
 --- a/hw/ide/core.c
 +++ b/hw/ide/core.c
@@ -510,6 +510,7 @@ BlockAIOCB *ide_issue_trim(
         BlockCompletionFunc *cb, void *cb_opaque, void *opaque)
 {
     IDEState *s = opaque;
 +    IDEDevice *dev = s->unit ? s->bus->slave : s->bus->master;
     TrimAIOCB *iocb;
     /* Paired with a decrement in ide_trim_bh_cb() */
@@ -517,7 +518,8 @@ BlockAIOCB *ide_issue_trim(
     iocb = blk_aio_get(&trim_aiocb_info, s->blk, cb, cb_opaque);
     iocb->s = s;
 -    iocb->bh = qemu_bh_new(ide_trim_bh_cb, iocb);
 +    iocb->bh = qemu_bh_new_guarded(ide_trim_bh_cb, iocb,
 +                                   &DEVICE(dev)->mem_reentrancy_guard);
     iocb->ret = 0;
     iocb->qiov = qiov;
     iocb->i = -1;
 diff --git a/hw/misc/imx_rngc.c b/hw/misc/imx_rngc.c
 index 632c03779c..082c6980ad 100644
 --- a/hw/misc/imx_rngc.c
 +++ b/hw/misc/imx_rngc.c
@@ -228,8 +228,10 @@ static void imx_rngc_realize(DeviceState *dev, Error **errp)
     sysbus_init_mmio(sbd, &s->iomem);
     sysbus_init_irq(sbd, &s->irq);
 -    s->self_test_bh = qemu_bh_new(imx_rngc_self_test, s);
 -    s->seed_bh = qemu_bh_new(imx_rngc_seed, s);
 +    s->self_test_bh = qemu_bh_new_guarded(imx_rngc_self_test, s,
 +                                          &dev->mem_reentrancy_guard);
 +    s->seed_bh = qemu_bh_new_guarded(imx_rngc_seed, s,
 +                                     &dev->mem_reentrancy_guard);
 }
 static void imx_rngc_reset(DeviceState *dev)
 diff --git a/hw/misc/macio/mac_dbdma.c b/hw/misc/macio/mac_dbdma.c
 index e220f1a927..f6a9e76fe7 100644
 --- a/hw/misc/macio/mac_dbdma.c
 +++ b/hw/misc/macio/mac_dbdma.c
@@ -912,7 +912,7 @@ static void mac_dbdma_realize(DeviceState *dev, Error **errp)
 {
     DBDMAState *s = MAC_DBDMA(dev);
 -    s->bh = qemu_bh_new(DBDMA_run_bh, s);
 +    s->bh = qemu_bh_new_guarded(DBDMA_run_bh, s, &dev->mem_reentrancy_guard);
 }
 static void mac_dbdma_class_init(ObjectClass *oc, void *data)
 diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
 index 7e172ef829..ddaa8fa122 100644
 --- a/hw/net/virtio-net.c
 +++ b/hw/net/virtio-net.c
@@ -2753,7 +2753,8 @@ static void virtio_net_add_queue(VirtIONet *n, int index)
         n->vqs[index].tx_vq =
             virtio_add_queue(vdev, n->net_conf.tx_queue_size,
                              virtio_net_handle_tx_bh);
 -        n->vqs[index].tx_bh = qemu_bh_new(virtio_net_tx_bh, &n->vqs[index]);
 +        n->vqs[index].tx_bh = qemu_bh_new_guarded(virtio_net_tx_bh, &n->vqs[index],
 +                                                  &DEVICE(vdev)->mem_reentrancy_guard);
     }
     n->vqs[index].tx_waiting = 0;
 diff --git a/hw/scsi/mptsas.c b/hw/scsi/mptsas.c
 index f6c7765544..ab8aaca85d 100644
 --- a/hw/scsi/mptsas.c
 +++ b/hw/scsi/mptsas.c
@@ -1313,7 +1313,8 @@ static void mptsas_scsi_realize(PCIDevice *dev, Error **errp)
     }
     s->max_devices = MPTSAS_NUM_PORTS;
 -    s->request_bh = qemu_bh_new(mptsas_fetch_requests, s);
 +    s->request_bh = qemu_bh_new_guarded(mptsas_fetch_requests, s,
 +                                        &DEVICE(dev)->mem_reentrancy_guard);
     scsi_bus_init(&s->bus, sizeof(s->bus), &dev->qdev, &mptsas_scsi_info);
 }
 diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
 index 77325d8cc7..b506ab7d04 100644
 --- a/hw/scsi/scsi-bus.c
 +++ b/hw/scsi/scsi-bus.c
@@ -192,7 +192,8 @@ static void scsi_dma_restart_cb(void *opaque, bool running, RunState state)
         AioContext *ctx = blk_get_aio_context(s->conf.blk);
         /* The reference is dropped in scsi_dma_restart_bh.*/
         object_ref(OBJECT(s));
 -        s->bh = aio_bh_new(ctx, scsi_dma_restart_bh, s);
 +        s->bh = aio_bh_new_guarded(ctx, scsi_dma_restart_bh, s,
 +                                   &DEVICE(s)->mem_reentrancy_guard);
         qemu_bh_schedule(s->bh);
     }
 }
 diff --git a/hw/scsi/vmw_pvscsi.c b/hw/scsi/vmw_pvscsi.c
 index cd76bd67ab..4c36febbc0 100644
 --- a/hw/scsi/vmw_pvscsi.c
 +++ b/hw/scsi/vmw_pvscsi.c
@@ -1178,7 +1178,8 @@ pvscsi_realizefn(PCIDevice *pci_dev, Error **errp)
         pcie_endpoint_cap_init(pci_dev, PVSCSI_EXP_EP_OFFSET);
     }
 -    s->completion_worker = qemu_bh_new(pvscsi_process_completion_queue, s);
 +    s->completion_worker = qemu_bh_new_guarded(pvscsi_process_completion_queue, s,
 +                                               &DEVICE(pci_dev)->mem_reentrancy_guard);
     scsi_bus_init(&s->bus, sizeof(s->bus), DEVICE(pci_dev), &pvscsi_scsi_info);
     /* override default SCSI bus hotplug-handler, with pvscsi's one */
 diff --git a/hw/usb/dev-uas.c b/hw/usb/dev-uas.c
 index 599d6b52a0..a36a7c3013 100644
 --- a/hw/usb/dev-uas.c
 +++ b/hw/usb/dev-uas.c
@@ -935,7 +935,8 @@ static void usb_uas_realize(USBDevice *dev, Error **errp)
     QTAILQ_INIT(&uas->results);
     QTAILQ_INIT(&uas->requests);
 -    uas->status_bh = qemu_bh_new(usb_uas_send_status_bh, uas);
 +    uas->status_bh = qemu_bh_new_guarded(usb_uas_send_status_bh, uas,
 +                                         &d->mem_reentrancy_guard);
     dev->flags |= (1 << USB_DEV_FLAG_IS_SCSI_STORAGE);
     scsi_bus_init(&uas->bus, sizeof(uas->bus), DEVICE(dev), &usb_uas_scsi_info);
 diff --git a/hw/usb/hcd-dwc2.c b/hw/usb/hcd-dwc2.c
 index e1d96acf7e..0e238f8422 100644
 --- a/hw/usb/hcd-dwc2.c
 +++ b/hw/usb/hcd-dwc2.c
@@ -1364,7 +1364,8 @@ static void dwc2_realize(DeviceState *dev, Error **errp)
     s->fi = USB_FRMINTVL - 1;
     s->eof_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, dwc2_frame_boundary, s);
     s->frame_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, dwc2_work_timer, s);
 -    s->async_bh = qemu_bh_new(dwc2_work_bh, s);
 +    s->async_bh = qemu_bh_new_guarded(dwc2_work_bh, s,
 +                                      &dev->mem_reentrancy_guard);
     sysbus_init_irq(sbd, &s->irq);
 }
 diff --git a/hw/usb/hcd-ehci.c b/hw/usb/hcd-ehci.c
 index 6caa7ac6c2..df4ff6f2c1 100644
 --- a/hw/usb/hcd-ehci.c
 +++ b/hw/usb/hcd-ehci.c
@@ -2528,7 +2528,8 @@ void usb_ehci_realize(EHCIState *s, DeviceState *dev, Error **errp)
     }
     s->frame_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, ehci_work_timer, s);
 -    s->async_bh = qemu_bh_new(ehci_work_bh, s);
 +    s->async_bh = qemu_bh_new_guarded(ehci_work_bh, s,
 +                                      &dev->mem_reentrancy_guard);
     s->device = dev;
     s->vmstate = qemu_add_vm_change_state_handler(usb_ehci_vm_state_change, s);
 diff --git a/hw/usb/hcd-uhci.c b/hw/usb/hcd-uhci.c
 index 7930b868fa..469c5e57e9 100644
 --- a/hw/usb/hcd-uhci.c
 +++ b/hw/usb/hcd-uhci.c
@@ -1195,7 +1195,7 @@ void usb_uhci_common_realize(PCIDevice *dev, Error **errp)
                               USB_SPEED_MASK_LOW | USB_SPEED_MASK_FULL);
         }
     }
 -    s->bh = qemu_bh_new(uhci_bh, s);
 +    s->bh = qemu_bh_new_guarded(uhci_bh, s, &DEVICE(dev)->mem_reentrancy_guard);
     s->frame_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, uhci_frame_timer, s);
     s->num_ports_vmstate = NB_PORTS;
     QTAILQ_INIT(&s->queues);
 diff --git a/hw/usb/host-libusb.c b/hw/usb/host-libusb.c
 index d0d46dd0a4..09b961116b 100644
 --- a/hw/usb/host-libusb.c
 +++ b/hw/usb/host-libusb.c
@@ -1141,7 +1141,8 @@ static void usb_host_nodev_bh(void *opaque)
 static void usb_host_nodev(USBHostDevice *s)
 {
     if (!s->bh_nodev) {
 -        s->bh_nodev = qemu_bh_new(usb_host_nodev_bh, s);
 +        s->bh_nodev = qemu_bh_new_guarded(usb_host_nodev_bh, s,
 +                                          &DEVICE(s)->mem_reentrancy_guard);
     }
     qemu_bh_schedule(s->bh_nodev);
 }
@@ -1739,7 +1740,8 @@ static int usb_host_post_load(void *opaque, int version_id)
     USBHostDevice *dev = opaque;
     if (!dev->bh_postld) {
 -        dev->bh_postld = qemu_bh_new(usb_host_post_load_bh, dev);
 +        dev->bh_postld = qemu_bh_new_guarded(usb_host_post_load_bh, dev,
 +                                             &DEVICE(dev)->mem_reentrancy_guard);
     }
     qemu_bh_schedule(dev->bh_postld);
     dev->bh_postld_pending = true;
 diff --git a/hw/usb/redirect.c b/hw/usb/redirect.c
 index 5f0ef9cb3b..59cd3cd7c4 100644
 --- a/hw/usb/redirect.c
 +++ b/hw/usb/redirect.c
@@ -1437,8 +1437,10 @@ static void usbredir_realize(USBDevice *udev, Error **errp)
         }
     }
 -    dev->chardev_close_bh = qemu_bh_new(usbredir_chardev_close_bh, dev);
 -    dev->device_reject_bh = qemu_bh_new(usbredir_device_reject_bh, dev);
 +    dev->chardev_close_bh = qemu_bh_new_guarded(usbredir_chardev_close_bh, dev,
 +                                                &DEVICE(dev)->mem_reentrancy_guard);
 +    dev->device_reject_bh = qemu_bh_new_guarded(usbredir_device_reject_bh, dev,
 +                                                &DEVICE(dev)->mem_reentrancy_guard);
     dev->attach_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, usbredir_do_attach, dev);
     packet_id_queue_init(&dev->cancelled, dev, "cancelled");
 diff --git a/hw/usb/xen-usb.c b/hw/usb/xen-usb.c
 index 0f7369e7ed..dec91294ad 100644
 --- a/hw/usb/xen-usb.c
 +++ b/hw/usb/xen-usb.c
@@ -1021,7 +1021,8 @@ static void usbback_alloc(struct XenLegacyDevice *xendev)
     QTAILQ_INIT(&usbif->req_free_q);
     QSIMPLEQ_INIT(&usbif->hotplug_q);
 -    usbif->bh = qemu_bh_new(usbback_bh, usbif);
 +    usbif->bh = qemu_bh_new_guarded(usbback_bh, usbif,
 +                                    &DEVICE(xendev)->mem_reentrancy_guard);
 }
 static int usbback_free(struct XenLegacyDevice *xendev)
 diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
 index 9a4f491b54..f503572e27 100644
 --- a/hw/virtio/virtio-balloon.c
 +++ b/hw/virtio/virtio-balloon.c
@@ -917,8 +917,9 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
         precopy_add_notifier(&s->free_page_hint_notify);
         object_ref(OBJECT(s->iothread));
 -        s->free_page_bh = aio_bh_new(iothread_get_aio_context(s->iothread),
 -                                     virtio_ballloon_get_free_page_hints, s);
 +        s->free_page_bh = aio_bh_new_guarded(iothread_get_aio_context(s->iothread),
 +                                             virtio_ballloon_get_free_page_hints, s,
 +                                             &dev->mem_reentrancy_guard);
     }
     if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_REPORTING)) {
 diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
 index 54f9bbb789..1be7bb543c 100644
 --- a/hw/virtio/virtio-crypto.c
 +++ b/hw/virtio/virtio-crypto.c
@@ -817,7 +817,8 @@ static void virtio_crypto_device_realize(DeviceState *dev, Error **errp)
         vcrypto->vqs[i].dataq =
                  virtio_add_queue(vdev, 1024, virtio_crypto_handle_dataq_bh);
         vcrypto->vqs[i].dataq_bh =
 -                 qemu_bh_new(virtio_crypto_dataq_bh, &vcrypto->vqs[i]);
 +                 qemu_bh_new_guarded(virtio_crypto_dataq_bh, &vcrypto->vqs[i],
 +                                     &dev->mem_reentrancy_guard);
         vcrypto->vqs[i].vcrypto = vcrypto;
     }
 -- 
 2.37.3
--- a/SOURCES/kvm-hw-s390x-Move-KVM-specific-PV-from-hw-to-target-s390.patch
+++ b/SOURCES/kvm-hw-s390x-Move-KVM-specific-PV-from-hw-to-target-s390.patch
@ -0,0 +1,283 @@
 From 59f02a421ecdba6e856597367020926fc0cb5177 Mon Sep 17 00:00:00 2001
 From: Thomas Huth <thuth@redhat.com>
 Date: Mon, 15 Jan 2024 18:52:30 +0100
 Subject: [PATCH 4/5] hw/s390x: Move KVM specific PV from hw/ to
 target/s390x/kvm/
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Thomas Huth <thuth@redhat.com>
 RH-MergeRequest: 348: s390x: Provide some more useful information if decryption of a PV image fails
 RH-Jira: RHEL-18214
 RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
 RH-Acked-by: Cédric Le Goater <clg@redhat.com>
 RH-Commit: [4/5] f6095bfdb89268007a0741665284955db4752d46
 JIRA: https://issues.redhat.com/browse/RHEL-18214
 commit f5f9c6ea11bc807664fdeb9354915c2c9cdcbd89
 Author: Philippe Mathieu-Daudé <philmd@linaro.org>
 Date:   Sat Jun 24 22:06:44 2023 +0200
    hw/s390x: Move KVM specific PV from hw/ to target/s390x/kvm/
    Protected Virtualization (PV) is not a real hardware device:
    it is a feature of the firmware on s390x that is exposed to
    userspace via the KVM interface.
    Move the pv.c/pv.h files to target/s390x/kvm/ to make this clearer.
    Suggested-by: Thomas Huth <thuth@redhat.com>
    Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
    Message-Id: <20230624200644.23931-1-philmd@linaro.org>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Conflicts:
    hw/s390x/ipl.c
    hw/s390x/s390-virtio-ccw.c
    target/s390x/diag.c
    (simple contextual conflict due to differences with #include statements)
 Signed-off-by: Thomas Huth <thuth@redhat.com>
 ---
 MAINTAINERS                                 | 2 --
 hw/s390x/ipl.c                              | 2 +-
 hw/s390x/meson.build                        | 1 -
 hw/s390x/s390-pci-kvm.c                     | 2 +-
 hw/s390x/s390-virtio-ccw.c                  | 2 +-
 hw/s390x/tod-kvm.c                          | 2 +-
 target/s390x/arch_dump.c                    | 2 +-
 target/s390x/cpu-sysemu.c                   | 2 +-
 target/s390x/cpu_features.c                 | 2 +-
 target/s390x/cpu_models.c                   | 2 +-
 target/s390x/diag.c                         | 2 +-
 target/s390x/helper.c                       | 2 +-
 target/s390x/ioinst.c                       | 2 +-
 target/s390x/kvm/kvm.c                      | 2 +-
 target/s390x/kvm/meson.build                | 1 +
 {hw/s390x => target/s390x/kvm}/pv.c         | 2 +-
 {include/hw/s390x => target/s390x/kvm}/pv.h | 0
 17 files changed, 14 insertions(+), 16 deletions(-)
 rename {hw/s390x => target/s390x/kvm}/pv.c (99%)
 rename {include/hw/s390x => target/s390x/kvm}/pv.h (100%)
 diff --git a/MAINTAINERS b/MAINTAINERS
 index b893206fc3..d74ca51154 100644
 --- a/MAINTAINERS
 +++ b/MAINTAINERS
@@ -397,8 +397,6 @@ S: Supported
 F: target/s390x/kvm/
 F: target/s390x/machine.c
 F: target/s390x/sigp.c
 -F: hw/s390x/pv.c
 -F: include/hw/s390x/pv.h
 F: gdb-xml/s390*.xml
 T: git https://github.com/borntraeger/qemu.git s390-next
 L: qemu-s390x@nongnu.org
 diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c
 index 9051d8652d..c25e247426 100644
 --- a/hw/s390x/ipl.c
 +++ b/hw/s390x/ipl.c
@@ -27,7 +27,7 @@
 #include "hw/s390x/vfio-ccw.h"
 #include "hw/s390x/css.h"
 #include "hw/s390x/ebcdic.h"
 -#include "hw/s390x/pv.h"
 +#include "target/s390x/kvm/pv.h"
 #include "ipl.h"
 #include "qemu/error-report.h"
 #include "qemu/config-file.h"
 diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build
 index 6e6e47fcda..bb3b42f613 100644
 --- a/hw/s390x/meson.build
 +++ b/hw/s390x/meson.build
@@ -22,7 +22,6 @@ s390x_ss.add(when: 'CONFIG_KVM', if_true: files(
   'tod-kvm.c',
   's390-skeys-kvm.c',
   's390-stattrib-kvm.c',
 -  'pv.c',
   's390-pci-kvm.c',
 ))
 s390x_ss.add(when: 'CONFIG_TCG', if_true: files(
 diff --git a/hw/s390x/s390-pci-kvm.c b/hw/s390x/s390-pci-kvm.c
 index 9134fe185f..ff41e4106d 100644
 --- a/hw/s390x/s390-pci-kvm.c
 +++ b/hw/s390x/s390-pci-kvm.c
@@ -14,7 +14,7 @@
 #include <linux/kvm.h>
 #include "kvm/kvm_s390x.h"
 -#include "hw/s390x/pv.h"
 +#include "target/s390x/kvm/pv.h"
 #include "hw/s390x/s390-pci-bus.h"
 #include "hw/s390x/s390-pci-kvm.h"
 #include "hw/s390x/s390-pci-inst.h"
 diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
 index 17146469ee..7bfa5b4e8f 100644
 --- a/hw/s390x/s390-virtio-ccw.c
 +++ b/hw/s390x/s390-virtio-ccw.c
@@ -40,7 +40,7 @@
 #include "hw/qdev-properties.h"
 #include "hw/s390x/tod.h"
 #include "sysemu/sysemu.h"
 -#include "hw/s390x/pv.h"
 +#include "target/s390x/kvm/pv.h"
 #include "migration/blocker.h"
 #include "qapi/visitor.h"
 diff --git a/hw/s390x/tod-kvm.c b/hw/s390x/tod-kvm.c
 index c804c979b5..9776cda50a 100644
 --- a/hw/s390x/tod-kvm.c
 +++ b/hw/s390x/tod-kvm.c
@@ -13,7 +13,7 @@
 #include "qemu/module.h"
 #include "sysemu/runstate.h"
 #include "hw/s390x/tod.h"
 -#include "hw/s390x/pv.h"
 +#include "target/s390x/kvm/pv.h"
 #include "kvm/kvm_s390x.h"
 static void kvm_s390_get_tod_raw(S390TOD *tod, Error **errp)
 diff --git a/target/s390x/arch_dump.c b/target/s390x/arch_dump.c
 index 3b1f178dc3..2554238c16 100644
 --- a/target/s390x/arch_dump.c
 +++ b/target/s390x/arch_dump.c
@@ -17,8 +17,8 @@
 #include "s390x-internal.h"
 #include "elf.h"
 #include "sysemu/dump.h"
 -#include "hw/s390x/pv.h"
 #include "kvm/kvm_s390x.h"
 +#include "target/s390x/kvm/pv.h"
 struct S390xUserRegsStruct {
     uint64_t psw[2];
 diff --git a/target/s390x/cpu-sysemu.c b/target/s390x/cpu-sysemu.c
 index 5471e01ee8..547287a949 100644
 --- a/target/s390x/cpu-sysemu.c
 +++ b/target/s390x/cpu-sysemu.c
@@ -32,7 +32,7 @@
 #include "qapi/qapi-visit-run-state.h"
 #include "sysemu/hw_accel.h"
 -#include "hw/s390x/pv.h"
 +#include "target/s390x/kvm/pv.h"
 #include "hw/boards.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/tcg.h"
 diff --git a/target/s390x/cpu_features.c b/target/s390x/cpu_features.c
 index 2e4e11d264..ebb155ce1c 100644
 --- a/target/s390x/cpu_features.c
 +++ b/target/s390x/cpu_features.c
@@ -15,7 +15,7 @@
 #include "qemu/module.h"
 #include "cpu_features.h"
 #ifndef CONFIG_USER_ONLY
 -#include "hw/s390x/pv.h"
 +#include "target/s390x/kvm/pv.h"
 #endif
 #define DEF_FEAT(_FEAT, _NAME, _TYPE, _BIT, _DESC) \
 diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c
 index e7c586c76e..100c5e7b3a 100644
 --- a/target/s390x/cpu_models.c
 +++ b/target/s390x/cpu_models.c
@@ -22,7 +22,7 @@
 #include "qemu/qemu-print.h"
 #ifndef CONFIG_USER_ONLY
 #include "sysemu/sysemu.h"
 -#include "hw/s390x/pv.h"
 +#include "target/s390x/kvm/pv.h"
 #endif
 #define CPUDEF_INIT(_type, _gen, _ec_ga, _mha_pow, _hmfai, _name, _desc) \
 diff --git a/target/s390x/diag.c b/target/s390x/diag.c
 index 76b01dcd68..7c8714cc27 100644
 --- a/target/s390x/diag.c
 +++ b/target/s390x/diag.c
@@ -19,9 +19,9 @@
 #include "sysemu/cpus.h"
 #include "hw/s390x/ipl.h"
 #include "hw/s390x/s390-virtio-ccw.h"
 -#include "hw/s390x/pv.h"
 #include "sysemu/kvm.h"
 #include "kvm/kvm_s390x.h"
 +#include "target/s390x/kvm/pv.h"
 int handle_diag_288(CPUS390XState *env, uint64_t r1, uint64_t r3)
 {
 diff --git a/target/s390x/helper.c b/target/s390x/helper.c
 index 6e35473c7f..860977126a 100644
 --- a/target/s390x/helper.c
 +++ b/target/s390x/helper.c
@@ -24,7 +24,7 @@
 #include "exec/gdbstub.h"
 #include "qemu/timer.h"
 #include "hw/s390x/ioinst.h"
 -#include "hw/s390x/pv.h"
 +#include "target/s390x/kvm/pv.h"
 #include "sysemu/hw_accel.h"
 #include "sysemu/runstate.h"
 #include "sysemu/tcg.h"
 diff --git a/target/s390x/ioinst.c b/target/s390x/ioinst.c
 index bdae5090bc..409f3e3e63 100644
 --- a/target/s390x/ioinst.c
 +++ b/target/s390x/ioinst.c
@@ -16,7 +16,7 @@
 #include "hw/s390x/ioinst.h"
 #include "trace.h"
 #include "hw/s390x/s390-pci-bus.h"
 -#include "hw/s390x/pv.h"
 +#include "target/s390x/kvm/pv.h"
 /* All I/O instructions but chsc use the s format */
 static uint64_t get_address_from_regs(CPUS390XState *env, uint32_t ipb,
 diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c
 index a963866ef4..6d1a6324b9 100644
 --- a/target/s390x/kvm/kvm.c
 +++ b/target/s390x/kvm/kvm.c
@@ -51,7 +51,7 @@
 #include "exec/memattrs.h"
 #include "hw/s390x/s390-virtio-ccw.h"
 #include "hw/s390x/s390-virtio-hcall.h"
 -#include "hw/s390x/pv.h"
 +#include "target/s390x/kvm/pv.h"
 #ifndef DEBUG_KVM
 #define DEBUG_KVM  0
 diff --git a/target/s390x/kvm/meson.build b/target/s390x/kvm/meson.build
 index aef52b6686..739d5b9f54 100644
 --- a/target/s390x/kvm/meson.build
 +++ b/target/s390x/kvm/meson.build
@@ -1,5 +1,6 @@
 s390x_ss.add(when: 'CONFIG_KVM', if_true: files(
 +  'pv.c',
   'kvm.c'
 ), if_false: files(
   'stubs.c'
 diff --git a/hw/s390x/pv.c b/target/s390x/kvm/pv.c
 similarity index 99%
 rename from hw/s390x/pv.c
 rename to target/s390x/kvm/pv.c
 index 8a1c71436b..e14db4f41a 100644
 --- a/hw/s390x/pv.c
 +++ b/target/s390x/kvm/pv.c
@@ -19,9 +19,9 @@
 #include "qom/object_interfaces.h"
 #include "exec/confidential-guest-support.h"
 #include "hw/s390x/ipl.h"
 -#include "hw/s390x/pv.h"
 #include "hw/s390x/sclp.h"
 #include "target/s390x/kvm/kvm_s390x.h"
 +#include "target/s390x/kvm/pv.h"
 static bool info_valid;
 static struct kvm_s390_pv_info_vm info_vm;
 diff --git a/include/hw/s390x/pv.h b/target/s390x/kvm/pv.h
 similarity index 100%
 rename from include/hw/s390x/pv.h
 rename to target/s390x/kvm/pv.h
 -- 
 2.41.0
--- a/SOURCES/kvm-hw-s390x-pv-Restrict-Protected-Virtualization-to-sys.patch
+++ b/SOURCES/kvm-hw-s390x-pv-Restrict-Protected-Virtualization-to-sys.patch
@ -0,0 +1,100 @@
 From 053faafcf523b0ea4d841c0af8e7e26a2cddd5e8 Mon Sep 17 00:00:00 2001
 From: Thomas Huth <thuth@redhat.com>
 Date: Mon, 15 Jan 2024 14:00:04 +0100
 Subject: [PATCH 3/5] hw/s390x/pv: Restrict Protected Virtualization to sysemu
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Thomas Huth <thuth@redhat.com>
 RH-MergeRequest: 348: s390x: Provide some more useful information if decryption of a PV image fails
 RH-Jira: RHEL-18214
 RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
 RH-Acked-by: Cédric Le Goater <clg@redhat.com>
 RH-Commit: [3/5] 17b11f9fd2b53c7d33c09a62f28cfca19b18e798
 JIRA: https://issues.redhat.com/browse/RHEL-18214
 commit 3ea7e312671686e616efa1b8caa5f5ce2d06543a
 Author: Philippe Mathieu-Daudé <philmd@linaro.org>
 Date:   Sat Dec 17 16:24:52 2022 +0100
    hw/s390x/pv: Restrict Protected Virtualization to sysemu
    Protected Virtualization is irrelevant in user emulation.
    Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
    Message-Id: <20221217152454.96388-4-philmd@linaro.org>
    Reviewed-by: Thomas Huth <thuth@redhat.com>
    Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Thomas Huth <thuth@redhat.com>
 ---
 target/s390x/cpu_features.c | 4 ++++
 target/s390x/cpu_models.c   | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)
 diff --git a/target/s390x/cpu_features.c b/target/s390x/cpu_features.c
 index 5528acd082..2e4e11d264 100644
 --- a/target/s390x/cpu_features.c
 +++ b/target/s390x/cpu_features.c
@@ -14,7 +14,9 @@
 #include "qemu/osdep.h"
 #include "qemu/module.h"
 #include "cpu_features.h"
 +#ifndef CONFIG_USER_ONLY
 #include "hw/s390x/pv.h"
 +#endif
 #define DEF_FEAT(_FEAT, _NAME, _TYPE, _BIT, _DESC) \
     [S390_FEAT_##_FEAT] = {                        \
@@ -107,6 +109,7 @@ void s390_fill_feat_block(const S390FeatBitmap features, S390FeatType type,
         feat = find_next_bit(features, S390_FEAT_MAX, feat + 1);
     }
 +#ifndef CONFIG_USER_ONLY
     if (!s390_is_pv()) {
         return;
     }
@@ -147,6 +150,7 @@ void s390_fill_feat_block(const S390FeatBitmap features, S390FeatType type,
     default:
         return;
     }
 +#endif
 }
 void s390_add_from_feat_block(S390FeatBitmap features, S390FeatType type,
 diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c
 index 454485e706..e7c586c76e 100644
 --- a/target/s390x/cpu_models.c
 +++ b/target/s390x/cpu_models.c
@@ -22,8 +22,8 @@
 #include "qemu/qemu-print.h"
 #ifndef CONFIG_USER_ONLY
 #include "sysemu/sysemu.h"
 -#endif
 #include "hw/s390x/pv.h"
 +#endif
 #define CPUDEF_INIT(_type, _gen, _ec_ga, _mha_pow, _hmfai, _name, _desc) \
     {                                                                    \
@@ -236,6 +236,7 @@ bool s390_has_feat(S390Feat feat)
         return 0;
     }
 +#ifndef CONFIG_USER_ONLY
     if (s390_is_pv()) {
         switch (feat) {
         case S390_FEAT_DIAG_318:
@@ -259,6 +260,7 @@ bool s390_has_feat(S390Feat feat)
             break;
         }
     }
 +#endif
     return test_bit(feat, cpu->model->features);
 }
 -- 
 2.41.0
--- a/SOURCES/kvm-hw-scsi-lsi53c895a-Fix-reentrancy-issues-in-the-LSI-.patch
+++ b/SOURCES/kvm-hw-scsi-lsi53c895a-Fix-reentrancy-issues-in-the-LSI-.patch
@ -0,0 +1,260 @@
 From 57a26ba1c4053cdc426653f921e66f7a8efd3ce7 Mon Sep 17 00:00:00 2001
 From: Thomas Huth <thuth@redhat.com>
 Date: Mon, 22 May 2023 11:10:11 +0200
 Subject: [PATCH 12/15] hw/scsi/lsi53c895a: Fix reentrancy issues in the LSI
 controller (CVE-2023-0330)
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 277: memory: prevent dma-reentracy issues
 RH-Bugzilla: 1999236
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [12/12] 28f5e04344109d8514869c50468bef481437201d (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1999236
 Upstream: Merged
 CVE: CVE-2021-3750
 commit b987718bbb1d0eabf95499b976212dd5f0120d75
 Author: Thomas Huth <thuth@redhat.com>
 Date:   Mon May 22 11:10:11 2023 +0200
    hw/scsi/lsi53c895a: Fix reentrancy issues in the LSI controller (CVE-2023-0330)
    We cannot use the generic reentrancy guard in the LSI code, so
    we have to manually prevent endless reentrancy here. The problematic
    lsi_execute_script() function has already a way to detect whether
    too many instructions have been executed - we just have to slightly
    change the logic here that it also takes into account if the function
    has been called too often in a reentrant way.
    The code in fuzz-lsi53c895a-test.c has been taken from an earlier
    patch by Mauro Matteo Cascella.
    Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1563
    Message-Id: <20230522091011.1082574-1-thuth@redhat.com>
    Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
    Reviewed-by: Alexander Bulekov <alxndr@bu.edu>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 hw/scsi/lsi53c895a.c               |  23 +++--
 tests/qtest/fuzz-lsi53c895a-test.c | 161 +++++++++++++++++++++++++++++
 2 files changed, 178 insertions(+), 6 deletions(-)
 create mode 100644 tests/qtest/fuzz-lsi53c895a-test.c
 diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c
 index 2b9cb2ac5d..b60786fd56 100644
 --- a/hw/scsi/lsi53c895a.c
 +++ b/hw/scsi/lsi53c895a.c
@@ -1133,15 +1133,24 @@ static void lsi_execute_script(LSIState *s)
     uint32_t addr, addr_high;
     int opcode;
     int insn_processed = 0;
 +    static int reentrancy_level;
 +
 +    reentrancy_level++;
     s->istat1 |= LSI_ISTAT1_SRUN;
 again:
 -    if (++insn_processed > LSI_MAX_INSN) {
 -        /* Some windows drivers make the device spin waiting for a memory
 -           location to change.  If we have been executed a lot of code then
 -           assume this is the case and force an unexpected device disconnect.
 -           This is apparently sufficient to beat the drivers into submission.
 -         */
 +    /*
 +     * Some windows drivers make the device spin waiting for a memory location
 +     * to change. If we have executed more than LSI_MAX_INSN instructions then
 +     * assume this is the case and force an unexpected device disconnect. This
 +     * is apparently sufficient to beat the drivers into submission.
 +     *
 +     * Another issue (CVE-2023-0330) can occur if the script is programmed to
 +     * trigger itself again and again. Avoid this problem by stopping after
 +     * being called multiple times in a reentrant way (8 is an arbitrary value
 +     * which should be enough for all valid use cases).
 +     */
 +    if (++insn_processed > LSI_MAX_INSN || reentrancy_level > 8) {
         if (!(s->sien0 & LSI_SIST0_UDC)) {
             qemu_log_mask(LOG_GUEST_ERROR,
                           "lsi_scsi: inf. loop with UDC masked");
@@ -1595,6 +1604,8 @@ again:
         }
     }
     trace_lsi_execute_script_stop();
 +
 +    reentrancy_level--;
 }
 static uint8_t lsi_reg_readb(LSIState *s, int offset)
 diff --git a/tests/qtest/fuzz-lsi53c895a-test.c b/tests/qtest/fuzz-lsi53c895a-test.c
 new file mode 100644
 index 0000000000..1b55928b9f
 --- /dev/null
 +++ b/tests/qtest/fuzz-lsi53c895a-test.c
@@ -0,0 +1,161 @@
 +/* SPDX-License-Identifier: GPL-2.0-or-later */
 +/*
 + * QTest fuzzer-generated testcase for LSI53C895A device
 + *
 + * Copyright (c) Red Hat
 + */
 +
 +#include "qemu/osdep.h"
 +#include "libqtest.h"
 +
 +/*
 + * This used to trigger a DMA reentrancy issue
 + * leading to memory corruption bugs like stack
 + * overflow or use-after-free
 + * https://gitlab.com/qemu-project/qemu/-/issues/1563
 + */
 +static void test_lsi_dma_reentrancy(void)
 +{
 +    QTestState *s;
 +
 +    s = qtest_init("-M q35 -m 512M -nodefaults "
 +                   "-blockdev driver=null-co,node-name=null0 "
 +                   "-device lsi53c810 -device scsi-cd,drive=null0");
 +
 +    qtest_outl(s, 0xcf8, 0x80000804); /* PCI Command Register */
 +    qtest_outw(s, 0xcfc, 0x7);        /* Enables accesses */
 +    qtest_outl(s, 0xcf8, 0x80000814); /* Memory Bar 1 */
 +    qtest_outl(s, 0xcfc, 0xff100000); /* Set MMIO Address*/
 +    qtest_outl(s, 0xcf8, 0x80000818); /* Memory Bar 2 */
 +    qtest_outl(s, 0xcfc, 0xff000000); /* Set RAM Address*/
 +    qtest_writel(s, 0xff000000, 0xc0000024);
 +    qtest_writel(s, 0xff000114, 0x00000080);
 +    qtest_writel(s, 0xff00012c, 0xff000000);
 +    qtest_writel(s, 0xff000004, 0xff000114);
 +    qtest_writel(s, 0xff000008, 0xff100014);
 +    qtest_writel(s, 0xff10002f, 0x000000ff);
 +
 +    qtest_quit(s);
 +}
 +
 +/*
 + * This used to trigger a UAF in lsi_do_msgout()
 + * https://gitlab.com/qemu-project/qemu/-/issues/972
 + */
 +static void test_lsi_do_msgout_cancel_req(void)
 +{
 +    QTestState *s;
 +
 +    if (sizeof(void *) == 4) {
 +        g_test_skip("memory size too big for 32-bit build");
 +        return;
 +    }
 +
 +    s = qtest_init("-M q35 -m 2G -nodefaults "
 +                   "-device lsi53c895a,id=scsi "
 +                   "-device scsi-hd,drive=disk0 "
 +                   "-drive file=null-co://,id=disk0,if=none,format=raw");
 +
 +    qtest_outl(s, 0xcf8, 0x80000810);
 +    qtest_outl(s, 0xcf8, 0xc000);
 +    qtest_outl(s, 0xcf8, 0x80000810);
 +    qtest_outw(s, 0xcfc, 0x7);
 +    qtest_outl(s, 0xcf8, 0x80000810);
 +    qtest_outl(s, 0xcfc, 0xc000);
 +    qtest_outl(s, 0xcf8, 0x80000804);
 +    qtest_outw(s, 0xcfc, 0x05);
 +    qtest_writeb(s, 0x69736c10, 0x08);
 +    qtest_writeb(s, 0x69736c13, 0x58);
 +    qtest_writeb(s, 0x69736c1a, 0x01);
 +    qtest_writeb(s, 0x69736c1b, 0x06);
 +    qtest_writeb(s, 0x69736c22, 0x01);
 +    qtest_writeb(s, 0x69736c23, 0x07);
 +    qtest_writeb(s, 0x69736c2b, 0x02);
 +    qtest_writeb(s, 0x69736c48, 0x08);
 +    qtest_writeb(s, 0x69736c4b, 0x58);
 +    qtest_writeb(s, 0x69736c52, 0x04);
 +    qtest_writeb(s, 0x69736c53, 0x06);
 +    qtest_writeb(s, 0x69736c5b, 0x02);
 +    qtest_outl(s, 0xc02d, 0x697300);
 +    qtest_writeb(s, 0x5a554662, 0x01);
 +    qtest_writeb(s, 0x5a554663, 0x07);
 +    qtest_writeb(s, 0x5a55466a, 0x10);
 +    qtest_writeb(s, 0x5a55466b, 0x22);
 +    qtest_writeb(s, 0x5a55466c, 0x5a);
 +    qtest_writeb(s, 0x5a55466d, 0x5a);
 +    qtest_writeb(s, 0x5a55466e, 0x34);
 +    qtest_writeb(s, 0x5a55466f, 0x5a);
 +    qtest_writeb(s, 0x5a345a5a, 0x77);
 +    qtest_writeb(s, 0x5a345a5b, 0x55);
 +    qtest_writeb(s, 0x5a345a5c, 0x51);
 +    qtest_writeb(s, 0x5a345a5d, 0x27);
 +    qtest_writeb(s, 0x27515577, 0x41);
 +    qtest_outl(s, 0xc02d, 0x5a5500);
 +    qtest_writeb(s, 0x364001d0, 0x08);
 +    qtest_writeb(s, 0x364001d3, 0x58);
 +    qtest_writeb(s, 0x364001da, 0x01);
 +    qtest_writeb(s, 0x364001db, 0x26);
 +    qtest_writeb(s, 0x364001dc, 0x0d);
 +    qtest_writeb(s, 0x364001dd, 0xae);
 +    qtest_writeb(s, 0x364001de, 0x41);
 +    qtest_writeb(s, 0x364001df, 0x5a);
 +    qtest_writeb(s, 0x5a41ae0d, 0xf8);
 +    qtest_writeb(s, 0x5a41ae0e, 0x36);
 +    qtest_writeb(s, 0x5a41ae0f, 0xd7);
 +    qtest_writeb(s, 0x5a41ae10, 0x36);
 +    qtest_writeb(s, 0x36d736f8, 0x0c);
 +    qtest_writeb(s, 0x36d736f9, 0x80);
 +    qtest_writeb(s, 0x36d736fa, 0x0d);
 +    qtest_outl(s, 0xc02d, 0x364000);
 +
 +    qtest_quit(s);
 +}
 +
 +/*
 + * This used to trigger the assert in lsi_do_dma()
 + * https://bugs.launchpad.net/qemu/+bug/697510
 + * https://bugs.launchpad.net/qemu/+bug/1905521
 + * https://bugs.launchpad.net/qemu/+bug/1908515
 + */
 +static void test_lsi_do_dma_empty_queue(void)
 +{
 +    QTestState *s;
 +
 +    s = qtest_init("-M q35 -nographic -monitor none -serial none "
 +                   "-drive if=none,id=drive0,"
 +                            "file=null-co://,file.read-zeroes=on,format=raw "
 +                   "-device lsi53c895a,id=scsi0 "
 +                   "-device scsi-hd,drive=drive0,"
 +                            "bus=scsi0.0,channel=0,scsi-id=0,lun=0");
 +    qtest_outl(s, 0xcf8, 0x80001814);
 +    qtest_outl(s, 0xcfc, 0xe1068000);
 +    qtest_outl(s, 0xcf8, 0x80001818);
 +    qtest_outl(s, 0xcf8, 0x80001804);
 +    qtest_outw(s, 0xcfc, 0x7);
 +    qtest_outl(s, 0xcf8, 0x80002010);
 +
 +    qtest_writeb(s, 0xe106802e, 0xff); /* Fill DSP bits 16-23 */
 +    qtest_writeb(s, 0xe106802f, 0xff); /* Fill DSP bits 24-31: trigger SCRIPT */
 +
 +    qtest_quit(s);
 +}
 +
 +int main(int argc, char **argv)
 +{
 +    g_test_init(&argc, &argv, NULL);
 +
 +    if (!qtest_has_device("lsi53c895a")) {
 +        return 0;
 +    }
 +
 +    qtest_add_func("fuzz/lsi53c895a/lsi_do_dma_empty_queue",
 +                   test_lsi_do_dma_empty_queue);
 +
 +    qtest_add_func("fuzz/lsi53c895a/lsi_do_msgout_cancel_req",
 +                   test_lsi_do_msgout_cancel_req);
 +
 +    qtest_add_func("fuzz/lsi53c895a/lsi_dma_reentrancy",
 +                   test_lsi_dma_reentrancy);
 +
 +    return g_test_run();
 +}
 -- 
 2.37.3
--- a/SOURCES/kvm-hw-virtio-Introduce-virtio_bh_new_guarded-helper.patch
+++ b/SOURCES/kvm-hw-virtio-Introduce-virtio_bh_new_guarded-helper.patch
@ -0,0 +1,86 @@
 From 1b62d61c495bf4cd3a819ab8d1ef024d153e0ece Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Thu, 18 Jul 2024 09:40:29 -0400
 Subject: [PATCH 3/6] hw/virtio: Introduce virtio_bh_new_guarded() helper
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 380: QEMU: virtio: DMA reentrancy issue leads to double free vulnerability
 RH-Jira: RHEL-32276
 RH-Acked-by: Gerd Hoffmann <None>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [3/6] 1cbde7ddb8393b72e2e8d457b5e2d739116567a9 (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 JIRA: https://issues.redhat.com/browse/RHEL-32276
 CVE: CVE-2024-3446
 Upstream: Merged
 commit ec0504b989ca61e03636384d3602b7bf07ffe4da
 Author: Philippe Mathieu-Daudé <philmd@linaro.org>
 Date:   Thu Apr 4 20:56:11 2024 +0200
    hw/virtio: Introduce virtio_bh_new_guarded() helper
    Introduce virtio_bh_new_guarded(), similar to qemu_bh_new_guarded()
    but using the transport memory guard, instead of the device one
    (there can only be one virtio device per virtio bus).
    Inspired-by: Gerd Hoffmann <kraxel@redhat.com>
    Reviewed-by: Gerd Hoffmann <kraxel@redhat.com>
    Acked-by: Michael S. Tsirkin <mst@redhat.com>
    Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
    Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
    Message-Id: <20240409105537.18308-2-philmd@linaro.org>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 hw/virtio/virtio.c         | 10 ++++++++++
 include/hw/virtio/virtio.h |  7 +++++++
 2 files changed, 17 insertions(+)
 diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
 index ea7c079fb0..5ae9c44841 100644
 --- a/hw/virtio/virtio.c
 +++ b/hw/virtio/virtio.c
@@ -3874,3 +3874,13 @@ static void virtio_register_types(void)
 }
 type_init(virtio_register_types)
 +
 +QEMUBH *virtio_bh_new_guarded_full(DeviceState *dev,
 +                                   QEMUBHFunc *cb, void *opaque,
 +                                   const char *name)
 +{
 +    DeviceState *transport = qdev_get_parent_bus(dev)->parent;
 +
 +    return qemu_bh_new_full(cb, opaque, name,
 +                            &transport->mem_reentrancy_guard);
 +}
 diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
 index 8bab9cfb75..731c631a81 100644
 --- a/include/hw/virtio/virtio.h
 +++ b/include/hw/virtio/virtio.h
@@ -22,6 +22,7 @@
 #include "standard-headers/linux/virtio_config.h"
 #include "standard-headers/linux/virtio_ring.h"
 #include "qom/object.h"
 +#include "block/aio.h"
 /* A guest should never accept this.  It implies negotiation is broken. */
 #define VIRTIO_F_BAD_FEATURE		30
@@ -397,4 +398,10 @@ static inline bool virtio_device_disabled(VirtIODevice *vdev)
 bool virtio_legacy_allowed(VirtIODevice *vdev);
 bool virtio_legacy_check_disabled(VirtIODevice *vdev);
 +QEMUBH *virtio_bh_new_guarded_full(DeviceState *dev,
 +                                   QEMUBHFunc *cb, void *opaque,
 +                                   const char *name);
 +#define virtio_bh_new_guarded(dev, cb, opaque) \
 +    virtio_bh_new_guarded_full((dev), (cb), (opaque), (stringify(cb)))
 +
 #endif
 -- 
 2.39.3
--- a/SOURCES/kvm-hw-virtio-virtio-crypto-Protect-from-DMA-re-entrancy.patch
+++ b/SOURCES/kvm-hw-virtio-virtio-crypto-Protect-from-DMA-re-entrancy.patch
@ -0,0 +1,62 @@
 From 2ecbd673a0e2191821ce88128587f709936ad765 Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Thu, 18 Jul 2024 09:21:27 -0400
 Subject: [PATCH 6/6] hw/virtio/virtio-crypto: Protect from DMA re-entrancy
 bugs
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 380: QEMU: virtio: DMA reentrancy issue leads to double free vulnerability
 RH-Jira: RHEL-32276
 RH-Acked-by: Gerd Hoffmann <None>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [6/6] 975ac4640fd8e7cbf3820757787ee7b1270173be (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 JIRA: https://issues.redhat.com/browse/RHEL-32276
 CVE: CVE-2024-3446
 Upstream: Merged
 commit f4729ec39ad97a42ceaa7b5697f84f440ea6e5dc
 Author: Philippe Mathieu-Daudé <philmd@linaro.org>
 Date:   Thu Apr 4 20:56:41 2024 +0200
    hw/virtio/virtio-crypto: Protect from DMA re-entrancy bugs
    Replace qemu_bh_new_guarded() by virtio_bh_new_guarded()
    so the bus and device use the same guard. Otherwise the
    DMA-reentrancy protection can be bypassed.
    Fixes: CVE-2024-3446
    Cc: qemu-stable@nongnu.org
    Suggested-by: Alexander Bulekov <alxndr@bu.edu>
    Reviewed-by: Gerd Hoffmann <kraxel@redhat.com>
    Acked-by: Michael S. Tsirkin <mst@redhat.com>
    Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
    Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
    Message-Id: <20240409105537.18308-5-philmd@linaro.org>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 hw/virtio/virtio-crypto.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
 index 1be7bb543c..1741d4aba1 100644
 --- a/hw/virtio/virtio-crypto.c
 +++ b/hw/virtio/virtio-crypto.c
@@ -817,8 +817,8 @@ static void virtio_crypto_device_realize(DeviceState *dev, Error **errp)
         vcrypto->vqs[i].dataq =
                  virtio_add_queue(vdev, 1024, virtio_crypto_handle_dataq_bh);
         vcrypto->vqs[i].dataq_bh =
 -                 qemu_bh_new_guarded(virtio_crypto_dataq_bh, &vcrypto->vqs[i],
 -                                     &dev->mem_reentrancy_guard);
 +                 virtio_bh_new_guarded(dev, virtio_crypto_dataq_bh,
 +                                       &vcrypto->vqs[i]);
         vcrypto->vqs[i].vcrypto = vcrypto;
     }
 -- 
 2.39.3
--- a/SOURCES/kvm-i386-cpu-Update-how-the-EBX-register-of-CPUID-0x8000.patch
+++ b/SOURCES/kvm-i386-cpu-Update-how-the-EBX-register-of-CPUID-0x8000.patch
@ -0,0 +1,53 @@
 From 18ac13c7d64266238bd44b2188e0d044af3c3377 Mon Sep 17 00:00:00 2001
 From: Bandan Das <bsd@redhat.com>
 Date: Thu, 3 Aug 2023 15:14:14 -0400
 Subject: [PATCH 4/5] i386/cpu: Update how the EBX register of CPUID 0x8000001F
 is set
 RH-Author: Bandan Das <None>
 RH-MergeRequest: 296: Updates to SEV reduced-phys-bits parameter
 RH-Bugzilla: 2214840
 RH-Acked-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
 RH-Commit: [4/4] 8b236fd9bc4c177bfacf6220a429e711b5bf062e
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2214840
 commit fb6bbafc0f19385fb257ee073ed13dcaf613f2f8
 Author: Tom Lendacky <thomas.lendacky@amd.com>
 Date:   Fri Sep 30 10:14:30 2022 -0500
    i386/cpu: Update how the EBX register of CPUID 0x8000001F is set
    Update the setting of CPUID 0x8000001F EBX to clearly document the ranges
    associated with fields being set.
    Fixes: 6cb8f2a663 ("cpu/i386: populate CPUID 0x8000_001F when SEV is active")
    Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
    Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
    Message-Id: <5822fd7d02b575121380e1f493a8f6d9eba2b11a.1664550870.git.thomas.lendacky@amd.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Bandan Das <bsd@redhat.com>
 ---
 target/i386/cpu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 diff --git a/target/i386/cpu.c b/target/i386/cpu.c
 index 9d3dcdcc0d..265f0aadfc 100644
 --- a/target/i386/cpu.c
 +++ b/target/i386/cpu.c
@@ -5836,8 +5836,8 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
         if (sev_enabled()) {
             *eax = 0x2;
             *eax |= sev_es_enabled() ? 0x8 : 0;
 -            *ebx = sev_get_cbit_position();
 -            *ebx |= sev_get_reduced_phys_bits() << 6;
 +            *ebx = sev_get_cbit_position() & 0x3f; /* EBX[5:0] */
 +            *ebx |= (sev_get_reduced_phys_bits() & 0x3f) << 6; /* EBX[11:6] */
         }
         break;
     default:
 -- 
 2.37.3
--- a/SOURCES/kvm-i386-sev-Update-checks-and-information-related-to-re.patch
+++ b/SOURCES/kvm-i386-sev-Update-checks-and-information-related-to-re.patch
@ -0,0 +1,78 @@
 From 19504ea76b6341c11213316402bb5194487e1f01 Mon Sep 17 00:00:00 2001
 From: Bandan Das <bsd@redhat.com>
 Date: Thu, 3 Aug 2023 15:13:19 -0400
 Subject: [PATCH 3/5] i386/sev: Update checks and information related to
 reduced-phys-bits
 RH-Author: Bandan Das <None>
 RH-MergeRequest: 296: Updates to SEV reduced-phys-bits parameter
 RH-Bugzilla: 2214840
 RH-Acked-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
 RH-Commit: [3/4] b617173d2b15fa39cdc02b5c1ac4d52e9b0dfede
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2214840
 commit 8168fed9f84e3128f7628969ae78af49433d5ce7
 Author: Tom Lendacky <thomas.lendacky@amd.com>
 Date:   Fri Sep 30 10:14:29 2022 -0500
    i386/sev: Update checks and information related to reduced-phys-bits
    The value of the reduced-phys-bits parameter is propagated to the CPUID
    information exposed to the guest. Update the current validation check to
    account for the size of the CPUID field (6-bits), ensuring the value is
    in the range of 1 to 63.
    Maintain backward compatibility, to an extent, by allowing a value greater
    than 1 (so that the previously documented value of 5 still works), but not
    allowing anything over 63.
    Fixes: d8575c6c02 ("sev/i386: add command to initialize the memory encryption context")
    Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
    Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
    Message-Id: <cca5341a95ac73f904e6300f10b04f9c62e4e8ff.1664550870.git.thomas.lendacky@amd.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Bandan Das <bsd@redhat.com>
 ---
 target/i386/sev.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)
 diff --git a/target/i386/sev.c b/target/i386/sev.c
 index 025ff7a6f8..ba6a65e90c 100644
 --- a/target/i386/sev.c
 +++ b/target/i386/sev.c
@@ -892,15 +892,26 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
     host_cpuid(0x8000001F, 0, NULL, &ebx, NULL, NULL);
     host_cbitpos = ebx & 0x3f;
 +    /*
 +     * The cbitpos value will be placed in bit positions 5:0 of the EBX
 +     * register of CPUID 0x8000001F. No need to verify the range as the
 +     * comparison against the host value accomplishes that.
 +     */
     if (host_cbitpos != sev->cbitpos) {
         error_setg(errp, "%s: cbitpos check failed, host '%d' requested '%d'",
                    __func__, host_cbitpos, sev->cbitpos);
         goto err;
     }
 -    if (sev->reduced_phys_bits < 1) {
 -        error_setg(errp, "%s: reduced_phys_bits check failed, it should be >=1,"
 -                   " requested '%d'", __func__, sev->reduced_phys_bits);
 +    /*
 +     * The reduced-phys-bits value will be placed in bit positions 11:6 of
 +     * the EBX register of CPUID 0x8000001F, so verify the supplied value
 +     * is in the range of 1 to 63.
 +     */
 +    if (sev->reduced_phys_bits < 1 || sev->reduced_phys_bits > 63) {
 +        error_setg(errp, "%s: reduced_phys_bits check failed,"
 +                   " it should be in the range of 1 to 63, requested '%d'",
 +                   __func__, sev->reduced_phys_bits);
         goto err;
     }
 -- 
 2.37.3
--- a/SOURCES/kvm-io-Add-support-for-MSG_PEEK-for-socket-channel.patch
+++ b/SOURCES/kvm-io-Add-support-for-MSG_PEEK-for-socket-channel.patch
@ -0,0 +1,367 @@
 From 88b5e059462a72ca758d84c0d4d0895a03baac50 Mon Sep 17 00:00:00 2001
 From: "manish.mishra" <manish.mishra@nutanix.com>
 Date: Tue, 20 Dec 2022 18:44:17 +0000
 Subject: [PATCH 1/3] io: Add support for MSG_PEEK for socket channel
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Peter Xu <peterx@redhat.com>
 RH-MergeRequest: 258: migration: Fix multifd crash due to channel disorder
 RH-Bugzilla: 2137740
 RH-Acked-by: quintela1 <quintela@redhat.com>
 RH-Acked-by: Leonardo Brás <leobras@redhat.com>
 RH-Acked-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 RH-Commit: [1/2] 04fc6fae358599b8509f5355469d2e8720f01903
 Conflicts:
 	io/channel-null.c
 	migration/channel-block.c
        Because these two files do not exist in rhel8.8 tree, dropping the
        changes.
 MSG_PEEK peeks at the channel; the data is treated as unread and
 the next read shall still return this data. This support is
 currently added only for socket class. Extra parameter 'flags'
 is added to io_readv calls to pass extra read flags like MSG_PEEK.
 Reviewed-by: Peter Xu <peterx@redhat.com>
 Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
 Reviewed-by: Juan Quintela <quintela@redhat.com>
 Suggested-by: Daniel P. Berrange <berrange@redhat.com>
 Signed-off-by: manish.mishra <manish.mishra@nutanix.com>
 Signed-off-by: Juan Quintela <quintela@redhat.com>
 (cherry picked from commit 84615a19ddf2bfb38d7b3a0d487d2397ee55e4f3)
 Signed-off-by: Peter Xu <peterx@redhat.com>
 ---
 chardev/char-socket.c               |  4 ++--
 include/io/channel.h                |  6 ++++++
 io/channel-buffer.c                 |  1 +
 io/channel-command.c                |  1 +
 io/channel-file.c                   |  1 +
 io/channel-socket.c                 | 19 ++++++++++++++++++-
 io/channel-tls.c                    |  1 +
 io/channel-websock.c                |  1 +
 io/channel.c                        | 16 ++++++++++++----
 migration/rdma.c                    |  1 +
 scsi/qemu-pr-helper.c               |  2 +-
 tests/qtest/tpm-emu.c               |  2 +-
 tests/unit/test-io-channel-socket.c |  1 +
 util/vhost-user-server.c            |  2 +-
 14 files changed, 48 insertions(+), 10 deletions(-)
 diff --git a/chardev/char-socket.c b/chardev/char-socket.c
 index 836cfa0bc2..4cdf79e0c2 100644
 --- a/chardev/char-socket.c
 +++ b/chardev/char-socket.c
@@ -339,11 +339,11 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, size_t len)
     if (qio_channel_has_feature(s->ioc, QIO_CHANNEL_FEATURE_FD_PASS)) {
         ret = qio_channel_readv_full(s->ioc, &iov, 1,
                                      &msgfds, &msgfds_num,
 -                                     NULL);
 +                                     0, NULL);
     } else {
         ret = qio_channel_readv_full(s->ioc, &iov, 1,
                                      NULL, NULL,
 -                                     NULL);
 +                                     0, NULL);
     }
     if (ret == QIO_CHANNEL_ERR_BLOCK) {
 diff --git a/include/io/channel.h b/include/io/channel.h
 index c680ee7480..716235d496 100644
 --- a/include/io/channel.h
 +++ b/include/io/channel.h
@@ -34,6 +34,8 @@ OBJECT_DECLARE_TYPE(QIOChannel, QIOChannelClass,
 #define QIO_CHANNEL_WRITE_FLAG_ZERO_COPY 0x1
 +#define QIO_CHANNEL_READ_FLAG_MSG_PEEK 0x1
 +
 typedef enum QIOChannelFeature QIOChannelFeature;
 enum QIOChannelFeature {
@@ -41,6 +43,7 @@ enum QIOChannelFeature {
     QIO_CHANNEL_FEATURE_SHUTDOWN,
     QIO_CHANNEL_FEATURE_LISTEN,
     QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY,
 +    QIO_CHANNEL_FEATURE_READ_MSG_PEEK,
 };
@@ -114,6 +117,7 @@ struct QIOChannelClass {
                         size_t niov,
                         int **fds,
                         size_t *nfds,
 +                        int flags,
                         Error **errp);
     int (*io_close)(QIOChannel *ioc,
                     Error **errp);
@@ -188,6 +192,7 @@ void qio_channel_set_name(QIOChannel *ioc,
  * @niov: the length of the @iov array
  * @fds: pointer to an array that will received file handles
  * @nfds: pointer filled with number of elements in @fds on return
 + * @flags: read flags (QIO_CHANNEL_READ_FLAG_*)
  * @errp: pointer to a NULL-initialized error object
  *
  * Read data from the IO channel, storing it in the
@@ -224,6 +229,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
                                size_t niov,
                                int **fds,
                                size_t *nfds,
 +                               int flags,
                                Error **errp);
 diff --git a/io/channel-buffer.c b/io/channel-buffer.c
 index bf52011be2..8096180f85 100644
 --- a/io/channel-buffer.c
 +++ b/io/channel-buffer.c
@@ -54,6 +54,7 @@ static ssize_t qio_channel_buffer_readv(QIOChannel *ioc,
                                         size_t niov,
                                         int **fds,
                                         size_t *nfds,
 +                                        int flags,
                                         Error **errp)
 {
     QIOChannelBuffer *bioc = QIO_CHANNEL_BUFFER(ioc);
 diff --git a/io/channel-command.c b/io/channel-command.c
 index 5ff1691bad..2834413b3a 100644
 --- a/io/channel-command.c
 +++ b/io/channel-command.c
@@ -230,6 +230,7 @@ static ssize_t qio_channel_command_readv(QIOChannel *ioc,
                                          size_t niov,
                                          int **fds,
                                          size_t *nfds,
 +                                         int flags,
                                          Error **errp)
 {
     QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc);
 diff --git a/io/channel-file.c b/io/channel-file.c
 index 348a48545e..490f0e5d84 100644
 --- a/io/channel-file.c
 +++ b/io/channel-file.c
@@ -86,6 +86,7 @@ static ssize_t qio_channel_file_readv(QIOChannel *ioc,
                                       size_t niov,
                                       int **fds,
                                       size_t *nfds,
 +                                      int flags,
                                       Error **errp)
 {
     QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);
 diff --git a/io/channel-socket.c b/io/channel-socket.c
 index 6010ad7017..ca8b180b69 100644
 --- a/io/channel-socket.c
 +++ b/io/channel-socket.c
@@ -174,6 +174,9 @@ int qio_channel_socket_connect_sync(QIOChannelSocket *ioc,
     }
 #endif
 +    qio_channel_set_feature(QIO_CHANNEL(ioc),
 +                            QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
 +
     return 0;
 }
@@ -407,6 +410,9 @@ qio_channel_socket_accept(QIOChannelSocket *ioc,
     }
 #endif /* WIN32 */
 +    qio_channel_set_feature(QIO_CHANNEL(cioc),
 +                            QIO_CHANNEL_FEATURE_READ_MSG_PEEK);
 +
     trace_qio_channel_socket_accept_complete(ioc, cioc, cioc->fd);
     return cioc;
@@ -497,6 +503,7 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc,
                                         size_t niov,
                                         int **fds,
                                         size_t *nfds,
 +                                        int flags,
                                         Error **errp)
 {
     QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
@@ -518,6 +525,10 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc,
     }
 +    if (flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) {
 +        sflags |= MSG_PEEK;
 +    }
 +
  retry:
     ret = recvmsg(sioc->fd, &msg, sflags);
     if (ret < 0) {
@@ -625,11 +636,17 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc,
                                         size_t niov,
                                         int **fds,
                                         size_t *nfds,
 +                                        int flags,
                                         Error **errp)
 {
     QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
     ssize_t done = 0;
     ssize_t i;
 +    int sflags = 0;
 +
 +    if (flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) {
 +        sflags |= MSG_PEEK;
 +    }
     for (i = 0; i < niov; i++) {
         ssize_t ret;
@@ -637,7 +654,7 @@ static ssize_t qio_channel_socket_readv(QIOChannel *ioc,
         ret = recv(sioc->fd,
                    iov[i].iov_base,
                    iov[i].iov_len,
 -                   0);
 +                   sflags);
         if (ret < 0) {
             if (errno == EAGAIN) {
                 if (done) {
 diff --git a/io/channel-tls.c b/io/channel-tls.c
 index 4ce890a538..c730cb8ec5 100644
 --- a/io/channel-tls.c
 +++ b/io/channel-tls.c
@@ -260,6 +260,7 @@ static ssize_t qio_channel_tls_readv(QIOChannel *ioc,
                                      size_t niov,
                                      int **fds,
                                      size_t *nfds,
 +                                     int flags,
                                      Error **errp)
 {
     QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc);
 diff --git a/io/channel-websock.c b/io/channel-websock.c
 index 035dd6075b..13c94f2afe 100644
 --- a/io/channel-websock.c
 +++ b/io/channel-websock.c
@@ -1081,6 +1081,7 @@ static ssize_t qio_channel_websock_readv(QIOChannel *ioc,
                                          size_t niov,
                                          int **fds,
                                          size_t *nfds,
 +                                         int flags,
                                          Error **errp)
 {
     QIOChannelWebsock *wioc = QIO_CHANNEL_WEBSOCK(ioc);
 diff --git a/io/channel.c b/io/channel.c
 index 0640941ac5..a8c7f11649 100644
 --- a/io/channel.c
 +++ b/io/channel.c
@@ -52,6 +52,7 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
                                size_t niov,
                                int **fds,
                                size_t *nfds,
 +                               int flags,
                                Error **errp)
 {
     QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
@@ -63,7 +64,14 @@ ssize_t qio_channel_readv_full(QIOChannel *ioc,
         return -1;
     }
 -    return klass->io_readv(ioc, iov, niov, fds, nfds, errp);
 +    if ((flags & QIO_CHANNEL_READ_FLAG_MSG_PEEK) &&
 +        !qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
 +        error_setg_errno(errp, EINVAL,
 +                         "Channel does not support peek read");
 +        return -1;
 +    }
 +
 +    return klass->io_readv(ioc, iov, niov, fds, nfds, flags, errp);
 }
@@ -146,7 +154,7 @@ int qio_channel_readv_full_all_eof(QIOChannel *ioc,
     while ((nlocal_iov > 0) || local_fds) {
         ssize_t len;
         len = qio_channel_readv_full(ioc, local_iov, nlocal_iov, local_fds,
 -                                     local_nfds, errp);
 +                                     local_nfds, 0, errp);
         if (len == QIO_CHANNEL_ERR_BLOCK) {
             if (qemu_in_coroutine()) {
                 qio_channel_yield(ioc, G_IO_IN);
@@ -284,7 +292,7 @@ ssize_t qio_channel_readv(QIOChannel *ioc,
                           size_t niov,
                           Error **errp)
 {
 -    return qio_channel_readv_full(ioc, iov, niov, NULL, NULL, errp);
 +    return qio_channel_readv_full(ioc, iov, niov, NULL, NULL, 0, errp);
 }
@@ -303,7 +311,7 @@ ssize_t qio_channel_read(QIOChannel *ioc,
                          Error **errp)
 {
     struct iovec iov = { .iov_base = buf, .iov_len = buflen };
 -    return qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, errp);
 +    return qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, 0, errp);
 }
 diff --git a/migration/rdma.c b/migration/rdma.c
 index 54acd2000e..dcf98bd7f8 100644
 --- a/migration/rdma.c
 +++ b/migration/rdma.c
@@ -2917,6 +2917,7 @@ static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
                                       size_t niov,
                                       int **fds,
                                       size_t *nfds,
 +                                      int flags,
                                       Error **errp)
 {
     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
 diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c
 index f281daeced..12ec8e9368 100644
 --- a/scsi/qemu-pr-helper.c
 +++ b/scsi/qemu-pr-helper.c
@@ -612,7 +612,7 @@ static int coroutine_fn prh_read(PRHelperClient *client, void *buf, int sz,
         iov.iov_base = buf;
         iov.iov_len = sz;
         n_read = qio_channel_readv_full(QIO_CHANNEL(client->ioc), &iov, 1,
 -                                        &fds, &nfds, errp);
 +                                        &fds, &nfds, 0, errp);
         if (n_read == QIO_CHANNEL_ERR_BLOCK) {
             qio_channel_yield(QIO_CHANNEL(client->ioc), G_IO_IN);
 diff --git a/tests/qtest/tpm-emu.c b/tests/qtest/tpm-emu.c
 index 2994d1cf42..3cf1acaf7d 100644
 --- a/tests/qtest/tpm-emu.c
 +++ b/tests/qtest/tpm-emu.c
@@ -106,7 +106,7 @@ void *tpm_emu_ctrl_thread(void *data)
         int *pfd = NULL;
         size_t nfd = 0;
 -        qio_channel_readv_full(ioc, &iov, 1, &pfd, &nfd, &error_abort);
 +        qio_channel_readv_full(ioc, &iov, 1, &pfd, &nfd, 0, &error_abort);
         cmd = be32_to_cpu(cmd);
         g_assert_cmpint(cmd, ==, CMD_SET_DATAFD);
         g_assert_cmpint(nfd, ==, 1);
 diff --git a/tests/unit/test-io-channel-socket.c b/tests/unit/test-io-channel-socket.c
 index 6713886d02..de2930f203 100644
 --- a/tests/unit/test-io-channel-socket.c
 +++ b/tests/unit/test-io-channel-socket.c
@@ -452,6 +452,7 @@ static void test_io_channel_unix_fd_pass(void)
                            G_N_ELEMENTS(iorecv),
                            &fdrecv,
                            &nfdrecv,
 +                           0,
                            &error_abort);
     g_assert(nfdrecv == G_N_ELEMENTS(fdsend));
 diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
 index 783d847a6d..e6a9ef72b7 100644
 --- a/util/vhost-user-server.c
 +++ b/util/vhost-user-server.c
@@ -102,7 +102,7 @@ vu_message_read(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
          * qio_channel_readv_full may have short reads, keeping calling it
          * until getting VHOST_USER_HDR_SIZE or 0 bytes in total
          */
 -        rc = qio_channel_readv_full(ioc, &iov, 1, &fds, &nfds, &local_err);
 +        rc = qio_channel_readv_full(ioc, &iov, 1, &fds, &nfds, 0, &local_err);
         if (rc < 0) {
             if (rc == QIO_CHANNEL_ERR_BLOCK) {
                 assert(local_err == NULL);
 -- 
 2.37.3
--- a/SOURCES/kvm-iotests-244-Don-t-store-data-file-with-protocol-in-i.patch
+++ b/SOURCES/kvm-iotests-244-Don-t-store-data-file-with-protocol-in-i.patch
@ -0,0 +1,68 @@
 From 3cb587f460ec432f329fb83df034bbb7e79e17aa Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Wed, 5 Jun 2024 19:56:51 -0400
 Subject: [PATCH 2/5] iotests/244: Don't store data-file with protocol in image
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 5: EMBARGOED CVE-2024-4467 for rhel-8.10.z (PRDSC)
 RH-Jira: RHEL-35616
 RH-CVE: CVE-2024-4467
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Commit: [2/5] a422cfdba938e1bd857008ccbbddc695011ae0ff
 commit 92e00dab8be1570b13172353d77d2af44cb4e22b
 Author: Kevin Wolf <kwolf@redhat.com>
 Date:   Thu Apr 25 14:49:40 2024 +0200
    iotests/244: Don't store data-file with protocol in image
    We want to disable filename parsing for data files because it's too easy
    to abuse in malicious image files. Make the test ready for the change by
    passing the data file explicitly in command line options.
    Signed-off-by: Kevin Wolf <kwolf@redhat.com>
    Reviewed-by: Eric Blake <eblake@redhat.com>
    Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
    Reviewed-by: Hanna Czenczek <hreitz@redhat.com>
    Upstream: N/A, embargoed
    Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 tests/qemu-iotests/244 | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)
 diff --git a/tests/qemu-iotests/244 b/tests/qemu-iotests/244
 index 3e61fa25bb..bb9cc6512f 100755
 --- a/tests/qemu-iotests/244
 +++ b/tests/qemu-iotests/244
@@ -215,9 +215,22 @@ $QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n -C "$TEST_IMG.src" "$TEST_IMG"
 $QEMU_IMG compare -f $IMGFMT -F $IMGFMT "$TEST_IMG.src" "$TEST_IMG"
 # blkdebug doesn't support copy offloading, so this tests the error path
 -$QEMU_IMG amend -f $IMGFMT -o "data_file=blkdebug::$TEST_IMG.data" "$TEST_IMG"
 -$QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n -C "$TEST_IMG.src" "$TEST_IMG"
 -$QEMU_IMG compare -f $IMGFMT -F $IMGFMT "$TEST_IMG.src" "$TEST_IMG"
 +test_img_with_blkdebug="json:{
 +    'driver': 'qcow2',
 +    'file': {
 +        'driver': 'file',
 +        'filename': '$TEST_IMG'
 +    },
 +    'data-file': {
 +        'driver': 'blkdebug',
 +        'image': {
 +            'driver': 'file',
 +            'filename': '$TEST_IMG.data'
 +        }
 +    }
 +}"
 +$QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n -C "$TEST_IMG.src" "$test_img_with_blkdebug"
 +$QEMU_IMG compare -f $IMGFMT -F $IMGFMT "$TEST_IMG.src" "$test_img_with_blkdebug"
 echo
 echo "=== Flushing should flush the data file ==="
 -- 
 2.39.3
--- a/SOURCES/kvm-iotests-270-Don-t-store-data-file-with-json-prefix-i.patch
+++ b/SOURCES/kvm-iotests-270-Don-t-store-data-file-with-json-prefix-i.patch
@ -0,0 +1,71 @@
 From 59a84673079f9763e9507733e308442397aba703 Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Wed, 5 Jun 2024 19:56:51 -0400
 Subject: [PATCH 3/5] iotests/270: Don't store data-file with json: prefix in
 image
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 5: EMBARGOED CVE-2024-4467 for rhel-8.10.z (PRDSC)
 RH-Jira: RHEL-35616
 RH-CVE: CVE-2024-4467
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Commit: [3/5] ac08690fd3ea3af6e24b2f6a8beedcfe469917a8
 commit 705bcc2819ce8e0f8b9d660a93bc48de26413aec
 Author: Kevin Wolf <kwolf@redhat.com>
 Date:   Thu Apr 25 14:49:40 2024 +0200
    iotests/270: Don't store data-file with json: prefix in image
    We want to disable filename parsing for data files because it's too easy
    to abuse in malicious image files. Make the test ready for the change by
    passing the data file explicitly in command line options.
    Signed-off-by: Kevin Wolf <kwolf@redhat.com>
    Reviewed-by: Eric Blake <eblake@redhat.com>
    Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
    Reviewed-by: Hanna Czenczek <hreitz@redhat.com>
    Upstream: N/A, embargoed
    Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 tests/qemu-iotests/270 | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)
 diff --git a/tests/qemu-iotests/270 b/tests/qemu-iotests/270
 index 74352342db..c37b674aa2 100755
 --- a/tests/qemu-iotests/270
 +++ b/tests/qemu-iotests/270
@@ -60,8 +60,16 @@ _make_test_img -o cluster_size=2M,data_file="$TEST_IMG.orig" \
 # "write" 2G of data without using any space.
 # (qemu-img create does not like it, though, because null-co does not
 # support image creation.)
 -$QEMU_IMG amend -o data_file="json:{'driver':'null-co',,'size':'4294967296'}" \
 -    "$TEST_IMG"
 +test_img_with_null_data="json:{
 +    'driver': '$IMGFMT',
 +    'file': {
 +        'filename': '$TEST_IMG'
 +    },
 +    'data-file': {
 +        'driver': 'null-co',
 +        'size':'4294967296'
 +    }
 +}"
 # This gives us a range of:
 #   2^31 - 512 + 768 - 1 = 2^31 + 255 > 2^31
@@ -74,7 +82,7 @@ $QEMU_IMG amend -o data_file="json:{'driver':'null-co',,'size':'4294967296'}" \
 # on L2 boundaries, we need large L2 tables; hence the cluster size of
 # 2 MB.  (Anything from 256 kB should work, though, because then one L2
 # table covers 8 GB.)
 -$QEMU_IO -c "write 768 $((2 ** 31 - 512))" "$TEST_IMG" | _filter_qemu_io
 +$QEMU_IO -c "write 768 $((2 ** 31 - 512))" "$test_img_with_null_data" | _filter_qemu_io
 _check_test_img
 -- 
 2.39.3
--- a/SOURCES/kvm-iotests-Make-144-deterministic-again.patch
+++ b/SOURCES/kvm-iotests-Make-144-deterministic-again.patch
@ -0,0 +1,82 @@
 From 9b5e69ce5f4ba9541e55d801af16ece4969379e9 Mon Sep 17 00:00:00 2001
 From: Kevin Wolf <kwolf@redhat.com>
 Date: Fri, 9 Feb 2024 18:31:03 +0100
 Subject: [PATCH 4/4] iotests: Make 144 deterministic again
 RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
 RH-MergeRequest: 352: monitor: only run coroutine commands in qemu_aio_context
 RH-Jira: RHEL-7353
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
 RH-Commit: [4/4] 4974a32174abefb509b7c46671a364b4b991449e
 Since commit effd60c8 changed how QMP commands are processed, the order
 of the block-commit return value and job events in iotests 144 wasn't
 fixed any more and caused the test to fail intermittently.
 Change the test to cache events first and then print them in a
 predefined order.
 Waiting three times for JOB_STATUS_CHANGE is a bit uglier than just
 waiting for the JOB_STATUS_CHANGE that has "status": "ready", but the
 tooling we have doesn't seem to allow the latter easily.
 Fixes: effd60c878176bcaf97fa7ce2b12d04bb8ead6f7
 Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2126
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 Message-id: 20240209173103.239994-1-kwolf@redhat.com
 Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
 (cherry picked from commit cc29c12ec629ba68a4a6cb7d165c94cc8502815a)
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
 tests/qemu-iotests/144     | 12 +++++++++++-
 tests/qemu-iotests/144.out |  2 +-
 2 files changed, 12 insertions(+), 2 deletions(-)
 diff --git a/tests/qemu-iotests/144 b/tests/qemu-iotests/144
 index 60e9ddd75f..8c50d6487e 100755
 --- a/tests/qemu-iotests/144
 +++ b/tests/qemu-iotests/144
@@ -83,12 +83,22 @@ echo
 echo === Performing block-commit on active layer ===
 echo
 +capture_events="BLOCK_JOB_READY JOB_STATUS_CHANGE"
 +
 # Block commit on active layer, push the new overlay into base
 _send_qemu_cmd $h "{ 'execute': 'block-commit',
                                 'arguments': {
                                                  'device': 'virtio0'
                                               }
 -                    }" "READY"
 +                    }" "return"
 +
 +_wait_event $h "JOB_STATUS_CHANGE"
 +_wait_event $h "JOB_STATUS_CHANGE"
 +_wait_event $h "JOB_STATUS_CHANGE"
 +
 +_wait_event $h "BLOCK_JOB_READY"
 +
 +capture_events=
 _send_qemu_cmd $h "{ 'execute': 'block-job-complete',
                                 'arguments': {
 diff --git a/tests/qemu-iotests/144.out b/tests/qemu-iotests/144.out
 index b3b4812015..2245ddfa10 100644
 --- a/tests/qemu-iotests/144.out
 +++ b/tests/qemu-iotests/144.out
@@ -25,9 +25,9 @@ Formatting 'TEST_DIR/tmp.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off co
                                                  'device': 'virtio0'
                                               }
                     }
 +{"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "virtio0"}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "virtio0"}}
 -{"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "virtio0"}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "virtio0", "len": 0, "offset": 0, "speed": 0, "type": "commit"}}
 { 'execute': 'block-job-complete',
 -- 
 2.39.3
--- a/SOURCES/kvm-iotests-add-filter_qmp_generated_node_ids.patch
+++ b/SOURCES/kvm-iotests-add-filter_qmp_generated_node_ids.patch
@ -0,0 +1,49 @@
 From f164083416a9d09712b8cb8c654dd3b8988e6c5c Mon Sep 17 00:00:00 2001
 From: Stefan Hajnoczi <stefanha@redhat.com>
 Date: Thu, 18 Jan 2024 09:48:21 -0500
 Subject: [PATCH 1/4] iotests: add filter_qmp_generated_node_ids()
 RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
 RH-MergeRequest: 352: monitor: only run coroutine commands in qemu_aio_context
 RH-Jira: RHEL-7353
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
 RH-Commit: [1/4] cc276c8ef9e140203afc19fcd8b5b8e20577054d
 Add a filter function for QMP responses that contain QEMU's
 automatically generated node ids. The ids change between runs and must
 be masked in the reference output.
 The next commit will use this new function.
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 Message-ID: <20240118144823.1497953-2-stefanha@redhat.com>
 Reviewed-by: Kevin Wolf <kwolf@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 (cherry picked from commit da62b507a20510d819bcfbe8f5e573409b954006)
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 ---
 tests/qemu-iotests/iotests.py | 7 +++++++
 1 file changed, 7 insertions(+)
 diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
 index 2ef493755c..fd41f93421 100644
 --- a/tests/qemu-iotests/iotests.py
 +++ b/tests/qemu-iotests/iotests.py
@@ -521,6 +521,13 @@ def _filter(_key, value):
 def filter_generated_node_ids(msg):
     return re.sub("#block[0-9]+", "NODE_NAME", msg)
 +def filter_qmp_generated_node_ids(qmsg):
 +    def _filter(_key, value):
 +        if is_str(value):
 +            return filter_generated_node_ids(value)
 +        return value
 +    return filter_qmp(qmsg, _filter)
 +
 def filter_img_info(output, filename):
     lines = []
     for line in output.split('\n'):
 -- 
 2.39.3
--- a/SOURCES/kvm-iotests-iov-padding-New-test.patch
+++ b/SOURCES/kvm-iotests-iov-padding-New-test.patch
@ -0,0 +1,187 @@
 From 084e211448f40c3e9d9b1907f6c98dca9f998bc3 Mon Sep 17 00:00:00 2001
 From: Hanna Czenczek <hreitz@redhat.com>
 Date: Tue, 11 Apr 2023 19:34:18 +0200
 Subject: [PATCH 4/5] iotests/iov-padding: New test
 RH-Author: Hanna Czenczek <hreitz@redhat.com>
 RH-MergeRequest: 291: block: Split padded I/O vectors exceeding IOV_MAX
 RH-Bugzilla: 2141964
 RH-Acked-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Commit: [4/5] a80be9c26ebd5503745989cd6823cb4814264258
 Test that even vectored IO requests with 1024 vector elements that are
 not aligned to the device's request alignment will succeed.
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
 Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
 Message-Id: <20230411173418.19549-5-hreitz@redhat.com>
 (cherry picked from commit d7e1905e3f54ff9512db4c7a946a8603b62b108d)
 Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
 ---
 tests/qemu-iotests/tests/iov-padding     | 85 ++++++++++++++++++++++++
 tests/qemu-iotests/tests/iov-padding.out | 59 ++++++++++++++++
 2 files changed, 144 insertions(+)
 create mode 100755 tests/qemu-iotests/tests/iov-padding
 create mode 100644 tests/qemu-iotests/tests/iov-padding.out
 diff --git a/tests/qemu-iotests/tests/iov-padding b/tests/qemu-iotests/tests/iov-padding
 new file mode 100755
 index 0000000000..b9604900c7
 --- /dev/null
 +++ b/tests/qemu-iotests/tests/iov-padding
@@ -0,0 +1,85 @@
 +#!/usr/bin/env bash
 +# group: rw quick
 +#
 +# Check the interaction of request padding (to fit alignment restrictions) with
 +# vectored I/O from the guest
 +#
 +# Copyright Red Hat
 +#
 +# This program is free software; you can redistribute it and/or modify
 +# it under the terms of the GNU General Public License as published by
 +# the Free Software Foundation; either version 2 of the License, or
 +# (at your option) any later version.
 +#
 +# This program is distributed in the hope that it will be useful,
 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 +# GNU General Public License for more details.
 +#
 +# You should have received a copy of the GNU General Public License
 +# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 +#
 +
 +seq=$(basename $0)
 +echo "QA output created by $seq"
 +
 +status=1	# failure is the default!
 +
 +_cleanup()
 +{
 +    _cleanup_test_img
 +}
 +trap "_cleanup; exit \$status" 0 1 2 3 15
 +
 +# get standard environment, filters and checks
 +cd ..
 +. ./common.rc
 +. ./common.filter
 +
 +_supported_fmt raw
 +_supported_proto file
 +
 +_make_test_img 1M
 +
 +IMGSPEC="driver=blkdebug,align=4096,image.driver=file,image.filename=$TEST_IMG"
 +
 +# Four combinations:
 +# - Offset 4096, length 1023 * 512 + 512: Fully aligned to 4k
 +# - Offset 4096, length 1023 * 512 + 4096: Head is aligned, tail is not
 +# - Offset 512, length 1023 * 512 + 512: Neither head nor tail are aligned
 +# - Offset 512, length 1023 * 512 + 4096: Tail is aligned, head is not
 +for start_offset in 4096 512; do
 +    for last_element_length in 512 4096; do
 +        length=$((1023 * 512 + $last_element_length))
 +
 +        echo
 +        echo "== performing 1024-element vectored requests to image (offset: $start_offset; length: $length) =="
 +
 +        # Fill with data for testing
 +        $QEMU_IO -c 'write -P 1 0 1M' "$TEST_IMG" | _filter_qemu_io
 +
 +        # 1023 512-byte buffers, and then one with length $last_element_length
 +        cmd_params="-P 2 $start_offset $(yes 512 | head -n 1023 | tr '\n' ' ') $last_element_length"
 +        QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS_NO_FMT" $QEMU_IO \
 +            -c "writev $cmd_params" \
 +            --image-opts \
 +            "$IMGSPEC" \
 +            | _filter_qemu_io
 +
 +        # Read all patterns -- read the part we just wrote with writev twice,
 +        # once "normally", and once with a readv, so we see that that works, too
 +        QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS_NO_FMT" $QEMU_IO \
 +            -c "read -P 1 0 $start_offset" \
 +            -c "read -P 2 $start_offset $length" \
 +            -c "readv $cmd_params" \
 +            -c "read -P 1 $((start_offset + length)) $((1024 * 1024 - length - start_offset))" \
 +            --image-opts \
 +            "$IMGSPEC" \
 +            | _filter_qemu_io
 +    done
 +done
 +
 +# success, all done
 +echo "*** done"
 +rm -f $seq.full
 +status=0
 diff --git a/tests/qemu-iotests/tests/iov-padding.out b/tests/qemu-iotests/tests/iov-padding.out
 new file mode 100644
 index 0000000000..e07a91fac7
 --- /dev/null
 +++ b/tests/qemu-iotests/tests/iov-padding.out
@@ -0,0 +1,59 @@
 +QA output created by iov-padding
 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576
 +
 +== performing 1024-element vectored requests to image (offset: 4096; length: 524288) ==
 +wrote 1048576/1048576 bytes at offset 0
 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +wrote 524288/524288 bytes at offset 4096
 +512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +read 4096/4096 bytes at offset 0
 +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +read 524288/524288 bytes at offset 4096
 +512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +read 524288/524288 bytes at offset 4096
 +512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +read 520192/520192 bytes at offset 528384
 +508 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +
 +== performing 1024-element vectored requests to image (offset: 4096; length: 527872) ==
 +wrote 1048576/1048576 bytes at offset 0
 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +wrote 527872/527872 bytes at offset 4096
 +515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +read 4096/4096 bytes at offset 0
 +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +read 527872/527872 bytes at offset 4096
 +515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +read 527872/527872 bytes at offset 4096
 +515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +read 516608/516608 bytes at offset 531968
 +504.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +
 +== performing 1024-element vectored requests to image (offset: 512; length: 524288) ==
 +wrote 1048576/1048576 bytes at offset 0
 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +wrote 524288/524288 bytes at offset 512
 +512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +read 512/512 bytes at offset 0
 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +read 524288/524288 bytes at offset 512
 +512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +read 524288/524288 bytes at offset 512
 +512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +read 523776/523776 bytes at offset 524800
 +511.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +
 +== performing 1024-element vectored requests to image (offset: 512; length: 527872) ==
 +wrote 1048576/1048576 bytes at offset 0
 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +wrote 527872/527872 bytes at offset 512
 +515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +read 512/512 bytes at offset 0
 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +read 527872/527872 bytes at offset 512
 +515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +read 527872/527872 bytes at offset 512
 +515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +read 520192/520192 bytes at offset 528384
 +508 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +*** done
 -- 
 2.39.3
--- a/SOURCES/kvm-iotests-port-141-to-Python-for-reliable-QMP-testing.patch
+++ b/SOURCES/kvm-iotests-port-141-to-Python-for-reliable-QMP-testing.patch
@ -0,0 +1,601 @@
 From 968c8ff7ea7d43bf29d8e5f6e9e17f84168c22c4 Mon Sep 17 00:00:00 2001
 From: Stefan Hajnoczi <stefanha@redhat.com>
 Date: Thu, 18 Jan 2024 09:48:22 -0500
 Subject: [PATCH 2/4] iotests: port 141 to Python for reliable QMP testing
 RH-Author: Stefan Hajnoczi <stefanha@redhat.com>
 RH-MergeRequest: 352: monitor: only run coroutine commands in qemu_aio_context
 RH-Jira: RHEL-7353
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
 RH-Commit: [2/4] ff0899262544b1b61b4c7de2eb798b664fe5202e
 The common.qemu bash functions allow tests to interact with the QMP
 monitor of a QEMU process. I spent two days trying to update 141 when
 the order of the test output changed, but found it would still fail
 occasionally because printf() and QMP events race with synchronous QMP
 communication.
 I gave up and ported 141 to the existing Python API for QMP tests. The
 Python API is less affected by the order in which QEMU prints output
 because it does not print all QMP traffic by default.
 The next commit changes the order in which QMP messages are received.
 Make 141 reliable first.
 Cc: Hanna Czenczek <hreitz@redhat.com>
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 Message-ID: <20240118144823.1497953-3-stefanha@redhat.com>
 Reviewed-by: Kevin Wolf <kwolf@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 (cherry picked from commit 9ee2dd4c22a3639c5462b3fc20df60c005c3de64)
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 Conflicts:
  tests/qemu-iotests/141
  tests/qemu-iotests/141.out
  This commit replaces these files anyway, so apply our changes instead
  of dragging in more dependencies to resolve context conflicts.
 ---
 tests/qemu-iotests/141     | 307 ++++++++++++++++---------------------
 tests/qemu-iotests/141.out | 204 ++++++------------------
 2 files changed, 178 insertions(+), 333 deletions(-)
 diff --git a/tests/qemu-iotests/141 b/tests/qemu-iotests/141
 index 115cc1691e..a7d3985a02 100755
 --- a/tests/qemu-iotests/141
 +++ b/tests/qemu-iotests/141
@@ -1,9 +1,12 @@
 -#!/usr/bin/env bash
 +#!/usr/bin/env python3
 # group: rw auto quick
 #
 # Test case for ejecting BDSs with block jobs still running on them
 #
 -# Copyright (C) 2016 Red Hat, Inc.
 +# Originally written in bash by Hanna Czenczek, ported to Python by Stefan
 +# Hajnoczi.
 +#
 +# Copyright Red Hat
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -19,177 +22,129 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 -# creator
 -owner=mreitz@redhat.com
 -
 -seq="$(basename $0)"
 -echo "QA output created by $seq"
 -
 -status=1	# failure is the default!
 -
 -_cleanup()
 -{
 -    _cleanup_qemu
 -    _cleanup_test_img
 -    for img in "$TEST_DIR"/{b,m,o}.$IMGFMT; do
 -        _rm_test_img "$img"
 -    done
 -}
 -trap "_cleanup; exit \$status" 0 1 2 3 15
 -
 -# get standard environment, filters and checks
 -. ./common.rc
 -. ./common.filter
 -. ./common.qemu
 -
 -# Needs backing file and backing format support
 -_supported_fmt qcow2 qed
 -_supported_proto file
 -_supported_os Linux
 -
 -
 -test_blockjob()
 -{
 -    _send_qemu_cmd $QEMU_HANDLE \
 -        "{'execute': 'blockdev-add',
 -          'arguments': {
 -              'node-name': 'drv0',
 -              'driver': '$IMGFMT',
 -              'file': {
 -                  'driver': 'file',
 -                  'filename': '$TEST_IMG'
 -              }}}" \
 -        'return'
 -
 -    # If "$2" is an event, we may or may not see it before the
 -    # {"return": {}}.  Therefore, filter the {"return": {}} out both
 -    # here and in the next command.  (Naturally, if we do not see it
 -    # here, we will see it before the next command can be executed,
 -    # so it will appear in the next _send_qemu_cmd's output.)
 -    _send_qemu_cmd $QEMU_HANDLE \
 -        "$1" \
 -        "$2" \
 -        | _filter_img_create | _filter_qmp_empty_return
 -
 -    # We want this to return an error because the block job is still running
 -    _send_qemu_cmd $QEMU_HANDLE \
 -        "{'execute': 'blockdev-del',
 -          'arguments': {'node-name': 'drv0'}}" \
 -        'error' | _filter_generated_node_ids | _filter_qmp_empty_return
 -
 -    _send_qemu_cmd $QEMU_HANDLE \
 -        "{'execute': 'block-job-cancel',
 -          'arguments': {'device': 'job0'}}" \
 -        "$3"
 -
 -    _send_qemu_cmd $QEMU_HANDLE \
 -        "{'execute': 'blockdev-del',
 -          'arguments': {'node-name': 'drv0'}}" \
 -        'return'
 -}
 -
 -
 -TEST_IMG="$TEST_DIR/b.$IMGFMT" _make_test_img 1M
 -TEST_IMG="$TEST_DIR/m.$IMGFMT" _make_test_img -b "$TEST_DIR/b.$IMGFMT" -F $IMGFMT 1M
 -_make_test_img -b "$TEST_DIR/m.$IMGFMT" 1M -F $IMGFMT
 -
 -_launch_qemu -nodefaults
 -
 -_send_qemu_cmd $QEMU_HANDLE \
 -    "{'execute': 'qmp_capabilities'}" \
 -    'return'
 -
 -echo
 -echo '=== Testing drive-backup ==='
 -echo
 -
 -# drive-backup will not send BLOCK_JOB_READY by itself, and cancelling the job
 -# will consequently result in BLOCK_JOB_CANCELLED being emitted.
 -
 -test_blockjob \
 -    "{'execute': 'drive-backup',
 -      'arguments': {'job-id': 'job0',
 -                    'device': 'drv0',
 -                    'target': '$TEST_DIR/o.$IMGFMT',
 -                    'format': '$IMGFMT',
 -                    'sync': 'none'}}" \
 -    'return' \
 -    '"status": "null"'
 -
 -echo
 -echo '=== Testing drive-mirror ==='
 -echo
 -
 -# drive-mirror will send BLOCK_JOB_READY basically immediately, and cancelling
 -# the job will consequently result in BLOCK_JOB_COMPLETED being emitted.
 -
 -test_blockjob \
 -    "{'execute': 'drive-mirror',
 -      'arguments': {'job-id': 'job0',
 -                    'device': 'drv0',
 -                    'target': '$TEST_DIR/o.$IMGFMT',
 -                    'format': '$IMGFMT',
 -                    'sync': 'none'}}" \
 -    'BLOCK_JOB_READY' \
 -    '"status": "null"'
 -
 -echo
 -echo '=== Testing active block-commit ==='
 -echo
 -
 -# An active block-commit will send BLOCK_JOB_READY basically immediately, and
 -# cancelling the job will consequently result in BLOCK_JOB_COMPLETED being
 -# emitted.
 -
 -test_blockjob \
 -    "{'execute': 'block-commit',
 -      'arguments': {'job-id': 'job0', 'device': 'drv0'}}" \
 -    'BLOCK_JOB_READY' \
 -    '"status": "null"'
 -
 -echo
 -echo '=== Testing non-active block-commit ==='
 -echo
 -
 -# Give block-commit something to work on, otherwise it would be done
 -# immediately, send a BLOCK_JOB_COMPLETED and ejecting the BDS would work just
 -# fine without the block job still running.
 -
 -$QEMU_IO -c 'write 0 1M' "$TEST_DIR/m.$IMGFMT" | _filter_qemu_io
 -
 -test_blockjob \
 -    "{'execute': 'block-commit',
 -      'arguments': {'job-id': 'job0',
 -                    'device': 'drv0',
 -                    'top':    '$TEST_DIR/m.$IMGFMT',
 -                    'speed':  1}}" \
 -    'return' \
 -    '"status": "null"'
 -
 -echo
 -echo '=== Testing block-stream ==='
 -echo
 -
 -# Give block-stream something to work on, otherwise it would be done
 -# immediately, send a BLOCK_JOB_COMPLETED and ejecting the BDS would work just
 -# fine without the block job still running.
 -
 -$QEMU_IO -c 'write 0 1M' "$TEST_DIR/b.$IMGFMT" | _filter_qemu_io
 -
 -# With some data to stream (and @speed set to 1), block-stream will not complete
 -# until we send the block-job-cancel command.
 -
 -test_blockjob \
 -    "{'execute': 'block-stream',
 -      'arguments': {'job-id': 'job0',
 -                    'device': 'drv0',
 -                    'speed': 1}}" \
 -    'return' \
 -    '"status": "null"'
 -
 -_cleanup_qemu
 -
 -# success, all done
 -echo "*** done"
 -rm -f $seq.full
 -status=0
 +import iotests
 +
 +# Common filters to mask values that vary in the test output
 +QMP_FILTERS = [iotests.filter_qmp_testfiles, \
 +               iotests.filter_qmp_imgfmt]
 +
 +
 +class TestCase:
 +    def __init__(self, name, vm, image_path, cancel_event):
 +        self.name = name
 +        self.vm = vm
 +        self.image_path = image_path
 +        self.cancel_event = cancel_event
 +
 +    def __enter__(self):
 +        iotests.log(f'=== Testing {self.name} ===')
 +        self.vm.qmp_log('blockdev-add', \
 +                        node_name='drv0', \
 +                        driver=iotests.imgfmt, \
 +                        file={'driver': 'file', 'filename': self.image_path}, \
 +                        filters=QMP_FILTERS)
 +
 +    def __exit__(self, *exc_details):
 +        # This is expected to fail because the job still exists
 +        self.vm.qmp_log('blockdev-del', node_name='drv0', \
 +                        filters=[iotests.filter_qmp_generated_node_ids])
 +
 +        self.vm.qmp_log('block-job-cancel', device='job0')
 +        event = self.vm.event_wait(self.cancel_event)
 +        iotests.log(event, filters=[iotests.filter_qmp_event])
 +
 +        # This time it succeeds
 +        self.vm.qmp_log('blockdev-del', node_name='drv0')
 +
 +        # Separate test cases in output
 +        iotests.log('')
 +
 +
 +def main() -> None:
 +    with iotests.FilePath('bottom', 'middle', 'top', 'target') as \
 +            (bottom_path, middle_path, top_path, target_path), \
 +         iotests.VM() as vm:
 +
 +        iotests.log('Creating bottom <- middle <- top backing file chain...')
 +        IMAGE_SIZE='1M'
 +        iotests.qemu_img_create('-f', iotests.imgfmt, bottom_path, IMAGE_SIZE)
 +        iotests.qemu_img_create('-f', iotests.imgfmt, \
 +                                '-F', iotests.imgfmt, \
 +                                '-b', bottom_path, \
 +                                middle_path, \
 +                                IMAGE_SIZE)
 +        iotests.qemu_img_create('-f', iotests.imgfmt, \
 +                                '-F', iotests.imgfmt, \
 +                                '-b', middle_path, \
 +                                top_path, \
 +                                IMAGE_SIZE)
 +
 +        iotests.log('Starting VM...')
 +        vm.add_args('-nodefaults')
 +        vm.launch()
 +
 +        # drive-backup will not send BLOCK_JOB_READY by itself, and cancelling
 +        # the job will consequently result in BLOCK_JOB_CANCELLED being
 +        # emitted.
 +        with TestCase('drive-backup', vm, top_path, 'BLOCK_JOB_CANCELLED'):
 +            vm.qmp_log('drive-backup', \
 +                       job_id='job0', \
 +                       device='drv0', \
 +                       target=target_path, \
 +                       format=iotests.imgfmt, \
 +                       sync='none', \
 +                       filters=QMP_FILTERS)
 +
 +        # drive-mirror will send BLOCK_JOB_READY basically immediately, and
 +        # cancelling the job will consequently result in BLOCK_JOB_COMPLETED
 +        # being emitted.
 +        with TestCase('drive-mirror', vm, top_path, 'BLOCK_JOB_COMPLETED'):
 +            vm.qmp_log('drive-mirror', \
 +                       job_id='job0', \
 +                       device='drv0', \
 +                       target=target_path, \
 +                       format=iotests.imgfmt, \
 +                       sync='none', \
 +                       filters=QMP_FILTERS)
 +            event = vm.event_wait('BLOCK_JOB_READY')
 +            assert event is not None # silence mypy
 +            iotests.log(event, filters=[iotests.filter_qmp_event])
 +
 +        # An active block-commit will send BLOCK_JOB_READY basically
 +        # immediately, and cancelling the job will consequently result in
 +        # BLOCK_JOB_COMPLETED being emitted.
 +        with TestCase('active block-commit', vm, top_path, \
 +                      'BLOCK_JOB_COMPLETED'):
 +            vm.qmp_log('block-commit', \
 +                       job_id='job0', \
 +                       device='drv0')
 +            event = vm.event_wait('BLOCK_JOB_READY')
 +            assert event is not None # silence mypy
 +            iotests.log(event, filters=[iotests.filter_qmp_event])
 +
 +        # Give block-commit something to work on, otherwise it would be done
 +        # immediately, send a BLOCK_JOB_COMPLETED and ejecting the BDS would
 +        # work just fine without the block job still running.
 +        iotests.qemu_io(middle_path, '-c', f'write 0 {IMAGE_SIZE}')
 +        with TestCase('non-active block-commit', vm, top_path, \
 +                      'BLOCK_JOB_CANCELLED'):
 +            vm.qmp_log('block-commit', \
 +                       job_id='job0', \
 +                       device='drv0', \
 +                       top=middle_path, \
 +                       speed=1, \
 +                       filters=[iotests.filter_qmp_testfiles])
 +
 +        # Give block-stream something to work on, otherwise it would be done
 +        # immediately, send a BLOCK_JOB_COMPLETED and ejecting the BDS would
 +        # work just fine without the block job still running.
 +        iotests.qemu_io(bottom_path, '-c', f'write 0 {IMAGE_SIZE}')
 +        with TestCase('block-stream', vm, top_path, 'BLOCK_JOB_CANCELLED'):
 +            vm.qmp_log('block-stream', \
 +                       job_id='job0', \
 +                       device='drv0', \
 +                       speed=1)
 +
 +if __name__ == '__main__':
 +    iotests.script_main(main, supported_fmts=['qcow2', 'qed'],
 +                        supported_protocols=['file'])
 diff --git a/tests/qemu-iotests/141.out b/tests/qemu-iotests/141.out
 index c4c15fb275..91b7ba50af 100644
 --- a/tests/qemu-iotests/141.out
 +++ b/tests/qemu-iotests/141.out
@@ -1,179 +1,69 @@
 -QA output created by 141
 -Formatting 'TEST_DIR/b.IMGFMT', fmt=IMGFMT size=1048576
 -Formatting 'TEST_DIR/m.IMGFMT', fmt=IMGFMT size=1048576 backing_file=TEST_DIR/b.IMGFMT backing_fmt=IMGFMT
 -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 backing_file=TEST_DIR/m.IMGFMT backing_fmt=IMGFMT
 -{'execute': 'qmp_capabilities'}
 -{"return": {}}
 -
 +Creating bottom <- middle <- top backing file chain...
 +Starting VM...
 === Testing drive-backup ===
 -
 -{'execute': 'blockdev-add',
 -          'arguments': {
 -              'node-name': 'drv0',
 -              'driver': 'IMGFMT',
 -              'file': {
 -                  'driver': 'file',
 -                  'filename': 'TEST_DIR/t.IMGFMT'
 -              }}}
 -{"return": {}}
 -{'execute': 'drive-backup',
 -'arguments': {'job-id': 'job0',
 -'device': 'drv0',
 -'target': 'TEST_DIR/o.IMGFMT',
 -'format': 'IMGFMT',
 -'sync': 'none'}}
 -Formatting 'TEST_DIR/o.IMGFMT', fmt=IMGFMT size=1048576 backing_file=TEST_DIR/t.IMGFMT backing_fmt=IMGFMT
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "paused", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job0"}}
 -{'execute': 'blockdev-del',
 -          'arguments': {'node-name': 'drv0'}}
 +{"execute": "blockdev-add", "arguments": {"driver": "IMGFMT", "file": {"driver": "file", "filename": "TEST_DIR/PID-top"}, "node-name": "drv0"}}
 +{"return": {}}
 +{"execute": "drive-backup", "arguments": {"device": "drv0", "format": "IMGFMT", "job-id": "job0", "sync": "none", "target": "TEST_DIR/PID-target"}}
 +{"return": {}}
 +{"execute": "blockdev-del", "arguments": {"node-name": "drv0"}}
 {"error": {"class": "GenericError", "desc": "Node 'drv0' is busy: node is used as backing hd of 'NODE_NAME'"}}
 -{'execute': 'block-job-cancel',
 -          'arguments': {'device': 'job0'}}
 +{"execute": "block-job-cancel", "arguments": {"device": "job0"}}
 {"return": {}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "aborting", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "job0", "len": 1048576, "offset": 0, "speed": 0, "type": "backup"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job0"}}
 -{'execute': 'blockdev-del',
 -          'arguments': {'node-name': 'drv0'}}
 +{"data": {"device": "job0", "len": 1048576, "offset": 0, "speed": 0, "type": "backup"}, "event": "BLOCK_JOB_CANCELLED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
 +{"execute": "blockdev-del", "arguments": {"node-name": "drv0"}}
 {"return": {}}
 === Testing drive-mirror ===
 -
 -{'execute': 'blockdev-add',
 -          'arguments': {
 -              'node-name': 'drv0',
 -              'driver': 'IMGFMT',
 -              'file': {
 -                  'driver': 'file',
 -                  'filename': 'TEST_DIR/t.IMGFMT'
 -              }}}
 -{"return": {}}
 -{'execute': 'drive-mirror',
 -'arguments': {'job-id': 'job0',
 -'device': 'drv0',
 -'target': 'TEST_DIR/o.IMGFMT',
 -'format': 'IMGFMT',
 -'sync': 'none'}}
 -Formatting 'TEST_DIR/o.IMGFMT', fmt=IMGFMT size=1048576 backing_file=TEST_DIR/t.IMGFMT backing_fmt=IMGFMT
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "job0", "len": 0, "offset": 0, "speed": 0, "type": "mirror"}}
 -{'execute': 'blockdev-del',
 -          'arguments': {'node-name': 'drv0'}}
 +{"execute": "blockdev-add", "arguments": {"driver": "IMGFMT", "file": {"driver": "file", "filename": "TEST_DIR/PID-top"}, "node-name": "drv0"}}
 +{"return": {}}
 +{"execute": "drive-mirror", "arguments": {"device": "drv0", "format": "IMGFMT", "job-id": "job0", "sync": "none", "target": "TEST_DIR/PID-target"}}
 +{"return": {}}
 +{"data": {"device": "job0", "len": 0, "offset": 0, "speed": 0, "type": "mirror"}, "event": "BLOCK_JOB_READY", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
 +{"execute": "blockdev-del", "arguments": {"node-name": "drv0"}}
 {"error": {"class": "GenericError", "desc": "Node 'drv0' is busy: block device is in use by block job: mirror"}}
 -{'execute': 'block-job-cancel',
 -          'arguments': {'device': 'job0'}}
 +{"execute": "block-job-cancel", "arguments": {"device": "job0"}}
 {"return": {}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "job0", "len": 0, "offset": 0, "speed": 0, "type": "mirror"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job0"}}
 -{'execute': 'blockdev-del',
 -          'arguments': {'node-name': 'drv0'}}
 +{"data": {"device": "job0", "len": 0, "offset": 0, "speed": 0, "type": "mirror"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
 +{"execute": "blockdev-del", "arguments": {"node-name": "drv0"}}
 {"return": {}}
 === Testing active block-commit ===
 -
 -{'execute': 'blockdev-add',
 -          'arguments': {
 -              'node-name': 'drv0',
 -              'driver': 'IMGFMT',
 -              'file': {
 -                  'driver': 'file',
 -                  'filename': 'TEST_DIR/t.IMGFMT'
 -              }}}
 -{"return": {}}
 -{'execute': 'block-commit',
 -'arguments': {'job-id': 'job0', 'device': 'drv0'}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "job0", "len": 0, "offset": 0, "speed": 0, "type": "commit"}}
 -{'execute': 'blockdev-del',
 -          'arguments': {'node-name': 'drv0'}}
 +{"execute": "blockdev-add", "arguments": {"driver": "IMGFMT", "file": {"driver": "file", "filename": "TEST_DIR/PID-top"}, "node-name": "drv0"}}
 +{"return": {}}
 +{"execute": "block-commit", "arguments": {"device": "drv0", "job-id": "job0"}}
 +{"return": {}}
 +{"data": {"device": "job0", "len": 0, "offset": 0, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_READY", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
 +{"execute": "blockdev-del", "arguments": {"node-name": "drv0"}}
 {"error": {"class": "GenericError", "desc": "Node 'drv0' is busy: block device is in use by block job: commit"}}
 -{'execute': 'block-job-cancel',
 -          'arguments': {'device': 'job0'}}
 +{"execute": "block-job-cancel", "arguments": {"device": "job0"}}
 {"return": {}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "job0", "len": 0, "offset": 0, "speed": 0, "type": "commit"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job0"}}
 -{'execute': 'blockdev-del',
 -          'arguments': {'node-name': 'drv0'}}
 +{"data": {"device": "job0", "len": 0, "offset": 0, "speed": 0, "type": "commit"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
 +{"execute": "blockdev-del", "arguments": {"node-name": "drv0"}}
 {"return": {}}
 === Testing non-active block-commit ===
 -
 -wrote 1048576/1048576 bytes at offset 0
 -1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 -{'execute': 'blockdev-add',
 -          'arguments': {
 -              'node-name': 'drv0',
 -              'driver': 'IMGFMT',
 -              'file': {
 -                  'driver': 'file',
 -                  'filename': 'TEST_DIR/t.IMGFMT'
 -              }}}
 -{"return": {}}
 -{'execute': 'block-commit',
 -'arguments': {'job-id': 'job0',
 -'device': 'drv0',
 -'top':    'TEST_DIR/m.IMGFMT',
 -'speed':  1}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job0"}}
 -{'execute': 'blockdev-del',
 -          'arguments': {'node-name': 'drv0'}}
 -{"error": {"class": "GenericError", "desc": "Node drv0 is in use"}}
 -{'execute': 'block-job-cancel',
 -          'arguments': {'device': 'job0'}}
 -{"return": {}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "aborting", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "job0", "len": 1048576, "offset": 524288, "speed": 1, "type": "commit"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job0"}}
 -{'execute': 'blockdev-del',
 -          'arguments': {'node-name': 'drv0'}}
 +{"execute": "blockdev-add", "arguments": {"driver": "IMGFMT", "file": {"driver": "file", "filename": "TEST_DIR/PID-top"}, "node-name": "drv0"}}
 +{"return": {}}
 +{"execute": "block-commit", "arguments": {"device": "drv0", "job-id": "job0", "speed": 1, "top": "TEST_DIR/PID-middle"}}
 +{"return": {}}
 +{"execute": "blockdev-del", "arguments": {"node-name": "drv0"}}
 +{"error": {"class": "GenericError", "desc": "Node 'drv0' is busy: block device is in use by block job: commit"}}
 +{"execute": "block-job-cancel", "arguments": {"device": "job0"}}
 +{"return": {}}
 +{"data": {"device": "job0", "len": 1048576, "offset": 524288, "speed": 1, "type": "commit"}, "event": "BLOCK_JOB_CANCELLED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
 +{"execute": "blockdev-del", "arguments": {"node-name": "drv0"}}
 {"return": {}}
 === Testing block-stream ===
 -
 -wrote 1048576/1048576 bytes at offset 0
 -1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 -{'execute': 'blockdev-add',
 -          'arguments': {
 -              'node-name': 'drv0',
 -              'driver': 'IMGFMT',
 -              'file': {
 -                  'driver': 'file',
 -                  'filename': 'TEST_DIR/t.IMGFMT'
 -              }}}
 -{"return": {}}
 -{'execute': 'block-stream',
 -'arguments': {'job-id': 'job0',
 -'device': 'drv0',
 -'speed': 1}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job0"}}
 -{'execute': 'blockdev-del',
 -          'arguments': {'node-name': 'drv0'}}
 +{"execute": "blockdev-add", "arguments": {"driver": "IMGFMT", "file": {"driver": "file", "filename": "TEST_DIR/PID-top"}, "node-name": "drv0"}}
 +{"return": {}}
 +{"execute": "block-stream", "arguments": {"device": "drv0", "job-id": "job0", "speed": 1}}
 +{"return": {}}
 +{"execute": "blockdev-del", "arguments": {"node-name": "drv0"}}
 {"error": {"class": "GenericError", "desc": "Node 'drv0' is busy: block device is in use by block job: stream"}}
 -{'execute': 'block-job-cancel',
 -          'arguments': {'device': 'job0'}}
 +{"execute": "block-job-cancel", "arguments": {"device": "job0"}}
 {"return": {}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "aborting", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_CANCELLED", "data": {"device": "job0", "len": 1048576, "offset": 524288, "speed": 1, "type": "stream"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job0"}}
 -{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job0"}}
 -{'execute': 'blockdev-del',
 -          'arguments': {'node-name': 'drv0'}}
 +{"data": {"device": "job0", "len": 1048576, "offset": 524288, "speed": 1, "type": "stream"}, "event": "BLOCK_JOB_CANCELLED", "timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
 +{"execute": "blockdev-del", "arguments": {"node-name": "drv0"}}
 {"return": {}}
 -*** done
 +
 -- 
 2.39.3
--- a/SOURCES/kvm-iotests-test-NBD-TLS-iothread.patch
+++ b/SOURCES/kvm-iotests-test-NBD-TLS-iothread.patch
@ -0,0 +1,277 @@
 From a0b12780f3cb97abad0a2c54d185c298d3f589e7 Mon Sep 17 00:00:00 2001
 From: Eric Blake <eblake@redhat.com>
 Date: Fri, 17 May 2024 21:50:15 -0500
 Subject: [PATCH 2/3] iotests: test NBD+TLS+iothread
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Eric Blake <eblake@redhat.com>
 RH-MergeRequest: 398: nbd/server: CVE-2024-7409: Avoid use-after-free when closing server
 RH-Jira: RHEL-52611
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Commit: [2/3] f522ff5156086a83a7327c379dd3ccd8b583a421 (ebblake/qemu-kvm)
 Prevent regressions when using NBD with TLS in the presence of
 iothreads, adding coverage for the fix to qio channels made in the
 previous patch.
 The shell function pick_unused_port() was copied from
 nbdkit.git/tests/functions.sh.in, where it had all authors from Red
 Hat, agreeing to the resulting relicensing from 2-clause BSD to GPLv2.
 CC: qemu-stable@nongnu.org
 CC: "Richard W.M. Jones" <rjones@redhat.com>
 Signed-off-by: Eric Blake <eblake@redhat.com>
 Message-ID: <20240531180639.1392905-6-eblake@redhat.com>
 Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
 (cherry picked from commit a73c99378022ebb785481e84cfe1e81097546268)
 Jira: https://issues.redhat.com/browse/RHEL-52611
 Conflicts:
 	tests/qemu-iotests/tests/nbd-tls-iothread{,.out} - drop unknown
          "tls-hostname" parameter
 Signed-off-by: Eric Blake <eblake@redhat.com>
 ---
 tests/qemu-iotests/tests/nbd-tls-iothread     | 167 ++++++++++++++++++
 tests/qemu-iotests/tests/nbd-tls-iothread.out |  53 ++++++
 2 files changed, 220 insertions(+)
 create mode 100755 tests/qemu-iotests/tests/nbd-tls-iothread
 create mode 100644 tests/qemu-iotests/tests/nbd-tls-iothread.out
 diff --git a/tests/qemu-iotests/tests/nbd-tls-iothread b/tests/qemu-iotests/tests/nbd-tls-iothread
 new file mode 100755
 index 0000000000..9e747e2639
 --- /dev/null
 +++ b/tests/qemu-iotests/tests/nbd-tls-iothread
@@ -0,0 +1,167 @@
 +#!/usr/bin/env bash
 +# group: rw quick
 +#
 +# Test of NBD+TLS+iothread
 +#
 +# Copyright (C) 2024 Red Hat, Inc.
 +#
 +# This program is free software; you can redistribute it and/or modify
 +# it under the terms of the GNU General Public License as published by
 +# the Free Software Foundation; either version 2 of the License, or
 +# (at your option) any later version.
 +#
 +# This program is distributed in the hope that it will be useful,
 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 +# GNU General Public License for more details.
 +#
 +# You should have received a copy of the GNU General Public License
 +# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 +#
 +
 +# creator
 +owner=eblake@redhat.com
 +
 +seq=`basename $0`
 +echo "QA output created by $seq"
 +
 +status=1    # failure is the default!
 +
 +_cleanup()
 +{
 +    _cleanup_qemu
 +    _cleanup_test_img
 +    rm -f "$DST_IMG"  # destination image made with $QEMU_IMG create below
 +    tls_x509_cleanup
 +}
 +trap "_cleanup; exit \$status" 0 1 2 3 15
 +
 +# get standard environment, filters and checks
 +cd ..
 +. ./common.rc
 +. ./common.filter
 +. ./common.qemu
 +. ./common.tls
 +. ./common.nbd
 +
 +_supported_fmt qcow2  # Hardcoded to qcow2 command line and QMP below
 +_supported_proto file
 +
 +# pick_unused_port
 +#
 +# Picks and returns an "unused" port, setting the global variable
 +# $port.
 +#
 +# This is inherently racy, but we need it because qemu does not currently
 +# permit NBD+TLS over a Unix domain socket
 +pick_unused_port ()
 +{
 +    if ! (ss --version) >/dev/null 2>&1; then
 +        _notrun "ss utility required, skipped this test"
 +    fi
 +
 +    # Start at a random port to make it less likely that two parallel
 +    # tests will conflict.
 +    port=$(( 50000 + (RANDOM%15000) ))
 +    while ss -ltn | grep -sqE ":$port\b"; do
 +        ((port++))
 +        if [ $port -eq 65000 ]; then port=50000; fi
 +    done
 +    echo picked unused port
 +}
 +
 +tls_x509_init
 +
 +size=1G
 +DST_IMG="$TEST_DIR/dst.qcow2"
 +
 +echo
 +echo "== preparing TLS creds and spare port =="
 +
 +pick_unused_port
 +tls_x509_create_root_ca "ca1"
 +tls_x509_create_server "ca1" "server1"
 +tls_x509_create_client "ca1" "client1"
 +tls_obj_base=tls-creds-x509,id=tls0,verify-peer=true,dir="${tls_dir}"
 +
 +echo
 +echo "== preparing image =="
 +
 +_make_test_img $size
 +$QEMU_IMG create -f qcow2 "$DST_IMG" $size | _filter_img_create
 +
 +echo
 +echo === Starting Src QEMU ===
 +echo
 +
 +_launch_qemu -machine q35 \
 +    -object iothread,id=iothread0 \
 +    -object "${tls_obj_base}"/client1,endpoint=client \
 +    -device '{"driver":"pcie-root-port", "id":"root0", "multifunction":true,
 +              "bus":"pcie.0"}' \
 +    -device '{"driver":"virtio-scsi-pci", "id":"virtio_scsi_pci0",
 +              "bus":"root0", "iothread":"iothread0"}' \
 +    -device '{"driver":"scsi-hd", "id":"image1", "drive":"drive_image1",
 +              "bus":"virtio_scsi_pci0.0"}' \
 +    -blockdev '{"driver":"file", "cache":{"direct":true, "no-flush":false},
 +                "filename":"'"$TEST_IMG"'", "node-name":"drive_sys1"}' \
 +    -blockdev '{"driver":"qcow2", "node-name":"drive_image1",
 +                "file":"drive_sys1"}'
 +h1=$QEMU_HANDLE
 +_send_qemu_cmd $h1 '{"execute": "qmp_capabilities"}' 'return'
 +
 +echo
 +echo === Starting Dst VM2 ===
 +echo
 +
 +_launch_qemu -machine q35 \
 +    -object iothread,id=iothread0 \
 +    -object "${tls_obj_base}"/server1,endpoint=server \
 +    -device '{"driver":"pcie-root-port", "id":"root0", "multifunction":true,
 +              "bus":"pcie.0"}' \
 +    -device '{"driver":"virtio-scsi-pci", "id":"virtio_scsi_pci0",
 +              "bus":"root0", "iothread":"iothread0"}' \
 +    -device '{"driver":"scsi-hd", "id":"image1", "drive":"drive_image1",
 +              "bus":"virtio_scsi_pci0.0"}' \
 +    -blockdev '{"driver":"file", "cache":{"direct":true, "no-flush":false},
 +                "filename":"'"$DST_IMG"'", "node-name":"drive_sys1"}' \
 +    -blockdev '{"driver":"qcow2", "node-name":"drive_image1",
 +                "file":"drive_sys1"}' \
 +    -incoming defer
 +h2=$QEMU_HANDLE
 +_send_qemu_cmd $h2 '{"execute": "qmp_capabilities"}' 'return'
 +
 +echo
 +echo === Dst VM: Enable NBD server for incoming storage migration ===
 +echo
 +
 +_send_qemu_cmd $h2 '{"execute": "nbd-server-start", "arguments":
 +    {"addr": {"type": "inet", "data": {"host": "127.0.0.1", "port": "'$port'"}},
 +              "tls-creds": "tls0"}}' '{"return": {}}' | sed "s/\"$port\"/PORT/g"
 +_send_qemu_cmd $h2 '{"execute": "block-export-add", "arguments":
 +    {"node-name": "drive_image1", "type": "nbd", "writable": true,
 +      "id": "drive_image1"}}' '{"return": {}}'
 +
 +echo
 +echo === Src VM: Mirror to dst NBD for outgoing storage migration ===
 +echo
 +
 +_send_qemu_cmd $h1 '{"execute": "blockdev-add", "arguments":
 +    {"node-name": "mirror", "driver": "nbd",
 +     "server": {"type": "inet", "host": "127.0.0.1", "port": "'$port'"},
 +     "export": "drive_image1", "tls-creds": "tls0"}}' '{"return": {}}' | sed "s/\"$port\"/PORT/g"
 +_send_qemu_cmd $h1 '{"execute": "blockdev-mirror", "arguments":
 +    {"sync": "full", "device": "drive_image1", "target": "mirror",
 +     "job-id": "drive_image1_53"}}' '{"return": {}}'
 +_timed_wait_for $h1 '"ready"'
 +
 +echo
 +echo === Cleaning up ===
 +echo
 +
 +_send_qemu_cmd $h1 '{"execute":"quit"}' ''
 +_send_qemu_cmd $h2 '{"execute":"quit"}' ''
 +
 +echo "*** done"
 +rm -f $seq.full
 +status=0
 diff --git a/tests/qemu-iotests/tests/nbd-tls-iothread.out b/tests/qemu-iotests/tests/nbd-tls-iothread.out
 new file mode 100644
 index 0000000000..a3899fd2d7
 --- /dev/null
 +++ b/tests/qemu-iotests/tests/nbd-tls-iothread.out
@@ -0,0 +1,53 @@
 +QA output created by nbd-tls-iothread
 +
 +== preparing TLS creds and spare port ==
 +picked unused port
 +Generating a self signed certificate...
 +Generating a signed certificate...
 +Generating a signed certificate...
 +
 +== preparing image ==
 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 +Formatting 'TEST_DIR/dst.IMGFMT', fmt=IMGFMT size=1073741824
 +
 +=== Starting Src QEMU ===
 +
 +{"execute": "qmp_capabilities"}
 +{"return": {}}
 +
 +=== Starting Dst VM2 ===
 +
 +{"execute": "qmp_capabilities"}
 +{"return": {}}
 +
 +=== Dst VM: Enable NBD server for incoming storage migration ===
 +
 +{"execute": "nbd-server-start", "arguments":
 +    {"addr": {"type": "inet", "data": {"host": "127.0.0.1", "port": PORT}},
 +              "tls-creds": "tls0"}}
 +{"return": {}}
 +{"execute": "block-export-add", "arguments":
 +    {"node-name": "drive_image1", "type": "nbd", "writable": true,
 +      "id": "drive_image1"}}
 +{"return": {}}
 +
 +=== Src VM: Mirror to dst NBD for outgoing storage migration ===
 +
 +{"execute": "blockdev-add", "arguments":
 +    {"node-name": "mirror", "driver": "nbd",
 +     "server": {"type": "inet", "host": "127.0.0.1", "port": PORT},
 +     "export": "drive_image1", "tls-creds": "tls0"}}
 +{"return": {}}
 +{"execute": "blockdev-mirror", "arguments":
 +    {"sync": "full", "device": "drive_image1", "target": "mirror",
 +     "job-id": "drive_image1_53"}}
 +{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "drive_image1_53"}}
 +{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "drive_image1_53"}}
 +{"return": {}}
 +{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "drive_image1_53"}}
 +
 +=== Cleaning up ===
 +
 +{"execute":"quit"}
 +{"execute":"quit"}
 +*** done
 -- 
 2.39.3
--- a/SOURCES/kvm-kvm-Atomic-memslot-updates.patch
+++ b/SOURCES/kvm-kvm-Atomic-memslot-updates.patch
@ -0,0 +1,290 @@
 From 93ec857c46911b95ed8e3abc6a9d432ae847c084 Mon Sep 17 00:00:00 2001
 From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 Date: Mon, 16 Jan 2023 07:51:56 -0500
 Subject: [PATCH 06/11] kvm: Atomic memslot updates
 RH-Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-MergeRequest: 247: accel: introduce accelerator blocker API
 RH-Bugzilla: 2161188
 RH-Acked-by: David Hildenbrand <david@redhat.com>
 RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [3/3] 520e41c0f58066a7381a5f6b32b81bc01cce51c0
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2161188
 commit f39b7d2b96e3e73c01bb678cd096f7baf0b9ab39
 Author: David Hildenbrand <david@redhat.com>
 Date:   Fri Nov 11 10:47:58 2022 -0500
    kvm: Atomic memslot updates
    If we update an existing memslot (e.g., resize, split), we temporarily
    remove the memslot to re-add it immediately afterwards. These updates
    are not atomic, especially not for KVM VCPU threads, such that we can
    get spurious faults.
    Let's inhibit most KVM ioctls while performing relevant updates, such
    that we can perform the update just as if it would happen atomically
    without additional kernel support.
    We capture the add/del changes and apply them in the notifier commit
    stage instead. There, we can check for overlaps and perform the ioctl
    inhibiting only if really required (-> overlap).
    To keep things simple we don't perform additional checks that wouldn't
    actually result in an overlap -- such as !RAM memory regions in some
    cases (see kvm_set_phys_mem()).
    To minimize cache-line bouncing, use a separate indicator
    (in_ioctl_lock) per CPU.  Also, make sure to hold the kvm_slots_lock
    while performing both actions (removing+re-adding).
    We have to wait until all IOCTLs were exited and block new ones from
    getting executed.
    This approach cannot result in a deadlock as long as the inhibitor does
    not hold any locks that might hinder an IOCTL from getting finished and
    exited - something fairly unusual. The inhibitor will always hold the BQL.
    AFAIK, one possible candidate would be userfaultfd. If a page cannot be
    placed (e.g., during postcopy), because we're waiting for a lock, or if the
    userfaultfd thread cannot process a fault, because it is waiting for a
    lock, there could be a deadlock. However, the BQL is not applicable here,
    because any other guest memory access while holding the BQL would already
    result in a deadlock.
    Nothing else in the kernel should block forever and wait for userspace
    intervention.
    Note: pause_all_vcpus()/resume_all_vcpus() or
    start_exclusive()/end_exclusive() cannot be used, as they either drop
    the BQL or require to be called without the BQL - something inhibitors
    cannot handle. We need a low-level locking mechanism that is
    deadlock-free even when not releasing the BQL.
    Signed-off-by: David Hildenbrand <david@redhat.com>
    Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
    Tested-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
    Message-Id: <20221111154758.1372674-4-eesposit@redhat.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Conflicts:
 	accel/kvm/kvm-all.c: include "sysemu/dirtylimit.h" is missing in
 	rhel 8.8.0
 Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 ---
 accel/kvm/kvm-all.c      | 101 ++++++++++++++++++++++++++++++++++-----
 include/sysemu/kvm_int.h |   8 ++++
 2 files changed, 98 insertions(+), 11 deletions(-)
 diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
 index 221aadfda7..3b7bc39823 100644
 --- a/accel/kvm/kvm-all.c
 +++ b/accel/kvm/kvm-all.c
@@ -31,6 +31,7 @@
 #include "sysemu/kvm_int.h"
 #include "sysemu/runstate.h"
 #include "sysemu/cpus.h"
 +#include "sysemu/accel-blocker.h"
 #include "qemu/bswap.h"
 #include "exec/memory.h"
 #include "exec/ram_addr.h"
@@ -45,6 +46,7 @@
 #include "qemu/guest-random.h"
 #include "sysemu/hw_accel.h"
 #include "kvm-cpus.h"
 +#include "qemu/range.h"
 #include "hw/boards.h"
@@ -1334,6 +1336,7 @@ void kvm_set_max_memslot_size(hwaddr max_slot_size)
     kvm_max_slot_size = max_slot_size;
 }
 +/* Called with KVMMemoryListener.slots_lock held */
 static void kvm_set_phys_mem(KVMMemoryListener *kml,
                              MemoryRegionSection *section, bool add)
 {
@@ -1368,14 +1371,12 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
     ram = memory_region_get_ram_ptr(mr) + mr_offset;
     ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;
 -    kvm_slots_lock();
 -
     if (!add) {
         do {
             slot_size = MIN(kvm_max_slot_size, size);
             mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
             if (!mem) {
 -                goto out;
 +                return;
             }
             if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
                 /*
@@ -1413,7 +1414,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
             start_addr += slot_size;
             size -= slot_size;
         } while (size);
 -        goto out;
 +        return;
     }
     /* register the new slot */
@@ -1438,9 +1439,6 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
         ram += slot_size;
         size -= slot_size;
     } while (size);
 -
 -out:
 -    kvm_slots_unlock();
 }
 static void *kvm_dirty_ring_reaper_thread(void *data)
@@ -1492,18 +1490,95 @@ static void kvm_region_add(MemoryListener *listener,
                            MemoryRegionSection *section)
 {
     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
 +    KVMMemoryUpdate *update;
 +
 +    update = g_new0(KVMMemoryUpdate, 1);
 +    update->section = *section;
 -    memory_region_ref(section->mr);
 -    kvm_set_phys_mem(kml, section, true);
 +    QSIMPLEQ_INSERT_TAIL(&kml->transaction_add, update, next);
 }
 static void kvm_region_del(MemoryListener *listener,
                            MemoryRegionSection *section)
 {
     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
 +    KVMMemoryUpdate *update;
 +
 +    update = g_new0(KVMMemoryUpdate, 1);
 +    update->section = *section;
 +
 +    QSIMPLEQ_INSERT_TAIL(&kml->transaction_del, update, next);
 +}
 +
 +static void kvm_region_commit(MemoryListener *listener)
 +{
 +    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
 +                                          listener);
 +    KVMMemoryUpdate *u1, *u2;
 +    bool need_inhibit = false;
 +
 +    if (QSIMPLEQ_EMPTY(&kml->transaction_add) &&
 +        QSIMPLEQ_EMPTY(&kml->transaction_del)) {
 +        return;
 +    }
 +
 +    /*
 +     * We have to be careful when regions to add overlap with ranges to remove.
 +     * We have to simulate atomic KVM memslot updates by making sure no ioctl()
 +     * is currently active.
 +     *
 +     * The lists are order by addresses, so it's easy to find overlaps.
 +     */
 +    u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
 +    u2 = QSIMPLEQ_FIRST(&kml->transaction_add);
 +    while (u1 && u2) {
 +        Range r1, r2;
 +
 +        range_init_nofail(&r1, u1->section.offset_within_address_space,
 +                          int128_get64(u1->section.size));
 +        range_init_nofail(&r2, u2->section.offset_within_address_space,
 +                          int128_get64(u2->section.size));
 +
 +        if (range_overlaps_range(&r1, &r2)) {
 +            need_inhibit = true;
 +            break;
 +        }
 +        if (range_lob(&r1) < range_lob(&r2)) {
 +            u1 = QSIMPLEQ_NEXT(u1, next);
 +        } else {
 +            u2 = QSIMPLEQ_NEXT(u2, next);
 +        }
 +    }
 +
 +    kvm_slots_lock();
 +    if (need_inhibit) {
 +        accel_ioctl_inhibit_begin();
 +    }
 +
 +    /* Remove all memslots before adding the new ones. */
 +    while (!QSIMPLEQ_EMPTY(&kml->transaction_del)) {
 +        u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
 +        QSIMPLEQ_REMOVE_HEAD(&kml->transaction_del, next);
 -    kvm_set_phys_mem(kml, section, false);
 -    memory_region_unref(section->mr);
 +        kvm_set_phys_mem(kml, &u1->section, false);
 +        memory_region_unref(u1->section.mr);
 +
 +        g_free(u1);
 +    }
 +    while (!QSIMPLEQ_EMPTY(&kml->transaction_add)) {
 +        u1 = QSIMPLEQ_FIRST(&kml->transaction_add);
 +        QSIMPLEQ_REMOVE_HEAD(&kml->transaction_add, next);
 +
 +        memory_region_ref(u1->section.mr);
 +        kvm_set_phys_mem(kml, &u1->section, true);
 +
 +        g_free(u1);
 +    }
 +
 +    if (need_inhibit) {
 +        accel_ioctl_inhibit_end();
 +    }
 +    kvm_slots_unlock();
 }
 static void kvm_log_sync(MemoryListener *listener,
@@ -1647,8 +1722,12 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
         kml->slots[i].slot = i;
     }
 +    QSIMPLEQ_INIT(&kml->transaction_add);
 +    QSIMPLEQ_INIT(&kml->transaction_del);
 +
     kml->listener.region_add = kvm_region_add;
     kml->listener.region_del = kvm_region_del;
 +    kml->listener.commit = kvm_region_commit;
     kml->listener.log_start = kvm_log_start;
     kml->listener.log_stop = kvm_log_stop;
     kml->listener.priority = 10;
 diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
 index 1f5487d9b7..7e18c0a3c0 100644
 --- a/include/sysemu/kvm_int.h
 +++ b/include/sysemu/kvm_int.h
@@ -11,6 +11,7 @@
 #include "exec/memory.h"
 #include "qemu/accel.h"
 +#include "qemu/queue.h"
 #include "sysemu/kvm.h"
 typedef struct KVMSlot
@@ -30,10 +31,17 @@ typedef struct KVMSlot
     ram_addr_t ram_start_offset;
 } KVMSlot;
 +typedef struct KVMMemoryUpdate {
 +    QSIMPLEQ_ENTRY(KVMMemoryUpdate) next;
 +    MemoryRegionSection section;
 +} KVMMemoryUpdate;
 +
 typedef struct KVMMemoryListener {
     MemoryListener listener;
     KVMSlot *slots;
     int as_id;
 +    QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_add;
 +    QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_del;
 } KVMMemoryListener;
 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
 -- 
 2.37.3
--- a/SOURCES/kvm-lsi53c895a-disable-reentrancy-detection-for-MMIO-reg.patch
+++ b/SOURCES/kvm-lsi53c895a-disable-reentrancy-detection-for-MMIO-reg.patch
@ -0,0 +1,71 @@
 From 8f19df61a101c1e57a1bce8adddb57a4a7123a77 Mon Sep 17 00:00:00 2001
 From: Thomas Huth <thuth@redhat.com>
 Date: Tue, 16 May 2023 11:05:56 +0200
 Subject: [PATCH 11/15] lsi53c895a: disable reentrancy detection for MMIO
 region, too
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 277: memory: prevent dma-reentracy issues
 RH-Bugzilla: 1999236
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [11/12] 8016c86f8432f5ea06c831d1181e87e6d45a6a50 (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1999236
 Upstream: Merged
 CVE: CVE-2021-3750
 commit d139fe9ad8a27bcc50b4ead77d2f97d191a0e95e
 Author: Thomas Huth <thuth@redhat.com>
 Date:   Tue May 16 11:05:56 2023 +0200
    lsi53c895a: disable reentrancy detection for MMIO region, too
    While trying to use a SCSI disk on the LSI controller with an
    older version of Fedora (25), I'm getting:
     qemu: warning: Blocked re-entrant IO on MemoryRegion: lsi-mmio at addr: 0x34
    and the SCSI controller is not usable. Seems like we have to
    disable the reentrancy checker for the MMIO region, too, to
    get this working again.
    The problem can be reproduced like this:
    ./qemu-system-x86_64 -accel kvm -m 2G -machine q35 \
     -device lsi53c810,id=lsi1 -device scsi-hd,drive=d0 \
     -drive if=none,id=d0,file=.../somedisk.qcow2 \
     -cdrom Fedora-Everything-netinst-i386-25-1.3.iso
    Where somedisk.qcow2 is an image that contains already some partitions
    and file systems.
    In the boot menu of Fedora, go to
    "Troubleshooting" -> "Rescue a Fedora system" -> "3) Skip to shell"
    Then check "dmesg | grep -i 53c" for failure messages, and try to mount
    a partition from somedisk.qcow2.
    Message-Id: <20230516090556.553813-1-thuth@redhat.com>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 hw/scsi/lsi53c895a.c | 1 +
 1 file changed, 1 insertion(+)
 diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c
 index 1e15e13fbf..2b9cb2ac5d 100644
 --- a/hw/scsi/lsi53c895a.c
 +++ b/hw/scsi/lsi53c895a.c
@@ -2306,6 +2306,7 @@ static void lsi_scsi_realize(PCIDevice *dev, Error **errp)
      * re-entrancy guard.
      */
     s->ram_io.disable_reentrancy_guard = true;
 +    s->mmio_io.disable_reentrancy_guard = true;
     address_space_init(&s->pci_io_as, pci_address_space_io(dev), "lsi-pci-io");
     qdev_init_gpio_out(d, &s->ext_irq, 1);
 -- 
 2.37.3
--- a/SOURCES/kvm-lsi53c895a-disable-reentrancy-detection-for-script-R.patch
+++ b/SOURCES/kvm-lsi53c895a-disable-reentrancy-detection-for-script-R.patch
@ -0,0 +1,59 @@
 From 3cffdbf3224ac21016dbee69cb2382c322d4bfbb Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Tue, 9 May 2023 10:29:03 -0400
 Subject: [PATCH 05/15] lsi53c895a: disable reentrancy detection for script RAM
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 277: memory: prevent dma-reentracy issues
 RH-Bugzilla: 1999236
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [5/12] b5334c3a34b38ed1dccf0030d5704e51e00fdce3 (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1999236
 Upstream: Merged
 CVE: CVE-2021-3750
 commit bfd6e7ae6a72b84e2eb9574f56e6ec037f05182c
 Author: Alexander Bulekov <alxndr@bu.edu>
 Date:   Thu Apr 27 17:10:10 2023 -0400
    lsi53c895a: disable reentrancy detection for script RAM
    As the code is designed to use the memory APIs to access the script ram,
    disable reentrancy checks for the pseudo-RAM ram_io MemoryRegion.
    In the future, ram_io may be converted from an IO to a proper RAM MemoryRegion.
    Reported-by: Fiona Ebner <f.ebner@proxmox.com>
    Signed-off-by: Alexander Bulekov <alxndr@bu.edu>
    Reviewed-by: Thomas Huth <thuth@redhat.com>
    Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
    Message-Id: <20230427211013.2994127-6-alxndr@bu.edu>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 hw/scsi/lsi53c895a.c | 6 ++++++
 1 file changed, 6 insertions(+)
 diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c
 index 85e907a785..1e15e13fbf 100644
 --- a/hw/scsi/lsi53c895a.c
 +++ b/hw/scsi/lsi53c895a.c
@@ -2301,6 +2301,12 @@ static void lsi_scsi_realize(PCIDevice *dev, Error **errp)
     memory_region_init_io(&s->io_io, OBJECT(s), &lsi_io_ops, s,
                           "lsi-io", 256);
 +    /*
 +     * Since we use the address-space API to interact with ram_io, disable the
 +     * re-entrancy guard.
 +     */
 +    s->ram_io.disable_reentrancy_guard = true;
 +
     address_space_init(&s->pci_io_as, pci_address_space_io(dev), "lsi-pci-io");
     qdev_init_gpio_out(d, &s->ext_irq, 1);
 -- 
 2.37.3
--- a/SOURCES/kvm-memory-prevent-dma-reentracy-issues.patch
+++ b/SOURCES/kvm-memory-prevent-dma-reentracy-issues.patch
@ -0,0 +1,151 @@
 From e0c811c2d13f995fe1b095f48637316be5978b0e Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Tue, 9 May 2023 10:29:03 -0400
 Subject: [PATCH 01/15] memory: prevent dma-reentracy issues
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 277: memory: prevent dma-reentracy issues
 RH-Bugzilla: 1999236
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [1/12] 8fced41b4b2105343e8f0250286b771bcb43c81f (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1999236
 Upstream: Merged
 CVE: CVE-2021-3750
 CVE: CVE-2023-0330
 commit a2e1753b8054344f32cf94f31c6399a58794a380
 Author: Alexander Bulekov <alxndr@bu.edu>
 Date:   Thu Apr 27 17:10:06 2023 -0400
    memory: prevent dma-reentracy issues
    Add a flag to the DeviceState, when a device is engaged in PIO/MMIO/DMA.
    This flag is set/checked prior to calling a device's MemoryRegion
    handlers, and set when device code initiates DMA.  The purpose of this
    flag is to prevent two types of DMA-based reentrancy issues:
    1.) mmio -> dma -> mmio case
    2.) bh -> dma write -> mmio case
    These issues have led to problems such as stack-exhaustion and
    use-after-frees.
    Summary of the problem from Peter Maydell:
    https://lore.kernel.org/qemu-devel/CAFEAcA_23vc7hE3iaM-JVA6W38LK4hJoWae5KcknhPRD5fPBZA@mail.gmail.com
    Resolves: https://gitlab.com/qemu-project/qemu/-/issues/62
    Resolves: https://gitlab.com/qemu-project/qemu/-/issues/540
    Resolves: https://gitlab.com/qemu-project/qemu/-/issues/541
    Resolves: https://gitlab.com/qemu-project/qemu/-/issues/556
    Resolves: https://gitlab.com/qemu-project/qemu/-/issues/557
    Resolves: https://gitlab.com/qemu-project/qemu/-/issues/827
    Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1282
    Resolves: CVE-2023-0330
    Signed-off-by: Alexander Bulekov <alxndr@bu.edu>
    Reviewed-by: Thomas Huth <thuth@redhat.com>
    Message-Id: <20230427211013.2994127-2-alxndr@bu.edu>
    [thuth: Replace warn_report() with warn_report_once()]
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 include/exec/memory.h  |  5 +++++
 include/hw/qdev-core.h |  7 +++++++
 softmmu/memory.c       | 16 ++++++++++++++++
 3 files changed, 28 insertions(+)
 diff --git a/include/exec/memory.h b/include/exec/memory.h
 index 20f1b27377..e089f90f9b 100644
 --- a/include/exec/memory.h
 +++ b/include/exec/memory.h
@@ -734,6 +734,8 @@ struct MemoryRegion {
     bool is_iommu;
     RAMBlock *ram_block;
     Object *owner;
 +    /* owner as TYPE_DEVICE. Used for re-entrancy checks in MR access hotpath */
 +    DeviceState *dev;
     const MemoryRegionOps *ops;
     void *opaque;
@@ -757,6 +759,9 @@ struct MemoryRegion {
     unsigned ioeventfd_nb;
     MemoryRegionIoeventfd *ioeventfds;
     RamDiscardManager *rdm; /* Only for RAM */
 +
 +    /* For devices designed to perform re-entrant IO into their own IO MRs */
 +    bool disable_reentrancy_guard;
 };
 struct IOMMUMemoryRegion {
 diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
 index 20d3066595..14226f860d 100644
 --- a/include/hw/qdev-core.h
 +++ b/include/hw/qdev-core.h
@@ -162,6 +162,10 @@ struct NamedClockList {
     QLIST_ENTRY(NamedClockList) node;
 };
 +typedef struct {
 +    bool engaged_in_io;
 +} MemReentrancyGuard;
 +
 /**
  * DeviceState:
  * @realized: Indicates whether the device has been fully constructed.
@@ -193,6 +197,9 @@ struct DeviceState {
     int instance_id_alias;
     int alias_required_for_version;
     ResettableState reset;
 +
 +    /* Is the device currently in mmio/pio/dma? Used to prevent re-entrancy */
 +    MemReentrancyGuard mem_reentrancy_guard;
 };
 struct DeviceListener {
 diff --git a/softmmu/memory.c b/softmmu/memory.c
 index 7340e19ff5..102f0a4248 100644
 --- a/softmmu/memory.c
 +++ b/softmmu/memory.c
@@ -541,6 +541,18 @@ static MemTxResult access_with_adjusted_size(hwaddr addr,
         access_size_max = 4;
     }
 +    /* Do not allow more than one simultaneous access to a device's IO Regions */
 +    if (mr->dev && !mr->disable_reentrancy_guard &&
 +        !mr->ram_device && !mr->ram && !mr->rom_device && !mr->readonly) {
 +        if (mr->dev->mem_reentrancy_guard.engaged_in_io) {
 +            warn_report_once("Blocked re-entrant IO on MemoryRegion: "
 +                             "%s at addr: 0x%" HWADDR_PRIX,
 +                             memory_region_name(mr), addr);
 +            return MEMTX_ACCESS_ERROR;
 +        }
 +        mr->dev->mem_reentrancy_guard.engaged_in_io = true;
 +    }
 +
     /* FIXME: support unaligned access? */
     access_size = MAX(MIN(size, access_size_max), access_size_min);
     access_mask = MAKE_64BIT_MASK(0, access_size * 8);
@@ -555,6 +567,9 @@ static MemTxResult access_with_adjusted_size(hwaddr addr,
                         access_mask, attrs);
         }
     }
 +    if (mr->dev) {
 +        mr->dev->mem_reentrancy_guard.engaged_in_io = false;
 +    }
     return r;
 }
@@ -1169,6 +1184,7 @@ static void memory_region_do_init(MemoryRegion *mr,
     }
     mr->name = g_strdup(name);
     mr->owner = owner;
 +    mr->dev = (DeviceState *) object_dynamic_cast(mr->owner, TYPE_DEVICE);
     mr->ram_block = NULL;
     if (name) {
 -- 
 2.37.3
--- a/SOURCES/kvm-memory-stricter-checks-prior-to-unsetting-engaged_in.patch
+++ b/SOURCES/kvm-memory-stricter-checks-prior-to-unsetting-engaged_in.patch
@ -0,0 +1,68 @@
 From c24e38eb508b3fb42ce3ea62fe8de0be6a95a6a8 Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Wed, 7 Jun 2023 11:45:09 -0400
 Subject: [PATCH 10/15] memory: stricter checks prior to unsetting
 engaged_in_io
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 277: memory: prevent dma-reentracy issues
 RH-Bugzilla: 1999236
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [10/12] 773b62a84b2bd4f5ee7fb8e1cfb3bb91c3a01de1 (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1999236
 Upstream: Merged
 CVE: CVE-2021-3750
 commit 3884bf6468ac6bbb58c2b3feaa74e87f821b52f3
 Author: Alexander Bulekov <alxndr@bu.edu>
 Date:   Tue May 16 04:40:02 2023 -0400
    memory: stricter checks prior to unsetting engaged_in_io
    engaged_in_io could be unset by an MR with re-entrancy checks disabled.
    Ensure that only MRs that can set the engaged_in_io flag can unset it.
    Signed-off-by: Alexander Bulekov <alxndr@bu.edu>
    Message-Id: <20230516084002.3813836-1-alxndr@bu.edu>
    Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 softmmu/memory.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
 diff --git a/softmmu/memory.c b/softmmu/memory.c
 index 102f0a4248..6b98615357 100644
 --- a/softmmu/memory.c
 +++ b/softmmu/memory.c
@@ -533,6 +533,7 @@ static MemTxResult access_with_adjusted_size(hwaddr addr,
     unsigned access_size;
     unsigned i;
     MemTxResult r = MEMTX_OK;
 +    bool reentrancy_guard_applied = false;
     if (!access_size_min) {
         access_size_min = 1;
@@ -551,6 +552,7 @@ static MemTxResult access_with_adjusted_size(hwaddr addr,
             return MEMTX_ACCESS_ERROR;
         }
         mr->dev->mem_reentrancy_guard.engaged_in_io = true;
 +        reentrancy_guard_applied = true;
     }
     /* FIXME: support unaligned access? */
@@ -567,7 +569,7 @@ static MemTxResult access_with_adjusted_size(hwaddr addr,
                         access_mask, attrs);
         }
     }
 -    if (mr->dev) {
 +    if (mr->dev && reentrancy_guard_applied) {
         mr->dev->mem_reentrancy_guard.engaged_in_io = false;
     }
     return r;
 -- 
 2.37.3
--- a/SOURCES/kvm-migration-Attempt-disk-reactivation-in-more-failure-.patch
+++ b/SOURCES/kvm-migration-Attempt-disk-reactivation-in-more-failure-.patch
@ -0,0 +1,111 @@
 From a1f2a51d1a789c46e806adb332236ca16d538bf9 Mon Sep 17 00:00:00 2001
 From: Eric Blake <eblake@redhat.com>
 Date: Tue, 2 May 2023 15:52:12 -0500
 Subject: [PATCH 3/5] migration: Attempt disk reactivation in more failure
 scenarios
 RH-Author: Eric Blake <eblake@redhat.com>
 RH-MergeRequest: 273: migration: prevent source core dump if NFS dies mid-migration
 RH-Bugzilla: 2177957
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Acked-by: quintela1 <quintela@redhat.com>
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Commit: [3/3] e84bf1e7233c0273ca3136ecaa6b2cfc9c0efacb (ebblake/qemu-kvm)
 Commit fe904ea824 added a fail_inactivate label, which tries to
 reactivate disks on the source after a failure while s->state ==
 MIGRATION_STATUS_ACTIVE, but didn't actually use the label if
 qemu_savevm_state_complete_precopy() failed.  This failure to
 reactivate is also present in commit 6039dd5b1c (also covering the new
 s->state == MIGRATION_STATUS_DEVICE state) and 403d18ae (ensuring
 s->block_inactive is set more reliably).
 Consolidate the two labels back into one - no matter HOW migration is
 failed, if there is any chance we can reach vm_start() after having
 attempted inactivation, it is essential that we have tried to restart
 disks before then.  This also makes the cleanup more like
 migrate_fd_cancel().
 Suggested-by: Kevin Wolf <kwolf@redhat.com>
 Signed-off-by: Eric Blake <eblake@redhat.com>
 Message-Id: <20230502205212.134680-1-eblake@redhat.com>
 Acked-by: Peter Xu <peterx@redhat.com>
 Reviewed-by: Juan Quintela <quintela@redhat.com>
 Reviewed-by: Kevin Wolf <kwolf@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 (cherry picked from commit 6dab4c93ecfae48e2e67b984d1032c1e988d3005)
 [eblake: downstream migrate_colo() => migrate_colo_enabled()]
 Signed-off-by: Eric Blake <eblake@redhat.com>
 ---
 migration/migration.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)
 diff --git a/migration/migration.c b/migration/migration.c
 index 6ba8eb0fdf..817170d52d 100644
 --- a/migration/migration.c
 +++ b/migration/migration.c
@@ -3255,6 +3255,11 @@ static void migration_completion(MigrationState *s)
                                             MIGRATION_STATUS_DEVICE);
             }
             if (ret >= 0) {
 +                /*
 +                 * Inactivate disks except in COLO, and track that we
 +                 * have done so in order to remember to reactivate
 +                 * them if migration fails or is cancelled.
 +                 */
                 s->block_inactive = !migrate_colo_enabled();
                 qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
                 ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
@@ -3290,13 +3295,13 @@ static void migration_completion(MigrationState *s)
         rp_error = await_return_path_close_on_source(s);
         trace_migration_return_path_end_after(rp_error);
         if (rp_error) {
 -            goto fail_invalidate;
 +            goto fail;
         }
     }
     if (qemu_file_get_error(s->to_dst_file)) {
         trace_migration_completion_file_err();
 -        goto fail_invalidate;
 +        goto fail;
     }
     if (!migrate_colo_enabled()) {
@@ -3306,26 +3311,25 @@ static void migration_completion(MigrationState *s)
     return;
 -fail_invalidate:
 -    /* If not doing postcopy, vm_start() will be called: let's regain
 -     * control on images.
 -     */
 -    if (s->state == MIGRATION_STATUS_ACTIVE ||
 -        s->state == MIGRATION_STATUS_DEVICE) {
 +fail:
 +    if (s->block_inactive && (s->state == MIGRATION_STATUS_ACTIVE ||
 +                              s->state == MIGRATION_STATUS_DEVICE)) {
 +        /*
 +         * If not doing postcopy, vm_start() will be called: let's
 +         * regain control on images.
 +         */
         Error *local_err = NULL;
         qemu_mutex_lock_iothread();
         bdrv_invalidate_cache_all(&local_err);
         if (local_err) {
             error_report_err(local_err);
 -            s->block_inactive = true;
         } else {
             s->block_inactive = false;
         }
         qemu_mutex_unlock_iothread();
     }
 -fail:
     migrate_set_state(&s->state, current_active_state,
                       MIGRATION_STATUS_FAILED);
 }
 -- 
 2.39.1
--- a/SOURCES/kvm-migration-Disable-postcopy-multifd-migration.patch
+++ b/SOURCES/kvm-migration-Disable-postcopy-multifd-migration.patch
@ -0,0 +1,59 @@
 From dd6d0eace90285c017ae40cba0ffa95ccd963ebd Mon Sep 17 00:00:00 2001
 From: Leonardo Bras <leobras@redhat.com>
 Date: Tue, 20 Jun 2023 14:51:03 -0300
 Subject: [PATCH 15/15] migration: Disable postcopy + multifd migration
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Leonardo Brás <leobras@redhat.com>
 RH-MergeRequest: 287: migration: Disable postcopy + multifd migration
 RH-Bugzilla: 2169733
 RH-Acked-by: Peter Xu <peterx@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [1/1] 07d26fbac35b7586fe790304f03d316ed26a4ef2
 Since the introduction of multifd, it's possible to perform a multifd
 migration and finish it using postcopy.
 A bug introduced by yank (fixed on cfc3bcf373) was previously preventing
 a successful use of this migration scenario, and now things should be
 working in most scenarios.
 But since there is not enough testing/support nor any reported users for
 this scenario, we should disable this combination before it may cause any
 problems for users.
 Suggested-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 Signed-off-by: Leonardo Bras <leobras@redhat.com>
 Acked-by: Peter Xu <peterx@redhat.com>
 Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 Reviewed-by: Juan Quintela <quintela@redhat.com>
 Signed-off-by: Juan Quintela <quintela@redhat.com>
 (cherry picked from commit b405dfff1ea3cf0530b628895b5a7a50dc8c6996)
 [leobras: moves logic from options.c -> migration.c and use cap_list
 instead of new_caps for backward compatibility]
 Signed-off-by: Leonardo Bras <leobras@redhat.com>
 ---
 migration/migration.c | 5 +++++
 1 file changed, 5 insertions(+)
 diff --git a/migration/migration.c b/migration/migration.c
 index 817170d52d..1ad82e63f0 100644
 --- a/migration/migration.c
 +++ b/migration/migration.c
@@ -1246,6 +1246,11 @@ static bool migrate_caps_check(bool *cap_list,
             error_setg(errp, "Postcopy is not compatible with ignore-shared");
             return false;
         }
 +
 +        if (cap_list[MIGRATION_CAPABILITY_MULTIFD]) {
 +            error_setg(errp, "Postcopy is not yet compatible with multifd");
 +            return false;
 +        }
     }
     if (cap_list[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
 -- 
 2.37.3
--- a/SOURCES/kvm-migration-Handle-block-device-inactivation-failures-.patch
+++ b/SOURCES/kvm-migration-Handle-block-device-inactivation-failures-.patch
@ -0,0 +1,117 @@
 From 1b07c7663b6a5c19c9303088d63c39dba7e3bb36 Mon Sep 17 00:00:00 2001
 From: Eric Blake <eblake@redhat.com>
 Date: Fri, 14 Apr 2023 10:33:58 -0500
 Subject: [PATCH 1/5] migration: Handle block device inactivation failures
 better
 RH-Author: Eric Blake <eblake@redhat.com>
 RH-MergeRequest: 273: migration: prevent source core dump if NFS dies mid-migration
 RH-Bugzilla: 2177957
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Acked-by: quintela1 <quintela@redhat.com>
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Commit: [1/3] 5892c17ca0a21d824d176e7398d12f7cf991651d (ebblake/qemu-kvm)
 Consider what happens when performing a migration between two host
 machines connected to an NFS server serving multiple block devices to
 the guest, when the NFS server becomes unavailable.  The migration
 attempts to inactivate all block devices on the source (a necessary
 step before the destination can take over); but if the NFS server is
 non-responsive, the attempt to inactivate can itself fail.  When that
 happens, the destination fails to get the migrated guest (good,
 because the source wasn't able to flush everything properly):
  (qemu) qemu-kvm: load of migration failed: Input/output error
 at which point, our only hope for the guest is for the source to take
 back control.  With the current code base, the host outputs a message, but then appears to resume:
  (qemu) qemu-kvm: qemu_savevm_state_complete_precopy_non_iterable: bdrv_inactivate_all() failed (-1)
  (src qemu)info status
   VM status: running
 but a second migration attempt now asserts:
  (src qemu) qemu-kvm: ../block.c:6738: int bdrv_inactivate_recurse(BlockDriverState *): Assertion `!(bs->open_flags & BDRV_O_INACTIVE)' failed.
 Whether the guest is recoverable on the source after the first failure
 is debatable, but what we do not want is to have qemu itself fail due
 to an assertion.  It looks like the problem is as follows:
 In migration.c:migration_completion(), the source sets 'inactivate' to
 true (since COLO is not enabled), then tries
 savevm.c:qemu_savevm_state_complete_precopy() with a request to
 inactivate block devices.  In turn, this calls
 block.c:bdrv_inactivate_all(), which fails when flushing runs up
 against the non-responsive NFS server.  With savevm failing, we are
 now left in a state where some, but not all, of the block devices have
 been inactivated; but migration_completion() then jumps to 'fail'
 rather than 'fail_invalidate' and skips an attempt to reclaim those
 disks by calling bdrv_activate_all().  Even if we do attempt to
 reclaim disks, we aren't taking note of failure there, either.
 Thus, we have reached a state where the migration engine has forgotten
 all state about whether a block device is inactive, because we did not
 set s->block_inactive in enough places; so migration allows the source
 to reach vm_start() and resume execution, violating the block layer
 invariant that the guest CPUs should not be restarted while a device
 is inactive.  Note that the code in migration.c:migrate_fd_cancel()
 will also try to reactivate all block devices if s->block_inactive was
 set, but because we failed to set that flag after the first failure,
 the source assumes it has reclaimed all devices, even though it still
 has remaining inactivated devices and does not try again.  Normally,
 qmp_cont() will also try to reactivate all disks (or correctly fail if
 the disks are not reclaimable because NFS is not yet back up), but the
 auto-resumption of the source after a migration failure does not go
 through qmp_cont().  And because we have left the block layer in an
 inconsistent state with devices still inactivated, the later migration
 attempt is hitting the assertion failure.
 Since it is important to not resume the source with inactive disks,
 this patch marks s->block_inactive before attempting inactivation,
 rather than after succeeding, in order to prevent any vm_start() until
 it has successfully reactivated all devices.
 See also https://bugzilla.redhat.com/show_bug.cgi?id=2058982
 Signed-off-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Juan Quintela <quintela@redhat.com>
 Acked-by: Lukas Straub <lukasstraub2@web.de>
 Tested-by: Lukas Straub <lukasstraub2@web.de>
 Signed-off-by: Juan Quintela <quintela@redhat.com>
 (cherry picked from commit 403d18ae384239876764bbfa111d6cc5dcb673d1)
 ---
 migration/migration.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)
 diff --git a/migration/migration.c b/migration/migration.c
 index 0885549de0..08e5e8f013 100644
 --- a/migration/migration.c
 +++ b/migration/migration.c
@@ -3256,13 +3256,11 @@ static void migration_completion(MigrationState *s)
                                             MIGRATION_STATUS_DEVICE);
             }
             if (ret >= 0) {
 +                s->block_inactive = inactivate;
                 qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
                 ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
                                                          inactivate);
             }
 -            if (inactivate && ret >= 0) {
 -                s->block_inactive = true;
 -            }
         }
         qemu_mutex_unlock_iothread();
@@ -3321,6 +3319,7 @@ fail_invalidate:
         bdrv_invalidate_cache_all(&local_err);
         if (local_err) {
             error_report_err(local_err);
 +            s->block_inactive = true;
         } else {
             s->block_inactive = false;
         }
 -- 
 2.39.1
--- a/SOURCES/kvm-migration-Minor-control-flow-simplification.patch
+++ b/SOURCES/kvm-migration-Minor-control-flow-simplification.patch
@ -0,0 +1,53 @@
 From e79d0506184e861350d2a3e62dd986aa03d30aa8 Mon Sep 17 00:00:00 2001
 From: Eric Blake <eblake@redhat.com>
 Date: Thu, 20 Apr 2023 09:35:51 -0500
 Subject: [PATCH 2/5] migration: Minor control flow simplification
 RH-Author: Eric Blake <eblake@redhat.com>
 RH-MergeRequest: 273: migration: prevent source core dump if NFS dies mid-migration
 RH-Bugzilla: 2177957
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Acked-by: quintela1 <quintela@redhat.com>
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Commit: [2/3] f00b21b6ebd377af79af93ac18f103f8dc0309d6 (ebblake/qemu-kvm)
 No need to declare a temporary variable.
 Suggested-by: Juan Quintela <quintela@redhat.com>
 Fixes: 1df36e8c6289 ("migration: Handle block device inactivation failures better")
 Signed-off-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Juan Quintela <quintela@redhat.com>
 Signed-off-by: Juan Quintela <quintela@redhat.com>
 (cherry picked from commit 5d39f44d7ac5c63f53d4d0900ceba9521bc27e49)
 ---
 migration/migration.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)
 diff --git a/migration/migration.c b/migration/migration.c
 index 08e5e8f013..6ba8eb0fdf 100644
 --- a/migration/migration.c
 +++ b/migration/migration.c
@@ -3248,7 +3248,6 @@ static void migration_completion(MigrationState *s)
         ret = global_state_store();
         if (!ret) {
 -            bool inactivate = !migrate_colo_enabled();
             ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
             trace_migration_completion_vm_stop(ret);
             if (ret >= 0) {
@@ -3256,10 +3255,10 @@ static void migration_completion(MigrationState *s)
                                             MIGRATION_STATUS_DEVICE);
             }
             if (ret >= 0) {
 -                s->block_inactive = inactivate;
 +                s->block_inactive = !migrate_colo_enabled();
                 qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
                 ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
 -                                                         inactivate);
 +                                                         s->block_inactive);
             }
         }
         qemu_mutex_unlock_iothread();
 -- 
 2.39.1
--- a/SOURCES/kvm-migration-Read-state-once.patch
+++ b/SOURCES/kvm-migration-Read-state-once.patch
@ -0,0 +1,76 @@
 From 34eae2d7ef928a7e0e10cc30fe76839c005998eb Mon Sep 17 00:00:00 2001
 From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
 Date: Wed, 13 Apr 2022 12:33:29 +0100
 Subject: [PATCH 07/11] migration: Read state once
 RH-Author: Dr. David Alan Gilbert <dgilbert@redhat.com>
 RH-MergeRequest: 249: migration: Read state once
 RH-Bugzilla: 2074205
 RH-Acked-by: Peter Xu <peterx@redhat.com>
 RH-Acked-by: Laszlo Ersek <lersek@redhat.com>
 RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
 RH-Acked-by: quintela1 <quintela@redhat.com>
 RH-Commit: [1/1] 9aa47b492a646fce4e66ebd9b7d7a85286d16051
 The 'status' field for the migration is updated normally using
 an atomic operation from the migration thread.
 Most readers of it aren't that careful, and in most cases it doesn't
 matter.
 In query_migrate->fill_source_migration_info the 'state'
 is read twice; the first time to decide which state fields to fill in,
 and then secondly to copy the state to the status field; that can end up
 with a status that's inconsistent; e.g. setting up the fields
 for 'setup' and then having an 'active' status.  In that case
 libvirt gets upset by the lack of ram info.
 The symptom is:
   libvirt.libvirtError: internal error: migration was active, but no RAM info was set
 Read the state exactly once in fill_source_migration_info.
 This is a possible fix for:
 https://bugzilla.redhat.com/show_bug.cgi?id=2074205
 Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 Message-Id: <20220413113329.103696-1-dgilbert@redhat.com>
 Reviewed-by: Juan Quintela <quintela@redhat.com>
 Reviewed-by: Peter Xu <peterx@redhat.com>
 Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 (cherry picked from commit 552de79bfdd5e9e53847eb3c6d6e4cd898a4370e)
 ---
 migration/migration.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
 diff --git a/migration/migration.c b/migration/migration.c
 index 51e6726dac..d8b24a2c91 100644
 --- a/migration/migration.c
 +++ b/migration/migration.c
@@ -1071,6 +1071,7 @@ static void populate_disk_info(MigrationInfo *info)
 static void fill_source_migration_info(MigrationInfo *info)
 {
     MigrationState *s = migrate_get_current();
 +    int state = qatomic_read(&s->state);
     GSList *cur_blocker = migration_blockers;
     info->blocked_reasons = NULL;
@@ -1090,7 +1091,7 @@ static void fill_source_migration_info(MigrationInfo *info)
     }
     info->has_blocked_reasons = info->blocked_reasons != NULL;
 -    switch (s->state) {
 +    switch (state) {
     case MIGRATION_STATUS_NONE:
         /* no migration has happened ever */
         /* do not overwrite destination migration status */
@@ -1135,7 +1136,7 @@ static void fill_source_migration_info(MigrationInfo *info)
         info->has_status = true;
         break;
     }
 -    info->status = s->state;
 +    info->status = state;
 }
 typedef enum WriteTrackingSupport {
 -- 
 2.37.3
--- a/SOURCES/kvm-migration-check-magic-value-for-deciding-the-mapping.patch
+++ b/SOURCES/kvm-migration-check-magic-value-for-deciding-the-mapping.patch
@ -0,0 +1,296 @@
 From f21a343af4b4d0c6e5181ae0abd0f6280dc8296c Mon Sep 17 00:00:00 2001
 From: "manish.mishra" <manish.mishra@nutanix.com>
 Date: Tue, 20 Dec 2022 18:44:18 +0000
 Subject: [PATCH 2/3] migration: check magic value for deciding the mapping of
 channels
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Peter Xu <peterx@redhat.com>
 RH-MergeRequest: 258: migration: Fix multifd crash due to channel disorder
 RH-Bugzilla: 2137740
 RH-Acked-by: quintela1 <quintela@redhat.com>
 RH-Acked-by: Leonardo Brás <leobras@redhat.com>
 RH-Acked-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
 RH-Commit: [2/2] f97bebef3d3e372cfd660e5ddb6cffba791840d2
 Conflicts:
        migration/migration.c
        migration/multifd.c
        migration/postcopy-ram.c
        migration/postcopy-ram.h
        There're a bunch of conflicts due to missing upstream patches on
        e.g. on qemufile reworks, postcopy preempt.  We don't plan to have
        preempt in rhel8 at all, probably the same as the rest.
 Current logic assumes that channel connections on the destination side are
 always established in the same order as the source and the first one will
 always be the main channel followed by the multifd or post-copy
 preemption channel. This may not be always true, as even if a channel has a
 connection established on the source side it can be in the pending state on
 the destination side and a newer connection can be established first.
 Basically causing out of order mapping of channels on the destination side.
 Currently, all channels except post-copy preempt send a magic number, this
 patch uses that magic number to decide the type of channel. This logic is
 applicable only for precopy(multifd) live migration, as mentioned, the
 post-copy preempt channel does not send any magic number. Also, tls live
 migrations already do tls handshake before creating other channels, so
 this issue is not possible with tls, hence this logic is avoided for tls
 live migrations. This patch uses read peek to check the magic number of
 channels so that current data/control stream management remains
 unaffected.
 Reviewed-by: Peter Xu <peterx@redhat.com>
 Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
 Reviewed-by: Juan Quintela <quintela@redhat.com>
 Suggested-by: Daniel P. Berrange <berrange@redhat.com>
 Signed-off-by: manish.mishra <manish.mishra@nutanix.com>
 Signed-off-by: Juan Quintela <quintela@redhat.com>
 (cherry picked from commit 6720c2b32725e6ac404f22851a0ecd0a71d0cbe2)
 Signed-off-by: Peter Xu <peterx@redhat.com>
 ---
 migration/channel.c   | 45 ++++++++++++++++++++++++++++++++++++++
 migration/channel.h   |  5 +++++
 migration/migration.c | 51 +++++++++++++++++++++++++++++++------------
 migration/multifd.c   | 19 ++++++++--------
 migration/multifd.h   |  2 +-
 5 files changed, 98 insertions(+), 24 deletions(-)
 diff --git a/migration/channel.c b/migration/channel.c
 index 086b5c0d8b..ee308fef23 100644
 --- a/migration/channel.c
 +++ b/migration/channel.c
@@ -98,3 +98,48 @@ void migration_channel_connect(MigrationState *s,
     g_free(s->hostname);
     error_free(error);
 }
 +
 +
 +/**
 + * migration_channel_read_peek - Peek at the migration channel without
 + *     removing the peeked bytes from the channel buffer.  Retries every
 + *     1ms until a single peek returns exactly @buflen bytes.
 + * @ioc: the channel object
 + * @buf: the memory region to read data into
 + * @buflen: the number of bytes to read into @buf
 + * @errp: pointer to a NULL-initialized error object
 + * NOTE(review): the read may already have set @errp when len < 0 - confirm.
 + * Returns 0 if successful, returns -1 and sets @errp on failure.
 + */
 +int migration_channel_read_peek(QIOChannel *ioc,
 +                                const char *buf,
 +                                const size_t buflen,
 +                                Error **errp)
 +{
 +    ssize_t len = 0;
 +    struct iovec iov = { .iov_base = (char *)buf, .iov_len = buflen };
 +
 +    while (true) {
 +        len = qio_channel_readv_full(ioc, &iov, 1, NULL, NULL,
 +                                     QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp);
 +
 +        if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) {
 +            error_setg(errp,
 +                       "Failed to peek at channel");
 +            return -1;
 +        }
 +
 +        if (len == buflen) {
 +            break;
 +        }
 +
 +        /* Short read or would-block: wait 1ms, then peek again. */
 +        if (qemu_in_coroutine()) {
 +            qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000);
 +        } else {
 +            g_usleep(1000);
 +        }
 +    }
 +
 +    return 0;
 +}
 diff --git a/migration/channel.h b/migration/channel.h
 index 67a461c28a..5bdb8208a7 100644
 --- a/migration/channel.h
 +++ b/migration/channel.h
@@ -24,4 +24,9 @@ void migration_channel_connect(MigrationState *s,
                                QIOChannel *ioc,
                                const char *hostname,
                                Error *error_in);
 +
 +int migration_channel_read_peek(QIOChannel *ioc,
 +                                const char *buf,
 +                                const size_t buflen,
 +                                Error **errp);
 #endif
 diff --git a/migration/migration.c b/migration/migration.c
 index d8b24a2c91..0885549de0 100644
 --- a/migration/migration.c
 +++ b/migration/migration.c
@@ -32,6 +32,7 @@
 #include "savevm.h"
 #include "qemu-file-channel.h"
 #include "qemu-file.h"
 +#include "channel.h"
 #include "migration/vmstate.h"
 #include "block/block.h"
 #include "qapi/error.h"
@@ -637,10 +638,6 @@ static bool migration_incoming_setup(QEMUFile *f, Error **errp)
 {
     MigrationIncomingState *mis = migration_incoming_get_current();
 -    if (multifd_load_setup(errp) != 0) {
 -        return false;
 -    }
 -
     if (!mis->from_src_file) {
         mis->from_src_file = f;
     }
@@ -701,10 +698,42 @@ void migration_fd_process_incoming(QEMUFile *f, Error **errp)
 void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
 {
     MigrationIncomingState *mis = migration_incoming_get_current();
 +    bool default_channel = true;
 +    uint32_t channel_magic = 0;
     Error *local_err = NULL;
 -    bool start_migration;
 +    int ret = 0;
 -    if (!mis->from_src_file) {
 +    if (migrate_use_multifd() && !migrate_postcopy_ram() &&
 +        qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
 +        /*
 +         * With multiple channels, it is possible that we receive channels
 +         * out of order on destination side, causing incorrect mapping of
 +         * source channels on destination side. Check channel MAGIC to
 +         * decide type of channel. Please note this is best effort, postcopy
 +         * preempt channel does not send any magic number so avoid it for
 +         * postcopy live migration. Also tls live migration already does
 +         * tls handshake while initializing main channel so with tls this
 +         * issue is not possible.
 +         */
 +        ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
 +                                          sizeof(channel_magic), &local_err);
 +
 +        if (ret != 0) {
 +            error_propagate(errp, local_err);
 +            return;
 +        }
 +
 +        default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
 +    } else {
 +        default_channel = !mis->from_src_file;
 +    }
 +
 +    if (multifd_load_setup(errp) != 0) {
 +        error_setg(errp, "Failed to setup multifd channels");
 +        return;
 +    }
 +
 +    if (default_channel) {
         /* The first connection (multifd may have multiple) */
         QEMUFile *f = qemu_fopen_channel_input(ioc);
@@ -716,23 +745,17 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
         if (!migration_incoming_setup(f, errp)) {
             return;
         }
 -
 -        /*
 -         * Common migration only needs one channel, so we can start
 -         * right now.  Multifd needs more than one channel, we wait.
 -         */
 -        start_migration = !migrate_use_multifd();
     } else {
         /* Multiple connections */
         assert(migrate_use_multifd());
 -        start_migration = multifd_recv_new_channel(ioc, &local_err);
 +        multifd_recv_new_channel(ioc, &local_err);
         if (local_err) {
             error_propagate(errp, local_err);
             return;
         }
     }
 -    if (start_migration) {
 +    if (migration_has_all_channels()) {
         migration_incoming_process();
     }
 }
 diff --git a/migration/multifd.c b/migration/multifd.c
 index 7c16523e6b..75ac052d2f 100644
 --- a/migration/multifd.c
 +++ b/migration/multifd.c
@@ -1183,9 +1183,14 @@ int multifd_load_setup(Error **errp)
     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
     uint8_t i;
 -    if (!migrate_use_multifd()) {
 +    /*
 +     * Return successfully if multiFD recv state is already initialised
 +     * or multiFD is not enabled.
 +     */
 +    if (multifd_recv_state || !migrate_use_multifd()) {
         return 0;
     }
 +
     if (!migrate_multifd_is_allowed()) {
         error_setg(errp, "multifd is not supported by current protocol");
         return -1;
@@ -1244,11 +1249,9 @@ bool multifd_recv_all_channels_created(void)
 /*
  * Try to receive all multifd channels to get ready for the migration.
 - * - Return true and do not set @errp when correctly receiving all channels;
 - * - Return false and do not set @errp when correctly receiving the current one;
 - * - Return false and set @errp when failing to receive the current channel.
 + * Sets @errp when failing to receive the current channel.
  */
 -bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
 +void multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
 {
     MultiFDRecvParams *p;
     Error *local_err = NULL;
@@ -1261,7 +1264,7 @@ bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
                                 "failed to receive packet"
                                 " via multifd channel %d: ",
                                 qatomic_read(&multifd_recv_state->count));
 -        return false;
 +        return;
     }
     trace_multifd_recv_new_channel(id);
@@ -1271,7 +1274,7 @@ bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
                    id);
         multifd_recv_terminate_threads(local_err);
         error_propagate(errp, local_err);
 -        return false;
 +        return;
     }
     p->c = ioc;
     object_ref(OBJECT(ioc));
@@ -1282,6 +1285,4 @@ bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
                        QEMU_THREAD_JOINABLE);
     qatomic_inc(&multifd_recv_state->count);
 -    return qatomic_read(&multifd_recv_state->count) ==
 -           migrate_multifd_channels();
 }
 diff --git a/migration/multifd.h b/migration/multifd.h
 index 11d5e273e6..9c0a2a0701 100644
 --- a/migration/multifd.h
 +++ b/migration/multifd.h
@@ -20,7 +20,7 @@ void multifd_save_cleanup(void);
 int multifd_load_setup(Error **errp);
 int multifd_load_cleanup(Error **errp);
 bool multifd_recv_all_channels_created(void);
 -bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp);
 +void multifd_recv_new_channel(QIOChannel *ioc, Error **errp);
 void multifd_recv_sync_main(void);
 int multifd_send_sync_main(QEMUFile *f);
 int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset);
 -- 
 2.37.3
--- a/SOURCES/kvm-monitor-only-run-coroutine-commands-in-qemu_aio_cont.patch
+++ b/SOURCES/kvm-monitor-only-run-coroutine-commands-in-qemu_aio_cont.patch
--- a/SOURCES/kvm-nbd-server-CVE-2024-7409-Avoid-use-after-free-when-c.patch
+++ b/SOURCES/kvm-nbd-server-CVE-2024-7409-Avoid-use-after-free-when-c.patch
@ -0,0 +1,101 @@
 From 676438ff8c42323c3e5d9e7eeeb1b3367999136c Mon Sep 17 00:00:00 2001
 From: Eric Blake <eblake@redhat.com>
 Date: Thu, 22 Aug 2024 09:35:29 -0500
 Subject: [PATCH 3/3] nbd/server: CVE-2024-7409: Avoid use-after-free when
 closing server
 RH-Author: Eric Blake <eblake@redhat.com>
 RH-MergeRequest: 398: nbd/server: CVE-2024-7409: Avoid use-after-free when closing server
 RH-Jira: RHEL-52611
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Commit: [3/3] 1ee35a40ded067a085bf6fcafa690b40976d7f2d (ebblake/qemu-kvm)
 Commit 3e7ef738 plugged the use-after-free of the global nbd_server
 object, but overlooked a use-after-free of nbd_server->listener.
 Although this race is harder to hit, notice that our shutdown path
 first drops the reference count of nbd_server->listener, then triggers
 actions that can result in a pending client reaching the
 nbd_blockdev_client_closed() callback, which in turn calls
 qio_net_listener_set_client_func on a potentially stale object.
 If we know we don't want any more clients to connect, and have already
 told the listener socket to shut down, then we should not be trying to
 update the listener socket's associated function.
 Reproducer:
 > #!/usr/bin/python3
 >
 > import os
 > from threading import Thread
 >
 > def start_stop():
 >     while 1:
 >         os.system('virsh qemu-monitor-command VM \'{"execute": "nbd-server-start",
 +"arguments":{"addr":{"type":"unix","data":{"path":"/tmp/nbd-sock"}}}}\'')
 >         os.system('virsh qemu-monitor-command VM \'{"execute": "nbd-server-stop"}\'')
 >
 > def nbd_list():
 >     while 1:
 >         os.system('/path/to/build/qemu-nbd -L -k /tmp/nbd-sock')
 >
 > def test():
 >     sst = Thread(target=start_stop)
 >     sst.start()
 >     nlt = Thread(target=nbd_list)
 >     nlt.start()
 >
 >     sst.join()
 >     nlt.join()
 >
 > test()
 Fixes: CVE-2024-7409
 Fixes: 3e7ef738c8 ("nbd/server: CVE-2024-7409: Close stray clients at server-stop")
 CC: qemu-stable@nongnu.org
 Reported-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
 Signed-off-by: Eric Blake <eblake@redhat.com>
 Message-ID: <20240822143617.800419-2-eblake@redhat.com>
 Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
 (cherry picked from commit 3874f5f73c441c52f1c699c848d463b0eda01e4c)
 Jira: https://issues.redhat.com/browse/RHEL-52611
 Signed-off-by: Eric Blake <eblake@redhat.com>
 ---
 blockdev-nbd.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)
 diff --git a/blockdev-nbd.c b/blockdev-nbd.c
 index 87839c180b..b5d55e2518 100644
 --- a/blockdev-nbd.c
 +++ b/blockdev-nbd.c
@@ -87,10 +87,13 @@ static void nbd_accept(QIONetListener *listener, QIOChannelSocket *cioc,
 static void nbd_update_server_watch(NBDServerData *s)
 {
 -    if (!s->max_connections || s->connections < s->max_connections) {
 -        qio_net_listener_set_client_func(s->listener, nbd_accept, NULL, NULL);
 -    } else {
 -        qio_net_listener_set_client_func(s->listener, NULL, NULL, NULL);
 +    if (s->listener) {
 +        if (!s->max_connections || s->connections < s->max_connections) {
 +            qio_net_listener_set_client_func(s->listener, nbd_accept, NULL,
 +                                             NULL);
 +        } else {
 +            qio_net_listener_set_client_func(s->listener, NULL, NULL, NULL);
 +        }
     }
 }
@@ -108,6 +111,7 @@ static void nbd_server_free(NBDServerData *server)
      */
     qio_net_listener_disconnect(server->listener);
     object_unref(OBJECT(server->listener));
 +    server->listener = NULL;
     QLIST_FOREACH_SAFE(conn, &server->conns, next, tmp) {
         qio_channel_shutdown(QIO_CHANNEL(conn->cioc), QIO_CHANNEL_SHUTDOWN_BOTH,
                              NULL);
 -- 
 2.39.3
--- a/SOURCES/kvm-nbd-server-CVE-2024-7409-Cap-default-max-connections.patch
+++ b/SOURCES/kvm-nbd-server-CVE-2024-7409-Cap-default-max-connections.patch
@ -0,0 +1,187 @@
 From adfddc25c82576458442f61efb913e44d83bcbd0 Mon Sep 17 00:00:00 2001
 From: Eric Blake <eblake@redhat.com>
 Date: Tue, 6 Aug 2024 13:53:00 -0500
 Subject: [PATCH 2/5] nbd/server: CVE-2024-7409: Cap default max-connections to
 100
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Eric Blake <eblake@redhat.com>
 RH-MergeRequest: 388: nbd/server: fix CVE-2024-7409 (qemu crash on nbd-server-stop) [rhel-8.10.z]
 RH-Jira: RHEL-52611
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Acked-by: Richard W.M. Jones <rjones@redhat.com>
 RH-Commit: [2/4] 1f5d88d5644c46cbb957778254a993930b9d86dc (ebblake/qemu-kvm)
 Allowing an unlimited number of clients to any web service is a recipe
 for a rudimentary denial of service attack: the client merely needs to
 open lots of sockets without closing them, until qemu no longer has
 any more fds available to allocate.
 For qemu-nbd, we default to allowing only 1 connection unless more are
 explicitly asked for (-e or --shared); this was historically picked as
 a nice default (without an explicit -t, a non-persistent qemu-nbd goes
 away after a client disconnects, without needing any additional
 follow-up commands), and we are not going to change that interface now
 (besides, someday we want to point people towards qemu-storage-daemon
 instead of qemu-nbd).
 But for qemu proper, and the newer qemu-storage-daemon, the QMP
 nbd-server-start command has historically had a default of unlimited
 number of connections, in part because unlike qemu-nbd it is
 inherently persistent until nbd-server-stop.  Allowing multiple client
 sockets is particularly useful for clients that can take advantage of
 MULTI_CONN (creating parallel sockets to increase throughput),
 although known clients that do so (such as libnbd's nbdcopy) typically
 use only 8 or 16 connections (the benefits of scaling diminish once
 more sockets are competing for kernel attention).  Picking a number
 large enough for typical use cases, but not unlimited, makes it
 slightly harder for a malicious client to perform a denial of service
 merely by opening lots of connections without progressing through the
 handshake.
 This change does not eliminate CVE-2024-7409 on its own, but reduces
 the chance for fd exhaustion or unlimited memory usage as an attack
 surface.  On the other hand, by itself, it makes it more obvious that
 with a finite limit, we have the problem of an unauthenticated client
 holding 100 fds opened as a way to block out a legitimate client from
 being able to connect; thus, later patches will further add timeouts
 to reject clients that are not making progress.
 This is an INTENTIONAL change in behavior, and will break any client
 of nbd-server-start that was not passing an explicit max-connections
 parameter, yet expects more than 100 simultaneous connections.  We are
 not aware of any such client (as stated above, most clients aware of
 MULTI_CONN get by just fine on 8 or 16 connections, and probably cope
 with later connections failing by relying on the earlier connections;
 libvirt has not yet been passing max-connections, but generally
 creates NBD servers with the intent for a single client for the sake
 of live storage migration; meanwhile, the KubeSAN project anticipates
 a large cluster sharing multiple clients [up to 8 per node, and up to
 100 nodes in a cluster], but it currently uses qemu-nbd with an
 explicit --shared=0 rather than qemu-storage-daemon with
 nbd-server-start).
 We considered using a deprecation period (declare that omitting
 max-connections is deprecated, and make it mandatory in 3 releases -
 then we don't need to pick an arbitrary default); that has zero risk
 of breaking any apps that accidentally depended on more than 100
 connections, and where such breakage might not be noticed under unit
 testing but only under the larger loads of production usage.  But it
 does not close the denial-of-service hole until far into the future,
 and requires all apps to change to add the parameter even if 100 was
 good enough.  It also has a drawback that any app (like libvirt) that
 is accidentally relying on an unlimited default should seriously
 consider their own CVE now, at which point they are going to change to
 pass explicit max-connections sooner than waiting for 3 qemu releases.
 Finally, if our changed default breaks an app, that app can always
 pass in an explicit max-connections with a larger value.
 It is also intentional that the HMP interface to nbd-server-start is
 not changed to expose max-connections (any client needing to fine-tune
 things should be using QMP).
 Suggested-by: Daniel P. Berrangé <berrange@redhat.com>
 Signed-off-by: Eric Blake <eblake@redhat.com>
 Message-ID: <20240807174943.771624-12-eblake@redhat.com>
 Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
 [ericb: Expand commit message to summarize Dan's argument for why we
 break corner-case back-compat behavior without a deprecation period]
 Signed-off-by: Eric Blake <eblake@redhat.com>
 (cherry picked from commit c8a76dbd90c2f48df89b75bef74917f90a59b623)
 Conflicts:
 	qapi/block-export.json - context (no multi-conn, older format)
 Jira: https://issues.redhat.com/browse/RHEL-52611
 Signed-off-by: Eric Blake <eblake@redhat.com>
 ---
 block/monitor/block-hmp-cmds.c | 3 ++-
 blockdev-nbd.c                 | 8 ++++++++
 include/block/nbd.h            | 7 +++++++
 qapi/block-export.json         | 4 ++--
 4 files changed, 19 insertions(+), 3 deletions(-)
 diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
 index 2ac4aedfff..32a666b5dc 100644
 --- a/block/monitor/block-hmp-cmds.c
 +++ b/block/monitor/block-hmp-cmds.c
@@ -411,7 +411,8 @@ void hmp_nbd_server_start(Monitor *mon, const QDict *qdict)
         goto exit;
     }
 -    nbd_server_start(addr, NULL, NULL, 0, &local_err);
 +    nbd_server_start(addr, NULL, NULL, NBD_DEFAULT_MAX_CONNECTIONS,
 +                     &local_err);
     qapi_free_SocketAddress(addr);
     if (local_err != NULL) {
         goto exit;
 diff --git a/blockdev-nbd.c b/blockdev-nbd.c
 index b9e8dc78f3..4bd90bac16 100644
 --- a/blockdev-nbd.c
 +++ b/blockdev-nbd.c
@@ -171,6 +171,10 @@ void nbd_server_start(SocketAddress *addr, const char *tls_creds,
 void nbd_server_start_options(NbdServerOptions *arg, Error **errp)
 {
 +    if (!arg->has_max_connections) {
 +        arg->max_connections = NBD_DEFAULT_MAX_CONNECTIONS;
 +    }
 +
     nbd_server_start(arg->addr, arg->tls_creds, arg->tls_authz,
                      arg->max_connections, errp);
 }
@@ -183,6 +187,10 @@ void qmp_nbd_server_start(SocketAddressLegacy *addr,
 {
     SocketAddress *addr_flat = socket_address_flatten(addr);
 +    if (!has_max_connections) {
 +        max_connections = NBD_DEFAULT_MAX_CONNECTIONS;
 +    }
 +
     nbd_server_start(addr_flat, tls_creds, tls_authz, max_connections, errp);
     qapi_free_SocketAddress(addr_flat);
 }
 diff --git a/include/block/nbd.h b/include/block/nbd.h
 index b71a297249..a31c34a8a6 100644
 --- a/include/block/nbd.h
 +++ b/include/block/nbd.h
@@ -33,6 +33,13 @@ extern const BlockExportDriver blk_exp_nbd;
  */
 #define NBD_DEFAULT_HANDSHAKE_MAX_SECS 10
 +/*
 + * NBD_DEFAULT_MAX_CONNECTIONS: Number of client sockets to allow at
 + * once; must be large enough to allow a MULTI_CONN-aware client like
 + * nbdcopy to create its typical number of 8-16 sockets.
 + */
 +#define NBD_DEFAULT_MAX_CONNECTIONS 100
 +
 /* Handshake phase structs - this struct is passed on the wire */
 struct NBDOption {
 diff --git a/qapi/block-export.json b/qapi/block-export.json
 index c1b92ce1c1..181d7238fe 100644
 --- a/qapi/block-export.json
 +++ b/qapi/block-export.json
@@ -21,7 +21,7 @@
 #             recreated on the fly while the NBD server is active.
 #             If missing, it will default to denying access (since 4.0).
 # @max-connections: The maximum number of connections to allow at the same
 -#                   time, 0 for unlimited. (since 5.2; default: 0)
 +#                   time, 0 for unlimited. (since 5.2; default: 100)
 #
 # Since: 4.2
 ##
@@ -50,7 +50,7 @@
 #             recreated on the fly while the NBD server is active.
 #             If missing, it will default to denying access (since 4.0).
 # @max-connections: The maximum number of connections to allow at the same
 -#                   time, 0 for unlimited. (since 5.2; default: 0)
 +#                   time, 0 for unlimited. (since 5.2; default: 100)
 #
 # Returns: error if the server is already running.
 #
 -- 
 2.39.3
--- a/SOURCES/kvm-nbd-server-CVE-2024-7409-Close-stray-clients-at-serv.patch
+++ b/SOURCES/kvm-nbd-server-CVE-2024-7409-Close-stray-clients-at-serv.patch
@ -0,0 +1,180 @@
 From 4ab086cdf9a5842c49f3fe59baff1747d863b97a Mon Sep 17 00:00:00 2001
 From: Eric Blake <eblake@redhat.com>
 Date: Wed, 7 Aug 2024 12:23:13 -0500
 Subject: [PATCH 4/5] nbd/server: CVE-2024-7409: Close stray clients at
 server-stop
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Eric Blake <eblake@redhat.com>
 RH-MergeRequest: 388: nbd/server: fix CVE-2024-7409 (qemu crash on nbd-server-stop) [rhel-8.10.z]
 RH-Jira: RHEL-52611
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Acked-by: Richard W.M. Jones <rjones@redhat.com>
 RH-Commit: [4/4] 92a20764dbee3cf94181cab412d90cbf92b4a417 (ebblake/qemu-kvm)
 A malicious client can attempt to connect to an NBD server, and then
 intentionally delay progress in the handshake, including if it does
 not know the TLS secrets.  Although the previous two patches reduce
 this behavior by capping the default max-connections parameter and
 killing slow clients, they did not eliminate the possibility of a
 client waiting to close the socket until after the QMP nbd-server-stop
 command is executed, at which point qemu would SEGV when trying to
 dereference the NULL nbd_server global which is no longer present.
 This amounts to a denial of service attack.  Worse, if another NBD
 server is started before the malicious client disconnects, I cannot
 rule out additional adverse effects when the old client interferes
 with the connection count of the new server (although the most likely
 is a crash due to an assertion failure when checking
 nbd_server->connections > 0).
 For environments without this patch, the CVE can be mitigated by
 ensuring (such as via a firewall) that only trusted clients can
 connect to an NBD server.  Note that using frameworks like libvirt
 that ensure that TLS is used and that nbd-server-stop is not executed
 while any trusted clients are still connected will only help if there
 is also no possibility for an untrusted client to open a connection
 but then stall on the NBD handshake.
 Given the previous patches, it would be possible to guarantee that no
 clients remain connected by having nbd-server-stop sleep for longer
 than the default handshake deadline before finally freeing the global
 nbd_server object, but that could make QMP non-responsive for a long
 time.  So instead, this patch fixes the problem by tracking all client
 sockets opened while the server is running, and forcefully closing any
 such sockets remaining without a completed handshake at the time of
 nbd-server-stop, then waiting until the coroutines servicing those
 sockets notice the state change.  nbd-server-stop now has a second
 AIO_WAIT_WHILE_UNLOCKED (the first is indirectly through the
 blk_exp_close_all_type() that disconnects all clients that completed
 handshakes), but forced socket shutdown is enough to progress the
 coroutines and quickly tear down all clients before the server is
 freed, thus finally fixing the CVE.
 This patch relies heavily on the fact that nbd/server.c guarantees
 that it only calls nbd_blockdev_client_closed() from the main loop
 (see the assertion in nbd_client_put() and the hoops used in
 nbd_client_put_nonzero() to achieve that); if we did not have that
 guarantee, we would also need a mutex protecting our accesses of the
 list of connections to survive re-entrancy from independent iothreads.
 Although I did not actually try to test old builds, it looks like this
 problem has existed since at least commit 862172f45c (v2.12.0, 2017) -
 even back when that patch started using a QIONetListener to handle
 listening on multiple sockets, nbd_server_free() was already unaware
 that the nbd_blockdev_client_closed callback can be reached later by a
 client thread that has not completed handshakes (and therefore the
 client's socket never got added to the list closed in
 nbd_export_close_all), despite that patch intentionally tearing down
 the QIONetListener to prevent new clients.
 Reported-by: Alexander Ivanov <alexander.ivanov@virtuozzo.com>
 Fixes: CVE-2024-7409
 CC: qemu-stable@nongnu.org
 Signed-off-by: Eric Blake <eblake@redhat.com>
 Message-ID: <20240807174943.771624-14-eblake@redhat.com>
 Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
 (cherry picked from commit 3e7ef738c8462c45043a1d39f702a0990406a3b3)
 Conflicts:
 - blockdev-nbd.c:
   - qemu_in_main_thread() not backported, but only used in assertions so
     safe to drop
   - AIO_WAIT_WHILE_UNLOCKED() not backported, use AIO_WAIT_WHILE() like
     blk_exp_close_all_type()
 Jira: https://issues.redhat.com/browse/RHEL-52611
 Signed-off-by: Eric Blake <eblake@redhat.com>
 ---
 blockdev-nbd.c | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)
 diff --git a/blockdev-nbd.c b/blockdev-nbd.c
 index 4bd90bac16..87839c180b 100644
 --- a/blockdev-nbd.c
 +++ b/blockdev-nbd.c
@@ -21,12 +21,18 @@
 #include "io/channel-socket.h"
 #include "io/net-listener.h"
 +typedef struct NBDConn {
 +    QIOChannelSocket *cioc;
 +    QLIST_ENTRY(NBDConn) next;
 +} NBDConn;
 +
 typedef struct NBDServerData {
     QIONetListener *listener;
     QCryptoTLSCreds *tlscreds;
     char *tlsauthz;
     uint32_t max_connections;
     uint32_t connections;
 +    QLIST_HEAD(, NBDConn) conns;
 } NBDServerData;
 static NBDServerData *nbd_server;
@@ -46,6 +52,14 @@ bool nbd_server_is_running(void)
 static void nbd_blockdev_client_closed(NBDClient *client, bool ignored)
 {
 +    NBDConn *conn = nbd_client_owner(client);
 +
 +    assert(nbd_server);
 +
 +    object_unref(OBJECT(conn->cioc));
 +    QLIST_REMOVE(conn, next);
 +    g_free(conn);
 +
     nbd_client_put(client);
     assert(nbd_server->connections > 0);
     nbd_server->connections--;
@@ -55,14 +69,20 @@ static void nbd_blockdev_client_closed(NBDClient *client, bool ignored)
 static void nbd_accept(QIONetListener *listener, QIOChannelSocket *cioc,
                        gpointer opaque)
 {
 +    NBDConn *conn = g_new0(NBDConn, 1);
 +
 +    assert(nbd_server);
     nbd_server->connections++;
 +    object_ref(OBJECT(cioc));
 +    conn->cioc = cioc;
 +    QLIST_INSERT_HEAD(&nbd_server->conns, conn, next);
     nbd_update_server_watch(nbd_server);
     qio_channel_set_name(QIO_CHANNEL(cioc), "nbd-server");
     /* TODO - expose handshake timeout as QMP option */
     nbd_client_new(cioc, NBD_DEFAULT_HANDSHAKE_MAX_SECS,
                    nbd_server->tlscreds, nbd_server->tlsauthz,
 -                   nbd_blockdev_client_closed, NULL);
 +                   nbd_blockdev_client_closed, conn);
 }
 static void nbd_update_server_watch(NBDServerData *s)
@@ -76,12 +96,25 @@ static void nbd_update_server_watch(NBDServerData *s)
 static void nbd_server_free(NBDServerData *server)
 {
 +    NBDConn *conn, *tmp;
 +
     if (!server) {
         return;
     }
 +    /*
 +     * Forcefully close the listener socket, and any clients that have
 +     * not yet disconnected on their own.
 +     */
     qio_net_listener_disconnect(server->listener);
     object_unref(OBJECT(server->listener));
 +    QLIST_FOREACH_SAFE(conn, &server->conns, next, tmp) {
 +        qio_channel_shutdown(QIO_CHANNEL(conn->cioc), QIO_CHANNEL_SHUTDOWN_BOTH,
 +                             NULL);
 +    }
 +
 +    AIO_WAIT_WHILE(NULL, server->connections > 0);
 +
     if (server->tlscreds) {
         object_unref(OBJECT(server->tlscreds));
     }
 -- 
 2.39.3
--- a/SOURCES/kvm-nbd-server-CVE-2024-7409-Drop-non-negotiating-client.patch
+++ b/SOURCES/kvm-nbd-server-CVE-2024-7409-Drop-non-negotiating-client.patch
@ -0,0 +1,135 @@
 From faac5261d5a9af155950c4e7779c5a4721562824 Mon Sep 17 00:00:00 2001
 From: Eric Blake <eblake@redhat.com>
 Date: Thu, 8 Aug 2024 16:05:08 -0500
 Subject: [PATCH 3/5] nbd/server: CVE-2024-7409: Drop non-negotiating clients
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Eric Blake <eblake@redhat.com>
 RH-MergeRequest: 388: nbd/server: fix CVE-2024-7409 (qemu crash on nbd-server-stop) [rhel-8.10.z]
 RH-Jira: RHEL-52611
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Acked-by: Richard W.M. Jones <rjones@redhat.com>
 RH-Commit: [3/4] 8c39829f8efbded9af018a4b915af266a55a793a (ebblake/qemu-kvm)
 A client that opens a socket but does not negotiate is merely hogging
 qemu's resources (an open fd and a small amount of memory); and a
 malicious client that can access the port where NBD is listening can
 attempt a denial of service attack by intentionally opening and
 abandoning lots of unfinished connections.  The previous patch put a
 default bound on the number of such ongoing connections, but once that
 limit is hit, no more clients can connect (including legitimate ones).
 The solution is to insist that clients complete handshake within a
 reasonable time limit, defaulting to 10 seconds.  A client that has
 not successfully completed NBD_OPT_GO by then (including the case
 where the client didn't know TLS credentials to even reach the point
 of NBD_OPT_GO) is wasting our time and does not deserve to stay
 connected.  Later patches will allow fine-tuning the limit away from
 the default value (including disabling it for doing integration
 testing of the handshake process itself).
 Note that this patch in isolation actually makes it more likely to see
 qemu SEGV after nbd-server-stop, as any client socket still connected
 when the server shuts down will now be closed after 10 seconds rather
 than at the client's whims.  That will be addressed in the next patch.
 For a demo of this patch in action:
 $ qemu-nbd -f raw -r -t -e 10 file &
 $ nbdsh --opt-mode -c '
 H = list()
 for i in range(20):
  print(i)
  H.insert(i, nbd.NBD())
  H[i].set_opt_mode(True)
  H[i].connect_uri("nbd://localhost")
 '
 $ kill $!
 where later connections get to start progressing once earlier ones are
 forcefully dropped for taking too long, rather than hanging.
 Suggested-by: Daniel P. Berrangé <berrange@redhat.com>
 Signed-off-by: Eric Blake <eblake@redhat.com>
 Message-ID: <20240807174943.771624-13-eblake@redhat.com>
 Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
 [eblake: rebase to changes earlier in series, reduce scope of timer]
 Signed-off-by: Eric Blake <eblake@redhat.com>
 (cherry picked from commit b9b72cb3ce15b693148bd09cef7e50110566d8a0)
 Conflicts:
 	nbd/server.c - context with different aiocontext locking
        nbd/trace-events - context with no client-connection.c
 Jira: https://issues.redhat.com/browse/RHEL-52611
 Signed-off-by: Eric Blake <eblake@redhat.com>
 ---
 nbd/server.c     | 28 +++++++++++++++++++++++++++-
 nbd/trace-events |  1 +
 2 files changed, 28 insertions(+), 1 deletion(-)
 diff --git a/nbd/server.c b/nbd/server.c
 index cc1b6838bf..1265068f70 100644
 --- a/nbd/server.c
 +++ b/nbd/server.c
@@ -2701,22 +2701,48 @@ static void nbd_client_receive_next_request(NBDClient *client)
     }
 }
 +static void nbd_handshake_timer_cb(void *opaque)
 +{
 +    QIOChannel *ioc = opaque;
 +
 +    trace_nbd_handshake_timer_cb();
 +    qio_channel_shutdown(ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
 +}
 +
 static coroutine_fn void nbd_co_client_start(void *opaque)
 {
     NBDClient *client = opaque;
     Error *local_err = NULL;
 +    QEMUTimer *handshake_timer = NULL;
     qemu_co_mutex_init(&client->send_lock);
 -    /* TODO - utilize client->handshake_max_secs */
 +    /*
 +     * Create a timer to bound the time spent in negotiation. If the
 +     * timer expires, it is likely nbd_negotiate will fail because the
 +     * socket was shutdown.
 +     */
 +    if (client->handshake_max_secs > 0) {
 +        handshake_timer = aio_timer_new(qemu_get_aio_context(),
 +                                        QEMU_CLOCK_REALTIME,
 +                                        SCALE_NS,
 +                                        nbd_handshake_timer_cb,
 +                                        client->sioc);
 +        timer_mod(handshake_timer,
 +                  qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
 +                  client->handshake_max_secs * NANOSECONDS_PER_SECOND);
 +    }
 +
     if (nbd_negotiate(client, &local_err)) {
         if (local_err) {
             error_report_err(local_err);
         }
 +        timer_free(handshake_timer);
         client_close(client, false);
         return;
     }
 +    timer_free(handshake_timer);
     nbd_client_receive_next_request(client);
 }
 diff --git a/nbd/trace-events b/nbd/trace-events
 index c4919a2dd5..553546f1f2 100644
 --- a/nbd/trace-events
 +++ b/nbd/trace-events
@@ -73,3 +73,4 @@ nbd_co_receive_request_decode_type(uint64_t handle, uint16_t type, const char *n
 nbd_co_receive_request_payload_received(uint64_t handle, uint32_t len) "Payload received: handle = %" PRIu64 ", len = %" PRIu32
 nbd_co_receive_align_compliance(const char *op, uint64_t from, uint32_t len, uint32_t align) "client sent non-compliant unaligned %s request: from=0x%" PRIx64 ", len=0x%" PRIx32 ", align=0x%" PRIx32
 nbd_trip(void) "Reading request"
 +nbd_handshake_timer_cb(void) "client took too long to negotiate"
 -- 
 2.39.3
--- a/SOURCES/kvm-nbd-server-Favor-qemu_aio_context-over-iohandler-con.patch
+++ b/SOURCES/kvm-nbd-server-Favor-qemu_aio_context-over-iohandler-con.patch
@ -0,0 +1,161 @@
 From 00af174d1388ed2d2df7961ee78be6af3757a01c Mon Sep 17 00:00:00 2001
 From: Eric Blake <eblake@redhat.com>
 Date: Wed, 30 Aug 2023 18:48:02 -0400
 Subject: [PATCH 1/3] nbd/server: Favor qemu_aio_context over iohandler context
 RH-Author: Eric Blake <eblake@redhat.com>
 RH-MergeRequest: 398: nbd/server: CVE-2024-7409: Avoid use-after-free when closing server
 RH-Jira: RHEL-52611
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Commit: [1/3] 6ec0ef287fbc976175da83a0c14d9878e83affa2 (ebblake/qemu-kvm)
 DOWNSTREAM ONLY - but based on an idea originally included as a
 side-effect in the larger upstream patch 06e0f098 "io: follow
 coroutine AioContext in qio_channel_yield()", as well as handling the
 state of the qio TLS channel before it is associated with a block
 device as an alternative to 199e84de "qio: Inherit
 follow_coroutine_ctx across TLS".
 The NBD server code wants to use qio_channel_shutdown() followed by
 AIO_WAIT_WHILE() during nbd_server_free(), but cannot attach the ioc
 to an AioContext until the client has completed the handshake to the
 point that the server knows what block device to associate with the
 connection.  The qio code is set up to handle connections with no
 AioContext in the iohandler context, but this context is specifically
 designed to NOT make progress during AIO_WAIT_WHILE().  In order to
 prevent things from deadlocking, the qio channels handling NBD
 handshake MUST be in the qemu_aio_context, so that an early shutdown
 triggered by nbd-server-stop can make progress.
 Note that upstream handled the main qio channel by the use of
 qio_channel_set_follow_coroutine_ctx() in only one place in
 nbd/server.c; upstream handled the TLS channel by a more generic
 second patch that taught qio TLS channel to inherit the
 follow_coroutine_ctx status from its parent.  But since this patch is
 already downstream only, the minimal diff is achieved by manually
 setting the status of the TLS channel in NBD code, rather than
 backporting the qio inheritance code.  For testing that the second
 call to qio_channel_set_favor_qemu_aio_ctx() matters, I used this test
 setup (borrowing a pre-built PSK file for username alice from the
 libnbd project, and using IPv4 since this qemu is too old to support
 TLS over Unix sockets):
 $ # in terminal 1:
 $ qemu-system-x86_64 --nographic --nodefaults --qmp stdio \
  --object tls-creds-psk,id=tls0,dir=/PATHTO/libnbd/tests,endpoint=server
 {"execute": "qmp_capabilities"}
 {"execute":"nbd-server-start","arguments":{"addr":{"type":"inet",
  "data":{"host":"127.0.0.1","port":"10809"}},"tls-creds":"tls0"}}
 $ # in terminal 2:
 $ nbdsh -c 'h.set_uri_allow_local_file(True)' --opt-mode -u \
  'nbds://alice@127.0.0.1/?tls-psk-file=/PATHTO/libnbd/tests/keys.psk' \
  -c 'import time; time.sleep(15)'
 $ # in terminal 1, before 10 seconds elapse
 {"execute":"nbd-server-stop"}
 {"execute":"quit"}
 and observed that, when omitting the one-line TLS setting, qemu would
 hit the same deadlock with a TLS client as what I was observing for a
 non-TLS client without this entire patch.
 Jira: https://issues.redhat.com/browse/RHEL-52611
 Suggested-by: Kevin Wolf <kwolf@redhat.com>
 Signed-off-by: Eric Blake <eblake@redhat.com>
 ---
 include/io/channel.h | 16 ++++++++++++++++
 io/channel.c         | 14 +++++++++++++-
 nbd/server.c         |  2 ++
 3 files changed, 31 insertions(+), 1 deletion(-)
 diff --git a/include/io/channel.h b/include/io/channel.h
 index 716235d496..f1ce19ea81 100644
 --- a/include/io/channel.h
 +++ b/include/io/channel.h
@@ -84,6 +84,7 @@ struct QIOChannel {
     AioContext *ctx;
     Coroutine *read_coroutine;
     Coroutine *write_coroutine;
 +    bool favor_qemu_aio_ctx;
 #ifdef _WIN32
     HANDLE event; /* For use with GSource on Win32 */
 #endif
@@ -498,6 +499,21 @@ int qio_channel_set_blocking(QIOChannel *ioc,
                              bool enabled,
                              Error **errp);
 +/**
 + * qio_channel_set_favor_qemu_aio_ctx:
 + * @ioc: the channel object
 + * @enabled: whether to fall back to qemu_aio_context
 + *
 + * If @enabled is true, calls to qio_channel_yield() with no AioContext
 + * set use the qemu_aio_context instead of the global iohandler context.
 + *
 + * If @enabled is false, calls to qio_channel_yield() use the global iohandler
 + * AioContext. This may be used by coroutines that run in the main loop and
 + * do not wish to respond to I/O during nested event loops. This is the
 + * default for compatibility with code that is not aware of AioContexts.
 + */
 +void qio_channel_set_favor_qemu_aio_ctx(QIOChannel *ioc, bool enabled);
 +
 /**
  * qio_channel_close:
  * @ioc: the channel object
 diff --git a/io/channel.c b/io/channel.c
 index a8c7f11649..74704d0464 100644
 --- a/io/channel.c
 +++ b/io/channel.c
@@ -364,6 +364,12 @@ int qio_channel_set_blocking(QIOChannel *ioc,
 }
 +void qio_channel_set_favor_qemu_aio_ctx(QIOChannel *ioc, bool enabled)
 +{
 +    ioc->favor_qemu_aio_ctx = enabled;
 +}
 +
 +
 int qio_channel_close(QIOChannel *ioc,
                       Error **errp)
 {
@@ -545,7 +551,13 @@ static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc)
         wr_handler = qio_channel_restart_write;
     }
 -    ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context();
 +    if (ioc->ctx) {
 +        ctx = ioc->ctx;
 +    } else if (ioc->favor_qemu_aio_ctx) {
 +        ctx = qemu_get_aio_context();
 +    } else {
 +        ctx = iohandler_get_aio_context();
 +    }
     qio_channel_set_aio_fd_handler(ioc, ctx, rd_handler, wr_handler, ioc);
 }
 diff --git a/nbd/server.c b/nbd/server.c
 index 1265068f70..41a2003300 100644
 --- a/nbd/server.c
 +++ b/nbd/server.c
@@ -758,6 +758,7 @@ static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client,
         return NULL;
     }
 +    qio_channel_set_favor_qemu_aio_ctx(QIO_CHANNEL(tioc), true);
     qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-server-tls");
     trace_nbd_negotiate_handle_starttls_handshake();
     data.loop = g_main_loop_new(g_main_context_default(), FALSE);
@@ -1333,6 +1334,7 @@ static coroutine_fn int nbd_negotiate(NBDClient *client, Error **errp)
      */
     qio_channel_set_blocking(client->ioc, false, NULL);
 +    qio_channel_set_favor_qemu_aio_ctx(client->ioc, true);
     trace_nbd_negotiate_begin();
     memcpy(buf, "NBDMAGIC", 8);
 -- 
 2.39.3
--- a/SOURCES/kvm-nbd-server-Plumb-in-new-args-to-nbd_client_add.patch
+++ b/SOURCES/kvm-nbd-server-Plumb-in-new-args-to-nbd_client_add.patch
@ -0,0 +1,174 @@
 From 0d204cb81aec2b13254a0bd53938f53bfea81cb5 Mon Sep 17 00:00:00 2001
 From: Eric Blake <eblake@redhat.com>
 Date: Wed, 7 Aug 2024 08:50:01 -0500
 Subject: [PATCH 1/5] nbd/server: Plumb in new args to nbd_client_add()
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Eric Blake <eblake@redhat.com>
 RH-MergeRequest: 388: nbd/server: fix CVE-2024-7409 (qemu crash on nbd-server-stop) [rhel-8.10.z]
 RH-Jira: RHEL-52611
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Acked-by: Richard W.M. Jones <rjones@redhat.com>
 RH-Commit: [1/4] 292be8dd2df2a840b2200e31a27e9d17fdab91ad (ebblake/qemu-kvm)
 Upcoming patches to fix a CVE need to track an opaque pointer passed
 in by the owner of a client object, as well as request for a time
 limit on how fast negotiation must complete.  Prepare for that by
 changing the signature of nbd_client_new() and adding an accessor to
 get at the opaque pointer, although for now the two servers
 (qemu-nbd.c and blockdev-nbd.c) do not change behavior even though
 they pass in a new default timeout value.
 Suggested-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
 Signed-off-by: Eric Blake <eblake@redhat.com>
 Message-ID: <20240807174943.771624-11-eblake@redhat.com>
 Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
 [eblake: s/LIMIT/MAX_SECS/ as suggested by Dan]
 Signed-off-by: Eric Blake <eblake@redhat.com>
 (cherry picked from commit fb1c2aaa981e0a2fa6362c9985f1296b74f055ac)
 Jira: https://issues.redhat.com/browse/RHEL-52611
 Signed-off-by: Eric Blake <eblake@redhat.com>
 ---
 blockdev-nbd.c      |  6 ++++--
 include/block/nbd.h | 11 ++++++++++-
 nbd/server.c        | 20 +++++++++++++++++---
 qemu-nbd.c          |  4 +++-
 4 files changed, 34 insertions(+), 7 deletions(-)
 diff --git a/blockdev-nbd.c b/blockdev-nbd.c
 index bdfa7ed3a5..b9e8dc78f3 100644
 --- a/blockdev-nbd.c
 +++ b/blockdev-nbd.c
@@ -59,8 +59,10 @@ static void nbd_accept(QIONetListener *listener, QIOChannelSocket *cioc,
     nbd_update_server_watch(nbd_server);
     qio_channel_set_name(QIO_CHANNEL(cioc), "nbd-server");
 -    nbd_client_new(cioc, nbd_server->tlscreds, nbd_server->tlsauthz,
 -                   nbd_blockdev_client_closed);
 +    /* TODO - expose handshake timeout as QMP option */
 +    nbd_client_new(cioc, NBD_DEFAULT_HANDSHAKE_MAX_SECS,
 +                   nbd_server->tlscreds, nbd_server->tlsauthz,
 +                   nbd_blockdev_client_closed, NULL);
 }
 static void nbd_update_server_watch(NBDServerData *s)
 diff --git a/include/block/nbd.h b/include/block/nbd.h
 index 78d101b774..b71a297249 100644
 --- a/include/block/nbd.h
 +++ b/include/block/nbd.h
@@ -27,6 +27,12 @@
 extern const BlockExportDriver blk_exp_nbd;
 +/*
 + * NBD_DEFAULT_HANDSHAKE_MAX_SECS: Number of seconds in which client must
 + * succeed at NBD_OPT_GO before being forcefully dropped as too slow.
 + */
 +#define NBD_DEFAULT_HANDSHAKE_MAX_SECS 10
 +
 /* Handshake phase structs - this struct is passed on the wire */
 struct NBDOption {
@@ -338,9 +344,12 @@ AioContext *nbd_export_aio_context(NBDExport *exp);
 NBDExport *nbd_export_find(const char *name);
 void nbd_client_new(QIOChannelSocket *sioc,
 +                    uint32_t handshake_max_secs,
                     QCryptoTLSCreds *tlscreds,
                     const char *tlsauthz,
 -                    void (*close_fn)(NBDClient *, bool));
 +                    void (*close_fn)(NBDClient *, bool),
 +                    void *owner);
 +void *nbd_client_owner(NBDClient *client);
 void nbd_client_get(NBDClient *client);
 void nbd_client_put(NBDClient *client);
 diff --git a/nbd/server.c b/nbd/server.c
 index 6db124cf53..cc1b6838bf 100644
 --- a/nbd/server.c
 +++ b/nbd/server.c
@@ -120,10 +120,12 @@ typedef struct NBDExportMetaContexts {
 struct NBDClient {
     int refcount;
     void (*close_fn)(NBDClient *client, bool negotiated);
 +    void *owner;
     NBDExport *exp;
     QCryptoTLSCreds *tlscreds;
     char *tlsauthz;
 +    uint32_t handshake_max_secs;
     QIOChannelSocket *sioc; /* The underlying data channel */
     QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */
@@ -2706,6 +2708,7 @@ static coroutine_fn void nbd_co_client_start(void *opaque)
     qemu_co_mutex_init(&client->send_lock);
 +    /* TODO - utilize client->handshake_max_secs */
     if (nbd_negotiate(client, &local_err)) {
         if (local_err) {
             error_report_err(local_err);
@@ -2718,14 +2721,17 @@ static coroutine_fn void nbd_co_client_start(void *opaque)
 }
 /*
 - * Create a new client listener using the given channel @sioc.
 + * Create a new client listener using the given channel @sioc and @owner.
  * Begin servicing it in a coroutine.  When the connection closes, call
 - * @close_fn with an indication of whether the client completed negotiation.
 + * @close_fn with an indication of whether the client completed negotiation
 + * within @handshake_max_secs seconds (0 for unbounded).
  */
 void nbd_client_new(QIOChannelSocket *sioc,
 +                    uint32_t handshake_max_secs,
                     QCryptoTLSCreds *tlscreds,
                     const char *tlsauthz,
 -                    void (*close_fn)(NBDClient *, bool))
 +                    void (*close_fn)(NBDClient *, bool),
 +                    void *owner)
 {
     NBDClient *client;
     Coroutine *co;
@@ -2737,13 +2743,21 @@ void nbd_client_new(QIOChannelSocket *sioc,
         object_ref(OBJECT(client->tlscreds));
     }
     client->tlsauthz = g_strdup(tlsauthz);
 +    client->handshake_max_secs = handshake_max_secs;
     client->sioc = sioc;
     qio_channel_set_delay(QIO_CHANNEL(sioc), false);
     object_ref(OBJECT(client->sioc));
     client->ioc = QIO_CHANNEL(sioc);
     object_ref(OBJECT(client->ioc));
     client->close_fn = close_fn;
 +    client->owner = owner;
     co = qemu_coroutine_create(nbd_co_client_start, client);
     qemu_coroutine_enter(co);
 }
 +
 +void *
 +nbd_client_owner(NBDClient *client)
 +{
 +    return client->owner;
 +}
 diff --git a/qemu-nbd.c b/qemu-nbd.c
 index c6c20df68a..f48abf379e 100644
 --- a/qemu-nbd.c
 +++ b/qemu-nbd.c
@@ -363,7 +363,9 @@ static void nbd_accept(QIONetListener *listener, QIOChannelSocket *cioc,
     nb_fds++;
     nbd_update_server_watch();
 -    nbd_client_new(cioc, tlscreds, tlsauthz, nbd_client_closed);
 +    /* TODO - expose handshake timeout as command line option */
 +    nbd_client_new(cioc, NBD_DEFAULT_HANDSHAKE_MAX_SECS,
 +                   tlscreds, tlsauthz, nbd_client_closed, NULL);
 }
 static void nbd_update_server_watch(void)
 -- 
 2.39.3
--- a/SOURCES/kvm-nbd-server-Request-TCP_NODELAY.patch
+++ b/SOURCES/kvm-nbd-server-Request-TCP_NODELAY.patch
@ -0,0 +1,55 @@
 From 17c5524ada3f2ca9a9c645f540bedc5575302059 Mon Sep 17 00:00:00 2001
 From: Eric Blake <eblake@redhat.com>
 Date: Mon, 3 Apr 2023 19:40:47 -0500
 Subject: [PATCH 5/5] nbd/server: Request TCP_NODELAY
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Eric Blake <eblake@redhat.com>
 RH-MergeRequest: 274: nbd: improve TLS performance of NBD server
 RH-Bugzilla: 2035712
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Acked-by: Stefano Garzarella <sgarzare@redhat.com>
 RH-Commit: [2/2] 092145077756cda2a4f849c5911031b0fc4a2134 (ebblake/qemu-kvm)
 Nagle's algorithm adds latency in order to reduce network packet
 overhead on small packets.  But when we are already using corking to
 merge smaller packets into transactional requests, the extra delay
 from TCP defaults just gets in the way (see recent commit bd2cd4a4).
 For reference, qemu as an NBD client already requests TCP_NODELAY (see
 nbd_connect() in nbd/client-connection.c); as does libnbd as a client
 [1], and nbdkit as a server [2].  Furthermore, the NBD spec recommends
 the use of TCP_NODELAY [3].
 [1] https://gitlab.com/nbdkit/libnbd/-/blob/a48a1142/generator/states-connect.c#L39
 [2] https://gitlab.com/nbdkit/nbdkit/-/blob/45b72f5b/server/sockets.c#L430
 [3] https://github.com/NetworkBlockDevice/nbd/blob/master/doc/proto.md#protocol-phases
 CC: Florian Westphal <fw@strlen.de>
 Signed-off-by: Eric Blake <eblake@redhat.com>
 Message-Id: <20230404004047.142086-1-eblake@redhat.com>
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 (cherry picked from commit f1426881a827a6d3f31b65616c4a8db1e9e7c45e)
 Signed-off-by: Eric Blake <eblake@redhat.com>
 ---
 nbd/server.c | 1 +
 1 file changed, 1 insertion(+)
 diff --git a/nbd/server.c b/nbd/server.c
 index a5edc7f681..6db124cf53 100644
 --- a/nbd/server.c
 +++ b/nbd/server.c
@@ -2738,6 +2738,7 @@ void nbd_client_new(QIOChannelSocket *sioc,
     }
     client->tlsauthz = g_strdup(tlsauthz);
     client->sioc = sioc;
 +    qio_channel_set_delay(QIO_CHANNEL(sioc), false);
     object_ref(OBJECT(client->sioc));
     client->ioc = QIO_CHANNEL(sioc);
     object_ref(OBJECT(client->ioc));
 -- 
 2.39.1
--- a/SOURCES/kvm-nbd-server-push-pending-frames-after-sending-reply.patch
+++ b/SOURCES/kvm-nbd-server-push-pending-frames-after-sending-reply.patch
@ -0,0 +1,72 @@
 From 170872370c6f3c916e741eb32d80431995d7a870 Mon Sep 17 00:00:00 2001
 From: Florian Westphal <fw@strlen.de>
 Date: Fri, 24 Mar 2023 11:47:20 +0100
 Subject: [PATCH 4/5] nbd/server: push pending frames after sending reply
 RH-Author: Eric Blake <eblake@redhat.com>
 RH-MergeRequest: 274: nbd: improve TLS performance of NBD server
 RH-Bugzilla: 2035712
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Acked-by: Stefano Garzarella <sgarzare@redhat.com>
 RH-Commit: [1/2] ab92c06c48810aa40380de0433dcac4c6e4be9a5 (ebblake/qemu-kvm)
 qemu-nbd doesn't set TCP_NODELAY on the tcp socket.
 Kernel waits for more data and avoids transmission of small packets.
 Without TLS this is barely noticeable, but with TLS this really shows.
 Booting a VM via qemu-nbd on localhost (with tls) takes more than
 2 minutes on my system.  tcpdump shows frequent wait periods, where no
 packets get sent for a 40ms period.
 Add explicit (un)corking when processing (and responding to) requests.
 "TCP_CORK, &zero" after earlier "CORK, &one" will flush pending data.
 VM Boot time:
 main:    no tls:  23s, with tls: 2m45s
 patched: no tls:  14s, with tls: 15s
 VM Boot time, qemu-nbd via network (same lan):
 main:    no tls:  18s, with tls: 1m50s
 patched: no tls:  17s, with tls: 18s
 Future optimization: if we could detect if there is another pending
 request we could defer the uncork operation because more data would be
 appended.
 Signed-off-by: Florian Westphal <fw@strlen.de>
 Message-Id: <20230324104720.2498-1-fw@strlen.de>
 Reviewed-by: Eric Blake <eblake@redhat.com>
 Reviewed-by: Kevin Wolf <kwolf@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 (cherry picked from commit bd2cd4a441ded163b62371790876f28a9b834317)
 Signed-off-by: Eric Blake <eblake@redhat.com>
 ---
 nbd/server.c | 3 +++
 1 file changed, 3 insertions(+)
 diff --git a/nbd/server.c b/nbd/server.c
 index 4630dd7322..a5edc7f681 100644
 --- a/nbd/server.c
 +++ b/nbd/server.c
@@ -2647,6 +2647,8 @@ static coroutine_fn void nbd_trip(void *opaque)
         goto disconnect;
     }
 +    qio_channel_set_cork(client->ioc, true);
 +
     if (ret < 0) {
         /* It wans't -EIO, so, according to nbd_co_receive_request()
          * semantics, we should return the error to the client. */
@@ -2672,6 +2674,7 @@ static coroutine_fn void nbd_trip(void *opaque)
         goto disconnect;
     }
 +    qio_channel_set_cork(client->ioc, false);
 done:
     nbd_request_put(req);
     nbd_client_put(client);
 -- 
 2.39.1
--- a/SOURCES/kvm-net-Provide-MemReentrancyGuard-to-qemu_new_nic.patch
+++ b/SOURCES/kvm-net-Provide-MemReentrancyGuard-to-qemu_new_nic.patch
@ -0,0 +1,611 @@
 From 2ae925a6d55a77627be8d1146f2b9ed139dbdb77 Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Thu, 23 Nov 2023 11:30:46 -0500
 Subject: [PATCH 1/4] net: Provide MemReentrancyGuard * to qemu_new_nic()
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 331: net: Provide MemReentrancyGuard * to qemu_new_nic()
 RH-Jira: RHEL-7309
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Acked-by: Laurent Vivier <lvivier@redhat.com>
 RH-Acked-by: Jason Wang <jasowang@redhat.com>
 RH-Commit: [1/2] bc963fb349b90288f547de97a5cbe9a74f856419 (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 Jira: https://issues.redhat.com/browse/RHEL-7309
 CVE: CVE-2023-3019
 Upstream: Merged
 Conflicts: hw/net/xen_nic.c seems to have undergone significant changes upstream,
           so the change had to be manually adapted to the old code.
 commit 7d0fefdf81f5973334c344f6b8e1896c309dff66
 Author: Akihiko Odaki <akihiko.odaki@daynix.com>
 Date:   Thu Jun 1 12:18:58 2023 +0900
    net: Provide MemReentrancyGuard * to qemu_new_nic()
    Recently MemReentrancyGuard was added to DeviceState to record that the
    device is engaging in I/O. The network device backend needs to update it
    when delivering a packet to a device.
    In preparation for such a change, add MemReentrancyGuard * as a
    parameter of qemu_new_nic().
    Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
    Reviewed-by: Alexander Bulekov <alxndr@bu.edu>
    Signed-off-by: Jason Wang <jasowang@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 hw/net/allwinner-sun8i-emac.c | 3 ++-
 hw/net/allwinner_emac.c       | 3 ++-
 hw/net/cadence_gem.c          | 3 ++-
 hw/net/dp8393x.c              | 3 ++-
 hw/net/e1000.c                | 3 ++-
 hw/net/e1000e.c               | 2 +-
 hw/net/eepro100.c             | 4 +++-
 hw/net/etraxfs_eth.c          | 3 ++-
 hw/net/fsl_etsec/etsec.c      | 3 ++-
 hw/net/ftgmac100.c            | 3 ++-
 hw/net/i82596.c               | 2 +-
 hw/net/imx_fec.c              | 2 +-
 hw/net/lan9118.c              | 3 ++-
 hw/net/mcf_fec.c              | 3 ++-
 hw/net/mipsnet.c              | 3 ++-
 hw/net/msf2-emac.c            | 3 ++-
 hw/net/ne2000-isa.c           | 3 ++-
 hw/net/ne2000-pci.c           | 3 ++-
 hw/net/npcm7xx_emc.c          | 3 ++-
 hw/net/opencores_eth.c        | 3 ++-
 hw/net/pcnet.c                | 3 ++-
 hw/net/rocker/rocker_fp.c     | 4 ++--
 hw/net/rtl8139.c              | 3 ++-
 hw/net/smc91c111.c            | 3 ++-
 hw/net/spapr_llan.c           | 3 ++-
 hw/net/stellaris_enet.c       | 3 ++-
 hw/net/sungem.c               | 2 +-
 hw/net/sunhme.c               | 3 ++-
 hw/net/tulip.c                | 3 ++-
 hw/net/virtio-net.c           | 6 ++++--
 hw/net/vmxnet3.c              | 2 +-
 hw/net/xen_nic.c              | 3 ++-
 hw/net/xgmac.c                | 3 ++-
 hw/net/xilinx_axienet.c       | 3 ++-
 hw/net/xilinx_ethlite.c       | 3 ++-
 hw/usb/dev-network.c          | 3 ++-
 include/net/net.h             | 1 +
 net/net.c                     | 1 +
 38 files changed, 72 insertions(+), 38 deletions(-)
 diff --git a/hw/net/allwinner-sun8i-emac.c b/hw/net/allwinner-sun8i-emac.c
 index ff611f18fb..9d0885ee15 100644
 --- a/hw/net/allwinner-sun8i-emac.c
 +++ b/hw/net/allwinner-sun8i-emac.c
@@ -810,7 +810,8 @@ static void allwinner_sun8i_emac_realize(DeviceState *dev, Error **errp)
     qemu_macaddr_default_if_unset(&s->conf.macaddr);
     s->nic = qemu_new_nic(&net_allwinner_sun8i_emac_info, &s->conf,
 -                           object_get_typename(OBJECT(dev)), dev->id, s);
 +                          object_get_typename(OBJECT(dev)), dev->id,
 +                          &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
 }
 diff --git a/hw/net/allwinner_emac.c b/hw/net/allwinner_emac.c
 index ddddf35c45..b3d73143bf 100644
 --- a/hw/net/allwinner_emac.c
 +++ b/hw/net/allwinner_emac.c
@@ -453,7 +453,8 @@ static void aw_emac_realize(DeviceState *dev, Error **errp)
     qemu_macaddr_default_if_unset(&s->conf.macaddr);
     s->nic = qemu_new_nic(&net_aw_emac_info, &s->conf,
 -                          object_get_typename(OBJECT(dev)), dev->id, s);
 +                          object_get_typename(OBJECT(dev)), dev->id,
 +                          &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
     fifo8_create(&s->rx_fifo, RX_FIFO_SIZE);
 diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c
 index 24b3a0ff66..cb61a76417 100644
 --- a/hw/net/cadence_gem.c
 +++ b/hw/net/cadence_gem.c
@@ -1633,7 +1633,8 @@ static void gem_realize(DeviceState *dev, Error **errp)
     qemu_macaddr_default_if_unset(&s->conf.macaddr);
     s->nic = qemu_new_nic(&net_gem_info, &s->conf,
 -                          object_get_typename(OBJECT(dev)), dev->id, s);
 +                          object_get_typename(OBJECT(dev)), dev->id,
 +                          &dev->mem_reentrancy_guard, s);
     if (s->jumbo_max_len > MAX_FRAME_SIZE) {
         error_setg(errp, "jumbo-max-len is greater than %d",
 diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c
 index 45b954e46c..abfcc6f69f 100644
 --- a/hw/net/dp8393x.c
 +++ b/hw/net/dp8393x.c
@@ -943,7 +943,8 @@ static void dp8393x_realize(DeviceState *dev, Error **errp)
                           "dp8393x-regs", SONIC_REG_COUNT << s->it_shift);
     s->nic = qemu_new_nic(&net_dp83932_info, &s->conf,
 -                          object_get_typename(OBJECT(dev)), dev->id, s);
 +                          object_get_typename(OBJECT(dev)), dev->id,
 +                          &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
     s->watchdog = timer_new_ns(QEMU_CLOCK_VIRTUAL, dp8393x_watchdog, s);
 diff --git a/hw/net/e1000.c b/hw/net/e1000.c
 index 282d01e374..86da1ae39e 100644
 --- a/hw/net/e1000.c
 +++ b/hw/net/e1000.c
@@ -1733,7 +1733,8 @@ static void pci_e1000_realize(PCIDevice *pci_dev, Error **errp)
                                macaddr);
     d->nic = qemu_new_nic(&net_e1000_info, &d->conf,
 -                          object_get_typename(OBJECT(d)), dev->id, d);
 +                          object_get_typename(OBJECT(d)), dev->id,
 +                          &dev->mem_reentrancy_guard, d);
     qemu_format_nic_info_str(qemu_get_queue(d->nic), macaddr);
 diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
 index d35bc1f0b0..c6096fa848 100644
 --- a/hw/net/e1000e.c
 +++ b/hw/net/e1000e.c
@@ -340,7 +340,7 @@ e1000e_init_net_peer(E1000EState *s, PCIDevice *pci_dev, uint8_t *macaddr)
     int i;
     s->nic = qemu_new_nic(&net_e1000e_info, &s->conf,
 -        object_get_typename(OBJECT(s)), dev->id, s);
 +        object_get_typename(OBJECT(s)), dev->id, &dev->mem_reentrancy_guard, s);
     s->core.max_queue_num = s->conf.peers.queues ? s->conf.peers.queues - 1 : 0;
 diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c
 index 16e95ef9cc..16ca4dda04 100644
 --- a/hw/net/eepro100.c
 +++ b/hw/net/eepro100.c
@@ -1865,7 +1865,9 @@ static void e100_nic_realize(PCIDevice *pci_dev, Error **errp)
     nic_reset(s);
     s->nic = qemu_new_nic(&net_eepro100_info, &s->conf,
 -                          object_get_typename(OBJECT(pci_dev)), pci_dev->qdev.id, s);
 +                          object_get_typename(OBJECT(pci_dev)),
 +                          pci_dev->qdev.id,
 +                          &pci_dev->qdev.mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
     TRACE(OTHER, logout("%s\n", qemu_get_queue(s->nic)->info_str));
 diff --git a/hw/net/etraxfs_eth.c b/hw/net/etraxfs_eth.c
 index 1b82aec794..ba57a978d1 100644
 --- a/hw/net/etraxfs_eth.c
 +++ b/hw/net/etraxfs_eth.c
@@ -618,7 +618,8 @@ static void etraxfs_eth_realize(DeviceState *dev, Error **errp)
     qemu_macaddr_default_if_unset(&s->conf.macaddr);
     s->nic = qemu_new_nic(&net_etraxfs_info, &s->conf,
 -                          object_get_typename(OBJECT(s)), dev->id, s);
 +                          object_get_typename(OBJECT(s)), dev->id,
 +                          &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
     s->phy.read = tdk_read;
 diff --git a/hw/net/fsl_etsec/etsec.c b/hw/net/fsl_etsec/etsec.c
 index bd9d62b559..f790613b52 100644
 --- a/hw/net/fsl_etsec/etsec.c
 +++ b/hw/net/fsl_etsec/etsec.c
@@ -391,7 +391,8 @@ static void etsec_realize(DeviceState *dev, Error **errp)
     eTSEC        *etsec = ETSEC_COMMON(dev);
     etsec->nic = qemu_new_nic(&net_etsec_info, &etsec->conf,
 -                              object_get_typename(OBJECT(dev)), dev->id, etsec);
 +                              object_get_typename(OBJECT(dev)), dev->id,
 +                              &dev->mem_reentrancy_guard, etsec);
     qemu_format_nic_info_str(qemu_get_queue(etsec->nic), etsec->conf.macaddr.a);
     etsec->ptimer = ptimer_init(etsec_timer_hit, etsec, PTIMER_POLICY_DEFAULT);
 diff --git a/hw/net/ftgmac100.c b/hw/net/ftgmac100.c
 index 25685ba3a9..781e7f352e 100644
 --- a/hw/net/ftgmac100.c
 +++ b/hw/net/ftgmac100.c
@@ -1111,7 +1111,8 @@ static void ftgmac100_realize(DeviceState *dev, Error **errp)
     qemu_macaddr_default_if_unset(&s->conf.macaddr);
     s->nic = qemu_new_nic(&net_ftgmac100_info, &s->conf,
 -                          object_get_typename(OBJECT(dev)), dev->id, s);
 +                          object_get_typename(OBJECT(dev)), dev->id,
 +                          &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
 }
 diff --git a/hw/net/i82596.c b/hw/net/i82596.c
 index ec21e2699a..dc64246f75 100644
 --- a/hw/net/i82596.c
 +++ b/hw/net/i82596.c
@@ -743,7 +743,7 @@ void i82596_common_init(DeviceState *dev, I82596State *s, NetClientInfo *info)
         qemu_macaddr_default_if_unset(&s->conf.macaddr);
     }
     s->nic = qemu_new_nic(info, &s->conf, object_get_typename(OBJECT(dev)),
 -                dev->id, s);
 +                dev->id, &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
     if (USE_TIMER) {
 diff --git a/hw/net/imx_fec.c b/hw/net/imx_fec.c
 index 9c7035bc94..ed19ee9350 100644
 --- a/hw/net/imx_fec.c
 +++ b/hw/net/imx_fec.c
@@ -1310,7 +1310,7 @@ static void imx_eth_realize(DeviceState *dev, Error **errp)
     s->nic = qemu_new_nic(&imx_eth_net_info, &s->conf,
                           object_get_typename(OBJECT(dev)),
 -                          dev->id, s);
 +                          dev->id, &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
 }
 diff --git a/hw/net/lan9118.c b/hw/net/lan9118.c
 index 6aff424cbe..942bce9ae6 100644
 --- a/hw/net/lan9118.c
 +++ b/hw/net/lan9118.c
@@ -1354,7 +1354,8 @@ static void lan9118_realize(DeviceState *dev, Error **errp)
     qemu_macaddr_default_if_unset(&s->conf.macaddr);
     s->nic = qemu_new_nic(&net_lan9118_info, &s->conf,
 -                          object_get_typename(OBJECT(dev)), dev->id, s);
 +                          object_get_typename(OBJECT(dev)), dev->id,
 +                          &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
     s->eeprom[0] = 0xa5;
     for (i = 0; i < 6; i++) {
 diff --git a/hw/net/mcf_fec.c b/hw/net/mcf_fec.c
 index 25e3e453ab..a6be7bf413 100644
 --- a/hw/net/mcf_fec.c
 +++ b/hw/net/mcf_fec.c
@@ -643,7 +643,8 @@ static void mcf_fec_realize(DeviceState *dev, Error **errp)
     mcf_fec_state *s = MCF_FEC_NET(dev);
     s->nic = qemu_new_nic(&net_mcf_fec_info, &s->conf,
 -                          object_get_typename(OBJECT(dev)), dev->id, s);
 +                          object_get_typename(OBJECT(dev)), dev->id,
 +                          &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
 }
 diff --git a/hw/net/mipsnet.c b/hw/net/mipsnet.c
 index 2ade72dea0..8e925de867 100644
 --- a/hw/net/mipsnet.c
 +++ b/hw/net/mipsnet.c
@@ -255,7 +255,8 @@ static void mipsnet_realize(DeviceState *dev, Error **errp)
     sysbus_init_irq(sbd, &s->irq);
     s->nic = qemu_new_nic(&net_mipsnet_info, &s->conf,
 -                          object_get_typename(OBJECT(dev)), dev->id, s);
 +                          object_get_typename(OBJECT(dev)), dev->id,
 +                          &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
 }
 diff --git a/hw/net/msf2-emac.c b/hw/net/msf2-emac.c
 index 9278fdce0b..1efa3dbf01 100644
 --- a/hw/net/msf2-emac.c
 +++ b/hw/net/msf2-emac.c
@@ -527,7 +527,8 @@ static void msf2_emac_realize(DeviceState *dev, Error **errp)
     qemu_macaddr_default_if_unset(&s->conf.macaddr);
     s->nic = qemu_new_nic(&net_msf2_emac_info, &s->conf,
 -                          object_get_typename(OBJECT(dev)), dev->id, s);
 +                          object_get_typename(OBJECT(dev)), dev->id,
 +                          &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
 }
 diff --git a/hw/net/ne2000-isa.c b/hw/net/ne2000-isa.c
 index dd6f6e34d3..30bd20c293 100644
 --- a/hw/net/ne2000-isa.c
 +++ b/hw/net/ne2000-isa.c
@@ -74,7 +74,8 @@ static void isa_ne2000_realizefn(DeviceState *dev, Error **errp)
     ne2000_reset(s);
     s->nic = qemu_new_nic(&net_ne2000_isa_info, &s->c,
 -                          object_get_typename(OBJECT(dev)), dev->id, s);
 +                          object_get_typename(OBJECT(dev)), dev->id,
 +                          &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->c.macaddr.a);
 }
 diff --git a/hw/net/ne2000-pci.c b/hw/net/ne2000-pci.c
 index 9e5d10859a..4f8a699081 100644
 --- a/hw/net/ne2000-pci.c
 +++ b/hw/net/ne2000-pci.c
@@ -71,7 +71,8 @@ static void pci_ne2000_realize(PCIDevice *pci_dev, Error **errp)
     s->nic = qemu_new_nic(&net_ne2000_info, &s->c,
                           object_get_typename(OBJECT(pci_dev)),
 -                          pci_dev->qdev.id, s);
 +                          pci_dev->qdev.id,
 +                          &pci_dev->qdev.mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->c.macaddr.a);
 }
 diff --git a/hw/net/npcm7xx_emc.c b/hw/net/npcm7xx_emc.c
 index 7c892f820f..dd1d0ad3bc 100644
 --- a/hw/net/npcm7xx_emc.c
 +++ b/hw/net/npcm7xx_emc.c
@@ -802,7 +802,8 @@ static void npcm7xx_emc_realize(DeviceState *dev, Error **errp)
     qemu_macaddr_default_if_unset(&emc->conf.macaddr);
     emc->nic = qemu_new_nic(&net_npcm7xx_emc_info, &emc->conf,
 -                            object_get_typename(OBJECT(dev)), dev->id, emc);
 +                            object_get_typename(OBJECT(dev)), dev->id,
 +                            &dev->mem_reentrancy_guard, emc);
     qemu_format_nic_info_str(qemu_get_queue(emc->nic), emc->conf.macaddr.a);
 }
 diff --git a/hw/net/opencores_eth.c b/hw/net/opencores_eth.c
 index 0b3dc3146e..f96d6ea2cc 100644
 --- a/hw/net/opencores_eth.c
 +++ b/hw/net/opencores_eth.c
@@ -732,7 +732,8 @@ static void sysbus_open_eth_realize(DeviceState *dev, Error **errp)
     sysbus_init_irq(sbd, &s->irq);
     s->nic = qemu_new_nic(&net_open_eth_info, &s->conf,
 -                          object_get_typename(OBJECT(s)), dev->id, s);
 +                          object_get_typename(OBJECT(s)), dev->id,
 +                          &dev->mem_reentrancy_guard, s);
 }
 static void qdev_open_eth_reset(DeviceState *dev)
 diff --git a/hw/net/pcnet.c b/hw/net/pcnet.c
 index dcd3fc4948..da910a70bf 100644
 --- a/hw/net/pcnet.c
 +++ b/hw/net/pcnet.c
@@ -1718,7 +1718,8 @@ void pcnet_common_init(DeviceState *dev, PCNetState *s, NetClientInfo *info)
     s->poll_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, pcnet_poll_timer, s);
     qemu_macaddr_default_if_unset(&s->conf.macaddr);
 -    s->nic = qemu_new_nic(info, &s->conf, object_get_typename(OBJECT(dev)), dev->id, s);
 +    s->nic = qemu_new_nic(info, &s->conf, object_get_typename(OBJECT(dev)),
 +                          dev->id, &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
     /* Initialize the PROM */
 diff --git a/hw/net/rocker/rocker_fp.c b/hw/net/rocker/rocker_fp.c
 index cbeed65bd5..0d21948ada 100644
 --- a/hw/net/rocker/rocker_fp.c
 +++ b/hw/net/rocker/rocker_fp.c
@@ -241,8 +241,8 @@ FpPort *fp_port_alloc(Rocker *r, char *sw_name,
     port->conf.bootindex = -1;
     port->conf.peers = *peers;
 -    port->nic = qemu_new_nic(&fp_port_info, &port->conf,
 -                             sw_name, NULL, port);
 +    port->nic = qemu_new_nic(&fp_port_info, &port->conf, sw_name, NULL,
 +                             &DEVICE(r)->mem_reentrancy_guard, port);
     qemu_format_nic_info_str(qemu_get_queue(port->nic),
                              port->conf.macaddr.a);
 diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c
 index 3ffb9dd22c..a3565c7159 100644
 --- a/hw/net/rtl8139.c
 +++ b/hw/net/rtl8139.c
@@ -3400,7 +3400,8 @@ static void pci_rtl8139_realize(PCIDevice *dev, Error **errp)
     s->eeprom.contents[9] = s->conf.macaddr.a[4] | s->conf.macaddr.a[5] << 8;
     s->nic = qemu_new_nic(&net_rtl8139_info, &s->conf,
 -                          object_get_typename(OBJECT(dev)), d->id, s);
 +                          object_get_typename(OBJECT(dev)), d->id,
 +                          &d->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
     s->cplus_txbuffer = NULL;
 diff --git a/hw/net/smc91c111.c b/hw/net/smc91c111.c
 index ad778cd8fc..4eda971ef3 100644
 --- a/hw/net/smc91c111.c
 +++ b/hw/net/smc91c111.c
@@ -783,7 +783,8 @@ static void smc91c111_realize(DeviceState *dev, Error **errp)
     sysbus_init_irq(sbd, &s->irq);
     qemu_macaddr_default_if_unset(&s->conf.macaddr);
     s->nic = qemu_new_nic(&net_smc91c111_info, &s->conf,
 -                          object_get_typename(OBJECT(dev)), dev->id, s);
 +                          object_get_typename(OBJECT(dev)), dev->id,
 +                          &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
     /* ??? Save/restore.  */
 }
 diff --git a/hw/net/spapr_llan.c b/hw/net/spapr_llan.c
 index a6876a936d..475d5f3a34 100644
 --- a/hw/net/spapr_llan.c
 +++ b/hw/net/spapr_llan.c
@@ -325,7 +325,8 @@ static void spapr_vlan_realize(SpaprVioDevice *sdev, Error **errp)
     memcpy(&dev->perm_mac.a, &dev->nicconf.macaddr.a, sizeof(dev->perm_mac.a));
     dev->nic = qemu_new_nic(&net_spapr_vlan_info, &dev->nicconf,
 -                            object_get_typename(OBJECT(sdev)), sdev->qdev.id, dev);
 +                            object_get_typename(OBJECT(sdev)), sdev->qdev.id,
 +                            &sdev->qdev.mem_reentrancy_guard, dev);
     qemu_format_nic_info_str(qemu_get_queue(dev->nic), dev->nicconf.macaddr.a);
     dev->rxp_timer = timer_new_us(QEMU_CLOCK_VIRTUAL, spapr_vlan_flush_rx_queue,
 diff --git a/hw/net/stellaris_enet.c b/hw/net/stellaris_enet.c
 index 8dd60783d8..6768a6912f 100644
 --- a/hw/net/stellaris_enet.c
 +++ b/hw/net/stellaris_enet.c
@@ -492,7 +492,8 @@ static void stellaris_enet_realize(DeviceState *dev, Error **errp)
     qemu_macaddr_default_if_unset(&s->conf.macaddr);
     s->nic = qemu_new_nic(&net_stellaris_enet_info, &s->conf,
 -                          object_get_typename(OBJECT(dev)), dev->id, s);
 +                          object_get_typename(OBJECT(dev)), dev->id,
 +                          &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
 }
 diff --git a/hw/net/sungem.c b/hw/net/sungem.c
 index 3684a4d733..c12d44e9dc 100644
 --- a/hw/net/sungem.c
 +++ b/hw/net/sungem.c
@@ -1361,7 +1361,7 @@ static void sungem_realize(PCIDevice *pci_dev, Error **errp)
     qemu_macaddr_default_if_unset(&s->conf.macaddr);
     s->nic = qemu_new_nic(&net_sungem_info, &s->conf,
                           object_get_typename(OBJECT(dev)),
 -                          dev->id, s);
 +                          dev->id, &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic),
                              s->conf.macaddr.a);
 }
 diff --git a/hw/net/sunhme.c b/hw/net/sunhme.c
 index fc34905f87..fa98528d71 100644
 --- a/hw/net/sunhme.c
 +++ b/hw/net/sunhme.c
@@ -892,7 +892,8 @@ static void sunhme_realize(PCIDevice *pci_dev, Error **errp)
     qemu_macaddr_default_if_unset(&s->conf.macaddr);
     s->nic = qemu_new_nic(&net_sunhme_info, &s->conf,
 -                          object_get_typename(OBJECT(d)), d->id, s);
 +                          object_get_typename(OBJECT(d)), d->id,
 +                          &d->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
 }
 diff --git a/hw/net/tulip.c b/hw/net/tulip.c
 index ca69f7ea5e..985c4c14a4 100644
 --- a/hw/net/tulip.c
 +++ b/hw/net/tulip.c
@@ -981,7 +981,8 @@ static void pci_tulip_realize(PCIDevice *pci_dev, Error **errp)
     s->nic = qemu_new_nic(&net_tulip_info, &s->c,
                           object_get_typename(OBJECT(pci_dev)),
 -                          pci_dev->qdev.id, s);
 +                          pci_dev->qdev.id,
 +                          &pci_dev->qdev.mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->c.macaddr.a);
 }
 diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
 index ddaa8fa122..f5f07f8e63 100644
 --- a/hw/net/virtio-net.c
 +++ b/hw/net/virtio-net.c
@@ -3512,10 +3512,12 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
          * Happen when virtio_net_set_netclient_name has been called.
          */
         n->nic = qemu_new_nic(&net_virtio_info, &n->nic_conf,
 -                              n->netclient_type, n->netclient_name, n);
 +                              n->netclient_type, n->netclient_name,
 +                              &dev->mem_reentrancy_guard, n);
     } else {
         n->nic = qemu_new_nic(&net_virtio_info, &n->nic_conf,
 -                              object_get_typename(OBJECT(dev)), dev->id, n);
 +                              object_get_typename(OBJECT(dev)), dev->id,
 +                              &dev->mem_reentrancy_guard, n);
     }
     for (i = 0; i < n->max_queue_pairs; i++) {
 diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c
 index f65af4e9ef..d4df039c55 100644
 --- a/hw/net/vmxnet3.c
 +++ b/hw/net/vmxnet3.c
@@ -2078,7 +2078,7 @@ static void vmxnet3_net_init(VMXNET3State *s)
     s->nic = qemu_new_nic(&net_vmxnet3_info, &s->conf,
                           object_get_typename(OBJECT(s)),
 -                          d->id, s);
 +                          d->id, &d->mem_reentrancy_guard, s);
     s->peer_has_vhdr = vmxnet3_peer_has_vnet_hdr(s);
     s->tx_sop = true;
 diff --git a/hw/net/xen_nic.c b/hw/net/xen_nic.c
 index 5c815b4f0c..3d0b7820d3 100644
 --- a/hw/net/xen_nic.c
 +++ b/hw/net/xen_nic.c
@@ -294,7 +294,8 @@ static int net_init(struct XenLegacyDevice *xendev)
     }
     netdev->nic = qemu_new_nic(&net_xen_info, &netdev->conf,
 -                               "xen", NULL, netdev);
 +                               "xen", NULL,
 +                               &xendev->qdev.mem_reentrancy_guard, netdev);
     snprintf(qemu_get_queue(netdev->nic)->info_str,
              sizeof(qemu_get_queue(netdev->nic)->info_str),
 diff --git a/hw/net/xgmac.c b/hw/net/xgmac.c
 index 0ab6ae91aa..1f4f277d84 100644
 --- a/hw/net/xgmac.c
 +++ b/hw/net/xgmac.c
@@ -402,7 +402,8 @@ static void xgmac_enet_realize(DeviceState *dev, Error **errp)
     qemu_macaddr_default_if_unset(&s->conf.macaddr);
     s->nic = qemu_new_nic(&net_xgmac_enet_info, &s->conf,
 -                          object_get_typename(OBJECT(dev)), dev->id, s);
 +                          object_get_typename(OBJECT(dev)), dev->id,
 +                          &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
     s->regs[XGMAC_ADDR_HIGH(0)] = (s->conf.macaddr.a[5] << 8) |
 diff --git a/hw/net/xilinx_axienet.c b/hw/net/xilinx_axienet.c
 index 990ff3a1c2..8a34243803 100644
 --- a/hw/net/xilinx_axienet.c
 +++ b/hw/net/xilinx_axienet.c
@@ -968,7 +968,8 @@ static void xilinx_enet_realize(DeviceState *dev, Error **errp)
     qemu_macaddr_default_if_unset(&s->conf.macaddr);
     s->nic = qemu_new_nic(&net_xilinx_enet_info, &s->conf,
 -                          object_get_typename(OBJECT(dev)), dev->id, s);
 +                          object_get_typename(OBJECT(dev)), dev->id,
 +                          &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
     tdk_init(&s->TEMAC.phy);
 diff --git a/hw/net/xilinx_ethlite.c b/hw/net/xilinx_ethlite.c
 index 6e09f7e422..80cb869e22 100644
 --- a/hw/net/xilinx_ethlite.c
 +++ b/hw/net/xilinx_ethlite.c
@@ -235,7 +235,8 @@ static void xilinx_ethlite_realize(DeviceState *dev, Error **errp)
     qemu_macaddr_default_if_unset(&s->conf.macaddr);
     s->nic = qemu_new_nic(&net_xilinx_ethlite_info, &s->conf,
 -                          object_get_typename(OBJECT(dev)), dev->id, s);
 +                          object_get_typename(OBJECT(dev)), dev->id,
 +                          &dev->mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
 }
 diff --git a/hw/usb/dev-network.c b/hw/usb/dev-network.c
 index 6c49c16015..ae447a8bc3 100644
 --- a/hw/usb/dev-network.c
 +++ b/hw/usb/dev-network.c
@@ -1362,7 +1362,8 @@ static void usb_net_realize(USBDevice *dev, Error **errp)
     qemu_macaddr_default_if_unset(&s->conf.macaddr);
     s->nic = qemu_new_nic(&net_usbnet_info, &s->conf,
 -                          object_get_typename(OBJECT(s)), s->dev.qdev.id, s);
 +                          object_get_typename(OBJECT(s)), s->dev.qdev.id,
 +                          &s->dev.qdev.mem_reentrancy_guard, s);
     qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
     snprintf(s->usbstring_mac, sizeof(s->usbstring_mac),
              "%02x%02x%02x%02x%02x%02x",
 diff --git a/include/net/net.h b/include/net/net.h
 index 523136c7ac..1457b6c014 100644
 --- a/include/net/net.h
 +++ b/include/net/net.h
@@ -145,6 +145,7 @@ NICState *qemu_new_nic(NetClientInfo *info,
                        NICConf *conf,
                        const char *model,
                        const char *name,
 +                       MemReentrancyGuard *reentrancy_guard,
                        void *opaque);
 void qemu_del_nic(NICState *nic);
 NetClientState *qemu_get_subqueue(NICState *nic, int queue_index);
 diff --git a/net/net.c b/net/net.c
 index f0d14dbfc1..669e194c4b 100644
 --- a/net/net.c
 +++ b/net/net.c
@@ -299,6 +299,7 @@ NICState *qemu_new_nic(NetClientInfo *info,
                        NICConf *conf,
                        const char *model,
                        const char *name,
 +                       MemReentrancyGuard *reentrancy_guard,
                        void *opaque)
 {
     NetClientState **peers = conf->peers.ncs;
 -- 
 2.41.0
--- a/SOURCES/kvm-net-Update-MemReentrancyGuard-for-NIC.patch
+++ b/SOURCES/kvm-net-Update-MemReentrancyGuard-for-NIC.patch
@ -0,0 +1,105 @@
 From d58671091daf8c325a6f1cd87737d94b5fb51d12 Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Thu, 23 Nov 2023 11:30:46 -0500
 Subject: [PATCH 2/4] net: Update MemReentrancyGuard for NIC
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 331: net: Provide MemReentrancyGuard * to qemu_new_nic()
 RH-Jira: RHEL-7309
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Acked-by: Laurent Vivier <lvivier@redhat.com>
 RH-Acked-by: Jason Wang <jasowang@redhat.com>
 RH-Commit: [2/2] b116efe725dd838c2cab9bd2240112f3c6c46d6a (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 Jira: https://issues.redhat.com/browse/RHEL-7309
 CVE: CVE-2023-3019
 Upstream: Merged
 commit 9050f976e447444ea6ee2ba12c9f77e4b0dc54bc
 Author: Akihiko Odaki <akihiko.odaki@daynix.com>
 Date:   Thu Jun 1 12:18:59 2023 +0900
    net: Update MemReentrancyGuard for NIC
    Recently MemReentrancyGuard was added to DeviceState to record that the
    device is engaging in I/O. The network device backend needs to update it
    when delivering a packet to a device.
    This implementation follows what bottom half does, but it does not add
    a tracepoint for the case that the network device backend started
    delivering a packet to a device which is already engaging in I/O. This
    is because such reentrancy frequently happens for
    qemu_flush_queued_packets() and is insignificant.
    Fixes: CVE-2023-3019
    Reported-by: Alexander Bulekov <alxndr@bu.edu>
    Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
    Acked-by: Alexander Bulekov <alxndr@bu.edu>
    Signed-off-by: Jason Wang <jasowang@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 include/net/net.h |  1 +
 net/net.c         | 14 ++++++++++++++
 2 files changed, 15 insertions(+)
 diff --git a/include/net/net.h b/include/net/net.h
 index 1457b6c014..11d4564ea1 100644
 --- a/include/net/net.h
 +++ b/include/net/net.h
@@ -112,6 +112,7 @@ struct NetClientState {
 typedef struct NICState {
     NetClientState *ncs;
     NICConf *conf;
 +    MemReentrancyGuard *reentrancy_guard;
     void *opaque;
     bool peer_deleted;
 } NICState;
 diff --git a/net/net.c b/net/net.c
 index 669e194c4b..b3008a52b7 100644
 --- a/net/net.c
 +++ b/net/net.c
@@ -312,6 +312,7 @@ NICState *qemu_new_nic(NetClientInfo *info,
     nic = g_malloc0(info->size + sizeof(NetClientState) * queues);
     nic->ncs = (void *)nic + info->size;
     nic->conf = conf;
 +    nic->reentrancy_guard = reentrancy_guard,
     nic->opaque = opaque;
     for (i = 0; i < queues; i++) {
@@ -767,6 +768,7 @@ static ssize_t qemu_deliver_packet_iov(NetClientState *sender,
                                        int iovcnt,
                                        void *opaque)
 {
 +    MemReentrancyGuard *owned_reentrancy_guard;
     NetClientState *nc = opaque;
     int ret;
@@ -779,12 +781,24 @@ static ssize_t qemu_deliver_packet_iov(NetClientState *sender,
         return 0;
     }
 +    if (nc->info->type != NET_CLIENT_DRIVER_NIC ||
 +        qemu_get_nic(nc)->reentrancy_guard->engaged_in_io) {
 +        owned_reentrancy_guard = NULL;
 +    } else {
 +        owned_reentrancy_guard = qemu_get_nic(nc)->reentrancy_guard;
 +        owned_reentrancy_guard->engaged_in_io = true;
 +    }
 +
     if (nc->info->receive_iov && !(flags & QEMU_NET_PACKET_FLAG_RAW)) {
         ret = nc->info->receive_iov(nc, iov, iovcnt);
     } else {
         ret = nc_sendv_compat(nc, iov, iovcnt, flags);
     }
 +    if (owned_reentrancy_guard) {
 +        owned_reentrancy_guard->engaged_in_io = false;
 +    }
 +
     if (ret == 0) {
         nc->receive_disabled = 1;
     }
 -- 
 2.41.0
--- a/SOURCES/kvm-pc-bios-Add-support-for-List-Directed-IPL-from-ECKD-.patch
+++ b/SOURCES/kvm-pc-bios-Add-support-for-List-Directed-IPL-from-ECKD-.patch
@ -0,0 +1,376 @@
 From e11cffc152d9af9194139a37f86e357cb36298e8 Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
 Date: Thu, 25 May 2023 12:50:19 +0200
 Subject: [PATCH 22/22] pc-bios: Add support for List-Directed IPL from ECKD
 DASD
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Cédric Le Goater <clg@redhat.com>
 RH-MergeRequest: 279: Backport latest s390x-related fixes from upstream QEMU for qemu-kvm in RHEL 8.9
 RH-Bugzilla: 2169308 2209605
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: David Hildenbrand <david@redhat.com>
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Commit: [21/21] cab945af05566d892459a7c8ea3f114310d6bb67
 Bugzilla: https://bugzilla.redhat.com/2209605
 commit 8af5d141713f5d20c4bc1719eb746ef8b1746bd6
 Author: Jared Rossi <jrossi@linux.ibm.com>
 Date:   Tue Feb 21 12:45:48 2023 -0500
    pc-bios: Add support for List-Directed IPL from ECKD DASD
    Check for a List Directed IPL Boot Record, which would supersede the CCW type
    entries.  If the record is valid, proceed to use the new style pointers
    and perform LD-IPL. Each block pointer is interpreted as either an LD-IPL
    pointer or a legacy CCW pointer depending on the type of IPL initiated.
    In either case CCW- or LD-IPL is transparent to the user and will boot the same
    image regardless of which set of pointers is used. Because the interactive boot
    menu is only written with the old style pointers, the menu will be disabled for
    List Directed IPL from ECKD DASD.
    If the LD-IPL fails, retry the IPL using the CCW type pointers.
    If no LD-IPL boot record is found, simply perform CCW type IPL as usual.
    Signed-off-by: Jared Rossi <jrossi@linux.ibm.com>
    Message-Id: <20230221174548.1866861-2-jrossi@linux.ibm.com>
    [thuth: Drop some superfluous parentheses]
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Cédric Le Goater <clg@redhat.com>
 ---
 pc-bios/s390-ccw/bootmap.c | 157 ++++++++++++++++++++++++++++---------
 pc-bios/s390-ccw/bootmap.h |  30 ++++++-
 2 files changed, 148 insertions(+), 39 deletions(-)
 diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c
 index 994e59c0b0..a2137449dc 100644
 --- a/pc-bios/s390-ccw/bootmap.c
 +++ b/pc-bios/s390-ccw/bootmap.c
@@ -72,42 +72,74 @@ static inline void verify_boot_info(BootInfo *bip)
                "Bad block size in zIPL section of the 1st record.");
 }
 -static block_number_t eckd_block_num(EckdCHS *chs)
 +static void eckd_format_chs(ExtEckdBlockPtr *ptr,  bool ldipl,
 +                            uint64_t *c,
 +                            uint64_t *h,
 +                            uint64_t *s)
 +{
 +    if (ldipl) {
 +        *c = ptr->ldptr.chs.cylinder;
 +        *h = ptr->ldptr.chs.head;
 +        *s = ptr->ldptr.chs.sector;
 +    } else {
 +        *c = ptr->bptr.chs.cylinder;
 +        *h = ptr->bptr.chs.head;
 +        *s = ptr->bptr.chs.sector;
 +    }
 +}
 +
 +static block_number_t eckd_chs_to_block(uint64_t c, uint64_t h, uint64_t s)
 {
     const uint64_t sectors = virtio_get_sectors();
     const uint64_t heads = virtio_get_heads();
 -    const uint64_t cylinder = chs->cylinder
 -                            + ((chs->head & 0xfff0) << 12);
 -    const uint64_t head = chs->head & 0x000f;
 +    const uint64_t cylinder = c + ((h & 0xfff0) << 12);
 +    const uint64_t head = h & 0x000f;
     const block_number_t block = sectors * heads * cylinder
                                + sectors * head
 -                               + chs->sector
 -                               - 1; /* block nr starts with zero */
 +                               + s - 1; /* block nr starts with zero */
     return block;
 }
 -static bool eckd_valid_address(BootMapPointer *p)
 +static block_number_t eckd_block_num(EckdCHS *chs)
 {
 -    const uint64_t head = p->eckd.chs.head & 0x000f;
 +    return eckd_chs_to_block(chs->cylinder, chs->head, chs->sector);
 +}
 +
 +static block_number_t gen_eckd_block_num(ExtEckdBlockPtr *ptr, bool ldipl)
 +{
 +    uint64_t cyl, head, sec;
 +    eckd_format_chs(ptr, ldipl, &cyl, &head, &sec);
 +    return eckd_chs_to_block(cyl, head, sec);
 +}
 +static bool eckd_valid_chs(uint64_t cyl, uint64_t head, uint64_t sector)
 +{
     if (head >= virtio_get_heads()
 -        ||  p->eckd.chs.sector > virtio_get_sectors()
 -        ||  p->eckd.chs.sector <= 0) {
 +        || sector > virtio_get_sectors()
 +        || sector <= 0) {
         return false;
     }
     if (!virtio_guessed_disk_nature() &&
 -        eckd_block_num(&p->eckd.chs) >= virtio_get_blocks()) {
 +        eckd_chs_to_block(cyl, head, sector) >= virtio_get_blocks()) {
         return false;
     }
     return true;
 }
 -static block_number_t load_eckd_segments(block_number_t blk, uint64_t *address)
 +static bool eckd_valid_address(ExtEckdBlockPtr *ptr, bool ldipl)
 +{
 +    uint64_t cyl, head, sec;
 +    eckd_format_chs(ptr, ldipl, &cyl, &head, &sec);
 +    return eckd_valid_chs(cyl, head, sec);
 +}
 +
 +static block_number_t load_eckd_segments(block_number_t blk, bool ldipl,
 +                                         uint64_t *address)
 {
     block_number_t block_nr;
 -    int j, rc;
 +    int j, rc, count;
     BootMapPointer *bprs = (void *)_bprs;
     bool more_data;
@@ -117,7 +149,7 @@ static block_number_t load_eckd_segments(block_number_t blk, uint64_t *address)
     do {
         more_data = false;
         for (j = 0;; j++) {
 -            block_nr = eckd_block_num(&bprs[j].xeckd.bptr.chs);
 +            block_nr = gen_eckd_block_num(&bprs[j].xeckd, ldipl);
             if (is_null_block_number(block_nr)) { /* end of chunk */
                 break;
             }
@@ -129,11 +161,26 @@ static block_number_t load_eckd_segments(block_number_t blk, uint64_t *address)
                 break;
             }
 -            IPL_assert(block_size_ok(bprs[j].xeckd.bptr.size),
 +            /* List directed pointer does not store block size */
 +            IPL_assert(ldipl || block_size_ok(bprs[j].xeckd.bptr.size),
                        "bad chunk block size");
 -            IPL_assert(eckd_valid_address(&bprs[j]), "bad chunk ECKD addr");
 -            if ((bprs[j].xeckd.bptr.count == 0) && unused_space(&(bprs[j+1]),
 +            if (!eckd_valid_address(&bprs[j].xeckd, ldipl)) {
 +                /*
 +                 * If an invalid address is found during LD-IPL then break and
 +                 * retry as CCW
 +                 */
 +                IPL_assert(ldipl, "bad chunk ECKD addr");
 +                break;
 +            }
 +
 +            if (ldipl) {
 +                count = bprs[j].xeckd.ldptr.count;
 +            } else {
 +                count = bprs[j].xeckd.bptr.count;
 +            }
 +
 +            if (count == 0 && unused_space(&bprs[j + 1],
                 sizeof(EckdBlockPtr))) {
                 /* This is a "continue" pointer.
                  * This ptr should be the last one in the current
@@ -149,11 +196,10 @@ static block_number_t load_eckd_segments(block_number_t blk, uint64_t *address)
             /* Load (count+1) blocks of code at (block_nr)
              * to memory (address).
              */
 -            rc = virtio_read_many(block_nr, (void *)(*address),
 -                                  bprs[j].xeckd.bptr.count+1);
 +            rc = virtio_read_many(block_nr, (void *)(*address), count + 1);
             IPL_assert(rc == 0, "code chunk read failed");
 -            *address += (bprs[j].xeckd.bptr.count+1) * virtio_get_block_size();
 +            *address += (count + 1) * virtio_get_block_size();
         }
     } while (more_data);
     return block_nr;
@@ -237,8 +283,10 @@ static void run_eckd_boot_script(block_number_t bmt_block_nr,
     uint64_t address;
     BootMapTable *bmt = (void *)sec;
     BootMapScript *bms = (void *)sec;
 +    /* The S1B block number is NULL_BLOCK_NR if and only if it's an LD-IPL */
 +    bool ldipl = (s1b_block_nr == NULL_BLOCK_NR);
 -    if (menu_is_enabled_zipl()) {
 +    if (menu_is_enabled_zipl() && !ldipl) {
         loadparm = eckd_get_boot_menu_index(s1b_block_nr);
     }
@@ -249,7 +297,7 @@ static void run_eckd_boot_script(block_number_t bmt_block_nr,
     memset(sec, FREE_SPACE_FILLER, sizeof(sec));
     read_block(bmt_block_nr, sec, "Cannot read Boot Map Table");
 -    block_nr = eckd_block_num(&bmt->entry[loadparm].xeckd.bptr.chs);
 +    block_nr = gen_eckd_block_num(&bmt->entry[loadparm].xeckd, ldipl);
     IPL_assert(block_nr != -1, "Cannot find Boot Map Table Entry");
     memset(sec, FREE_SPACE_FILLER, sizeof(sec));
@@ -264,13 +312,18 @@ static void run_eckd_boot_script(block_number_t bmt_block_nr,
         }
         address = bms->entry[i].address.load_address;
 -        block_nr = eckd_block_num(&bms->entry[i].blkptr.xeckd.bptr.chs);
 +        block_nr = gen_eckd_block_num(&bms->entry[i].blkptr.xeckd, ldipl);
         do {
 -            block_nr = load_eckd_segments(block_nr, &address);
 +            block_nr = load_eckd_segments(block_nr, ldipl, &address);
         } while (block_nr != -1);
     }
 +    if (ldipl && bms->entry[i].type != BOOT_SCRIPT_EXEC) {
 +        /* Abort LD-IPL and retry as CCW-IPL */
 +        return;
 +    }
 +
     IPL_assert(bms->entry[i].type == BOOT_SCRIPT_EXEC,
                "Unknown script entry type");
     write_reset_psw(bms->entry[i].address.load_address); /* no return */
@@ -380,6 +433,23 @@ static void ipl_eckd_ldl(ECKD_IPL_mode_t mode)
     /* no return */
 }
 +static block_number_t eckd_find_bmt(ExtEckdBlockPtr *ptr)
 +{
 +    block_number_t blockno;
 +    uint8_t tmp_sec[MAX_SECTOR_SIZE];
 +    BootRecord *br;
 +
 +    blockno = gen_eckd_block_num(ptr, 0);
 +    read_block(blockno, tmp_sec, "Cannot read boot record");
 +    br = (BootRecord *)tmp_sec;
 +    if (!magic_match(br->magic, ZIPL_MAGIC)) {
 +        /* If the boot record is invalid, return and try CCW-IPL instead */
 +        return NULL_BLOCK_NR;
 +    }
 +
 +    return gen_eckd_block_num(&br->pgt.xeckd, 1);
 +}
 +
 static void print_eckd_msg(void)
 {
     char msg[] = "Using ECKD scheme (block size *****), ";
@@ -401,28 +471,43 @@ static void print_eckd_msg(void)
 static void ipl_eckd(void)
 {
 -    XEckdMbr *mbr = (void *)sec;
 -    LDL_VTOC *vlbl = (void *)sec;
 +    IplVolumeLabel *vlbl = (void *)sec;
 +    LDL_VTOC *vtoc = (void *)sec;
 +    block_number_t ldipl_bmt; /* Boot Map Table for List-Directed IPL */
     print_eckd_msg();
 -    /* Grab the MBR again */
 +    /* Block 2 can contain either the CDL VOL1 label or the LDL VTOC */
     memset(sec, FREE_SPACE_FILLER, sizeof(sec));
 -    read_block(0, mbr, "Cannot read block 0 on DASD");
 +    read_block(2, vlbl, "Cannot read block 2");
 -    if (magic_match(mbr->magic, IPL1_MAGIC)) {
 -        ipl_eckd_cdl();         /* only returns in case of error */
 -        return;
 +    /*
 +     * First check for a list-directed-format pointer which would
 +     * supersede the CCW pointer.
 +     */
 +    if (eckd_valid_address((ExtEckdBlockPtr *)&vlbl->f.br, 0)) {
 +        ldipl_bmt = eckd_find_bmt((ExtEckdBlockPtr *)&vlbl->f.br);
 +        if (ldipl_bmt) {
 +            sclp_print("List-Directed\n");
 +            /* LD-IPL does not use the S1B bock, just make it NULL */
 +            run_eckd_boot_script(ldipl_bmt, NULL_BLOCK_NR);
 +            /* Only return in error, retry as CCW-IPL */
 +            sclp_print("Retrying IPL ");
 +            print_eckd_msg();
 +        }
 +        memset(sec, FREE_SPACE_FILLER, sizeof(sec));
 +        read_block(2, vtoc, "Cannot read block 2");
     }
 -    /* LDL/CMS? */
 -    memset(sec, FREE_SPACE_FILLER, sizeof(sec));
 -    read_block(2, vlbl, "Cannot read block 2");
 +    /* Not list-directed */
 +    if (magic_match(vtoc->magic, VOL1_MAGIC)) {
 +        ipl_eckd_cdl(); /* may return in error */
 +    }
 -    if (magic_match(vlbl->magic, CMS1_MAGIC)) {
 +    if (magic_match(vtoc->magic, CMS1_MAGIC)) {
         ipl_eckd_ldl(ECKD_CMS); /* no return */
     }
 -    if (magic_match(vlbl->magic, LNX1_MAGIC)) {
 +    if (magic_match(vtoc->magic, LNX1_MAGIC)) {
         ipl_eckd_ldl(ECKD_LDL); /* no return */
     }
 diff --git a/pc-bios/s390-ccw/bootmap.h b/pc-bios/s390-ccw/bootmap.h
 index 3946aa3f8d..d4690a88c2 100644
 --- a/pc-bios/s390-ccw/bootmap.h
 +++ b/pc-bios/s390-ccw/bootmap.h
@@ -45,9 +45,23 @@ typedef struct EckdBlockPtr {
                     * it's 0 for TablePtr, ScriptPtr, and SectionPtr */
 } __attribute__ ((packed)) EckdBlockPtr;
 -typedef struct ExtEckdBlockPtr {
 +typedef struct LdEckdCHS {
 +    uint32_t cylinder;
 +    uint8_t head;
 +    uint8_t sector;
 +} __attribute__ ((packed)) LdEckdCHS;
 +
 +typedef struct LdEckdBlockPtr {
 +    LdEckdCHS chs; /* cylinder/head/sector is an address of the block */
 +    uint8_t reserved[4];
 +    uint16_t count;
 +    uint32_t pad;
 +} __attribute__ ((packed)) LdEckdBlockPtr;
 +
 +/* bptr is used for CCW type IPL, while ldptr is for list-directed IPL */
 +typedef union ExtEckdBlockPtr {
     EckdBlockPtr bptr;
 -    uint8_t reserved[8];
 +    LdEckdBlockPtr ldptr;
 } __attribute__ ((packed)) ExtEckdBlockPtr;
 typedef union BootMapPointer {
@@ -57,6 +71,15 @@ typedef union BootMapPointer {
     ExtEckdBlockPtr xeckd;
 } __attribute__ ((packed)) BootMapPointer;
 +typedef struct BootRecord {
 +    uint8_t magic[4];
 +    uint32_t version;
 +    uint64_t res1;
 +    BootMapPointer pgt;
 +    uint8_t reserved[510 - 32];
 +    uint16_t os_id;
 +} __attribute__ ((packed)) BootRecord;
 +
 /* aka Program Table */
 typedef struct BootMapTable {
     uint8_t magic[4];
@@ -292,7 +315,8 @@ typedef struct IplVolumeLabel {
         struct {
             unsigned char key[4]; /* == "VOL1" */
             unsigned char volser[6];
 -            unsigned char reserved[6];
 +            unsigned char reserved[64];
 +            EckdCHS br; /* Location of Boot Record for list-directed IPL */
         } f;
     };
 } __attribute__((packed)) IplVolumeLabel;
 -- 
 2.37.3
--- a/SOURCES/kvm-physmem-add-missing-memory-barrier.patch
+++ b/SOURCES/kvm-physmem-add-missing-memory-barrier.patch
@ -0,0 +1,55 @@
 From 01c09f31978154f0d2fd699621ae958a8c3ea2a5 Mon Sep 17 00:00:00 2001
 From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 Date: Thu, 9 Mar 2023 08:15:24 -0500
 Subject: [PATCH 08/13] physmem: add missing memory barrier
 RH-Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-MergeRequest: 263: qatomic: add smp_mb__before/after_rmw()
 RH-Bugzilla: 2168472
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Acked-by: Eric Auger <eric.auger@redhat.com>
 RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
 RH-Acked-by: David Hildenbrand <david@redhat.com>
 RH-Commit: [8/10] f6a9659f7cf40b78de6e85e4a7c06842273aa770
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2168472
 commit 33828ca11da08436e1b32f3e79dabce3061a0427
 Author: Paolo Bonzini <pbonzini@redhat.com>
 Date:   Fri Mar 3 14:36:32 2023 +0100
    physmem: add missing memory barrier
    Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
    Reviewed-by: David Hildenbrand <david@redhat.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 ---
 softmmu/physmem.c | 3 +++
 1 file changed, 3 insertions(+)
 diff --git a/softmmu/physmem.c b/softmmu/physmem.c
 index 4d0ef5f92f..2b96fad302 100644
 --- a/softmmu/physmem.c
 +++ b/softmmu/physmem.c
@@ -3087,6 +3087,8 @@ void cpu_register_map_client(QEMUBH *bh)
     qemu_mutex_lock(&map_client_list_lock);
     client->bh = bh;
     QLIST_INSERT_HEAD(&map_client_list, client, link);
 +    /* Write map_client_list before reading in_use.  */
 +    smp_mb();
     if (!qatomic_read(&bounce.in_use)) {
         cpu_notify_map_clients_locked();
     }
@@ -3279,6 +3281,7 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
     qemu_vfree(bounce.buffer);
     bounce.buffer = NULL;
     memory_region_unref(bounce.mr);
 +    /* Clear in_use before reading map_client_list.  */
     qatomic_mb_set(&bounce.in_use, false);
     cpu_notify_map_clients();
 }
 -- 
 2.37.3
--- a/SOURCES/kvm-qapi-i386-sev-Change-the-reduced-phys-bits-value-fro.patch
+++ b/SOURCES/kvm-qapi-i386-sev-Change-the-reduced-phys-bits-value-fro.patch
@ -0,0 +1,55 @@
 From 57ee29fbb08f7b89ee1b7c75b749392c08af3b03 Mon Sep 17 00:00:00 2001
 From: Bandan Das <bsd@redhat.com>
 Date: Thu, 3 Aug 2023 15:23:54 -0400
 Subject: [PATCH 1/5] qapi, i386/sev: Change the reduced-phys-bits value from 5
 to 1
 RH-Author: Bandan Das <None>
 RH-MergeRequest: 296: Updates to SEV reduced-phys-bits parameter
 RH-Bugzilla: 2214840
 RH-Acked-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
 RH-Commit: [1/4] 4137cb3b57cbb175078bc908fb2301ea2b97fd17
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2214840
 commit 798a818f50a9bfc01e8b5943090de458863b897b
 Author: Tom Lendacky <thomas.lendacky@amd.com>
 Date:   Fri Sep 30 10:14:27 2022 -0500
    qapi, i386/sev: Change the reduced-phys-bits value from 5 to 1
    A guest only ever experiences, at most, 1 bit of reduced physical
    addressing. Change the query-sev-capabilities json comment to use 1.
    Fixes: 31dd67f684 ("sev/i386: qmp: add query-sev-capabilities command")
    Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
    Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
    Message-Id: <cb96d8e09154533af4b4e6988469bc0b32390b65.1664550870.git.thomas.lendacky@amd.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 RHEL Notes:
     Conflicts: Context differences, since commit 811b4ec7f8eb
     ("qapi, target/i386/sev: Add cpu0-id to query-sev-capabilities") is missing
 Signed-off-by: Bandan Das <bsd@redhat.com>
 ---
 qapi/misc-target.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 diff --git a/qapi/misc-target.json b/qapi/misc-target.json
 index 4bc45d2474..ede9052440 100644
 --- a/qapi/misc-target.json
 +++ b/qapi/misc-target.json
@@ -205,7 +205,7 @@
 #
 # -> { "execute": "query-sev-capabilities" }
 # <- { "return": { "pdh": "8CCDD8DDD", "cert-chain": "888CCCDDDEE",
 -#                  "cbitpos": 47, "reduced-phys-bits": 5}}
 +#                  "cbitpos": 47, "reduced-phys-bits": 1}}
 #
 ##
 { 'command': 'query-sev-capabilities', 'returns': 'SevCapability',
 -- 
 2.37.3
--- a/SOURCES/kvm-qatomic-add-smp_mb__before-after_rmw.patch
+++ b/SOURCES/kvm-qatomic-add-smp_mb__before-after_rmw.patch
@ -0,0 +1,177 @@
 From e7d0e29d1962092af58d0445439671a6e1d91f71 Mon Sep 17 00:00:00 2001
 From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 Date: Thu, 9 Mar 2023 08:10:33 -0500
 Subject: [PATCH 02/13] qatomic: add smp_mb__before/after_rmw()
 RH-Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-MergeRequest: 263: qatomic: add smp_mb__before/after_rmw()
 RH-Bugzilla: 2168472
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Acked-by: Eric Auger <eric.auger@redhat.com>
 RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
 RH-Acked-by: David Hildenbrand <david@redhat.com>
 RH-Commit: [2/10] 1f87eb3157abcf23f020881cedce42f76497f348
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2168472
 commit ff00bed1897c3d27adc5b0cec6f6eeb5a7d13176
 Author: Paolo Bonzini <pbonzini@redhat.com>
 Date:   Thu Mar 2 11:10:56 2023 +0100
    qatomic: add smp_mb__before/after_rmw()
    On ARM, seqcst loads and stores (which QEMU does not use) are compiled
    respectively as LDAR and STLR instructions.  Even though LDAR is
    also used for load-acquire operations, it also waits for all STLRs to
    leave the store buffer.  Thus, LDAR and STLR alone are load-acquire
    and store-release operations, but LDAR also provides store-against-load
    ordering as long as the previous store is a STLR.
    Compare this to ARMv7, where store-release is DMB+STR and load-acquire
    is LDR+DMB, but an additional DMB is needed between store-seqcst and
    load-seqcst (e.g. DMB+STR+DMB+LDR+DMB); or with x86, where MOV provides
    load-acquire and store-release semantics and the two can be reordered.
    Likewise, on ARM sequentially consistent read-modify-write operations only
    need to use LDAXR and STLXR respectively for the load and the store, while
    on x86 they need to use the stronger LOCK prefix.
    In a strange twist of events, however, the _stronger_ semantics
    of the ARM instructions can end up causing bugs on ARM, not on x86.
    The problems occur when seqcst atomics are mixed with relaxed atomics.
    QEMU's atomics try to bridge the Linux API (that most of the developers
    are familiar with) and the C11 API, and the two have a substantial
    difference:
    - in Linux, strongly-ordered atomics such as atomic_add_return() affect
      the global ordering of _all_ memory operations, including for example
      READ_ONCE()/WRITE_ONCE()
    - in C11, sequentially consistent atomics (except for seq-cst fences)
      only affect the ordering of sequentially consistent operations.
      In particular, since relaxed loads are done with LDR on ARM, they are
      not ordered against seqcst stores (which are done with STLR).
    QEMU implements high-level synchronization primitives with the idea that
    the primitives contain the necessary memory barriers, and the callers can
    use relaxed atomics (qatomic_read/qatomic_set) or even regular accesses.
    This is very much incompatible with the C11 view that seqcst accesses
    are only ordered against other seqcst accesses, and requires using seqcst
    fences as in the following example:
       qatomic_set(&y, 1);            qatomic_set(&x, 1);
       smp_mb();                      smp_mb();
       ... qatomic_read(&x) ...       ... qatomic_read(&y) ...
    When a qatomic_*() read-modify-write operation is used instead of one
    or both stores, developers that are more familiar with the Linux API may
    be tempted to omit the smp_mb(), which will work on x86 but not on ARM.
    This nasty difference between Linux and C11 read-modify-write operations
    has already caused issues in util/async.c and more are being found.
    Provide something similar to Linux smp_mb__before/after_atomic(); this
    has the double function of documenting clearly why there is a memory
    barrier, and avoiding a double barrier on x86 and s390x systems.
    The new macro can already be put to use in qatomic_mb_set().
    Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
    Reviewed-by: David Hildenbrand <david@redhat.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 ---
 docs/devel/atomics.rst | 26 +++++++++++++++++++++-----
 include/qemu/atomic.h  | 17 ++++++++++++++++-
 2 files changed, 37 insertions(+), 6 deletions(-)
 diff --git a/docs/devel/atomics.rst b/docs/devel/atomics.rst
 index 52baa0736d..10fbfc58bb 100644
 --- a/docs/devel/atomics.rst
 +++ b/docs/devel/atomics.rst
@@ -25,7 +25,8 @@ provides macros that fall in three camps:
 - weak atomic access and manual memory barriers: ``qatomic_read()``,
   ``qatomic_set()``, ``smp_rmb()``, ``smp_wmb()``, ``smp_mb()``,
 -  ``smp_mb_acquire()``, ``smp_mb_release()``, ``smp_read_barrier_depends()``;
 +  ``smp_mb_acquire()``, ``smp_mb_release()``, ``smp_read_barrier_depends()``,
 +  ``smp_mb__before_rmw()``, ``smp_mb__after_rmw()``;
 - sequentially consistent atomic access: everything else.
@@ -470,7 +471,7 @@ and memory barriers, and the equivalents in QEMU:
   sequential consistency.
 - in QEMU, ``qatomic_read()`` and ``qatomic_set()`` do not participate in
 -  the total ordering enforced by sequentially-consistent operations.
 +  the ordering enforced by read-modify-write operations.
   This is because QEMU uses the C11 memory model.  The following example
   is correct in Linux but not in QEMU:
@@ -486,9 +487,24 @@ and memory barriers, and the equivalents in QEMU:
   because the read of ``y`` can be moved (by either the processor or the
   compiler) before the write of ``x``.
 -  Fixing this requires an ``smp_mb()`` memory barrier between the write
 -  of ``x`` and the read of ``y``.  In the common case where only one thread
 -  writes ``x``, it is also possible to write it like this:
 +  Fixing this requires a full memory barrier between the write of ``x`` and
 +  the read of ``y``.  QEMU provides ``smp_mb__before_rmw()`` and
 +  ``smp_mb__after_rmw()``; they act both as an optimization,
 +  avoiding the memory barrier on processors where it is unnecessary,
 +  and as a clarification of this corner case of the C11 memory model:
 +
 +      +--------------------------------+
 +      | QEMU (correct)                 |
 +      +================================+
 +      | ::                             |
 +      |                                |
 +      |   a = qatomic_fetch_add(&x, 2);|
 +      |   smp_mb__after_rmw();         |
 +      |   b = qatomic_read(&y);        |
 +      +--------------------------------+
 +
 +  In the common case where only one thread writes ``x``, it is also possible
 +  to write it like this:
       +--------------------------------+
       | QEMU (correct)                 |
 diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
 index 112a29910b..7855443cab 100644
 --- a/include/qemu/atomic.h
 +++ b/include/qemu/atomic.h
@@ -243,6 +243,20 @@
 #define smp_wmb()   smp_mb_release()
 #define smp_rmb()   smp_mb_acquire()
 +/*
 + * SEQ_CST is weaker than the older __sync_* builtins and Linux
 + * kernel read-modify-write atomics.  Provide a macro to obtain
 + * the same semantics.
 + */
 +#if !defined(QEMU_SANITIZE_THREAD) && \
 +    (defined(__i386__) || defined(__x86_64__) || defined(__s390x__))
 +# define smp_mb__before_rmw() signal_barrier()
 +# define smp_mb__after_rmw() signal_barrier()
 +#else
 +# define smp_mb__before_rmw() smp_mb()
 +# define smp_mb__after_rmw() smp_mb()
 +#endif
 +
 /* qatomic_mb_read/set semantics map Java volatile variables. They are
  * less expensive on some platforms (notably POWER) than fully
  * sequentially consistent operations.
@@ -257,7 +271,8 @@
 #if !defined(__SANITIZE_THREAD__) && \
     (defined(__i386__) || defined(__x86_64__) || defined(__s390x__))
 /* This is more efficient than a store plus a fence.  */
 -# define qatomic_mb_set(ptr, i)  ((void)qatomic_xchg(ptr, i))
 +# define qatomic_mb_set(ptr, i) \
 +    ({ (void)qatomic_xchg(ptr, i); smp_mb__after_rmw(); })
 #else
 # define qatomic_mb_set(ptr, i) \
    ({ qatomic_store_release(ptr, i); smp_mb(); })
 -- 
 2.37.3
--- a/SOURCES/kvm-qcow2-Don-t-open-data_file-with-BDRV_O_NO_IO.patch
+++ b/SOURCES/kvm-qcow2-Don-t-open-data_file-with-BDRV_O_NO_IO.patch
@ -0,0 +1,209 @@
 From 5cdbc87ab24a8cc4cf926158ec429d43d8a45f15 Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Wed, 5 Jun 2024 19:56:51 -0400
 Subject: [PATCH 1/5] qcow2: Don't open data_file with BDRV_O_NO_IO
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 5: EMBARGOED CVE-2024-4467 for rhel-8.10.z (PRDSC)
 RH-Jira: RHEL-35616
 RH-CVE: CVE-2024-4467
 RH-Acked-by: Kevin Wolf <kwolf@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Commit: [1/5] 2e72d21c14d86645cf68eec78f49d5cc5d77581f
 Conflicts: qcow2_do_open(): missing boolean 'open_data_file'.
           We assume it to be true.
 commit f9843ce5c519901654a7d8ba43ee95ce25ca13c2
 Author: Kevin Wolf <kwolf@redhat.com>
 Date:   Thu Apr 11 15:06:01 2024 +0200
    qcow2: Don't open data_file with BDRV_O_NO_IO
    One use case for 'qemu-img info' is verifying that untrusted images
    don't reference an unwanted external file, be it as a backing file or an
    external data file. To make sure that calling 'qemu-img info' can't
    already have undesired side effects with a malicious image, just don't
    open the data file at all with BDRV_O_NO_IO. If nothing ever tries to do
    I/O, we don't need to have it open.
    This changes the output of iotests case 061, which used 'qemu-img info'
    to show that opening an image with an invalid data file fails. After
    this patch, it succeeds. Replace this part of the test with a qemu-io
    call, but keep the final 'qemu-img info' to show that the invalid data
    file is correctly displayed in the output.
    Signed-off-by: Kevin Wolf <kwolf@redhat.com>
    Reviewed-by: Eric Blake <eblake@redhat.com>
    Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
    Reviewed-by: Hanna Czenczek <hreitz@redhat.com>
    Upstream: N/A, embargoed
    Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 block/qcow2.c              | 87 +++++++++++++++++++++++---------------
 tests/qemu-iotests/061     |  6 ++-
 tests/qemu-iotests/061.out |  8 +++-
 3 files changed, 62 insertions(+), 39 deletions(-)
 diff --git a/block/qcow2.c b/block/qcow2.c
 index d509016756..6ee1919612 100644
 --- a/block/qcow2.c
 +++ b/block/qcow2.c
@@ -1613,50 +1613,67 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
         goto fail;
     }
 -    /* Open external data file */
 -    s->data_file = bdrv_open_child(NULL, options, "data-file", bs,
 -                                   &child_of_bds, BDRV_CHILD_DATA,
 -                                   true, errp);
 -    if (*errp) {
 -        ret = -EINVAL;
 -        goto fail;
 -    }
 +    if (flags & BDRV_O_NO_IO) {
 +        /*
 +         * Don't open the data file for 'qemu-img info' so that it can be used
 +         * to verify that an untrusted qcow2 image doesn't refer to external
 +         * files.
 +         *
 +         * Note: This still makes has_data_file() return true.
 +         */
 +        if (s->incompatible_features & QCOW2_INCOMPAT_DATA_FILE) {
 +            s->data_file = NULL;
 +        } else {
 +            s->data_file = bs->file;
 +        }
 +        qdict_extract_subqdict(options, NULL, "data-file.");
 +        qdict_del(options, "data-file");
 +    } else {
 +        /* Open external data file */
 +        s->data_file = bdrv_open_child(NULL, options, "data-file", bs,
 +                                       &child_of_bds, BDRV_CHILD_DATA,
 +                                       true, errp);
 +        if (*errp) {
 +            ret = -EINVAL;
 +            goto fail;
 +        }
 -    if (s->incompatible_features & QCOW2_INCOMPAT_DATA_FILE) {
 -        if (!s->data_file && s->image_data_file) {
 -            s->data_file = bdrv_open_child(s->image_data_file, options,
 -                                           "data-file", bs, &child_of_bds,
 -                                           BDRV_CHILD_DATA, false, errp);
 +        if (s->incompatible_features & QCOW2_INCOMPAT_DATA_FILE) {
 +            if (!s->data_file && s->image_data_file) {
 +                s->data_file = bdrv_open_child(s->image_data_file, options,
 +                                               "data-file", bs, &child_of_bds,
 +                                               BDRV_CHILD_DATA, false, errp);
 +                if (!s->data_file) {
 +                    ret = -EINVAL;
 +                    goto fail;
 +                }
 +            }
             if (!s->data_file) {
 +                error_setg(errp, "'data-file' is required for this image");
                 ret = -EINVAL;
                 goto fail;
             }
 -        }
 -        if (!s->data_file) {
 -            error_setg(errp, "'data-file' is required for this image");
 -            ret = -EINVAL;
 -            goto fail;
 -        }
 -        /* No data here */
 -        bs->file->role &= ~BDRV_CHILD_DATA;
 +            /* No data here */
 +            bs->file->role &= ~BDRV_CHILD_DATA;
 -        /* Must succeed because we have given up permissions if anything */
 -        bdrv_child_refresh_perms(bs, bs->file, &error_abort);
 -    } else {
 -        if (s->data_file) {
 -            error_setg(errp, "'data-file' can only be set for images with an "
 -                             "external data file");
 -            ret = -EINVAL;
 -            goto fail;
 -        }
 +            /* Must succeed because we have given up permissions if anything */
 +            bdrv_child_refresh_perms(bs, bs->file, &error_abort);
 +        } else {
 +            if (s->data_file) {
 +                error_setg(errp, "'data-file' can only be set for images with an "
 +                           "external data file");
 +                ret = -EINVAL;
 +                goto fail;
 +            }
 -        s->data_file = bs->file;
 +            s->data_file = bs->file;
 -        if (data_file_is_raw(bs)) {
 -            error_setg(errp, "data-file-raw requires a data file");
 -            ret = -EINVAL;
 -            goto fail;
 +            if (data_file_is_raw(bs)) {
 +                error_setg(errp, "data-file-raw requires a data file");
 +                ret = -EINVAL;
 +                goto fail;
 +            }
         }
     }
 diff --git a/tests/qemu-iotests/061 b/tests/qemu-iotests/061
 index 9507c223bd..6a5bd47efc 100755
 --- a/tests/qemu-iotests/061
 +++ b/tests/qemu-iotests/061
@@ -322,12 +322,14 @@ $QEMU_IMG amend -o "data_file=foo" "$TEST_IMG"
 echo
 _make_test_img -o "compat=1.1,data_file=$TEST_IMG.data" 64M
 $QEMU_IMG amend -o "data_file=foo" "$TEST_IMG"
 -_img_info --format-specific
 +$QEMU_IO -c "read 0 4k" "$TEST_IMG" 2>&1 | _filter_testdir | _filter_imgfmt
 +$QEMU_IO -c "open -o data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" -c "read 0 4k" | _filter_qemu_io
 TEST_IMG="data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" _img_info --format-specific --image-opts
 echo
 $QEMU_IMG amend -o "data_file=" --image-opts "data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG"
 -_img_info --format-specific
 +$QEMU_IO -c "read 0 4k" "$TEST_IMG" 2>&1 | _filter_testdir | _filter_imgfmt
 +$QEMU_IO -c "open -o data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" -c "read 0 4k" | _filter_qemu_io
 TEST_IMG="data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" _img_info --format-specific --image-opts
 echo
 diff --git a/tests/qemu-iotests/061.out b/tests/qemu-iotests/061.out
 index 7ecbd4dea8..99b2307a23 100644
 --- a/tests/qemu-iotests/061.out
 +++ b/tests/qemu-iotests/061.out
@@ -545,7 +545,9 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
 qemu-img: data-file can only be set for images that use an external data file
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 data_file=TEST_DIR/t.IMGFMT.data
 -qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Could not open 'foo': No such file or directory
 +qemu-io: can't open device TEST_DIR/t.IMGFMT: Could not open 'foo': No such file or directory
 +read 4096/4096 bytes at offset 0
 +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 image: TEST_DIR/t.IMGFMT
 file format: IMGFMT
 virtual size: 64 MiB (67108864 bytes)
@@ -560,7 +562,9 @@ Format specific information:
     corrupt: false
     extended l2: false
 -qemu-img: Could not open 'TEST_DIR/t.IMGFMT': 'data-file' is required for this image
 +qemu-io: can't open device TEST_DIR/t.IMGFMT: 'data-file' is required for this image
 +read 4096/4096 bytes at offset 0
 +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 image: TEST_DIR/t.IMGFMT
 file format: IMGFMT
 virtual size: 64 MiB (67108864 bytes)
 -- 
 2.39.3
--- a/SOURCES/kvm-qcow2-Fix-theoretical-corruption-in-store_bitmap-err.patch
+++ b/SOURCES/kvm-qcow2-Fix-theoretical-corruption-in-store_bitmap-err.patch
@ -0,0 +1,67 @@
 From 06c73c4b57dd1f47f819d719a63eb39fbe799304 Mon Sep 17 00:00:00 2001
 From: Kevin Wolf <kwolf@redhat.com>
 Date: Thu, 12 Jan 2023 20:14:51 +0100
 Subject: [PATCH 1/4] qcow2: Fix theoretical corruption in store_bitmap() error
 path
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Kevin Wolf <kwolf@redhat.com>
 RH-MergeRequest: 251: qemu-img: Fix exit code for errors closing the image
 RH-Bugzilla: 2147617
 RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Acked-by: Stefano Garzarella <sgarzare@redhat.com>
 RH-Commit: [1/4] d0a26bed7b16db41e7baee1f8f2b3ae54e52dd52
 In order to write the bitmap table to the image file, it is converted to
 big endian. If the write fails, it is passed to clear_bitmap_table() to
 free all of the clusters it had allocated before. However, if we don't
 convert it back to native endianness first, we'll free things at a wrong
 offset.
 In practical terms, the offsets will be so high that we won't actually
 free any allocated clusters, but just run into an error, but in theory
 this can cause image corruption.
 Cc: qemu-stable@nongnu.org
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 Message-Id: <20230112191454.169353-2-kwolf@redhat.com>
 Reviewed-by: Hanna Czenczek <hreitz@redhat.com>
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 (cherry picked from commit b03dd9613bcf8fe948581b2b3585510cb525c382)
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
 block/qcow2-bitmap.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
 diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c
 index 8fb4731551..869069415c 100644
 --- a/block/qcow2-bitmap.c
 +++ b/block/qcow2-bitmap.c
@@ -115,7 +115,7 @@ static int update_header_sync(BlockDriverState *bs)
     return bdrv_flush(bs->file->bs);
 }
 -static inline void bitmap_table_to_be(uint64_t *bitmap_table, size_t size)
 +static inline void bitmap_table_bswap_be(uint64_t *bitmap_table, size_t size)
 {
     size_t i;
@@ -1401,9 +1401,10 @@ static int store_bitmap(BlockDriverState *bs, Qcow2Bitmap *bm, Error **errp)
         goto fail;
     }
 -    bitmap_table_to_be(tb, tb_size);
 +    bitmap_table_bswap_be(tb, tb_size);
     ret = bdrv_pwrite(bs->file, tb_offset, tb, tb_size * sizeof(tb[0]));
     if (ret < 0) {
 +        bitmap_table_bswap_be(tb, tb_size);
         error_setg_errno(errp, -ret, "Failed to write bitmap '%s' to file",
                          bm_name);
         goto fail;
 -- 
 2.37.3
--- a/SOURCES/kvm-qemu-coroutine-lock-add-smp_mb__after_rmw.patch
+++ b/SOURCES/kvm-qemu-coroutine-lock-add-smp_mb__after_rmw.patch
@ -0,0 +1,75 @@
 From 2f03293910f3ac559f37d45c95325ae29638003a Mon Sep 17 00:00:00 2001
 From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 Date: Thu, 9 Mar 2023 08:15:14 -0500
 Subject: [PATCH 07/13] qemu-coroutine-lock: add smp_mb__after_rmw()
 RH-Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-MergeRequest: 263: qatomic: add smp_mb__before/after_rmw()
 RH-Bugzilla: 2168472
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Acked-by: Eric Auger <eric.auger@redhat.com>
 RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
 RH-Acked-by: David Hildenbrand <david@redhat.com>
 RH-Commit: [7/10] 9cf1b6d3b0dd154489e75ad54a3000ea58983960
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2168472
 commit e3a3b6ec8169eab2feb241b4982585001512cd55
 Author: Paolo Bonzini <pbonzini@redhat.com>
 Date:   Fri Mar 3 10:52:59 2023 +0100
    qemu-coroutine-lock: add smp_mb__after_rmw()
    mutex->from_push and mutex->handoff in qemu-coroutine-lock implement
    the familiar pattern:
       write a                                  write b
       smp_mb()                                 smp_mb()
       read b                                   read a
    The memory barrier is required by the C memory model even after a
    SEQ_CST read-modify-write operation such as QSLIST_INSERT_HEAD_ATOMIC.
    Add it and avoid the unclear qatomic_mb_read() operation.
    Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
    Reviewed-by: David Hildenbrand <david@redhat.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 ---
 util/qemu-coroutine-lock.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)
 diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
 index 2669403839..a03ed0e664 100644
 --- a/util/qemu-coroutine-lock.c
 +++ b/util/qemu-coroutine-lock.c
@@ -206,10 +206,16 @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx,
     trace_qemu_co_mutex_lock_entry(mutex, self);
     push_waiter(mutex, &w);
 +    /*
 +     * Add waiter before reading mutex->handoff.  Pairs with qatomic_mb_set
 +     * in qemu_co_mutex_unlock.
 +     */
 +    smp_mb__after_rmw();
 +
     /* This is the "Responsibility Hand-Off" protocol; a lock() picks from
      * a concurrent unlock() the responsibility of waking somebody up.
      */
 -    old_handoff = qatomic_mb_read(&mutex->handoff);
 +    old_handoff = qatomic_read(&mutex->handoff);
     if (old_handoff &&
         has_waiters(mutex) &&
         qatomic_cmpxchg(&mutex->handoff, old_handoff, 0) == old_handoff) {
@@ -308,6 +314,7 @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
         }
         our_handoff = mutex->sequence;
 +        /* Set handoff before checking for waiters.  */
         qatomic_mb_set(&mutex->handoff, our_handoff);
         if (!has_waiters(mutex)) {
             /* The concurrent lock has not added itself yet, so it
 -- 
 2.37.3
--- a/SOURCES/kvm-qemu-img-bitmap-Report-errors-while-closing-the-imag.patch
+++ b/SOURCES/kvm-qemu-img-bitmap-Report-errors-while-closing-the-imag.patch
@ -0,0 +1,70 @@
 From 648193b48d8aeaded90fd657e3610d8040f505fc Mon Sep 17 00:00:00 2001
 From: Kevin Wolf <kwolf@redhat.com>
 Date: Thu, 12 Jan 2023 20:14:53 +0100
 Subject: [PATCH 3/4] qemu-img bitmap: Report errors while closing the image
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Kevin Wolf <kwolf@redhat.com>
 RH-MergeRequest: 251: qemu-img: Fix exit code for errors closing the image
 RH-Bugzilla: 2147617
 RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Acked-by: Stefano Garzarella <sgarzare@redhat.com>
 RH-Commit: [3/4] 8e13e09564718a0badd03af84f036246a46a0eba
 blk_unref() can't report any errors that happen while closing the image.
 For example, if qcow2 hits an -ENOSPC error while writing out dirty
 bitmaps when it's closed, it prints error messages to stderr, but
 'qemu-img bitmap' won't see any error return value and will therefore
 look successful with exit code 0.
 In order to fix this, manually inactivate the image first before calling
 blk_unref(). This already performs the operations that would be most
 likely to fail while closing the image, but it can still return errors.
 Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1330
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 Message-Id: <20230112191454.169353-4-kwolf@redhat.com>
 Reviewed-by: Hanna Czenczek <hreitz@redhat.com>
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 (cherry picked from commit c5e477110dcb8ef4642dce399777c3dee68fa96c)
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
 qemu-img.c | 11 +++++++++++
 1 file changed, 11 insertions(+)
 diff --git a/qemu-img.c b/qemu-img.c
 index 18833f7d69..7d035c0c7f 100644
 --- a/qemu-img.c
 +++ b/qemu-img.c
@@ -4622,6 +4622,7 @@ static int img_bitmap(int argc, char **argv)
     QSIMPLEQ_HEAD(, ImgBitmapAction) actions;
     ImgBitmapAction *act, *act_next;
     const char *op;
 +    int inactivate_ret;
     QSIMPLEQ_INIT(&actions);
@@ -4806,6 +4807,16 @@ static int img_bitmap(int argc, char **argv)
     ret = 0;
  out:
 +    /*
 +     * Manually inactivate the images first because this way we can know whether
 +     * an error occurred. blk_unref() doesn't tell us about failures.
 +     */
 +    inactivate_ret = bdrv_inactivate_all();
 +    if (inactivate_ret < 0) {
 +        error_report("Error while closing the image: %s", strerror(-inactivate_ret));
 +        ret = 1;
 +    }
 +
     blk_unref(src);
     blk_unref(blk);
     qemu_opts_del(opts);
 -- 
 2.37.3
--- a/SOURCES/kvm-qemu-img-commit-Report-errors-while-closing-the-imag.patch
+++ b/SOURCES/kvm-qemu-img-commit-Report-errors-while-closing-the-imag.patch
@ -0,0 +1,67 @@
 From 2396df7fe527567e8e78761ef24ea1057ef6fa48 Mon Sep 17 00:00:00 2001
 From: Kevin Wolf <kwolf@redhat.com>
 Date: Thu, 12 Jan 2023 20:14:52 +0100
 Subject: [PATCH 2/4] qemu-img commit: Report errors while closing the image
 RH-Author: Kevin Wolf <kwolf@redhat.com>
 RH-MergeRequest: 251: qemu-img: Fix exit code for errors closing the image
 RH-Bugzilla: 2147617
 RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Acked-by: Stefano Garzarella <sgarzare@redhat.com>
 RH-Commit: [2/4] 28f95bf76d1d63e2b0bed0c2ba5206bd3e5ea4f8
 blk_unref() can't report any errors that happen while closing the image.
 For example, if qcow2 hits an -ENOSPC error while writing out dirty
 bitmaps when it's closed, it prints error messages to stderr, but
 'qemu-img commit' won't see any error return value and will therefore
 look successful with exit code 0.
 In order to fix this, manually inactivate the image first before calling
 blk_unref(). This already performs the operations that would be most
 likely to fail while closing the image, but it can still return errors.
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 Message-Id: <20230112191454.169353-3-kwolf@redhat.com>
 Reviewed-by: Hanna Czenczek <hreitz@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 (cherry picked from commit 44efba2d713aca076c411594d0c1a2b99155eeb3)
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
 qemu-img.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 diff --git a/qemu-img.c b/qemu-img.c
 index f036a1d428..18833f7d69 100644
 --- a/qemu-img.c
 +++ b/qemu-img.c
@@ -443,6 +443,11 @@ static BlockBackend *img_open(bool image_opts,
         blk = img_open_file(filename, NULL, fmt, flags, writethrough, quiet,
                             force_share);
     }
 +
 +    if (blk) {
 +        blk_set_force_allow_inactivate(blk);
 +    }
 +
     return blk;
 }
@@ -1110,6 +1115,14 @@ unref_backing:
 done:
     qemu_progress_end();
 +    /*
 +     * Manually inactivate the image first because this way we can know whether
 +     * an error occurred. blk_unref() doesn't tell us about failures.
 +     */
 +    ret = bdrv_inactivate_all();
 +    if (ret < 0 && !local_err) {
 +        error_setg_errno(&local_err, -ret, "Error while closing the image");
 +    }
     blk_unref(blk);
     if (local_err) {
 -- 
 2.37.3
--- a/SOURCES/kvm-qemu-iotests-Test-qemu-img-bitmap-commit-exit-code-o.patch
+++ b/SOURCES/kvm-qemu-iotests-Test-qemu-img-bitmap-commit-exit-code-o.patch
@ -0,0 +1,166 @@
 From 7c6faae20638f58681df223e0ca44e0a6cb60d2d Mon Sep 17 00:00:00 2001
 From: Kevin Wolf <kwolf@redhat.com>
 Date: Thu, 12 Jan 2023 20:14:54 +0100
 Subject: [PATCH 4/4] qemu-iotests: Test qemu-img bitmap/commit exit code on
 error
 RH-Author: Kevin Wolf <kwolf@redhat.com>
 RH-MergeRequest: 251: qemu-img: Fix exit code for errors closing the image
 RH-Bugzilla: 2147617
 RH-Acked-by: Hanna Czenczek <hreitz@redhat.com>
 RH-Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
 RH-Acked-by: Stefano Garzarella <sgarzare@redhat.com>
 RH-Commit: [4/4] fb2f9de98ddd2ee1d745119e4f15272ef44e0aae
 This tests that when an error happens while writing back bitmaps to the
 image file in qcow2_inactivate(), 'qemu-img bitmap/commit' actually
 return an error value in their exit code instead of making the operation
 look successful to scripts.
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 Message-Id: <20230112191454.169353-5-kwolf@redhat.com>
 Reviewed-by: Hanna Czenczek <hreitz@redhat.com>
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 (cherry picked from commit 07a4e1f8e5418f36424cd57d5d061b090a238c65)
 Signed-off-by: Kevin Wolf <kwolf@redhat.com>
 ---
 .../qemu-iotests/tests/qemu-img-close-errors  | 96 +++++++++++++++++++
 .../tests/qemu-img-close-errors.out           | 23 +++++
 2 files changed, 119 insertions(+)
 create mode 100755 tests/qemu-iotests/tests/qemu-img-close-errors
 create mode 100644 tests/qemu-iotests/tests/qemu-img-close-errors.out
 diff --git a/tests/qemu-iotests/tests/qemu-img-close-errors b/tests/qemu-iotests/tests/qemu-img-close-errors
 new file mode 100755
 index 0000000000..50bfb6cfa2
 --- /dev/null
 +++ b/tests/qemu-iotests/tests/qemu-img-close-errors
@@ -0,0 +1,96 @@
 +#!/usr/bin/env bash
 +# group: rw auto quick
 +#
 +# Check that errors while closing the image, in particular writing back dirty
 +# bitmaps, is correctly reported with a failing qemu-img exit code.
 +#
 +# Copyright (C) 2023 Red Hat, Inc.
 +#
 +# This program is free software; you can redistribute it and/or modify
 +# it under the terms of the GNU General Public License as published by
 +# the Free Software Foundation; either version 2 of the License, or
 +# (at your option) any later version.
 +#
 +# This program is distributed in the hope that it will be useful,
 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 +# GNU General Public License for more details.
 +#
 +# You should have received a copy of the GNU General Public License
 +# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 +#
 +
 +# creator
 +owner=kwolf@redhat.com
 +
 +seq="$(basename $0)"
 +echo "QA output created by $seq"
 +
 +status=1	# failure is the default!
 +
 +_cleanup()
 +{
 +    _cleanup_test_img
 +}
 +trap "_cleanup; exit \$status" 0 1 2 3 15
 +
 +# get standard environment, filters and checks
 +cd ..
 +. ./common.rc
 +. ./common.filter
 +
 +_supported_fmt qcow2
 +_supported_proto file
 +_supported_os Linux
 +
 +size=1G
 +
 +# The error we are going to use is ENOSPC. Depending on how many bitmaps we
 +# create in the backing file (and therefore increase the used up space), we get
 +# failures in different places. With a low number, only merging the bitmap
 +# fails, whereas with a higher number, already 'qemu-img commit' fails.
 +for max_bitmap in 6 7; do
 +    echo
 +    echo "=== Test with $max_bitmap bitmaps ==="
 +
 +    TEST_IMG="$TEST_IMG.base" _make_test_img -q $size
 +    for i in $(seq 1 $max_bitmap); do
 +        $QEMU_IMG bitmap --add "$TEST_IMG.base" "stale-bitmap-$i"
 +    done
 +
 +    # Simulate a block device of 128 MB by resizing the image file accordingly
 +    # and then enforcing the size with the raw driver
 +    $QEMU_IO -f raw -c "truncate 128M" "$TEST_IMG.base"
 +    BASE_JSON='json:{
 +        "driver": "qcow2",
 +        "file": {
 +            "driver": "raw",
 +            "size": 134217728,
 +            "file": {
 +                "driver": "file",
 +                "filename":"'"$TEST_IMG.base"'"
 +            }
 +        }
 +    }'
 +
 +    _make_test_img -q -b "$BASE_JSON" -F $IMGFMT
 +    $QEMU_IMG bitmap --add "$TEST_IMG" "good-bitmap"
 +
 +    $QEMU_IO -c 'write 0 126m' "$TEST_IMG" | _filter_qemu_io
 +
 +    $QEMU_IMG commit -d "$TEST_IMG" 2>&1 | _filter_generated_node_ids
 +    echo "qemu-img commit exit code: ${PIPESTATUS[0]}"
 +
 +    $QEMU_IMG bitmap --add "$BASE_JSON" "good-bitmap"
 +    echo "qemu-img bitmap --add exit code: $?"
 +
 +    $QEMU_IMG bitmap --merge "good-bitmap" -b "$TEST_IMG" "$BASE_JSON" \
 +        "good-bitmap" 2>&1 | _filter_generated_node_ids
 +    echo "qemu-img bitmap --merge exit code:  ${PIPESTATUS[0]}"
 +done
 +
 +# success, all done
 +echo "*** done"
 +rm -f $seq.full
 +status=0
 +
 diff --git a/tests/qemu-iotests/tests/qemu-img-close-errors.out b/tests/qemu-iotests/tests/qemu-img-close-errors.out
 new file mode 100644
 index 0000000000..1bfe88f176
 --- /dev/null
 +++ b/tests/qemu-iotests/tests/qemu-img-close-errors.out
@@ -0,0 +1,23 @@
 +QA output created by qemu-img-close-errors
 +
 +=== Test with 6 bitmaps ===
 +wrote 132120576/132120576 bytes at offset 0
 +126 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +Image committed.
 +qemu-img commit exit code: 0
 +qemu-img bitmap --add exit code: 0
 +qemu-img: Lost persistent bitmaps during inactivation of node 'NODE_NAME': Failed to write bitmap 'good-bitmap' to file: No space left on device
 +qemu-img: Error while closing the image: Invalid argument
 +qemu-img: Lost persistent bitmaps during inactivation of node 'NODE_NAME': Failed to write bitmap 'good-bitmap' to file: No space left on device
 +qemu-img bitmap --merge exit code:  1
 +
 +=== Test with 7 bitmaps ===
 +wrote 132120576/132120576 bytes at offset 0
 +126 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 +qemu-img: Lost persistent bitmaps during inactivation of node 'NODE_NAME': Failed to write bitmap 'stale-bitmap-7' to file: No space left on device
 +qemu-img: Lost persistent bitmaps during inactivation of node 'NODE_NAME': Failed to write bitmap 'stale-bitmap-7' to file: No space left on device
 +qemu-img: Error while closing the image: Invalid argument
 +qemu-img commit exit code: 1
 +qemu-img bitmap --add exit code: 0
 +qemu-img bitmap --merge exit code:  0
 +*** done
 -- 
 2.37.3
--- a/SOURCES/kvm-qemu-options.hx-Update-the-reduced-phys-bits-documen.patch
+++ b/SOURCES/kvm-qemu-options.hx-Update-the-reduced-phys-bits-documen.patch
@ -0,0 +1,61 @@
 From 095811c08557b0a2ad1a433d28699ead1e5ef664 Mon Sep 17 00:00:00 2001
 From: Bandan Das <bsd@redhat.com>
 Date: Thu, 3 Aug 2023 15:12:15 -0400
 Subject: [PATCH 2/5] qemu-options.hx: Update the reduced-phys-bits
 documentation
 RH-Author: Bandan Das <None>
 RH-MergeRequest: 296: Updates to SEV reduced-phys-bits parameter
 RH-Bugzilla: 2214840
 RH-Acked-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
 RH-Commit: [2/4] f8e8f5aeff449a34ce90c6e55e2a51873a6e6a87
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2214840
 commit 326e3015c4c6f3197157ea0bb00826ae740e2fad
 Author: Tom Lendacky <thomas.lendacky@amd.com>
 Date:   Fri Sep 30 10:14:28 2022 -0500
    qemu-options.hx: Update the reduced-phys-bits documentation
    A guest only ever experiences, at most, 1 bit of reduced physical
    addressing. Update the documentation to reflect this as well as change
    the example value on the reduced-phys-bits option.
    Fixes: a9b4942f48 ("target/i386: add Secure Encrypted Virtualization (SEV) object")
    Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
    Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
    Message-Id: <13a62ced1808546c1d398e2025cf85f4c94ae123.1664550870.git.thomas.lendacky@amd.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Bandan Das <bsd@redhat.com>
 ---
 qemu-options.hx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 diff --git a/qemu-options.hx b/qemu-options.hx
 index 4b7798088b..981248e283 100644
 --- a/qemu-options.hx
 +++ b/qemu-options.hx
@@ -5204,7 +5204,7 @@ SRST
         physical address space. The ``reduced-phys-bits`` is used to
         provide the number of bits we loose in physical address space.
         Similar to C-bit, the value is Host family dependent. On EPYC,
 -        the value should be 5.
 +        a guest will lose a maximum of 1 bit, so the value should be 1.
         The ``sev-device`` provides the device file to use for
         communicating with the SEV firmware running inside AMD Secure
@@ -5239,7 +5239,7 @@ SRST
              # |qemu_system_x86| \\
                  ...... \\
 -                 -object sev-guest,id=sev0,cbitpos=47,reduced-phys-bits=5 \\
 +                 -object sev-guest,id=sev0,cbitpos=47,reduced-phys-bits=1 \\
                  -machine ...,memory-encryption=sev0 \\
                  .....
 -- 
 2.37.3
--- a/SOURCES/kvm-qemu-thread-posix-cleanup-fix-document-QemuEvent.patch
+++ b/SOURCES/kvm-qemu-thread-posix-cleanup-fix-document-QemuEvent.patch
@ -0,0 +1,146 @@
 From d46ca52c3f42add549bd3790a41d06594821334e Mon Sep 17 00:00:00 2001
 From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 Date: Thu, 9 Mar 2023 08:10:57 -0500
 Subject: [PATCH 03/13] qemu-thread-posix: cleanup, fix, document QemuEvent
 RH-Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-MergeRequest: 263: qatomic: add smp_mb__before/after_rmw()
 RH-Bugzilla: 2168472
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Acked-by: Eric Auger <eric.auger@redhat.com>
 RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
 RH-Acked-by: David Hildenbrand <david@redhat.com>
 RH-Commit: [3/10] 746070c4d78c7f0a9ac4456d9aee69475acb8964
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2168472
 commit 9586a1329f5dce6c1d7f4de53cf0536644d7e593
 Author: Paolo Bonzini <pbonzini@redhat.com>
 Date:   Thu Mar 2 11:19:52 2023 +0100
    qemu-thread-posix: cleanup, fix, document QemuEvent
    QemuEvent is currently broken on ARM due to missing memory barriers
    after qatomic_*().  Apart from adding the memory barrier, a closer look
    reveals some unpaired memory barriers too.  Document more clearly what
    is going on.
    Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
    Reviewed-by: David Hildenbrand <david@redhat.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 ---
 util/qemu-thread-posix.c | 69 ++++++++++++++++++++++++++++------------
 1 file changed, 49 insertions(+), 20 deletions(-)
 diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c
 index e1225b63bd..dd3b6d4670 100644
 --- a/util/qemu-thread-posix.c
 +++ b/util/qemu-thread-posix.c
@@ -430,13 +430,21 @@ void qemu_event_destroy(QemuEvent *ev)
 void qemu_event_set(QemuEvent *ev)
 {
 -    /* qemu_event_set has release semantics, but because it *loads*
 +    assert(ev->initialized);
 +
 +    /*
 +     * Pairs with both qemu_event_reset() and qemu_event_wait().
 +     *
 +     * qemu_event_set has release semantics, but because it *loads*
      * ev->value we need a full memory barrier here.
      */
 -    assert(ev->initialized);
     smp_mb();
     if (qatomic_read(&ev->value) != EV_SET) {
 -        if (qatomic_xchg(&ev->value, EV_SET) == EV_BUSY) {
 +        int old = qatomic_xchg(&ev->value, EV_SET);
 +
 +        /* Pairs with memory barrier in kernel futex_wait system call.  */
 +        smp_mb__after_rmw();
 +        if (old == EV_BUSY) {
             /* There were waiters, wake them up.  */
             qemu_futex_wake(ev, INT_MAX);
         }
@@ -445,18 +453,19 @@ void qemu_event_set(QemuEvent *ev)
 void qemu_event_reset(QemuEvent *ev)
 {
 -    unsigned value;
 -
     assert(ev->initialized);
 -    value = qatomic_read(&ev->value);
 -    smp_mb_acquire();
 -    if (value == EV_SET) {
 -        /*
 -         * If there was a concurrent reset (or even reset+wait),
 -         * do nothing.  Otherwise change EV_SET->EV_FREE.
 -         */
 -        qatomic_or(&ev->value, EV_FREE);
 -    }
 +
 +    /*
 +     * If there was a concurrent reset (or even reset+wait),
 +     * do nothing.  Otherwise change EV_SET->EV_FREE.
 +     */
 +    qatomic_or(&ev->value, EV_FREE);
 +
 +    /*
 +     * Order reset before checking the condition in the caller.
 +     * Pairs with the first memory barrier in qemu_event_set().
 +     */
 +    smp_mb__after_rmw();
 }
 void qemu_event_wait(QemuEvent *ev)
@@ -464,20 +473,40 @@ void qemu_event_wait(QemuEvent *ev)
     unsigned value;
     assert(ev->initialized);
 -    value = qatomic_read(&ev->value);
 -    smp_mb_acquire();
 +
 +    /*
 +     * qemu_event_wait must synchronize with qemu_event_set even if it does
 +     * not go down the slow path, so this load-acquire is needed that
 +     * synchronizes with the first memory barrier in qemu_event_set().
 +     *
 +     * If we do go down the slow path, there is no requirement at all: we
 +     * might miss a qemu_event_set() here but ultimately the memory barrier in
 +     * qemu_futex_wait() will ensure the check is done correctly.
 +     */
 +    value = qatomic_load_acquire(&ev->value);
     if (value != EV_SET) {
         if (value == EV_FREE) {
             /*
 -             * Leave the event reset and tell qemu_event_set that there
 -             * are waiters.  No need to retry, because there cannot be
 -             * a concurrent busy->free transition.  After the CAS, the
 -             * event will be either set or busy.
 +             * Leave the event reset and tell qemu_event_set that there are
 +             * waiters.  No need to retry, because there cannot be a concurrent
 +             * busy->free transition.  After the CAS, the event will be either
 +             * set or busy.
 +             *
 +             * This cmpxchg doesn't have particular ordering requirements if it
 +             * succeeds (moving the store earlier can only cause qemu_event_set()
 +             * to issue _more_ wakeups), the failing case needs acquire semantics
 +             * like the load above.
              */
             if (qatomic_cmpxchg(&ev->value, EV_FREE, EV_BUSY) == EV_SET) {
                 return;
             }
         }
 +
 +        /*
 +         * This is the final check for a concurrent set, so it does need
 +         * a smp_mb() pairing with the second barrier of qemu_event_set().
 +         * The barrier is inside the FUTEX_WAIT system call.
 +         */
         qemu_futex_wait(ev, EV_BUSY);
     }
 }
 -- 
 2.37.3
--- a/SOURCES/kvm-qemu-thread-win32-cleanup-fix-document-QemuEvent.patch
+++ b/SOURCES/kvm-qemu-thread-win32-cleanup-fix-document-QemuEvent.patch
@ -0,0 +1,162 @@
 From fa730378c42567e77eaf3e70983108f31f9001b9 Mon Sep 17 00:00:00 2001
 From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 Date: Thu, 9 Mar 2023 08:11:05 -0500
 Subject: [PATCH 04/13] qemu-thread-win32: cleanup, fix, document QemuEvent
 RH-Author: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 RH-MergeRequest: 263: qatomic: add smp_mb__before/after_rmw()
 RH-Bugzilla: 2168472
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Acked-by: Eric Auger <eric.auger@redhat.com>
 RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
 RH-Acked-by: David Hildenbrand <david@redhat.com>
 RH-Commit: [4/10] 43d5bd903b460d4c3c5793a456820e8c5c8521d9
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2168472
 commit 6c5df4b48f0c52a61342ecb307a43f4c2a3565c4
 Author: Paolo Bonzini <pbonzini@redhat.com>
 Date:   Thu Mar 2 11:22:50 2023 +0100
    qemu-thread-win32: cleanup, fix, document QemuEvent
    QemuEvent is currently broken on ARM due to missing memory barriers
    after qatomic_*().  Apart from adding the memory barrier, a closer look
    reveals some unpaired memory barriers that are not really needed and
    complicated the functions unnecessarily.  Also, it is relying on
    a memory barrier in ResetEvent(); the barrier _ought_ to be there
    but there is really no documentation about it, so make it explicit.
    Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
    Reviewed-by: David Hildenbrand <david@redhat.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
 ---
 util/qemu-thread-win32.c | 82 +++++++++++++++++++++++++++-------------
 1 file changed, 56 insertions(+), 26 deletions(-)
 diff --git a/util/qemu-thread-win32.c b/util/qemu-thread-win32.c
 index 52eb19f351..c10249bc2e 100644
 --- a/util/qemu-thread-win32.c
 +++ b/util/qemu-thread-win32.c
@@ -246,12 +246,20 @@ void qemu_event_destroy(QemuEvent *ev)
 void qemu_event_set(QemuEvent *ev)
 {
     assert(ev->initialized);
 -    /* qemu_event_set has release semantics, but because it *loads*
 +
 +    /*
 +     * Pairs with both qemu_event_reset() and qemu_event_wait().
 +     *
 +     * qemu_event_set has release semantics, but because it *loads*
      * ev->value we need a full memory barrier here.
      */
     smp_mb();
     if (qatomic_read(&ev->value) != EV_SET) {
 -        if (qatomic_xchg(&ev->value, EV_SET) == EV_BUSY) {
 +        int old = qatomic_xchg(&ev->value, EV_SET);
 +
 +        /* Pairs with memory barrier after ResetEvent.  */
 +        smp_mb__after_rmw();
 +        if (old == EV_BUSY) {
             /* There were waiters, wake them up.  */
             SetEvent(ev->event);
         }
@@ -260,17 +268,19 @@ void qemu_event_set(QemuEvent *ev)
 void qemu_event_reset(QemuEvent *ev)
 {
 -    unsigned value;
 -
     assert(ev->initialized);
 -    value = qatomic_read(&ev->value);
 -    smp_mb_acquire();
 -    if (value == EV_SET) {
 -        /* If there was a concurrent reset (or even reset+wait),
 -         * do nothing.  Otherwise change EV_SET->EV_FREE.
 -         */
 -        qatomic_or(&ev->value, EV_FREE);
 -    }
 +
 +    /*
 +     * If there was a concurrent reset (or even reset+wait),
 +     * do nothing.  Otherwise change EV_SET->EV_FREE.
 +     */
 +    qatomic_or(&ev->value, EV_FREE);
 +
 +    /*
 +     * Order reset before checking the condition in the caller.
 +     * Pairs with the first memory barrier in qemu_event_set().
 +     */
 +    smp_mb__after_rmw();
 }
 void qemu_event_wait(QemuEvent *ev)
@@ -278,29 +288,49 @@ void qemu_event_wait(QemuEvent *ev)
     unsigned value;
     assert(ev->initialized);
 -    value = qatomic_read(&ev->value);
 -    smp_mb_acquire();
 +
 +    /*
 +     * qemu_event_wait must synchronize with qemu_event_set even if it does
 +     * not go down the slow path, so this load-acquire is needed that
 +     * synchronizes with the first memory barrier in qemu_event_set().
 +     *
 +     * If we do go down the slow path, there is no requirement at all: we
 +     * might miss a qemu_event_set() here but ultimately the memory barrier in
 +     * qemu_futex_wait() will ensure the check is done correctly.
 +     */
 +    value = qatomic_load_acquire(&ev->value);
     if (value != EV_SET) {
         if (value == EV_FREE) {
 -            /* qemu_event_set is not yet going to call SetEvent, but we are
 -             * going to do another check for EV_SET below when setting EV_BUSY.
 -             * At that point it is safe to call WaitForSingleObject.
 +            /*
 +             * Here the underlying kernel event is reset, but qemu_event_set is
 +             * not yet going to call SetEvent.  However, there will be another
 +             * check for EV_SET below when setting EV_BUSY.  At that point it
 +             * is safe to call WaitForSingleObject.
              */
             ResetEvent(ev->event);
 -            /* Tell qemu_event_set that there are waiters.  No need to retry
 -             * because there cannot be a concurrent busy->free transition.
 -             * After the CAS, the event will be either set or busy.
 +            /*
 +             * It is not clear whether ResetEvent provides this barrier; kernel
 +             * APIs (KeResetEvent/KeClearEvent) do not.  Better safe than sorry!
 +             */
 +            smp_mb();
 +
 +            /*
 +             * Leave the event reset and tell qemu_event_set that there are
 +             * waiters.  No need to retry, because there cannot be a concurrent
 +             * busy->free transition.  After the CAS, the event will be either
 +             * set or busy.
              */
             if (qatomic_cmpxchg(&ev->value, EV_FREE, EV_BUSY) == EV_SET) {
 -                value = EV_SET;
 -            } else {
 -                value = EV_BUSY;
 +                return;
             }
         }
 -        if (value == EV_BUSY) {
 -            WaitForSingleObject(ev->event, INFINITE);
 -        }
 +
 +        /*
 +         * ev->value is now EV_BUSY.  Since we didn't observe EV_SET,
 +         * qemu_event_set() must observe EV_BUSY and call SetEvent().
 +         */
 +        WaitForSingleObject(ev->event, INFINITE);
     }
 }
 -- 
 2.37.3
--- a/SOURCES/kvm-raven-disable-reentrancy-detection-for-iomem.patch
+++ b/SOURCES/kvm-raven-disable-reentrancy-detection-for-iomem.patch
@ -0,0 +1,55 @@
 From c5cb3e97098834f9cf12b6c5260d9b43d68d64eb Mon Sep 17 00:00:00 2001
 From: Jon Maloy <jmaloy@redhat.com>
 Date: Tue, 9 May 2023 10:29:03 -0400
 Subject: [PATCH 07/15] raven: disable reentrancy detection for iomem
 RH-Author: Jon Maloy <jmaloy@redhat.com>
 RH-MergeRequest: 277: memory: prevent dma-reentracy issues
 RH-Bugzilla: 1999236
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [7/12] f41983390acba68043d386be090172dd17a5e58c (redhat/rhel/src/qemu-kvm/jons-qemu-kvm-2)
 Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1999236
 Upstream: Merged
 CVE: CVE-2021-3750
 commit 6dad5a6810d9c60ca320d01276f6133bbcfa1fc7
 Author: Alexander Bulekov <alxndr@bu.edu>
 Date:   Thu Apr 27 17:10:12 2023 -0400
    raven: disable reentrancy detection for iomem
    As the code is designed for re-entrant calls from raven_io_ops to
    pci-conf, mark raven_io_ops as reentrancy-safe.
    Signed-off-by: Alexander Bulekov <alxndr@bu.edu>
    Message-Id: <20230427211013.2994127-8-alxndr@bu.edu>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Jon Maloy <jmaloy@redhat.com>
 ---
 hw/pci-host/raven.c | 7 +++++++
 1 file changed, 7 insertions(+)
 diff --git a/hw/pci-host/raven.c b/hw/pci-host/raven.c
 index 6e514f75eb..245b1653e4 100644
 --- a/hw/pci-host/raven.c
 +++ b/hw/pci-host/raven.c
@@ -294,6 +294,13 @@ static void raven_pcihost_initfn(Object *obj)
     memory_region_init(&s->pci_memory, obj, "pci-memory", 0x3f000000);
     address_space_init(&s->pci_io_as, &s->pci_io, "raven-io");
 +    /*
 +     * Raven's raven_io_ops use the address-space API to access pci-conf-idx
 +     * (which is also owned by the raven device). As such, mark the
 +     * pci_io_non_contiguous as re-entrancy safe.
 +     */
 +    s->pci_io_non_contiguous.disable_reentrancy_guard = true;
 +
     /* CPU address space */
     memory_region_add_subregion(address_space_mem, PCI_IO_BASE_ADDR,
                                 &s->pci_io);
 -- 
 2.37.3
--- a/SOURCES/kvm-redhat-Update-linux-headers-for-kvm_s390_vm_cpu_uv_f.patch
+++ b/SOURCES/kvm-redhat-Update-linux-headers-for-kvm_s390_vm_cpu_uv_f.patch
@ -0,0 +1,56 @@
 From 76e75a129e59a33103aa7d1d92074ddcef556980 Mon Sep 17 00:00:00 2001
 From: Thomas Huth <thuth@redhat.com>
 Date: Tue, 12 Sep 2023 11:24:40 +0200
 Subject: [PATCH 3/5] redhat: Update linux-headers for kvm_s390_vm_cpu_uv_feat
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Thomas Huth <thuth@redhat.com>
 RH-MergeRequest: 321: Enable Secure Execution Crypto Passthrough for KVM on s390x
 RH-Bugzilla: 2111390
 RH-Acked-by: Cédric Le Goater <clg@redhat.com>
 RH-Commit: [3/5] f1329f5ce5f66033ead7777384dcc1613cad1226
 Upstream Status: rhel-only
 This hunk is part of upstream commit da3c22c74a3c
 ("linux-headers: Update to Linux v6.6-rc1"), but since that
 commit updates a lot of files and does not apply cleanly,
 we only focus on the necessary change here.
 Signed-off-by: Thomas Huth <thuth@redhat.com>
 ---
 linux-headers/asm-s390/kvm.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 diff --git a/linux-headers/asm-s390/kvm.h b/linux-headers/asm-s390/kvm.h
 index f053b8304a..6706bdc5cc 100644
 --- a/linux-headers/asm-s390/kvm.h
 +++ b/linux-headers/asm-s390/kvm.h
@@ -158,6 +158,22 @@ struct kvm_s390_vm_cpu_subfunc {
 	__u8 reserved[1728];
 };
 +#define KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST	6
 +#define KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST	7
 +
 +#define KVM_S390_VM_CPU_UV_FEAT_NR_BITS	64
 +struct kvm_s390_vm_cpu_uv_feat {
 +	union {
 +		struct {
 +			__u64 : 4;
 +			__u64 ap : 1;		/* bit 4 */
 +			__u64 ap_intr : 1;	/* bit 5 */
 +			__u64 : 58;
 +		};
 +		__u64 feat;
 +	};
 +};
 +
 /* kvm attributes for crypto */
 #define KVM_S390_VM_CRYPTO_ENABLE_AES_KW	0
 #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW	1
 -- 
 2.41.0
--- a/SOURCES/kvm-s390-kvm-adjust-diag318-resets-to-retain-data.patch
+++ b/SOURCES/kvm-s390-kvm-adjust-diag318-resets-to-retain-data.patch
@ -0,0 +1,88 @@
 From 3c7bc4319d4e475c820a63176d18afb7b4b2ed78 Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
 Date: Tue, 23 May 2023 12:34:33 +0200
 Subject: [PATCH 02/22] s390: kvm: adjust diag318 resets to retain data
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Cédric Le Goater <clg@redhat.com>
 RH-MergeRequest: 279: Backport latest s390x-related fixes from upstream QEMU for qemu-kvm in RHEL 8.9
 RH-Bugzilla: 2169308 2209605
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: David Hildenbrand <david@redhat.com>
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Commit: [1/21] 16f2ff166efdd26a3be98d7c97d3b184598d1ca4
 Bugzilla: https://bugzilla.redhat.com/2169308
 commit c35aff184b2ed5be930da671ea25c857713555af
 Author: Collin L. Walling <walling@linux.ibm.com>
 Date:   Wed Nov 17 10:23:03 2021 -0500
    s390: kvm: adjust diag318 resets to retain data
    The CPNC portion of the diag318 data is erroneously reset during an
    initial CPU reset caused by SIGP. Let's go ahead and relocate the
    diag318_info field within the CPUS390XState struct such that it is
    only zeroed during a clear reset. This way, the CPNC will be retained
    for each VCPU in the configuration after the diag318 instruction
    has been invoked.
    The s390_machine_reset code already takes care of zeroing the diag318
    data on VM resets, which also cover resets caused by diag308.
    Fixes: fabdada9357b ("s390: guest support for diagnose 0x318")
    Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
    Signed-off-by: Collin Walling <walling@linux.ibm.com>
    Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
    Reviewed-by: Christian Borntraeger <borntraeger@linux.ibm.com>
    Message-Id: <20211117152303.627969-1-walling@linux.ibm.com>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Cédric Le Goater <clg@redhat.com>
 ---
 target/s390x/cpu.h     | 4 ++--
 target/s390x/kvm/kvm.c | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)
 diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
 index ca3845d023..a75e559134 100644
 --- a/target/s390x/cpu.h
 +++ b/target/s390x/cpu.h
@@ -63,6 +63,8 @@ struct CPUS390XState {
     uint64_t etoken;       /* etoken */
     uint64_t etoken_extension; /* etoken extension */
 +    uint64_t diag318_info;
 +
     /* Fields up to this point are not cleared by initial CPU reset */
     struct {} start_initial_reset_fields;
@@ -118,8 +120,6 @@ struct CPUS390XState {
     uint16_t external_call_addr;
     DECLARE_BITMAP(emergency_signals, S390_MAX_CPUS);
 -    uint64_t diag318_info;
 -
 #if !defined(CONFIG_USER_ONLY)
     uint64_t tlb_fill_tec;   /* translation exception code during tlb_fill */
     int tlb_fill_exc;        /* exception number seen during tlb_fill */
 diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c
 index d36b44f32a..8d36c377b5 100644
 --- a/target/s390x/kvm/kvm.c
 +++ b/target/s390x/kvm/kvm.c
@@ -1598,6 +1598,10 @@ void kvm_s390_set_diag318(CPUState *cs, uint64_t diag318_info)
         env->diag318_info = diag318_info;
         cs->kvm_run->s.regs.diag318 = diag318_info;
         cs->kvm_run->kvm_dirty_regs |= KVM_SYNC_DIAG318;
 +        /*
 +         * diag 318 info is zeroed during a clear reset and
 +         * diag 308 IPL subcodes.
 +         */
     }
 }
 -- 
 2.37.3
--- a/SOURCES/kvm-s390x-ap-fix-missing-subsystem-reset-registration.patch
+++ b/SOURCES/kvm-s390x-ap-fix-missing-subsystem-reset-registration.patch
@ -0,0 +1,44 @@
 From eb60b6cab9550a62f0b20a9e6d69547d651e3020 Mon Sep 17 00:00:00 2001
 From: Janosch Frank <frankja@linux.ibm.com>
 Date: Wed, 23 Aug 2023 16:22:15 +0200
 Subject: [PATCH 1/5] s390x/ap: fix missing subsystem reset registration
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Thomas Huth <thuth@redhat.com>
 RH-MergeRequest: 321: Enable Secure Execution Crypto Passthrough for KVM on s390x
 RH-Bugzilla: 2111390
 RH-Acked-by: Cédric Le Goater <clg@redhat.com>
 RH-Commit: [1/5] 4ebe81bb6cc4fc137ca4ebc9c0cebdedc421cc91
 A subsystem reset contains a reset of AP resources which has been
 missing.  Adding the AP bridge to the list of device types that need
 reset fixes this issue.
 Reviewed-by: Jason J. Herne <jjherne@linux.ibm.com>
 Reviewed-by: Tony Krowiak <akrowiak@linux.ibm.com>
 Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
 Fixes: a51b3153 ("s390x/ap: base Adjunct Processor (AP) object model")
 Message-ID: <20230823142219.1046522-2-seiden@linux.ibm.com>
 Signed-off-by: Thomas Huth <thuth@redhat.com>
 (cherry picked from commit 297ec01f0b9864ea8209ca0ddc6643b4c0574bdb)
 ---
 hw/s390x/s390-virtio-ccw.c | 1 +
 1 file changed, 1 insertion(+)
 diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
 index 4a7cd21cac..412d73715a 100644
 --- a/hw/s390x/s390-virtio-ccw.c
 +++ b/hw/s390x/s390-virtio-ccw.c
@@ -100,6 +100,7 @@ static const char *const reset_dev_types[] = {
     "s390-flic",
     "diag288",
     TYPE_S390_PCI_HOST_BRIDGE,
 +    TYPE_AP_BRIDGE,
 };
 static void subsystem_reset(void)
 -- 
 2.41.0
--- a/SOURCES/kvm-s390x-css-revert-SCSW-ctrl-flag-bits-on-error.patch
+++ b/SOURCES/kvm-s390x-css-revert-SCSW-ctrl-flag-bits-on-error.patch
@ -0,0 +1,168 @@
 From 4d940934c304a71813dfa4598b20fafe9d2f5625 Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
 Date: Tue, 23 May 2023 12:34:33 +0200
 Subject: [PATCH 19/22] s390x/css: revert SCSW ctrl/flag bits on error
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Cédric Le Goater <clg@redhat.com>
 RH-MergeRequest: 279: Backport latest s390x-related fixes from upstream QEMU for qemu-kvm in RHEL 8.9
 RH-Bugzilla: 2169308 2209605
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: David Hildenbrand <david@redhat.com>
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Commit: [18/21] e4d5797ab93ba4afd9978a1d3e1f9d05da301506
 Bugzilla: https://bugzilla.redhat.com/2169308
 commit f53b033e4cd2e7706df3cca04f3bf3c5ffc6b08c
 Author: Peter Jin <pjin@linux.ibm.com>
 Date:   Thu Oct 27 23:23:41 2022 +0200
    s390x/css: revert SCSW ctrl/flag bits on error
    Revert the control and flag bits in the subchannel status word in case
    the SSCH operation fails with non-zero CC (ditto for CSCH and HSCH).
    According to POPS, the control and flag bits are only changed if SSCH,
    CSCH, and HSCH return CC 0, and no other action should be taken otherwise.
    In order to simulate that after the fact, the bits need to be reverted on
    non-zero CC.
    While the do_subchannel_work logic for virtual (virtio) devices will
    return condition code 0, passthrough (vfio) devices may encounter
    errors from either the host kernel or real hardware that need to be
    accounted for after this point. This includes restoring the state of
    the Subchannel Status Word to reflect the subchannel, as these bits
    would not be set in the event of a non-zero condition code from the
    affected instructions.
    Experimentation has shown that a failure on a START SUBCHANNEL (SSCH)
    to a passthrough device would leave the subchannel with the START
    PENDING activity control bit set, thus blocking subsequent SSCH
    operations in css_do_ssch() until some form of error recovery was
    undertaken since no interrupt would be expected.
    Signed-off-by: Peter Jin <pjin@linux.ibm.com>
    Message-Id: <20221027212341.2904795-1-pjin@linux.ibm.com>
    Reviewed-by: Eric Farman <farman@linux.ibm.com>
    Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
    [thuth: Updated the commit description to Eric's suggestion]
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Cédric Le Goater <clg@redhat.com>
 ---
 hw/s390x/css.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 48 insertions(+), 3 deletions(-)
 diff --git a/hw/s390x/css.c b/hw/s390x/css.c
 index 7d9523f811..95d1b3a3ce 100644
 --- a/hw/s390x/css.c
 +++ b/hw/s390x/css.c
@@ -1522,21 +1522,37 @@ IOInstEnding css_do_xsch(SubchDev *sch)
 IOInstEnding css_do_csch(SubchDev *sch)
 {
     SCHIB *schib = &sch->curr_status;
 +    uint16_t old_scsw_ctrl;
 +    IOInstEnding ccode;
     if (~(schib->pmcw.flags) & (PMCW_FLAGS_MASK_DNV | PMCW_FLAGS_MASK_ENA)) {
         return IOINST_CC_NOT_OPERATIONAL;
     }
 +    /*
 +     * Save the current scsw.ctrl in case CSCH fails and we need
 +     * to revert the scsw to the status quo ante.
 +     */
 +    old_scsw_ctrl = schib->scsw.ctrl;
 +
     /* Trigger the clear function. */
     schib->scsw.ctrl &= ~(SCSW_CTRL_MASK_FCTL | SCSW_CTRL_MASK_ACTL);
     schib->scsw.ctrl |= SCSW_FCTL_CLEAR_FUNC | SCSW_ACTL_CLEAR_PEND;
 -    return do_subchannel_work(sch);
 +    ccode = do_subchannel_work(sch);
 +
 +    if (ccode != IOINST_CC_EXPECTED) {
 +        schib->scsw.ctrl = old_scsw_ctrl;
 +    }
 +
 +    return ccode;
 }
 IOInstEnding css_do_hsch(SubchDev *sch)
 {
     SCHIB *schib = &sch->curr_status;
 +    uint16_t old_scsw_ctrl;
 +    IOInstEnding ccode;
     if (~(schib->pmcw.flags) & (PMCW_FLAGS_MASK_DNV | PMCW_FLAGS_MASK_ENA)) {
         return IOINST_CC_NOT_OPERATIONAL;
@@ -1553,6 +1569,12 @@ IOInstEnding css_do_hsch(SubchDev *sch)
         return IOINST_CC_BUSY;
     }
 +    /*
 +     * Save the current scsw.ctrl in case HSCH fails and we need
 +     * to revert the scsw to the status quo ante.
 +     */
 +    old_scsw_ctrl = schib->scsw.ctrl;
 +
     /* Trigger the halt function. */
     schib->scsw.ctrl |= SCSW_FCTL_HALT_FUNC;
     schib->scsw.ctrl &= ~SCSW_FCTL_START_FUNC;
@@ -1564,7 +1586,13 @@ IOInstEnding css_do_hsch(SubchDev *sch)
     }
     schib->scsw.ctrl |= SCSW_ACTL_HALT_PEND;
 -    return do_subchannel_work(sch);
 +    ccode = do_subchannel_work(sch);
 +
 +    if (ccode != IOINST_CC_EXPECTED) {
 +        schib->scsw.ctrl = old_scsw_ctrl;
 +    }
 +
 +    return ccode;
 }
 static void css_update_chnmon(SubchDev *sch)
@@ -1605,6 +1633,8 @@ static void css_update_chnmon(SubchDev *sch)
 IOInstEnding css_do_ssch(SubchDev *sch, ORB *orb)
 {
     SCHIB *schib = &sch->curr_status;
 +    uint16_t old_scsw_ctrl, old_scsw_flags;
 +    IOInstEnding ccode;
     if (~(schib->pmcw.flags) & (PMCW_FLAGS_MASK_DNV | PMCW_FLAGS_MASK_ENA)) {
         return IOINST_CC_NOT_OPERATIONAL;
@@ -1626,11 +1656,26 @@ IOInstEnding css_do_ssch(SubchDev *sch, ORB *orb)
     }
     sch->orb = *orb;
     sch->channel_prog = orb->cpa;
 +
 +    /*
 +     * Save the current scsw.ctrl and scsw.flags in case SSCH fails and we need
 +     * to revert the scsw to the status quo ante.
 +     */
 +    old_scsw_ctrl = schib->scsw.ctrl;
 +    old_scsw_flags = schib->scsw.flags;
 +
     /* Trigger the start function. */
     schib->scsw.ctrl |= (SCSW_FCTL_START_FUNC | SCSW_ACTL_START_PEND);
     schib->scsw.flags &= ~SCSW_FLAGS_MASK_PNO;
 -    return do_subchannel_work(sch);
 +    ccode = do_subchannel_work(sch);
 +
 +    if (ccode != IOINST_CC_EXPECTED) {
 +        schib->scsw.ctrl = old_scsw_ctrl;
 +        schib->scsw.flags = old_scsw_flags;
 +    }
 +
 +    return ccode;
 }
 static void copy_irb_to_guest(IRB *dest, const IRB *src, const PMCW *pmcw,
 -- 
 2.37.3
--- a/SOURCES/kvm-s390x-do-a-subsystem-reset-before-the-unprotect-on-r.patch
+++ b/SOURCES/kvm-s390x-do-a-subsystem-reset-before-the-unprotect-on-r.patch
@ -0,0 +1,68 @@
 From 05b145a8d5b1c2f796069cdd81826c00cf7c983e Mon Sep 17 00:00:00 2001
 From: Janosch Frank <frankja@linux.ibm.com>
 Date: Fri, 1 Sep 2023 11:48:51 +0000
 Subject: [PATCH 2/5] s390x: do a subsystem reset before the unprotect on
 reboot
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Thomas Huth <thuth@redhat.com>
 RH-MergeRequest: 321: Enable Secure Execution Crypto Passthrough for KVM on s390x
 RH-Bugzilla: 2111390
 RH-Acked-by: Cédric Le Goater <clg@redhat.com>
 RH-Commit: [2/5] ea430d236e1a20ddad7095d2e6d10f741f9a1907
 Bound APQNs have to be reset before tearing down the secure config via
 s390_machine_unprotect(). Otherwise the Ultravisor will return an error
 code.
 So let's do a subsystem_reset() which includes an AP reset before the
 unprotect call. We'll do a full device_reset() afterwards which will
 reset some devices twice. That's ok since we can't move the
 device_reset() before the unprotect as it includes a CPU clear reset
 which the Ultravisor does not expect at that point in time.
 Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
 Message-ID: <20230901114851.154357-1-frankja@linux.ibm.com>
 Tested-by: Viktor Mihajlovski <mihajlov@linux.ibm.com>
 Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
 Signed-off-by: Thomas Huth <thuth@redhat.com>
 (cherry picked from commit ef1535901a07f2e49fa25c8bcee7f0b73801d824)
 Conflicts:
 	hw/s390x/s390-virtio-ccw.c
 	(contextual conflict due to missing commit 7966d70f6f6b)
 Signed-off-by: Thomas Huth <thuth@redhat.com>
 ---
 hw/s390x/s390-virtio-ccw.c | 10 ++++++++++
 1 file changed, 10 insertions(+)
 diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
 index 412d73715a..17146469ee 100644
 --- a/hw/s390x/s390-virtio-ccw.c
 +++ b/hw/s390x/s390-virtio-ccw.c
@@ -430,10 +430,20 @@ static void s390_machine_reset(MachineState *machine)
     switch (reset_type) {
     case S390_RESET_EXTERNAL:
     case S390_RESET_REIPL:
 +        /*
 +         * Reset the subsystem which includes a AP reset. If a PV
 +         * guest had APQNs attached the AP reset is a prerequisite to
 +         * unprotecting since the UV checks if all APQNs are reset.
 +         */
 +        subsystem_reset();
         if (s390_is_pv()) {
             s390_machine_unprotect(ms);
         }
 +        /*
 +         * Device reset includes CPU clear resets so this has to be
 +         * done AFTER the unprotect call above.
 +         */
         qemu_devices_reset();
         s390_crypto_reset();
 -- 
 2.41.0
--- a/SOURCES/kvm-s390x-follow-qdev-tree-to-detect-SCSI-device-on-a-CC.patch
+++ b/SOURCES/kvm-s390x-follow-qdev-tree-to-detect-SCSI-device-on-a-CC.patch
@ -0,0 +1,73 @@
 From 6c815e78cea7c26e9a3526cbb686f728eac31021 Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
 Date: Tue, 23 May 2023 12:34:33 +0200
 Subject: [PATCH 12/22] s390x: follow qdev tree to detect SCSI device on a CCW
 bus
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Cédric Le Goater <clg@redhat.com>
 RH-MergeRequest: 279: Backport latest s390x-related fixes from upstream QEMU for qemu-kvm in RHEL 8.9
 RH-Bugzilla: 2169308 2209605
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: David Hildenbrand <david@redhat.com>
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Commit: [11/21] 97303bc9c356e8828d185868736b395bc0b70214
 Bugzilla: https://bugzilla.redhat.com/2169308
 commit 7d2eb76d0407fc391b78df16d17f1e616ec3e228
 Author: Paolo Bonzini <pbonzini@redhat.com>
 Date:   Mon Mar 28 09:40:00 2022 +0200
    s390x: follow qdev tree to detect SCSI device on a CCW bus
    Do not make assumptions on the parent type of the SCSIDevice, instead
    use object_dynamic_cast all the way up to the CcwDevice.  This is cleaner
    because there is no guarantee that the bus is on a virtio-scsi device;
    that is only the case for the default configuration of QEMU's s390x
    target.
    Reviewed-by: Thomas Huth <thuth@redhat.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Cédric Le Goater <clg@redhat.com>
 ---
 hw/s390x/ipl.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)
 diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c
 index eb7fc4c4ae..9051d8652d 100644
 --- a/hw/s390x/ipl.c
 +++ b/hw/s390x/ipl.c
@@ -376,14 +376,18 @@ static CcwDevice *s390_get_ccw_device(DeviceState *dev_st, int *devtype)
                 object_dynamic_cast(OBJECT(dev_st),
                                     TYPE_SCSI_DEVICE);
             if (sd) {
 -                SCSIBus *bus = scsi_bus_from_device(sd);
 -                VirtIOSCSI *vdev = container_of(bus, VirtIOSCSI, bus);
 -                VirtIOSCSICcw *scsi_ccw = container_of(vdev, VirtIOSCSICcw,
 -                                                       vdev);
 -
 -                ccw_dev = (CcwDevice *)object_dynamic_cast(OBJECT(scsi_ccw),
 -                                                           TYPE_CCW_DEVICE);
 -                tmp_dt = CCW_DEVTYPE_SCSI;
 +                SCSIBus *sbus = scsi_bus_from_device(sd);
 +                VirtIODevice *vdev = (VirtIODevice *)
 +                    object_dynamic_cast(OBJECT(sbus->qbus.parent),
 +                                        TYPE_VIRTIO_DEVICE);
 +                if (vdev) {
 +                    ccw_dev = (CcwDevice *)
 +                        object_dynamic_cast(OBJECT(qdev_get_parent_bus(DEVICE(vdev))->parent),
 +                                            TYPE_CCW_DEVICE);
 +                    if (ccw_dev) {
 +                        tmp_dt = CCW_DEVTYPE_SCSI;
 +                    }
 +                }
             }
         }
     }
 -- 
 2.37.3
--- a/SOURCES/kvm-s390x-pci-RPCIT-second-pass-when-mappings-exhausted.patch
+++ b/SOURCES/kvm-s390x-pci-RPCIT-second-pass-when-mappings-exhausted.patch
@ -0,0 +1,114 @@
 From 2f0febd6813c4ad7f52e43afb3ecce7aef3557e6 Mon Sep 17 00:00:00 2001
 From: Matthew Rosato <mjrosato@linux.ibm.com>
 Date: Fri, 28 Oct 2022 15:47:56 -0400
 Subject: [PATCH 08/11] s390x/pci: RPCIT second pass when mappings exhausted
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Cédric Le Goater <clg@redhat.com>
 RH-MergeRequest: 250: s390x/pci: reset ISM passthrough devices on shutdown and system reset
 RH-Bugzilla: 2163713
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [1/4] 0b4500b9247725b1ef0b290bb85392300a618cac
 If we encounter a new mapping while the number of available DMA entries
 in vfio is 0, we are currently skipping that mapping which is a problem
 if we manage to free up DMA space after that within the same RPCIT --
 we will return to the guest with CC0 and have not mapped everything
 within the specified range.  This issue was uncovered while testing
 changes to the s390 linux kernel iommu/dma code, where a different
 usage pattern was employed (new mappings start at the end of the
 aperture and work back towards the front, making us far more likely
 to encounter new mappings before invalidated mappings during a
 global refresh).
 Fix this by tracking whether any mappings were skipped due to vfio
 DMA limit hitting 0; when this occurs, we still continue the range
 and unmap/map anything we can - then we must re-run the range again
 to pick up anything that was missed.  This must occur in a loop until
 all requests are satisfied (success) or we detect that we are still
 unable to complete all mappings (return ZPCI_RPCIT_ST_INSUFF_RES).
 Link: https://lore.kernel.org/linux-s390/20221019144435.369902-1-schnelle@linux.ibm.com/
 Fixes: 37fa32de70 ("s390x/pci: Honor DMA limits set by vfio")
 Reported-by: Niklas Schnelle <schnelle@linux.ibm.com>
 Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
 Message-Id: <20221028194758.204007-2-mjrosato@linux.ibm.com>
 Reviewed-by: Eric Farman <farman@linux.ibm.com>
 Signed-off-by: Thomas Huth <thuth@redhat.com>
 (cherry picked from commit 4a8d21ba50fc8625c3bd51dab903872952f95718)
 Signed-off-by: Cédric Le Goater <clg@redhat.com>
 ---
 hw/s390x/s390-pci-inst.c | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)
 diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c
 index 20a9bcc7af..7cc4bcf850 100644
 --- a/hw/s390x/s390-pci-inst.c
 +++ b/hw/s390x/s390-pci-inst.c
@@ -677,8 +677,9 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
     S390PCIBusDevice *pbdev;
     S390PCIIOMMU *iommu;
     S390IOTLBEntry entry;
 -    hwaddr start, end;
 +    hwaddr start, end, sstart;
     uint32_t dma_avail;
 +    bool again;
     if (env->psw.mask & PSW_MASK_PSTATE) {
         s390_program_interrupt(env, PGM_PRIVILEGED, ra);
@@ -691,7 +692,7 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
     }
     fh = env->regs[r1] >> 32;
 -    start = env->regs[r2];
 +    sstart = start = env->regs[r2];
     end = start + env->regs[r2 + 1];
     pbdev = s390_pci_find_dev_by_fh(s390_get_phb(), fh);
@@ -732,6 +733,9 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
         goto err;
     }
 + retry:
 +    start = sstart;
 +    again = false;
     while (start < end) {
         error = s390_guest_io_table_walk(iommu->g_iota, start, &entry);
         if (error) {
@@ -739,13 +743,24 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
         }
         start += entry.len;
 -        while (entry.iova < start && entry.iova < end &&
 -               (dma_avail > 0 || entry.perm == IOMMU_NONE)) {
 -            dma_avail = s390_pci_update_iotlb(iommu, &entry);
 -            entry.iova += TARGET_PAGE_SIZE;
 -            entry.translated_addr += TARGET_PAGE_SIZE;
 +        while (entry.iova < start && entry.iova < end) {
 +            if (dma_avail > 0 || entry.perm == IOMMU_NONE) {
 +                dma_avail = s390_pci_update_iotlb(iommu, &entry);
 +                entry.iova += TARGET_PAGE_SIZE;
 +                entry.translated_addr += TARGET_PAGE_SIZE;
 +            } else {
 +                /*
 +                 * We are unable to make a new mapping at this time, continue
 +                 * on and hopefully free up more space.  Then attempt another
 +                 * pass.
 +                 */
 +                again = true;
 +                break;
 +            }
         }
     }
 +    if (again && dma_avail > 0)
 +        goto retry;
 err:
     if (error) {
         pbdev->state = ZPCI_FS_ERROR;
 -- 
 2.37.3
--- a/SOURCES/kvm-s390x-pci-avoid-double-enable-disable-of-aif.patch
+++ b/SOURCES/kvm-s390x-pci-avoid-double-enable-disable-of-aif.patch
@ -0,0 +1,106 @@
 From 52ad0cc8a82f7a4c3581146fb4d2046898163c4e Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
 Date: Tue, 23 Jan 2024 13:59:24 +0100
 Subject: [PATCH 1/3] s390x/pci: avoid double enable/disable of aif
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Cédric Le Goater <clg@redhat.com>
 RH-MergeRequest: 349: s390x: Fix reset ordering of passthrough ISM devices
 RH-Jira: RHEL-22411
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Commit: [1/3] 450e4ca607d801bce93415994250374d70fb72f6
 JIRA: https://issues.redhat.com/browse/RHEL-22411
 commit 07b2c8e034d80ff92e202405c494d2ff80fcf848
 Author: Matthew Rosato <mjrosato@linux.ibm.com>
 Date:   Thu Jan 18 13:51:49 2024 -0500
    s390x/pci: avoid double enable/disable of aif
    Use a flag to keep track of whether AIF is currently enabled.  This can be
    used to avoid enabling/disabling AIF multiple times as well as to determine
    whether or not it should be disabled during reset processing.
    Fixes: d0bc7091c2 ("s390x/pci: enable adapter event notification for interpreted devices")
    Reported-by: Cédric Le Goater <clg@redhat.com>
    Reviewed-by: Eric Farman <farman@linux.ibm.com>
    Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
    Message-ID: <20240118185151.265329-2-mjrosato@linux.ibm.com>
    Reviewed-by: Cédric Le Goater <clg@redhat.com>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Cédric Le Goater <clg@redhat.com>
 ---
 hw/s390x/s390-pci-kvm.c         | 25 +++++++++++++++++++++++--
 include/hw/s390x/s390-pci-bus.h |  1 +
 2 files changed, 24 insertions(+), 2 deletions(-)
 diff --git a/hw/s390x/s390-pci-kvm.c b/hw/s390x/s390-pci-kvm.c
 index ff41e4106d..1ee510436c 100644
 --- a/hw/s390x/s390-pci-kvm.c
 +++ b/hw/s390x/s390-pci-kvm.c
@@ -27,6 +27,7 @@ bool s390_pci_kvm_interp_allowed(void)
 int s390_pci_kvm_aif_enable(S390PCIBusDevice *pbdev, ZpciFib *fib, bool assist)
 {
 +    int rc;
     struct kvm_s390_zpci_op args = {
         .fh = pbdev->fh,
         .op = KVM_S390_ZPCIOP_REG_AEN,
@@ -38,15 +39,35 @@ int s390_pci_kvm_aif_enable(S390PCIBusDevice *pbdev, ZpciFib *fib, bool assist)
         .u.reg_aen.flags = (assist) ? 0 : KVM_S390_ZPCIOP_REGAEN_HOST
     };
 -    return kvm_vm_ioctl(kvm_state, KVM_S390_ZPCI_OP, &args);
 +    if (pbdev->aif) {
 +        return -EINVAL;
 +    }
 +
 +    rc = kvm_vm_ioctl(kvm_state, KVM_S390_ZPCI_OP, &args);
 +    if (rc == 0) {
 +        pbdev->aif = true;
 +    }
 +
 +    return rc;
 }
 int s390_pci_kvm_aif_disable(S390PCIBusDevice *pbdev)
 {
 +    int rc;
 +
     struct kvm_s390_zpci_op args = {
         .fh = pbdev->fh,
         .op = KVM_S390_ZPCIOP_DEREG_AEN
     };
 -    return kvm_vm_ioctl(kvm_state, KVM_S390_ZPCI_OP, &args);
 +    if (!pbdev->aif) {
 +        return -EINVAL;
 +    }
 +
 +    rc = kvm_vm_ioctl(kvm_state, KVM_S390_ZPCI_OP, &args);
 +    if (rc == 0) {
 +        pbdev->aif = false;
 +    }
 +
 +    return rc;
 }
 diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h
 index e0a9f9385b..7a658f5e30 100644
 --- a/include/hw/s390x/s390-pci-bus.h
 +++ b/include/hw/s390x/s390-pci-bus.h
@@ -361,6 +361,7 @@ struct S390PCIBusDevice {
     bool unplug_requested;
     bool interp;
     bool forwarding_assist;
 +    bool aif;
     QTAILQ_ENTRY(S390PCIBusDevice) link;
 };
 -- 
 2.41.0
--- a/SOURCES/kvm-s390x-pci-coalesce-unmap-operations.patch
+++ b/SOURCES/kvm-s390x-pci-coalesce-unmap-operations.patch
@ -0,0 +1,125 @@
 From b972c5a2763a91024725c147cf1691ed8e180c7c Mon Sep 17 00:00:00 2001
 From: Matthew Rosato <mjrosato@linux.ibm.com>
 Date: Fri, 28 Oct 2022 15:47:57 -0400
 Subject: [PATCH 09/11] s390x/pci: coalesce unmap operations
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Cédric Le Goater <clg@redhat.com>
 RH-MergeRequest: 250: s390x/pci: reset ISM passthrough devices on shutdown and system reset
 RH-Bugzilla: 2163713
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [2/4] 7b5ee38eca565f5a7cbede4b9883ba3a508fb46c
 Currently, each unmapped page is handled as an individual iommu
 region notification.  Attempt to group contiguous unmap operations
 into fewer notifications to reduce overhead.
 Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
 Message-Id: <20221028194758.204007-3-mjrosato@linux.ibm.com>
 Reviewed-by: Eric Farman <farman@linux.ibm.com>
 Signed-off-by: Thomas Huth <thuth@redhat.com>
 (cherry picked from commit ef536007c3301bbd6a787e4c2210ea289adaa6f0)
 Signed-off-by: Cédric Le Goater <clg@redhat.com>
 ---
 hw/s390x/s390-pci-inst.c | 51 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c
 index 7cc4bcf850..66e764f901 100644
 --- a/hw/s390x/s390-pci-inst.c
 +++ b/hw/s390x/s390-pci-inst.c
@@ -640,6 +640,8 @@ static uint32_t s390_pci_update_iotlb(S390PCIIOMMU *iommu,
         }
         g_hash_table_remove(iommu->iotlb, &entry->iova);
         inc_dma_avail(iommu);
 +        /* Don't notify the iommu yet, maybe we can bundle contiguous unmaps */
 +        goto out;
     } else {
         if (cache) {
             if (cache->perm == entry->perm &&
@@ -663,15 +665,44 @@ static uint32_t s390_pci_update_iotlb(S390PCIIOMMU *iommu,
         dec_dma_avail(iommu);
     }
 +    /*
 +     * All associated iotlb entries have already been cleared, trigger the
 +     * unmaps.
 +     */
     memory_region_notify_iommu(&iommu->iommu_mr, 0, event);
 out:
     return iommu->dma_limit ? iommu->dma_limit->avail : 1;
 }
 +static void s390_pci_batch_unmap(S390PCIIOMMU *iommu, uint64_t iova,
 +                                 uint64_t len)
 +{
 +    uint64_t remain = len, start = iova, end = start + len - 1, mask, size;
 +    IOMMUTLBEvent event = {
 +        .type = IOMMU_NOTIFIER_UNMAP,
 +        .entry = {
 +            .target_as = &address_space_memory,
 +            .translated_addr = 0,
 +            .perm = IOMMU_NONE,
 +        },
 +    };
 +
 +    while (remain >= TARGET_PAGE_SIZE) {
 +        mask = dma_aligned_pow2_mask(start, end, 64);
 +        size = mask + 1;
 +        event.entry.iova = start;
 +        event.entry.addr_mask = mask;
 +        memory_region_notify_iommu(&iommu->iommu_mr, 0, event);
 +        start += size;
 +        remain -= size;
 +    }
 +}
 +
 int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
 {
     CPUS390XState *env = &cpu->env;
 +    uint64_t iova, coalesce = 0;
     uint32_t fh;
     uint16_t error = 0;
     S390PCIBusDevice *pbdev;
@@ -742,6 +773,21 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
             break;
         }
 +        /*
 +         * If this is an unmap of a PTE, let's try to coalesce multiple unmaps
 +         * into as few notifier events as possible.
 +         */
 +        if (entry.perm == IOMMU_NONE && entry.len == TARGET_PAGE_SIZE) {
 +            if (coalesce == 0) {
 +                iova = entry.iova;
 +            }
 +            coalesce += entry.len;
 +        } else if (coalesce > 0) {
 +            /* Unleash the coalesced unmap before processing a new map */
 +            s390_pci_batch_unmap(iommu, iova, coalesce);
 +            coalesce = 0;
 +        }
 +
         start += entry.len;
         while (entry.iova < start && entry.iova < end) {
             if (dma_avail > 0 || entry.perm == IOMMU_NONE) {
@@ -759,6 +805,11 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
             }
         }
     }
 +    if (coalesce) {
 +            /* Unleash the coalesced unmap before finishing rpcit */
 +            s390_pci_batch_unmap(iommu, iova, coalesce);
 +            coalesce = 0;
 +    }
     if (again && dma_avail > 0)
         goto retry;
 err:
 -- 
 2.37.3
--- a/SOURCES/kvm-s390x-pci-drive-ISM-reset-from-subsystem-reset.patch
+++ b/SOURCES/kvm-s390x-pci-drive-ISM-reset-from-subsystem-reset.patch
@ -0,0 +1,137 @@
 From dda71c431be22772f3241af45b62737c988e85d4 Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
 Date: Tue, 23 Jan 2024 13:59:24 +0100
 Subject: [PATCH 3/3] s390x/pci: drive ISM reset from subsystem reset
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Cédric Le Goater <clg@redhat.com>
 RH-MergeRequest: 349: s390x: Fix reset ordering of passthrough ISM devices
 RH-Jira: RHEL-22411
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Commit: [3/3] 42e89595dd5e24538a2d3f075391b4534497eece
 JIRA: https://issues.redhat.com/browse/RHEL-22411
 commit 68c691ca99a2538d6a53a70ce8a9ce06ee307ff1
 Author: Matthew Rosato <mjrosato@linux.ibm.com>
 Date:   Thu Jan 18 13:51:51 2024 -0500
    s390x/pci: drive ISM reset from subsystem reset
    ISM devices are sensitive to manipulation of the IOMMU, so the ISM device
    needs to be reset before the vfio-pci device is reset (triggering a full
    UNMAP).  In order to ensure this occurs, trigger ISM device resets from
    subsystem_reset before triggering the PCI bus reset (which will also
    trigger vfio-pci reset).  This only needs to be done for ISM devices
    which were enabled for use by the guest.
    Further, ensure that AIF is disabled as part of the reset event.
    Fixes: ef1535901a ("s390x: do a subsystem reset before the unprotect on reboot")
    Fixes: 03451953c7 ("s390x/pci: reset ISM passthrough devices on shutdown and system reset")
    Reported-by: Cédric Le Goater <clg@redhat.com>
    Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
    Message-ID: <20240118185151.265329-4-mjrosato@linux.ibm.com>
    Reviewed-by: Eric Farman <farman@linux.ibm.com>
    Reviewed-by: Cédric Le Goater <clg@redhat.com>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Cédric Le Goater <clg@redhat.com>
 ---
 hw/s390x/s390-pci-bus.c         | 26 +++++++++++++++++---------
 hw/s390x/s390-virtio-ccw.c      |  8 ++++++++
 include/hw/s390x/s390-pci-bus.h |  1 +
 3 files changed, 26 insertions(+), 9 deletions(-)
 diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
 index 2d92848b0f..a8953693b9 100644
 --- a/hw/s390x/s390-pci-bus.c
 +++ b/hw/s390x/s390-pci-bus.c
@@ -160,20 +160,12 @@ static void s390_pci_shutdown_notifier(Notifier *n, void *opaque)
     pci_device_reset(pbdev->pdev);
 }
 -static void s390_pci_reset_cb(void *opaque)
 -{
 -    S390PCIBusDevice *pbdev = opaque;
 -
 -    pci_device_reset(pbdev->pdev);
 -}
 -
 static void s390_pci_perform_unplug(S390PCIBusDevice *pbdev)
 {
     HotplugHandler *hotplug_ctrl;
     if (pbdev->pft == ZPCI_PFT_ISM) {
         notifier_remove(&pbdev->shutdown_notifier);
 -        qemu_unregister_reset(s390_pci_reset_cb, pbdev);
     }
     /* Unplug the PCI device */
@@ -1137,7 +1129,6 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
             if (pbdev->pft == ZPCI_PFT_ISM) {
                 pbdev->shutdown_notifier.notify = s390_pci_shutdown_notifier;
                 qemu_register_shutdown_notifier(&pbdev->shutdown_notifier);
 -                qemu_register_reset(s390_pci_reset_cb, pbdev);
             }
         } else {
             pbdev->fh |= FH_SHM_EMUL;
@@ -1284,6 +1275,23 @@ static void s390_pci_enumerate_bridge(PCIBus *bus, PCIDevice *pdev,
     pci_default_write_config(pdev, PCI_SUBORDINATE_BUS, s->bus_no, 1);
 }
 +void s390_pci_ism_reset(void)
 +{
 +    S390pciState *s = s390_get_phb();
 +
 +    S390PCIBusDevice *pbdev, *next;
 +
 +    /* Trigger reset event for each passthrough ISM device currently in-use */
 +    QTAILQ_FOREACH_SAFE(pbdev, &s->zpci_devs, link, next) {
 +        if (pbdev->interp && pbdev->pft == ZPCI_PFT_ISM &&
 +            pbdev->fh & FH_MASK_ENABLE) {
 +            s390_pci_kvm_aif_disable(pbdev);
 +
 +            pci_device_reset(pbdev->pdev);
 +        }
 +    }
 +}
 +
 static void s390_pcihost_reset(DeviceState *dev)
 {
     S390pciState *s = S390_PCI_HOST_BRIDGE(dev);
 diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
 index 94434c3bb1..51e5b39888 100644
 --- a/hw/s390x/s390-virtio-ccw.c
 +++ b/hw/s390x/s390-virtio-ccw.c
@@ -108,6 +108,14 @@ static void subsystem_reset(void)
     DeviceState *dev;
     int i;
 +    /*
 +     * ISM firmware is sensitive to unexpected changes to the IOMMU, which can
 +     * occur during reset of the vfio-pci device (unmap of entire aperture).
 +     * Ensure any passthrough ISM devices are reset now, while CPUs are paused
 +     * but before vfio-pci cleanup occurs.
 +     */
 +    s390_pci_ism_reset();
 +
     for (i = 0; i < ARRAY_SIZE(reset_dev_types); i++) {
         dev = DEVICE(object_resolve_path_type("", reset_dev_types[i], NULL));
         if (dev) {
 diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h
 index 7a658f5e30..2bfad5563a 100644
 --- a/include/hw/s390x/s390-pci-bus.h
 +++ b/include/hw/s390x/s390-pci-bus.h
@@ -401,5 +401,6 @@ S390PCIBusDevice *s390_pci_find_dev_by_target(S390pciState *s,
                                               const char *target);
 S390PCIBusDevice *s390_pci_find_next_avail_dev(S390pciState *s,
                                                S390PCIBusDevice *pbdev);
 +void s390_pci_ism_reset(void);
 #endif
 -- 
 2.41.0
--- a/SOURCES/kvm-s390x-pci-refresh-fh-before-disabling-aif.patch
+++ b/SOURCES/kvm-s390x-pci-refresh-fh-before-disabling-aif.patch
@ -0,0 +1,71 @@
 From fe70e87ef8d2f7e538867052e06012051919083f Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
 Date: Tue, 23 Jan 2024 13:59:24 +0100
 Subject: [PATCH 2/3] s390x/pci: refresh fh before disabling aif
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Cédric Le Goater <clg@redhat.com>
 RH-MergeRequest: 349: s390x: Fix reset ordering of passthrough ISM devices
 RH-Jira: RHEL-22411
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Commit: [2/3] 4a7d3fccdac508253bd7e5765973a08482022edb
 JIRA: https://issues.redhat.com/browse/RHEL-22411
 commit 30e35258e25c75c9d799c34fd89afcafffb37084
 Author: Matthew Rosato <mjrosato@linux.ibm.com>
 Date:   Thu Jan 18 13:51:50 2024 -0500
    s390x/pci: refresh fh before disabling aif
    Typically we refresh the host fh during CLP enable, however it's possible
    that the device goes through multiple reset events before the guest
    performs another CLP enable.  Let's handle this for now by refreshing the
    host handle from vfio before disabling aif.
    Fixes: 03451953c7 ("s390x/pci: reset ISM passthrough devices on shutdown and system reset")
    Reported-by: Cédric Le Goater <clg@redhat.com>
    Reviewed-by: Eric Farman <farman@linux.ibm.com>
    Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
    Message-ID: <20240118185151.265329-3-mjrosato@linux.ibm.com>
    Reviewed-by: Cédric Le Goater <clg@redhat.com>
    Signed-off-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Cédric Le Goater <clg@redhat.com>
 ---
 hw/s390x/s390-pci-kvm.c | 9 +++++++++
 1 file changed, 9 insertions(+)
 diff --git a/hw/s390x/s390-pci-kvm.c b/hw/s390x/s390-pci-kvm.c
 index 1ee510436c..9eef4fc3ec 100644
 --- a/hw/s390x/s390-pci-kvm.c
 +++ b/hw/s390x/s390-pci-kvm.c
@@ -18,6 +18,7 @@
 #include "hw/s390x/s390-pci-bus.h"
 #include "hw/s390x/s390-pci-kvm.h"
 #include "hw/s390x/s390-pci-inst.h"
 +#include "hw/s390x/s390-pci-vfio.h"
 #include "cpu_models.h"
 bool s390_pci_kvm_interp_allowed(void)
@@ -64,6 +65,14 @@ int s390_pci_kvm_aif_disable(S390PCIBusDevice *pbdev)
         return -EINVAL;
     }
 +    /*
 +     * The device may have already been reset but we still want to relinquish
 +     * the guest ISC, so always be sure to use an up-to-date host fh.
 +     */
 +    if (!s390_pci_get_host_fh(pbdev, &args.fh)) {
 +        return -EPERM;
 +    }
 +
     rc = kvm_vm_ioctl(kvm_state, KVM_S390_ZPCI_OP, &args);
     if (rc == 0) {
         pbdev->aif = false;
 -- 
 2.41.0
--- a/SOURCES/kvm-s390x-pci-reset-ISM-passthrough-devices-on-shutdown-.patch
+++ b/SOURCES/kvm-s390x-pci-reset-ISM-passthrough-devices-on-shutdown-.patch
@ -0,0 +1,147 @@
 From 9ec96a236be84e34b16681e658d3910fc3877a44 Mon Sep 17 00:00:00 2001
 From: Matthew Rosato <mjrosato@linux.ibm.com>
 Date: Fri, 9 Dec 2022 14:57:00 -0500
 Subject: [PATCH 11/11] s390x/pci: reset ISM passthrough devices on shutdown
 and system reset
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Cédric Le Goater <clg@redhat.com>
 RH-MergeRequest: 250: s390x/pci: reset ISM passthrough devices on shutdown and system reset
 RH-Bugzilla: 2163713
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [4/4] c857d022c7c2f43cdeb66c4f6acfd9272c925b35
 ISM device firmware stores unique state information that can
 cause a wholesale unmap of the associated IOMMU (e.g. when
 we get a termination signal for QEMU) to trigger firmware errors
 because firmware believes we are attempting to invalidate entries
 that are still in-use by the guest OS (when in fact that guest is
 in the process of being terminated or rebooted).
 To alleviate this, register both a shutdown notifier (for unexpected
 termination cases e.g. virsh destroy) as well as a reset callback
 (for cases like guest OS reboot).  For each of these scenarios, trigger
 PCI device reset; this is enough to indicate to firmware that the IOMMU
 is no longer in-use by the guest OS, making it safe to invalidate any
 associated IOMMU entries.
 Fixes: 15d0e7942d3b ("s390x/pci: don't fence interpreted devices without MSI-X")
 Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
 Message-Id: <20221209195700.263824-1-mjrosato@linux.ibm.com>
 Reviewed-by: Eric Farman <farman@linux.ibm.com>
 [thuth: Adjusted the hunk in s390-pci-vfio.c due to different context]
 Signed-off-by: Thomas Huth <thuth@redhat.com>
 (cherry picked from commit 03451953c79e6b31f7860ee0c35b28e181d573c1)
 Signed-off-by: Cédric Le Goater <clg@redhat.com>
 ---
 hw/s390x/s390-pci-bus.c         | 28 ++++++++++++++++++++++++++++
 hw/s390x/s390-pci-vfio.c        |  2 ++
 include/hw/s390x/s390-pci-bus.h |  5 +++++
 3 files changed, 35 insertions(+)
 diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
 index d8b1e44a02..2d92848b0f 100644
 --- a/hw/s390x/s390-pci-bus.c
 +++ b/hw/s390x/s390-pci-bus.c
@@ -24,6 +24,8 @@
 #include "hw/pci/msi.h"
 #include "qemu/error-report.h"
 #include "qemu/module.h"
 +#include "sysemu/reset.h"
 +#include "sysemu/runstate.h"
 #ifndef DEBUG_S390PCI_BUS
 #define DEBUG_S390PCI_BUS  0
@@ -150,10 +152,30 @@ out:
     psccb->header.response_code = cpu_to_be16(rc);
 }
 +static void s390_pci_shutdown_notifier(Notifier *n, void *opaque)
 +{
 +    S390PCIBusDevice *pbdev = container_of(n, S390PCIBusDevice,
 +                                           shutdown_notifier);
 +
 +    pci_device_reset(pbdev->pdev);
 +}
 +
 +static void s390_pci_reset_cb(void *opaque)
 +{
 +    S390PCIBusDevice *pbdev = opaque;
 +
 +    pci_device_reset(pbdev->pdev);
 +}
 +
 static void s390_pci_perform_unplug(S390PCIBusDevice *pbdev)
 {
     HotplugHandler *hotplug_ctrl;
 +    if (pbdev->pft == ZPCI_PFT_ISM) {
 +        notifier_remove(&pbdev->shutdown_notifier);
 +        qemu_unregister_reset(s390_pci_reset_cb, pbdev);
 +    }
 +
     /* Unplug the PCI device */
     if (pbdev->pdev) {
         DeviceState *pdev = DEVICE(pbdev->pdev);
@@ -1111,6 +1133,12 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
                 pbdev->fh |= FH_SHM_VFIO;
                 pbdev->forwarding_assist = false;
             }
 +            /* Register shutdown notifier and reset callback for ISM devices */
 +            if (pbdev->pft == ZPCI_PFT_ISM) {
 +                pbdev->shutdown_notifier.notify = s390_pci_shutdown_notifier;
 +                qemu_register_shutdown_notifier(&pbdev->shutdown_notifier);
 +                qemu_register_reset(s390_pci_reset_cb, pbdev);
 +            }
         } else {
             pbdev->fh |= FH_SHM_EMUL;
             /* Always intercept emulated devices */
 diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
 index 99806e2a84..69af35f4fe 100644
 --- a/hw/s390x/s390-pci-vfio.c
 +++ b/hw/s390x/s390-pci-vfio.c
@@ -124,6 +124,8 @@ static void s390_pci_read_base(S390PCIBusDevice *pbdev,
     /* The following values remain 0 until we support other FMB formats */
     pbdev->zpci_fn.fmbl = 0;
     pbdev->zpci_fn.pft = 0;
 +    /* Store function type separately for type-specific behavior */
 +    pbdev->pft = cap->pft;
     /*
      * If appropriate, reduce the size of the supported DMA aperture reported
 diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h
 index 1c46e3a269..e0a9f9385b 100644
 --- a/include/hw/s390x/s390-pci-bus.h
 +++ b/include/hw/s390x/s390-pci-bus.h
@@ -39,6 +39,9 @@
 #define UID_CHECKING_ENABLED 0x01
 #define ZPCI_DTSM 0x40
 +/* zPCI Function Types */
 +#define ZPCI_PFT_ISM 5
 +
 OBJECT_DECLARE_SIMPLE_TYPE(S390pciState, S390_PCI_HOST_BRIDGE)
 OBJECT_DECLARE_SIMPLE_TYPE(S390PCIBus, S390_PCI_BUS)
 OBJECT_DECLARE_SIMPLE_TYPE(S390PCIBusDevice, S390_PCI_DEVICE)
@@ -344,6 +347,7 @@ struct S390PCIBusDevice {
     uint16_t noi;
     uint16_t maxstbl;
     uint8_t sum;
 +    uint8_t pft;
     S390PCIGroup *pci_group;
     ClpRspQueryPci zpci_fn;
     S390MsixInfo msix;
@@ -352,6 +356,7 @@ struct S390PCIBusDevice {
     MemoryRegion msix_notify_mr;
     IndAddr *summary_ind;
     IndAddr *indicator;
 +    Notifier shutdown_notifier;
     bool pci_unplug_request_processed;
     bool unplug_requested;
     bool interp;
 -- 
 2.37.3
--- a/SOURCES/kvm-s390x-pci-shrink-DMA-aperture-to-be-bound-by-vfio-DM.patch
+++ b/SOURCES/kvm-s390x-pci-shrink-DMA-aperture-to-be-bound-by-vfio-DM.patch
@ -0,0 +1,91 @@
 From a0b6c21b555566eb6bc38643269d14c82dfd0226 Mon Sep 17 00:00:00 2001
 From: Matthew Rosato <mjrosato@linux.ibm.com>
 Date: Fri, 28 Oct 2022 15:47:58 -0400
 Subject: [PATCH 10/11] s390x/pci: shrink DMA aperture to be bound by vfio DMA
 limit
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Cédric Le Goater <clg@redhat.com>
 RH-MergeRequest: 250: s390x/pci: reset ISM passthrough devices on shutdown and system reset
 RH-Bugzilla: 2163713
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
 RH-Commit: [3/4] aa241dd250ad5e696b67c87dddc31ee5aaee9c0e
 Currently, s390x-pci performs accounting against the vfio DMA
 limit and triggers the guest to clean up mappings when the limit
 is reached. Let's go a step further and also limit the size of
 the supported DMA aperture reported to the guest based upon the
 initial vfio DMA limit reported for the container (if less than
 the size reported by the firmware/host zPCI layer).  This
 avoids processing sections of the guest DMA table during global
 refresh that, for common use cases, will never be used anyway, and
 makes exhausting the vfio DMA limit due to mismatch between guest
 aperture size and host limit far less likely and more indicative
 of an error.
 Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
 Message-Id: <20221028194758.204007-4-mjrosato@linux.ibm.com>
 Reviewed-by: Eric Farman <farman@linux.ibm.com>
 Signed-off-by: Thomas Huth <thuth@redhat.com>
 (cherry picked from commit df202e3ff3fccb49868e08f20d0bda86cb953fbe)
 Signed-off-by: Cédric Le Goater <clg@redhat.com>
 ---
 hw/s390x/s390-pci-vfio.c        | 11 +++++++++++
 include/hw/s390x/s390-pci-bus.h |  1 +
 2 files changed, 12 insertions(+)
 diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
 index 2aefa508a0..99806e2a84 100644
 --- a/hw/s390x/s390-pci-vfio.c
 +++ b/hw/s390x/s390-pci-vfio.c
@@ -84,6 +84,7 @@ S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s,
     cnt->users = 1;
     cnt->avail = avail;
     QTAILQ_INSERT_TAIL(&s->zpci_dma_limit, cnt, link);
 +    pbdev->iommu->max_dma_limit = avail;
     return cnt;
 }
@@ -103,6 +104,7 @@ static void s390_pci_read_base(S390PCIBusDevice *pbdev,
     struct vfio_info_cap_header *hdr;
     struct vfio_device_info_cap_zpci_base *cap;
     VFIOPCIDevice *vpci =  container_of(pbdev->pdev, VFIOPCIDevice, pdev);
 +    uint64_t vfio_size;
     hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_BASE);
@@ -122,6 +124,15 @@ static void s390_pci_read_base(S390PCIBusDevice *pbdev,
     /* The following values remain 0 until we support other FMB formats */
     pbdev->zpci_fn.fmbl = 0;
     pbdev->zpci_fn.pft = 0;
 +
 +    /*
 +     * If appropriate, reduce the size of the supported DMA aperture reported
 +     * to the guest based upon the vfio DMA limit.
 +     */
 +    vfio_size = pbdev->iommu->max_dma_limit << TARGET_PAGE_BITS;
 +    if (vfio_size < (cap->end_dma - cap->start_dma + 1)) {
 +        pbdev->zpci_fn.edma = cap->start_dma + vfio_size - 1;
 +    }
 }
 static bool get_host_fh(S390PCIBusDevice *pbdev, struct vfio_device_info *info,
 diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h
 index 0605fcea24..1c46e3a269 100644
 --- a/include/hw/s390x/s390-pci-bus.h
 +++ b/include/hw/s390x/s390-pci-bus.h
@@ -278,6 +278,7 @@ struct S390PCIIOMMU {
     uint64_t g_iota;
     uint64_t pba;
     uint64_t pal;
 +    uint64_t max_dma_limit;
     GHashTable *iotlb;
     S390PCIDMACount *dma_limit;
 };
 -- 
 2.37.3
--- a/SOURCES/kvm-s390x-pv-Implement-a-CGS-check-helper.patch
+++ b/SOURCES/kvm-s390x-pv-Implement-a-CGS-check-helper.patch
@ -0,0 +1,109 @@
 From 2fc8489b70445a3db0a2e72c1f1edb4d61d404d6 Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@redhat.com>
 Date: Mon, 16 Jan 2023 18:46:05 +0100
 Subject: [PATCH] s390x/pv: Implement a CGS check helper
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 RH-Author: Cédric Le Goater <clg@redhat.com>
 RH-MergeRequest: 271: Secure guest can't boot with maximal number of vcpus (248)
 RH-Bugzilla: 2187159
 RH-Acked-by: Thomas Huth <thuth@redhat.com>
 RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
 RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
 RH-Commit: [1/1] c870d525c48ab6d0df964b5abe48efe2528c9883
 When a protected VM is started with the maximum number of CPUs (248),
 the service call providing information on the CPUs requires more
 buffer space than allocated and QEMU disgracefully aborts :
    LOADPARM=[........]
    Using virtio-blk.
    Using SCSI scheme.
    ...................................................................................
    qemu-system-s390x: KVM_S390_MEM_OP failed: Argument list too long
 When protected virtualization is initialized, compute the maximum
 number of vCPUs supported by the machine and return useful information
 to the user before the machine starts in case of error.
 Suggested-by: Thomas Huth <thuth@redhat.com>
 Reviewed-by: Thomas Huth <thuth@redhat.com>
 Signed-off-by: Cédric Le Goater <clg@redhat.com>
 Message-Id: <20230116174607.2459498-2-clg@kaod.org>
 Signed-off-by: Thomas Huth <thuth@redhat.com>
 (cherry picked from commit 75d7150c636569f6687f7e70a33be893be43eb5f)
 Signed-off-by: Cédric Le Goater <clg@redhat.com>
 ---
 hw/s390x/pv.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c
 index 728ba24547..749e5db1ce 100644
 --- a/hw/s390x/pv.c
 +++ b/hw/s390x/pv.c
@@ -20,6 +20,7 @@
 #include "exec/confidential-guest-support.h"
 #include "hw/s390x/ipl.h"
 #include "hw/s390x/pv.h"
 +#include "hw/s390x/sclp.h"
 #include "target/s390x/kvm/kvm_s390x.h"
 static bool info_valid;
@@ -249,6 +250,41 @@ struct S390PVGuestClass {
     ConfidentialGuestSupportClass parent_class;
 };
 +/*
 + * If protected virtualization is enabled, the amount of data that the
 + * Read SCP Info Service Call can use is limited to one page. The
 + * available space also depends on the Extended-Length SCCB (ELS)
 + * feature which can take more buffer space to store feature
 + * information. This impacts the maximum number of CPUs supported in
 + * the machine.
 + */
 +static uint32_t s390_pv_get_max_cpus(void)
 +{
 +    int offset_cpu = s390_has_feat(S390_FEAT_EXTENDED_LENGTH_SCCB) ?
 +        offsetof(ReadInfo, entries) : SCLP_READ_SCP_INFO_FIXED_CPU_OFFSET;
 +
 +    return (TARGET_PAGE_SIZE - offset_cpu) / sizeof(CPUEntry);
 +}
 +
 +static bool s390_pv_check_cpus(Error **errp)
 +{
 +    MachineState *ms = MACHINE(qdev_get_machine());
 +    uint32_t pv_max_cpus = s390_pv_get_max_cpus();
 +
 +    if (ms->smp.max_cpus > pv_max_cpus) {
 +        error_setg(errp, "Protected VMs support a maximum of %d CPUs",
 +                   pv_max_cpus);
 +        return false;
 +    }
 +
 +    return true;
 +}
 +
 +static bool s390_pv_guest_check(ConfidentialGuestSupport *cgs, Error **errp)
 +{
 +    return s390_pv_check_cpus(errp);
 +}
 +
 int s390_pv_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
 {
     if (!object_dynamic_cast(OBJECT(cgs), TYPE_S390_PV_GUEST)) {
@@ -261,6 +297,10 @@ int s390_pv_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
         return -1;
     }
 +    if (!s390_pv_guest_check(cgs, errp)) {
 +        return -1;
 +    }
 +
     cgs->ready = true;
     return 0;
 -- 
 2.39.1
--- a/Show More
+++ b/Show More