- Add DMABUF support [VOYAGER-19 VOYAGER-53] - Accelerated SMMU device for GH GPU passthrough [VOYAGER-5 VOYAGER-16 VOYAGER-17 VOYAGER-48] - Resolves: VOYAGER-5 (Backport CMDQV support) - Resolves: VOYAGER-16 (Backport HW accelerated nesting support for arm SMMUv3) - Resolves: VOYAGER-17 (Backport vEVENTQ support for smmuv3) - Resolves: VOYAGER-19 (Backport vfio: Add DMABUF support for PCI BAR regions - qemu-kvm) - Resolves: VOYAGER-48 (qemu-kvm coredump when using traditional smmuv3 device without any GPU device) - Resolves: VOYAGER-53 (qemu-kvm coredump when hotunplug NIC VF interface without smmuv3)
416 lines
14 KiB
Diff
416 lines
14 KiB
Diff
From fe3b723b9ed23f89e86af5f0bc227a5535a09681 Mon Sep 17 00:00:00 2001
|
|
From: Steve Sistare <steven.sistare@oracle.com>
|
|
Date: Wed, 1 Oct 2025 08:33:58 -0700
|
|
Subject: [PATCH] migration: cpr-exec mode
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
RH-Author: Cédric Le Goater <clg@redhat.com>
|
|
RH-MergeRequest: 458: Add DMABUF support
|
|
RH-Jira: VOYAGER-19 VOYAGER-53
|
|
RH-Acked-by: Eric Auger <eric.auger@redhat.com>
|
|
RH-Commit: [1/101] 7aeb2d01b3 (clegoate/qemu-kvm-centos)
|
|
|
|
Add the cpr-exec migration mode. Usage:
|
|
qemu-system-$arch -machine aux-ram-share=on ...
|
|
migrate_set_parameter mode cpr-exec
|
|
migrate_set_parameter cpr-exec-command \
|
|
<arg1> <arg2> ... -incoming <uri-1>
|
|
migrate -d <uri-1>
|
|
|
|
The migrate command stops the VM, saves state to uri-1,
|
|
directly exec's a new version of QEMU on the same host,
|
|
replacing the original process while retaining its PID, and
|
|
loads state from uri-1. Guest RAM is preserved in place,
|
|
albeit with new virtual addresses.
|
|
|
|
The new QEMU process is started by exec'ing the command
|
|
specified by the @cpr-exec-command parameter. The first word of
|
|
the command is the binary, and the remaining words are its
|
|
arguments. The command may be a direct invocation of new QEMU,
|
|
or may be a non-QEMU command that exec's the new QEMU binary.
|
|
|
|
This mode creates a second migration channel that is not visible
|
|
to the user. At the start of migration, old QEMU saves CPR state
|
|
to the second channel, and at the end of migration, it tells the
|
|
main loop to call cpr_exec. New QEMU loads CPR state early, before
|
|
objects are created.
|
|
|
|
Because old QEMU terminates when new QEMU starts, one cannot
|
|
stream data between the two, so uri-1 must be a type,
|
|
such as a file, that accepts all data before old QEMU exits.
|
|
Otherwise, old QEMU may quietly block writing to the channel.
|
|
|
|
Memory-backend objects must have the share=on attribute, but
|
|
memory-backend-epc is not supported. The VM must be started with
|
|
the '-machine aux-ram-share=on' option, which allows anonymous
|
|
memory to be transferred in place to the new process. The memfds
|
|
are kept open across exec by clearing the close-on-exec flag, their
|
|
values are saved in CPR state, and they are mmap'd in new QEMU.
|
|
|
|
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
|
|
Acked-by: Markus Armbruster <armbru@redhat.com>
|
|
Link: https://lore.kernel.org/r/1759332851-370353-7-git-send-email-steven.sistare@oracle.com
|
|
Signed-off-by: Peter Xu <peterx@redhat.com>
|
|
---
|
|
include/migration/cpr.h | 2 +
|
|
migration/cpr-exec.c | 95 +++++++++++++++++++++++++++++++++++++++
|
|
migration/cpr.c | 23 +++++++++-
|
|
migration/migration.c | 10 ++++-
|
|
migration/ram.c | 1 +
|
|
migration/trace-events | 1 +
|
|
migration/vmstate-types.c | 8 ++++
|
|
qapi/migration.json | 25 ++++++++++-
|
|
system/vl.c | 4 +-
|
|
9 files changed, 164 insertions(+), 5 deletions(-)
|
|
|
|
diff --git a/include/migration/cpr.h b/include/migration/cpr.h
|
|
index b84389ff04..a412d6663c 100644
|
|
--- a/include/migration/cpr.h
|
|
+++ b/include/migration/cpr.h
|
|
@@ -53,9 +53,11 @@ int cpr_get_fd_param(const char *name, const char *fdname, int index,
|
|
QEMUFile *cpr_transfer_output(MigrationChannel *channel, Error **errp);
|
|
QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp);
|
|
|
|
+void cpr_exec_init(void);
|
|
QEMUFile *cpr_exec_output(Error **errp);
|
|
QEMUFile *cpr_exec_input(Error **errp);
|
|
void cpr_exec_persist_state(QEMUFile *f);
|
|
bool cpr_exec_has_state(void);
|
|
void cpr_exec_unpersist_state(void);
|
|
+void cpr_exec_unpreserve_fds(void);
|
|
#endif
|
|
diff --git a/migration/cpr-exec.c b/migration/cpr-exec.c
|
|
index 81d84425e1..d57714bc5d 100644
|
|
--- a/migration/cpr-exec.c
|
|
+++ b/migration/cpr-exec.c
|
|
@@ -6,15 +6,21 @@
|
|
|
|
#include "qemu/osdep.h"
|
|
#include "qemu/cutils.h"
|
|
+#include "qemu/error-report.h"
|
|
#include "qemu/memfd.h"
|
|
#include "qapi/error.h"
|
|
+#include "qapi/type-helpers.h"
|
|
#include "io/channel-file.h"
|
|
#include "io/channel-socket.h"
|
|
+#include "block/block-global-state.h"
|
|
+#include "qemu/main-loop.h"
|
|
#include "migration/cpr.h"
|
|
#include "migration/qemu-file.h"
|
|
+#include "migration/migration.h"
|
|
#include "migration/misc.h"
|
|
#include "migration/vmstate.h"
|
|
#include "system/runstate.h"
|
|
+#include "trace.h"
|
|
|
|
#define CPR_EXEC_STATE_NAME "QEMU_CPR_EXEC_STATE"
|
|
|
|
@@ -97,3 +103,92 @@ QEMUFile *cpr_exec_input(Error **errp)
|
|
lseek(mfd, 0, SEEK_SET);
|
|
return qemu_file_new_fd_input(mfd, CPR_EXEC_STATE_NAME);
|
|
}
|
|
+
|
|
+static bool preserve_fd(int fd)
|
|
+{
|
|
+ qemu_clear_cloexec(fd);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static bool unpreserve_fd(int fd)
|
|
+{
|
|
+ qemu_set_cloexec(fd);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static void cpr_exec_preserve_fds(void)
|
|
+{
|
|
+ cpr_walk_fd(preserve_fd);
|
|
+}
|
|
+
|
|
+void cpr_exec_unpreserve_fds(void)
|
|
+{
|
|
+ cpr_walk_fd(unpreserve_fd);
|
|
+}
|
|
+
|
|
+static void cpr_exec_cb(void *opaque)
|
|
+{
|
|
+ MigrationState *s = migrate_get_current();
|
|
+ char **argv = strv_from_str_list(s->parameters.cpr_exec_command);
|
|
+ Error *err = NULL;
|
|
+
|
|
+ /*
|
|
+ * Clear the close-on-exec flag for all preserved fd's. We cannot do so
|
|
+ * earlier because they should not persist across miscellaneous fork and
|
|
+ * exec calls that are performed during normal operation.
|
|
+ */
|
|
+ cpr_exec_preserve_fds();
|
|
+
|
|
+ trace_cpr_exec();
|
|
+ execvp(argv[0], argv);
|
|
+
|
|
+ /*
|
|
+ * exec only fails if argv[0] is bogus, has a permissions problem, or the
|
|
+ * system is short on resources. Capture errno and argv[0] before freeing.
|
|
+ */
|
|
+ error_setg_errno(&err, errno, "execvp %s failed", argv[0]);
|
|
+ g_strfreev(argv);
|
|
+ cpr_exec_unpreserve_fds();
|
|
+
|
|
+ error_report_err(error_copy(err));
|
|
+ migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
|
|
+ migrate_set_error(s, err);
|
|
+
|
|
+ /* Note, we can go from state COMPLETED to FAILED */
|
|
+ migration_call_notifiers(s, MIG_EVENT_PRECOPY_FAILED, NULL);
|
|
+
|
|
+ err = NULL;
|
|
+ if (!migration_block_activate(&err)) {
|
|
+ /* error was already reported */
|
|
+ error_free(err);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (runstate_is_live(s->vm_old_state)) {
|
|
+ vm_start();
|
|
+ }
|
|
+}
|
|
+
|
|
+static int cpr_exec_notifier(NotifierWithReturn *notifier, MigrationEvent *e,
|
|
+ Error **errp)
|
|
+{
|
|
+ MigrationState *s = migrate_get_current();
|
|
+
|
|
+ if (e->type == MIG_EVENT_PRECOPY_DONE) {
|
|
+ QEMUBH *cpr_exec_bh = qemu_bh_new(cpr_exec_cb, NULL);
|
|
+ assert(s->state == MIGRATION_STATUS_COMPLETED);
|
|
+ qemu_bh_schedule(cpr_exec_bh);
|
|
+ qemu_notify_event();
|
|
+ } else if (e->type == MIG_EVENT_PRECOPY_FAILED) {
|
|
+ cpr_exec_unpersist_state();
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void cpr_exec_init(void)
|
|
+{
|
|
+ static NotifierWithReturn exec_notifier;
|
|
+
|
|
+ migration_add_notifier_mode(&exec_notifier, cpr_exec_notifier,
|
|
+ MIG_MODE_CPR_EXEC);
|
|
+}
|
|
diff --git a/migration/cpr.c b/migration/cpr.c
|
|
index a995b349d9..0b87c2343f 100644
|
|
--- a/migration/cpr.c
|
|
+++ b/migration/cpr.c
|
|
@@ -6,6 +6,7 @@
|
|
*/
|
|
|
|
#include "qemu/osdep.h"
|
|
+#include "qemu/error-report.h"
|
|
#include "qapi/error.h"
|
|
#include "hw/vfio/vfio-device.h"
|
|
#include "migration/cpr.h"
|
|
@@ -185,6 +186,8 @@ int cpr_state_save(MigrationChannel *channel, Error **errp)
|
|
if (mode == MIG_MODE_CPR_TRANSFER) {
|
|
g_assert(channel);
|
|
f = cpr_transfer_output(channel, errp);
|
|
+ } else if (mode == MIG_MODE_CPR_EXEC) {
|
|
+ f = cpr_exec_output(errp);
|
|
} else {
|
|
return 0;
|
|
}
|
|
@@ -201,6 +204,10 @@ int cpr_state_save(MigrationChannel *channel, Error **errp)
|
|
return ret;
|
|
}
|
|
|
|
+ if (migrate_mode() == MIG_MODE_CPR_EXEC) {
|
|
+ cpr_exec_persist_state(f);
|
|
+ }
|
|
+
|
|
/*
|
|
* Close the socket only partially so we can later detect when the other
|
|
* end closes by getting a HUP event.
|
|
@@ -219,7 +226,13 @@ int cpr_state_load(MigrationChannel *channel, Error **errp)
|
|
QEMUFile *f;
|
|
MigMode mode = 0;
|
|
|
|
- if (channel) {
|
|
+ if (cpr_exec_has_state()) {
|
|
+ mode = MIG_MODE_CPR_EXEC;
|
|
+ f = cpr_exec_input(errp);
|
|
+ if (channel) {
|
|
+ warn_report("ignoring cpr channel for migration mode cpr-exec");
|
|
+ }
|
|
+ } else if (channel) {
|
|
mode = MIG_MODE_CPR_TRANSFER;
|
|
cpr_set_incoming_mode(mode);
|
|
f = cpr_transfer_input(channel, errp);
|
|
@@ -231,6 +244,7 @@ int cpr_state_load(MigrationChannel *channel, Error **errp)
|
|
}
|
|
|
|
trace_cpr_state_load(MigMode_str(mode));
|
|
+ cpr_set_incoming_mode(mode);
|
|
|
|
v = qemu_get_be32(f);
|
|
if (v != QEMU_CPR_FILE_MAGIC) {
|
|
@@ -251,6 +265,11 @@ int cpr_state_load(MigrationChannel *channel, Error **errp)
|
|
return ret;
|
|
}
|
|
|
|
+ if (migrate_mode() == MIG_MODE_CPR_EXEC) {
|
|
+ /* Set cloexec to prevent fd leaks from fork until the next cpr-exec */
|
|
+ cpr_exec_unpreserve_fds();
|
|
+ }
|
|
+
|
|
/*
|
|
* Let the caller decide when to close the socket (and generate a HUP event
|
|
* for the sending side).
|
|
@@ -271,7 +290,7 @@ void cpr_state_close(void)
|
|
bool cpr_incoming_needed(void *opaque)
|
|
{
|
|
MigMode mode = migrate_mode();
|
|
- return mode == MIG_MODE_CPR_TRANSFER;
|
|
+ return mode == MIG_MODE_CPR_TRANSFER || mode == MIG_MODE_CPR_EXEC;
|
|
}
|
|
|
|
/*
|
|
diff --git a/migration/migration.c b/migration/migration.c
|
|
index 08a98f74ef..2515bec48f 100644
|
|
--- a/migration/migration.c
|
|
+++ b/migration/migration.c
|
|
@@ -333,6 +333,7 @@ void migration_object_init(void)
|
|
|
|
ram_mig_init();
|
|
dirty_bitmap_mig_init();
|
|
+ cpr_exec_init();
|
|
|
|
/* Initialize cpu throttle timers */
|
|
cpu_throttle_init();
|
|
@@ -1796,7 +1797,8 @@ bool migrate_mode_is_cpr(MigrationState *s)
|
|
{
|
|
MigMode mode = s->parameters.mode;
|
|
return mode == MIG_MODE_CPR_REBOOT ||
|
|
- mode == MIG_MODE_CPR_TRANSFER;
|
|
+ mode == MIG_MODE_CPR_TRANSFER ||
|
|
+ mode == MIG_MODE_CPR_EXEC;
|
|
}
|
|
|
|
int migrate_init(MigrationState *s, Error **errp)
|
|
@@ -2145,6 +2147,12 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp)
|
|
return false;
|
|
}
|
|
|
|
+ if (migrate_mode() == MIG_MODE_CPR_EXEC &&
|
|
+ !s->parameters.has_cpr_exec_command) {
|
|
+ error_setg(errp, "cpr-exec mode requires setting cpr-exec-command");
|
|
+ return false;
|
|
+ }
|
|
+
|
|
if (migration_is_blocked(errp)) {
|
|
return false;
|
|
}
|
|
diff --git a/migration/ram.c b/migration/ram.c
|
|
index 7208bc114f..6730a41ff5 100644
|
|
--- a/migration/ram.c
|
|
+++ b/migration/ram.c
|
|
@@ -228,6 +228,7 @@ bool migrate_ram_is_ignored(RAMBlock *block)
|
|
MigMode mode = migrate_mode();
|
|
return !qemu_ram_is_migratable(block) ||
|
|
mode == MIG_MODE_CPR_TRANSFER ||
|
|
+ mode == MIG_MODE_CPR_EXEC ||
|
|
(migrate_ignore_shared() && qemu_ram_is_shared(block)
|
|
&& qemu_ram_is_named_file(block));
|
|
}
|
|
diff --git a/migration/trace-events b/migration/trace-events
|
|
index 706db97def..e8edd1fbba 100644
|
|
--- a/migration/trace-events
|
|
+++ b/migration/trace-events
|
|
@@ -354,6 +354,7 @@ cpr_state_save(const char *mode) "%s mode"
|
|
cpr_state_load(const char *mode) "%s mode"
|
|
cpr_transfer_input(const char *path) "%s"
|
|
cpr_transfer_output(const char *path) "%s"
|
|
+cpr_exec(void) ""
|
|
|
|
# block-dirty-bitmap.c
|
|
send_bitmap_header_enter(void) ""
|
|
diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c
|
|
index a1cd7a95fa..4b01dc19c2 100644
|
|
--- a/migration/vmstate-types.c
|
|
+++ b/migration/vmstate-types.c
|
|
@@ -322,6 +322,10 @@ static int get_fd(QEMUFile *f, void *pv, size_t size,
|
|
const VMStateField *field)
|
|
{
|
|
int32_t *v = pv;
|
|
+ if (migrate_mode() == MIG_MODE_CPR_EXEC) {
|
|
+ qemu_get_sbe32s(f, v);
|
|
+ return 0;
|
|
+ }
|
|
*v = qemu_file_get_fd(f);
|
|
return 0;
|
|
}
|
|
@@ -330,6 +334,10 @@ static int put_fd(QEMUFile *f, void *pv, size_t size,
|
|
const VMStateField *field, JSONWriter *vmdesc)
|
|
{
|
|
int32_t *v = pv;
|
|
+ if (migrate_mode() == MIG_MODE_CPR_EXEC) {
|
|
+ qemu_put_sbe32s(f, v);
|
|
+ return 0;
|
|
+ }
|
|
return qemu_file_put_fd(f, *v);
|
|
}
|
|
|
|
diff --git a/qapi/migration.json b/qapi/migration.json
|
|
index 2be8fa1d16..be0f3fcc12 100644
|
|
--- a/qapi/migration.json
|
|
+++ b/qapi/migration.json
|
|
@@ -694,9 +694,32 @@
|
|
# until you issue the `migrate-incoming` command.
|
|
#
|
|
# (since 10.0)
|
|
+#
|
|
+# @cpr-exec: The migrate command stops the VM, saves state to the
|
|
+# migration channel, directly exec's a new version of QEMU on the
|
|
+# same host, replacing the original process while retaining its
|
|
+# PID, and loads state from the channel. Guest RAM is preserved
|
|
+# in place. Devices and their pinned pages are also preserved for
|
|
+# VFIO and IOMMUFD.
|
|
+#
|
|
+# Old QEMU starts new QEMU by exec'ing the command specified by
|
|
+# the @cpr-exec-command parameter. The command may be a direct
|
|
+# invocation of new QEMU, or may be a wrapper that exec's the new
|
|
+# QEMU binary.
|
|
+#
|
|
+# Because old QEMU terminates when new QEMU starts, one cannot
|
|
+# stream data between the two, so the channel must be a type,
|
|
+# such as a file, that accepts all data before old QEMU exits.
|
|
+# Otherwise, old QEMU may quietly block writing to the channel.
|
|
+#
|
|
+# Memory-backend objects must have the share=on attribute, but
|
|
+# memory-backend-epc is not supported. The VM must be started
|
|
+# with the '-machine aux-ram-share=on' option.
|
|
+#
|
|
+# (since 10.2)
|
|
##
|
|
{ 'enum': 'MigMode',
|
|
- 'data': [ 'normal', 'cpr-reboot', 'cpr-transfer' ] }
|
|
+ 'data': [ 'normal', 'cpr-reboot', 'cpr-transfer', 'cpr-exec' ] }
|
|
|
|
##
|
|
# @ZeroPageDetection:
|
|
diff --git a/system/vl.c b/system/vl.c
|
|
index d3e6158753..7a32043625 100644
|
|
--- a/system/vl.c
|
|
+++ b/system/vl.c
|
|
@@ -3850,6 +3850,8 @@ void qemu_init(int argc, char **argv)
|
|
}
|
|
qemu_init_displays();
|
|
accel_setup_post(current_machine);
|
|
- os_setup_post();
|
|
+ if (migrate_mode() != MIG_MODE_CPR_EXEC) {
|
|
+ os_setup_post();
|
|
+ }
|
|
resume_mux_open();
|
|
}
|