173 lines
7.5 KiB
Diff
173 lines
7.5 KiB
Diff
|
From c98eb88ce49927f612753b53c890135fde14c3a4 Mon Sep 17 00:00:00 2001
|
||
|
From: Peter Xu <peterx@redhat.com>
|
||
|
Date: Mon, 18 Sep 2023 14:28:15 -0300
|
||
|
Subject: [PATCH 01/13] migration: Fix race that dest preempt thread close too
|
||
|
early
|
||
|
MIME-Version: 1.0
|
||
|
Content-Type: text/plain; charset=UTF-8
|
||
|
Content-Transfer-Encoding: 8bit
|
||
|
|
||
|
RH-Author: Peter Xu <peterx@redhat.com>
|
||
|
RH-MergeRequest: 203: migration: Fix race that dest preempt thread close too early
|
||
|
RH-Jira: RHEL-11219
|
||
|
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
|
||
|
RH-Acked-by: Leonardo Brás <leobras@redhat.com>
|
||
|
RH-Commit: [1/8] a1197ca99878d5d8bbb271c6941aa36d91c66122 (peterx/qemu-kvm)
|
||
|
|
||
|
We hit intermit CI issue on failing at migration-test over the unit test
|
||
|
preempt/plain:
|
||
|
|
||
|
qemu-system-x86_64: Unable to read from socket: Connection reset by peer
|
||
|
Memory content inconsistency at 5b43000 first_byte = bd last_byte = bc current = 4f hit_edge = 1
|
||
|
**
|
||
|
ERROR:../tests/qtest/migration-test.c:300:check_guests_ram: assertion failed: (bad == 0)
|
||
|
(test program exited with status code -6)
|
||
|
|
||
|
Fabiano debugged into it and found that the preempt thread can quit even
|
||
|
without receiving all the pages, which can cause guest not receiving all
|
||
|
the pages and corrupt the guest memory.
|
||
|
|
||
|
To make sure preempt thread finished receiving all the pages, we can rely
|
||
|
on the page_requested_count being zero because preempt channel will only
|
||
|
receive requested page faults. Note, not all the faulted pages are required
|
||
|
to be sent via the preempt channel/thread; imagine the case when a
|
||
|
requested page is just queued into the background main channel for
|
||
|
migration, the src qemu will just still send it via the background channel.
|
||
|
|
||
|
Here instead of spinning over reading the count, we add a condvar so the
|
||
|
main thread can wait on it if that unusual case happened, without burning
|
||
|
the cpu for no good reason, even if the duration is short; so even if we
|
||
|
spin in this rare case is probably fine. It's just better to not do so.
|
||
|
|
||
|
The condvar is only used when that special case is triggered. Some memory
|
||
|
ordering trick is needed to guarantee it from happening (against the
|
||
|
preempt thread status field), so the main thread will always get a kick
|
||
|
when that triggers correctly.
|
||
|
|
||
|
Closes: https://gitlab.com/qemu-project/qemu/-/issues/1886
|
||
|
Debugged-by: Fabiano Rosas <farosas@suse.de>
|
||
|
Signed-off-by: Peter Xu <peterx@redhat.com>
|
||
|
Signed-off-by: Fabiano Rosas <farosas@suse.de>
|
||
|
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
|
||
|
Message-ID: <20230918172822.19052-2-farosas@suse.de>
|
||
|
(cherry picked from commit cf02f29e1e3843784630d04783e372fa541a77e5)
|
||
|
Signed-off-by: Peter Xu <peterx@redhat.com>
|
||
|
---
|
||
|
migration/migration.c | 3 ++-
|
||
|
migration/migration.h | 13 ++++++++++++-
|
||
|
migration/postcopy-ram.c | 38 +++++++++++++++++++++++++++++++++++++-
|
||
|
3 files changed, 51 insertions(+), 3 deletions(-)
|
||
|
|
||
|
diff --git a/migration/migration.c b/migration/migration.c
|
||
|
index 5528acb65e..6a7122694a 100644
|
||
|
--- a/migration/migration.c
|
||
|
+++ b/migration/migration.c
|
||
|
@@ -153,6 +153,7 @@ void migration_object_init(void)
|
||
|
qemu_sem_init(¤t_incoming->postcopy_qemufile_dst_done, 0);
|
||
|
|
||
|
qemu_mutex_init(¤t_incoming->page_request_mutex);
|
||
|
+ qemu_cond_init(¤t_incoming->page_request_cond);
|
||
|
current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
|
||
|
|
||
|
migration_object_check(current_migration, &error_fatal);
|
||
|
@@ -367,7 +368,7 @@ int migrate_send_rp_req_pages(MigrationIncomingState *mis,
|
||
|
* things like g_tree_lookup() will return TRUE (1) when found.
|
||
|
*/
|
||
|
g_tree_insert(mis->page_requested, aligned, (gpointer)1);
|
||
|
- mis->page_requested_count++;
|
||
|
+ qatomic_inc(&mis->page_requested_count);
|
||
|
trace_postcopy_page_req_add(aligned, mis->page_requested_count);
|
||
|
}
|
||
|
}
|
||
|
diff --git a/migration/migration.h b/migration/migration.h
|
||
|
index 6eea18db36..9a30216895 100644
|
||
|
--- a/migration/migration.h
|
||
|
+++ b/migration/migration.h
|
||
|
@@ -196,7 +196,10 @@ struct MigrationIncomingState {
|
||
|
|
||
|
/* A tree of pages that we requested to the source VM */
|
||
|
GTree *page_requested;
|
||
|
- /* For debugging purpose only, but would be nice to keep */
|
||
|
+ /*
|
||
|
+ * For postcopy only, count the number of requested page faults that
|
||
|
+ * still haven't been resolved.
|
||
|
+ */
|
||
|
int page_requested_count;
|
||
|
/*
|
||
|
* The mutex helps to maintain the requested pages that we sent to the
|
||
|
@@ -210,6 +213,14 @@ struct MigrationIncomingState {
|
||
|
* contains valid information.
|
||
|
*/
|
||
|
QemuMutex page_request_mutex;
|
||
|
+ /*
|
||
|
+ * If postcopy preempt is enabled, there is a chance that the main
|
||
|
+ * thread finished loading its data before the preempt channel has
|
||
|
+ * finished loading the urgent pages. If that happens, the two threads
|
||
|
+ * will use this condvar to synchronize, so the main thread will always
|
||
|
+ * wait until all pages received.
|
||
|
+ */
|
||
|
+ QemuCond page_request_cond;
|
||
|
|
||
|
/*
|
||
|
* Number of devices that have yet to approve switchover. When this reaches
|
||
|
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
|
||
|
index 29aea9456d..5408e028c6 100644
|
||
|
--- a/migration/postcopy-ram.c
|
||
|
+++ b/migration/postcopy-ram.c
|
||
|
@@ -599,6 +599,30 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
|
||
|
if (mis->preempt_thread_status == PREEMPT_THREAD_CREATED) {
|
||
|
/* Notify the fast load thread to quit */
|
||
|
mis->preempt_thread_status = PREEMPT_THREAD_QUIT;
|
||
|
+ /*
|
||
|
+ * Update preempt_thread_status before reading count. Note: mutex
|
||
|
+ * lock only provide ACQUIRE semantic, and it doesn't stops this
|
||
|
+ * write to be reordered after reading the count.
|
||
|
+ */
|
||
|
+ smp_mb();
|
||
|
+ /*
|
||
|
+ * It's possible that the preempt thread is still handling the last
|
||
|
+ * pages to arrive which were requested by guest page faults.
|
||
|
+ * Making sure nothing is left behind by waiting on the condvar if
|
||
|
+ * that unlikely case happened.
|
||
|
+ */
|
||
|
+ WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
|
||
|
+ if (qatomic_read(&mis->page_requested_count)) {
|
||
|
+ /*
|
||
|
+ * It is guaranteed to receive a signal later, because the
|
||
|
+ * count>0 now, so it's destined to be decreased to zero
|
||
|
+ * very soon by the preempt thread.
|
||
|
+ */
|
||
|
+ qemu_cond_wait(&mis->page_request_cond,
|
||
|
+ &mis->page_request_mutex);
|
||
|
+ }
|
||
|
+ }
|
||
|
+ /* Notify the fast load thread to quit */
|
||
|
if (mis->postcopy_qemufile_dst) {
|
||
|
qemu_file_shutdown(mis->postcopy_qemufile_dst);
|
||
|
}
|
||
|
@@ -1277,8 +1301,20 @@ static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr,
|
||
|
*/
|
||
|
if (g_tree_lookup(mis->page_requested, host_addr)) {
|
||
|
g_tree_remove(mis->page_requested, host_addr);
|
||
|
- mis->page_requested_count--;
|
||
|
+ int left_pages = qatomic_dec_fetch(&mis->page_requested_count);
|
||
|
+
|
||
|
trace_postcopy_page_req_del(host_addr, mis->page_requested_count);
|
||
|
+ /* Order the update of count and read of preempt status */
|
||
|
+ smp_mb();
|
||
|
+ if (mis->preempt_thread_status == PREEMPT_THREAD_QUIT &&
|
||
|
+ left_pages == 0) {
|
||
|
+ /*
|
||
|
+ * This probably means the main thread is waiting for us.
|
||
|
+ * Notify that we've finished receiving the last requested
|
||
|
+ * page.
|
||
|
+ */
|
||
|
+ qemu_cond_signal(&mis->page_request_cond);
|
||
|
+ }
|
||
|
}
|
||
|
qemu_mutex_unlock(&mis->page_request_mutex);
|
||
|
mark_postcopy_blocktime_end((uintptr_t)host_addr);
|
||
|
--
|
||
|
2.39.3
|
||
|
|