diff --git a/kvm-block-mark-bdrv_child_change_aio_context-GRAPH_RDLOC.patch b/kvm-block-mark-bdrv_child_change_aio_context-GRAPH_RDLOC.patch new file mode 100644 index 0000000..a39df88 --- /dev/null +++ b/kvm-block-mark-bdrv_child_change_aio_context-GRAPH_RDLOC.patch @@ -0,0 +1,49 @@ +From 9e52d5e0ac43c31ca39c7e1ac41bf5bcb179f848 Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:44 +0200 +Subject: [PATCH 19/33] block: mark bdrv_child_change_aio_context() + GRAPH_RDLOCK + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [7/21] ca3294430750a4266f4f3cbe42192d3abb84c817 (kmwolf/centos-qemu-kvm) + +This is a small step in preparation to mark bdrv_drained_begin() as +GRAPH_UNLOCKED. More concretely, it is in preparation to move the +drain out of bdrv_change_aio_context() and marking that function as +GRAPH_RDLOCK. + +Signed-off-by: Fiona Ebner +Reviewed-by: Kevin Wolf +Message-ID: <20250530151125.955508-8-f.ebner@proxmox.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 469422c45b3a816eaf36e7edc895c81e0f3d38bb) +Signed-off-by: Kevin Wolf +--- + include/block/block-global-state.h | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/include/block/block-global-state.h b/include/block/block-global-state.h +index 9be34b3c99..aad160956a 100644 +--- a/include/block/block-global-state.h ++++ b/include/block/block-global-state.h +@@ -274,9 +274,10 @@ int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag); + int bdrv_debug_resume(BlockDriverState *bs, const char *tag); + bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag); + +-bool bdrv_child_change_aio_context(BdrvChild *c, AioContext *ctx, +- GHashTable *visited, Transaction *tran, +- Error **errp); ++bool GRAPH_RDLOCK ++bdrv_child_change_aio_context(BdrvChild *c, AioContext *ctx, ++ GHashTable *visited, Transaction *tran, ++ Error **errp); + int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx, + BdrvChild *ignore_child, Error **errp); + +-- +2.39.3 + diff --git a/kvm-block-mark-bdrv_drained_begin-and-friends-as-GRAPH_U.patch b/kvm-block-mark-bdrv_drained_begin-and-friends-as-GRAPH_U.patch new file mode 100644 index 0000000..f1bad96 --- /dev/null +++ b/kvm-block-mark-bdrv_drained_begin-and-friends-as-GRAPH_U.patch @@ -0,0 +1,62 @@ +From e002888564647162c7796075ef7bdc14c0dc29fc Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:56 +0200 +Subject: [PATCH 31/33] block: mark bdrv_drained_begin() and friends as + GRAPH_UNLOCKED + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [19/21] 7dd6a447a7fabb2685bec471a28f9e3e78dbc2d1 (kmwolf/centos-qemu-kvm) + +All of bdrv_drain_all_begin(), bdrv_drain_all() and +bdrv_drained_begin() poll and are not allowed to be called with the +block graph lock held. Mark the function as such. + +Suggested-by: Kevin Wolf +Signed-off-by: Fiona Ebner +Message-ID: <20250530151125.955508-20-f.ebner@proxmox.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit fc1d2f3eac7946658b160db0b813b81288fb1778) +Signed-off-by: Kevin Wolf +--- + include/block/block-global-state.h | 4 ++-- + include/block/block-io.h | 2 +- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/include/block/block-global-state.h b/include/block/block-global-state.h +index 91f249b5ad..84a2a4ecd5 100644 +--- a/include/block/block-global-state.h ++++ b/include/block/block-global-state.h +@@ -192,10 +192,10 @@ int bdrv_inactivate_all(void); + + int bdrv_flush_all(void); + void bdrv_close_all(void); +-void bdrv_drain_all_begin(void); ++void GRAPH_UNLOCKED bdrv_drain_all_begin(void); + void bdrv_drain_all_begin_nopoll(void); + void bdrv_drain_all_end(void); +-void bdrv_drain_all(void); ++void GRAPH_UNLOCKED bdrv_drain_all(void); + + void bdrv_aio_cancel(BlockAIOCB *acb); + +diff --git a/include/block/block-io.h b/include/block/block-io.h +index b99cc98d26..4cf83fb367 100644 +--- a/include/block/block-io.h ++++ b/include/block/block-io.h +@@ -431,7 +431,7 @@ bdrv_drain_poll(BlockDriverState *bs, BdrvChild *ignore_parent, + * + * This function can be recursive. + */ +-void bdrv_drained_begin(BlockDriverState *bs); ++void GRAPH_UNLOCKED bdrv_drained_begin(BlockDriverState *bs); + + /** + * bdrv_do_drained_begin_quiesce: +-- +2.39.3 + diff --git a/kvm-block-mark-bdrv_parent_change_aio_context-GRAPH_RDLO.patch b/kvm-block-mark-bdrv_parent_change_aio_context-GRAPH_RDLO.patch new file mode 100644 index 0000000..1524fb9 --- /dev/null +++ b/kvm-block-mark-bdrv_parent_change_aio_context-GRAPH_RDLO.patch @@ -0,0 +1,49 @@ +From 191a72ef2a40d7bd14c5ad3745f1dfccbbf95817 Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:42 +0200 +Subject: [PATCH 17/33] block: mark bdrv_parent_change_aio_context() + GRAPH_RDLOCK + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [5/21] e80d701ce609e823f85465a3433324cbf077b3b9 (kmwolf/centos-qemu-kvm) + +This is a small step in preparation to mark bdrv_drained_begin() as +GRAPH_UNLOCKED. More concretely, it allows marking the +change_aio_ctx() callback GRAPH_RDLOCK_PTR, which is the next step. + +Signed-off-by: Fiona Ebner +Reviewed-by: Kevin Wolf +Message-ID: <20250530151125.955508-6-f.ebner@proxmox.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 3758733959af93b5eb3283659d868ad5b24152b4) +Signed-off-by: Kevin Wolf +--- + block.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/block.c b/block.c +index e340bac177..18a6be3bd6 100644 +--- a/block.c ++++ b/block.c +@@ -7575,10 +7575,10 @@ typedef struct BdrvStateSetAioContext { + BlockDriverState *bs; + } BdrvStateSetAioContext; + +-static bool bdrv_parent_change_aio_context(BdrvChild *c, AioContext *ctx, +- GHashTable *visited, +- Transaction *tran, +- Error **errp) ++static bool GRAPH_RDLOCK ++bdrv_parent_change_aio_context(BdrvChild *c, AioContext *ctx, ++ GHashTable *visited, Transaction *tran, ++ Error **errp) + { + GLOBAL_STATE_CODE(); + if (g_hash_table_contains(visited, c)) { +-- +2.39.3 + diff --git a/kvm-block-mark-change_aio_ctx-callback-and-instances-as-.patch b/kvm-block-mark-change_aio_ctx-callback-and-instances-as-.patch new file mode 100644 index 0000000..ac0fbd7 --- /dev/null +++ b/kvm-block-mark-change_aio_ctx-callback-and-instances-as-.patch @@ -0,0 +1,103 @@ +From 1c72dbd318b6c15d7a7cbc14a270056a5cb6b182 Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:43 +0200 +Subject: [PATCH 18/33] block: mark change_aio_ctx() callback and instances as + GRAPH_RDLOCK(_PTR) + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [6/21] 9b1d5045f18b3393c5a35ce854da0dfd6fc2803b (kmwolf/centos-qemu-kvm) + +This is a small step in preparation to mark bdrv_drained_begin() as +GRAPH_UNLOCKED. More concretely, it is in preparation to move the +drain out of bdrv_change_aio_context() and marking that function as +GRAPH_RDLOCK. + +Signed-off-by: Fiona Ebner +Reviewed-by: Kevin Wolf +Message-ID: <20250530151125.955508-7-f.ebner@proxmox.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 844d550d09ac29ff2b1b49069587ae6a989df31d) +Signed-off-by: Kevin Wolf +--- + block.c | 7 ++++--- + block/block-backend.c | 6 +++--- + blockjob.c | 6 +++--- + include/block/block_int-common.h | 6 +++--- + 4 files changed, 13 insertions(+), 12 deletions(-) + +diff --git a/block.c b/block.c +index 18a6be3bd6..f7b21d8f27 100644 +--- a/block.c ++++ b/block.c +@@ -1226,9 +1226,10 @@ static int bdrv_child_cb_inactivate(BdrvChild *child) + return 0; + } + +-static bool bdrv_child_cb_change_aio_ctx(BdrvChild *child, AioContext *ctx, +- GHashTable *visited, Transaction *tran, +- Error **errp) ++static bool GRAPH_RDLOCK ++bdrv_child_cb_change_aio_ctx(BdrvChild *child, AioContext *ctx, ++ GHashTable *visited, Transaction *tran, ++ Error **errp) + { + BlockDriverState *bs = child->opaque; + return bdrv_change_aio_context(bs, ctx, visited, tran, errp); +diff --git a/block/block-backend.c b/block/block-backend.c +index a402db13f2..6a6949edeb 100644 +--- a/block/block-backend.c ++++ b/block/block-backend.c +@@ -136,9 +136,9 @@ static void blk_root_drained_end(BdrvChild *child); + static void blk_root_change_media(BdrvChild *child, bool load); + static void blk_root_resize(BdrvChild *child); + +-static bool blk_root_change_aio_ctx(BdrvChild *child, AioContext *ctx, +- GHashTable *visited, Transaction *tran, +- Error **errp); ++static bool GRAPH_RDLOCK ++blk_root_change_aio_ctx(BdrvChild *child, AioContext *ctx, GHashTable *visited, ++ Transaction *tran, Error **errp); + + static char *blk_root_get_parent_desc(BdrvChild *child) + { +diff --git a/blockjob.c b/blockjob.c +index 32007f31a9..34185d7715 100644 +--- a/blockjob.c ++++ b/blockjob.c +@@ -144,9 +144,9 @@ static TransactionActionDrv change_child_job_context = { + .clean = g_free, + }; + +-static bool child_job_change_aio_ctx(BdrvChild *c, AioContext *ctx, +- GHashTable *visited, Transaction *tran, +- Error **errp) ++static bool GRAPH_RDLOCK ++child_job_change_aio_ctx(BdrvChild *c, AioContext *ctx, GHashTable *visited, ++ Transaction *tran, Error **errp) + { + BlockJob *job = c->opaque; + BdrvStateChildJobContext *s; +diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h +index a9c0daa2a4..307dc56ed8 100644 +--- a/include/block/block_int-common.h ++++ b/include/block/block_int-common.h +@@ -987,9 +987,9 @@ struct BdrvChildClass { + bool backing_mask_protocol, + Error **errp); + +- bool (*change_aio_ctx)(BdrvChild *child, AioContext *ctx, +- GHashTable *visited, Transaction *tran, +- Error **errp); ++ bool GRAPH_RDLOCK_PTR (*change_aio_ctx)(BdrvChild *child, AioContext *ctx, ++ GHashTable *visited, ++ Transaction *tran, Error **errp); + + /* + * I/O API functions. These functions are thread-safe. +-- +2.39.3 + diff --git a/kvm-block-move-drain-outside-of-bdrv_attach_child.patch b/kvm-block-move-drain-outside-of-bdrv_attach_child.patch new file mode 100644 index 0000000..43037d0 --- /dev/null +++ b/kvm-block-move-drain-outside-of-bdrv_attach_child.patch @@ -0,0 +1,326 @@ +From 62e8b3e9173ea4fb85cf52c66109832ff9d4d437 Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:50 +0200 +Subject: [PATCH 25/33] block: move drain outside of bdrv_attach_child() + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [13/21] c68a1e34fa991dff72ec0a6403fd9786341ab534 (kmwolf/centos-qemu-kvm) + +This is part of resolving the deadlock mentioned in commit "block: +move draining out of bdrv_change_aio_context() and mark GRAPH_RDLOCK". + +The function bdrv_attach_child() runs under the graph lock, so it is +not allowed to drain. It is called by: +1. replication_start() +2. quorum_add_child() +3. bdrv_open_child_common() +4. Throughout test-bdrv-graph-mod.c and test-bdrv-drain.c unit tests. + +In all callers, a drained section is introduced. + +The function quorum_add_child() runs under the graph lock, so it is +not actually allowed to drain. This will be addressed by the following +commit. + +Signed-off-by: Fiona Ebner +Reviewed-by: Kevin Wolf +Message-ID: <20250530151125.955508-14-f.ebner@proxmox.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 77f3965ba7fed5b35212171a1e41c20c05a7ef11) +Signed-off-by: Kevin Wolf +--- + block.c | 6 ++++-- + block/quorum.c | 2 ++ + block/replication.c | 5 +++++ + tests/unit/test-bdrv-drain.c | 14 ++++++++++++++ + tests/unit/test-bdrv-graph-mod.c | 10 ++++++++++ + 5 files changed, 35 insertions(+), 2 deletions(-) + +diff --git a/block.c b/block.c +index 536a017201..3857f42877 100644 +--- a/block.c ++++ b/block.c +@@ -3269,6 +3269,8 @@ out: + * + * On failure NULL is returned, errp is set and the reference to + * child_bs is also dropped. ++ * ++ * All block nodes must be drained. + */ + BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, + BlockDriverState *child_bs, +@@ -3283,7 +3285,6 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, + + GLOBAL_STATE_CODE(); + +- bdrv_drain_all_begin(); + child = bdrv_attach_child_noperm(parent_bs, child_bs, child_name, + child_class, child_role, tran, errp); + if (!child) { +@@ -3298,7 +3299,6 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, + + out: + tran_finalize(tran, ret); +- bdrv_drain_all_end(); + + bdrv_schedule_unref(child_bs); + +@@ -3789,10 +3789,12 @@ static BdrvChild *bdrv_open_child_common(const char *filename, + return NULL; + } + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role, + errp); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + return child; + } +diff --git a/block/quorum.c b/block/quorum.c +index ed8ce801ee..ea17b0ec13 100644 +--- a/block/quorum.c ++++ b/block/quorum.c +@@ -1096,8 +1096,10 @@ quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs, Error **errp) + /* We can safely add the child now */ + bdrv_ref(child_bs); + ++ bdrv_drain_all_begin(); + child = bdrv_attach_child(bs, child_bs, indexstr, &child_of_bds, + BDRV_CHILD_DATA, errp); ++ bdrv_drain_all_end(); + if (child == NULL) { + s->next_child_index--; + return; +diff --git a/block/replication.c b/block/replication.c +index 0020f33843..02814578c6 100644 +--- a/block/replication.c ++++ b/block/replication.c +@@ -541,6 +541,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, + return; + } + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + + bdrv_ref(hidden_disk->bs); +@@ -550,6 +551,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, + if (local_err) { + error_propagate(errp, local_err); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + return; + } + +@@ -560,6 +562,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, + if (local_err) { + error_propagate(errp, local_err); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + return; + } + +@@ -572,12 +575,14 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, + !check_top_bs(top_bs, bs)) { + error_setg(errp, "No top_bs or it is invalid"); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + reopen_backing_file(bs, false, NULL); + return; + } + bdrv_op_block_all(top_bs, s->blocker); + + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + s->backup_job = backup_job_create( + NULL, s->secondary_disk->bs, s->hidden_disk->bs, +diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c +index 4f3057844b..ac76525e5a 100644 +--- a/tests/unit/test-bdrv-drain.c ++++ b/tests/unit/test-bdrv-drain.c +@@ -1049,10 +1049,12 @@ static void do_test_delete_by_drain(bool detach_instead_of_delete, + + null_bs = bdrv_open("null-co://", NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + &error_abort); ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_attach_child(bs, null_bs, "null-child", &child_of_bds, + BDRV_CHILD_DATA, &error_abort); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + /* This child will be the one to pass to requests through to, and + * it will stall until a drain occurs */ +@@ -1060,21 +1062,25 @@ static void do_test_delete_by_drain(bool detach_instead_of_delete, + &error_abort); + child_bs->total_sectors = 65536 >> BDRV_SECTOR_BITS; + /* Takes our reference to child_bs */ ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + tts->wait_child = bdrv_attach_child(bs, child_bs, "wait-child", + &child_of_bds, + BDRV_CHILD_DATA | BDRV_CHILD_PRIMARY, + &error_abort); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + /* This child is just there to be deleted + * (for detach_instead_of_delete == true) */ + null_bs = bdrv_open("null-co://", NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + &error_abort); ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_attach_child(bs, null_bs, "null-child", &child_of_bds, BDRV_CHILD_DATA, + &error_abort); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL); + blk_insert_bs(blk, bs, &error_abort); +@@ -1157,6 +1163,7 @@ static void no_coroutine_fn detach_indirect_bh(void *opaque) + + bdrv_dec_in_flight(data->child_b->bs); + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_unref_child(data->parent_b, data->child_b); + +@@ -1165,6 +1172,7 @@ static void no_coroutine_fn detach_indirect_bh(void *opaque) + &child_of_bds, BDRV_CHILD_DATA, + &error_abort); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + } + + static void coroutine_mixed_fn detach_by_parent_aio_cb(void *opaque, int ret) +@@ -1262,6 +1270,7 @@ static void TSA_NO_TSA test_detach_indirect(bool by_parent_cb) + /* Set child relationships */ + bdrv_ref(b); + bdrv_ref(a); ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + child_b = bdrv_attach_child(parent_b, b, "PB-B", &child_of_bds, + BDRV_CHILD_DATA, &error_abort); +@@ -1273,6 +1282,7 @@ static void TSA_NO_TSA test_detach_indirect(bool by_parent_cb) + by_parent_cb ? &child_of_bds : &detach_by_driver_cb_class, + BDRV_CHILD_DATA, &error_abort); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + g_assert_cmpint(parent_a->refcnt, ==, 1); + g_assert_cmpint(parent_b->refcnt, ==, 1); +@@ -1685,6 +1695,7 @@ static void test_drop_intermediate_poll(void) + * Establish the chain last, so the chain links are the first + * elements in the BDS.parents lists + */ ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + for (i = 0; i < 3; i++) { + if (i) { +@@ -1694,6 +1705,7 @@ static void test_drop_intermediate_poll(void) + } + } + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + job = block_job_create("job", &test_simple_job_driver, NULL, job_node, + 0, BLK_PERM_ALL, 0, 0, NULL, NULL, &error_abort); +@@ -1940,10 +1952,12 @@ static void do_test_replace_child_mid_drain(int old_drain_count, + new_child_bs->total_sectors = 1; + + bdrv_ref(old_child_bs); ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_attach_child(parent_bs, old_child_bs, "child", &child_of_bds, + BDRV_CHILD_COW, &error_abort); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + parent_s->setup_completed = true; + + for (i = 0; i < old_drain_count; i++) { +diff --git a/tests/unit/test-bdrv-graph-mod.c b/tests/unit/test-bdrv-graph-mod.c +index d743abb4bb..7b03ebe4b0 100644 +--- a/tests/unit/test-bdrv-graph-mod.c ++++ b/tests/unit/test-bdrv-graph-mod.c +@@ -137,10 +137,12 @@ static void test_update_perm_tree(void) + + blk_insert_bs(root, bs, &error_abort); + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_attach_child(filter, bs, "child", &child_of_bds, + BDRV_CHILD_DATA, &error_abort); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + ret = bdrv_append(filter, bs, NULL); + g_assert_cmpint(ret, <, 0); +@@ -204,11 +206,13 @@ static void test_should_update_child(void) + + bdrv_set_backing_hd(target, bs, &error_abort); + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + g_assert(target->backing->bs == bs); + bdrv_attach_child(filter, target, "target", &child_of_bds, + BDRV_CHILD_DATA, &error_abort); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + bdrv_append(filter, bs, &error_abort); + + bdrv_graph_rdlock_main_loop(); +@@ -244,6 +248,7 @@ static void test_parallel_exclusive_write(void) + bdrv_ref(base); + bdrv_ref(fl1); + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_attach_child(top, fl1, "backing", &child_of_bds, + BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, +@@ -257,6 +262,7 @@ static void test_parallel_exclusive_write(void) + + bdrv_replace_node(fl1, fl2, &error_abort); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + bdrv_drained_end(fl2); + bdrv_drained_end(fl1); +@@ -363,6 +369,7 @@ static void test_parallel_perm_update(void) + */ + bdrv_ref(base); + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_attach_child(top, ws, "file", &child_of_bds, BDRV_CHILD_DATA, + &error_abort); +@@ -377,6 +384,7 @@ static void test_parallel_perm_update(void) + BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, + &error_abort); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + /* Select fl1 as first child to be active */ + s->selected = c_fl1; +@@ -430,11 +438,13 @@ static void test_append_greedy_filter(void) + BlockDriverState *base = no_perm_node("base"); + BlockDriverState *fl = exclusive_writer_node("fl1"); + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_attach_child(top, base, "backing", &child_of_bds, + BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, + &error_abort); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + bdrv_append(fl, base, &error_abort); + bdrv_unref(fl); +-- +2.39.3 + diff --git a/kvm-block-move-drain-outside-of-bdrv_attach_child_common.patch b/kvm-block-move-drain-outside-of-bdrv_attach_child_common.patch new file mode 100644 index 0000000..3cecf05 --- /dev/null +++ b/kvm-block-move-drain-outside-of-bdrv_attach_child_common.patch @@ -0,0 +1,260 @@ +From faa96d060a393458e2e1f9ba77c53adf7d52bc85 Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:47 +0200 +Subject: [PATCH 22/33] block: move drain outside of + bdrv_attach_child_common(_abort)() + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [10/21] 0b8f078c657c094dbd6f967eaaccafd8112a9bda (kmwolf/centos-qemu-kvm) + +This is part of resolving the deadlock mentioned in commit "block: +move draining out of bdrv_change_aio_context() and mark GRAPH_RDLOCK". + +The function bdrv_attach_child_common_abort() is used only as the +abort callback in bdrv_attach_child_common_drv transactions, so the +tran_finalize() calls of such transactions need to be in drained +sections too. + +All code paths are covered: +The bdrv_attach_child_common_drv transactions are only used in +bdrv_attach_child_common(), so it is enough to check callers of +bdrv_attach_child_common() following the transactions. + +bdrv_attach_child_common() is called by: +1. bdrv_attach_child_noperm(), which does not finalize the + transaction yet. +2. bdrv_root_attach_child(), where a drained section is introduced. + +bdrv_attach_child_noperm() is called by: +1. bdrv_attach_child(), where a drained section is introduced. +2. bdrv_set_file_or_backing_noperm(), which does not finalize the + transaction yet. +3. bdrv_append(), where a drained section is introduced. + +bdrv_set_file_or_backing_noperm() is called by: +1. bdrv_set_backing_hd_drained(), where a drained section is + introduced. +2. bdrv_reopen_parse_file_or_backing(), which does not finalize the + transaction yet. Draining the old child bs currently happens under + the graph lock there. This is replaced with an assertion, because + the drain will be moved further up to the caller. + +bdrv_reopen_parse_file_or_backing() is called by: +1. bdrv_reopen_prepare(), which does not finalize the transaction yet. + +bdrv_reopen_prepare() is called by: +1. bdrv_reopen_multiple(), which does finalize the transaction. It is + called after bdrv_reopen_queue(), which starts a drained section. + The drained section ends, when bdrv_reopen_queue_free() is called + at the end of bdrv_reopen_multiple(). + +This resolves all code paths. + +The functions bdrv_set_backing_hd_drained(), bdrv_attach_child() and +bdrv_root_attach_child() run under the graph lock, so they are not +actually allowed to drain. This will be addressed in the following +commits. + +Signed-off-by: Fiona Ebner +Message-ID: <20250530151125.955508-11-f.ebner@proxmox.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit 2b833595aa21679145cfe67ba720113b165c19ef) +Signed-off-by: Kevin Wolf +--- + block.c | 40 ++++++++++++++++++++++++---------------- + 1 file changed, 24 insertions(+), 16 deletions(-) + +diff --git a/block.c b/block.c +index 3c2e8c5592..2c18e0a4fa 100644 +--- a/block.c ++++ b/block.c +@@ -3028,10 +3028,8 @@ static void GRAPH_WRLOCK bdrv_attach_child_common_abort(void *opaque) + bdrv_replace_child_noperm(s->child, NULL); + + if (bdrv_get_aio_context(bs) != s->old_child_ctx) { +- bdrv_drain_all_begin(); + bdrv_try_change_aio_context_locked(bs, s->old_child_ctx, NULL, + &error_abort); +- bdrv_drain_all_end(); + } + + if (bdrv_child_get_parent_aio_context(s->child) != s->old_parent_ctx) { +@@ -3043,10 +3041,8 @@ static void GRAPH_WRLOCK bdrv_attach_child_common_abort(void *opaque) + + /* No need to visit `child`, because it has been detached already */ + visited = g_hash_table_new(NULL, NULL); +- bdrv_drain_all_begin(); + ret = s->child->klass->change_aio_ctx(s->child, s->old_parent_ctx, + visited, tran, &error_abort); +- bdrv_drain_all_end(); + g_hash_table_destroy(visited); + + /* transaction is supposed to always succeed */ +@@ -3075,6 +3071,9 @@ static TransactionActionDrv bdrv_attach_child_common_drv = { + * + * Both @parent_bs and @child_bs can move to a different AioContext in this + * function. ++ * ++ * All block nodes must be drained before this function is called until after ++ * the transaction is finalized. + */ + static BdrvChild * GRAPH_WRLOCK + bdrv_attach_child_common(BlockDriverState *child_bs, +@@ -3118,10 +3117,8 @@ bdrv_attach_child_common(BlockDriverState *child_bs, + parent_ctx = bdrv_child_get_parent_aio_context(new_child); + if (child_ctx != parent_ctx) { + Error *local_err = NULL; +- bdrv_drain_all_begin(); + int ret = bdrv_try_change_aio_context_locked(child_bs, parent_ctx, NULL, + &local_err); +- bdrv_drain_all_end(); + + if (ret < 0 && child_class->change_aio_ctx) { + Transaction *aio_ctx_tran = tran_new(); +@@ -3129,11 +3126,9 @@ bdrv_attach_child_common(BlockDriverState *child_bs, + bool ret_child; + + g_hash_table_add(visited, new_child); +- bdrv_drain_all_begin(); + ret_child = child_class->change_aio_ctx(new_child, child_ctx, + visited, aio_ctx_tran, + NULL); +- bdrv_drain_all_end(); + if (ret_child == true) { + error_free(local_err); + ret = 0; +@@ -3189,6 +3184,9 @@ bdrv_attach_child_common(BlockDriverState *child_bs, + * + * After calling this function, the transaction @tran may only be completed + * while holding a writer lock for the graph. ++ * ++ * All block nodes must be drained before this function is called until after ++ * the transaction is finalized. + */ + static BdrvChild * GRAPH_WRLOCK + bdrv_attach_child_noperm(BlockDriverState *parent_bs, +@@ -3244,6 +3242,7 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, + + GLOBAL_STATE_CODE(); + ++ bdrv_drain_all_begin(); + child = bdrv_attach_child_common(child_bs, child_name, child_class, + child_role, perm, shared_perm, opaque, + tran, errp); +@@ -3256,6 +3255,7 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, + + out: + tran_finalize(tran, ret); ++ bdrv_drain_all_end(); + + bdrv_schedule_unref(child_bs); + +@@ -3283,6 +3283,7 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, + + GLOBAL_STATE_CODE(); + ++ bdrv_drain_all_begin(); + child = bdrv_attach_child_noperm(parent_bs, child_bs, child_name, + child_class, child_role, tran, errp); + if (!child) { +@@ -3297,6 +3298,7 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, + + out: + tran_finalize(tran, ret); ++ bdrv_drain_all_end(); + + bdrv_schedule_unref(child_bs); + +@@ -3465,6 +3467,9 @@ static BdrvChildRole bdrv_backing_role(BlockDriverState *bs) + * + * After calling this function, the transaction @tran may only be completed + * while holding a writer lock for the graph. ++ * ++ * All block nodes must be drained before this function is called until after ++ * the transaction is finalized. + */ + static int GRAPH_WRLOCK + bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs, +@@ -3573,6 +3578,7 @@ int bdrv_set_backing_hd_drained(BlockDriverState *bs, + assert(bs->backing->bs->quiesce_counter > 0); + } + ++ bdrv_drain_all_begin(); + ret = bdrv_set_file_or_backing_noperm(bs, backing_hd, true, tran, errp); + if (ret < 0) { + goto out; +@@ -3581,6 +3587,7 @@ int bdrv_set_backing_hd_drained(BlockDriverState *bs, + ret = bdrv_refresh_perms(bs, tran, errp); + out: + tran_finalize(tran, ret); ++ bdrv_drain_all_end(); + return ret; + } + +@@ -4721,6 +4728,9 @@ int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only, + * Return 0 on success, otherwise return < 0 and set @errp. + * + * @reopen_state->bs can move to a different AioContext in this function. ++ * ++ * All block nodes must be drained before this function is called until after ++ * the transaction is finalized. + */ + static int GRAPH_UNLOCKED + bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state, +@@ -4814,7 +4824,7 @@ bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state, + + if (old_child_bs) { + bdrv_ref(old_child_bs); +- bdrv_drained_begin(old_child_bs); ++ assert(old_child_bs->quiesce_counter > 0); + } + + bdrv_graph_rdunlock_main_loop(); +@@ -4826,7 +4836,6 @@ bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state, + bdrv_graph_wrunlock(); + + if (old_child_bs) { +- bdrv_drained_end(old_child_bs); + bdrv_unref(old_child_bs); + } + +@@ -4855,6 +4864,9 @@ out_rdlock: + * + * After calling this function, the transaction @change_child_tran may only be + * completed while holding a writer lock for the graph. ++ * ++ * All block nodes must be drained before this function is called until after ++ * the transaction is finalized. + */ + static int GRAPH_UNLOCKED + bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, +@@ -5501,9 +5513,7 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, + assert(!bs_new->backing); + bdrv_graph_rdunlock_main_loop(); + +- bdrv_drained_begin(bs_top); +- bdrv_drained_begin(bs_new); +- ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + + child = bdrv_attach_child_noperm(bs_new, bs_top, "backing", +@@ -5525,9 +5535,7 @@ out: + + bdrv_refresh_limits(bs_top, NULL, NULL); + bdrv_graph_wrunlock(); +- +- bdrv_drained_end(bs_top); +- bdrv_drained_end(bs_new); ++ bdrv_drain_all_end(); + + return ret; + } +-- +2.39.3 + diff --git a/kvm-block-move-drain-outside-of-bdrv_change_aio_context-.patch b/kvm-block-move-drain-outside-of-bdrv_change_aio_context-.patch new file mode 100644 index 0000000..14cd101 --- /dev/null +++ b/kvm-block-move-drain-outside-of-bdrv_change_aio_context-.patch @@ -0,0 +1,269 @@ +From c5b0d95edea3d1e2ef47804e0c6fe19ade4dcaed Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:45 +0200 +Subject: [PATCH 20/33] block: move drain outside of bdrv_change_aio_context() + and mark GRAPH_RDLOCK + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [8/21] 26dc90de9b684044752ba2ad3831bd786e40fe72 (kmwolf/centos-qemu-kvm) + +This is in preparation to mark bdrv_drained_begin() as GRAPH_UNLOCKED. + +Note that even if bdrv_drained_begin() were already marked as +GRAPH_UNLOCKED, TSA would not complain about the instance in +bdrv_change_aio_context() before this change, because it is preceded +by a bdrv_graph_rdunlock_main_loop() call. It is not correct to +release the lock here, and in case the caller holds a write lock, it +wouldn't actually release the lock. + +In combination with block-stream, there is a deadlock that can happen +because of this [0]. In particular, it can happen that +main thread IO thread +1. acquires write lock + in blk_co_do_preadv_part(): + 2. have non-zero blk->in_flight + 3. try to acquire read lock +4. begin drain + +Steps 3 and 4 might be switched. Draining will poll and get stuck, +because it will see the non-zero in_flight counter. But the IO thread +will not make any progress either, because it cannot acquire the read +lock. + +After this change, all paths to bdrv_change_aio_context() drain: +bdrv_change_aio_context() is called by: +1. bdrv_child_cb_change_aio_ctx() which is only called via the + change_aio_ctx() callback, see below. +2. bdrv_child_change_aio_context(), see below. +3. bdrv_try_change_aio_context(), where a drained section is + introduced. + +The change_aio_ctx() callback is called by: +1. bdrv_attach_child_common_abort(), where a drained section is + introduced. +2. bdrv_attach_child_common(), where a drained section is introduced. +3. bdrv_parent_change_aio_context(), see below. + +bdrv_child_change_aio_context() is called by: +1. bdrv_change_aio_context(), i.e. recursive, so being in a drained + section is invariant. +2. child_job_change_aio_ctx(), which is only called via the + change_aio_ctx() callback, see above. + +bdrv_parent_change_aio_context() is called by: +1. bdrv_change_aio_context(), i.e. recursive, so being in a drained + section is invariant. + +This resolves all code paths. Note that bdrv_attach_child_common() +and bdrv_attach_child_common_abort() hold the graph write lock and +callers of bdrv_try_change_aio_context() might too, so they are not +actually allowed to drain either. This will be addressed in the +following commits. + +More granular draining is not trivially possible, because +bdrv_change_aio_context() can recursively call itself e.g. via +bdrv_child_change_aio_context(). + +[0]: https://lore.kernel.org/qemu-devel/73839c04-7616-407e-b057-80ca69e63f51@virtuozzo.com/ + +Reported-by: Andrey Drobyshev +Signed-off-by: Fiona Ebner +Reviewed-by: Kevin Wolf +Message-ID: <20250530151125.955508-9-f.ebner@proxmox.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 91ba0e1c382bd4a4b9c6a200f8a175d6ff30ab99) +Signed-off-by: Kevin Wolf +--- + block.c | 57 +++++++++++++++++++++++--------- + include/block/block_int-common.h | 12 +++++++ + 2 files changed, 53 insertions(+), 16 deletions(-) + +diff --git a/block.c b/block.c +index f7b21d8f27..af438ae7ff 100644 +--- a/block.c ++++ b/block.c +@@ -106,9 +106,9 @@ static void bdrv_reopen_abort(BDRVReopenState *reopen_state); + + static bool bdrv_backing_overridden(BlockDriverState *bs); + +-static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx, +- GHashTable *visited, Transaction *tran, +- Error **errp); ++static bool GRAPH_RDLOCK ++bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx, ++ GHashTable *visited, Transaction *tran, Error **errp); + + /* If non-zero, use only whitelisted block drivers */ + static int use_bdrv_whitelist; +@@ -3040,8 +3040,10 @@ static void GRAPH_WRLOCK bdrv_attach_child_common_abort(void *opaque) + + /* No need to visit `child`, because it has been detached already */ + visited = g_hash_table_new(NULL, NULL); ++ bdrv_drain_all_begin(); + ret = s->child->klass->change_aio_ctx(s->child, s->old_parent_ctx, + visited, tran, &error_abort); ++ bdrv_drain_all_end(); + g_hash_table_destroy(visited); + + /* transaction is supposed to always succeed */ +@@ -3122,9 +3124,11 @@ bdrv_attach_child_common(BlockDriverState *child_bs, + bool ret_child; + + g_hash_table_add(visited, new_child); ++ bdrv_drain_all_begin(); + ret_child = child_class->change_aio_ctx(new_child, child_ctx, + visited, aio_ctx_tran, + NULL); ++ bdrv_drain_all_end(); + if (ret_child == true) { + error_free(local_err); + ret = 0; +@@ -7576,6 +7580,17 @@ typedef struct BdrvStateSetAioContext { + BlockDriverState *bs; + } BdrvStateSetAioContext; + ++/* ++ * Changes the AioContext of @child to @ctx and recursively for the associated ++ * block nodes and all their children and parents. Returns true if the change is ++ * possible and the transaction @tran can be continued. Returns false and sets ++ * @errp if not and the transaction must be aborted. ++ * ++ * @visited will accumulate all visited BdrvChild objects. The caller is ++ * responsible for freeing the list afterwards. ++ * ++ * Must be called with the affected block nodes drained. ++ */ + static bool GRAPH_RDLOCK + bdrv_parent_change_aio_context(BdrvChild *c, AioContext *ctx, + GHashTable *visited, Transaction *tran, +@@ -7604,6 +7619,17 @@ bdrv_parent_change_aio_context(BdrvChild *c, AioContext *ctx, + return true; + } + ++/* ++ * Changes the AioContext of @c->bs to @ctx and recursively for all its children ++ * and parents. Returns true if the change is possible and the transaction @tran ++ * can be continued. Returns false and sets @errp if not and the transaction ++ * must be aborted. ++ * ++ * @visited will accumulate all visited BdrvChild objects. The caller is ++ * responsible for freeing the list afterwards. ++ * ++ * Must be called with the affected block nodes drained. ++ */ + bool bdrv_child_change_aio_context(BdrvChild *c, AioContext *ctx, + GHashTable *visited, Transaction *tran, + Error **errp) +@@ -7619,10 +7645,6 @@ bool bdrv_child_change_aio_context(BdrvChild *c, AioContext *ctx, + static void bdrv_set_aio_context_clean(void *opaque) + { + BdrvStateSetAioContext *state = (BdrvStateSetAioContext *) opaque; +- BlockDriverState *bs = (BlockDriverState *) state->bs; +- +- /* Paired with bdrv_drained_begin in bdrv_change_aio_context() */ +- bdrv_drained_end(bs); + + g_free(state); + } +@@ -7650,10 +7672,12 @@ static TransactionActionDrv set_aio_context = { + * + * @visited will accumulate all visited BdrvChild objects. The caller is + * responsible for freeing the list afterwards. ++ * ++ * @bs must be drained. + */ +-static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx, +- GHashTable *visited, Transaction *tran, +- Error **errp) ++static bool GRAPH_RDLOCK ++bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx, ++ GHashTable *visited, Transaction *tran, Error **errp) + { + BdrvChild *c; + BdrvStateSetAioContext *state; +@@ -7664,21 +7688,17 @@ static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx, + return true; + } + +- bdrv_graph_rdlock_main_loop(); + QLIST_FOREACH(c, &bs->parents, next_parent) { + if (!bdrv_parent_change_aio_context(c, ctx, visited, tran, errp)) { +- bdrv_graph_rdunlock_main_loop(); + return false; + } + } + + QLIST_FOREACH(c, &bs->children, next) { + if (!bdrv_child_change_aio_context(c, ctx, visited, tran, errp)) { +- bdrv_graph_rdunlock_main_loop(); + return false; + } + } +- bdrv_graph_rdunlock_main_loop(); + + state = g_new(BdrvStateSetAioContext, 1); + *state = (BdrvStateSetAioContext) { +@@ -7686,8 +7706,7 @@ static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx, + .bs = bs, + }; + +- /* Paired with bdrv_drained_end in bdrv_set_aio_context_clean() */ +- bdrv_drained_begin(bs); ++ assert(bs->quiesce_counter > 0); + + tran_add(tran, &set_aio_context, state); + +@@ -7720,6 +7739,8 @@ int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx, + if (ignore_child) { + g_hash_table_add(visited, ignore_child); + } ++ bdrv_drain_all_begin(); ++ bdrv_graph_rdlock_main_loop(); + ret = bdrv_change_aio_context(bs, ctx, visited, tran, errp); + g_hash_table_destroy(visited); + +@@ -7733,10 +7754,14 @@ int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx, + if (!ret) { + /* Just run clean() callbacks. No AioContext changed. */ + tran_abort(tran); ++ bdrv_graph_rdunlock_main_loop(); ++ bdrv_drain_all_end(); + return -EPERM; + } + + tran_commit(tran); ++ bdrv_graph_rdunlock_main_loop(); ++ bdrv_drain_all_end(); + return 0; + } + +diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h +index 307dc56ed8..5d34e6a510 100644 +--- a/include/block/block_int-common.h ++++ b/include/block/block_int-common.h +@@ -987,6 +987,18 @@ struct BdrvChildClass { + bool backing_mask_protocol, + Error **errp); + ++ /* ++ * Notifies the parent that the child is trying to change its AioContext. ++ * The parent may in turn change the AioContext of other nodes in the same ++ * transaction. Returns true if the change is possible and the transaction ++ * can be continued. Returns false and sets @errp if not and the transaction ++ * must be aborted. ++ * ++ * @visited will accumulate all visited BdrvChild objects. The caller is ++ * responsible for freeing the list afterwards. ++ * ++ * Must be called with the affected block nodes drained. ++ */ + bool GRAPH_RDLOCK_PTR (*change_aio_ctx)(BdrvChild *child, AioContext *ctx, + GHashTable *visited, + Transaction *tran, Error **errp); +-- +2.39.3 + diff --git a/kvm-block-move-drain-outside-of-bdrv_root_attach_child.patch b/kvm-block-move-drain-outside-of-bdrv_root_attach_child.patch new file mode 100644 index 0000000..3cb6ca8 --- /dev/null +++ b/kvm-block-move-drain-outside-of-bdrv_root_attach_child.patch @@ -0,0 +1,283 @@ +From 4629f2201a220c7775df5a305f22d51ba7a34641 Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:49 +0200 +Subject: [PATCH 24/33] block: move drain outside of bdrv_root_attach_child() + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [12/21] fa142303dc37b7dc3555900f6e86d5d8241cb36b (kmwolf/centos-qemu-kvm) + +This is part of resolving the deadlock mentioned in commit "block: +move draining out of bdrv_change_aio_context() and mark GRAPH_RDLOCK". + +The function bdrv_root_attach_child() runs under the graph lock, so it +is not allowed to drain. It is called by: +1. blk_insert_bs(), where a drained section is introduced. +2. block_job_add_bdrv(), which holds the graph lock itself. + +block_job_add_bdrv() is called by: +1. mirror_start_job() +2. stream_start() +3. commit_start() +4. backup_job_create() +5. block_job_create() +6. In the test_blockjob_common_drain_node() unit test + +In all callers, a drained section is introduced. + +Signed-off-by: Fiona Ebner +Reviewed-by: Kevin Wolf +Message-ID: <20250530151125.955508-13-f.ebner@proxmox.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit ffdcd081f52544f065020c780a6c522dace6b0af) +Signed-off-by: Kevin Wolf +--- + block.c | 4 ++-- + block/backup.c | 2 ++ + block/block-backend.c | 2 ++ + block/commit.c | 4 ++++ + block/mirror.c | 5 +++++ + block/stream.c | 4 ++++ + blockjob.c | 4 ++++ + include/block/blockjob.h | 2 ++ + tests/unit/test-bdrv-drain.c | 2 ++ + 9 files changed, 27 insertions(+), 2 deletions(-) + +diff --git a/block.c b/block.c +index d84b8ae49e..536a017201 100644 +--- a/block.c ++++ b/block.c +@@ -3228,6 +3228,8 @@ bdrv_attach_child_noperm(BlockDriverState *parent_bs, + * + * On failure NULL is returned, errp is set and the reference to + * child_bs is also dropped. ++ * ++ * All block nodes must be drained. + */ + BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, + const char *child_name, +@@ -3242,7 +3244,6 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, + + GLOBAL_STATE_CODE(); + +- bdrv_drain_all_begin(); + child = bdrv_attach_child_common(child_bs, child_name, child_class, + child_role, perm, shared_perm, opaque, + tran, errp); +@@ -3255,7 +3256,6 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, + + out: + tran_finalize(tran, ret); +- bdrv_drain_all_end(); + + bdrv_schedule_unref(child_bs); + +diff --git a/block/backup.c b/block/backup.c +index 79652bf57b..9d55e55b79 100644 +--- a/block/backup.c ++++ b/block/backup.c +@@ -497,10 +497,12 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, + block_copy_set_speed(bcs, speed); + + /* Required permissions are taken by copy-before-write filter target */ ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL, + &error_abort); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + return &job->common; + +diff --git a/block/block-backend.c b/block/block-backend.c +index 6a6949edeb..24cae3cb55 100644 +--- a/block/block-backend.c ++++ b/block/block-backend.c +@@ -904,6 +904,7 @@ int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp) + + GLOBAL_STATE_CODE(); + bdrv_ref(bs); ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + + if ((bs->open_flags & BDRV_O_INACTIVE) && blk_can_inactivate(blk)) { +@@ -919,6 +920,7 @@ int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp) + BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, + perm, shared_perm, blk, errp); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + if (blk->root == NULL) { + return -EPERM; + } +diff --git a/block/commit.c b/block/commit.c +index 5df3d05346..6c06b894ff 100644 +--- a/block/commit.c ++++ b/block/commit.c +@@ -342,6 +342,7 @@ void commit_start(const char *job_id, BlockDriverState *bs, + * this is the responsibility of the interface (i.e. whoever calls + * commit_start()). + */ ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + s->base_overlay = bdrv_find_overlay(top, base); + assert(s->base_overlay); +@@ -374,18 +375,21 @@ void commit_start(const char *job_id, BlockDriverState *bs, + iter_shared_perms, errp); + if (ret < 0) { + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + goto fail; + } + } + + if (bdrv_freeze_backing_chain(commit_top_bs, base, errp) < 0) { + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + goto fail; + } + s->chain_frozen = true; + + ret = block_job_add_bdrv(&s->common, "base", base, 0, BLK_PERM_ALL, errp); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + if (ret < 0) { + goto fail; +diff --git a/block/mirror.c b/block/mirror.c +index c2c5099c95..6e8caf4b49 100644 +--- a/block/mirror.c ++++ b/block/mirror.c +@@ -2014,6 +2014,7 @@ static BlockJob *mirror_start_job( + */ + bdrv_disable_dirty_bitmap(s->dirty_bitmap); + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + ret = block_job_add_bdrv(&s->common, "source", bs, 0, + BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE | +@@ -2021,6 +2022,7 @@ static BlockJob *mirror_start_job( + errp); + if (ret < 0) { + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + goto fail; + } + +@@ -2066,16 +2068,19 @@ static BlockJob *mirror_start_job( + iter_shared_perms, errp); + if (ret < 0) { + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + goto fail; + } + } + + if (bdrv_freeze_backing_chain(mirror_top_bs, target, errp) < 0) { + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + goto fail; + } + } + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + QTAILQ_INIT(&s->ops_in_flight); + +diff --git a/block/stream.c b/block/stream.c +index 6ba49cffd3..f5441f27f4 100644 +--- a/block/stream.c ++++ b/block/stream.c +@@ -371,10 +371,12 @@ void stream_start(const char *job_id, BlockDriverState *bs, + * already have our own plans. Also don't allow resize as the image size is + * queried only at the job start and then cached. + */ ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + if (block_job_add_bdrv(&s->common, "active node", bs, 0, + basic_flags | BLK_PERM_WRITE, errp)) { + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + goto fail; + } + +@@ -395,10 +397,12 @@ void stream_start(const char *job_id, BlockDriverState *bs, + basic_flags, errp); + if (ret < 0) { + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + goto fail; + } + } + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + s->base_overlay = base_overlay; + s->above_base = above_base; +diff --git a/blockjob.c b/blockjob.c +index 34185d7715..44991e3ff7 100644 +--- a/blockjob.c ++++ b/blockjob.c +@@ -496,6 +496,7 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver, + int ret; + GLOBAL_STATE_CODE(); + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + + if (job_id == NULL && !(flags & JOB_INTERNAL)) { +@@ -506,6 +507,7 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver, + flags, cb, opaque, errp); + if (job == NULL) { + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + return NULL; + } + +@@ -544,10 +546,12 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver, + } + + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + return job; + + fail: + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + job_early_fail(&job->job); + return NULL; + } +diff --git a/include/block/blockjob.h b/include/block/blockjob.h +index 7061ab7201..990f3e179a 100644 +--- a/include/block/blockjob.h ++++ b/include/block/blockjob.h +@@ -137,6 +137,8 @@ BlockJob *block_job_get_locked(const char *id); + * Add @bs to the list of BlockDriverState that are involved in + * @job. This means that all operations will be blocked on @bs while + * @job exists. ++ * ++ * All block nodes must be drained. + */ + int GRAPH_WRLOCK + block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs, +diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c +index 3185f3f429..4f3057844b 100644 +--- a/tests/unit/test-bdrv-drain.c ++++ b/tests/unit/test-bdrv-drain.c +@@ -772,9 +772,11 @@ static void test_blockjob_common_drain_node(enum drain_type drain_type, + tjob->bs = src; + job = &tjob->common; + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + block_job_add_bdrv(job, "target", target, 0, BLK_PERM_ALL, &error_abort); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + switch (result) { + case TEST_JOB_SUCCESS: +-- +2.39.3 + diff --git a/kvm-block-move-drain-outside-of-bdrv_root_unref_child.patch b/kvm-block-move-drain-outside-of-bdrv_root_unref_child.patch new file mode 100644 index 0000000..bafe83f --- /dev/null +++ b/kvm-block-move-drain-outside-of-bdrv_root_unref_child.patch @@ -0,0 +1,405 @@ +From 1ad00825750a515baf5bd9185bbc31549a61568b Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:52 +0200 +Subject: [PATCH 27/33] block: move drain outside of bdrv_root_unref_child() + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [15/21] 05cc374ad5ce3f3ae4b3e5d4024c67141acf2cc2 (kmwolf/centos-qemu-kvm) + +This is part of resolving the deadlock mentioned in commit "block: +move draining out of bdrv_change_aio_context() and mark GRAPH_RDLOCK". + +bdrv_root_unref_child() is called by: +1. blk_remove_bs(), where a drained section is introduced. +2. bdrv_unref_child(), which runs under the graph lock, so the drain + will be moved further up to its callers. +3. block_job_remove_all_bdrv(), where a drained section is introduced. + +For all callers of bdrv_unref_child() and its generated +bdrv_co_unref_child() coroutine variant, a drained section is +introduced, they are not explicilty listed here. The caller +quorum_del_child() holds the graph lock, so it is not actually allowed +to drain. This will be addressed in the next commit. + +Signed-off-by: Fiona Ebner +Message-ID: <20250530151125.955508-16-f.ebner@proxmox.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit b13f54654546cbc0661d3fe9d25f7543535c2bee) +Signed-off-by: Kevin Wolf +--- + block.c | 18 ++++++++++++++---- + block/blklogwrites.c | 4 ++++ + block/blkverify.c | 2 ++ + block/block-backend.c | 2 ++ + block/qcow2.c | 4 ++++ + block/quorum.c | 6 ++++++ + block/replication.c | 2 ++ + block/snapshot.c | 2 ++ + block/vmdk.c | 10 ++++++++++ + blockjob.c | 2 ++ + tests/unit/test-bdrv-drain.c | 4 ++++ + 11 files changed, 52 insertions(+), 4 deletions(-) + +diff --git a/block.c b/block.c +index d7425ff971..51bc084b1e 100644 +--- a/block.c ++++ b/block.c +@@ -1721,12 +1721,14 @@ bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name, + open_failed: + bs->drv = NULL; + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + if (bs->file != NULL) { + bdrv_unref_child(bs, bs->file); + assert(!bs->file); + } + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + g_free(bs->opaque); + bs->opaque = NULL; +@@ -3305,7 +3307,11 @@ out: + return ret < 0 ? NULL : child; + } + +-/* Callers must ensure that child->frozen is false. */ ++/* ++ * Callers must ensure that child->frozen is false. ++ * ++ * All block nodes must be drained. ++ */ + void bdrv_root_unref_child(BdrvChild *child) + { + BlockDriverState *child_bs = child->bs; +@@ -3326,10 +3332,8 @@ void bdrv_root_unref_child(BdrvChild *child) + * When the parent requiring a non-default AioContext is removed, the + * node moves back to the main AioContext + */ +- bdrv_drain_all_begin(); + bdrv_try_change_aio_context_locked(child_bs, qemu_get_aio_context(), + NULL, NULL); +- bdrv_drain_all_end(); + } + + bdrv_schedule_unref(child_bs); +@@ -3402,7 +3406,11 @@ bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child, + } + } + +-/* Callers must ensure that child->frozen is false. */ ++/* ++ * Callers must ensure that child->frozen is false. ++ * ++ * All block nodes must be drained. ++ */ + void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child) + { + GLOBAL_STATE_CODE(); +@@ -5172,6 +5180,7 @@ static void bdrv_close(BlockDriverState *bs) + bs->drv = NULL; + } + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + QLIST_FOREACH_SAFE(child, &bs->children, next, next) { + bdrv_unref_child(bs, child); +@@ -5180,6 +5189,7 @@ static void bdrv_close(BlockDriverState *bs) + assert(!bs->backing); + assert(!bs->file); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + g_free(bs->opaque); + bs->opaque = NULL; +diff --git a/block/blklogwrites.c b/block/blklogwrites.c +index b0f78c4bc7..70ac76f401 100644 +--- a/block/blklogwrites.c ++++ b/block/blklogwrites.c +@@ -281,9 +281,11 @@ static int blk_log_writes_open(BlockDriverState *bs, QDict *options, int flags, + ret = 0; + fail_log: + if (ret < 0) { ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_unref_child(bs, s->log_file); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + s->log_file = NULL; + qemu_mutex_destroy(&s->mutex); + } +@@ -296,10 +298,12 @@ static void blk_log_writes_close(BlockDriverState *bs) + { + BDRVBlkLogWritesState *s = bs->opaque; + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_unref_child(bs, s->log_file); + s->log_file = NULL; + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + qemu_mutex_destroy(&s->mutex); + } + +diff --git a/block/blkverify.c b/block/blkverify.c +index db79a36681..3a71f7498c 100644 +--- a/block/blkverify.c ++++ b/block/blkverify.c +@@ -151,10 +151,12 @@ static void blkverify_close(BlockDriverState *bs) + { + BDRVBlkverifyState *s = bs->opaque; + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_unref_child(bs, s->test_file); + s->test_file = NULL; + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + } + + static int64_t coroutine_fn GRAPH_RDLOCK +diff --git a/block/block-backend.c b/block/block-backend.c +index 24cae3cb55..68209bb2f7 100644 +--- a/block/block-backend.c ++++ b/block/block-backend.c +@@ -889,9 +889,11 @@ void blk_remove_bs(BlockBackend *blk) + root = blk->root; + blk->root = NULL; + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_root_unref_child(root); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + } + + /* +diff --git a/block/qcow2.c b/block/qcow2.c +index 9fc96ba99a..9480598b6d 100644 +--- a/block/qcow2.c ++++ b/block/qcow2.c +@@ -1901,7 +1901,9 @@ qcow2_do_open(BlockDriverState *bs, QDict *options, int flags, + g_free(s->image_data_file); + if (open_data_file && has_data_file(bs)) { + bdrv_graph_co_rdunlock(); ++ bdrv_drain_all_begin(); + bdrv_co_unref_child(bs, s->data_file); ++ bdrv_drain_all_end(); + bdrv_graph_co_rdlock(); + s->data_file = NULL; + } +@@ -2827,9 +2829,11 @@ qcow2_do_close(BlockDriverState *bs, bool close_data_file) + if (close_data_file && has_data_file(bs)) { + GLOBAL_STATE_CODE(); + bdrv_graph_rdunlock_main_loop(); ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_unref_child(bs, s->data_file); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + s->data_file = NULL; + bdrv_graph_rdlock_main_loop(); + } +diff --git a/block/quorum.c b/block/quorum.c +index ed8ce801ee..81407a38ee 100644 +--- a/block/quorum.c ++++ b/block/quorum.c +@@ -1037,6 +1037,7 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, + + close_exit: + /* cleanup on error */ ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + for (i = 0; i < s->num_children; i++) { + if (!opened[i]) { +@@ -1045,6 +1046,7 @@ close_exit: + bdrv_unref_child(bs, s->children[i]); + } + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + g_free(s->children); + g_free(opened); + exit: +@@ -1057,11 +1059,13 @@ static void quorum_close(BlockDriverState *bs) + BDRVQuorumState *s = bs->opaque; + int i; + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + for (i = 0; i < s->num_children; i++) { + bdrv_unref_child(bs, s->children[i]); + } + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + g_free(s->children); + } +@@ -1143,7 +1147,9 @@ quorum_del_child(BlockDriverState *bs, BdrvChild *child, Error **errp) + (s->num_children - i - 1) * sizeof(BdrvChild *)); + s->children = g_renew(BdrvChild *, s->children, --s->num_children); + ++ bdrv_drain_all_begin(); + bdrv_unref_child(bs, child); ++ bdrv_drain_all_end(); + + quorum_refresh_flags(bs); + } +diff --git a/block/replication.c b/block/replication.c +index 02814578c6..92eb432b1b 100644 +--- a/block/replication.c ++++ b/block/replication.c +@@ -655,12 +655,14 @@ static void replication_done(void *opaque, int ret) + if (ret == 0) { + s->stage = BLOCK_REPLICATION_DONE; + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_unref_child(bs, s->secondary_disk); + s->secondary_disk = NULL; + bdrv_unref_child(bs, s->hidden_disk); + s->hidden_disk = NULL; + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + s->error = 0; + } else { +diff --git a/block/snapshot.c b/block/snapshot.c +index 9f300a78bd..28c9c43621 100644 +--- a/block/snapshot.c ++++ b/block/snapshot.c +@@ -291,9 +291,11 @@ int bdrv_snapshot_goto(BlockDriverState *bs, + } + + /* .bdrv_open() will re-attach it */ ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_unref_child(bs, fallback); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + ret = bdrv_snapshot_goto(fallback_bs, snapshot_id, errp); + memset(bs->opaque, 0, drv->instance_size); +diff --git a/block/vmdk.c b/block/vmdk.c +index 9c7ab037e1..89a7250120 100644 +--- a/block/vmdk.c ++++ b/block/vmdk.c +@@ -271,6 +271,7 @@ static void vmdk_free_extents(BlockDriverState *bs) + BDRVVmdkState *s = bs->opaque; + VmdkExtent *e; + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + for (i = 0; i < s->num_extents; i++) { + e = &s->extents[i]; +@@ -283,6 +284,7 @@ static void vmdk_free_extents(BlockDriverState *bs) + } + } + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + + g_free(s->extents); + } +@@ -1247,9 +1249,11 @@ vmdk_parse_extents(const char *desc, BlockDriverState *bs, QDict *options, + 0, 0, 0, 0, 0, &extent, errp); + if (ret < 0) { + bdrv_graph_rdunlock_main_loop(); ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_unref_child(bs, extent_file); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + bdrv_graph_rdlock_main_loop(); + goto out; + } +@@ -1266,9 +1270,11 @@ vmdk_parse_extents(const char *desc, BlockDriverState *bs, QDict *options, + g_free(buf); + if (ret) { + bdrv_graph_rdunlock_main_loop(); ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_unref_child(bs, extent_file); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + bdrv_graph_rdlock_main_loop(); + goto out; + } +@@ -1277,9 +1283,11 @@ vmdk_parse_extents(const char *desc, BlockDriverState *bs, QDict *options, + ret = vmdk_open_se_sparse(bs, extent_file, bs->open_flags, errp); + if (ret) { + bdrv_graph_rdunlock_main_loop(); ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_unref_child(bs, extent_file); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + bdrv_graph_rdlock_main_loop(); + goto out; + } +@@ -1287,9 +1295,11 @@ vmdk_parse_extents(const char *desc, BlockDriverState *bs, QDict *options, + } else { + error_setg(errp, "Unsupported extent type '%s'", type); + bdrv_graph_rdunlock_main_loop(); ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + bdrv_unref_child(bs, extent_file); + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + bdrv_graph_rdlock_main_loop(); + ret = -ENOTSUP; + goto out; +diff --git a/blockjob.c b/blockjob.c +index 44991e3ff7..e68181a35b 100644 +--- a/blockjob.c ++++ b/blockjob.c +@@ -198,6 +198,7 @@ void block_job_remove_all_bdrv(BlockJob *job) + * one to make sure that such a concurrent access does not attempt + * to process an already freed BdrvChild. + */ ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + while (job->nodes) { + GSList *l = job->nodes; +@@ -211,6 +212,7 @@ void block_job_remove_all_bdrv(BlockJob *job) + g_slist_free_1(l); + } + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + } + + bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs) +diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c +index ac76525e5a..59c2793725 100644 +--- a/tests/unit/test-bdrv-drain.c ++++ b/tests/unit/test-bdrv-drain.c +@@ -955,11 +955,13 @@ static void bdrv_test_top_close(BlockDriverState *bs) + { + BdrvChild *c, *next_c; + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + QLIST_FOREACH_SAFE(c, &bs->children, next, next_c) { + bdrv_unref_child(bs, c); + } + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + } + + static int coroutine_fn GRAPH_RDLOCK +@@ -1016,7 +1018,9 @@ static void coroutine_fn test_co_delete_by_drain(void *opaque) + bdrv_graph_co_rdlock(); + QLIST_FOREACH_SAFE(c, &bs->children, next, next_c) { + bdrv_graph_co_rdunlock(); ++ bdrv_drain_all_begin(); + bdrv_co_unref_child(bs, c); ++ bdrv_drain_all_end(); + bdrv_graph_co_rdlock(); + } + bdrv_graph_co_rdunlock(); +-- +2.39.3 + diff --git a/kvm-block-move-drain-outside-of-bdrv_set_backing_hd_drai.patch b/kvm-block-move-drain-outside-of-bdrv_set_backing_hd_drai.patch new file mode 100644 index 0000000..e64b8e9 --- /dev/null +++ b/kvm-block-move-drain-outside-of-bdrv_set_backing_hd_drai.patch @@ -0,0 +1,123 @@ +From e3a722681805290fced1cde3d4ac991f8278f158 Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:48 +0200 +Subject: [PATCH 23/33] block: move drain outside of + bdrv_set_backing_hd_drained() + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [11/21] ab1d93ce55b51312e3aac1867e8826286c5d49de (kmwolf/centos-qemu-kvm) + +This is part of resolving the deadlock mentioned in commit "block: +move draining out of bdrv_change_aio_context() and mark GRAPH_RDLOCK". + +The function bdrv_set_backing_hd_drained() holds the graph lock, so it +is not allowed to drain. It is called by: +1. bdrv_set_backing_hd(), where a drained section is introduced, + replacing the previously present bs-specific drains. +2. stream_prepare(), where a drained section is introduced replacing + the previously present bs-specific drains. + +The drain_bs variable in bdrv_set_backing_hd_drained() is now +superfluous and thus dropped. + +Signed-off-by: Fiona Ebner +Message-ID: <20250530151125.955508-12-f.ebner@proxmox.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit e66dbda11eab2b4a091d470f3508a4d6ca60eaf5) +Signed-off-by: Kevin Wolf +--- + block.c | 16 +++------------- + block/stream.c | 6 ++---- + 2 files changed, 5 insertions(+), 17 deletions(-) + +diff --git a/block.c b/block.c +index 2c18e0a4fa..d84b8ae49e 100644 +--- a/block.c ++++ b/block.c +@@ -3562,8 +3562,7 @@ out: + * Both @bs and @backing_hd can move to a different AioContext in this + * function. + * +- * If a backing child is already present (i.e. we're detaching a node), that +- * child node must be drained. ++ * All block nodes must be drained. + */ + int bdrv_set_backing_hd_drained(BlockDriverState *bs, + BlockDriverState *backing_hd, +@@ -3578,7 +3577,6 @@ int bdrv_set_backing_hd_drained(BlockDriverState *bs, + assert(bs->backing->bs->quiesce_counter > 0); + } + +- bdrv_drain_all_begin(); + ret = bdrv_set_file_or_backing_noperm(bs, backing_hd, true, tran, errp); + if (ret < 0) { + goto out; +@@ -3587,28 +3585,20 @@ int bdrv_set_backing_hd_drained(BlockDriverState *bs, + ret = bdrv_refresh_perms(bs, tran, errp); + out: + tran_finalize(tran, ret); +- bdrv_drain_all_end(); + return ret; + } + + int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, + Error **errp) + { +- BlockDriverState *drain_bs; + int ret; + GLOBAL_STATE_CODE(); + +- bdrv_graph_rdlock_main_loop(); +- drain_bs = bs->backing ? bs->backing->bs : bs; +- bdrv_graph_rdunlock_main_loop(); +- +- bdrv_ref(drain_bs); +- bdrv_drained_begin(drain_bs); ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + ret = bdrv_set_backing_hd_drained(bs, backing_hd, errp); + bdrv_graph_wrunlock(); +- bdrv_drained_end(drain_bs); +- bdrv_unref(drain_bs); ++ bdrv_drain_all_end(); + + return ret; + } +diff --git a/block/stream.c b/block/stream.c +index 999d9e56d4..6ba49cffd3 100644 +--- a/block/stream.c ++++ b/block/stream.c +@@ -80,11 +80,10 @@ static int stream_prepare(Job *job) + * may end up working with the wrong base node (or it might even have gone + * away by the time we want to use it). + */ +- bdrv_drained_begin(unfiltered_bs); + if (unfiltered_bs_cow) { + bdrv_ref(unfiltered_bs_cow); +- bdrv_drained_begin(unfiltered_bs_cow); + } ++ bdrv_drain_all_begin(); + + bdrv_graph_rdlock_main_loop(); + base = bdrv_filter_or_cow_bs(s->above_base); +@@ -123,11 +122,10 @@ static int stream_prepare(Job *job) + } + + out: ++ bdrv_drain_all_end(); + if (unfiltered_bs_cow) { +- bdrv_drained_end(unfiltered_bs_cow); + bdrv_unref(unfiltered_bs_cow); + } +- bdrv_drained_end(unfiltered_bs); + return ret; + } + +-- +2.39.3 + diff --git a/kvm-block-move-drain-outside-of-bdrv_try_change_aio_cont.patch b/kvm-block-move-drain-outside-of-bdrv_try_change_aio_cont.patch new file mode 100644 index 0000000..8861e99 --- /dev/null +++ b/kvm-block-move-drain-outside-of-bdrv_try_change_aio_cont.patch @@ -0,0 +1,257 @@ +From 0fc67b52f8d95a10d0bbc1d49eaf2f93c603d967 Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:46 +0200 +Subject: [PATCH 21/33] block: move drain outside of + bdrv_try_change_aio_context() + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [9/21] 9e421c675137364d37f9694b7df6ed5c434f20c6 (kmwolf/centos-qemu-kvm) + +This is part of resolving the deadlock mentioned in commit "block: +move draining out of bdrv_change_aio_context() and mark GRAPH_RDLOCK". + +Convert the function to a _locked() version that has to be called with +the graph lock held and add a convenience wrapper that has to be +called with the graph unlocked, which drains and takes the lock +itself. Since bdrv_try_change_aio_context() is global state code, the +wrapper is too. + +Callers are adapted to use the appropriate variant, depending on +whether the caller already holds the lock. In the +test_set_aio_context() unit test, prior drains can be removed, because +draining already happens inside the new wrapper. + +Note that bdrv_attach_child_common_abort(), bdrv_attach_child_common() +and bdrv_root_unref_child() hold the graph lock and are not actually +allowed to drain either. This will be addressed in the following +commits. + +Functions like qmp_blockdev_mirror() query the nodes to act on before +draining and locking. In theory, draining could invalidate those nodes. +This kind of issue is not addressed by these commits. + +Signed-off-by: Fiona Ebner +Reviewed-by: Kevin Wolf +Message-ID: <20250530151125.955508-10-f.ebner@proxmox.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit a1ea8eb5912256c0b2be16fae5d3786aebc80cb1) +Signed-off-by: Kevin Wolf +--- + block.c | 58 ++++++++++++++++++++++-------- + blockdev.c | 15 +++++--- + include/block/block-global-state.h | 8 +++-- + tests/unit/test-bdrv-drain.c | 4 --- + 4 files changed, 59 insertions(+), 26 deletions(-) + +diff --git a/block.c b/block.c +index af438ae7ff..3c2e8c5592 100644 +--- a/block.c ++++ b/block.c +@@ -3028,7 +3028,10 @@ static void GRAPH_WRLOCK bdrv_attach_child_common_abort(void *opaque) + bdrv_replace_child_noperm(s->child, NULL); + + if (bdrv_get_aio_context(bs) != s->old_child_ctx) { +- bdrv_try_change_aio_context(bs, s->old_child_ctx, NULL, &error_abort); ++ bdrv_drain_all_begin(); ++ bdrv_try_change_aio_context_locked(bs, s->old_child_ctx, NULL, ++ &error_abort); ++ bdrv_drain_all_end(); + } + + if (bdrv_child_get_parent_aio_context(s->child) != s->old_parent_ctx) { +@@ -3115,8 +3118,10 @@ bdrv_attach_child_common(BlockDriverState *child_bs, + parent_ctx = bdrv_child_get_parent_aio_context(new_child); + if (child_ctx != parent_ctx) { + Error *local_err = NULL; +- int ret = bdrv_try_change_aio_context(child_bs, parent_ctx, NULL, +- &local_err); ++ bdrv_drain_all_begin(); ++ int ret = bdrv_try_change_aio_context_locked(child_bs, parent_ctx, NULL, ++ &local_err); ++ bdrv_drain_all_end(); + + if (ret < 0 && child_class->change_aio_ctx) { + Transaction *aio_ctx_tran = tran_new(); +@@ -3319,8 +3324,10 @@ void bdrv_root_unref_child(BdrvChild *child) + * When the parent requiring a non-default AioContext is removed, the + * node moves back to the main AioContext + */ +- bdrv_try_change_aio_context(child_bs, qemu_get_aio_context(), NULL, +- NULL); ++ bdrv_drain_all_begin(); ++ bdrv_try_change_aio_context_locked(child_bs, qemu_get_aio_context(), ++ NULL, NULL); ++ bdrv_drain_all_end(); + } + + bdrv_schedule_unref(child_bs); +@@ -7719,9 +7726,13 @@ bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx, + * + * If ignore_child is not NULL, that child (and its subgraph) will not + * be touched. ++ * ++ * Called with the graph lock held. ++ * ++ * Called while all bs are drained. + */ +-int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx, +- BdrvChild *ignore_child, Error **errp) ++int bdrv_try_change_aio_context_locked(BlockDriverState *bs, AioContext *ctx, ++ BdrvChild *ignore_child, Error **errp) + { + Transaction *tran; + GHashTable *visited; +@@ -7730,17 +7741,15 @@ int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx, + + /* + * Recursion phase: go through all nodes of the graph. +- * Take care of checking that all nodes support changing AioContext +- * and drain them, building a linear list of callbacks to run if everything +- * is successful (the transaction itself). ++ * Take care of checking that all nodes support changing AioContext, ++ * building a linear list of callbacks to run if everything is successful ++ * (the transaction itself). + */ + tran = tran_new(); + visited = g_hash_table_new(NULL, NULL); + if (ignore_child) { + g_hash_table_add(visited, ignore_child); + } +- bdrv_drain_all_begin(); +- bdrv_graph_rdlock_main_loop(); + ret = bdrv_change_aio_context(bs, ctx, visited, tran, errp); + g_hash_table_destroy(visited); + +@@ -7754,15 +7763,34 @@ int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx, + if (!ret) { + /* Just run clean() callbacks. No AioContext changed. */ + tran_abort(tran); +- bdrv_graph_rdunlock_main_loop(); +- bdrv_drain_all_end(); + return -EPERM; + } + + tran_commit(tran); ++ return 0; ++} ++ ++/* ++ * Change bs's and recursively all of its parents' and children's AioContext ++ * to the given new context, returning an error if that isn't possible. ++ * ++ * If ignore_child is not NULL, that child (and its subgraph) will not ++ * be touched. ++ */ ++int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx, ++ BdrvChild *ignore_child, Error **errp) ++{ ++ int ret; ++ ++ GLOBAL_STATE_CODE(); ++ ++ bdrv_drain_all_begin(); ++ bdrv_graph_rdlock_main_loop(); ++ ret = bdrv_try_change_aio_context_locked(bs, ctx, ignore_child, errp); + bdrv_graph_rdunlock_main_loop(); + bdrv_drain_all_end(); +- return 0; ++ ++ return ret; + } + + void bdrv_add_aio_context_notifier(BlockDriverState *bs, +diff --git a/blockdev.c b/blockdev.c +index efa7d1d0b2..41b6481c9a 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -3595,12 +3595,13 @@ void qmp_x_blockdev_set_iothread(const char *node_name, StrOrNull *iothread, + AioContext *new_context; + BlockDriverState *bs; + +- GRAPH_RDLOCK_GUARD_MAINLOOP(); ++ bdrv_drain_all_begin(); ++ bdrv_graph_rdlock_main_loop(); + + bs = bdrv_find_node(node_name); + if (!bs) { + error_setg(errp, "Failed to find node with node-name='%s'", node_name); +- return; ++ goto out; + } + + /* Protects against accidents. */ +@@ -3608,14 +3609,14 @@ void qmp_x_blockdev_set_iothread(const char *node_name, StrOrNull *iothread, + error_setg(errp, "Node %s is associated with a BlockBackend and could " + "be in use (use force=true to override this check)", + node_name); +- return; ++ goto out; + } + + if (iothread->type == QTYPE_QSTRING) { + IOThread *obj = iothread_by_id(iothread->u.s); + if (!obj) { + error_setg(errp, "Cannot find iothread %s", iothread->u.s); +- return; ++ goto out; + } + + new_context = iothread_get_aio_context(obj); +@@ -3623,7 +3624,11 @@ void qmp_x_blockdev_set_iothread(const char *node_name, StrOrNull *iothread, + new_context = qemu_get_aio_context(); + } + +- bdrv_try_change_aio_context(bs, new_context, NULL, errp); ++ bdrv_try_change_aio_context_locked(bs, new_context, NULL, errp); ++ ++out: ++ bdrv_graph_rdunlock_main_loop(); ++ bdrv_drain_all_end(); + } + + QemuOptsList qemu_common_drive_opts = { +diff --git a/include/block/block-global-state.h b/include/block/block-global-state.h +index aad160956a..91f249b5ad 100644 +--- a/include/block/block-global-state.h ++++ b/include/block/block-global-state.h +@@ -278,8 +278,12 @@ bool GRAPH_RDLOCK + bdrv_child_change_aio_context(BdrvChild *c, AioContext *ctx, + GHashTable *visited, Transaction *tran, + Error **errp); +-int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx, +- BdrvChild *ignore_child, Error **errp); ++int GRAPH_UNLOCKED ++bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx, ++ BdrvChild *ignore_child, Error **errp); ++int GRAPH_RDLOCK ++bdrv_try_change_aio_context_locked(BlockDriverState *bs, AioContext *ctx, ++ BdrvChild *ignore_child, Error **errp); + + int GRAPH_RDLOCK bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz); + int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo); +diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c +index 290cd2a70e..3185f3f429 100644 +--- a/tests/unit/test-bdrv-drain.c ++++ b/tests/unit/test-bdrv-drain.c +@@ -1396,14 +1396,10 @@ static void test_set_aio_context(void) + bs = bdrv_new_open_driver(&bdrv_test, "test-node", BDRV_O_RDWR, + &error_abort); + +- bdrv_drained_begin(bs); + bdrv_try_change_aio_context(bs, ctx_a, NULL, &error_abort); +- bdrv_drained_end(bs); + +- bdrv_drained_begin(bs); + bdrv_try_change_aio_context(bs, ctx_b, NULL, &error_abort); + bdrv_try_change_aio_context(bs, qemu_get_aio_context(), NULL, &error_abort); +- bdrv_drained_end(bs); + + bdrv_unref(bs); + iothread_join(a); +-- +2.39.3 + diff --git a/kvm-block-move-drain-outside-of-quorum_add_child.patch b/kvm-block-move-drain-outside-of-quorum_add_child.patch new file mode 100644 index 0000000..a4edd01 --- /dev/null +++ b/kvm-block-move-drain-outside-of-quorum_add_child.patch @@ -0,0 +1,120 @@ +From af295ac08ffca4efd6f10a2d1a38eaa8d09d8e6f Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:51 +0200 +Subject: [PATCH 26/33] block: move drain outside of quorum_add_child() + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [14/21] 1079c7996e39d09d021d210d974522fba2e5515d (kmwolf/centos-qemu-kvm) + +This is part of resolving the deadlock mentioned in commit "block: +move draining out of bdrv_change_aio_context() and mark GRAPH_RDLOCK". + +The quorum_add_child() callback runs under the graph lock, so it is +not allowed to drain. It is only called as the .bdrv_add_child() +callback, which is only called in the bdrv_add_child() function, which +also runs under the graph lock. + +The bdrv_add_child() function is called by qmp_x_blockdev_change(), +where a drained section is introduced. + +Signed-off-by: Fiona Ebner +Message-ID: <20250530151125.955508-15-f.ebner@proxmox.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit 0414930d3adfa89299eaea5ce92accab15d9fba5) +Signed-off-by: Kevin Wolf +--- + block.c | 10 ++++++++-- + block/quorum.c | 2 -- + blockdev.c | 2 ++ + include/block/block_int-common.h | 7 +++++++ + 4 files changed, 17 insertions(+), 4 deletions(-) + +diff --git a/block.c b/block.c +index 3857f42877..d7425ff971 100644 +--- a/block.c ++++ b/block.c +@@ -8220,8 +8220,10 @@ char *bdrv_dirname(BlockDriverState *bs, Error **errp) + } + + /* +- * Hot add/remove a BDS's child. So the user can take a child offline when +- * it is broken and take a new child online ++ * Hot add a BDS's child. Used in combination with bdrv_del_child, so the user ++ * can take a child offline when it is broken and take a new child online. ++ * ++ * All block nodes must be drained. + */ + void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs, + Error **errp) +@@ -8261,6 +8263,10 @@ void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs, + parent_bs->drv->bdrv_add_child(parent_bs, child_bs, errp); + } + ++/* ++ * Hot remove a BDS's child. Used in combination with bdrv_add_child, so the ++ * user can take a child offline when it is broken and take a new child online. ++ */ + void bdrv_del_child(BlockDriverState *parent_bs, BdrvChild *child, Error **errp) + { + BdrvChild *tmp; +diff --git a/block/quorum.c b/block/quorum.c +index ea17b0ec13..ed8ce801ee 100644 +--- a/block/quorum.c ++++ b/block/quorum.c +@@ -1096,10 +1096,8 @@ quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs, Error **errp) + /* We can safely add the child now */ + bdrv_ref(child_bs); + +- bdrv_drain_all_begin(); + child = bdrv_attach_child(bs, child_bs, indexstr, &child_of_bds, + BDRV_CHILD_DATA, errp); +- bdrv_drain_all_end(); + if (child == NULL) { + s->next_child_index--; + return; +diff --git a/blockdev.c b/blockdev.c +index 41b6481c9a..8edd3e7bba 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -3525,6 +3525,7 @@ void qmp_x_blockdev_change(const char *parent, const char *child, + BlockDriverState *parent_bs, *new_bs = NULL; + BdrvChild *p_child; + ++ bdrv_drain_all_begin(); + bdrv_graph_wrlock(); + + parent_bs = bdrv_lookup_bs(parent, parent, errp); +@@ -3562,6 +3563,7 @@ void qmp_x_blockdev_change(const char *parent, const char *child, + + out: + bdrv_graph_wrunlock(); ++ bdrv_drain_all_end(); + } + + BlockJobInfoList *qmp_query_block_jobs(Error **errp) +diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h +index 5d34e6a510..8d76b37c03 100644 +--- a/include/block/block_int-common.h ++++ b/include/block/block_int-common.h +@@ -396,6 +396,13 @@ struct BlockDriver { + int GRAPH_RDLOCK_PTR (*bdrv_probe_geometry)( + BlockDriverState *bs, HDGeometry *geo); + ++ /** ++ * Hot add a BDS's child. Used in combination with bdrv_del_child, so the ++ * user can take a child offline when it is broken and take a new child ++ * online. ++ * ++ * All block nodes must be drained. ++ */ + void GRAPH_WRLOCK_PTR (*bdrv_add_child)( + BlockDriverState *parent, BlockDriverState *child, Error **errp); + +-- +2.39.3 + diff --git a/kvm-block-move-drain-outside-of-quorum_del_child.patch b/kvm-block-move-drain-outside-of-quorum_del_child.patch new file mode 100644 index 0000000..c06c556 --- /dev/null +++ b/kvm-block-move-drain-outside-of-quorum_del_child.patch @@ -0,0 +1,85 @@ +From de70d5b485006ecd92e860242634a3166b709fa8 Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:53 +0200 +Subject: [PATCH 28/33] block: move drain outside of quorum_del_child() + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [16/21] abceebd102dde266fa880f2c40ef83de25a87ed6 (kmwolf/centos-qemu-kvm) + +The quorum_del_child() callback runs under the graph lock, so it is +not allowed to drain. It is only called as the .bdrv_del_child() +callback, which is only called in the bdrv_del_child() function, which +also runs under the graph lock. + +The bdrv_del_child() function is called by qmp_x_blockdev_change(). +A drained section was already introduced there by commit "block: move +drain out of quorum_add_child()". + +This finally finishes moving out the drain to places that are not +under the graph lock started in "block: move draining out of +bdrv_change_aio_context() and mark GRAPH_RDLOCK". + +Signed-off-by: Fiona Ebner +Message-ID: <20250530151125.955508-17-f.ebner@proxmox.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit d75f8ed1d7fc27cf1643e549cd006a68d3bf6ef1) +Signed-off-by: Kevin Wolf +--- + block.c | 2 ++ + block/quorum.c | 2 -- + include/block/block_int-common.h | 7 +++++++ + 3 files changed, 9 insertions(+), 2 deletions(-) + +diff --git a/block.c b/block.c +index 51bc084b1e..309ef1349a 100644 +--- a/block.c ++++ b/block.c +@@ -8276,6 +8276,8 @@ void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs, + /* + * Hot remove a BDS's child. Used in combination with bdrv_add_child, so the + * user can take a child offline when it is broken and take a new child online. ++ * ++ * All block nodes must be drained. + */ + void bdrv_del_child(BlockDriverState *parent_bs, BdrvChild *child, Error **errp) + { +diff --git a/block/quorum.c b/block/quorum.c +index 81407a38ee..cc3bc5f4e7 100644 +--- a/block/quorum.c ++++ b/block/quorum.c +@@ -1147,9 +1147,7 @@ quorum_del_child(BlockDriverState *bs, BdrvChild *child, Error **errp) + (s->num_children - i - 1) * sizeof(BdrvChild *)); + s->children = g_renew(BdrvChild *, s->children, --s->num_children); + +- bdrv_drain_all_begin(); + bdrv_unref_child(bs, child); +- bdrv_drain_all_end(); + + quorum_refresh_flags(bs); + } +diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h +index 8d76b37c03..f33695ab74 100644 +--- a/include/block/block_int-common.h ++++ b/include/block/block_int-common.h +@@ -406,6 +406,13 @@ struct BlockDriver { + void GRAPH_WRLOCK_PTR (*bdrv_add_child)( + BlockDriverState *parent, BlockDriverState *child, Error **errp); + ++ /** ++ * Hot remove a BDS's child. Used in combination with bdrv_add_child, so the ++ * user can take a child offline when it is broken and take a new child ++ * online. ++ * ++ * All block nodes must be drained. ++ */ + void GRAPH_WRLOCK_PTR (*bdrv_del_child)( + BlockDriverState *parent, BdrvChild *child, Error **errp); + +-- +2.39.3 + diff --git a/kvm-block-move-drain-outside-of-read-locked-bdrv_inactiv.patch b/kvm-block-move-drain-outside-of-read-locked-bdrv_inactiv.patch new file mode 100644 index 0000000..5504b27 --- /dev/null +++ b/kvm-block-move-drain-outside-of-read-locked-bdrv_inactiv.patch @@ -0,0 +1,106 @@ +From a4f11515016abb663e37c9796ef4655d5e7e831b Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:41 +0200 +Subject: [PATCH 16/33] block: move drain outside of read-locked + bdrv_inactivate_recurse() + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [4/21] 56ced7ba208c22d5998ab27c70ec6a027dc2469e (kmwolf/centos-qemu-kvm) + +This is in preparation to mark bdrv_drained_begin() as GRAPH_UNLOCKED. + +More granular draining is not trivially possible, because +bdrv_inactivate_recurse() can recursively call itself. + +Signed-off-by: Fiona Ebner +Reviewed-by: Kevin Wolf +Message-ID: <20250530151125.955508-5-f.ebner@proxmox.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit 841998e08650f5b4476fa2d1eb84a592ab405f51) +Signed-off-by: Kevin Wolf +--- + block.c | 25 ++++++++++++++++++------- + 1 file changed, 18 insertions(+), 7 deletions(-) + +diff --git a/block.c b/block.c +index 85efdf9c1b..e340bac177 100644 +--- a/block.c ++++ b/block.c +@@ -6989,6 +6989,8 @@ bdrv_inactivate_recurse(BlockDriverState *bs, bool top_level) + + GLOBAL_STATE_CODE(); + ++ assert(bs->quiesce_counter > 0); ++ + if (!bs->drv) { + return -ENOMEDIUM; + } +@@ -7032,9 +7034,7 @@ bdrv_inactivate_recurse(BlockDriverState *bs, bool top_level) + return -EPERM; + } + +- bdrv_drained_begin(bs); + bs->open_flags |= BDRV_O_INACTIVE; +- bdrv_drained_end(bs); + + /* + * Update permissions, they may differ for inactive nodes. +@@ -7059,20 +7059,26 @@ int bdrv_inactivate(BlockDriverState *bs, Error **errp) + int ret; + + GLOBAL_STATE_CODE(); +- GRAPH_RDLOCK_GUARD_MAINLOOP(); ++ ++ bdrv_drain_all_begin(); ++ bdrv_graph_rdlock_main_loop(); + + if (bdrv_has_bds_parent(bs, true)) { + error_setg(errp, "Node has active parent node"); +- return -EPERM; ++ ret = -EPERM; ++ goto out; + } + + ret = bdrv_inactivate_recurse(bs, true); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to inactivate node"); +- return ret; ++ goto out; + } + +- return 0; ++out: ++ bdrv_graph_rdunlock_main_loop(); ++ bdrv_drain_all_end(); ++ return ret; + } + + int bdrv_inactivate_all(void) +@@ -7082,7 +7088,9 @@ int bdrv_inactivate_all(void) + int ret = 0; + + GLOBAL_STATE_CODE(); +- GRAPH_RDLOCK_GUARD_MAINLOOP(); ++ ++ bdrv_drain_all_begin(); ++ bdrv_graph_rdlock_main_loop(); + + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + /* Nodes with BDS parents are covered by recursion from the last +@@ -7098,6 +7106,9 @@ int bdrv_inactivate_all(void) + } + } + ++ bdrv_graph_rdunlock_main_loop(); ++ bdrv_drain_all_end(); ++ + return ret; + } + +-- +2.39.3 + diff --git a/kvm-block-move-drain-outside-of-read-locked-bdrv_reopen_.patch b/kvm-block-move-drain-outside-of-read-locked-bdrv_reopen_.patch new file mode 100644 index 0000000..03d8bbc --- /dev/null +++ b/kvm-block-move-drain-outside-of-read-locked-bdrv_reopen_.patch @@ -0,0 +1,87 @@ +From c5eb44ccd03e327f15977acccfeaf23a047e0dc6 Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:39 +0200 +Subject: [PATCH 14/33] block: move drain outside of read-locked + bdrv_reopen_queue_child() + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [2/21] 726503557f318087c2142ee95a8193f22d7df5b2 (kmwolf/centos-qemu-kvm) + +This is in preparation to mark bdrv_drained_begin() as GRAPH_UNLOCKED. + +More granular draining is not trivially possible, because +bdrv_reopen_queue_child() can recursively call itself. + +Signed-off-by: Fiona Ebner +Reviewed-by: Kevin Wolf +Message-ID: <20250530151125.955508-3-f.ebner@proxmox.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit e1d681b3e1d8256047dbfc6d2c796028b9694eaf) +Signed-off-by: Kevin Wolf +--- + block.c | 19 +++++++++++-------- + 1 file changed, 11 insertions(+), 8 deletions(-) + +diff --git a/block.c b/block.c +index 9346486ac6..85efdf9c1b 100644 +--- a/block.c ++++ b/block.c +@@ -4358,7 +4358,7 @@ bdrv_recurse_has_child(BlockDriverState *bs, BlockDriverState *child) + * returns a pointer to bs_queue, which is either the newly allocated + * bs_queue, or the existing bs_queue being used. + * +- * bs is drained here and undrained by bdrv_reopen_queue_free(). ++ * bs must be drained. + */ + static BlockReopenQueue * GRAPH_RDLOCK + bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, BlockDriverState *bs, +@@ -4377,12 +4377,7 @@ bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, BlockDriverState *bs, + + GLOBAL_STATE_CODE(); + +- /* +- * Strictly speaking, draining is illegal under GRAPH_RDLOCK. We know that +- * we've been called with bdrv_graph_rdlock_main_loop(), though, so it's ok +- * in practice. +- */ +- bdrv_drained_begin(bs); ++ assert(bs->quiesce_counter > 0); + + if (bs_queue == NULL) { + bs_queue = g_new0(BlockReopenQueue, 1); +@@ -4522,6 +4517,12 @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, + QDict *options, bool keep_old_opts) + { + GLOBAL_STATE_CODE(); ++ ++ if (bs_queue == NULL) { ++ /* Paired with bdrv_drain_all_end() in bdrv_reopen_queue_free(). */ ++ bdrv_drain_all_begin(); ++ } ++ + GRAPH_RDLOCK_GUARD_MAINLOOP(); + + return bdrv_reopen_queue_child(bs_queue, bs, options, NULL, 0, false, +@@ -4534,12 +4535,14 @@ void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue) + if (bs_queue) { + BlockReopenQueueEntry *bs_entry, *next; + QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) { +- bdrv_drained_end(bs_entry->state.bs); + qobject_unref(bs_entry->state.explicit_options); + qobject_unref(bs_entry->state.options); + g_free(bs_entry); + } + g_free(bs_queue); ++ ++ /* Paired with bdrv_drain_all_begin() in bdrv_reopen_queue(). */ ++ bdrv_drain_all_end(); + } + } + +-- +2.39.3 + diff --git a/kvm-block-remove-outdated-comments-about-AioContext-lock.patch b/kvm-block-remove-outdated-comments-about-AioContext-lock.patch new file mode 100644 index 0000000..9d1219b --- /dev/null +++ b/kvm-block-remove-outdated-comments-about-AioContext-lock.patch @@ -0,0 +1,61 @@ +From 055168058f614c9f3b8be6c0692794a6299420ab Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:38 +0200 +Subject: [PATCH 13/33] block: remove outdated comments about AioContext + locking + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [1/21] 607caf7954ab7d1cd4616335fba031cdbfa30324 (kmwolf/centos-qemu-kvm) + +AioContext locking was removed in commit b49f4755c7 ("block: remove +AioContext locking"). + +Signed-off-by: Fiona Ebner +Reviewed-by: Kevin Wolf +Message-ID: <20250530151125.955508-2-f.ebner@proxmox.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit f1bf3be14bd5d6e6a2cfbbe64cdd4d58a8595d68) +Signed-off-by: Kevin Wolf +--- + block.c | 7 ------- + 1 file changed, 7 deletions(-) + +diff --git a/block.c b/block.c +index 0ece805e41..9346486ac6 100644 +--- a/block.c ++++ b/block.c +@@ -4359,8 +4359,6 @@ bdrv_recurse_has_child(BlockDriverState *bs, BlockDriverState *child) + * bs_queue, or the existing bs_queue being used. + * + * bs is drained here and undrained by bdrv_reopen_queue_free(). +- * +- * To be called with bs->aio_context locked. + */ + static BlockReopenQueue * GRAPH_RDLOCK + bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, BlockDriverState *bs, +@@ -4519,7 +4517,6 @@ bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, BlockDriverState *bs, + return bs_queue; + } + +-/* To be called with bs->aio_context locked */ + BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, + BlockDriverState *bs, + QDict *options, bool keep_old_opts) +@@ -7278,10 +7275,6 @@ bool bdrv_op_blocker_is_empty(BlockDriverState *bs) + return true; + } + +-/* +- * Must not be called while holding the lock of an AioContext other than the +- * current one. +- */ + void bdrv_img_create(const char *filename, const char *fmt, + const char *base_filename, const char *base_fmt, + char *options, uint64_t img_size, int flags, bool quiet, +-- +2.39.3 + diff --git a/kvm-block-snapshot-move-drain-outside-of-read-locked-bdr.patch b/kvm-block-snapshot-move-drain-outside-of-read-locked-bdr.patch new file mode 100644 index 0000000..3f4f810 --- /dev/null +++ b/kvm-block-snapshot-move-drain-outside-of-read-locked-bdr.patch @@ -0,0 +1,231 @@ +From 5c8408a7d6b0f7a66de2bfa31ef228b1817200ec Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:40 +0200 +Subject: [PATCH 15/33] block/snapshot: move drain outside of read-locked + bdrv_snapshot_delete() + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [3/21] 4263420635ed51a5df458d079f60618c53120d87 (kmwolf/centos-qemu-kvm) + +This is in preparation to mark bdrv_drained_begin() as GRAPH_UNLOCKED. + +More granular draining is not trivially possible, because +bdrv_snapshot_delete() can recursively call itself. + +The return value of bdrv_all_delete_snapshot() changes from -1 to +-errno propagated from failed sub-calls. This is fine for the existing +callers of bdrv_all_delete_snapshot(). + +Signed-off-by: Fiona Ebner +Reviewed-by: Kevin Wolf +Message-ID: <20250530151125.955508-4-f.ebner@proxmox.com> +Signed-off-by: Kevin Wolf +(cherry picked from commit d4c5f8c980f1073356d2f18d51dc68d42bebb59d) +Signed-off-by: Kevin Wolf +--- + block/snapshot.c | 26 +++++++++++++++----------- + blockdev.c | 25 +++++++++++++++++-------- + qemu-img.c | 2 ++ + 3 files changed, 34 insertions(+), 19 deletions(-) + +diff --git a/block/snapshot.c b/block/snapshot.c +index 22567f1fb9..9f300a78bd 100644 +--- a/block/snapshot.c ++++ b/block/snapshot.c +@@ -327,7 +327,7 @@ int bdrv_snapshot_goto(BlockDriverState *bs, + + /** + * Delete an internal snapshot by @snapshot_id and @name. +- * @bs: block device used in the operation ++ * @bs: block device used in the operation, must be drained + * @snapshot_id: unique snapshot ID, or NULL + * @name: snapshot name, or NULL + * @errp: location to store error +@@ -358,6 +358,8 @@ int bdrv_snapshot_delete(BlockDriverState *bs, + + GLOBAL_STATE_CODE(); + ++ assert(bs->quiesce_counter > 0); ++ + if (!drv) { + error_setg(errp, "Device '%s' has no medium", + bdrv_get_device_name(bs)); +@@ -368,9 +370,6 @@ int bdrv_snapshot_delete(BlockDriverState *bs, + return -EINVAL; + } + +- /* drain all pending i/o before deleting snapshot */ +- bdrv_drained_begin(bs); +- + if (drv->bdrv_snapshot_delete) { + ret = drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp); + } else if (fallback_bs) { +@@ -382,7 +381,6 @@ int bdrv_snapshot_delete(BlockDriverState *bs, + ret = -ENOTSUP; + } + +- bdrv_drained_end(bs); + return ret; + } + +@@ -571,19 +569,22 @@ int bdrv_all_delete_snapshot(const char *name, + ERRP_GUARD(); + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; ++ int ret = 0; + + GLOBAL_STATE_CODE(); +- GRAPH_RDLOCK_GUARD_MAINLOOP(); + +- if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { +- return -1; ++ bdrv_drain_all_begin(); ++ bdrv_graph_rdlock_main_loop(); ++ ++ ret = bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp); ++ if (ret < 0) { ++ goto out; + } + + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; + QEMUSnapshotInfo sn1, *snapshot = &sn1; +- int ret = 0; + + if ((devices || bdrv_all_snapshots_includes_bs(bs)) && + bdrv_snapshot_find(bs, snapshot, name) >= 0) +@@ -594,13 +595,16 @@ int bdrv_all_delete_snapshot(const char *name, + if (ret < 0) { + error_prepend(errp, "Could not delete snapshot '%s' on '%s': ", + name, bdrv_get_device_or_node_name(bs)); +- return -1; ++ goto out; + } + + iterbdrvs = iterbdrvs->next; + } + +- return 0; ++out: ++ bdrv_graph_rdunlock_main_loop(); ++ bdrv_drain_all_end(); ++ return ret; + } + + +diff --git a/blockdev.c b/blockdev.c +index 0fa8813efe..efa7d1d0b2 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1132,39 +1132,41 @@ SnapshotInfo *qmp_blockdev_snapshot_delete_internal_sync(const char *device, + int ret; + + GLOBAL_STATE_CODE(); +- GRAPH_RDLOCK_GUARD_MAINLOOP(); ++ ++ bdrv_drain_all_begin(); ++ bdrv_graph_rdlock_main_loop(); + + bs = qmp_get_root_bs(device, errp); + if (!bs) { +- return NULL; ++ goto error; + } + + if (!id && !name) { + error_setg(errp, "Name or id must be provided"); +- return NULL; ++ goto error; + } + + if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE, errp)) { +- return NULL; ++ goto error; + } + + ret = bdrv_snapshot_find_by_id_and_name(bs, id, name, &sn, &local_err); + if (local_err) { + error_propagate(errp, local_err); +- return NULL; ++ goto error; + } + if (!ret) { + error_setg(errp, + "Snapshot with id '%s' and name '%s' does not exist on " + "device '%s'", + STR_OR_NULL(id), STR_OR_NULL(name), device); +- return NULL; ++ goto error; + } + + bdrv_snapshot_delete(bs, id, name, &local_err); + if (local_err) { + error_propagate(errp, local_err); +- return NULL; ++ goto error; + } + + info = g_new0(SnapshotInfo, 1); +@@ -1180,6 +1182,9 @@ SnapshotInfo *qmp_blockdev_snapshot_delete_internal_sync(const char *device, + info->has_icount = true; + } + ++error: ++ bdrv_graph_rdunlock_main_loop(); ++ bdrv_drain_all_end(); + return info; + } + +@@ -1295,12 +1300,14 @@ static void internal_snapshot_abort(void *opaque) + Error *local_error = NULL; + + GLOBAL_STATE_CODE(); +- GRAPH_RDLOCK_GUARD_MAINLOOP(); + + if (!state->created) { + return; + } + ++ bdrv_drain_all_begin(); ++ bdrv_graph_rdlock_main_loop(); ++ + if (bdrv_snapshot_delete(bs, sn->id_str, sn->name, &local_error) < 0) { + error_reportf_err(local_error, + "Failed to delete snapshot with id '%s' and " +@@ -1308,6 +1315,8 @@ static void internal_snapshot_abort(void *opaque) + sn->id_str, sn->name, + bdrv_get_device_name(bs)); + } ++ bdrv_graph_rdunlock_main_loop(); ++ bdrv_drain_all_end(); + } + + static void internal_snapshot_clean(void *opaque) +diff --git a/qemu-img.c b/qemu-img.c +index 2044c22a4c..ba8412f66e 100644 +--- a/qemu-img.c ++++ b/qemu-img.c +@@ -3505,6 +3505,7 @@ static int img_snapshot(int argc, char **argv) + break; + + case SNAPSHOT_DELETE: ++ bdrv_drain_all_begin(); + bdrv_graph_rdlock_main_loop(); + ret = bdrv_snapshot_find(bs, &sn, snapshot_name); + if (ret < 0) { +@@ -3520,6 +3521,7 @@ static int img_snapshot(int argc, char **argv) + } + } + bdrv_graph_rdunlock_main_loop(); ++ bdrv_drain_all_end(); + break; + } + +-- +2.39.3 + diff --git a/kvm-blockdev-drain-while-unlocked-in-external_snapshot_a.patch b/kvm-blockdev-drain-while-unlocked-in-external_snapshot_a.patch new file mode 100644 index 0000000..bf2907b --- /dev/null +++ b/kvm-blockdev-drain-while-unlocked-in-external_snapshot_a.patch @@ -0,0 +1,70 @@ +From 96407f6d729312c373e9b2ccbf97918d453c7a52 Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:55 +0200 +Subject: [PATCH 30/33] blockdev: drain while unlocked in + external_snapshot_action() + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [18/21] b651de913db70b4897f0085af696cdaf925f5f81 (kmwolf/centos-qemu-kvm) + +This is in preparation to mark bdrv_drained_begin() as GRAPH_UNLOCKED. + +Signed-off-by: Fiona Ebner +Message-ID: <20250530151125.955508-19-f.ebner@proxmox.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit 195a8a946a8681dfe7e8aa8d49db415693db5311) +Signed-off-by: Kevin Wolf +--- + blockdev.c | 17 ++++++++++++++++- + 1 file changed, 16 insertions(+), 1 deletion(-) + +diff --git a/blockdev.c b/blockdev.c +index 2560a11a53..998dbe38a5 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1377,9 +1377,10 @@ static void external_snapshot_action(TransactionAction *action, + const char *new_image_file; + ExternalSnapshotState *state = g_new0(ExternalSnapshotState, 1); + uint64_t perm, shared; ++ BlockDriverState *check_bs; + + /* TODO We'll eventually have to take a writer lock in this function */ +- GRAPH_RDLOCK_GUARD_MAINLOOP(); ++ bdrv_graph_rdlock_main_loop(); + + tran_add(tran, &external_snapshot_drv, state); + +@@ -1412,11 +1413,25 @@ static void external_snapshot_action(TransactionAction *action, + + state->old_bs = bdrv_lookup_bs(device, node_name, errp); + if (!state->old_bs) { ++ bdrv_graph_rdunlock_main_loop(); + return; + } + ++ /* Need to drain while unlocked. */ ++ bdrv_graph_rdunlock_main_loop(); + /* Paired with .clean() */ + bdrv_drained_begin(state->old_bs); ++ GRAPH_RDLOCK_GUARD_MAINLOOP(); ++ ++ /* Make sure the associated bs did not change with the drain. */ ++ check_bs = bdrv_lookup_bs(device, node_name, errp); ++ if (state->old_bs != check_bs) { ++ if (check_bs) { ++ error_setg(errp, "Block node of device '%s' unexpectedly changed", ++ device); ++ } /* else errp is already set */ ++ return; ++ } + + if (!bdrv_is_inserted(state->old_bs)) { + error_setg(errp, "Device '%s' has no medium", +-- +2.39.3 + diff --git a/kvm-blockdev-drain-while-unlocked-in-internal_snapshot_a.patch b/kvm-blockdev-drain-while-unlocked-in-internal_snapshot_a.patch new file mode 100644 index 0000000..db9f08a --- /dev/null +++ b/kvm-blockdev-drain-while-unlocked-in-internal_snapshot_a.patch @@ -0,0 +1,80 @@ +From a2b28210cbcbb8e66815d9b38484fde3122a3ab9 Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:54 +0200 +Subject: [PATCH 29/33] blockdev: drain while unlocked in + internal_snapshot_action() + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [17/21] 8379ac5e6ec0ecf71b540d2631cf42394d985f16 (kmwolf/centos-qemu-kvm) + +This is in preparation to mark bdrv_drained_begin() as GRAPH_UNLOCKED. + +Signed-off-by: Fiona Ebner +Message-ID: <20250530151125.955508-18-f.ebner@proxmox.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit 6f101614f95c889399352b8301917c0ac7919ae7) +Signed-off-by: Kevin Wolf +--- + blockdev.c | 19 +++++++++++++++++-- + 1 file changed, 17 insertions(+), 2 deletions(-) + +diff --git a/blockdev.c b/blockdev.c +index 8edd3e7bba..2560a11a53 100644 +--- a/blockdev.c ++++ b/blockdev.c +@@ -1208,7 +1208,7 @@ static void internal_snapshot_action(BlockdevSnapshotInternal *internal, + Error *local_err = NULL; + const char *device; + const char *name; +- BlockDriverState *bs; ++ BlockDriverState *bs, *check_bs; + QEMUSnapshotInfo old_sn, *sn; + bool ret; + int64_t rt; +@@ -1216,7 +1216,7 @@ static void internal_snapshot_action(BlockdevSnapshotInternal *internal, + int ret1; + + GLOBAL_STATE_CODE(); +- GRAPH_RDLOCK_GUARD_MAINLOOP(); ++ bdrv_graph_rdlock_main_loop(); + + tran_add(tran, &internal_snapshot_drv, state); + +@@ -1225,14 +1225,29 @@ static void internal_snapshot_action(BlockdevSnapshotInternal *internal, + + bs = qmp_get_root_bs(device, errp); + if (!bs) { ++ bdrv_graph_rdunlock_main_loop(); + return; + } + + state->bs = bs; + ++ /* Need to drain while unlocked. */ ++ bdrv_graph_rdunlock_main_loop(); + /* Paired with .clean() */ + bdrv_drained_begin(bs); + ++ GRAPH_RDLOCK_GUARD_MAINLOOP(); ++ ++ /* Make sure the root bs did not change with the drain. */ ++ check_bs = qmp_get_root_bs(device, errp); ++ if (bs != check_bs) { ++ if (check_bs) { ++ error_setg(errp, "Block node of device '%s' unexpectedly changed", ++ device); ++ } /* else errp is already set */ ++ return; ++ } ++ + if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_INTERNAL_SNAPSHOT, errp)) { + return; + } +-- +2.39.3 + diff --git a/kvm-file-posix-Fix-aio-threads-performance-regression-af.patch b/kvm-file-posix-Fix-aio-threads-performance-regression-af.patch new file mode 100644 index 0000000..a077f4f --- /dev/null +++ b/kvm-file-posix-Fix-aio-threads-performance-regression-af.patch @@ -0,0 +1,106 @@ +From 53711e6aad8a6e40426ccef25e911d3cad93220a Mon Sep 17 00:00:00 2001 +From: Kevin Wolf +Date: Wed, 25 Jun 2025 10:50:19 +0200 +Subject: [PATCH 12/33] file-posix: Fix aio=threads performance regression + after enablign FUA + +RH-Author: Kevin Wolf +RH-MergeRequest: 392: file-posix: Fix aio=threads performance regression after enabling FUA +RH-Jira: RHEL-96854 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [1/1] e523c0305e3072d59ad6454afc690cabfcfb86b2 (kmwolf/centos-qemu-kvm) + +For aio=threads, we're currently not implementing REQ_FUA in any useful +way, but just do a separate raw_co_flush_to_disk() call. This changes +behaviour compared to the old state, which used bdrv_co_flush() with its +optimisations. As a quick fix, call bdrv_co_flush() again like before. +Eventually, we can use pwritev2() to make use of RWF_DSYNC if available, +but we'll still have to keep this code path as a fallback, so this fix +is required either way. + +While the fix itself is a one-liner, some new graph locking annotations +are needed to convince TSA that the locking is correct. + +Cc: qemu-stable@nongnu.org +Fixes: 984a32f17e8d ("file-posix: Support FUA writes") +Buglink: https://issues.redhat.com/browse/RHEL-96854 +Reported-by: Tingting Mao +Signed-off-by: Kevin Wolf +Message-ID: <20250625085019.27735-1-kwolf@redhat.com> +Reviewed-by: Eric Blake +Signed-off-by: Kevin Wolf +(cherry picked from commit d402da1360c2240e81f0e5fc80ddbfc6238e0da8) +Signed-off-by: Kevin Wolf +--- + block/file-posix.c | 29 +++++++++++++++-------------- + 1 file changed, 15 insertions(+), 14 deletions(-) + +diff --git a/block/file-posix.c b/block/file-posix.c +index 77a35d9ae9..d3c7dcc7e4 100644 +--- a/block/file-posix.c ++++ b/block/file-posix.c +@@ -2573,9 +2573,9 @@ static inline bool raw_check_linux_aio(BDRVRawState *s) + } + #endif + +-static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr, +- uint64_t bytes, QEMUIOVector *qiov, int type, +- int flags) ++static int coroutine_fn GRAPH_RDLOCK ++raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr, uint64_t bytes, ++ QEMUIOVector *qiov, int type, int flags) + { + BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; +@@ -2634,7 +2634,7 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr, + ret = raw_thread_pool_submit(handle_aiocb_rw, &acb); + if (ret == 0 && (flags & BDRV_REQ_FUA)) { + /* TODO Use pwritev2() instead if it's available */ +- ret = raw_co_flush_to_disk(bs); ++ ret = bdrv_co_flush(bs); + } + goto out; /* Avoid the compiler err of unused label */ + +@@ -2669,16 +2669,16 @@ out: + return ret; + } + +-static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset, +- int64_t bytes, QEMUIOVector *qiov, +- BdrvRequestFlags flags) ++static int coroutine_fn GRAPH_RDLOCK ++raw_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, ++ QEMUIOVector *qiov, BdrvRequestFlags flags) + { + return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ, flags); + } + +-static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset, +- int64_t bytes, QEMUIOVector *qiov, +- BdrvRequestFlags flags) ++static int coroutine_fn GRAPH_RDLOCK ++raw_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, ++ QEMUIOVector *qiov, BdrvRequestFlags flags) + { + return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE, flags); + } +@@ -3615,10 +3615,11 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, + #endif + + #if defined(CONFIG_BLKZONED) +-static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, +- int64_t *offset, +- QEMUIOVector *qiov, +- BdrvRequestFlags flags) { ++static int coroutine_fn GRAPH_RDLOCK ++raw_co_zone_append(BlockDriverState *bs, ++ int64_t *offset, ++ QEMUIOVector *qiov, ++ BdrvRequestFlags flags) { + assert(flags == 0); + int64_t zone_size_mask = bs->bl.zone_size - 1; + int64_t iov_len = 0; +-- +2.39.3 + diff --git a/kvm-iotests-graph-changes-while-io-add-test-case-with-re.patch b/kvm-iotests-graph-changes-while-io-add-test-case-with-re.patch new file mode 100644 index 0000000..4ec2cc5 --- /dev/null +++ b/kvm-iotests-graph-changes-while-io-add-test-case-with-re.patch @@ -0,0 +1,176 @@ +From 19aa4d70aa02db7183997cfb2e6086a125ee2cdd Mon Sep 17 00:00:00 2001 +From: Andrey Drobyshev +Date: Fri, 30 May 2025 17:10:58 +0200 +Subject: [PATCH 33/33] iotests/graph-changes-while-io: add test case with + removal of lower snapshot + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [21/21] 2382d5d35e99a8d2dc481e1ddf89b475d13a620f (kmwolf/centos-qemu-kvm) + +This case is catching potential deadlock which takes place when job-dismiss +is issued when I/O requests are processed in a separate iothread. + +See https://mail.gnu.org/archive/html/qemu-devel/2025-04/msg04421.html + +Signed-off-by: Andrey Drobyshev +[FE: re-use top image and rename snap1->mid as suggested by Kevin Wolf + remove image file after test as suggested by Kevin Wolf + add type annotation for function argument to make mypy happy] +Signed-off-by: Fiona Ebner +Message-ID: <20250530151125.955508-22-f.ebner@proxmox.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit 09d98a018e1fd2db0bb73bbe9b4a7110c8ae354f) +Signed-off-by: Kevin Wolf +--- + .../qemu-iotests/tests/graph-changes-while-io | 101 ++++++++++++++++-- + .../tests/graph-changes-while-io.out | 4 +- + 2 files changed, 96 insertions(+), 9 deletions(-) + +diff --git a/tests/qemu-iotests/tests/graph-changes-while-io b/tests/qemu-iotests/tests/graph-changes-while-io +index 35489e3b5e..dca1167b6d 100755 +--- a/tests/qemu-iotests/tests/graph-changes-while-io ++++ b/tests/qemu-iotests/tests/graph-changes-while-io +@@ -27,6 +27,7 @@ from iotests import imgfmt, qemu_img, qemu_img_create, qemu_io, \ + + + top = os.path.join(iotests.test_dir, 'top.img') ++mid = os.path.join(iotests.test_dir, 'mid.img') + nbd_sock = os.path.join(iotests.sock_dir, 'nbd.sock') + + +@@ -59,6 +60,15 @@ class TestGraphChangesWhileIO(QMPTestCase): + self.qsd.stop() + os.remove(top) + ++ def _wait_for_blockjob(self, status: str) -> None: ++ done = False ++ while not done: ++ for event in self.qsd.get_qmp().get_events(wait=10.0): ++ if event['event'] != 'JOB_STATUS_CHANGE': ++ continue ++ if event['data']['status'] == status: ++ done = True ++ + def test_blockdev_add_while_io(self) -> None: + # Run qemu-img bench in the background + bench_thr = Thread(target=do_qemu_img_bench) +@@ -117,15 +127,92 @@ class TestGraphChangesWhileIO(QMPTestCase): + 'device': 'job0', + }) + +- cancelled = False +- while not cancelled: +- for event in self.qsd.get_qmp().get_events(wait=10.0): +- if event['event'] != 'JOB_STATUS_CHANGE': +- continue +- if event['data']['status'] == 'null': +- cancelled = True ++ self._wait_for_blockjob('null') ++ ++ bench_thr.join() ++ ++ def test_remove_lower_snapshot_while_io(self) -> None: ++ # Run qemu-img bench in the background ++ bench_thr = Thread(target=do_qemu_img_bench, args=(100000, )) ++ bench_thr.start() ++ ++ # While I/O is performed on 'node0' node, consequently add 2 snapshots ++ # on top of it, then remove (commit) them starting from lower one. ++ while bench_thr.is_alive(): ++ # Recreate snapshot images on every iteration ++ qemu_img_create('-f', imgfmt, mid, '1G') ++ qemu_img_create('-f', imgfmt, top, '1G') ++ ++ self.qsd.cmd('blockdev-add', { ++ 'driver': imgfmt, ++ 'node-name': 'mid', ++ 'file': { ++ 'driver': 'file', ++ 'filename': mid ++ } ++ }) ++ ++ self.qsd.cmd('blockdev-snapshot', { ++ 'node': 'node0', ++ 'overlay': 'mid', ++ }) ++ ++ self.qsd.cmd('blockdev-add', { ++ 'driver': imgfmt, ++ 'node-name': 'top', ++ 'file': { ++ 'driver': 'file', ++ 'filename': top ++ } ++ }) ++ ++ self.qsd.cmd('blockdev-snapshot', { ++ 'node': 'mid', ++ 'overlay': 'top', ++ }) ++ ++ self.qsd.cmd('block-commit', { ++ 'job-id': 'commit-mid', ++ 'device': 'top', ++ 'top-node': 'mid', ++ 'base-node': 'node0', ++ 'auto-finalize': True, ++ 'auto-dismiss': False, ++ }) ++ ++ self._wait_for_blockjob('concluded') ++ self.qsd.cmd('job-dismiss', { ++ 'id': 'commit-mid', ++ }) ++ ++ self.qsd.cmd('block-commit', { ++ 'job-id': 'commit-top', ++ 'device': 'top', ++ 'top-node': 'top', ++ 'base-node': 'node0', ++ 'auto-finalize': True, ++ 'auto-dismiss': False, ++ }) ++ ++ self._wait_for_blockjob('ready') ++ self.qsd.cmd('job-complete', { ++ 'id': 'commit-top', ++ }) ++ ++ self._wait_for_blockjob('concluded') ++ self.qsd.cmd('job-dismiss', { ++ 'id': 'commit-top', ++ }) ++ ++ self.qsd.cmd('blockdev-del', { ++ 'node-name': 'mid' ++ }) ++ self.qsd.cmd('blockdev-del', { ++ 'node-name': 'top' ++ }) + + bench_thr.join() ++ os.remove(mid) + + if __name__ == '__main__': + # Format must support raw backing files +diff --git a/tests/qemu-iotests/tests/graph-changes-while-io.out b/tests/qemu-iotests/tests/graph-changes-while-io.out +index fbc63e62f8..8d7e996700 100644 +--- a/tests/qemu-iotests/tests/graph-changes-while-io.out ++++ b/tests/qemu-iotests/tests/graph-changes-while-io.out +@@ -1,5 +1,5 @@ +-.. ++... + ---------------------------------------------------------------------- +-Ran 2 tests ++Ran 3 tests + + OK +-- +2.39.3 + diff --git a/kvm-iotests-graph-changes-while-io-remove-image-file-aft.patch b/kvm-iotests-graph-changes-while-io-remove-image-file-aft.patch new file mode 100644 index 0000000..af0c52f --- /dev/null +++ b/kvm-iotests-graph-changes-while-io-remove-image-file-aft.patch @@ -0,0 +1,39 @@ +From 727fb4bf3409e170fbab697981f5f57b6ee2f93b Mon Sep 17 00:00:00 2001 +From: Fiona Ebner +Date: Fri, 30 May 2025 17:10:57 +0200 +Subject: [PATCH 32/33] iotests/graph-changes-while-io: remove image file after + test + +RH-Author: Kevin Wolf +RH-MergeRequest: 393: block: do not drain while holding the graph lock +RH-Jira: RHEL-88561 +RH-Acked-by: Stefan Hajnoczi +RH-Acked-by: Hanna Czenczek +RH-Commit: [20/21] b7c8fe57ca9058829b6c959ca2305420261d99f5 (kmwolf/centos-qemu-kvm) + +Suggested-by: Kevin Wolf +Signed-off-by: Fiona Ebner +Message-ID: <20250530151125.955508-21-f.ebner@proxmox.com> +Reviewed-by: Kevin Wolf +Signed-off-by: Kevin Wolf +(cherry picked from commit ed8c62927e8facebb1e41b417daee3109e398712) +Signed-off-by: Kevin Wolf +--- + tests/qemu-iotests/tests/graph-changes-while-io | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/tests/qemu-iotests/tests/graph-changes-while-io b/tests/qemu-iotests/tests/graph-changes-while-io +index 194fda500e..35489e3b5e 100755 +--- a/tests/qemu-iotests/tests/graph-changes-while-io ++++ b/tests/qemu-iotests/tests/graph-changes-while-io +@@ -57,6 +57,7 @@ class TestGraphChangesWhileIO(QMPTestCase): + + def tearDown(self) -> None: + self.qsd.stop() ++ os.remove(top) + + def test_blockdev_add_while_io(self) -> None: + # Run qemu-img bench in the background +-- +2.39.3 + diff --git a/kvm-migration-Add-qtest-for-migration-over-RDMA.patch b/kvm-migration-Add-qtest-for-migration-over-RDMA.patch new file mode 100644 index 0000000..c6b3a73 --- /dev/null +++ b/kvm-migration-Add-qtest-for-migration-over-RDMA.patch @@ -0,0 +1,222 @@ +From a408d755e0c764f80c8dc50942c9d74e4458cf98 Mon Sep 17 00:00:00 2001 +From: Li Zhijian +Date: Tue, 11 Mar 2025 10:42:21 +0800 +Subject: [PATCH 08/33] migration: Add qtest for migration over RDMA +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Prasad Pandit +RH-MergeRequest: 390: migration: allow to enable multifd+postcopy features together, but use multifd during precopy only +RH-Jira: RHEL-59697 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [8/11] bf9644e800b982d81d3b68f6d3951c207c66fc76 (pjp/cs-qemu-kvm) + +This qtest requires there is a RDMA(RoCE) link in the host. +In order to make the test work smoothly, introduce a +scripts/rdma-migration-helper.sh to detect existing RoCE link before +running the test. + +Test will be skipped if there is no available RoCE link. + # Start of rdma tests + # Running /x86_64/migration/precopy/rdma/plain + ok 1 /x86_64/migration/precopy/rdma/plain # SKIP No rdma link available + # To enable the test: + # Run 'scripts/rdma-migration-helper.sh setup' with root to setup a new rdma/rxe link and rerun the test + # Optional: run 'scripts/rdma-migration-helper.sh clean' to revert the 'setup' + + # End of rdma tests + +Jira: https://issues.redhat.com/browse/RHEL-59697 +Cc: Philippe Mathieu-Daudé +Cc: Stefan Hajnoczi +Reviewed-by: Peter Xu +Signed-off-by: Li Zhijian +Message-ID: <20250311024221.363421-1-lizhijian@fujitsu.com> +[add 'head -1' to script, reformat test message] +Signed-off-by: Fabiano Rosas +(cherry picked from commit 7d9849c3c41463ab9ba40348a8606927dc0fb85d) +Signed-off-by: Prasad Pandit +--- + MAINTAINERS | 1 + + scripts/rdma-migration-helper.sh | 70 +++++++++++++++++++++++++++ + tests/qtest/migration/precopy-tests.c | 66 +++++++++++++++++++++++++ + 3 files changed, 137 insertions(+) + create mode 100755 scripts/rdma-migration-helper.sh + +diff --git a/MAINTAINERS b/MAINTAINERS +index d54b5578f8..465aedbcfb 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -3516,6 +3516,7 @@ R: Li Zhijian + R: Peter Xu + S: Odd Fixes + F: migration/rdma* ++F: scripts/rdma-migration-helper.sh + + Migration dirty limit and dirty page rate + M: Hyman Huang +diff --git a/scripts/rdma-migration-helper.sh b/scripts/rdma-migration-helper.sh +new file mode 100755 +index 0000000000..a39f2fb0e5 +--- /dev/null ++++ b/scripts/rdma-migration-helper.sh +@@ -0,0 +1,70 @@ ++#!/bin/bash ++ ++# Copied from blktests ++get_ipv4_addr() ++{ ++ ip -4 -o addr show dev "$1" | ++ sed -n 's/.*[[:blank:]]inet[[:blank:]]*\([^[:blank:]/]*\).*/\1/p' | ++ head -1 | tr -d '\n' ++} ++ ++# existing rdma interfaces ++rdma_interfaces() ++{ ++ rdma link show | sed -nE 's/^link .* netdev ([^ ]+).*$/\1 /p' ++} ++ ++# existing valid ipv4 interfaces ++ipv4_interfaces() ++{ ++ ip -o addr show | awk '/inet / {print $2}' | grep -v -w lo ++} ++ ++rdma_rxe_detect() ++{ ++ for r in $(rdma_interfaces) ++ do ++ ipv4_interfaces | grep -qw $r && get_ipv4_addr $r && return ++ done ++ ++ return 1 ++} ++ ++rdma_rxe_setup() ++{ ++ for i in $(ipv4_interfaces) ++ do ++ rdma_interfaces | grep -qw $i && continue ++ rdma link add "${i}_rxe" type rxe netdev "$i" && { ++ echo "Setup new rdma/rxe ${i}_rxe for $i with $(get_ipv4_addr $i)" ++ return ++ } ++ done ++ ++ echo "Failed to setup any new rdma/rxe link" >&2 ++ return 1 ++} ++ ++rdma_rxe_clean() ++{ ++ modprobe -r rdma_rxe ++} ++ ++operation=${1:-detect} ++ ++command -v rdma >/dev/null || { ++ echo "Command 'rdma' is not available, please install it first." >&2 ++ exit 1 ++} ++ ++if [ "$operation" == "setup" ] || [ "$operation" == "clean" ]; then ++ [ "$UID" == 0 ] || { ++ echo "Root privilege is required to setup/clean a rdma/rxe link" >&2 ++ exit 1 ++ } ++ rdma_rxe_"$operation" ++elif [ "$operation" == "detect" ]; then ++ rdma_rxe_detect ++else ++ echo "Usage: $0 [setup | detect | clean]" ++fi +diff --git a/tests/qtest/migration/precopy-tests.c b/tests/qtest/migration/precopy-tests.c +index f8404793b8..87b0a7e8ef 100644 +--- a/tests/qtest/migration/precopy-tests.c ++++ b/tests/qtest/migration/precopy-tests.c +@@ -99,6 +99,68 @@ static void test_precopy_unix_dirty_ring(void) + test_precopy_common(&args); + } + ++#ifdef CONFIG_RDMA ++ ++#define RDMA_MIGRATION_HELPER "scripts/rdma-migration-helper.sh" ++static int new_rdma_link(char *buffer) ++{ ++ char cmd[256]; ++ bool verbose = g_getenv("QTEST_LOG"); ++ ++ snprintf(cmd, sizeof(cmd), "%s detect %s", RDMA_MIGRATION_HELPER, ++ verbose ? "" : "2>/dev/null"); ++ ++ FILE *pipe = popen(cmd, "r"); ++ if (pipe == NULL) { ++ perror("Failed to run script"); ++ return -1; ++ } ++ ++ int idx = 0; ++ while (fgets(buffer + idx, 128 - idx, pipe) != NULL) { ++ idx += strlen(buffer); ++ } ++ ++ int status = pclose(pipe); ++ if (status == -1) { ++ perror("Error reported by pclose()"); ++ return -1; ++ } else if (WIFEXITED(status)) { ++ return WEXITSTATUS(status); ++ } ++ ++ return -1; ++} ++ ++static void test_precopy_rdma_plain(void) ++{ ++ char buffer[128] = {}; ++ ++ if (new_rdma_link(buffer)) { ++ g_test_skip("No rdma link available\n" ++ "# To enable the test:\n" ++ "# Run \'" RDMA_MIGRATION_HELPER " setup\' with root to " ++ "setup a new rdma/rxe link and rerun the test\n" ++ "# Optional: run 'scripts/rdma-migration-helper.sh clean' " ++ "to revert the 'setup'"); ++ return; ++ } ++ ++ /* ++ * TODO: query a free port instead of hard code. ++ * 29200=('R'+'D'+'M'+'A')*100 ++ **/ ++ g_autofree char *uri = g_strdup_printf("rdma:%s:29200", buffer); ++ ++ MigrateCommon args = { ++ .listen_uri = uri, ++ .connect_uri = uri, ++ }; ++ ++ test_precopy_common(&args); ++} ++#endif ++ + static void test_precopy_tcp_plain(void) + { + MigrateCommon args = { +@@ -1127,6 +1189,10 @@ static void migration_test_add_precopy_smoke(MigrationTestEnv *env) + test_multifd_tcp_uri_none); + migration_test_add("/migration/multifd/tcp/plain/cancel", + test_multifd_tcp_cancel); ++#ifdef CONFIG_RDMA ++ migration_test_add("/migration/precopy/rdma/plain", ++ test_precopy_rdma_plain); ++#endif + } + + void migration_test_add_precopy(MigrationTestEnv *env) +-- +2.39.3 + diff --git a/kvm-migration-Add-save_postcopy_prepare-savevm-handler.patch b/kvm-migration-Add-save_postcopy_prepare-savevm-handler.patch new file mode 100644 index 0000000..861a15a --- /dev/null +++ b/kvm-migration-Add-save_postcopy_prepare-savevm-handler.patch @@ -0,0 +1,140 @@ +From 883ddd4af17376fc62bdee9f4b30dfaa45d0c968 Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 11 Apr 2025 17:15:30 +0530 +Subject: [PATCH 03/33] migration: Add save_postcopy_prepare() savevm handler + +RH-Author: Prasad Pandit +RH-MergeRequest: 390: migration: allow to enable multifd+postcopy features together, but use multifd during precopy only +RH-Jira: RHEL-59697 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [3/11] 2f2f108041d921985e2ddce5fd8e805fc539e74f (pjp/cs-qemu-kvm) + +Add a savevm handler for a module to opt-in sending extra sections right +before postcopy starts, and before VM is stopped. + +RAM will start to use this new savevm handler in the next patch to do flush +and sync for multifd pages. + +Note that we choose to do it before VM stopped because the current only +potential user is not sensitive to VM status, so doing it before VM is +stopped is preferred to enlarge any postcopy downtime. + +It is still a bit unfortunate that we need to introduce such a new savevm +handler just for the only use case, however it's so far the cleanest. + +Jira: https://issues.redhat.com/browse/RHEL-59697 +Signed-off-by: Peter Xu +Signed-off-by: Prasad Pandit +Reviewed-by: Fabiano Rosas +Message-ID: <20250411114534.3370816-4-ppandit@redhat.com> +Signed-off-by: Fabiano Rosas +(cherry picked from commit 1d481116015428c02f7e3635f9bc0b88b0978fdc) +Signed-off-by: Prasad Pandit +--- + include/migration/register.h | 15 +++++++++++++++ + migration/migration.c | 4 ++++ + migration/savevm.c | 33 +++++++++++++++++++++++++++++++++ + migration/savevm.h | 1 + + 4 files changed, 53 insertions(+) + +diff --git a/include/migration/register.h b/include/migration/register.h +index c041ce32f2..b79dc81b8d 100644 +--- a/include/migration/register.h ++++ b/include/migration/register.h +@@ -189,6 +189,21 @@ typedef struct SaveVMHandlers { + + /* This runs outside the BQL! */ + ++ /** ++ * @save_postcopy_prepare ++ * ++ * This hook will be invoked on the source side right before switching ++ * to postcopy (before VM stopped). ++ * ++ * @f: QEMUFile where to send the data ++ * @opaque: Data pointer passed to register_savevm_live() ++ * @errp: Error** used to report error message ++ * ++ * Returns: true if succeeded, false if error occured. When false is ++ * returned, @errp must be set. ++ */ ++ bool (*save_postcopy_prepare)(QEMUFile *f, void *opaque, Error **errp); ++ + /** + * @state_pending_estimate + * +diff --git a/migration/migration.c b/migration/migration.c +index 64f4f40ae3..4bb29b7193 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -2717,6 +2717,10 @@ static int postcopy_start(MigrationState *ms, Error **errp) + } + } + ++ if (!qemu_savevm_state_postcopy_prepare(ms->to_dst_file, errp)) { ++ return -1; ++ } ++ + trace_postcopy_start(); + bql_lock(); + trace_postcopy_start_set_run(); +diff --git a/migration/savevm.c b/migration/savevm.c +index ce158c3512..23ef4c7dc9 100644 +--- a/migration/savevm.c ++++ b/migration/savevm.c +@@ -1523,6 +1523,39 @@ void qemu_savevm_state_complete_postcopy(QEMUFile *f) + qemu_fflush(f); + } + ++bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp) ++{ ++ SaveStateEntry *se; ++ bool ret; ++ ++ QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { ++ if (!se->ops || !se->ops->save_postcopy_prepare) { ++ continue; ++ } ++ ++ if (se->ops->is_active) { ++ if (!se->ops->is_active(se->opaque)) { ++ continue; ++ } ++ } ++ ++ trace_savevm_section_start(se->idstr, se->section_id); ++ ++ save_section_header(f, se, QEMU_VM_SECTION_PART); ++ ret = se->ops->save_postcopy_prepare(f, se->opaque, errp); ++ save_section_footer(f, se); ++ ++ trace_savevm_section_end(se->idstr, se->section_id, ret); ++ ++ if (!ret) { ++ assert(*errp); ++ return false; ++ } ++ } ++ ++ return true; ++} ++ + int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) + { + int64_t start_ts_each, end_ts_each; +diff --git a/migration/savevm.h b/migration/savevm.h +index 138c39a7f9..2d5e9c7166 100644 +--- a/migration/savevm.h ++++ b/migration/savevm.h +@@ -45,6 +45,7 @@ void qemu_savevm_state_pending_exact(uint64_t *must_precopy, + void qemu_savevm_state_pending_estimate(uint64_t *must_precopy, + uint64_t *can_postcopy); + int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy); ++bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp); + void qemu_savevm_send_ping(QEMUFile *f, uint32_t value); + void qemu_savevm_send_open_return_path(QEMUFile *f); + int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len); +-- +2.39.3 + diff --git a/kvm-migration-enable-multifd-and-postcopy-together.patch b/kvm-migration-enable-multifd-and-postcopy-together.patch new file mode 100644 index 0000000..5b8a373 --- /dev/null +++ b/kvm-migration-enable-multifd-and-postcopy-together.patch @@ -0,0 +1,111 @@ +From 12f7ba5e8b344e578dc99f5ce6e371d4c51108bb Mon Sep 17 00:00:00 2001 +From: Prasad Pandit +Date: Mon, 12 May 2025 18:21:23 +0530 +Subject: [PATCH 07/33] migration: enable multifd and postcopy together + +RH-Author: Prasad Pandit +RH-MergeRequest: 390: migration: allow to enable multifd+postcopy features together, but use multifd during precopy only +RH-Jira: RHEL-59697 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [7/11] 3e5d91a4aed5f18f1c29ae1ab9296ae41b6cf3ca (pjp/cs-qemu-kvm) + +Enable Multifd and Postcopy migration together. +The migration_ioc_process_incoming() routine checks +magic value sent on each channel and helps to properly +setup multifd and postcopy channels. + +The Precopy and Multifd threads work during the initial +guest RAM transfer. When migration moves to the Postcopy +phase, the multifd threads cease to send data on multifd +channels and Postcopy threads on the destination +request/pull data from the source side. + +Jira: https://issues.redhat.com/browse/RHEL-59697 +Reviewed-by: Fabiano Rosas +Signed-off-by: Prasad Pandit +Link: https://lore.kernel.org/r/20250512125124.147064-3-ppandit@redhat.com +Signed-off-by: Peter Xu +(cherry picked from commit e27418861288285d20352448fef4491a68223d39) +Signed-off-by: Prasad Pandit +--- + migration/multifd-nocomp.c | 3 ++- + migration/multifd.c | 7 +++++++ + migration/options.c | 5 ----- + migration/ram.c | 5 ++--- + 4 files changed, 11 insertions(+), 9 deletions(-) + +diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c +index ffe75256c9..02f8bf8ce8 100644 +--- a/migration/multifd-nocomp.c ++++ b/migration/multifd-nocomp.c +@@ -17,6 +17,7 @@ + #include "migration-stats.h" + #include "multifd.h" + #include "options.h" ++#include "migration.h" + #include "qapi/error.h" + #include "qemu/cutils.h" + #include "qemu/error-report.h" +@@ -399,7 +400,7 @@ int multifd_ram_flush_and_sync(QEMUFile *f) + MultiFDSyncReq req; + int ret; + +- if (!migrate_multifd()) { ++ if (!migrate_multifd() || migration_in_postcopy()) { + return 0; + } + +diff --git a/migration/multifd.c b/migration/multifd.c +index 6139cabe44..074d16d07d 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -1379,6 +1379,13 @@ static void *multifd_recv_thread(void *opaque) + } + + if (has_data) { ++ /* ++ * multifd thread should not be active and receive data ++ * when migration is in the Postcopy phase. Two threads ++ * writing the same memory area could easily corrupt ++ * the guest state. ++ */ ++ assert(!migration_in_postcopy()); + if (is_device_state) { + assert(use_packets); + ret = multifd_device_state_recv(p, &local_err); +diff --git a/migration/options.c b/migration/options.c +index b0ac2ea408..48aa6076de 100644 +--- a/migration/options.c ++++ b/migration/options.c +@@ -491,11 +491,6 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) + error_setg(errp, "Postcopy is not compatible with ignore-shared"); + return false; + } +- +- if (new_caps[MIGRATION_CAPABILITY_MULTIFD]) { +- error_setg(errp, "Postcopy is not yet compatible with multifd"); +- return false; +- } + } + + if (new_caps[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) { +diff --git a/migration/ram.c b/migration/ram.c +index 856769a77c..6f390b28d9 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -2013,9 +2013,8 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) + } + } + +- if (migrate_multifd()) { +- RAMBlock *block = pss->block; +- return ram_save_multifd_page(block, offset); ++ if (migrate_multifd() && !migration_in_postcopy()) { ++ return ram_save_multifd_page(pss->block, offset); + } + + return ram_save_page(rs, pss); +-- +2.39.3 + diff --git a/kvm-migration-multifd-move-macros-to-multifd-header.patch b/kvm-migration-multifd-move-macros-to-multifd-header.patch new file mode 100644 index 0000000..053eeba --- /dev/null +++ b/kvm-migration-multifd-move-macros-to-multifd-header.patch @@ -0,0 +1,63 @@ +From 8eb8ea8cf070ac88d8caa50c4a0cfc9e00398616 Mon Sep 17 00:00:00 2001 +From: Prasad Pandit +Date: Fri, 11 Apr 2025 17:15:28 +0530 +Subject: [PATCH 01/33] migration/multifd: move macros to multifd header + +RH-Author: Prasad Pandit +RH-MergeRequest: 390: migration: allow to enable multifd+postcopy features together, but use multifd during precopy only +RH-Jira: RHEL-59697 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [1/11] 438135a73f4247ac3d35f2798b7ca75b0b55cbd8 (pjp/cs-qemu-kvm) + +Move MULTIFD_ macros to the header file so that +they are accessible from other source files. + +Jira: https://issues.redhat.com/browse/RHEL-59697 +Reviewed-by: Fabiano Rosas +Signed-off-by: Prasad Pandit +Reviewed-by: Peter Xu +Message-ID: <20250411114534.3370816-2-ppandit@redhat.com> +Signed-off-by: Fabiano Rosas +(cherry picked from commit 56e3c89f44ecebc946fbe4ffed325d1a79b26e38) +Signed-off-by: Prasad Pandit +--- + migration/multifd.c | 5 ----- + migration/multifd.h | 5 +++++ + 2 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/migration/multifd.c b/migration/multifd.c +index dfb5189f0e..6139cabe44 100644 +--- a/migration/multifd.c ++++ b/migration/multifd.c +@@ -36,11 +36,6 @@ + #include "io/channel-socket.h" + #include "yank_functions.h" + +-/* Multiple fd's */ +- +-#define MULTIFD_MAGIC 0x11223344U +-#define MULTIFD_VERSION 1 +- + typedef struct { + uint32_t magic; + uint32_t version; +diff --git a/migration/multifd.h b/migration/multifd.h +index 2d337e7b3b..9b6d81e7ed 100644 +--- a/migration/multifd.h ++++ b/migration/multifd.h +@@ -49,6 +49,11 @@ bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); + bool multifd_recv(void); + MultiFDRecvData *multifd_get_recv_data(void); + ++/* Multiple fd's */ ++ ++#define MULTIFD_MAGIC 0x11223344U ++#define MULTIFD_VERSION 1 ++ + /* Multifd Compression flags */ + #define MULTIFD_FLAG_SYNC (1 << 0) + +-- +2.39.3 + diff --git a/kvm-migration-ram-Implement-save_postcopy_prepare.patch b/kvm-migration-ram-Implement-save_postcopy_prepare.patch new file mode 100644 index 0000000..6991716 --- /dev/null +++ b/kvm-migration-ram-Implement-save_postcopy_prepare.patch @@ -0,0 +1,85 @@ +From d5b76b77dc891f0bec211a6d00b099a2979223ee Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Fri, 11 Apr 2025 17:15:31 +0530 +Subject: [PATCH 04/33] migration/ram: Implement save_postcopy_prepare() + +RH-Author: Prasad Pandit +RH-MergeRequest: 390: migration: allow to enable multifd+postcopy features together, but use multifd during precopy only +RH-Jira: RHEL-59697 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [4/11] 4df55b4e65458ffccf48df5de3afad0b38cded51 (pjp/cs-qemu-kvm) + +Implement save_postcopy_prepare(), preparing for the enablement +of both multifd and postcopy. + +Jira: https://issues.redhat.com/browse/RHEL-59697 +Signed-off-by: Peter Xu +Signed-off-by: Prasad Pandit +Reviewed-by: Fabiano Rosas +Message-ID: <20250411114534.3370816-5-ppandit@redhat.com> +Signed-off-by: Fabiano Rosas +(cherry picked from commit ad8d82ffbb8b8034f58a570911e6e9c6328c9384) +Signed-off-by: Prasad Pandit +--- + migration/ram.c | 37 +++++++++++++++++++++++++++++++++++++ + 1 file changed, 37 insertions(+) + +diff --git a/migration/ram.c b/migration/ram.c +index 21d2f87ff1..856769a77c 100644 +--- a/migration/ram.c ++++ b/migration/ram.c +@@ -4515,6 +4515,42 @@ static int ram_resume_prepare(MigrationState *s, void *opaque) + return 0; + } + ++static bool ram_save_postcopy_prepare(QEMUFile *f, void *opaque, Error **errp) ++{ ++ int ret; ++ ++ if (migrate_multifd()) { ++ /* ++ * When multifd is enabled, source QEMU needs to make sure all the ++ * pages queued before postcopy starts have been flushed. ++ * ++ * The load of these pages must happen before switching to postcopy. ++ * It's because loading of guest pages (so far) in multifd recv ++ * threads is still non-atomic, so the load cannot happen with vCPUs ++ * running on the destination side. ++ * ++ * This flush and sync will guarantee that those pages are loaded ++ * _before_ postcopy starts on the destination. The rationale is, ++ * this happens before VM stops (and before source QEMU sends all ++ * the rest of the postcopy messages). So when the destination QEMU ++ * receives the postcopy messages, it must have received the sync ++ * message on the main channel (either RAM_SAVE_FLAG_MULTIFD_FLUSH, ++ * or RAM_SAVE_FLAG_EOS), and such message would guarantee that ++ * all previous guest pages queued in the multifd channels are ++ * completely loaded. ++ */ ++ ret = multifd_ram_flush_and_sync(f); ++ if (ret < 0) { ++ error_setg(errp, "%s: multifd flush and sync failed", __func__); ++ return false; ++ } ++ } ++ ++ qemu_put_be64(f, RAM_SAVE_FLAG_EOS); ++ ++ return true; ++} ++ + void postcopy_preempt_shutdown_file(MigrationState *s) + { + qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS); +@@ -4534,6 +4570,7 @@ static SaveVMHandlers savevm_ram_handlers = { + .load_setup = ram_load_setup, + .load_cleanup = ram_load_cleanup, + .resume_prepare = ram_resume_prepare, ++ .save_postcopy_prepare = ram_save_postcopy_prepare, + }; + + static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host, +-- +2.39.3 + diff --git a/kvm-migration-refactor-channel-discovery-mechanism.patch b/kvm-migration-refactor-channel-discovery-mechanism.patch new file mode 100644 index 0000000..d77363b --- /dev/null +++ b/kvm-migration-refactor-channel-discovery-mechanism.patch @@ -0,0 +1,239 @@ +From 21ec86cdc48de9ddf3f5bba994edd9f9427ffd4c Mon Sep 17 00:00:00 2001 +From: Prasad Pandit +Date: Fri, 11 Apr 2025 17:15:29 +0530 +Subject: [PATCH 02/33] migration: refactor channel discovery mechanism + +RH-Author: Prasad Pandit +RH-MergeRequest: 390: migration: allow to enable multifd+postcopy features together, but use multifd during precopy only +RH-Jira: RHEL-59697 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [2/11] 7f40da01d8c9a827627c73641f3b90a27bbfb8a0 (pjp/cs-qemu-kvm) + +The various logical migration channels don't have a +standardized way of advertising themselves and their +connections may be seen out of order by the migration +destination. When a new connection arrives, the incoming +migration currently make use of heuristics to determine +which channel it belongs to. + +The next few patches will need to change how the multifd +and postcopy capabilities interact and that affects the +channel discovery heuristic. + +Refactor the channel discovery heuristic to make it less +opaque and simplify the subsequent patches. + +Jira: https://issues.redhat.com/browse/RHEL-59697 +Signed-off-by: Prasad Pandit +Reviewed-by: Fabiano Rosas +Message-ID: <20250411114534.3370816-3-ppandit@redhat.com> +Signed-off-by: Fabiano Rosas +(cherry picked from commit 00f3fcef1981eb23f98b956d9cda2df528bfef40) +Signed-off-by: Prasad Pandit +--- + migration/migration.c | 130 +++++++++++++++++++++++------------------- + 1 file changed, 70 insertions(+), 60 deletions(-) + +diff --git a/migration/migration.c b/migration/migration.c +index d46e776e24..64f4f40ae3 100644 +--- a/migration/migration.c ++++ b/migration/migration.c +@@ -95,6 +95,9 @@ enum mig_rp_message_type { + MIG_RP_MSG_MAX + }; + ++/* Migration channel types */ ++enum { CH_MAIN, CH_MULTIFD, CH_POSTCOPY }; ++ + /* When we add fault tolerance, we could have several + migrations at once. For now we don't need to add + dynamic creation of migration */ +@@ -931,9 +934,8 @@ static void migration_incoming_setup(QEMUFile *f) + { + MigrationIncomingState *mis = migration_incoming_get_current(); + +- if (!mis->from_src_file) { +- mis->from_src_file = f; +- } ++ assert(!mis->from_src_file); ++ mis->from_src_file = f; + qemu_file_set_blocking(f, false); + } + +@@ -985,28 +987,19 @@ void migration_fd_process_incoming(QEMUFile *f) + migration_incoming_process(); + } + +-/* +- * Returns true when we want to start a new incoming migration process, +- * false otherwise. +- */ +-static bool migration_should_start_incoming(bool main_channel) ++static bool migration_has_main_and_multifd_channels(void) + { +- /* Multifd doesn't start unless all channels are established */ +- if (migrate_multifd()) { +- return migration_has_all_channels(); ++ MigrationIncomingState *mis = migration_incoming_get_current(); ++ if (!mis->from_src_file) { ++ /* main channel not established */ ++ return false; + } + +- /* Preempt channel only starts when the main channel is created */ +- if (migrate_postcopy_preempt()) { +- return main_channel; ++ if (migrate_multifd() && !multifd_recv_all_channels_created()) { ++ return false; + } + +- /* +- * For all the rest types of migration, we should only reach here when +- * it's the main channel that's being created, and we should always +- * proceed with this channel. +- */ +- assert(main_channel); ++ /* main and all multifd channels are established */ + return true; + } + +@@ -1015,59 +1008,81 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) + MigrationIncomingState *mis = migration_incoming_get_current(); + Error *local_err = NULL; + QEMUFile *f; +- bool default_channel = true; ++ uint8_t channel; + uint32_t channel_magic = 0; + int ret = 0; + +- if (migrate_multifd() && !migrate_mapped_ram() && +- !migrate_postcopy_ram() && +- qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { +- /* +- * With multiple channels, it is possible that we receive channels +- * out of order on destination side, causing incorrect mapping of +- * source channels on destination side. Check channel MAGIC to +- * decide type of channel. Please note this is best effort, postcopy +- * preempt channel does not send any magic number so avoid it for +- * postcopy live migration. Also tls live migration already does +- * tls handshake while initializing main channel so with tls this +- * issue is not possible. +- */ +- ret = migration_channel_read_peek(ioc, (void *)&channel_magic, +- sizeof(channel_magic), errp); ++ if (!migration_has_main_and_multifd_channels()) { ++ if (qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { ++ /* ++ * With multiple channels, it is possible that we receive channels ++ * out of order on destination side, causing incorrect mapping of ++ * source channels on destination side. Check channel MAGIC to ++ * decide type of channel. Please note this is best effort, ++ * postcopy preempt channel does not send any magic number so ++ * avoid it for postcopy live migration. Also tls live migration ++ * already does tls handshake while initializing main channel so ++ * with tls this issue is not possible. ++ */ ++ ret = migration_channel_read_peek(ioc, (void *)&channel_magic, ++ sizeof(channel_magic), errp); ++ if (ret != 0) { ++ return; ++ } + +- if (ret != 0) { ++ channel_magic = be32_to_cpu(channel_magic); ++ if (channel_magic == QEMU_VM_FILE_MAGIC) { ++ channel = CH_MAIN; ++ } else if (channel_magic == MULTIFD_MAGIC) { ++ assert(migrate_multifd()); ++ channel = CH_MULTIFD; ++ } else if (!mis->from_src_file && ++ mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) { ++ /* reconnect main channel for postcopy recovery */ ++ channel = CH_MAIN; ++ } else { ++ error_setg(errp, "unknown channel magic: %u", channel_magic); ++ return; ++ } ++ } else if (mis->from_src_file && migrate_multifd()) { ++ /* ++ * Non-peekable channels like tls/file are processed as ++ * multifd channels when multifd is enabled. ++ */ ++ channel = CH_MULTIFD; ++ } else if (!mis->from_src_file) { ++ channel = CH_MAIN; ++ } else { ++ error_setg(errp, "non-peekable channel used without multifd"); + return; + } +- +- default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC)); + } else { +- default_channel = !mis->from_src_file; ++ assert(migrate_postcopy_preempt()); ++ channel = CH_POSTCOPY; + } + + if (multifd_recv_setup(errp) != 0) { + return; + } + +- if (default_channel) { ++ if (channel == CH_MAIN) { + f = qemu_file_new_input(ioc); + migration_incoming_setup(f); +- } else { ++ } else if (channel == CH_MULTIFD) { + /* Multiple connections */ +- assert(migration_needs_multiple_sockets()); +- if (migrate_multifd()) { +- multifd_recv_new_channel(ioc, &local_err); +- } else { +- assert(migrate_postcopy_preempt()); +- f = qemu_file_new_input(ioc); +- postcopy_preempt_new_channel(mis, f); +- } ++ multifd_recv_new_channel(ioc, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } ++ } else if (channel == CH_POSTCOPY) { ++ assert(!mis->postcopy_qemufile_dst); ++ f = qemu_file_new_input(ioc); ++ postcopy_preempt_new_channel(mis, f); ++ return; + } + +- if (migration_should_start_incoming(default_channel)) { ++ if (migration_has_main_and_multifd_channels()) { + /* If it's a recovery, we're done */ + if (postcopy_try_recover()) { + return; +@@ -1084,18 +1099,13 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) + */ + bool migration_has_all_channels(void) + { +- MigrationIncomingState *mis = migration_incoming_get_current(); +- +- if (!mis->from_src_file) { ++ if (!migration_has_main_and_multifd_channels()) { + return false; + } + +- if (migrate_multifd()) { +- return multifd_recv_all_channels_created(); +- } +- +- if (migrate_postcopy_preempt()) { +- return mis->postcopy_qemufile_dst != NULL; ++ MigrationIncomingState *mis = migration_incoming_get_current(); ++ if (migrate_postcopy_preempt() && !mis->postcopy_qemufile_dst) { ++ return false; + } + + return true; +-- +2.39.3 + diff --git a/kvm-migration-write-zero-pages-when-postcopy-enabled.patch b/kvm-migration-write-zero-pages-when-postcopy-enabled.patch new file mode 100644 index 0000000..953132e --- /dev/null +++ b/kvm-migration-write-zero-pages-when-postcopy-enabled.patch @@ -0,0 +1,69 @@ +From d25e369e01fcb30d4d12802907372d3320c095ff Mon Sep 17 00:00:00 2001 +From: Prasad Pandit +Date: Mon, 12 May 2025 18:21:22 +0530 +Subject: [PATCH 06/33] migration: write zero pages when postcopy enabled + +RH-Author: Prasad Pandit +RH-MergeRequest: 390: migration: allow to enable multifd+postcopy features together, but use multifd during precopy only +RH-Jira: RHEL-59697 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [6/11] 28ab95cd8a382d400d28511b0bb2e1ea1fd21c0a (pjp/cs-qemu-kvm) + +During multifd migration, zero pages are written if +they are migrated more than once. + +This may result in a migration thread hang issue when +multifd and postcopy are enabled together. + +When postcopy is enabled, always write zero pages as and +when they are migrated. + +Jira: https://issues.redhat.com/browse/RHEL-59697 +Signed-off-by: Prasad Pandit +Reviewed-by: Fabiano Rosas +Link: https://lore.kernel.org/r/20250512125124.147064-2-ppandit@redhat.com +Signed-off-by: Peter Xu +(cherry picked from commit 249543d0c02d7645b8bcda552dad138769e96831) +Signed-off-by: Prasad Pandit +--- + migration/multifd-zero-page.c | 22 ++++++++++++++++++++-- + 1 file changed, 20 insertions(+), 2 deletions(-) + +diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c +index f1e988a959..3e0a04f2b5 100644 +--- a/migration/multifd-zero-page.c ++++ b/migration/multifd-zero-page.c +@@ -85,9 +85,27 @@ void multifd_recv_zero_page_process(MultiFDRecvParams *p) + { + for (int i = 0; i < p->zero_num; i++) { + void *page = p->host + p->zero[i]; +- if (ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i])) { ++ bool received = ++ ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i]); ++ ++ /* ++ * During multifd migration zero page is written to the memory ++ * only if it is migrated more than once. ++ * ++ * It becomes a problem when both multifd & postcopy options are ++ * enabled. If the zero page which was skipped during multifd phase, ++ * is accessed during the postcopy phase of the migration, a page ++ * fault occurs. But this page fault is not served because the ++ * 'receivedmap' says the zero page is already received. Thus the ++ * thread accessing that page may hang. ++ * ++ * When postcopy is enabled, always write the zero page as and when ++ * it is migrated. ++ */ ++ if (migrate_postcopy_ram() || received) { + memset(page, 0, multifd_ram_page_size()); +- } else { ++ } ++ if (!received) { + ramblock_recv_bitmap_set_offset(p->block, p->zero[i]); + } + } +-- +2.39.3 + diff --git a/kvm-qtest-migration-rdma-Add-test-for-rdma-migration-wit.patch b/kvm-qtest-migration-rdma-Add-test-for-rdma-migration-wit.patch new file mode 100644 index 0000000..33482c9 --- /dev/null +++ b/kvm-qtest-migration-rdma-Add-test-for-rdma-migration-wit.patch @@ -0,0 +1,212 @@ +From 879b050c6cef5cf2ae1944ffb8b203faeca62f1a Mon Sep 17 00:00:00 2001 +From: Li Zhijian +Date: Tue, 13 May 2025 09:22:07 +0800 +Subject: [PATCH 10/33] qtest/migration/rdma: Add test for rdma migration with + ipv6 + +RH-Author: Prasad Pandit +RH-MergeRequest: 390: migration: allow to enable multifd+postcopy features together, but use multifd during precopy only +RH-Jira: RHEL-59697 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [10/11] d2dd06604b461abf0c6d47dc860b1bb925889133 (pjp/cs-qemu-kvm) + +Recently, we removed ipv6 restriction[0] from RDMA migration, add a +test for it. + +[0] https://lore.kernel.org/qemu-devel/20250326095224.9918-1-jinpu.wang@ionos.com/ + +Jira: https://issues.redhat.com/browse/RHEL-59697 +Cc: Jack Wang +Cc: Michael R. Galaxy +Cc: Peter Xu +Cc: Yu Zhang +Reviewed-by: Jack Wang +Signed-off-by: Li Zhijian +Link: https://lore.kernel.org/r/20250513012207.2867069-1-lizhijian@fujitsu.com +[peterx: Fix over long lines] +Signed-off-by: Peter Xu +(cherry picked from commit 6b84c46e8e0ef6f83f33657a29a8abb2b8362d02) +Signed-off-by: Prasad Pandit +--- + scripts/rdma-migration-helper.sh | 57 ++++++++++++++++++++++----- + tests/qtest/migration/precopy-tests.c | 21 ++++++++-- + 2 files changed, 65 insertions(+), 13 deletions(-) + +diff --git a/scripts/rdma-migration-helper.sh b/scripts/rdma-migration-helper.sh +index a39f2fb0e5..d784d1566a 100755 +--- a/scripts/rdma-migration-helper.sh ++++ b/scripts/rdma-migration-helper.sh +@@ -8,23 +8,44 @@ get_ipv4_addr() + head -1 | tr -d '\n' + } + ++get_ipv6_addr() { ++ ipv6=$(ip -6 -o addr show dev "$1" | ++ sed -n 's/.*[[:blank:]]inet6[[:blank:]]*\([^[:blank:]/]*\).*/\1/p' | ++ head -1 | tr -d '\n') ++ ++ [ $? -eq 0 ] || return ++ ++ if [[ "$ipv6" =~ ^fe80: ]]; then ++ echo -n "[$ipv6%$1]" ++ else ++ echo -n "[$ipv6]" ++ fi ++} ++ + # existing rdma interfaces + rdma_interfaces() + { +- rdma link show | sed -nE 's/^link .* netdev ([^ ]+).*$/\1 /p' ++ rdma link show | sed -nE 's/^link .* netdev ([^ ]+).*$/\1 /p' | ++ grep -Ev '^(lo|tun|tap)' + } + + # existing valid ipv4 interfaces + ipv4_interfaces() + { +- ip -o addr show | awk '/inet / {print $2}' | grep -v -w lo ++ ip -o addr show | awk '/inet / {print $2}' | grep -Ev '^(lo|tun|tap)' ++} ++ ++ipv6_interfaces() ++{ ++ ip -o addr show | awk '/inet6 / {print $2}' | grep -Ev '^(lo|tun|tap)' + } + + rdma_rxe_detect() + { ++ family=$1 + for r in $(rdma_interfaces) + do +- ipv4_interfaces | grep -qw $r && get_ipv4_addr $r && return ++ "$family"_interfaces | grep -qw $r && get_"$family"_addr $r && return + done + + return 1 +@@ -32,16 +53,23 @@ rdma_rxe_detect() + + rdma_rxe_setup() + { +- for i in $(ipv4_interfaces) ++ family=$1 ++ for i in $("$family"_interfaces) + do +- rdma_interfaces | grep -qw $i && continue ++ if rdma_interfaces | grep -qw $i; then ++ echo "$family: Reuse the existing rdma/rxe ${i}_rxe" \ ++ "for $i with $(get_"$family"_addr $i)" ++ return ++ fi ++ + rdma link add "${i}_rxe" type rxe netdev "$i" && { +- echo "Setup new rdma/rxe ${i}_rxe for $i with $(get_ipv4_addr $i)" ++ echo "$family: Setup new rdma/rxe ${i}_rxe" \ ++ "for $i with $(get_"$family"_addr $i)" + return + } + done + +- echo "Failed to setup any new rdma/rxe link" >&2 ++ echo "$family: Failed to setup any new rdma/rxe link" >&2 + return 1 + } + +@@ -50,6 +78,12 @@ rdma_rxe_clean() + modprobe -r rdma_rxe + } + ++IP_FAMILY=${IP_FAMILY:-ipv4} ++if [ "$IP_FAMILY" != "ipv6" ] && [ "$IP_FAMILY" != "ipv4" ]; then ++ echo "Unknown ip family '$IP_FAMILY', only ipv4 or ipv6 is supported." >&2 ++ exit 1 ++fi ++ + operation=${1:-detect} + + command -v rdma >/dev/null || { +@@ -62,9 +96,14 @@ if [ "$operation" == "setup" ] || [ "$operation" == "clean" ]; then + echo "Root privilege is required to setup/clean a rdma/rxe link" >&2 + exit 1 + } +- rdma_rxe_"$operation" ++ if [ "$operation" == "setup" ]; then ++ rdma_rxe_setup ipv4 ++ rdma_rxe_setup ipv6 ++ else ++ rdma_rxe_clean ++ fi + elif [ "$operation" == "detect" ]; then +- rdma_rxe_detect ++ rdma_rxe_detect "$IP_FAMILY" + else + echo "Usage: $0 [setup | detect | clean]" + fi +diff --git a/tests/qtest/migration/precopy-tests.c b/tests/qtest/migration/precopy-tests.c +index 5be1cd5742..a62d3c5378 100644 +--- a/tests/qtest/migration/precopy-tests.c ++++ b/tests/qtest/migration/precopy-tests.c +@@ -131,12 +131,13 @@ static bool mlock_check(void) + } + + #define RDMA_MIGRATION_HELPER "scripts/rdma-migration-helper.sh" +-static int new_rdma_link(char *buffer) ++static int new_rdma_link(char *buffer, bool ipv6) + { + char cmd[256]; + bool verbose = g_getenv("QTEST_LOG"); + +- snprintf(cmd, sizeof(cmd), "%s detect %s", RDMA_MIGRATION_HELPER, ++ snprintf(cmd, sizeof(cmd), "IP_FAMILY=%s %s detect %s", ++ ipv6 ? "ipv6" : "ipv4", RDMA_MIGRATION_HELPER, + verbose ? "" : "2>/dev/null"); + + FILE *pipe = popen(cmd, "r"); +@@ -161,7 +162,7 @@ static int new_rdma_link(char *buffer) + return -1; + } + +-static void test_precopy_rdma_plain(void) ++static void __test_precopy_rdma_plain(bool ipv6) + { + char buffer[128] = {}; + +@@ -170,7 +171,7 @@ static void test_precopy_rdma_plain(void) + return; + } + +- if (new_rdma_link(buffer)) { ++ if (new_rdma_link(buffer, ipv6)) { + g_test_skip("No rdma link available\n" + "# To enable the test:\n" + "# Run \'" RDMA_MIGRATION_HELPER " setup\' with root to " +@@ -193,6 +194,16 @@ static void test_precopy_rdma_plain(void) + + test_precopy_common(&args); + } ++ ++static void test_precopy_rdma_plain(void) ++{ ++ __test_precopy_rdma_plain(false); ++} ++ ++static void test_precopy_rdma_plain_ipv6(void) ++{ ++ __test_precopy_rdma_plain(true); ++} + #endif + + static void test_precopy_tcp_plain(void) +@@ -1226,6 +1237,8 @@ static void migration_test_add_precopy_smoke(MigrationTestEnv *env) + #ifdef CONFIG_RDMA + migration_test_add("/migration/precopy/rdma/plain", + test_precopy_rdma_plain); ++ migration_test_add("/migration/precopy/rdma/plain/ipv6", ++ test_precopy_rdma_plain_ipv6); + #endif + } + +-- +2.39.3 + diff --git a/kvm-qtest-migration-rdma-Enforce-RLIMIT_MEMLOCK-128MB-re.patch b/kvm-qtest-migration-rdma-Enforce-RLIMIT_MEMLOCK-128MB-re.patch new file mode 100644 index 0000000..88cceb0 --- /dev/null +++ b/kvm-qtest-migration-rdma-Enforce-RLIMIT_MEMLOCK-128MB-re.patch @@ -0,0 +1,94 @@ +From f65658024595c8fa58c5f9a6a8892d230e68e4c7 Mon Sep 17 00:00:00 2001 +From: Li Zhijian +Date: Fri, 9 May 2025 09:42:10 +0800 +Subject: [PATCH 09/33] qtest/migration/rdma: Enforce RLIMIT_MEMLOCK >= 128MB + requirement + +RH-Author: Prasad Pandit +RH-MergeRequest: 390: migration: allow to enable multifd+postcopy features together, but use multifd during precopy only +RH-Jira: RHEL-59697 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [9/11] bc8b4e4e55e119d1ba1e5567dfe69fe675a5f52e (pjp/cs-qemu-kvm) + +Ensure successful migration over RDMA by verifying that RLIMIT_MEMLOCK is +set to at least 128MB. This allocation is necessary due to the requirement +to pin significant portions of guest memory, typically exceeding 100MB +in this test, while the remainder is transmitted as compressed zero pages. + +Otherwise, it will fail with: +stderr: +qemu-system-x86_64: cannot get rkey +qemu-system-x86_64: error while loading state section id 2(ram) +qemu-system-x86_64: load of migration failed: Operation not permitted +qemu-system-x86_64: rdma migration: recv polling control error! +qemu-system-x86_64: RDMA is in an error state waiting migration to abort! +qemu-system-x86_64: failed to save SaveStateEntry with id(name): 2(ram): -1 +qemu-system-x86_64: Channel error: Operation not permitted + +Jira: https://issues.redhat.com/browse/RHEL-59697 +Reported-by: Peter Xu +Signed-off-by: Li Zhijian +Link: https://lore.kernel.org/r/20250509014211.1272640-1-lizhijian@fujitsu.com +Signed-off-by: Peter Xu +(cherry picked from commit 7b2e4f788d60a8ec25efbf1e6bb6552ee0cef17c) +Signed-off-by: Prasad Pandit +--- + tests/qtest/migration/precopy-tests.c | 34 +++++++++++++++++++++++++++ + 1 file changed, 34 insertions(+) + +diff --git a/tests/qtest/migration/precopy-tests.c b/tests/qtest/migration/precopy-tests.c +index 87b0a7e8ef..5be1cd5742 100644 +--- a/tests/qtest/migration/precopy-tests.c ++++ b/tests/qtest/migration/precopy-tests.c +@@ -101,6 +101,35 @@ static void test_precopy_unix_dirty_ring(void) + + #ifdef CONFIG_RDMA + ++#include ++ ++/* ++ * During migration over RDMA, it will try to pin portions of guest memory, ++ * typically exceeding 100MB in this test, while the remainder will be ++ * transmitted as compressed zero pages. ++ * ++ * REQUIRED_MEMLOCK_SZ indicates the minimal mlock size in the current context. ++ */ ++#define REQUIRED_MEMLOCK_SZ (128 << 20) /* 128MB */ ++ ++/* check 'ulimit -l' */ ++static bool mlock_check(void) ++{ ++ uid_t uid; ++ struct rlimit rlim; ++ ++ uid = getuid(); ++ if (uid == 0) { ++ return true; ++ } ++ ++ if (getrlimit(RLIMIT_MEMLOCK, &rlim) != 0) { ++ return false; ++ } ++ ++ return rlim.rlim_cur >= REQUIRED_MEMLOCK_SZ; ++} ++ + #define RDMA_MIGRATION_HELPER "scripts/rdma-migration-helper.sh" + static int new_rdma_link(char *buffer) + { +@@ -136,6 +165,11 @@ static void test_precopy_rdma_plain(void) + { + char buffer[128] = {}; + ++ if (!mlock_check()) { ++ g_test_skip("'ulimit -l' is too small, require >=128M"); ++ return; ++ } ++ + if (new_rdma_link(buffer)) { + g_test_skip("No rdma link available\n" + "# To enable the test:\n" +-- +2.39.3 + diff --git a/kvm-tests-qtest-migration-add-postcopy-tests-with-multif.patch b/kvm-tests-qtest-migration-add-postcopy-tests-with-multif.patch new file mode 100644 index 0000000..ae3fb34 --- /dev/null +++ b/kvm-tests-qtest-migration-add-postcopy-tests-with-multif.patch @@ -0,0 +1,270 @@ +From 73a8cce94ae861259b3769fe05d5cb79f5a20abb Mon Sep 17 00:00:00 2001 +From: Prasad Pandit +Date: Mon, 12 May 2025 18:21:24 +0530 +Subject: [PATCH 11/33] tests/qtest/migration: add postcopy tests with multifd + +RH-Author: Prasad Pandit +RH-MergeRequest: 390: migration: allow to enable multifd+postcopy features together, but use multifd during precopy only +RH-Jira: RHEL-59697 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [11/11] 35d1ea7a8473725d876d02ebbc29ab6063823742 (pjp/cs-qemu-kvm) + +Add new qtests to run postcopy migration with multifd +channels enabled. + +Jira: https://issues.redhat.com/browse/RHEL-59697 +Signed-off-by: Prasad Pandit +Link: https://lore.kernel.org/r/20250512125124.147064-4-ppandit@redhat.com +[peterx: rename all new tests to be under /migration/multifd+postcopy/] +Signed-off-by: Peter Xu +(cherry picked from commit 766bbabac8f00bc5cf23ba90a8326678636280ed) +Signed-off-by: Prasad Pandit +--- + tests/qtest/migration/compression-tests.c | 18 ++++++++ + tests/qtest/migration/postcopy-tests.c | 27 ++++++++++++ + tests/qtest/migration/precopy-tests.c | 28 ++++++++++++- + tests/qtest/migration/tls-tests.c | 51 +++++++++++++++++++++++ + 4 files changed, 122 insertions(+), 2 deletions(-) + +diff --git a/tests/qtest/migration/compression-tests.c b/tests/qtest/migration/compression-tests.c +index 41e79f031b..b827665b8e 100644 +--- a/tests/qtest/migration/compression-tests.c ++++ b/tests/qtest/migration/compression-tests.c +@@ -42,6 +42,20 @@ static void test_multifd_tcp_zstd(void) + }; + test_precopy_common(&args); + } ++ ++static void test_multifd_postcopy_tcp_zstd(void) ++{ ++ MigrateCommon args = { ++ .listen_uri = "defer", ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ .caps[MIGRATION_CAPABILITY_POSTCOPY_RAM] = true, ++ }, ++ .start_hook = migrate_hook_start_precopy_tcp_multifd_zstd, ++ }; ++ ++ test_precopy_common(&args); ++} + #endif /* CONFIG_ZSTD */ + + #ifdef CONFIG_QATZIP +@@ -184,6 +198,10 @@ void migration_test_add_compression(MigrationTestEnv *env) + #ifdef CONFIG_ZSTD + migration_test_add("/migration/multifd/tcp/plain/zstd", + test_multifd_tcp_zstd); ++ if (env->has_uffd) { ++ migration_test_add("/migration/multifd+postcopy/tcp/plain/zstd", ++ test_multifd_postcopy_tcp_zstd); ++ } + #endif + + #ifdef CONFIG_QATZIP +diff --git a/tests/qtest/migration/postcopy-tests.c b/tests/qtest/migration/postcopy-tests.c +index 483e3ff99f..3773525843 100644 +--- a/tests/qtest/migration/postcopy-tests.c ++++ b/tests/qtest/migration/postcopy-tests.c +@@ -94,6 +94,29 @@ static void migration_test_add_postcopy_smoke(MigrationTestEnv *env) + } + } + ++static void test_multifd_postcopy(void) ++{ ++ MigrateCommon args = { ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, ++ }; ++ ++ test_postcopy_common(&args); ++} ++ ++static void test_multifd_postcopy_preempt(void) ++{ ++ MigrateCommon args = { ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ .caps[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT] = true, ++ }, ++ }; ++ ++ test_postcopy_common(&args); ++} ++ + void migration_test_add_postcopy(MigrationTestEnv *env) + { + migration_test_add_postcopy_smoke(env); +@@ -114,6 +137,10 @@ void migration_test_add_postcopy(MigrationTestEnv *env) + "/migration/postcopy/recovery/double-failures/reconnect", + test_postcopy_recovery_fail_reconnect); + ++ migration_test_add("/migration/multifd+postcopy/plain", ++ test_multifd_postcopy); ++ migration_test_add("/migration/multifd+postcopy/preempt/plain", ++ test_multifd_postcopy_preempt); + if (env->is_x86) { + migration_test_add("/migration/postcopy/suspend", + test_postcopy_suspend); +diff --git a/tests/qtest/migration/precopy-tests.c b/tests/qtest/migration/precopy-tests.c +index a62d3c5378..bb38292550 100644 +--- a/tests/qtest/migration/precopy-tests.c ++++ b/tests/qtest/migration/precopy-tests.c +@@ -569,7 +569,7 @@ static void test_multifd_tcp_channels_none(void) + * + * And see that it works + */ +-static void test_multifd_tcp_cancel(void) ++static void test_multifd_tcp_cancel(bool postcopy_ram) + { + MigrateStart args = { + .hide_stderr = true, +@@ -583,6 +583,11 @@ static void test_multifd_tcp_cancel(void) + migrate_ensure_non_converge(from); + migrate_prepare_for_dirty_mem(from); + ++ if (postcopy_ram) { ++ migrate_set_capability(from, "postcopy-ram", true); ++ migrate_set_capability(to, "postcopy-ram", true); ++ } ++ + migrate_set_parameter_int(from, "multifd-channels", 16); + migrate_set_parameter_int(to, "multifd-channels", 16); + +@@ -624,6 +629,10 @@ static void test_multifd_tcp_cancel(void) + return; + } + ++ if (postcopy_ram) { ++ migrate_set_capability(to2, "postcopy-ram", true); ++ } ++ + migrate_set_parameter_int(to2, "multifd-channels", 16); + + migrate_set_capability(to2, "multifd", true); +@@ -647,6 +656,16 @@ static void test_multifd_tcp_cancel(void) + migrate_end(from, to2, true); + } + ++static void test_multifd_precopy_tcp_cancel(void) ++{ ++ test_multifd_tcp_cancel(false); ++} ++ ++static void test_multifd_postcopy_tcp_cancel(void) ++{ ++ test_multifd_tcp_cancel(true); ++} ++ + static void test_cancel_src_after_failed(QTestState *from, QTestState *to, + const char *uri, const char *phase) + { +@@ -1233,7 +1252,12 @@ static void migration_test_add_precopy_smoke(MigrationTestEnv *env) + migration_test_add("/migration/multifd/tcp/uri/plain/none", + test_multifd_tcp_uri_none); + migration_test_add("/migration/multifd/tcp/plain/cancel", +- test_multifd_tcp_cancel); ++ test_multifd_precopy_tcp_cancel); ++ if (env->has_uffd) { ++ migration_test_add("/migration/multifd+postcopy/tcp/plain/cancel", ++ test_multifd_postcopy_tcp_cancel); ++ } ++ + #ifdef CONFIG_RDMA + migration_test_add("/migration/precopy/rdma/plain", + test_precopy_rdma_plain); +diff --git a/tests/qtest/migration/tls-tests.c b/tests/qtest/migration/tls-tests.c +index 72f44defbb..21e9fec87d 100644 +--- a/tests/qtest/migration/tls-tests.c ++++ b/tests/qtest/migration/tls-tests.c +@@ -395,6 +395,19 @@ static void test_postcopy_recovery_tls_psk(void) + test_postcopy_recovery_common(&args); + } + ++static void test_multifd_postcopy_recovery_tls_psk(void) ++{ ++ MigrateCommon args = { ++ .start_hook = migrate_hook_start_tls_psk_match, ++ .end_hook = migrate_hook_end_tls_psk, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, ++ }; ++ ++ test_postcopy_recovery_common(&args); ++} ++ + /* This contains preempt+recovery+tls test altogether */ + static void test_postcopy_preempt_all(void) + { +@@ -409,6 +422,20 @@ static void test_postcopy_preempt_all(void) + test_postcopy_recovery_common(&args); + } + ++static void test_multifd_postcopy_preempt_recovery_tls_psk(void) ++{ ++ MigrateCommon args = { ++ .start_hook = migrate_hook_start_tls_psk_match, ++ .end_hook = migrate_hook_end_tls_psk, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ .caps[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT] = true, ++ }, ++ }; ++ ++ test_postcopy_recovery_common(&args); ++} ++ + static void test_precopy_unix_tls_psk(void) + { + g_autofree char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs); +@@ -657,6 +684,21 @@ static void test_multifd_tcp_tls_psk_mismatch(void) + test_precopy_common(&args); + } + ++static void test_multifd_postcopy_tcp_tls_psk_match(void) ++{ ++ MigrateCommon args = { ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ .caps[MIGRATION_CAPABILITY_POSTCOPY_RAM] = true, ++ }, ++ .listen_uri = "defer", ++ .start_hook = migrate_hook_start_multifd_tcp_tls_psk_match, ++ .end_hook = migrate_hook_end_tls_psk, ++ }; ++ ++ test_precopy_common(&args); ++} ++ + #ifdef CONFIG_TASN1 + static void test_multifd_tcp_tls_x509_default_host(void) + { +@@ -774,6 +816,11 @@ void migration_test_add_tls(MigrationTestEnv *env) + test_postcopy_preempt_tls_psk); + migration_test_add("/migration/postcopy/preempt/recovery/tls/psk", + test_postcopy_preempt_all); ++ migration_test_add("/migration/multifd+postcopy/recovery/tls/psk", ++ test_multifd_postcopy_recovery_tls_psk); ++ migration_test_add( ++ "/migration/multifd+postcopy/preempt/recovery/tls/psk", ++ test_multifd_postcopy_preempt_recovery_tls_psk); + } + #ifdef CONFIG_TASN1 + migration_test_add("/migration/precopy/unix/tls/x509/default-host", +@@ -805,6 +852,10 @@ void migration_test_add_tls(MigrationTestEnv *env) + test_multifd_tcp_tls_psk_match); + migration_test_add("/migration/multifd/tcp/tls/psk/mismatch", + test_multifd_tcp_tls_psk_mismatch); ++ if (env->has_uffd) { ++ migration_test_add("/migration/multifd+postcopy/tcp/tls/psk/match", ++ test_multifd_postcopy_tcp_tls_psk_match); ++ } + #ifdef CONFIG_TASN1 + migration_test_add("/migration/multifd/tcp/tls/x509/default-host", + test_multifd_tcp_tls_x509_default_host); +-- +2.39.3 + diff --git a/kvm-tests-qtest-migration-consolidate-set-capabilities.patch b/kvm-tests-qtest-migration-consolidate-set-capabilities.patch new file mode 100644 index 0000000..d216826 --- /dev/null +++ b/kvm-tests-qtest-migration-consolidate-set-capabilities.patch @@ -0,0 +1,659 @@ +From 450029655e9fe0b958d05ae3ba1469a2e322b59a Mon Sep 17 00:00:00 2001 +From: Prasad Pandit +Date: Fri, 11 Apr 2025 17:15:33 +0530 +Subject: [PATCH 05/33] tests/qtest/migration: consolidate set capabilities + +RH-Author: Prasad Pandit +RH-MergeRequest: 390: migration: allow to enable multifd+postcopy features together, but use multifd during precopy only +RH-Jira: RHEL-59697 +RH-Acked-by: Juraj Marcin +RH-Acked-by: Miroslav Rezanina +RH-Commit: [5/11] 9cfa760da90b57cb554a9766ad79635436fca4b9 (pjp/cs-qemu-kvm) + +Migration capabilities are set in multiple '.start_hook' +functions for various tests. Instead, consolidate setting +capabilities in 'migrate_start_set_capabilities()' function +which is called from the 'migrate_start()' function. +While simplifying the capabilities setting, it helps +to declutter the qtest sources. + +Jira: https://issues.redhat.com/browse/RHEL-59697 +Suggested-by: Fabiano Rosas +Signed-off-by: Prasad Pandit +Reviewed-by: Fabiano Rosas +Message-ID: <20250411114534.3370816-7-ppandit@redhat.com> +[fix open brace] +Signed-off-by: Fabiano Rosas +(cherry picked from commit 115cec9d663c1a2f5a73df4a5ca02b3a676e8a2a) +Signed-off-by: Prasad Pandit +--- + tests/qtest/migration/compression-tests.c | 22 +++++-- + tests/qtest/migration/cpr-tests.c | 6 +- + tests/qtest/migration/file-tests.c | 58 ++++++++---------- + tests/qtest/migration/framework.c | 75 +++++++++++++++-------- + tests/qtest/migration/framework.h | 9 ++- + tests/qtest/migration/misc-tests.c | 4 +- + tests/qtest/migration/postcopy-tests.c | 8 ++- + tests/qtest/migration/precopy-tests.c | 29 +++++---- + tests/qtest/migration/tls-tests.c | 23 ++++++- + 9 files changed, 150 insertions(+), 84 deletions(-) + +diff --git a/tests/qtest/migration/compression-tests.c b/tests/qtest/migration/compression-tests.c +index 8b58401b84..41e79f031b 100644 +--- a/tests/qtest/migration/compression-tests.c ++++ b/tests/qtest/migration/compression-tests.c +@@ -35,6 +35,9 @@ static void test_multifd_tcp_zstd(void) + { + MigrateCommon args = { + .listen_uri = "defer", ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, + .start_hook = migrate_hook_start_precopy_tcp_multifd_zstd, + }; + test_precopy_common(&args); +@@ -56,6 +59,9 @@ static void test_multifd_tcp_qatzip(void) + { + MigrateCommon args = { + .listen_uri = "defer", ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, + .start_hook = migrate_hook_start_precopy_tcp_multifd_qatzip, + }; + test_precopy_common(&args); +@@ -74,6 +80,9 @@ static void test_multifd_tcp_qpl(void) + { + MigrateCommon args = { + .listen_uri = "defer", ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, + .start_hook = migrate_hook_start_precopy_tcp_multifd_qpl, + }; + test_precopy_common(&args); +@@ -92,6 +101,9 @@ static void test_multifd_tcp_uadk(void) + { + MigrateCommon args = { + .listen_uri = "defer", ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, + .start_hook = migrate_hook_start_precopy_tcp_multifd_uadk, + }; + test_precopy_common(&args); +@@ -103,10 +115,6 @@ migrate_hook_start_xbzrle(QTestState *from, + QTestState *to) + { + migrate_set_parameter_int(from, "xbzrle-cache-size", 33554432); +- +- migrate_set_capability(from, "xbzrle", true); +- migrate_set_capability(to, "xbzrle", true); +- + return NULL; + } + +@@ -118,6 +126,9 @@ static void test_precopy_unix_xbzrle(void) + .listen_uri = uri, + .start_hook = migrate_hook_start_xbzrle, + .iterations = 2, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_XBZRLE] = true, ++ }, + /* + * XBZRLE needs pages to be modified when doing the 2nd+ round + * iteration to have real data pushed to the stream. +@@ -146,6 +157,9 @@ static void test_multifd_tcp_zlib(void) + { + MigrateCommon args = { + .listen_uri = "defer", ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, + .start_hook = migrate_hook_start_precopy_tcp_multifd_zlib, + }; + test_precopy_common(&args); +diff --git a/tests/qtest/migration/cpr-tests.c b/tests/qtest/migration/cpr-tests.c +index 4758841824..5536e14610 100644 +--- a/tests/qtest/migration/cpr-tests.c ++++ b/tests/qtest/migration/cpr-tests.c +@@ -24,9 +24,6 @@ static void *migrate_hook_start_mode_reboot(QTestState *from, QTestState *to) + migrate_set_parameter_str(from, "mode", "cpr-reboot"); + migrate_set_parameter_str(to, "mode", "cpr-reboot"); + +- migrate_set_capability(from, "x-ignore-shared", true); +- migrate_set_capability(to, "x-ignore-shared", true); +- + return NULL; + } + +@@ -39,6 +36,9 @@ static void test_mode_reboot(void) + .connect_uri = uri, + .listen_uri = "defer", + .start_hook = migrate_hook_start_mode_reboot, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_X_IGNORE_SHARED] = true, ++ }, + }; + + test_file_common(&args, true); +diff --git a/tests/qtest/migration/file-tests.c b/tests/qtest/migration/file-tests.c +index f260e2871d..4d78ce0855 100644 +--- a/tests/qtest/migration/file-tests.c ++++ b/tests/qtest/migration/file-tests.c +@@ -107,15 +107,6 @@ static void test_precopy_file_offset_bad(void) + test_file_common(&args, false); + } + +-static void *migrate_hook_start_mapped_ram(QTestState *from, +- QTestState *to) +-{ +- migrate_set_capability(from, "mapped-ram", true); +- migrate_set_capability(to, "mapped-ram", true); +- +- return NULL; +-} +- + static void test_precopy_file_mapped_ram_live(void) + { + g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, +@@ -123,7 +114,9 @@ static void test_precopy_file_mapped_ram_live(void) + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", +- .start_hook = migrate_hook_start_mapped_ram, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MAPPED_RAM] = true, ++ }, + }; + + test_file_common(&args, false); +@@ -136,26 +129,14 @@ static void test_precopy_file_mapped_ram(void) + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", +- .start_hook = migrate_hook_start_mapped_ram, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MAPPED_RAM] = true, ++ }, + }; + + test_file_common(&args, true); + } + +-static void *migrate_hook_start_multifd_mapped_ram(QTestState *from, +- QTestState *to) +-{ +- migrate_hook_start_mapped_ram(from, to); +- +- migrate_set_parameter_int(from, "multifd-channels", 4); +- migrate_set_parameter_int(to, "multifd-channels", 4); +- +- migrate_set_capability(from, "multifd", true); +- migrate_set_capability(to, "multifd", true); +- +- return NULL; +-} +- + static void test_multifd_file_mapped_ram_live(void) + { + g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, +@@ -163,7 +144,10 @@ static void test_multifd_file_mapped_ram_live(void) + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", +- .start_hook = migrate_hook_start_multifd_mapped_ram, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ .caps[MIGRATION_CAPABILITY_MAPPED_RAM] = true, ++ }, + }; + + test_file_common(&args, false); +@@ -176,7 +160,10 @@ static void test_multifd_file_mapped_ram(void) + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", +- .start_hook = migrate_hook_start_multifd_mapped_ram, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ .caps[MIGRATION_CAPABILITY_MAPPED_RAM] = true, ++ }, + }; + + test_file_common(&args, true); +@@ -185,8 +172,6 @@ static void test_multifd_file_mapped_ram(void) + static void *migrate_hook_start_multifd_mapped_ram_dio(QTestState *from, + QTestState *to) + { +- migrate_hook_start_multifd_mapped_ram(from, to); +- + migrate_set_parameter_bool(from, "direct-io", true); + migrate_set_parameter_bool(to, "direct-io", true); + +@@ -201,6 +186,10 @@ static void test_multifd_file_mapped_ram_dio(void) + .connect_uri = uri, + .listen_uri = "defer", + .start_hook = migrate_hook_start_multifd_mapped_ram_dio, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MAPPED_RAM] = true, ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, + }; + + if (!probe_o_direct_support(tmpfs)) { +@@ -246,7 +235,6 @@ static void *migrate_hook_start_multifd_mapped_ram_fdset_dio(QTestState *from, + fdset_add_fds(from, file, O_WRONLY, 2, true); + fdset_add_fds(to, file, O_RDONLY, 2, true); + +- migrate_hook_start_multifd_mapped_ram(from, to); + migrate_set_parameter_bool(from, "direct-io", true); + migrate_set_parameter_bool(to, "direct-io", true); + +@@ -261,8 +249,6 @@ static void *migrate_hook_start_multifd_mapped_ram_fdset(QTestState *from, + fdset_add_fds(from, file, O_WRONLY, 2, false); + fdset_add_fds(to, file, O_RDONLY, 2, false); + +- migrate_hook_start_multifd_mapped_ram(from, to); +- + return NULL; + } + +@@ -275,6 +261,10 @@ static void test_multifd_file_mapped_ram_fdset(void) + .listen_uri = "defer", + .start_hook = migrate_hook_start_multifd_mapped_ram_fdset, + .end_hook = migrate_hook_end_multifd_mapped_ram_fdset, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MAPPED_RAM] = true, ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, + }; + + test_file_common(&args, true); +@@ -289,6 +279,10 @@ static void test_multifd_file_mapped_ram_fdset_dio(void) + .listen_uri = "defer", + .start_hook = migrate_hook_start_multifd_mapped_ram_fdset_dio, + .end_hook = migrate_hook_end_multifd_mapped_ram_fdset, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MAPPED_RAM] = true, ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, + }; + + if (!probe_o_direct_support(tmpfs)) { +diff --git a/tests/qtest/migration/framework.c b/tests/qtest/migration/framework.c +index 10e1d04b58..e48b80a127 100644 +--- a/tests/qtest/migration/framework.c ++++ b/tests/qtest/migration/framework.c +@@ -30,6 +30,7 @@ + #define QEMU_VM_FILE_MAGIC 0x5145564d + #define QEMU_ENV_SRC "QTEST_QEMU_BINARY_SRC" + #define QEMU_ENV_DST "QTEST_QEMU_BINARY_DST" ++#define MULTIFD_TEST_CHANNELS 4 + + unsigned start_address; + unsigned end_address; +@@ -207,6 +208,51 @@ static QList *migrate_start_get_qmp_capabilities(const MigrateStart *args) + return capabilities; + } + ++static void migrate_start_set_capabilities(QTestState *from, QTestState *to, ++ MigrateStart *args) ++{ ++ /* ++ * MigrationCapability_lookup and MIGRATION_CAPABILITY_ constants ++ * are from qapi-types-migration.h. ++ */ ++ for (uint8_t i = 0; i < MIGRATION_CAPABILITY__MAX; i++) { ++ if (!args->caps[i]) { ++ continue; ++ } ++ if (from) { ++ migrate_set_capability(from, ++ MigrationCapability_lookup.array[i], true); ++ } ++ if (to) { ++ migrate_set_capability(to, ++ MigrationCapability_lookup.array[i], true); ++ } ++ } ++ ++ /* ++ * Always enable migration events. Libvirt always uses it, let's try ++ * to mimic as closer as that. ++ */ ++ migrate_set_capability(from, "events", true); ++ if (!args->defer_target_connect) { ++ migrate_set_capability(to, "events", true); ++ } ++ ++ /* ++ * Default number of channels should be fine for most ++ * tests. Individual tests can override by calling ++ * migrate_set_parameter() directly. ++ */ ++ if (args->caps[MIGRATION_CAPABILITY_MULTIFD]) { ++ migrate_set_parameter_int(from, "multifd-channels", ++ MULTIFD_TEST_CHANNELS); ++ migrate_set_parameter_int(to, "multifd-channels", ++ MULTIFD_TEST_CHANNELS); ++ } ++ ++ return; ++} ++ + int migrate_start(QTestState **from, QTestState **to, const char *uri, + MigrateStart *args) + { +@@ -379,14 +425,7 @@ int migrate_start(QTestState **from, QTestState **to, const char *uri, + unlink(shmem_path); + } + +- /* +- * Always enable migration events. Libvirt always uses it, let's try +- * to mimic as closer as that. +- */ +- migrate_set_capability(*from, "events", true); +- if (!args->defer_target_connect) { +- migrate_set_capability(*to, "events", true); +- } ++ migrate_start_set_capabilities(*from, *to, args); + + return 0; + } +@@ -432,6 +471,10 @@ static int migrate_postcopy_prepare(QTestState **from_ptr, + { + QTestState *from, *to; + ++ /* set postcopy capabilities */ ++ args->start.caps[MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME] = true; ++ args->start.caps[MIGRATION_CAPABILITY_POSTCOPY_RAM] = true; ++ + if (migrate_start(&from, &to, "defer", &args->start)) { + return -1; + } +@@ -440,17 +483,7 @@ static int migrate_postcopy_prepare(QTestState **from_ptr, + args->postcopy_data = args->start_hook(from, to); + } + +- migrate_set_capability(from, "postcopy-ram", true); +- migrate_set_capability(to, "postcopy-ram", true); +- migrate_set_capability(to, "postcopy-blocktime", true); +- +- if (args->postcopy_preempt) { +- migrate_set_capability(from, "postcopy-preempt", true); +- migrate_set_capability(to, "postcopy-preempt", true); +- } +- + migrate_ensure_non_converge(from); +- + migrate_prepare_for_dirty_mem(from); + qtest_qmp_assert_success(to, "{ 'execute': 'migrate-incoming'," + " 'arguments': { " +@@ -948,15 +981,9 @@ void *migrate_hook_start_precopy_tcp_multifd_common(QTestState *from, + QTestState *to, + const char *method) + { +- migrate_set_parameter_int(from, "multifd-channels", 16); +- migrate_set_parameter_int(to, "multifd-channels", 16); +- + migrate_set_parameter_str(from, "multifd-compression", method); + migrate_set_parameter_str(to, "multifd-compression", method); + +- migrate_set_capability(from, "multifd", true); +- migrate_set_capability(to, "multifd", true); +- + /* Start incoming migration from the 1st socket */ + migrate_incoming_qmp(to, "tcp:127.0.0.1:0", NULL, "{}"); + +diff --git a/tests/qtest/migration/framework.h b/tests/qtest/migration/framework.h +index e4a11870f6..01e425e64e 100644 +--- a/tests/qtest/migration/framework.h ++++ b/tests/qtest/migration/framework.h +@@ -12,6 +12,7 @@ + #define TEST_FRAMEWORK_H + + #include "libqtest.h" ++#include + + #define FILE_TEST_FILENAME "migfile" + #define FILE_TEST_OFFSET 0x1000 +@@ -120,6 +121,13 @@ typedef struct { + + /* Do not connect to target monitor and qtest sockets in qtest_init */ + bool defer_target_connect; ++ ++ /* ++ * Migration capabilities to be set in both source and ++ * destination. For unilateral capabilities, use ++ * migration_set_capabilities(). ++ */ ++ bool caps[MIGRATION_CAPABILITY__MAX]; + } MigrateStart; + + typedef enum PostcopyRecoveryFailStage { +@@ -207,7 +215,6 @@ typedef struct { + + /* Postcopy specific fields */ + void *postcopy_data; +- bool postcopy_preempt; + PostcopyRecoveryFailStage postcopy_recovery_fail_stage; + } MigrateCommon; + +diff --git a/tests/qtest/migration/misc-tests.c b/tests/qtest/migration/misc-tests.c +index 2e612d9e38..54995256d8 100644 +--- a/tests/qtest/migration/misc-tests.c ++++ b/tests/qtest/migration/misc-tests.c +@@ -98,6 +98,7 @@ static void test_ignore_shared(void) + QTestState *from, *to; + MigrateStart args = { + .use_shmem = true, ++ .caps[MIGRATION_CAPABILITY_X_IGNORE_SHARED] = true, + }; + + if (migrate_start(&from, &to, uri, &args)) { +@@ -107,9 +108,6 @@ static void test_ignore_shared(void) + migrate_ensure_non_converge(from); + migrate_prepare_for_dirty_mem(from); + +- migrate_set_capability(from, "x-ignore-shared", true); +- migrate_set_capability(to, "x-ignore-shared", true); +- + /* Wait for the first serial output from the source */ + wait_for_serial("src_serial"); + +diff --git a/tests/qtest/migration/postcopy-tests.c b/tests/qtest/migration/postcopy-tests.c +index 982457bed1..483e3ff99f 100644 +--- a/tests/qtest/migration/postcopy-tests.c ++++ b/tests/qtest/migration/postcopy-tests.c +@@ -39,7 +39,9 @@ static void test_postcopy_suspend(void) + static void test_postcopy_preempt(void) + { + MigrateCommon args = { +- .postcopy_preempt = true, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT] = true, ++ }, + }; + + test_postcopy_common(&args); +@@ -73,7 +75,9 @@ static void test_postcopy_recovery_fail_reconnect(void) + static void test_postcopy_preempt_recovery(void) + { + MigrateCommon args = { +- .postcopy_preempt = true, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT] = true, ++ }, + }; + + test_postcopy_recovery_common(&args); +diff --git a/tests/qtest/migration/precopy-tests.c b/tests/qtest/migration/precopy-tests.c +index ba273d10b9..f8404793b8 100644 +--- a/tests/qtest/migration/precopy-tests.c ++++ b/tests/qtest/migration/precopy-tests.c +@@ -108,23 +108,14 @@ static void test_precopy_tcp_plain(void) + test_precopy_common(&args); + } + +-static void *migrate_hook_start_switchover_ack(QTestState *from, QTestState *to) +-{ +- +- migrate_set_capability(from, "return-path", true); +- migrate_set_capability(to, "return-path", true); +- +- migrate_set_capability(from, "switchover-ack", true); +- migrate_set_capability(to, "switchover-ack", true); +- +- return NULL; +-} +- + static void test_precopy_tcp_switchover_ack(void) + { + MigrateCommon args = { + .listen_uri = "tcp:127.0.0.1:0", +- .start_hook = migrate_hook_start_switchover_ack, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_RETURN_PATH] = true, ++ .caps[MIGRATION_CAPABILITY_SWITCHOVER_ACK] = true, ++ }, + /* + * Source VM must be running in order to consider the switchover ACK + * when deciding to do switchover or not. +@@ -393,6 +384,9 @@ static void test_multifd_tcp_uri_none(void) + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = migrate_hook_start_precopy_tcp_multifd, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, + /* + * Multifd is more complicated than most of the features, it + * directly takes guest page buffers when sending, make sure +@@ -408,6 +402,9 @@ static void test_multifd_tcp_zero_page_legacy(void) + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = migrate_hook_start_precopy_tcp_multifd_zero_page_legacy, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, + /* + * Multifd is more complicated than most of the features, it + * directly takes guest page buffers when sending, make sure +@@ -423,6 +420,9 @@ static void test_multifd_tcp_no_zero_page(void) + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = migrate_hook_start_precopy_tcp_multifd_no_zero_page, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, + /* + * Multifd is more complicated than most of the features, it + * directly takes guest page buffers when sending, make sure +@@ -439,6 +439,9 @@ static void test_multifd_tcp_channels_none(void) + .listen_uri = "defer", + .start_hook = migrate_hook_start_precopy_tcp_multifd, + .live = true, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, + .connect_channels = ("[ { 'channel-type': 'main'," + " 'addr': { 'transport': 'socket'," + " 'type': 'inet'," +diff --git a/tests/qtest/migration/tls-tests.c b/tests/qtest/migration/tls-tests.c +index 2cb4a44bcd..72f44defbb 100644 +--- a/tests/qtest/migration/tls-tests.c ++++ b/tests/qtest/migration/tls-tests.c +@@ -375,9 +375,11 @@ static void test_postcopy_tls_psk(void) + static void test_postcopy_preempt_tls_psk(void) + { + MigrateCommon args = { +- .postcopy_preempt = true, + .start_hook = migrate_hook_start_tls_psk_match, + .end_hook = migrate_hook_end_tls_psk, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT] = true, ++ }, + }; + + test_postcopy_common(&args); +@@ -397,9 +399,11 @@ static void test_postcopy_recovery_tls_psk(void) + static void test_postcopy_preempt_all(void) + { + MigrateCommon args = { +- .postcopy_preempt = true, + .start_hook = migrate_hook_start_tls_psk_match, + .end_hook = migrate_hook_end_tls_psk, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT] = true, ++ }, + }; + + test_postcopy_recovery_common(&args); +@@ -631,6 +635,9 @@ static void test_multifd_tcp_tls_psk_match(void) + .listen_uri = "defer", + .start_hook = migrate_hook_start_multifd_tcp_tls_psk_match, + .end_hook = migrate_hook_end_tls_psk, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, + }; + test_precopy_common(&args); + } +@@ -640,6 +647,7 @@ static void test_multifd_tcp_tls_psk_mismatch(void) + MigrateCommon args = { + .start = { + .hide_stderr = true, ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, + }, + .listen_uri = "defer", + .start_hook = migrate_hook_start_multifd_tcp_tls_psk_mismatch, +@@ -656,6 +664,9 @@ static void test_multifd_tcp_tls_x509_default_host(void) + .listen_uri = "defer", + .start_hook = migrate_hook_start_multifd_tls_x509_default_host, + .end_hook = migrate_hook_end_tls_x509, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, + }; + test_precopy_common(&args); + } +@@ -666,6 +677,9 @@ static void test_multifd_tcp_tls_x509_override_host(void) + .listen_uri = "defer", + .start_hook = migrate_hook_start_multifd_tls_x509_override_host, + .end_hook = migrate_hook_end_tls_x509, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, + }; + test_precopy_common(&args); + } +@@ -688,6 +702,7 @@ static void test_multifd_tcp_tls_x509_mismatch_host(void) + MigrateCommon args = { + .start = { + .hide_stderr = true, ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, + }, + .listen_uri = "defer", + .start_hook = migrate_hook_start_multifd_tls_x509_mismatch_host, +@@ -703,6 +718,9 @@ static void test_multifd_tcp_tls_x509_allow_anon_client(void) + .listen_uri = "defer", + .start_hook = migrate_hook_start_multifd_tls_x509_allow_anon_client, + .end_hook = migrate_hook_end_tls_x509, ++ .start = { ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, ++ }, + }; + test_precopy_common(&args); + } +@@ -712,6 +730,7 @@ static void test_multifd_tcp_tls_x509_reject_anon_client(void) + MigrateCommon args = { + .start = { + .hide_stderr = true, ++ .caps[MIGRATION_CAPABILITY_MULTIFD] = true, + }, + .listen_uri = "defer", + .start_hook = migrate_hook_start_multifd_tls_x509_reject_anon_client, +-- +2.39.3 + diff --git a/qemu-kvm.spec b/qemu-kvm.spec index f40dd29..b2ebb87 100644 --- a/qemu-kvm.spec +++ b/qemu-kvm.spec @@ -158,7 +158,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}:%{version} \ Summary: QEMU is a machine emulator and virtualizer Name: qemu-kvm Version: 10.0.0 -Release: 7%{?rcrel}%{?dist}%{?cc_suffix}.alma.1 +Release: 8%{?rcrel}%{?dist}%{?cc_suffix}.alma.1 # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped # Epoch 15 used for RHEL 8 # Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5) @@ -363,6 +363,72 @@ Patch92: kvm-hw-i386-amd_iommu-Allow-migration-when-explicitly-cr.patch Patch93: kvm-Enable-amd-iommu-device.patch # For RHEL-83883 - Video stuck after switchover phase when play one video during migration Patch94: kvm-ui-vnc-Update-display-update-interval-when-VM-state-.patch +# For RHEL-59697 - Allow multifd+postcopy features being enabled together, but only use multifd during precopy +Patch95: kvm-migration-multifd-move-macros-to-multifd-header.patch +# For RHEL-59697 - Allow multifd+postcopy features being enabled together, but only use multifd during precopy +Patch96: kvm-migration-refactor-channel-discovery-mechanism.patch +# For RHEL-59697 - Allow multifd+postcopy features being enabled together, but only use multifd during precopy +Patch97: kvm-migration-Add-save_postcopy_prepare-savevm-handler.patch +# For RHEL-59697 - Allow multifd+postcopy features being enabled together, but only use multifd during precopy +Patch98: kvm-migration-ram-Implement-save_postcopy_prepare.patch +# For RHEL-59697 - Allow multifd+postcopy features being enabled together, but only use multifd during precopy +Patch99: kvm-tests-qtest-migration-consolidate-set-capabilities.patch +# For RHEL-59697 - Allow multifd+postcopy features being enabled together, but only use multifd during precopy +Patch100: kvm-migration-write-zero-pages-when-postcopy-enabled.patch +# For RHEL-59697 - Allow multifd+postcopy features being enabled together, but only use multifd during precopy +Patch101: kvm-migration-enable-multifd-and-postcopy-together.patch +# For RHEL-59697 - Allow multifd+postcopy features being enabled together, but only use multifd during precopy +Patch102: kvm-migration-Add-qtest-for-migration-over-RDMA.patch +# For RHEL-59697 - Allow multifd+postcopy features being enabled together, but only use multifd during precopy +Patch103: kvm-qtest-migration-rdma-Enforce-RLIMIT_MEMLOCK-128MB-re.patch +# For RHEL-59697 - Allow multifd+postcopy features being enabled together, but only use multifd during precopy +Patch104: kvm-qtest-migration-rdma-Add-test-for-rdma-migration-wit.patch +# For RHEL-59697 - Allow multifd+postcopy features being enabled together, but only use multifd during precopy +Patch105: kvm-tests-qtest-migration-add-postcopy-tests-with-multif.patch +# For RHEL-96854 - Performance Degradation(aio=threads) between Upstream Commit b75c5f9 and 984a32f +Patch106: kvm-file-posix-Fix-aio-threads-performance-regression-af.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch107: kvm-block-remove-outdated-comments-about-AioContext-lock.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch108: kvm-block-move-drain-outside-of-read-locked-bdrv_reopen_.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch109: kvm-block-snapshot-move-drain-outside-of-read-locked-bdr.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch110: kvm-block-move-drain-outside-of-read-locked-bdrv_inactiv.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch111: kvm-block-mark-bdrv_parent_change_aio_context-GRAPH_RDLO.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch112: kvm-block-mark-change_aio_ctx-callback-and-instances-as-.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch113: kvm-block-mark-bdrv_child_change_aio_context-GRAPH_RDLOC.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch114: kvm-block-move-drain-outside-of-bdrv_change_aio_context-.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch115: kvm-block-move-drain-outside-of-bdrv_try_change_aio_cont.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch116: kvm-block-move-drain-outside-of-bdrv_attach_child_common.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch117: kvm-block-move-drain-outside-of-bdrv_set_backing_hd_drai.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch118: kvm-block-move-drain-outside-of-bdrv_root_attach_child.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch119: kvm-block-move-drain-outside-of-bdrv_attach_child.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch120: kvm-block-move-drain-outside-of-quorum_add_child.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch121: kvm-block-move-drain-outside-of-bdrv_root_unref_child.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch122: kvm-block-move-drain-outside-of-quorum_del_child.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch123: kvm-blockdev-drain-while-unlocked-in-internal_snapshot_a.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch124: kvm-blockdev-drain-while-unlocked-in-external_snapshot_a.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch125: kvm-block-mark-bdrv_drained_begin-and-friends-as-GRAPH_U.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch126: kvm-iotests-graph-changes-while-io-remove-image-file-aft.patch +# For RHEL-88561 - qemu graph deadlock during job-dismiss +Patch127: kvm-iotests-graph-changes-while-io-add-test-case-with-re.patch # AlmaLinux Patch Patch2001: 2001-Add-ppc64-support.patch @@ -1494,12 +1560,53 @@ useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \ %endif %changelog -* Tue Jul 08 2025 Eduard Abdullin - 18:10.0.0-7.alma.1 +* Tue Jul 29 2025 Eduard Abdullin - 18:10.0.0-8.alma.1 - Enable QXL device build - Enable building for ppc64le - Re-added Spice support - Don't remove slof.bin for ppc64le +* Mon Jul 28 2025 Miroslav Rezanina - 10.0.0-8 +- kvm-migration-multifd-move-macros-to-multifd-header.patch [RHEL-59697] +- kvm-migration-refactor-channel-discovery-mechanism.patch [RHEL-59697] +- kvm-migration-Add-save_postcopy_prepare-savevm-handler.patch [RHEL-59697] +- kvm-migration-ram-Implement-save_postcopy_prepare.patch [RHEL-59697] +- kvm-tests-qtest-migration-consolidate-set-capabilities.patch [RHEL-59697] +- kvm-migration-write-zero-pages-when-postcopy-enabled.patch [RHEL-59697] +- kvm-migration-enable-multifd-and-postcopy-together.patch [RHEL-59697] +- kvm-migration-Add-qtest-for-migration-over-RDMA.patch [RHEL-59697] +- kvm-qtest-migration-rdma-Enforce-RLIMIT_MEMLOCK-128MB-re.patch [RHEL-59697] +- kvm-qtest-migration-rdma-Add-test-for-rdma-migration-wit.patch [RHEL-59697] +- kvm-tests-qtest-migration-add-postcopy-tests-with-multif.patch [RHEL-59697] +- kvm-file-posix-Fix-aio-threads-performance-regression-af.patch [RHEL-96854] +- kvm-block-remove-outdated-comments-about-AioContext-lock.patch [RHEL-88561] +- kvm-block-move-drain-outside-of-read-locked-bdrv_reopen_.patch [RHEL-88561] +- kvm-block-snapshot-move-drain-outside-of-read-locked-bdr.patch [RHEL-88561] +- kvm-block-move-drain-outside-of-read-locked-bdrv_inactiv.patch [RHEL-88561] +- kvm-block-mark-bdrv_parent_change_aio_context-GRAPH_RDLO.patch [RHEL-88561] +- kvm-block-mark-change_aio_ctx-callback-and-instances-as-.patch [RHEL-88561] +- kvm-block-mark-bdrv_child_change_aio_context-GRAPH_RDLOC.patch [RHEL-88561] +- kvm-block-move-drain-outside-of-bdrv_change_aio_context-.patch [RHEL-88561] +- kvm-block-move-drain-outside-of-bdrv_try_change_aio_cont.patch [RHEL-88561] +- kvm-block-move-drain-outside-of-bdrv_attach_child_common.patch [RHEL-88561] +- kvm-block-move-drain-outside-of-bdrv_set_backing_hd_drai.patch [RHEL-88561] +- kvm-block-move-drain-outside-of-bdrv_root_attach_child.patch [RHEL-88561] +- kvm-block-move-drain-outside-of-bdrv_attach_child.patch [RHEL-88561] +- kvm-block-move-drain-outside-of-quorum_add_child.patch [RHEL-88561] +- kvm-block-move-drain-outside-of-bdrv_root_unref_child.patch [RHEL-88561] +- kvm-block-move-drain-outside-of-quorum_del_child.patch [RHEL-88561] +- kvm-blockdev-drain-while-unlocked-in-internal_snapshot_a.patch [RHEL-88561] +- kvm-blockdev-drain-while-unlocked-in-external_snapshot_a.patch [RHEL-88561] +- kvm-block-mark-bdrv_drained_begin-and-friends-as-GRAPH_U.patch [RHEL-88561] +- kvm-iotests-graph-changes-while-io-remove-image-file-aft.patch [RHEL-88561] +- kvm-iotests-graph-changes-while-io-add-test-case-with-re.patch [RHEL-88561] +- Resolves: RHEL-59697 + (Allow multifd+postcopy features being enabled together, but only use multifd during precopy ) +- Resolves: RHEL-96854 + (Performance Degradation(aio=threads) between Upstream Commit b75c5f9 and 984a32f) +- Resolves: RHEL-88561 + (qemu graph deadlock during job-dismiss) + * Mon Jul 07 2025 Miroslav Rezanina - 10.0.0-7 - kvm-s390x-Fix-leak-in-machine_set_loadparm.patch [RHEL-98555] - kvm-hw-s390x-ccw-device-Fix-memory-leak-in-loadparm-sett.patch [RHEL-98555]