diff --git a/kvm-coroutine-cap-per-thread-local-pool-size.patch b/kvm-coroutine-cap-per-thread-local-pool-size.patch new file mode 100644 index 0000000..6fffc23 --- /dev/null +++ b/kvm-coroutine-cap-per-thread-local-pool-size.patch @@ -0,0 +1,412 @@ +From e99c56752a1c4021a93c92b7be78856ebefaa1b3 Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Mon, 18 Mar 2024 14:34:29 -0400 +Subject: [PATCH 1/2] coroutine: cap per-thread local pool size + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 234: coroutine: cap per-thread local pool size +RH-Jira: RHEL-28947 +RH-Acked-by: Kevin Wolf +RH-Acked-by: Hanna Czenczek +RH-Commit: [1/2] 5971de1c1e238457925bfb9c4bfc932de857b28d (stefanha/centos-stream-qemu-kvm) + +The coroutine pool implementation can hit the Linux vm.max_map_count +limit, causing QEMU to abort with "failed to allocate memory for stack" +or "failed to set up stack guard page" during coroutine creation. + +This happens because per-thread pools can grow to tens of thousands of +coroutines. Each coroutine causes 2 virtual memory areas to be created. +Eventually vm.max_map_count is reached and memory-related syscalls fail. +The per-thread pool sizes are non-uniform and depend on past coroutine +usage in each thread, so it's possible for one thread to have a large +pool while another thread's pool is empty. + +Switch to a new coroutine pool implementation with a global pool that +grows to a maximum number of coroutines and per-thread local pools that +are capped at hardcoded small number of coroutines. + +This approach does not leave large numbers of coroutines pooled in a +thread that may not use them again. In order to perform well it +amortizes the cost of global pool accesses by working in batches of +coroutines instead of individual coroutines. + +The global pool is a list. Threads donate batches of coroutines to when +they have too many and take batches from when they have too few: + +.-----------------------------------. +| Batch 1 | Batch 2 | Batch 3 | ... | global_pool +`-----------------------------------' + +Each thread has up to 2 batches of coroutines: + +.-------------------. +| Batch 1 | Batch 2 | per-thread local_pool (maximum 2 batches) +`-------------------' + +The goal of this change is to reduce the excessive number of pooled +coroutines that cause QEMU to abort when vm.max_map_count is reached +without losing the performance of an adequately sized coroutine pool. + +Here are virtio-blk disk I/O benchmark results: + + RW BLKSIZE IODEPTH OLD NEW CHANGE +randread 4k 1 113725 117451 +3.3% +randread 4k 8 192968 198510 +2.9% +randread 4k 16 207138 209429 +1.1% +randread 4k 32 212399 215145 +1.3% +randread 4k 64 218319 221277 +1.4% +randread 128k 1 17587 17535 -0.3% +randread 128k 8 17614 17616 +0.0% +randread 128k 16 17608 17609 +0.0% +randread 128k 32 17552 17553 +0.0% +randread 128k 64 17484 17484 +0.0% + +See files/{fio.sh,test.xml.j2} for the benchmark configuration: +https://gitlab.com/stefanha/virt-playbooks/-/tree/coroutine-pool-fix-sizing + +Buglink: https://issues.redhat.com/browse/RHEL-28947 +Reported-by: Sanjay Rao +Reported-by: Boaz Ben Shabat +Reported-by: Joe Mario +Reviewed-by: Kevin Wolf +Signed-off-by: Stefan Hajnoczi +Message-ID: <20240318183429.1039340-1-stefanha@redhat.com> +(cherry picked from commit 86a637e48104ae74d8be53bed6441ce32be33433) +Signed-off-by: Stefan Hajnoczi +--- + util/qemu-coroutine.c | 282 +++++++++++++++++++++++++++++++++--------- + 1 file changed, 223 insertions(+), 59 deletions(-) + +diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c +index 5fd2dbaf8b..2790959eaf 100644 +--- a/util/qemu-coroutine.c ++++ b/util/qemu-coroutine.c +@@ -18,39 +18,200 @@ + #include "qemu/atomic.h" + #include "qemu/coroutine_int.h" + #include "qemu/coroutine-tls.h" ++#include "qemu/cutils.h" + #include "block/aio.h" + +-/** +- * The minimal batch size is always 64, coroutines from the release_pool are +- * reused as soon as there are 64 coroutines in it. The maximum pool size starts +- * with 64 and is increased on demand so that coroutines are not deleted even if +- * they are not immediately reused. +- */ + enum { +- POOL_MIN_BATCH_SIZE = 64, +- POOL_INITIAL_MAX_SIZE = 64, ++ COROUTINE_POOL_BATCH_MAX_SIZE = 128, + }; + +-/** Free list to speed up creation */ +-static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool); +-static unsigned int pool_max_size = POOL_INITIAL_MAX_SIZE; +-static unsigned int release_pool_size; ++/* ++ * Coroutine creation and deletion is expensive so a pool of unused coroutines ++ * is kept as a cache. When the pool has coroutines available, they are ++ * recycled instead of creating new ones from scratch. Coroutines are added to ++ * the pool upon termination. ++ * ++ * The pool is global but each thread maintains a small local pool to avoid ++ * global pool contention. Threads fetch and return batches of coroutines from ++ * the global pool to maintain their local pool. The local pool holds up to two ++ * batches whereas the maximum size of the global pool is controlled by the ++ * qemu_coroutine_inc_pool_size() API. ++ * ++ * .-----------------------------------. ++ * | Batch 1 | Batch 2 | Batch 3 | ... | global_pool ++ * `-----------------------------------' ++ * ++ * .-------------------. ++ * | Batch 1 | Batch 2 | per-thread local_pool (maximum 2 batches) ++ * `-------------------' ++ */ ++typedef struct CoroutinePoolBatch { ++ /* Batches are kept in a list */ ++ QSLIST_ENTRY(CoroutinePoolBatch) next; ++ ++ /* This batch holds up to @COROUTINE_POOL_BATCH_MAX_SIZE coroutines */ ++ QSLIST_HEAD(, Coroutine) list; ++ unsigned int size; ++} CoroutinePoolBatch; ++ ++typedef QSLIST_HEAD(, CoroutinePoolBatch) CoroutinePool; ++ ++/* Host operating system limit on number of pooled coroutines */ ++static unsigned int global_pool_hard_max_size; ++ ++static QemuMutex global_pool_lock; /* protects the following variables */ ++static CoroutinePool global_pool = QSLIST_HEAD_INITIALIZER(global_pool); ++static unsigned int global_pool_size; ++static unsigned int global_pool_max_size = COROUTINE_POOL_BATCH_MAX_SIZE; ++ ++QEMU_DEFINE_STATIC_CO_TLS(CoroutinePool, local_pool); ++QEMU_DEFINE_STATIC_CO_TLS(Notifier, local_pool_cleanup_notifier); + +-typedef QSLIST_HEAD(, Coroutine) CoroutineQSList; +-QEMU_DEFINE_STATIC_CO_TLS(CoroutineQSList, alloc_pool); +-QEMU_DEFINE_STATIC_CO_TLS(unsigned int, alloc_pool_size); +-QEMU_DEFINE_STATIC_CO_TLS(Notifier, coroutine_pool_cleanup_notifier); ++static CoroutinePoolBatch *coroutine_pool_batch_new(void) ++{ ++ CoroutinePoolBatch *batch = g_new(CoroutinePoolBatch, 1); ++ ++ QSLIST_INIT(&batch->list); ++ batch->size = 0; ++ return batch; ++} + +-static void coroutine_pool_cleanup(Notifier *n, void *value) ++static void coroutine_pool_batch_delete(CoroutinePoolBatch *batch) + { + Coroutine *co; + Coroutine *tmp; +- CoroutineQSList *alloc_pool = get_ptr_alloc_pool(); + +- QSLIST_FOREACH_SAFE(co, alloc_pool, pool_next, tmp) { +- QSLIST_REMOVE_HEAD(alloc_pool, pool_next); ++ QSLIST_FOREACH_SAFE(co, &batch->list, pool_next, tmp) { ++ QSLIST_REMOVE_HEAD(&batch->list, pool_next); + qemu_coroutine_delete(co); + } ++ g_free(batch); ++} ++ ++static void local_pool_cleanup(Notifier *n, void *value) ++{ ++ CoroutinePool *local_pool = get_ptr_local_pool(); ++ CoroutinePoolBatch *batch; ++ CoroutinePoolBatch *tmp; ++ ++ QSLIST_FOREACH_SAFE(batch, local_pool, next, tmp) { ++ QSLIST_REMOVE_HEAD(local_pool, next); ++ coroutine_pool_batch_delete(batch); ++ } ++} ++ ++/* Ensure the atexit notifier is registered */ ++static void local_pool_cleanup_init_once(void) ++{ ++ Notifier *notifier = get_ptr_local_pool_cleanup_notifier(); ++ if (!notifier->notify) { ++ notifier->notify = local_pool_cleanup; ++ qemu_thread_atexit_add(notifier); ++ } ++} ++ ++/* Helper to get the next unused coroutine from the local pool */ ++static Coroutine *coroutine_pool_get_local(void) ++{ ++ CoroutinePool *local_pool = get_ptr_local_pool(); ++ CoroutinePoolBatch *batch = QSLIST_FIRST(local_pool); ++ Coroutine *co; ++ ++ if (unlikely(!batch)) { ++ return NULL; ++ } ++ ++ co = QSLIST_FIRST(&batch->list); ++ QSLIST_REMOVE_HEAD(&batch->list, pool_next); ++ batch->size--; ++ ++ if (batch->size == 0) { ++ QSLIST_REMOVE_HEAD(local_pool, next); ++ coroutine_pool_batch_delete(batch); ++ } ++ return co; ++} ++ ++/* Get the next batch from the global pool */ ++static void coroutine_pool_refill_local(void) ++{ ++ CoroutinePool *local_pool = get_ptr_local_pool(); ++ CoroutinePoolBatch *batch; ++ ++ WITH_QEMU_LOCK_GUARD(&global_pool_lock) { ++ batch = QSLIST_FIRST(&global_pool); ++ ++ if (batch) { ++ QSLIST_REMOVE_HEAD(&global_pool, next); ++ global_pool_size -= batch->size; ++ } ++ } ++ ++ if (batch) { ++ QSLIST_INSERT_HEAD(local_pool, batch, next); ++ local_pool_cleanup_init_once(); ++ } ++} ++ ++/* Add a batch of coroutines to the global pool */ ++static void coroutine_pool_put_global(CoroutinePoolBatch *batch) ++{ ++ WITH_QEMU_LOCK_GUARD(&global_pool_lock) { ++ unsigned int max = MIN(global_pool_max_size, ++ global_pool_hard_max_size); ++ ++ if (global_pool_size < max) { ++ QSLIST_INSERT_HEAD(&global_pool, batch, next); ++ ++ /* Overshooting the max pool size is allowed */ ++ global_pool_size += batch->size; ++ return; ++ } ++ } ++ ++ /* The global pool was full, so throw away this batch */ ++ coroutine_pool_batch_delete(batch); ++} ++ ++/* Get the next unused coroutine from the pool or return NULL */ ++static Coroutine *coroutine_pool_get(void) ++{ ++ Coroutine *co; ++ ++ co = coroutine_pool_get_local(); ++ if (!co) { ++ coroutine_pool_refill_local(); ++ co = coroutine_pool_get_local(); ++ } ++ return co; ++} ++ ++static void coroutine_pool_put(Coroutine *co) ++{ ++ CoroutinePool *local_pool = get_ptr_local_pool(); ++ CoroutinePoolBatch *batch = QSLIST_FIRST(local_pool); ++ ++ if (unlikely(!batch)) { ++ batch = coroutine_pool_batch_new(); ++ QSLIST_INSERT_HEAD(local_pool, batch, next); ++ local_pool_cleanup_init_once(); ++ } ++ ++ if (unlikely(batch->size >= COROUTINE_POOL_BATCH_MAX_SIZE)) { ++ CoroutinePoolBatch *next = QSLIST_NEXT(batch, next); ++ ++ /* Is the local pool full? */ ++ if (next) { ++ QSLIST_REMOVE_HEAD(local_pool, next); ++ coroutine_pool_put_global(batch); ++ } ++ ++ batch = coroutine_pool_batch_new(); ++ QSLIST_INSERT_HEAD(local_pool, batch, next); ++ } ++ ++ QSLIST_INSERT_HEAD(&batch->list, co, pool_next); ++ batch->size++; + } + + Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque) +@@ -58,31 +219,7 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque) + Coroutine *co = NULL; + + if (IS_ENABLED(CONFIG_COROUTINE_POOL)) { +- CoroutineQSList *alloc_pool = get_ptr_alloc_pool(); +- +- co = QSLIST_FIRST(alloc_pool); +- if (!co) { +- if (release_pool_size > POOL_MIN_BATCH_SIZE) { +- /* Slow path; a good place to register the destructor, too. */ +- Notifier *notifier = get_ptr_coroutine_pool_cleanup_notifier(); +- if (!notifier->notify) { +- notifier->notify = coroutine_pool_cleanup; +- qemu_thread_atexit_add(notifier); +- } +- +- /* This is not exact; there could be a little skew between +- * release_pool_size and the actual size of release_pool. But +- * it is just a heuristic, it does not need to be perfect. +- */ +- set_alloc_pool_size(qatomic_xchg(&release_pool_size, 0)); +- QSLIST_MOVE_ATOMIC(alloc_pool, &release_pool); +- co = QSLIST_FIRST(alloc_pool); +- } +- } +- if (co) { +- QSLIST_REMOVE_HEAD(alloc_pool, pool_next); +- set_alloc_pool_size(get_alloc_pool_size() - 1); +- } ++ co = coroutine_pool_get(); + } + + if (!co) { +@@ -100,19 +237,10 @@ static void coroutine_delete(Coroutine *co) + co->caller = NULL; + + if (IS_ENABLED(CONFIG_COROUTINE_POOL)) { +- if (release_pool_size < qatomic_read(&pool_max_size) * 2) { +- QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next); +- qatomic_inc(&release_pool_size); +- return; +- } +- if (get_alloc_pool_size() < qatomic_read(&pool_max_size)) { +- QSLIST_INSERT_HEAD(get_ptr_alloc_pool(), co, pool_next); +- set_alloc_pool_size(get_alloc_pool_size() + 1); +- return; +- } ++ coroutine_pool_put(co); ++ } else { ++ qemu_coroutine_delete(co); + } +- +- qemu_coroutine_delete(co); + } + + void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co) +@@ -223,10 +351,46 @@ AioContext *qemu_coroutine_get_aio_context(Coroutine *co) + + void qemu_coroutine_inc_pool_size(unsigned int additional_pool_size) + { +- qatomic_add(&pool_max_size, additional_pool_size); ++ QEMU_LOCK_GUARD(&global_pool_lock); ++ global_pool_max_size += additional_pool_size; + } + + void qemu_coroutine_dec_pool_size(unsigned int removing_pool_size) + { +- qatomic_sub(&pool_max_size, removing_pool_size); ++ QEMU_LOCK_GUARD(&global_pool_lock); ++ global_pool_max_size -= removing_pool_size; ++} ++ ++static unsigned int get_global_pool_hard_max_size(void) ++{ ++#ifdef __linux__ ++ g_autofree char *contents = NULL; ++ int max_map_count; ++ ++ /* ++ * Linux processes can have up to max_map_count virtual memory areas ++ * (VMAs). mmap(2), mprotect(2), etc fail with ENOMEM beyond this limit. We ++ * must limit the coroutine pool to a safe size to avoid running out of ++ * VMAs. ++ */ ++ if (g_file_get_contents("/proc/sys/vm/max_map_count", &contents, NULL, ++ NULL) && ++ qemu_strtoi(contents, NULL, 10, &max_map_count) == 0) { ++ /* ++ * This is a conservative upper bound that avoids exceeding ++ * max_map_count. Leave half for non-coroutine users like library ++ * dependencies, vhost-user, etc. Each coroutine takes up 2 VMAs so ++ * halve the amount again. ++ */ ++ return max_map_count / 4; ++ } ++#endif ++ ++ return UINT_MAX; ++} ++ ++static void __attribute__((constructor)) qemu_coroutine_init(void) ++{ ++ qemu_mutex_init(&global_pool_lock); ++ global_pool_hard_max_size = get_global_pool_hard_max_size(); + } +-- +2.39.3 + diff --git a/kvm-coroutine-reserve-5-000-mappings.patch b/kvm-coroutine-reserve-5-000-mappings.patch new file mode 100644 index 0000000..cdbb666 --- /dev/null +++ b/kvm-coroutine-reserve-5-000-mappings.patch @@ -0,0 +1,61 @@ +From 0aa65dc3acba481f7064df936ab49e3bceb1d5bd Mon Sep 17 00:00:00 2001 +From: Stefan Hajnoczi +Date: Wed, 20 Mar 2024 14:12:32 -0400 +Subject: [PATCH 2/2] coroutine: reserve 5,000 mappings +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +RH-Author: Stefan Hajnoczi +RH-MergeRequest: 234: coroutine: cap per-thread local pool size +RH-Jira: RHEL-28947 +RH-Acked-by: Kevin Wolf +RH-Acked-by: Hanna Czenczek +RH-Commit: [2/2] 78560c2b947471111cc16c313d6f38db42860a1c (stefanha/centos-stream-qemu-kvm) + +Daniel P. Berrangé pointed out that the coroutine +pool size heuristic is very conservative. Instead of halving +max_map_count, he suggested reserving 5,000 mappings for non-coroutine +users based on observations of guests he has access to. + +Fixes: 86a637e48104 ("coroutine: cap per-thread local pool size") +Signed-off-by: Stefan Hajnoczi +Reviewed-by: Daniel P. Berrangé +Message-id: 20240320181232.1464819-1-stefanha@redhat.com +Signed-off-by: Stefan Hajnoczi +(cherry picked from commit 9352f80cd926fe2dde7c89b93ee33bb0356ff40e) +Signed-off-by: Stefan Hajnoczi +--- + util/qemu-coroutine.c | 15 ++++++++++----- + 1 file changed, 10 insertions(+), 5 deletions(-) + +diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c +index 2790959eaf..eb4eebefdf 100644 +--- a/util/qemu-coroutine.c ++++ b/util/qemu-coroutine.c +@@ -377,12 +377,17 @@ static unsigned int get_global_pool_hard_max_size(void) + NULL) && + qemu_strtoi(contents, NULL, 10, &max_map_count) == 0) { + /* +- * This is a conservative upper bound that avoids exceeding +- * max_map_count. Leave half for non-coroutine users like library +- * dependencies, vhost-user, etc. Each coroutine takes up 2 VMAs so +- * halve the amount again. ++ * This is an upper bound that avoids exceeding max_map_count. Leave a ++ * fixed amount for non-coroutine users like library dependencies, ++ * vhost-user, etc. Each coroutine takes up 2 VMAs so halve the ++ * remaining amount. + */ +- return max_map_count / 4; ++ if (max_map_count > 5000) { ++ return (max_map_count - 5000) / 2; ++ } else { ++ /* Disable the global pool but threads still have local pools */ ++ return 0; ++ } + } + #endif + +-- +2.39.3 + diff --git a/qemu-kvm.spec b/qemu-kvm.spec index 120bf45..973e9a2 100644 --- a/qemu-kvm.spec +++ b/qemu-kvm.spec @@ -149,7 +149,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}:%{version} \ Summary: QEMU is a machine emulator and virtualizer Name: qemu-kvm Version: 8.2.0 -Release: 10%{?rcrel}%{?dist}%{?cc_suffix} +Release: 11%{?rcrel}%{?dist}%{?cc_suffix} # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped # Epoch 15 used for RHEL 8 # Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5) @@ -606,6 +606,10 @@ Patch173: kvm-chardev-lower-priority-of-the-HUP-GSource-in-socket-.patch Patch174: kvm-Revert-chardev-char-socket-Fix-TLS-io-channels-sendi.patch # For RHEL-24614 - [RHEL9][chardev] qemu hit core dump while using TLS server from host to guest Patch175: kvm-Revert-chardev-use-a-child-source-for-qio-input-sour.patch +# For RHEL-28947 - Qemu crashing with "failed to set up stack guard page: Cannot allocate memory" +Patch176: kvm-coroutine-cap-per-thread-local-pool-size.patch +# For RHEL-28947 - Qemu crashing with "failed to set up stack guard page: Cannot allocate memory" +Patch177: kvm-coroutine-reserve-5-000-mappings.patch %if %{have_clang} BuildRequires: clang @@ -1667,6 +1671,12 @@ useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \ %endif %changelog +* Tue Mar 26 2024 Miroslav Rezanina - 8.2.0-11 +- kvm-coroutine-cap-per-thread-local-pool-size.patch [RHEL-28947] +- kvm-coroutine-reserve-5-000-mappings.patch [RHEL-28947] +- Resolves: RHEL-28947 + (Qemu crashing with "failed to set up stack guard page: Cannot allocate memory") + * Thu Mar 21 2024 Miroslav Rezanina - 8.2.0-10 - kvm-chardev-lower-priority-of-the-HUP-GSource-in-socket-.patch [RHEL-24614] - kvm-Revert-chardev-char-socket-Fix-TLS-io-channels-sendi.patch [RHEL-24614]