From 6dbf26d6f46fce37e63fcf2eec542b03ef4e0704 Mon Sep 17 00:00:00 2001 From: DJ Delorie Date: Wed, 27 Mar 2024 21:45:36 -0400 Subject: [PATCH] x86: Fix Zen3/Zen4 ERMS selection Resolves: RHEL-25531 --- glibc-RHEL-25531-1.patch | 188 +++++++++++++++++++++++++++++++++++++++ glibc-RHEL-25531-2.patch | 155 ++++++++++++++++++++++++++++++++ glibc-RHEL-25531-3.patch | 34 +++++++ glibc-RHEL-25531-4.patch | 29 ++++++ glibc.spec | 9 +- 5 files changed, 414 insertions(+), 1 deletion(-) create mode 100644 glibc-RHEL-25531-1.patch create mode 100644 glibc-RHEL-25531-2.patch create mode 100644 glibc-RHEL-25531-3.patch create mode 100644 glibc-RHEL-25531-4.patch diff --git a/glibc-RHEL-25531-1.patch b/glibc-RHEL-25531-1.patch new file mode 100644 index 0000000..db0260f --- /dev/null +++ b/glibc-RHEL-25531-1.patch @@ -0,0 +1,188 @@ +From a4c3f5f46e850c977cda81c251036475aab8313c Mon Sep 17 00:00:00 2001 +From: Adhemerval Zanella +Date: Thu, 23 Nov 2023 14:29:14 -0300 +Subject: [PATCH] elf: Add a way to check if tunable is set (BZ 27069) +Content-type: text/plain; charset=UTF-8 + +The patch adds two new macros, TUNABLE_GET_DEFAULT and TUNABLE_IS_INITIALIZED, +here the former get the default value with a signature similar to +TUNABLE_GET, while the later returns whether the tunable was set by +the environment variable. + +Checked on x86_64-linux-gnu. 
+Reviewed-by: DJ Delorie +Tested-by: Zhangfei Gao + +Conflicts: + elf/Versions + (removed to preserve ABI) + elf/dl-tunable-types.h + (line numbers) + scripts/gen-tunables.awk + (account for missing TUNABLE_SECLEVEL patch) + +--- + elf/dl-tunable-types.h | 1 + + elf/dl-tunables.c | 40 ++++++++++++++++++++++++++++++++++++++++ + elf/dl-tunables.h | 28 ++++++++++++++++++++++++++++ + elf/dl-tunables.list | 1 + + scripts/gen-tunables.awk | 4 ++-- + 5 files changed, 72 insertions(+), 2 deletions(-) + +diff -rup a/elf/dl-tunable-types.h b/elf/dl-tunable-types.h +--- a/elf/dl-tunable-types.h 2021-08-01 21:33:43.000000000 -0400 ++++ b/elf/dl-tunable-types.h 2024-03-26 18:23:22.211504813 -0400 +@@ -61,6 +61,7 @@ struct _tunable + { + const char name[TUNABLE_NAME_MAX]; /* Internal name of the tunable. */ + tunable_type_t type; /* Data type of the tunable. */ ++ const tunable_val_t def; /* The default value. */ + tunable_val_t val; /* The value. */ + bool initialized; /* Flag to indicate that the tunable is + initialized. 
*/ +diff -rup a/elf/dl-tunables.c b/elf/dl-tunables.c +--- a/elf/dl-tunables.c 2024-03-26 18:21:10.090681748 -0400 ++++ b/elf/dl-tunables.c 2024-03-26 18:23:22.214504923 -0400 +@@ -152,6 +152,13 @@ tunable_initialize (tunable_t *cur, cons + do_tunable_update_val (cur, &val, NULL, NULL); + } + ++bool ++__tunable_is_initialized (tunable_id_t id) ++{ ++ return tunable_list[id].initialized; ++} ++rtld_hidden_def (__tunable_is_initialized) ++ + void + __tunable_set_val (tunable_id_t id, tunable_val_t *valp, tunable_num_t *minp, + tunable_num_t *maxp) +@@ -399,6 +406,39 @@ __tunables_print (void) + } + } + ++void ++__tunable_get_default (tunable_id_t id, void *valp) ++{ ++ tunable_t *cur = &tunable_list[id]; ++ ++ switch (cur->type.type_code) ++ { ++ case TUNABLE_TYPE_UINT_64: ++ { ++ *((uint64_t *) valp) = (uint64_t) cur->def.numval; ++ break; ++ } ++ case TUNABLE_TYPE_INT_32: ++ { ++ *((int32_t *) valp) = (int32_t) cur->def.numval; ++ break; ++ } ++ case TUNABLE_TYPE_SIZE_T: ++ { ++ *((size_t *) valp) = (size_t) cur->def.numval; ++ break; ++ } ++ case TUNABLE_TYPE_STRING: ++ { ++ *((const char **)valp) = cur->def.strval; ++ break; ++ } ++ default: ++ __builtin_unreachable (); ++ } ++} ++rtld_hidden_def (__tunable_get_default) ++ + /* Set the tunable value. This is called by the module that the tunable exists + in. 
*/ + void +diff -rup a/elf/dl-tunables.h b/elf/dl-tunables.h +--- a/elf/dl-tunables.h 2021-08-01 21:33:43.000000000 -0400 ++++ b/elf/dl-tunables.h 2024-03-26 18:23:22.217505032 -0400 +@@ -53,18 +53,26 @@ typedef void (*tunable_callback_t) (tuna + + extern void __tunables_init (char **); + extern void __tunables_print (void); ++extern bool __tunable_is_initialized (tunable_id_t); + extern void __tunable_get_val (tunable_id_t, void *, tunable_callback_t); + extern void __tunable_set_val (tunable_id_t, tunable_val_t *, tunable_num_t *, + tunable_num_t *); ++extern void __tunable_get_default (tunable_id_t id, void *valp); + rtld_hidden_proto (__tunables_init) + rtld_hidden_proto (__tunables_print) ++rtld_hidden_proto (__tunable_is_initialized) + rtld_hidden_proto (__tunable_get_val) + rtld_hidden_proto (__tunable_set_val) ++rtld_hidden_proto (__tunable_get_default) + + /* Define TUNABLE_GET and TUNABLE_SET in short form if TOP_NAMESPACE and + TUNABLE_NAMESPACE are defined. This is useful shorthand to get and set + tunables within a module. 
*/ + #if defined TOP_NAMESPACE && defined TUNABLE_NAMESPACE ++# define TUNABLE_IS_INITIALIZED(__id) \ ++ TUNABLE_IS_INITIALIZED_FULL(TOP_NAMESPACE, TUNABLE_NAMESPACE, __id) ++# define TUNABLE_GET_DEFAULT(__id, __type) \ ++ TUNABLE_GET_DEFAULT_FULL(TOP_NAMESPACE, TUNABLE_NAMESPACE, __id, __type) + # define TUNABLE_GET(__id, __type, __cb) \ + TUNABLE_GET_FULL (TOP_NAMESPACE, TUNABLE_NAMESPACE, __id, __type, __cb) + # define TUNABLE_SET(__id, __val) \ +@@ -73,6 +81,10 @@ rtld_hidden_proto (__tunable_set_val) + TUNABLE_SET_WITH_BOUNDS_FULL (TOP_NAMESPACE, TUNABLE_NAMESPACE, __id, \ + __val, __min, __max) + #else ++# define TUNABLE_IS_INITIALIZED(__top, __ns, __id) \ ++ TUNABLE_IS_INITIALIZED_FULL(__top, __ns, __id) ++# define TUNABLE_GET_DEFAULT(__top, __ns, __id, __type) \ ++ TUNABLE_GET_DEFAULT_FULL(__top, __ns, __id, __type) + # define TUNABLE_GET(__top, __ns, __id, __type, __cb) \ + TUNABLE_GET_FULL (__top, __ns, __id, __type, __cb) + # define TUNABLE_SET(__top, __ns, __id, __val) \ +@@ -81,6 +93,22 @@ rtld_hidden_proto (__tunable_set_val) + TUNABLE_SET_WITH_BOUNDS_FULL (__top, __ns, __id, __val, __min, __max) + #endif + ++/* Return whether the tunable was initialized by the environment variable. */ ++#define TUNABLE_IS_INITIALIZED_FULL(__top, __ns, __id) \ ++({ \ ++ tunable_id_t id = TUNABLE_ENUM_NAME (__top, __ns, __id); \ ++ __tunable_is_initialized (id); \ ++}) ++ ++/* Return the default value of the tunable. */ ++#define TUNABLE_GET_DEFAULT_FULL(__top, __ns, __id, __type) \ ++({ \ ++ tunable_id_t id = TUNABLE_ENUM_NAME (__top, __ns, __id); \ ++ __type __ret; \ ++ __tunable_get_default (id, &__ret); \ ++ __ret; \ ++}) ++ + /* Get and return a tunable value. If the tunable was set externally and __CB + is defined then call __CB before returning the value. 
*/ + # define TUNABLE_GET_FULL(__top, __ns, __id, __type, __cb) \ +diff -rup a/elf/dl-tunables.list b/elf/dl-tunables.list +--- a/elf/dl-tunables.list 2024-03-26 18:21:09.664666196 -0400 ++++ b/elf/dl-tunables.list 2024-03-26 18:23:22.220505142 -0400 +@@ -20,6 +20,7 @@ + # type: Defaults to STRING + # minval: Optional minimum acceptable value + # maxval: Optional maximum acceptable value ++# default: Optional default value (if not specified it will be 0 or "") + # env_alias: An alias environment variable + # security_level: Specify security level of the tunable for AT_SECURE binaries. + # Valid values are: +diff -rup a/scripts/gen-tunables.awk b/scripts/gen-tunables.awk +--- a/scripts/gen-tunables.awk 2024-03-26 18:21:09.523661049 -0400 ++++ b/scripts/gen-tunables.awk 2024-03-26 18:34:45.385462341 -0400 +@@ -236,8 +236,8 @@ END { + n = indices[2]; + m = indices[3]; + printf (" {TUNABLE_NAME_S(%s, %s, %s)", t, n, m) +- printf (", {TUNABLE_TYPE_%s, %s, %s}, {%s}, NULL, TUNABLE_SECLEVEL_%s, %s},\n", +- types[t,n,m], minvals[t,n,m], maxvals[t,n,m], ++ printf (", {TUNABLE_TYPE_%s, %s, %s}, {%s}, {%s}, NULL, TUNABLE_SECLEVEL_%s, %s},\n", ++ types[t,n,m], minvals[t,n,m], maxvals[t,n,m],default_val[t,n,m], + default_val[t,n,m], security_level[t,n,m], env_alias[t,n,m]); + } + print "};" diff --git a/glibc-RHEL-25531-2.patch b/glibc-RHEL-25531-2.patch new file mode 100644 index 0000000..8b90365 --- /dev/null +++ b/glibc-RHEL-25531-2.patch @@ -0,0 +1,155 @@ +From 0c0d39fe4aeb0f69b26e76337c5dfd5530d5d44e Mon Sep 17 00:00:00 2001 +From: Adhemerval Zanella +Date: Thu, 8 Feb 2024 10:08:38 -0300 +Subject: [PATCH] x86: Fix Zen3/Zen4 ERMS selection (BZ 30994) +Content-type: text/plain; charset=UTF-8 + +The REP MOVSB usage on memcpy/memmove does not show much performance +improvement on Zen3/Zen4 cores compared to the vectorized loops. Also, +as from BZ 30994, if the source is aligned and the destination is not +the performance can be 20x slower. 
+ +The performance difference is noticeable with small buffer sizes, closer +to the lower bounds limits when memcpy/memmove starts to use ERMS. The +performance of REP MOVSB is similar to vectorized instruction on the +size limit (the L2 cache). Also, there is no drawback to multiple cores +sharing the cache. + +Checked on x86_64-linux-gnu on Zen3. +Reviewed-by: H.J. Lu + +Conflicts: + sysdeps/x86/dl-cacheinfo.h + (tweaked for changed context) + +--- + sysdeps/x86/dl-cacheinfo.h | 38 ++++++++++++++++++-------------------- + 1 file changed, 18 insertions(+), 20 deletions(-) + +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index d5101615e3..f34d12846c 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -791,7 +791,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + long int data = -1; + long int shared = -1; + long int shared_per_thread = -1; +- long int core = -1; + unsigned int threads = 0; + unsigned long int level1_icache_size = -1; + unsigned long int level1_icache_linesize = -1; +@@ -809,7 +808,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + if (cpu_features->basic.kind == arch_kind_intel) + { + data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features); +- core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); + shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features); + shared_per_thread = shared; + +@@ -822,7 +820,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features); + level1_dcache_linesize + = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features); +- level2_cache_size = core; ++ level2_cache_size ++ = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); + level2_cache_assoc + = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features); + level2_cache_linesize +@@ -835,12 +834,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + level4_cache_size + = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features); + +- 
get_common_cache_info (&shared, &shared_per_thread, &threads, core); ++ get_common_cache_info (&shared, &shared_per_thread, &threads, ++ level2_cache_size); + } + else if (cpu_features->basic.kind == arch_kind_zhaoxin) + { + data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE); +- core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); + shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE); + shared_per_thread = shared; + +@@ -849,19 +848,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + level1_dcache_size = data; + level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC); + level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE); +- level2_cache_size = core; ++ level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); + level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC); + level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE); + level3_cache_size = shared; + level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC); + level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE); + +- get_common_cache_info (&shared, &shared_per_thread, &threads, core); ++ get_common_cache_info (&shared, &shared_per_thread, &threads, ++ level2_cache_size); + } + else if (cpu_features->basic.kind == arch_kind_amd) + { + data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); +- core = handle_amd (_SC_LEVEL2_CACHE_SIZE); + shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); + + level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE); +@@ -869,7 +868,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + level1_dcache_size = data; + level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC); + level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE); +- level2_cache_size = core; ++ level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE); + level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC); + level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE); + level3_cache_size = shared; +@@ -880,12 +879,12 @@ dl_init_cacheinfo (struct 
cpu_features *cpu_features) + if (shared <= 0) + { + /* No shared L3 cache. All we have is the L2 cache. */ +- shared = core; ++ shared = level2_cache_size; + } + else if (cpu_features->basic.family < 0x17) + { + /* Account for exclusive L2 and L3 caches. */ +- shared += core; ++ shared += level2_cache_size; + } + + shared_per_thread = shared; +@@ -987,6 +986,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) + rep_movsb_threshold = 2112; + ++ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of ++ cases slower than the vectorized path (and for some alignments, ++ it is really slow, check BZ #30994). */ ++ if (cpu_features->basic.kind == arch_kind_amd) ++ rep_movsb_threshold = non_temporal_threshold; ++ + /* The default threshold to use Enhanced REP STOSB. */ + unsigned long int rep_stosb_threshold = 2048; + +@@ -1028,15 +1033,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) +#endif + + unsigned long int rep_movsb_stop_threshold; +- /* ERMS feature is implemented from AMD Zen3 architecture and it is +- performing poorly for data above L2 cache size. Henceforth, adding +- an upper bound threshold parameter to limit the usage of Enhanced +- REP MOVSB operations and setting its value to L2 cache size. */ +- if (cpu_features->basic.kind == arch_kind_amd) +- rep_movsb_stop_threshold = core; + /* Setting the upper bound of ERMS to the computed value of +- non-temporal threshold for architectures other than AMD. */ +- else +- rep_movsb_stop_threshold = non_temporal_threshold; ++ non-temporal threshold for all architectures. 
*/ ++ rep_movsb_stop_threshold = non_temporal_threshold; + + cpu_features->data_cache_size = data; + cpu_features->shared_cache_size = shared; +-- +2.39.3 + diff --git a/glibc-RHEL-25531-3.patch b/glibc-RHEL-25531-3.patch new file mode 100644 index 0000000..5d404f7 --- /dev/null +++ b/glibc-RHEL-25531-3.patch @@ -0,0 +1,34 @@ +From 272708884cb750f12f5c74a00e6620c19dc6d567 Mon Sep 17 00:00:00 2001 +From: Adhemerval Zanella +Date: Thu, 8 Feb 2024 10:08:39 -0300 +Subject: [PATCH] x86: Do not prefer ERMS for memset on Zen3+ +Content-type: text/plain; charset=UTF-8 + +For AMD Zen3+ architecture, the performance of the vectorized loop is +slightly better than ERMS. + +Checked on x86_64-linux-gnu on Zen3. +Reviewed-by: H.J. Lu +--- + sysdeps/x86/dl-cacheinfo.h | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index f34d12846c..5a98f70364 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -1021,6 +1021,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + minimum value is fixed. */ + rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold, + long int, NULL); ++ if (cpu_features->basic.kind == arch_kind_amd ++ && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold)) ++ /* For AMD Zen3+ architecture, the performance of the vectorized loop is ++ slightly better than ERMS. 
*/ ++ rep_stosb_threshold = SIZE_MAX; + + TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); +-- +2.39.3 + diff --git a/glibc-RHEL-25531-4.patch b/glibc-RHEL-25531-4.patch new file mode 100644 index 0000000..089b29c --- /dev/null +++ b/glibc-RHEL-25531-4.patch @@ -0,0 +1,29 @@ +From 491e55beab7457ed310a4a47496f4a333c5d1032 Mon Sep 17 00:00:00 2001 +From: Adhemerval Zanella +Date: Thu, 8 Feb 2024 10:08:40 -0300 +Subject: [PATCH] x86: Expand the comment on when REP STOSB is used on memset +Content-type: text/plain; charset=UTF-8 + +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 9984c3ca0f..97839a2248 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -21,7 +21,9 @@ + 2. If size is less than VEC, use integer register stores. + 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores. + 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores. +- 5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with ++ 5. On machines with the ERMS feature, if size is greater than or equal to ++ __x86_rep_stosb_threshold then REP STOSB will be used. ++ 6. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with + 4 VEC stores and store 4 * VEC at a time until done. */ + + #include +-- +2.39.3 + diff --git a/glibc.spec b/glibc.spec index 3c13fdb..76187da 100644 --- a/glibc.spec +++ b/glibc.spec @@ -155,7 +155,7 @@ end \ Summary: The GNU libc libraries Name: glibc Version: %{glibcversion} -Release: 103%{?dist} +Release: 104%{?dist} # In general, GPLv2+ is used by programs, LGPLv2+ is used for # libraries. 
@@ -812,6 +812,10 @@ Patch575: glibc-RHEL-23472.patch Patch576: glibc-RHEL-20172-1.patch Patch577: glibc-RHEL-20172-2.patch Patch578: glibc-RHEL-21884.patch +Patch579: glibc-RHEL-25531-1.patch +Patch580: glibc-RHEL-25531-2.patch +Patch581: glibc-RHEL-25531-3.patch +Patch582: glibc-RHEL-25531-4.patch ############################################################################## # Continued list of core "glibc" package information: @@ -2970,6 +2974,9 @@ update_gconv_modules_cache () %endif %changelog +* Tue Mar 26 2024 DJ Delorie - 2.34-104 +- x86: Fix Zen3/Zen4 ERMS selection (RHEL-25531) + * Tue Mar 12 2024 Arjun Shankar - 2.34-103 - malloc: Do not use MAP_NORESERVE to allocate heap segments (RHEL-21884)