x86: Fix Zen3/Zen4 ERMS selection

Resolves: RHEL-25531
This commit is contained in:
DJ Delorie 2024-03-27 21:45:36 -04:00
parent 5a35e9b70f
commit 6dbf26d6f4
5 changed files with 414 additions and 1 deletions

188
glibc-RHEL-25531-1.patch Normal file
View File

@ -0,0 +1,188 @@
From a4c3f5f46e850c977cda81c251036475aab8313c Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu, 23 Nov 2023 14:29:14 -0300
Subject: [PATCH] elf: Add a way to check if tunable is set (BZ 27069)
Content-type: text/plain; charset=UTF-8
The patch adds two new macros, TUNABLE_GET_DEFAULT and TUNABLE_IS_INITIALIZED,
here the former get the default value with a signature similar to
TUNABLE_GET, while the later returns whether the tunable was set by
the environment variable.
Checked on x86_64-linux-gnu.
Reviewed-by: DJ Delorie <dj@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Conflicts:
elf/Versions
(removed to preserve ABI)
elf/dl-tunable-types.h
(line numbers)
scripts/gen-tunables.awk
(account for missing TUNABLE_SECLEVEL patch)
---
elf/dl-tunable-types.h | 1 +
elf/dl-tunables.c | 40 ++++++++++++++++++++++++++++++++++++++++
elf/dl-tunables.h | 28 ++++++++++++++++++++++++++++
elf/dl-tunables.list | 1 +
scripts/gen-tunables.awk | 4 ++--
6 files changed, 73 insertions(+), 2 deletions(-)
diff -rup a/elf/dl-tunable-types.h b/elf/dl-tunable-types.h
--- a/elf/dl-tunable-types.h 2021-08-01 21:33:43.000000000 -0400
+++ b/elf/dl-tunable-types.h 2024-03-26 18:23:22.211504813 -0400
@@ -61,6 +61,7 @@ struct _tunable
{
const char name[TUNABLE_NAME_MAX]; /* Internal name of the tunable. */
tunable_type_t type; /* Data type of the tunable. */
+ const tunable_val_t def; /* The value. */
tunable_val_t val; /* The value. */
bool initialized; /* Flag to indicate that the tunable is
initialized. */
diff -rup a/elf/dl-tunables.c b/elf/dl-tunables.c
--- a/elf/dl-tunables.c 2024-03-26 18:21:10.090681748 -0400
+++ b/elf/dl-tunables.c 2024-03-26 18:23:22.214504923 -0400
@@ -152,6 +152,13 @@ tunable_initialize (tunable_t *cur, cons
do_tunable_update_val (cur, &val, NULL, NULL);
}
+bool
+__tunable_is_initialized (tunable_id_t id)
+{
+ return tunable_list[id].initialized;
+}
+rtld_hidden_def (__tunable_is_initialized)
+
void
__tunable_set_val (tunable_id_t id, tunable_val_t *valp, tunable_num_t *minp,
tunable_num_t *maxp)
@@ -399,6 +406,39 @@ __tunables_print (void)
}
}
+void
+__tunable_get_default (tunable_id_t id, void *valp)
+{
+ tunable_t *cur = &tunable_list[id];
+
+ switch (cur->type.type_code)
+ {
+ case TUNABLE_TYPE_UINT_64:
+ {
+ *((uint64_t *) valp) = (uint64_t) cur->def.numval;
+ break;
+ }
+ case TUNABLE_TYPE_INT_32:
+ {
+ *((int32_t *) valp) = (int32_t) cur->def.numval;
+ break;
+ }
+ case TUNABLE_TYPE_SIZE_T:
+ {
+ *((size_t *) valp) = (size_t) cur->def.numval;
+ break;
+ }
+ case TUNABLE_TYPE_STRING:
+ {
+ *((const char **)valp) = cur->def.strval;
+ break;
+ }
+ default:
+ __builtin_unreachable ();
+ }
+}
+rtld_hidden_def (__tunable_get_default)
+
/* Set the tunable value. This is called by the module that the tunable exists
in. */
void
diff -rup a/elf/dl-tunables.h b/elf/dl-tunables.h
--- a/elf/dl-tunables.h 2021-08-01 21:33:43.000000000 -0400
+++ b/elf/dl-tunables.h 2024-03-26 18:23:22.217505032 -0400
@@ -53,18 +53,26 @@ typedef void (*tunable_callback_t) (tuna
extern void __tunables_init (char **);
extern void __tunables_print (void);
+extern bool __tunable_is_initialized (tunable_id_t);
extern void __tunable_get_val (tunable_id_t, void *, tunable_callback_t);
extern void __tunable_set_val (tunable_id_t, tunable_val_t *, tunable_num_t *,
tunable_num_t *);
+extern void __tunable_get_default (tunable_id_t id, void *valp);
rtld_hidden_proto (__tunables_init)
rtld_hidden_proto (__tunables_print)
+rtld_hidden_proto (__tunable_is_initialized)
rtld_hidden_proto (__tunable_get_val)
rtld_hidden_proto (__tunable_set_val)
+rtld_hidden_proto (__tunable_get_default)
/* Define TUNABLE_GET and TUNABLE_SET in short form if TOP_NAMESPACE and
TUNABLE_NAMESPACE are defined. This is useful shorthand to get and set
tunables within a module. */
#if defined TOP_NAMESPACE && defined TUNABLE_NAMESPACE
+# define TUNABLE_IS_INITIALIZED(__id) \
+ TUNABLE_IS_INITIALIZED_FULL(TOP_NAMESPACE, TUNABLE_NAMESPACE, __id)
+# define TUNABLE_GET_DEFAULT(__id, __type) \
+ TUNABLE_GET_DEFAULT_FULL(TOP_NAMESPACE, TUNABLE_NAMESPACE,__id, __type)
# define TUNABLE_GET(__id, __type, __cb) \
TUNABLE_GET_FULL (TOP_NAMESPACE, TUNABLE_NAMESPACE, __id, __type, __cb)
# define TUNABLE_SET(__id, __val) \
@@ -73,6 +81,10 @@ rtld_hidden_proto (__tunable_set_val)
TUNABLE_SET_WITH_BOUNDS_FULL (TOP_NAMESPACE, TUNABLE_NAMESPACE, __id, \
__val, __min, __max)
#else
+# define TUNABLE_IS_INITIALIZED(__top, __ns, __id) \
+ TUNABLE_IS_INITIALIZED_FULL(__top, __ns, __id)
+# define TUNABLE_GET_DEFAULT(__top, __ns, __type) \
+ TUNABLE_GET_DEFAULT_FULL(__top, __ns, __id, __type)
# define TUNABLE_GET(__top, __ns, __id, __type, __cb) \
TUNABLE_GET_FULL (__top, __ns, __id, __type, __cb)
# define TUNABLE_SET(__top, __ns, __id, __val) \
@@ -81,6 +93,22 @@ rtld_hidden_proto (__tunable_set_val)
TUNABLE_SET_WITH_BOUNDS_FULL (__top, __ns, __id, __val, __min, __max)
#endif
+/* Return whether the tunable was initialized by the environment variable. */
+#define TUNABLE_IS_INITIALIZED_FULL(__top, __ns, __id) \
+({ \
+ tunable_id_t id = TUNABLE_ENUM_NAME (__top, __ns, __id); \
+ __tunable_is_initialized (id); \
+})
+
+/* Return the default value of the tunable. */
+#define TUNABLE_GET_DEFAULT_FULL(__top, __ns, __id, __type) \
+({ \
+ tunable_id_t id = TUNABLE_ENUM_NAME (__top, __ns, __id); \
+ __type __ret; \
+ __tunable_get_default (id, &__ret); \
+ __ret; \
+})
+
/* Get and return a tunable value. If the tunable was set externally and __CB
is defined then call __CB before returning the value. */
# define TUNABLE_GET_FULL(__top, __ns, __id, __type, __cb) \
diff -rup a/elf/dl-tunables.list b/elf/dl-tunables.list
--- a/elf/dl-tunables.list 2024-03-26 18:21:09.664666196 -0400
+++ b/elf/dl-tunables.list 2024-03-26 18:23:22.220505142 -0400
@@ -20,6 +20,7 @@
# type: Defaults to STRING
# minval: Optional minimum acceptable value
# maxval: Optional maximum acceptable value
+# default: Optional default value (if not specified it will be 0 or "")
# env_alias: An alias environment variable
# security_level: Specify security level of the tunable for AT_SECURE binaries.
# Valid values are:
diff -rup a/scripts/gen-tunables.awk b/scripts/gen-tunables.awk
--- a/scripts/gen-tunables.awk 2024-03-26 18:21:09.523661049 -0400
+++ b/scripts/gen-tunables.awk 2024-03-26 18:34:45.385462341 -0400
@@ -236,8 +236,8 @@ END {
n = indices[2];
m = indices[3];
printf (" {TUNABLE_NAME_S(%s, %s, %s)", t, n, m)
- printf (", {TUNABLE_TYPE_%s, %s, %s}, {%s}, NULL, TUNABLE_SECLEVEL_%s, %s},\n",
- types[t,n,m], minvals[t,n,m], maxvals[t,n,m],
+ printf (", {TUNABLE_TYPE_%s, %s, %s}, {%s}, {%s}, NULL, TUNABLE_SECLEVEL_%s, %s},\n",
+ types[t,n,m], minvals[t,n,m], maxvals[t,n,m],default_val[t,n,m],
default_val[t,n,m], security_level[t,n,m], env_alias[t,n,m]);
}
print "};"

155
glibc-RHEL-25531-2.patch Normal file
View File

@ -0,0 +1,155 @@
From 0c0d39fe4aeb0f69b26e76337c5dfd5530d5d44e Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu, 8 Feb 2024 10:08:38 -0300
Subject: [PATCH] x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
Content-type: text/plain; charset=UTF-8
The REP MOVSB usage on memcpy/memmove does not show much performance
improvement on Zen3/Zen4 cores compared to the vectorized loops. Also,
as from BZ 30994, if the source is aligned and the destination is not
the performance can be 20x slower.
The performance difference is noticeable with small buffer sizes, closer
to the lower bounds limits when memcpy/memmove starts to use ERMS. The
performance of REP MOVSB is similar to vectorized instruction on the
size limit (the L2 cache). Also, there is no drawback to multiple cores
sharing the cache.
Checked on x86_64-linux-gnu on Zen3.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Conflicts:
sysdeps/x86/dl-cacheinfo.h
(tweaked for changed context)
---
sysdeps/x86/dl-cacheinfo.h | 38 ++++++++++++++++++--------------------
1 file changed, 18 insertions(+), 20 deletions(-)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index d5101615e3..f34d12846c 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -791,7 +791,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
long int data = -1;
long int shared = -1;
long int shared_per_thread = -1;
- long int core = -1;
unsigned int threads = 0;
unsigned long int level1_icache_size = -1;
unsigned long int level1_icache_linesize = -1;
@@ -809,7 +808,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (cpu_features->basic.kind == arch_kind_intel)
{
data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
- core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
shared_per_thread = shared;
@@ -822,7 +820,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
= handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
level1_dcache_linesize
= handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
- level2_cache_size = core;
+ level2_cache_size
+ = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
level2_cache_assoc
= handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
level2_cache_linesize
@@ -835,12 +834,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level4_cache_size
= handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+ get_common_cache_info (&shared, &shared_per_thread, &threads,
+ level2_cache_size);
}
else if (cpu_features->basic.kind == arch_kind_zhaoxin)
{
data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
- core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
shared_per_thread = shared;
@@ -849,19 +848,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level1_dcache_size = data;
level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
- level2_cache_size = core;
+ level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
level3_cache_size = shared;
level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+ get_common_cache_info (&shared, &shared_per_thread, &threads,
+ level2_cache_size);
}
else if (cpu_features->basic.kind == arch_kind_amd)
{
data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
- core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
@@ -869,7 +868,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level1_dcache_size = data;
level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
- level2_cache_size = core;
+ level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);;
level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
level3_cache_size = shared;
@@ -880,12 +879,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (shared <= 0)
{
/* No shared L3 cache. All we have is the L2 cache. */
- shared = core;
+ shared = level2_cache_size;
}
else if (cpu_features->basic.family < 0x17)
{
/* Account for exclusive L2 and L3 caches. */
- shared += core;
+ shared += level2_cache_size;
}
shared_per_thread = shared;
@@ -987,6 +986,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+ cases slower than the vectorized path (and for some alignments,
+ it is really slow, check BZ #30994). */
+ if (cpu_features->basic.kind == arch_kind_amd)
+ rep_movsb_threshold = non_temporal_threshold;
+
/* The default threshold to use Enhanced REP STOSB. */
unsigned long int rep_stosb_threshold = 2048;
@@ -1028,15 +1033,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
#endif
unsigned long int rep_movsb_stop_threshold;
- /* ERMS feature is implemented from AMD Zen3 architecture and it is
- performing poorly for data above L2 cache size. Henceforth, adding
- an upper bound threshold parameter to limit the usage of Enhanced
- REP MOVSB operations and setting its value to L2 cache size. */
- if (cpu_features->basic.kind == arch_kind_amd)
- rep_movsb_stop_threshold = core;
/* Setting the upper bound of ERMS to the computed value of
- non-temporal threshold for architectures other than AMD. */
- else
- rep_movsb_stop_threshold = non_temporal_threshold;
+ non-temporal threshold for all architectures. */
+ rep_movsb_stop_threshold = non_temporal_threshold;
cpu_features->data_cache_size = data;
cpu_features->shared_cache_size = shared;
--
2.39.3

34
glibc-RHEL-25531-3.patch Normal file
View File

@ -0,0 +1,34 @@
From 272708884cb750f12f5c74a00e6620c19dc6d567 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu, 8 Feb 2024 10:08:39 -0300
Subject: [PATCH] x86: Do not prefer ERMS for memset on Zen3+
Content-type: text/plain; charset=UTF-8
For AMD Zen3+ architecture, the performance of the vectorized loop is
slightly better than ERMS.
Checked on x86_64-linux-gnu on Zen3.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/dl-cacheinfo.h | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index f34d12846c..5a98f70364 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1021,6 +1021,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
minimum value is fixed. */
rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
long int, NULL);
+ if (cpu_features->basic.kind == arch_kind_amd
+ && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
+ /* For AMD Zen3+ architecture, the performance of the vectorized loop is
+ slightly better than ERMS. */
+ rep_stosb_threshold = SIZE_MAX;
TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
--
2.39.3

29
glibc-RHEL-25531-4.patch Normal file
View File

@ -0,0 +1,29 @@
From 491e55beab7457ed310a4a47496f4a333c5d1032 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu, 8 Feb 2024 10:08:40 -0300
Subject: [PATCH] x86: Expand the comment on when REP STOSB is used on memset
Content-type: text/plain; charset=UTF-8
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 9984c3ca0f..97839a2248 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -21,7 +21,9 @@
2. If size is less than VEC, use integer register stores.
3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
- 5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
+ 5. On machines ERMS feature, if size is greater or equal than
+ __x86_rep_stosb_threshold then REP STOSB will be used.
+ 6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
4 VEC stores and store 4 * VEC at a time until done. */
#include <sysdep.h>
--
2.39.3

View File

@ -155,7 +155,7 @@ end \
Summary: The GNU libc libraries Summary: The GNU libc libraries
Name: glibc Name: glibc
Version: %{glibcversion} Version: %{glibcversion}
Release: 103%{?dist} Release: 104%{?dist}
# In general, GPLv2+ is used by programs, LGPLv2+ is used for # In general, GPLv2+ is used by programs, LGPLv2+ is used for
# libraries. # libraries.
@ -812,6 +812,10 @@ Patch575: glibc-RHEL-23472.patch
Patch576: glibc-RHEL-20172-1.patch Patch576: glibc-RHEL-20172-1.patch
Patch577: glibc-RHEL-20172-2.patch Patch577: glibc-RHEL-20172-2.patch
Patch578: glibc-RHEL-21884.patch Patch578: glibc-RHEL-21884.patch
Patch579: glibc-RHEL-25531-1.patch
Patch580: glibc-RHEL-25531-2.patch
Patch581: glibc-RHEL-25531-3.patch
Patch582: glibc-RHEL-25531-4.patch
############################################################################## ##############################################################################
# Continued list of core "glibc" package information: # Continued list of core "glibc" package information:
@ -2970,6 +2974,9 @@ update_gconv_modules_cache ()
%endif %endif
%changelog %changelog
* Tue Mar 26 2024 DJ Delorie <dj@redhat.com> - 2.34-104
- x86: Fix Zen3/Zen4 ERMS selection (RHEL-25531)
* Tue Mar 12 2024 Arjun Shankar <arjun@redhat.com> - 2.34-103 * Tue Mar 12 2024 Arjun Shankar <arjun@redhat.com> - 2.34-103
- malloc: Do not use MAP_NORESERVE to allocate heap segments (RHEL-21884) - malloc: Do not use MAP_NORESERVE to allocate heap segments (RHEL-21884)