Compare commits
2 Commits
7936abdf7e
...
2eef845242
Author | SHA1 | Date |
---|---|---|
DJ Delorie | 2eef845242 | |
Arjun Shankar | 5a35e9b70f |
|
@ -0,0 +1 @@
|
||||||
|
7c3b8890a6346793b6334cc5f2fea5d437d307b8 glibc-2.34.tar.xz
|
|
@ -0,0 +1,73 @@
|
||||||
|
commit 85860ad6eaf4c9739318f6b2a1ff7c2fa6b12ab5
|
||||||
|
Author: Florian Weimer <fweimer@redhat.com>
|
||||||
|
Date: Mon Aug 15 16:45:40 2022 +0200
|
||||||
|
|
||||||
|
malloc: Do not use MAP_NORESERVE to allocate heap segments
|
||||||
|
|
||||||
|
Address space for heap segments is reserved in a mmap call with
|
||||||
|
MAP_ANONYMOUS | MAP_PRIVATE and protection flags PROT_NONE. This
|
||||||
|
reservation does not count against the RSS limit of the process or
|
||||||
|
system. Backing memory is allocated using mprotect in alloc_new_heap
|
||||||
|
and grow_heap, and at this point, the allocator expects the kernel
|
||||||
|
to provide memory (subject to memory overcommit).
|
||||||
|
|
||||||
|
The SIGSEGV that might be generated due to MAP_NORESERVE (according to
|
||||||
|
the mmap manual page) does not seem to occur in practice, it's always
|
||||||
|
SIGKILL from the OOM killer. Even if there is a way that SIGSEGV
|
||||||
|
could be generated, it is confusing to applications that this only
|
||||||
|
happens for secondary heaps, not for large mmap-based allocations,
|
||||||
|
and not for the main arena.
|
||||||
|
|
||||||
|
Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
|
||||||
|
|
||||||
|
Conflicts:
|
||||||
|
malloc/arena.c
|
||||||
|
(huge page support was added upstream)
|
||||||
|
|
||||||
|
diff --git a/malloc/arena.c b/malloc/arena.c
|
||||||
|
index 667484630ed0afa5..2852783355d3d869 100644
|
||||||
|
--- a/malloc/arena.c
|
||||||
|
+++ b/malloc/arena.c
|
||||||
|
@@ -466,8 +466,7 @@ new_heap (size_t size, size_t top_pad)
|
||||||
|
p2 = MAP_FAILED;
|
||||||
|
if (aligned_heap_area)
|
||||||
|
{
|
||||||
|
- p2 = (char *) MMAP (aligned_heap_area, HEAP_MAX_SIZE, PROT_NONE,
|
||||||
|
- MAP_NORESERVE);
|
||||||
|
+ p2 = (char *) MMAP (aligned_heap_area, HEAP_MAX_SIZE, PROT_NONE, 0);
|
||||||
|
aligned_heap_area = NULL;
|
||||||
|
if (p2 != MAP_FAILED && ((unsigned long) p2 & (HEAP_MAX_SIZE - 1)))
|
||||||
|
{
|
||||||
|
@@ -477,7 +476,7 @@ new_heap (size_t size, size_t top_pad)
|
||||||
|
}
|
||||||
|
if (p2 == MAP_FAILED)
|
||||||
|
{
|
||||||
|
- p1 = (char *) MMAP (0, HEAP_MAX_SIZE << 1, PROT_NONE, MAP_NORESERVE);
|
||||||
|
+ p1 = (char *) MMAP (0, HEAP_MAX_SIZE << 1, PROT_NONE, 0);
|
||||||
|
if (p1 != MAP_FAILED)
|
||||||
|
{
|
||||||
|
p2 = (char *) (((unsigned long) p1 + (HEAP_MAX_SIZE - 1))
|
||||||
|
@@ -493,7 +492,7 @@ new_heap (size_t size, size_t top_pad)
|
||||||
|
{
|
||||||
|
/* Try to take the chance that an allocation of only HEAP_MAX_SIZE
|
||||||
|
is already aligned. */
|
||||||
|
- p2 = (char *) MMAP (0, HEAP_MAX_SIZE, PROT_NONE, MAP_NORESERVE);
|
||||||
|
+ p2 = (char *) MMAP (0, HEAP_MAX_SIZE, PROT_NONE, 0);
|
||||||
|
if (p2 == MAP_FAILED)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
diff --git a/malloc/malloc.c b/malloc/malloc.c
|
||||||
|
index 375f50f5db13e234..fe80b8239756a7c9 100644
|
||||||
|
--- a/malloc/malloc.c
|
||||||
|
+++ b/malloc/malloc.c
|
||||||
|
@@ -1112,10 +1112,6 @@ static mchunkptr mremap_chunk(mchunkptr p, size_t new_size);
|
||||||
|
# define MAP_ANONYMOUS MAP_ANON
|
||||||
|
#endif
|
||||||
|
|
||||||
|
-#ifndef MAP_NORESERVE
|
||||||
|
-# define MAP_NORESERVE 0
|
||||||
|
-#endif
|
||||||
|
-
|
||||||
|
#define MMAP(addr, size, prot, flags) \
|
||||||
|
__mmap((addr), (size), (prot), (flags)|MAP_ANONYMOUS|MAP_PRIVATE, -1, 0)
|
||||||
|
|
|
@ -0,0 +1,188 @@
|
||||||
|
From a4c3f5f46e850c977cda81c251036475aab8313c Mon Sep 17 00:00:00 2001
|
||||||
|
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||||
|
Date: Thu, 23 Nov 2023 14:29:14 -0300
|
||||||
|
Subject: [PATCH] elf: Add a way to check if tunable is set (BZ 27069)
|
||||||
|
Content-type: text/plain; charset=UTF-8
|
||||||
|
|
||||||
|
The patch adds two new macros, TUNABLE_GET_DEFAULT and TUNABLE_IS_INITIALIZED,
|
||||||
|
where the former gets the default value with a signature similar to
|
||||||
|
TUNABLE_GET, while the latter returns whether the tunable was set by
|
||||||
|
the environment variable.
|
||||||
|
|
||||||
|
Checked on x86_64-linux-gnu.
|
||||||
|
Reviewed-by: DJ Delorie <dj@redhat.com>
|
||||||
|
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
|
||||||
|
|
||||||
|
Conflicts:
|
||||||
|
elf/Versions
|
||||||
|
(removed to preserve ABI)
|
||||||
|
elf/dl-tunable-types.h
|
||||||
|
(line numbers)
|
||||||
|
scripts/gen-tunables.awk
|
||||||
|
(account for missing TUNABLE_SECLEVEL patch)
|
||||||
|
|
||||||
|
---
|
||||||
|
elf/dl-tunable-types.h | 1 +
|
||||||
|
elf/dl-tunables.c | 40 ++++++++++++++++++++++++++++++++++++++++
|
||||||
|
elf/dl-tunables.h | 28 ++++++++++++++++++++++++++++
|
||||||
|
elf/dl-tunables.list | 1 +
|
||||||
|
scripts/gen-tunables.awk | 4 ++--
|
||||||
|
6 files changed, 73 insertions(+), 2 deletions(-)
|
||||||
|
|
||||||
|
diff -rup a/elf/dl-tunable-types.h b/elf/dl-tunable-types.h
|
||||||
|
--- a/elf/dl-tunable-types.h 2021-08-01 21:33:43.000000000 -0400
|
||||||
|
+++ b/elf/dl-tunable-types.h 2024-03-26 18:23:22.211504813 -0400
|
||||||
|
@@ -61,6 +61,7 @@ struct _tunable
|
||||||
|
{
|
||||||
|
const char name[TUNABLE_NAME_MAX]; /* Internal name of the tunable. */
|
||||||
|
tunable_type_t type; /* Data type of the tunable. */
|
||||||
|
+ const tunable_val_t def; /* The value. */
|
||||||
|
tunable_val_t val; /* The value. */
|
||||||
|
bool initialized; /* Flag to indicate that the tunable is
|
||||||
|
initialized. */
|
||||||
|
diff -rup a/elf/dl-tunables.c b/elf/dl-tunables.c
|
||||||
|
--- a/elf/dl-tunables.c 2024-03-26 18:21:10.090681748 -0400
|
||||||
|
+++ b/elf/dl-tunables.c 2024-03-26 18:23:22.214504923 -0400
|
||||||
|
@@ -152,6 +152,13 @@ tunable_initialize (tunable_t *cur, cons
|
||||||
|
do_tunable_update_val (cur, &val, NULL, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
+bool
|
||||||
|
+__tunable_is_initialized (tunable_id_t id)
|
||||||
|
+{
|
||||||
|
+ return tunable_list[id].initialized;
|
||||||
|
+}
|
||||||
|
+rtld_hidden_def (__tunable_is_initialized)
|
||||||
|
+
|
||||||
|
void
|
||||||
|
__tunable_set_val (tunable_id_t id, tunable_val_t *valp, tunable_num_t *minp,
|
||||||
|
tunable_num_t *maxp)
|
||||||
|
@@ -399,6 +406,39 @@ __tunables_print (void)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
+void
|
||||||
|
+__tunable_get_default (tunable_id_t id, void *valp)
|
||||||
|
+{
|
||||||
|
+ tunable_t *cur = &tunable_list[id];
|
||||||
|
+
|
||||||
|
+ switch (cur->type.type_code)
|
||||||
|
+ {
|
||||||
|
+ case TUNABLE_TYPE_UINT_64:
|
||||||
|
+ {
|
||||||
|
+ *((uint64_t *) valp) = (uint64_t) cur->def.numval;
|
||||||
|
+ break;
|
||||||
|
+ }
|
||||||
|
+ case TUNABLE_TYPE_INT_32:
|
||||||
|
+ {
|
||||||
|
+ *((int32_t *) valp) = (int32_t) cur->def.numval;
|
||||||
|
+ break;
|
||||||
|
+ }
|
||||||
|
+ case TUNABLE_TYPE_SIZE_T:
|
||||||
|
+ {
|
||||||
|
+ *((size_t *) valp) = (size_t) cur->def.numval;
|
||||||
|
+ break;
|
||||||
|
+ }
|
||||||
|
+ case TUNABLE_TYPE_STRING:
|
||||||
|
+ {
|
||||||
|
+ *((const char **)valp) = cur->def.strval;
|
||||||
|
+ break;
|
||||||
|
+ }
|
||||||
|
+ default:
|
||||||
|
+ __builtin_unreachable ();
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+rtld_hidden_def (__tunable_get_default)
|
||||||
|
+
|
||||||
|
/* Set the tunable value. This is called by the module that the tunable exists
|
||||||
|
in. */
|
||||||
|
void
|
||||||
|
diff -rup a/elf/dl-tunables.h b/elf/dl-tunables.h
|
||||||
|
--- a/elf/dl-tunables.h 2021-08-01 21:33:43.000000000 -0400
|
||||||
|
+++ b/elf/dl-tunables.h 2024-03-26 18:23:22.217505032 -0400
|
||||||
|
@@ -53,18 +53,26 @@ typedef void (*tunable_callback_t) (tuna
|
||||||
|
|
||||||
|
extern void __tunables_init (char **);
|
||||||
|
extern void __tunables_print (void);
|
||||||
|
+extern bool __tunable_is_initialized (tunable_id_t);
|
||||||
|
extern void __tunable_get_val (tunable_id_t, void *, tunable_callback_t);
|
||||||
|
extern void __tunable_set_val (tunable_id_t, tunable_val_t *, tunable_num_t *,
|
||||||
|
tunable_num_t *);
|
||||||
|
+extern void __tunable_get_default (tunable_id_t id, void *valp);
|
||||||
|
rtld_hidden_proto (__tunables_init)
|
||||||
|
rtld_hidden_proto (__tunables_print)
|
||||||
|
+rtld_hidden_proto (__tunable_is_initialized)
|
||||||
|
rtld_hidden_proto (__tunable_get_val)
|
||||||
|
rtld_hidden_proto (__tunable_set_val)
|
||||||
|
+rtld_hidden_proto (__tunable_get_default)
|
||||||
|
|
||||||
|
/* Define TUNABLE_GET and TUNABLE_SET in short form if TOP_NAMESPACE and
|
||||||
|
TUNABLE_NAMESPACE are defined. This is useful shorthand to get and set
|
||||||
|
tunables within a module. */
|
||||||
|
#if defined TOP_NAMESPACE && defined TUNABLE_NAMESPACE
|
||||||
|
+# define TUNABLE_IS_INITIALIZED(__id) \
|
||||||
|
+ TUNABLE_IS_INITIALIZED_FULL(TOP_NAMESPACE, TUNABLE_NAMESPACE, __id)
|
||||||
|
+# define TUNABLE_GET_DEFAULT(__id, __type) \
|
||||||
|
+ TUNABLE_GET_DEFAULT_FULL(TOP_NAMESPACE, TUNABLE_NAMESPACE,__id, __type)
|
||||||
|
# define TUNABLE_GET(__id, __type, __cb) \
|
||||||
|
TUNABLE_GET_FULL (TOP_NAMESPACE, TUNABLE_NAMESPACE, __id, __type, __cb)
|
||||||
|
# define TUNABLE_SET(__id, __val) \
|
||||||
|
@@ -73,6 +81,10 @@ rtld_hidden_proto (__tunable_set_val)
|
||||||
|
TUNABLE_SET_WITH_BOUNDS_FULL (TOP_NAMESPACE, TUNABLE_NAMESPACE, __id, \
|
||||||
|
__val, __min, __max)
|
||||||
|
#else
|
||||||
|
+# define TUNABLE_IS_INITIALIZED(__top, __ns, __id) \
|
||||||
|
+ TUNABLE_IS_INITIALIZED_FULL(__top, __ns, __id)
|
||||||
|
+# define TUNABLE_GET_DEFAULT(__top, __ns, __type) \
|
||||||
|
+ TUNABLE_GET_DEFAULT_FULL(__top, __ns, __id, __type)
|
||||||
|
# define TUNABLE_GET(__top, __ns, __id, __type, __cb) \
|
||||||
|
TUNABLE_GET_FULL (__top, __ns, __id, __type, __cb)
|
||||||
|
# define TUNABLE_SET(__top, __ns, __id, __val) \
|
||||||
|
@@ -81,6 +93,22 @@ rtld_hidden_proto (__tunable_set_val)
|
||||||
|
TUNABLE_SET_WITH_BOUNDS_FULL (__top, __ns, __id, __val, __min, __max)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
+/* Return whether the tunable was initialized by the environment variable. */
|
||||||
|
+#define TUNABLE_IS_INITIALIZED_FULL(__top, __ns, __id) \
|
||||||
|
+({ \
|
||||||
|
+ tunable_id_t id = TUNABLE_ENUM_NAME (__top, __ns, __id); \
|
||||||
|
+ __tunable_is_initialized (id); \
|
||||||
|
+})
|
||||||
|
+
|
||||||
|
+/* Return the default value of the tunable. */
|
||||||
|
+#define TUNABLE_GET_DEFAULT_FULL(__top, __ns, __id, __type) \
|
||||||
|
+({ \
|
||||||
|
+ tunable_id_t id = TUNABLE_ENUM_NAME (__top, __ns, __id); \
|
||||||
|
+ __type __ret; \
|
||||||
|
+ __tunable_get_default (id, &__ret); \
|
||||||
|
+ __ret; \
|
||||||
|
+})
|
||||||
|
+
|
||||||
|
/* Get and return a tunable value. If the tunable was set externally and __CB
|
||||||
|
is defined then call __CB before returning the value. */
|
||||||
|
# define TUNABLE_GET_FULL(__top, __ns, __id, __type, __cb) \
|
||||||
|
diff -rup a/elf/dl-tunables.list b/elf/dl-tunables.list
|
||||||
|
--- a/elf/dl-tunables.list 2024-03-26 18:21:09.664666196 -0400
|
||||||
|
+++ b/elf/dl-tunables.list 2024-03-26 18:23:22.220505142 -0400
|
||||||
|
@@ -20,6 +20,7 @@
|
||||||
|
# type: Defaults to STRING
|
||||||
|
# minval: Optional minimum acceptable value
|
||||||
|
# maxval: Optional maximum acceptable value
|
||||||
|
+# default: Optional default value (if not specified it will be 0 or "")
|
||||||
|
# env_alias: An alias environment variable
|
||||||
|
# security_level: Specify security level of the tunable for AT_SECURE binaries.
|
||||||
|
# Valid values are:
|
||||||
|
diff -rup a/scripts/gen-tunables.awk b/scripts/gen-tunables.awk
|
||||||
|
--- a/scripts/gen-tunables.awk 2024-03-26 18:21:09.523661049 -0400
|
||||||
|
+++ b/scripts/gen-tunables.awk 2024-03-26 18:34:45.385462341 -0400
|
||||||
|
@@ -236,8 +236,8 @@ END {
|
||||||
|
n = indices[2];
|
||||||
|
m = indices[3];
|
||||||
|
printf (" {TUNABLE_NAME_S(%s, %s, %s)", t, n, m)
|
||||||
|
- printf (", {TUNABLE_TYPE_%s, %s, %s}, {%s}, NULL, TUNABLE_SECLEVEL_%s, %s},\n",
|
||||||
|
- types[t,n,m], minvals[t,n,m], maxvals[t,n,m],
|
||||||
|
+ printf (", {TUNABLE_TYPE_%s, %s, %s}, {%s}, {%s}, NULL, TUNABLE_SECLEVEL_%s, %s},\n",
|
||||||
|
+ types[t,n,m], minvals[t,n,m], maxvals[t,n,m],default_val[t,n,m],
|
||||||
|
default_val[t,n,m], security_level[t,n,m], env_alias[t,n,m]);
|
||||||
|
}
|
||||||
|
print "};"
|
|
@ -0,0 +1,155 @@
|
||||||
|
From 0c0d39fe4aeb0f69b26e76337c5dfd5530d5d44e Mon Sep 17 00:00:00 2001
|
||||||
|
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||||
|
Date: Thu, 8 Feb 2024 10:08:38 -0300
|
||||||
|
Subject: [PATCH] x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
|
||||||
|
Content-type: text/plain; charset=UTF-8
|
||||||
|
|
||||||
|
The REP MOVSB usage on memcpy/memmove does not show much performance
|
||||||
|
improvement on Zen3/Zen4 cores compared to the vectorized loops. Also,
|
||||||
|
as from BZ 30994, if the source is aligned and the destination is not
|
||||||
|
the performance can be 20x slower.
|
||||||
|
|
||||||
|
The performance difference is noticeable with small buffer sizes, closer
|
||||||
|
to the lower bound at which memcpy/memmove starts to use ERMS. The
|
||||||
|
performance of REP MOVSB is similar to vectorized instruction on the
|
||||||
|
size limit (the L2 cache). Also, there is no drawback to multiple cores
|
||||||
|
sharing the cache.
|
||||||
|
|
||||||
|
Checked on x86_64-linux-gnu on Zen3.
|
||||||
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||||
|
|
||||||
|
Conflicts:
|
||||||
|
sysdeps/x86/dl-cacheinfo.h
|
||||||
|
(tweaked for changed context)
|
||||||
|
|
||||||
|
---
|
||||||
|
sysdeps/x86/dl-cacheinfo.h | 38 ++++++++++++++++++--------------------
|
||||||
|
1 file changed, 18 insertions(+), 20 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
index d5101615e3..f34d12846c 100644
|
||||||
|
--- a/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
@@ -791,7 +791,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
long int data = -1;
|
||||||
|
long int shared = -1;
|
||||||
|
long int shared_per_thread = -1;
|
||||||
|
- long int core = -1;
|
||||||
|
unsigned int threads = 0;
|
||||||
|
unsigned long int level1_icache_size = -1;
|
||||||
|
unsigned long int level1_icache_linesize = -1;
|
||||||
|
@@ -809,7 +808,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
if (cpu_features->basic.kind == arch_kind_intel)
|
||||||
|
{
|
||||||
|
data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
|
||||||
|
- core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
|
||||||
|
shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
|
||||||
|
shared_per_thread = shared;
|
||||||
|
|
||||||
|
@@ -822,7 +820,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
= handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
|
||||||
|
level1_dcache_linesize
|
||||||
|
= handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
|
||||||
|
- level2_cache_size = core;
|
||||||
|
+ level2_cache_size
|
||||||
|
+ = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
|
||||||
|
level2_cache_assoc
|
||||||
|
= handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
|
||||||
|
level2_cache_linesize
|
||||||
|
@@ -835,12 +834,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
level4_cache_size
|
||||||
|
= handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
|
||||||
|
|
||||||
|
- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
|
||||||
|
+ get_common_cache_info (&shared, &shared_per_thread, &threads,
|
||||||
|
+ level2_cache_size);
|
||||||
|
}
|
||||||
|
else if (cpu_features->basic.kind == arch_kind_zhaoxin)
|
||||||
|
{
|
||||||
|
data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
|
||||||
|
- core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
|
||||||
|
shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
|
||||||
|
shared_per_thread = shared;
|
||||||
|
|
||||||
|
@@ -849,19 +848,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
level1_dcache_size = data;
|
||||||
|
level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
|
||||||
|
level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
|
||||||
|
- level2_cache_size = core;
|
||||||
|
+ level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
|
||||||
|
level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
|
||||||
|
level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
|
||||||
|
level3_cache_size = shared;
|
||||||
|
level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
|
||||||
|
level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
|
||||||
|
|
||||||
|
- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
|
||||||
|
+ get_common_cache_info (&shared, &shared_per_thread, &threads,
|
||||||
|
+ level2_cache_size);
|
||||||
|
}
|
||||||
|
else if (cpu_features->basic.kind == arch_kind_amd)
|
||||||
|
{
|
||||||
|
data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
|
||||||
|
- core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
|
||||||
|
shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
|
||||||
|
|
||||||
|
level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
|
||||||
|
@@ -869,7 +868,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
level1_dcache_size = data;
|
||||||
|
level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
|
||||||
|
level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
|
||||||
|
- level2_cache_size = core;
|
||||||
|
+ level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);;
|
||||||
|
level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
|
||||||
|
level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
|
||||||
|
level3_cache_size = shared;
|
||||||
|
@@ -880,12 +879,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
if (shared <= 0)
|
||||||
|
{
|
||||||
|
/* No shared L3 cache. All we have is the L2 cache. */
|
||||||
|
- shared = core;
|
||||||
|
+ shared = level2_cache_size;
|
||||||
|
}
|
||||||
|
else if (cpu_features->basic.family < 0x17)
|
||||||
|
{
|
||||||
|
/* Account for exclusive L2 and L3 caches. */
|
||||||
|
- shared += core;
|
||||||
|
+ shared += level2_cache_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
shared_per_thread = shared;
|
||||||
|
@@ -987,6 +986,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
|
||||||
|
rep_movsb_threshold = 2112;
|
||||||
|
|
||||||
|
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
||||||
|
+ cases slower than the vectorized path (and for some alignments,
|
||||||
|
+ it is really slow, check BZ #30994). */
|
||||||
|
+ if (cpu_features->basic.kind == arch_kind_amd)
|
||||||
|
+ rep_movsb_threshold = non_temporal_threshold;
|
||||||
|
+
|
||||||
|
/* The default threshold to use Enhanced REP STOSB. */
|
||||||
|
unsigned long int rep_stosb_threshold = 2048;
|
||||||
|
|
||||||
|
@@ -1028,15 +1033,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
unsigned long int rep_movsb_stop_threshold;
|
||||||
|
- /* ERMS feature is implemented from AMD Zen3 architecture and it is
|
||||||
|
- performing poorly for data above L2 cache size. Henceforth, adding
|
||||||
|
- an upper bound threshold parameter to limit the usage of Enhanced
|
||||||
|
- REP MOVSB operations and setting its value to L2 cache size. */
|
||||||
|
- if (cpu_features->basic.kind == arch_kind_amd)
|
||||||
|
- rep_movsb_stop_threshold = core;
|
||||||
|
/* Setting the upper bound of ERMS to the computed value of
|
||||||
|
- non-temporal threshold for architectures other than AMD. */
|
||||||
|
- else
|
||||||
|
- rep_movsb_stop_threshold = non_temporal_threshold;
|
||||||
|
+ non-temporal threshold for all architectures. */
|
||||||
|
+ rep_movsb_stop_threshold = non_temporal_threshold;
|
||||||
|
|
||||||
|
cpu_features->data_cache_size = data;
|
||||||
|
cpu_features->shared_cache_size = shared;
|
||||||
|
--
|
||||||
|
2.39.3
|
||||||
|
|
|
@ -0,0 +1,34 @@
|
||||||
|
From 272708884cb750f12f5c74a00e6620c19dc6d567 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||||
|
Date: Thu, 8 Feb 2024 10:08:39 -0300
|
||||||
|
Subject: [PATCH] x86: Do not prefer ERMS for memset on Zen3+
|
||||||
|
Content-type: text/plain; charset=UTF-8
|
||||||
|
|
||||||
|
For AMD Zen3+ architecture, the performance of the vectorized loop is
|
||||||
|
slightly better than ERMS.
|
||||||
|
|
||||||
|
Checked on x86_64-linux-gnu on Zen3.
|
||||||
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||||
|
---
|
||||||
|
sysdeps/x86/dl-cacheinfo.h | 5 +++++
|
||||||
|
1 file changed, 5 insertions(+)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
index f34d12846c..5a98f70364 100644
|
||||||
|
--- a/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||||||
|
@@ -1021,6 +1021,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||||
|
minimum value is fixed. */
|
||||||
|
rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
|
||||||
|
long int, NULL);
|
||||||
|
+ if (cpu_features->basic.kind == arch_kind_amd
|
||||||
|
+ && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
|
||||||
|
+ /* For AMD Zen3+ architecture, the performance of the vectorized loop is
|
||||||
|
+ slightly better than ERMS. */
|
||||||
|
+ rep_stosb_threshold = SIZE_MAX;
|
||||||
|
|
||||||
|
TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
|
||||||
|
TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
|
||||||
|
--
|
||||||
|
2.39.3
|
||||||
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
From 491e55beab7457ed310a4a47496f4a333c5d1032 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||||
|
Date: Thu, 8 Feb 2024 10:08:40 -0300
|
||||||
|
Subject: [PATCH] x86: Expand the comment on when REP STOSB is used on memset
|
||||||
|
Content-type: text/plain; charset=UTF-8
|
||||||
|
|
||||||
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||||
|
---
|
||||||
|
sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 4 +++-
|
||||||
|
1 file changed, 3 insertions(+), 1 deletion(-)
|
||||||
|
|
||||||
|
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||||
|
index 9984c3ca0f..97839a2248 100644
|
||||||
|
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||||
|
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||||
|
@@ -21,7 +21,9 @@
|
||||||
|
2. If size is less than VEC, use integer register stores.
|
||||||
|
3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
|
||||||
|
4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
|
||||||
|
- 5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
|
||||||
|
+ 5. On machines with the ERMS feature, if size is greater than or equal to
|
||||||
|
+ __x86_rep_stosb_threshold then REP STOSB will be used.
|
||||||
|
+ 6. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
|
||||||
|
4 VEC stores and store 4 * VEC at a time until done. */
|
||||||
|
|
||||||
|
#include <sysdep.h>
|
||||||
|
--
|
||||||
|
2.39.3
|
||||||
|
|
13
glibc.spec
13
glibc.spec
|
@ -155,7 +155,7 @@ end \
|
||||||
Summary: The GNU libc libraries
|
Summary: The GNU libc libraries
|
||||||
Name: glibc
|
Name: glibc
|
||||||
Version: %{glibcversion}
|
Version: %{glibcversion}
|
||||||
Release: 102%{?dist}
|
Release: 104%{?dist}
|
||||||
|
|
||||||
# In general, GPLv2+ is used by programs, LGPLv2+ is used for
|
# In general, GPLv2+ is used by programs, LGPLv2+ is used for
|
||||||
# libraries.
|
# libraries.
|
||||||
|
@ -811,6 +811,11 @@ Patch574: glibc-RHEL-21556.patch
|
||||||
Patch575: glibc-RHEL-23472.patch
|
Patch575: glibc-RHEL-23472.patch
|
||||||
Patch576: glibc-RHEL-20172-1.patch
|
Patch576: glibc-RHEL-20172-1.patch
|
||||||
Patch577: glibc-RHEL-20172-2.patch
|
Patch577: glibc-RHEL-20172-2.patch
|
||||||
|
Patch578: glibc-RHEL-21884.patch
|
||||||
|
Patch579: glibc-RHEL-25531-1.patch
|
||||||
|
Patch580: glibc-RHEL-25531-2.patch
|
||||||
|
Patch581: glibc-RHEL-25531-3.patch
|
||||||
|
Patch582: glibc-RHEL-25531-4.patch
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
# Continued list of core "glibc" package information:
|
# Continued list of core "glibc" package information:
|
||||||
|
@ -2969,6 +2974,12 @@ update_gconv_modules_cache ()
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Tue Mar 26 2024 DJ Delorie <dj@redhat.com> - 2.34-104
|
||||||
|
- x86: Fix Zen3/Zen4 ERMS selection (RHEL-25531)
|
||||||
|
|
||||||
|
* Tue Mar 12 2024 Arjun Shankar <arjun@redhat.com> - 2.34-103
|
||||||
|
- malloc: Do not use MAP_NORESERVE to allocate heap segments (RHEL-21884)
|
||||||
|
|
||||||
* Fri Mar 8 2024 DJ Delorie <dj@redhat.com> - 2.34-102
|
* Fri Mar 8 2024 DJ Delorie <dj@redhat.com> - 2.34-102
|
||||||
- Add glibc.cpu.prefer_map_32bit_exec tunable (RHEL-20172)
|
- Add glibc.cpu.prefer_map_32bit_exec tunable (RHEL-20172)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue