453 lines
15 KiB
Diff
453 lines
15 KiB
Diff
From ed2468f0a3c1c3c3b40b41047ffd97ce32346a4e Mon Sep 17 00:00:00 2001
|
|
From: Adrian Reber <areber@redhat.com>
|
|
Date: Wed, 22 Jan 2025 14:35:26 +0100
|
|
Subject: [PATCH] vdso: switch from DT_HASH to DT_GNU_HASH (aarch64)
|
|
|
|
Trying to run latest CRIU on CentOS Stream 10 or Ubuntu 24.04 (aarch64)
|
|
fails like this:
|
|
|
|
# criu/criu check -v4
|
|
[...]
|
|
(00.096460) vdso: Parsing at ffffb2e2a000 ffffb2e2c000
|
|
(00.096539) vdso: PT_LOAD p_vaddr: 0
|
|
(00.096567) vdso: DT_STRTAB: 1d0
|
|
(00.096592) vdso: DT_SYMTAB: 128
|
|
(00.096616) vdso: DT_STRSZ: 8a
|
|
(00.096640) vdso: DT_SYMENT: 18
|
|
(00.096663) Error (criu/pie-util-vdso.c:193): vdso: Not all dynamic entries are present
|
|
(00.096688) Error (criu/vdso.c:627): vdso: Failed to fill self vdso symtable
|
|
(00.096713) Error (criu/kerndat.c:1906): kerndat_vdso_fill_symtable failed when initializing kerndat.
|
|
(00.096812) Found mmap_min_addr 0x10000
|
|
(00.096881) files stat: fs/nr_open 1073741816
|
|
(00.096908) Error (criu/crtools.c:267): Could not initialize kernel features detection.
|
|
|
|
This seems to be related to the kernel (6.12.0-41.el10.aarch64). The
|
|
Ubuntu user-space is running in a container on the same kernel.
|
|
|
|
Looking at the kernel this seems to be related to:
|
|
|
|
commit 48f6430505c0b0498ee9020ce3cf9558b1caaaeb
|
|
Author: Fangrui Song <i@maskray.me>
|
|
Date: Thu Jul 18 10:34:23 2024 -0700
|
|
|
|
arm64/vdso: Remove --hash-style=sysv
|
|
|
|
glibc added support for .gnu.hash in 2006 and .hash has been obsoleted
|
|
for more than one decade in many Linux distributions. Using
|
|
--hash-style=sysv might imply unaddressed issues and confuse readers.
|
|
|
|
Just drop the option and rely on the linker default, which is likely
|
|
"both", or "gnu" when the distribution really wants to eliminate sysv
|
|
hash overhead.
|
|
|
|
Similar to commit 6b7e26547fad ("x86/vdso: Emit a GNU hash").
|
|
|
|
The commit basically does:
|
|
|
|
-ldflags-y := -shared -soname=linux-vdso.so.1 --hash-style=sysv \
|
|
+ldflags-y := -shared -soname=linux-vdso.so.1 \
|
|
|
|
Which results in only a GNU hash being added to the ELF header. This
|
|
change has been merged with 6.11.
|
|
|
|
Looking at the referenced x86 commit:
|
|
|
|
commit 6b7e26547fad7ace3dcb27a5babd2317fb9d1e12
|
|
Author: Andy Lutomirski <luto@amacapital.net>
|
|
Date: Thu Aug 6 14:45:45 2015 -0700
|
|
|
|
x86/vdso: Emit a GNU hash
|
|
|
|
Some dynamic loaders may be slightly faster if a GNU hash is
|
|
available. Strangely, this seems to have no effect at all on
|
|
the vdso size.
|
|
|
|
This is unlikely to have any measurable effect on the time it
|
|
takes to resolve vdso symbols (since there are so few of them).
|
|
In some contexts, it can be a win for a different reason: if
|
|
every DSO has a GNU hash section, then libc can avoid
|
|
calculating SysV hashes at all. Both musl and glibc appear to
|
|
have this optimization.
|
|
|
|
It's plausible that this breaks some ancient glibc version. If
|
|
so, then, depending on what glibc versions break, we could
|
|
either require COMPAT_VDSO for them or consider reverting.
|
|
|
|
Which is also a really simple change:
|
|
|
|
-VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \
|
|
+VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=both) \
|
|
|
|
The big difference here is that for x86 both hash sections are
|
|
generated. For aarch64 only the newer GNU hash is generated. That is why
|
|
we only see this error on kernel >= 6.11 and aarch64.
|
|
|
|
Changing from DT_HASH to DT_GNU_HASH seems to work on aarch64. The test
|
|
suite runs without any errors.
|
|
|
|
Unfortunately I am not aware of all implications of this change or whether a
|
|
successful test suite run means that it still works.
|
|
|
|
Looking at the kernel I see the following hash styles for the VDSO:
|
|
|
|
aarch64: not specified (only GNU hash style)
|
|
arm: --hash-style=sysv
|
|
loongarch: --hash-style=sysv
|
|
mips: --hash-style=sysv
|
|
powerpc: --hash-style=both
|
|
riscv: --hash-style=both
|
|
s390: --hash-style=both
|
|
x86: --hash-style=both
|
|
|
|
Only aarch64 on kernels >= 6.11 is a problem right now, because all
|
|
other platforms provide the old style hashing.
|
|
|
|
Signed-off-by: Adrian Reber <areber@redhat.com>
|
|
Co-developed-by: Dmitry Safonov <dima@arista.com>
|
|
Co-authored-by: Dmitry Safonov <dima@arista.com>
|
|
Signed-off-by: Dmitry Safonov <dima@arista.com>
|
|
---
|
|
criu/pie/util-vdso.c | 245 ++++++++++++++++++++++++++++++++++---------
|
|
1 file changed, 198 insertions(+), 47 deletions(-)
|
|
|
|
diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c
|
|
index f1e3239ff5..9819335d81 100644
|
|
--- a/criu/pie/util-vdso.c
|
|
+++ b/criu/pie/util-vdso.c
|
|
@@ -5,6 +5,7 @@
|
|
#include <fcntl.h>
|
|
#include <errno.h>
|
|
#include <stdint.h>
|
|
+#include <stdbool.h>
|
|
|
|
#include <sys/types.h>
|
|
#include <sys/stat.h>
|
|
@@ -48,10 +49,25 @@ static bool __ptr_struct_oob(uintptr_t ptr, size_t struct_size, uintptr_t start,
|
|
return __ptr_oob(ptr, start, size) || __ptr_struct_end_oob(ptr, struct_size, start, size);
|
|
}
|
|
|
|
+/* Local strlen implementation */
|
|
+static size_t __strlen(const char *str)
|
|
+{
|
|
+ const char *ptr;
|
|
+
|
|
+ if (!str)
|
|
+ return 0;
|
|
+
|
|
+ ptr = str;
|
|
+ while (*ptr != '\0')
|
|
+ ptr++;
|
|
+
|
|
+ return ptr - str;
|
|
+}
|
|
+
|
|
/*
|
|
* Elf hash, see format specification.
|
|
*/
|
|
-static unsigned long elf_hash(const unsigned char *name)
|
|
+static unsigned long elf_sysv_hash(const unsigned char *name)
|
|
{
|
|
unsigned long h = 0, g;
|
|
|
|
@@ -65,6 +81,15 @@ static unsigned long elf_hash(const unsigned char *name)
|
|
return h;
|
|
}
|
|
|
|
+/* The GNU hash format. Taken from glibc. */
|
|
+static unsigned long elf_gnu_hash(const unsigned char *name)
|
|
+{
|
|
+ unsigned long h = 5381;
|
|
+ for (unsigned char c = *name; c != '\0'; c = *++name)
|
|
+ h = h * 33 + c;
|
|
+ return h;
|
|
+}
|
|
+
|
|
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
|
#define BORD ELFDATA2MSB /* 0x02 */
|
|
#else
|
|
@@ -149,11 +174,14 @@ static int parse_elf_phdr(uintptr_t mem, size_t size, Phdr_t **dynamic, Phdr_t *
|
|
* Output parameters are:
|
|
* @dyn_strtab - address of the symbol table
|
|
* @dyn_symtab - address of the string table section
|
|
- * @dyn_hash - address of the symbol hash table
|
|
+ * @dyn_hash - address of the symbol hash table
|
|
+ * @use_gnu_hash - the format of hash DT_HASH or DT_GNU_HASH
|
|
*/
|
|
-static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, Dyn_t **dyn_strtab, Dyn_t **dyn_symtab,
|
|
- Dyn_t **dyn_hash)
|
|
+static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic,
|
|
+ Dyn_t **dyn_strtab, Dyn_t **dyn_symtab,
|
|
+ Dyn_t **dyn_hash, bool *use_gnu_hash)
|
|
{
|
|
+ Dyn_t *dyn_gnu_hash = NULL, *dyn_sysv_hash = NULL;
|
|
Dyn_t *dyn_syment = NULL;
|
|
Dyn_t *dyn_strsz = NULL;
|
|
uintptr_t addr;
|
|
@@ -184,16 +212,52 @@ static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, Dyn_t
|
|
dyn_syment = d;
|
|
pr_debug("DT_SYMENT: %lx\n", (unsigned long)d->d_un.d_val);
|
|
} else if (d->d_tag == DT_HASH) {
|
|
- *dyn_hash = d;
|
|
+ dyn_sysv_hash = d;
|
|
pr_debug("DT_HASH: %lx\n", (unsigned long)d->d_un.d_ptr);
|
|
+ } else if (d->d_tag == DT_GNU_HASH) {
|
|
+ /*
|
|
+ * This is complicated.
|
|
+ *
|
|
+ * Looking at the Linux kernel source, the following can be seen
|
|
+ * regarding which hashing style the VDSO uses on each arch:
|
|
+ *
|
|
+ * aarch64: not specified (depends on linker, can be
|
|
+ * only GNU hash style)
|
|
+ * arm: --hash-style=sysv
|
|
+ * loongarch: --hash-style=sysv
|
|
+ * mips: --hash-style=sysv
|
|
+ * powerpc: --hash-style=both
|
|
+ * riscv: --hash-style=both
|
|
+ * s390: --hash-style=both
|
|
+ * x86: --hash-style=both
|
|
+ *
|
|
+ * Some architectures are using both hash-styles, that
|
|
+ * is the easiest for CRIU. Some architectures are only
|
|
+ * using the old style (sysv), that is what CRIU supports.
|
|
+ *
|
|
+ * Starting with Linux 6.11, aarch64 unfortunately decided
|
|
+ * to switch from '--hash-style=sysv' to ''. Specifying
|
|
+ * nothing unfortunately may mean GNU hash style only and not
|
|
+ * 'both' (depending on the linker).
|
|
+ */
|
|
+ dyn_gnu_hash = d;
|
|
+ pr_debug("DT_GNU_HASH: %lx\n", (unsigned long)d->d_un.d_ptr);
|
|
}
|
|
}
|
|
|
|
- if (!*dyn_strtab || !*dyn_symtab || !dyn_strsz || !dyn_syment || !*dyn_hash) {
|
|
+ if (!*dyn_strtab || !*dyn_symtab || !dyn_strsz || !dyn_syment ||
|
|
+ (!dyn_gnu_hash && !dyn_sysv_hash)) {
|
|
pr_err("Not all dynamic entries are present\n");
|
|
return -EINVAL;
|
|
}
|
|
|
|
+ /*
|
|
+ * Prefer DT_HASH over DT_GNU_HASH as it's been more tested and
|
|
+ * as a result more stable.
|
|
+ */
|
|
+ *use_gnu_hash = !dyn_sysv_hash;
|
|
+ *dyn_hash = dyn_sysv_hash ?: dyn_gnu_hash;
|
|
+
|
|
return 0;
|
|
|
|
err_oob:
|
|
@@ -208,60 +272,141 @@ typedef unsigned long Hash_t;
|
|
typedef Word_t Hash_t;
|
|
#endif
|
|
|
|
-static void parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, struct vdso_symtable *t,
|
|
- uintptr_t dynsymbol_names, Hash_t *hash, Dyn_t *dyn_symtab)
|
|
+static bool elf_symbol_match(uintptr_t mem, size_t size,
|
|
+ uintptr_t dynsymbol_names, Sym_t *sym,
|
|
+ const char *symbol, const size_t vdso_symbol_length)
|
|
{
|
|
- ARCH_VDSO_SYMBOLS_LIST
|
|
-
|
|
- const char *vdso_symbols[VDSO_SYMBOL_MAX] = { ARCH_VDSO_SYMBOLS };
|
|
- const size_t vdso_symbol_length = sizeof(t->symbols[0].name) - 1;
|
|
+ uintptr_t addr = (uintptr_t)sym;
|
|
+ char *name;
|
|
|
|
- Hash_t nbucket, nchain;
|
|
- Hash_t *bucket, *chain;
|
|
+ if (__ptr_struct_oob(addr, sizeof(Sym_t), mem, size))
|
|
+ return false;
|
|
|
|
- unsigned int i, j, k;
|
|
- uintptr_t addr;
|
|
+ if (ELF_ST_TYPE(sym->st_info) != STT_FUNC && ELF_ST_BIND(sym->st_info) != STB_GLOBAL)
|
|
+ return false;
|
|
|
|
- nbucket = hash[0];
|
|
- nchain = hash[1];
|
|
- bucket = &hash[2];
|
|
- chain = &hash[nbucket + 2];
|
|
+ addr = dynsymbol_names + sym->st_name;
|
|
+ if (__ptr_struct_oob(addr, vdso_symbol_length, mem, size))
|
|
+ return false;
|
|
+ name = (void *)addr;
|
|
|
|
- pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n", (long)nbucket, (long)nchain, (unsigned long)bucket,
|
|
- (unsigned long)chain);
|
|
+ return !std_strncmp(name, symbol, vdso_symbol_length);
|
|
+}
|
|
|
|
- for (i = 0; i < VDSO_SYMBOL_MAX; i++) {
|
|
- const char *symbol = vdso_symbols[i];
|
|
- k = elf_hash((const unsigned char *)symbol);
|
|
|
|
- for (j = bucket[k % nbucket]; j < nchain && j != STN_UNDEF; j = chain[j]) {
|
|
- Sym_t *sym;
|
|
- char *name;
|
|
+static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size,
|
|
+ const char *symbol, uint32_t symbol_hash, unsigned int sym_off,
|
|
+ uintptr_t dynsymbol_names, Dyn_t *dyn_symtab, Phdr_t *load,
|
|
+ Hash_t nbucket, Hash_t nchain, Hash_t *bucket, Hash_t *chain,
|
|
+ const size_t vdso_symbol_length, bool use_gnu_hash)
|
|
+{
|
|
+ unsigned int j;
|
|
+ uintptr_t addr;
|
|
|
|
- addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr;
|
|
+ j = bucket[symbol_hash % nbucket];
|
|
+ if (j == STN_UNDEF)
|
|
+ return 0;
|
|
+
|
|
+ addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr;
|
|
+
|
|
+ if (use_gnu_hash) {
|
|
+ uint32_t *h = bucket + nbucket + (j - sym_off);
|
|
+ uint32_t hash_val;
|
|
+
|
|
+ symbol_hash |= 1;
|
|
+ do {
|
|
+ Sym_t *sym = (void *)addr + sizeof(Sym_t) * j;
|
|
+
|
|
+ hash_val = *h++;
|
|
+ if ((hash_val | 1) == symbol_hash &&
|
|
+ elf_symbol_match(mem, size, dynsymbol_names, sym,
|
|
+ symbol, vdso_symbol_length))
|
|
+ return sym->st_value;
|
|
+ j++;
|
|
+ } while (!(hash_val & 1));
|
|
+ } else {
|
|
+ for (; j < nchain && j != STN_UNDEF; j = chain[j]) {
|
|
+ Sym_t *sym = (void *)addr + sizeof(Sym_t) * j;
|
|
+
|
|
+ if (elf_symbol_match(mem, size, dynsymbol_names, sym,
|
|
+ symbol, vdso_symbol_length))
|
|
+ return sym->st_value;
|
|
+ }
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
|
|
- addr += sizeof(Sym_t) * j;
|
|
- if (__ptr_struct_oob(addr, sizeof(Sym_t), mem, size))
|
|
- continue;
|
|
- sym = (void *)addr;
|
|
+static int parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load,
|
|
+ struct vdso_symtable *t, uintptr_t dynsymbol_names,
|
|
+ Hash_t *hash, Dyn_t *dyn_symtab, bool use_gnu_hash)
|
|
+{
|
|
+ ARCH_VDSO_SYMBOLS_LIST
|
|
|
|
- if (ELF_ST_TYPE(sym->st_info) != STT_FUNC && ELF_ST_BIND(sym->st_info) != STB_GLOBAL)
|
|
- continue;
|
|
+ const char *vdso_symbols[VDSO_SYMBOL_MAX] = { ARCH_VDSO_SYMBOLS };
|
|
+ const size_t vdso_symbol_length = sizeof(t->symbols[0].name) - 1;
|
|
|
|
- addr = dynsymbol_names + sym->st_name;
|
|
- if (__ptr_struct_oob(addr, vdso_symbol_length, mem, size))
|
|
- continue;
|
|
- name = (void *)addr;
|
|
+ Hash_t *bucket = NULL;
|
|
+ Hash_t *chain = NULL;
|
|
+ Hash_t nbucket = 0;
|
|
+ Hash_t nchain = 0;
|
|
+
|
|
+ unsigned int sym_off = 0;
|
|
+ unsigned int i = 0;
|
|
+
|
|
+ unsigned long (*elf_hash)(const unsigned char *);
|
|
+
|
|
+ if (use_gnu_hash) {
|
|
+ uint32_t *gnu_hash = (uint32_t *)hash;
|
|
+ uint32_t bloom_sz;
|
|
+ size_t *bloom;
|
|
+
|
|
+ nbucket = gnu_hash[0];
|
|
+ sym_off = gnu_hash[1];
|
|
+ bloom_sz = gnu_hash[2];
|
|
+ bloom = (size_t *)&gnu_hash[4];
|
|
+ bucket = (Hash_t *)(&bloom[bloom_sz]);
|
|
+ elf_hash = &elf_gnu_hash;
|
|
+ pr_debug("nbucket %lx sym_off %lx bloom_sz %lx bloom %lx bucket %lx\n",
|
|
+ (unsigned long)nbucket, (unsigned long)sym_off,
|
|
+ (unsigned long)bloom_sz, (unsigned long)bloom,
|
|
+ (unsigned long)bucket);
|
|
+ } else {
|
|
+ nbucket = hash[0];
|
|
+ nchain = hash[1];
|
|
+ bucket = &hash[2];
|
|
+ chain = &hash[nbucket + 2];
|
|
+ elf_hash = &elf_sysv_hash;
|
|
+ pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n",
|
|
+ (unsigned long)nbucket, (unsigned long)nchain,
|
|
+ (unsigned long)bucket, (unsigned long)chain);
|
|
+ }
|
|
|
|
- if (std_strncmp(name, symbol, vdso_symbol_length))
|
|
- continue;
|
|
|
|
- /* XXX: provide strncpy() implementation for PIE */
|
|
- memcpy(t->symbols[i].name, name, vdso_symbol_length);
|
|
- t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr;
|
|
- break;
|
|
+ for (i = 0; i < VDSO_SYMBOL_MAX; i++) {
|
|
+ const char *symbol = vdso_symbols[i];
|
|
+ unsigned long addr, symbol_hash;
|
|
+ const size_t symbol_length = __strlen(symbol);
|
|
+
|
|
+ symbol_hash = elf_hash((const unsigned char *)symbol);
|
|
+ addr = elf_symbol_lookup(mem, size, symbol, symbol_hash,
|
|
+ sym_off, dynsymbol_names, dyn_symtab, load,
|
|
+ nbucket, nchain, bucket, chain,
|
|
+ vdso_symbol_length, use_gnu_hash);
|
|
+ pr_debug("symbol %s at address %lx\n", symbol, addr);
|
|
+ if (!addr)
|
|
+ continue;
|
|
+
|
|
+ /* XXX: provide strncpy() implementation for PIE */
|
|
+ if (symbol_length > vdso_symbol_length) {
|
|
+ pr_err("strlen(%s) %zd, only %zd bytes available\n",
|
|
+ symbol, symbol_length, vdso_symbol_length);
|
|
+ return -EINVAL;
|
|
}
|
|
+ memcpy(t->symbols[i].name, symbol, symbol_length);
|
|
+ t->symbols[i].offset = addr - load->p_vaddr;
|
|
}
|
|
+
|
|
+ return 0;
|
|
}
|
|
|
|
int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t)
|
|
@@ -271,6 +416,7 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t)
|
|
Dyn_t *dyn_symtab = NULL;
|
|
Dyn_t *dyn_hash = NULL;
|
|
Hash_t *hash = NULL;
|
|
+ bool use_gnu_hash;
|
|
|
|
uintptr_t dynsymbol_names;
|
|
uintptr_t addr;
|
|
@@ -296,7 +442,8 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t)
|
|
* needed. Note that we're interested in a small set of tags.
|
|
*/
|
|
|
|
- ret = parse_elf_dynamic(mem, size, dynamic, &dyn_strtab, &dyn_symtab, &dyn_hash);
|
|
+ ret = parse_elf_dynamic(mem, size, dynamic, &dyn_strtab, &dyn_symtab,
|
|
+ &dyn_hash, &use_gnu_hash);
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
@@ -310,7 +457,11 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t)
|
|
goto err_oob;
|
|
hash = (void *)addr;
|
|
|
|
- parse_elf_symbols(mem, size, load, t, dynsymbol_names, hash, dyn_symtab);
|
|
+ ret = parse_elf_symbols(mem, size, load, t, dynsymbol_names, hash, dyn_symtab,
|
|
+ use_gnu_hash);
|
|
+
|
|
+ if (ret <0)
|
|
+ return ret;
|
|
|
|
return 0;
|
|
|