73667d0be6
* Thu Apr 28 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-32 - Sync with upstream branch release/2.34/master, commit c66c92181ddbd82306537a608e8c0282587131de: - posix/glob.c: update from gnulib (BZ#25659) - linux: Fix fchmodat with AT_SYMLINK_NOFOLLOW for 64 bit time_t (BZ#29097) * Wed Apr 27 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-31 - Sync with upstream branch release/2.34/master, commit 55640ed3fde48360a8e8083be4843bd2dc7cecfe: - i386: Regenerate ulps - linux: Fix missing internal 64 bit time_t stat usage - x86: Optimize L(less_vec) case in memcmp-evex-movbe.S - x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI - x86-64: Use notl in EVEX strcmp [BZ #28646] - x86: Shrink memcmp-sse4.S code size - x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h - x86: Optimize memmove-vec-unaligned-erms.S - x86-64: Replace movzx with movzbl - x86-64: Remove Prefer_AVX2_STRCMP - x86-64: Improve EVEX strcmp with masked load - x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S - x86: Optimize memset-vec-unaligned-erms.S - x86: Optimize memcmp-evex-movbe.S for frontend behavior and size - x86: Modify ENTRY in sysdep.h so that p2align can be specified - x86-64: Optimize load of all bits set into ZMM register [BZ #28252] - scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier - dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078) - INSTALL: Rephrase -with-default-link documentation - misc: Fix rare fortify crash on wchar funcs. 
[BZ 29030] - Default to --with-default-link=no (bug 25812) - scripts: Add glibcelf.py module * Thu Apr 21 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-30 - Sync with upstream branch release/2.34/master, commit 71326f1f2fd09dafb9c34404765fb88129e94237: - nptl: Fix pthread_cancel cancelhandling atomic operations - mips: Fix mips64n32 64 bit time_t stat support (BZ#29069) - hurd: Fix arbitrary error code - nptl: Handle spurious EINTR when thread cancellation is disabled (BZ#29029) - S390: Add new s390 platform z16. - NEWS: Update fixed bug list for LD_AUDIT backports. - hppa: Fix bind-now audit (BZ #28857) - elf: Replace tst-audit24bmod2.so with tst-audit24bmod2 - Fix elf/tst-audit25a with default bind now toolchains - elf: Fix runtime linker auditing on aarch64 (BZ #26643) - elf: Issue la_symbind for bind-now (BZ #23734) - elf: Fix initial-exec TLS access on audit modules (BZ #28096) - elf: Add la_activity during application exit - elf: Do not fail for failed dlmopen on audit modules (BZ #28061) - elf: Issue audit la_objopen for vDSO - elf: Add audit tests for modules with TLSDESC - elf: Avoid unnecessary slowdown from profiling with audit (BZ#15533) - elf: Add _dl_audit_pltexit - elf: Add _dl_audit_pltenter - elf: Add _dl_audit_preinit - elf: Add _dl_audit_symbind_alt and _dl_audit_symbind - elf: Add _dl_audit_objclose - elf: Add _dl_audit_objsearch - elf: Add _dl_audit_activity_map and _dl_audit_activity_nsid - elf: Add _dl_audit_objopen - elf: Move la_activity (LA_ACT_ADD) after _dl_add_to_namespace_list() (BZ #28062) - elf: Move LAV_CURRENT to link_lavcurrent.h - elf: Fix elf_get_dynamic_info() for bootstrap - elf: Fix dynamic-link.h usage on rtld.c - elf: Fix elf_get_dynamic_info definition - elf: Avoid nested functions in the loader [BZ #27220] - powerpc: Delete unneeded ELF_MACHINE_BEFORE_RTLD_RELOC - hppa: Use END instead of PSEUDO_END in swapcontext.S - hppa: Implement swapcontext in assembler (bug 28960) Resolves: #2003291 Resolves: #2064181 Resolves: 
#2072328 Resolves: #2075713 Resolves: #2077838
41 lines
1.6 KiB
Diff
41 lines
1.6 KiB
Diff
commit baf3ece63453adac59c5688930324a78ced5b2e4
|
|
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
Date: Sat Oct 23 01:26:47 2021 -0400
|
|
|
|
x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S
|
|
|
|
This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.
|
|
|
|
It could potentially be dangerous to use SSE2 if this function is ever
|
|
called without using 'vzeroupper' beforehand. While compilers appear
|
|
to use 'vzeroupper' before function calls if AVX2 has been used, using
|
|
SSE2 here is more brittle. Since it is not absolutely necessary it
|
|
should be avoided.
|
|
|
|
It costs 2 extra bytes but the extra bytes should only eat into
|
|
alignment padding.
|
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
|
|
(cherry picked from commit bad852b61b79503fcb3c5fc379c70f768df3e1fb)
|
|
|
|
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
|
index 2761b54f2e7dea9f..640f6757fac8a356 100644
|
|
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
|
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
|
@@ -561,13 +561,13 @@ L(between_16_31):
|
|
/* From 16 to 31 bytes. No branch when size == 16. */
|
|
|
|
/* Use movups to save code size. */
|
|
- movups (%rsi), %xmm2
|
|
+ vmovdqu (%rsi), %xmm2
|
|
VPCMP $4, (%rdi), %xmm2, %k1
|
|
kmovd %k1, %eax
|
|
testl %eax, %eax
|
|
jnz L(return_vec_0_lv)
|
|
/* Use overlapping loads to avoid branches. */
|
|
- movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2
|
|
+ vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2
|
|
VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
|
|
addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
|
|
kmovd %k1, %eax
|