From e6e4f6cc86a74fefbb3d1a6ced3ab76f4ea3e69b Mon Sep 17 00:00:00 2001 From: Patsy Griffin Date: Mon, 11 Nov 2024 17:02:45 -0500 Subject: [PATCH] aarch64: Add support for MTE to strncmp. Resolves: RHEL-61255 --- glibc-RHEL-61255.patch | 374 +++++++++++++++++++++++++++++++++++++++++ glibc.spec | 6 +- 2 files changed, 379 insertions(+), 1 deletion(-) create mode 100644 glibc-RHEL-61255.patch diff --git a/glibc-RHEL-61255.patch b/glibc-RHEL-61255.patch new file mode 100644 index 0000000..30d504d --- /dev/null +++ b/glibc-RHEL-61255.patch @@ -0,0 +1,374 @@ +commit 03e1378f94173fc192a81e421457198f7b8a34a0 +Author: Alex Butler +Date: Tue Jun 16 12:44:24 2020 +0000 + + aarch64: MTE compatible strncmp + + Add support for MTE to strncmp. Regression tested with xcheck and benchmarked + with glibc's benchtests on the Cortex-A53, Cortex-A72, and Neoverse N1. + + The existing implementation assumes that any access to the pages in which the + string resides is safe. This assumption is not true when MTE is enabled. This + patch updates the algorithm to ensure that accesses remain within the bounds + of an MTE tag (16-byte chunks) and improves overall performance. + + Co-authored-by: Branislav Rankov + Co-authored-by: Wilco Dijkstra + +diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S +index c5141fab8a..ba2563490e 100644 +--- a/sysdeps/aarch64/strncmp.S ++++ b/sysdeps/aarch64/strncmp.S +@@ -25,7 +25,6 @@ + + #define REP8_01 0x0101010101010101 + #define REP8_7f 0x7f7f7f7f7f7f7f7f +-#define REP8_80 0x8080808080808080 + + /* Parameters and result. */ + #define src1 x0 +@@ -46,15 +45,31 @@ + #define tmp3 x10 + #define zeroones x11 + #define pos x12 +-#define limit_wd x13 +-#define mask x14 +-#define endloop x15 ++#define mask x13 ++#define endloop x14 + #define count mask ++#define offset pos ++#define neg_offset x15 + +-ENTRY_ALIGN_AND_PAD (strncmp, 6, 7) +- DELOUSE (0) +- DELOUSE (1) +- DELOUSE (2) ++/* Define endian dependent shift operations. 
++ On big-endian early bytes are at MSB and on little-endian LSB. ++ LS_FW means shifting towards early bytes. ++ LS_BK means shifting towards later bytes. ++ */ ++#ifdef __AARCH64EB__ ++#define LS_FW lsl ++#define LS_BK lsr ++#else ++#define LS_FW lsr ++#define LS_BK lsl ++#endif ++ ++ .text ++ .p2align 6 ++ .rep 9 ++ nop /* Pad so that the loop below fits a cache line. */ ++ .endr ++ENTRY_ALIGN (strncmp, 0) + cbz limit, L(ret0) + eor tmp1, src1, src2 + mov zeroones, #REP8_01 +@@ -62,9 +77,6 @@ ENTRY_ALIGN_AND_PAD (strncmp, 6, 7) + and count, src1, #7 + b.ne L(misaligned8) + cbnz count, L(mutual_align) +- /* Calculate the number of full and partial words -1. */ +- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ +- lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ + + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and +@@ -74,56 +86,52 @@ L(loop_aligned): + ldr data1, [src1], #8 + ldr data2, [src2], #8 + L(start_realigned): +- subs limit_wd, limit_wd, #1 ++ subs limit, limit, #8 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ +- csinv endloop, diff, xzr, pl /* Last Dword or differences. */ ++ csinv endloop, diff, xzr, hi /* Last Dword or differences. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp endloop, #0, #0, eq + b.eq L(loop_aligned) + /* End of performance-critical section -- one 64B cache line. */ + +- /* Not reached the limit, must have found the end or a diff. */ +- tbz limit_wd, #63, L(not_limit) +- +- /* Limit % 8 == 0 => all bytes significant. */ +- ands limit, limit, #7 +- b.eq L(not_limit) +- +- lsl limit, limit, #3 /* Bits -> bytes. */ +- mov mask, #~0 +-#ifdef __AARCH64EB__ +- lsr mask, mask, limit +-#else +- lsl mask, mask, limit +-#endif +- bic data1, data1, mask +- bic data2, data2, mask +- +- /* Make sure that the NUL byte is marked in the syndrome. 
*/ +- orr has_nul, has_nul, mask +- +-L(not_limit): ++L(full_check): ++#ifndef __AARCH64EB__ + orr syndrome, diff, has_nul +- +-#ifndef __AARCH64EB__ ++ add limit, limit, 8 /* Rewind limit to before last subs. */ ++L(syndrome_check): ++ /* Limit was reached. Check if the NUL byte or the difference ++ is before the limit. */ + rev syndrome, syndrome + rev data1, data1 +- /* The MS-non-zero bit of the syndrome marks either the first bit +- that is different, or the top bit of the first zero byte. +- Shifting left now will bring the critical information into the +- top bits. */ + clz pos, syndrome + rev data2, data2 + lsl data1, data1, pos ++ cmp limit, pos, lsr #3 + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 +- RET ++ csel result, result, xzr, hi ++ ret + #else ++ /* Not reached the limit, must have found the end or a diff. */ ++ tbz limit, #63, L(not_limit) ++ add tmp1, limit, 8 ++ cbz limit, L(not_limit) ++ ++ lsl limit, tmp1, #3 /* Bytes -> bits. */ ++ mov mask, #~0 ++ lsr mask, mask, limit ++ bic data1, data1, mask ++ bic data2, data2, mask ++ ++ /* Make sure that the NUL byte is marked in the syndrome. */ ++ orr has_nul, has_nul, mask ++ ++L(not_limit): + /* For big-endian we cannot use the trick with the syndrome value + as carry-propagation can corrupt the upper bits if the trailing + bytes in the string contain 0x01. */ +@@ -134,7 +142,7 @@ L(not_limit): + cmp data1, data2 + cset result, ne + cneg result, result, lo +- RET ++ ret + 1: + /* Re-compute the NUL-byte detection, using a byte-reversed value. */ + rev tmp3, data1 +@@ -144,17 +152,18 @@ L(not_limit): + rev has_nul, has_nul + orr syndrome, diff, has_nul + clz pos, syndrome +- /* The MS-non-zero bit of the syndrome marks either the first bit +- that is different, or the top bit of the first zero byte. 
++ /* The most-significant-non-zero bit of the syndrome marks either the ++ first bit that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ ++L(end_quick): + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 +- RET ++ ret + #endif + + L(mutual_align): +@@ -169,22 +178,12 @@ L(mutual_align): + neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ + ldr data2, [src2], #8 + mov tmp2, #~0 +- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ +-#ifdef __AARCH64EB__ +- /* Big-endian. Early bytes are at MSB. */ +- lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ +-#else +- /* Little-endian. Early bytes are at LSB. */ +- lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ +-#endif +- and tmp3, limit_wd, #7 +- lsr limit_wd, limit_wd, #3 +- /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ +- add limit, limit, count +- add tmp3, tmp3, count ++ LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ ++ /* Adjust the limit and ensure it doesn't overflow. */ ++ adds limit, limit, count ++ csinv limit, limit, xzr, lo + orr data1, data1, tmp2 + orr data2, data2, tmp2 +- add limit_wd, limit_wd, tmp3, lsr #3 + b L(start_realigned) + + .p2align 6 +@@ -203,18 +202,15 @@ L(byte_loop): + b.eq L(byte_loop) + L(done): + sub result, data1, data2 +- RET +- ++ ret + /* Align the SRC1 to a dword by doing a bytewise compare and then do + the dword loop. 
*/ + L(try_misaligned_words): +- lsr limit_wd, limit, #3 +- cbz count, L(do_misaligned) ++ cbz count, L(src1_aligned) + + neg count, count + and count, count, #7 + sub limit, limit, count +- lsr limit_wd, limit, #3 + + L(page_end_loop): + ldrb data1w, [src1], #1 +@@ -225,48 +221,98 @@ L(page_end_loop): + subs count, count, #1 + b.hi L(page_end_loop) + +-L(do_misaligned): +- /* Prepare ourselves for the next page crossing. Unlike the aligned +- loop, we fetch 1 less dword because we risk crossing bounds on +- SRC2. */ +- mov count, #8 +- subs limit_wd, limit_wd, #1 +- b.lo L(done_loop) ++ /* The following diagram explains the comparison of misaligned strings. ++ The bytes are shown in natural order. For little-endian, it is ++ reversed in the registers. The "x" bytes are before the string. ++ The "|" separates data that is loaded at one time. ++ src1 | a a a a a a a a | b b b c c c c c | . . . ++ src2 | x x x x x a a a a a a a a b b b | c c c c c . . . ++ After shifting in each step, the data looks like this: ++ STEP_A STEP_B STEP_C ++ data1 a a a a a a a a b b b c c c c c b b b c c c c c ++ data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c ++ The bytes with "0" are eliminated from the syndrome via mask. ++ Align SRC2 down to 16 bytes. This way we can read 16 bytes at a ++ time from SRC2. The comparison happens in 3 steps. After each step ++ the loop can exit, or read from SRC1 or SRC2. */ ++L(src1_aligned): ++ /* Calculate offset from 8 byte alignment to string start in bits. No ++ need to mask offset since shifts are ignoring upper bits. */ ++ lsl offset, src2, #3 ++ bic src2, src2, #0xf ++ mov mask, -1 ++ neg neg_offset, offset ++ ldr data1, [src1], #8 ++ ldp tmp1, tmp2, [src2], #16 ++ LS_BK mask, mask, neg_offset ++ and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ ++ /* Skip the first compare if data in tmp1 is irrelevant. 
*/ ++ tbnz offset, 6, L(misaligned_mid_loop) ++ + L(loop_misaligned): +- and tmp2, src2, #0xff8 +- eor tmp2, tmp2, #0xff8 +- cbz tmp2, L(page_end_loop) ++ /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ ++ LS_FW data2, tmp1, offset ++ LS_BK tmp1, tmp2, neg_offset ++ subs limit, limit, #8 ++ orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ ++ sub has_nul, data1, zeroones ++ eor diff, data1, data2 /* Non-zero if differences found. */ ++ orr tmp3, data1, #REP8_7f ++ csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ ++ bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ ++ orr tmp3, endloop, has_nul ++ cbnz tmp3, L(full_check) + + ldr data1, [src1], #8 +- ldr data2, [src2], #8 +- sub tmp1, data1, zeroones +- orr tmp2, data1, #REP8_7f +- eor diff, data1, data2 /* Non-zero if differences found. */ +- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ +- ccmp diff, #0, #0, eq +- b.ne L(not_limit) +- subs limit_wd, limit_wd, #1 +- b.pl L(loop_misaligned) ++L(misaligned_mid_loop): ++ /* STEP_B: Compare first part of data1 to second part of tmp2. */ ++ LS_FW data2, tmp2, offset ++#ifdef __AARCH64EB__ ++ /* For big-endian we do a byte reverse to avoid carry-propagation ++ problem described above. This way we can reuse the has_nul in the ++ next step and also use syndrome value trick at the end. */ ++ rev tmp3, data1 ++ #define data1_fixed tmp3 ++#else ++ #define data1_fixed data1 ++#endif ++ sub has_nul, data1_fixed, zeroones ++ orr tmp3, data1_fixed, #REP8_7f ++ eor diff, data2, data1 /* Non-zero if differences found. */ ++ bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ ++#ifdef __AARCH64EB__ ++ rev has_nul, has_nul ++#endif ++ cmp limit, neg_offset, lsr #3 ++ orr syndrome, diff, has_nul ++ bic syndrome, syndrome, mask /* Ignore later bytes. */ ++ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. 
*/ ++ cbnz tmp3, L(syndrome_check) + +-L(done_loop): +- /* We found a difference or a NULL before the limit was reached. */ +- and limit, limit, #7 +- cbz limit, L(not_limit) +- /* Read the last word. */ +- sub src1, src1, 8 +- sub src2, src2, 8 +- ldr data1, [src1, limit] +- ldr data2, [src2, limit] +- sub tmp1, data1, zeroones +- orr tmp2, data1, #REP8_7f +- eor diff, data1, data2 /* Non-zero if differences found. */ +- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ +- ccmp diff, #0, #0, eq +- b.ne L(not_limit) ++ /* STEP_C: Compare second part of data1 to first part of tmp1. */ ++ ldp tmp1, tmp2, [src2], #16 ++ cmp limit, #8 ++ LS_BK data2, tmp1, neg_offset ++ eor diff, data2, data1 /* Non-zero if differences found. */ ++ orr syndrome, diff, has_nul ++ and syndrome, syndrome, mask /* Ignore earlier bytes. */ ++ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ ++ cbnz tmp3, L(syndrome_check) ++ ++ ldr data1, [src1], #8 ++ sub limit, limit, #8 ++ b L(loop_misaligned) ++ ++#ifdef __AARCH64EB__ ++L(syndrome_check): ++ clz pos, syndrome ++ cmp pos, limit, lsl #3 ++ b.lo L(end_quick) ++#endif + + L(ret0): + mov result, #0 +- RET ++ ret + + END (strncmp) + libc_hidden_builtin_def (strncmp) diff --git a/glibc.spec b/glibc.spec index 2c552b2..b4705f7 100644 --- a/glibc.spec +++ b/glibc.spec @@ -115,7 +115,7 @@ end \ Summary: The GNU libc libraries Name: glibc Version: %{glibcversion} -Release: %{glibcrelease}.7 +Release: %{glibcrelease}.8 # In general, GPLv2+ is used by programs, LGPLv2+ is used for # libraries. 
@@ -1244,6 +1244,7 @@ Patch1009: glibc-RHEL-36147-2.patch Patch1010: glibc-RHEL-36147-3.patch Patch1011: glibc-RHEL-49490-1.patch Patch1012: glibc-RHEL-49490-2.patch +Patch1013: glibc-RHEL-61255.patch ############################################################################## # Continued list of core "glibc" package information: @@ -2903,6 +2904,9 @@ fi %{_libdir}/libpthread_nonshared.a %changelog +* Mon Nov 11 2024 Patsy Griffin - 2.28-251.8 +- aarch64: MTE compatible strncmp (RHEL-61255) + * Wed Oct 23 2024 Florian Weimer - 2.28-251.7 - Use UsrMove path destination in the RPM files (RHEL-63048)