From 7878600f3a4c498359f6845b0d5f02779ec50c9c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patrik=20Novotn=C3=BD?=
Date: Thu, 3 Jun 2021 14:22:14 +0200
Subject: [PATCH] Add IBM CRC32 optimization patch for s390x

Resolves: #1959423
---
 zlib-1.2.11-s390x-vectorize-crc32.patch | 455 ++++++++++++++++++++++++
 zlib.spec                               |   8 +-
 2 files changed, 462 insertions(+), 1 deletion(-)
 create mode 100644 zlib-1.2.11-s390x-vectorize-crc32.patch

diff --git a/zlib-1.2.11-s390x-vectorize-crc32.patch b/zlib-1.2.11-s390x-vectorize-crc32.patch
new file mode 100644
index 0000000..6801f85
--- /dev/null
+++ b/zlib-1.2.11-s390x-vectorize-crc32.patch
@@ -0,0 +1,455 @@
+From 367e79caf76bda5fdb974420b72c6ddabdcd664e Mon Sep 17 00:00:00 2001
+From: Ilya Leoshkevich
+Date: Thu, 19 Mar 2020 11:52:03 +0100
+Subject: [PATCH] s390x: vectorize crc32
+
+Use vector extensions when compiling for s390x, provided binutils
+knows about them. At runtime, check whether the kernel supports
+vector extensions (support is needed not just in the CPU, but also
+in the kernel) and choose between the regular and the vectorized
+implementations.
+---
+ Makefile.in               |   8 ++
+ configure                 |  16 +++
+ contrib/s390/crc32le-vx.S | 273 ++++++++++++++++++++++++++++++++++++++
+ crc32.c                   |  66 ++++++++-
+ 4 files changed, 361 insertions(+), 2 deletions(-)
+ create mode 100644 contrib/s390/crc32le-vx.S
+
+diff --git a/Makefile.in b/Makefile.in
+index 6070dcc..23e8694 100644
+--- a/Makefile.in
++++ b/Makefile.in
+@@ -179,6 +179,9 @@ crc32_power8.o: $(SRCDIR)contrib/power8-crc/vec_crc32.c
+ crc32.o: $(SRCDIR)crc32.c
+ 	$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c
+ 
++crc32le-vx.o: $(SRCDIR)contrib/s390/crc32le-vx.S
++	$(CC) $(CFLAGS) -march=z13 $(ZINC) -c -o $@ $(SRCDIR)contrib/s390/crc32le-vx.S
++
+ deflate.o: $(SRCDIR)deflate.c
+ 	$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c
+ 
+@@ -234,6 +237,11 @@ crc32.lo: $(SRCDIR)crc32.c
+ 	$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c
+ 	-@mv objs/crc32.o $@
+ 
++crc32le-vx.lo: $(SRCDIR)contrib/s390/crc32le-vx.S
++	-@mkdir objs 2>/dev/null || test -d objs
++	$(CC) $(SFLAGS) -march=z13 $(ZINC) -DPIC -c -o objs/crc32le-vx.o $(SRCDIR)contrib/s390/crc32le-vx.S
++	-@mv objs/crc32le-vx.o $@
++
+ deflate.lo: $(SRCDIR)deflate.c
+ 	-@mkdir objs 2>/dev/null || test -d objs
+ 	$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c
+diff --git a/configure b/configure
+index 70ed86b..e658039 100755
+--- a/configure
++++ b/configure
+@@ -923,6 +923,22 @@
+ EOF
+ fi
+ fi
+ 
++# check if we are compiling for s390 and binutils supports vector extensions
++cat > $test.c <<EOF
++#ifndef __s390__
++#error
++#endif
++EOF
++if try $CC -c $CFLAGS $test.c &&
++   try $CC -c $CFLAGS -march=z13 $SRCDIR/contrib/s390/crc32le-vx.S; then
++  CFLAGS="$CFLAGS -DHAVE_S390X_VX"
++  SFLAGS="$SFLAGS -DHAVE_S390X_VX"
++  OBJC="$OBJC crc32le-vx.o"
++  PIC_OBJC="$PIC_OBJC crc32le-vx.lo"
++  echo "Checking for s390 vector extensions... Yes." | tee -a configure.log
++else
++  echo "Checking for s390 vector extensions... No." | tee -a configure.log
++fi
+ echo >> configure.log
+ echo ALL = $ALL >> configure.log
+diff --git a/contrib/s390/crc32le-vx.S b/contrib/s390/crc32le-vx.S
+new file mode 100644
+index 0000000..029cfff
+--- /dev/null
++++ b/contrib/s390/crc32le-vx.S
+@@ -0,0 +1,273 @@
++/*
++ * Hardware-accelerated CRC-32 variants for Linux on z Systems
++ *
++ * Use the z/Architecture Vector Extension Facility to accelerate the
++ * computing of bitreflected CRC-32 checksums.
++ *
++ * This CRC-32 implementation algorithm is bitreflected and processes
++ * the least-significant bit first (Little-Endian).
++ *
++ * This code was originally written by Hendrik Brueckner
++ * and included in the Linux kernel:
++ *
++ * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/s390/crypto/crc32le-vx.S?h=v5.5
++ *
++ * Hendrik Brueckner has allowed reusing it under the zlib license.
++ *
++ * The following adjustments were made:
++ *
++ * - Reformatted to match the zlib code style.
++ * - Changed the vector register numbers in order to avoid clobbering the call-saved %v8-%v15.
++ * - Fixed clang compatibility.
++ * - Added 31-bit compatibility.
++ */
++
++#ifndef __clang__
++.machinemode zarch
++#endif
++
++#define PART1 %v16
++#define PART2 %v17
++#define PART3 %v18
++#define PART4 %v19
++#define SHIFTS %v20
++
++/* Vector register range containing CRC-32 constants */
++#define CONST_PERM_LE2BE %v21
++#define CONST_R2R1 %v22
++#define CONST_R4R3 %v23
++#define CONST_R5 %v24
++#define CONST_RU_POLY %v25
++#define CONST_CRC_POLY %v26
++
++#if defined(__s390x__)
++#define AGHI aghi
++#define CGHI cghi
++#else
++#define AGHI ahi
++#define CGHI chi
++#endif
++
++.data
++.align 8
++
++/*
++ * The CRC-32 constant block contains reduction constants to fold and
++ * process particular chunks of the input data stream in parallel.
++ *
++ * For the CRC-32 variants, the constants are precomputed according to
++ * these definitions:
++ *
++ *    R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
++ *    R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
++ *    R3 = [(x128+32 mod P'(x) << 32)]' << 1
++ *    R4 = [(x128-32 mod P'(x) << 32)]' << 1
++ *    R5 = [(x64 mod P'(x) << 32)]' << 1
++ *    R6 = [(x32 mod P'(x) << 32)]' << 1
++ *
++ * The bitreflected Barrett reduction constant, u', is defined as
++ * the bit reversal of floor(x**64 / P(x)),
++ *
++ * where P(x) is the polynomial in the normal domain and P'(x) is the
++ * polynomial in the reversed (bitreflected) domain.
++ *
++ * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
++ *
++ *    P(x)  = 0x04C11DB7
++ *    P'(x) = 0xEDB88320
++ */
++
++.Lconstants_CRC_32_LE:
++	.octa 0x0F0E0D0C0B0A09080706050403020100	# BE->LE mask
++	.quad 0x1c6e41596, 0x154442bd4			# R2, R1
++	.quad 0x0ccaa009e, 0x1751997d0			# R4, R3
++	.octa 0x163cd6124				# R5
++	.octa 0x1F7011641				# u'
++	.octa 0x1DB710641				# P'(x) << 1
++
++.text
++
++/*
++ * The CRC-32 functions use these calling conventions:
++ *
++ * Parameters:
++ *
++ * %r2: Initial CRC value, typically ~0; and final CRC (return) value.
++ * %r3: Input buffer pointer, performance might be improved if the
++ *      buffer is on a doubleword boundary.
++ * %r4: Length of the buffer, must be 64 bytes or greater.
++ *
++ * Register usage:
++ *
++ * %r5: CRC-32 constant pool base pointer.
++ * V0:  Initial CRC value and intermediate constants and results.
++ * V1..V4: Data for CRC computation.
++ * V16..V19: Next data chunks that are fetched from the input buffer.
++ * V20: Constant for BE->LE conversion and shift operations.
++ *
++ * V21..V26: CRC-32 constants.
++ */
++
++	.globl crc32_le_vgfm_16
++	.align 4, 0x07
++crc32_le_vgfm_16:
++	/* Load CRC-32 constants */
++	larl %r5,.Lconstants_CRC_32_LE
++	VLM CONST_PERM_LE2BE,CONST_CRC_POLY,0(%r5)
++
++	/*
++	 * Load the initial CRC value.
++	 *
++	 * The CRC value is loaded into the rightmost word of the
++	 * vector register and is later XORed with the LSB portion
++	 * of the loaded input data.
++	 */
++	VZERO %v0			/* Clear V0 */
++	VLVGF %v0,%r2,3			/* Load CRC into rightmost word */
++
++	/* Load a 64-byte data chunk and XOR with CRC */
++	VLM %v1,%v4,0(%r3)		/* 64 bytes into V1..V4 */
++	VPERM %v1,%v1,%v1,CONST_PERM_LE2BE
++	VPERM %v2,%v2,%v2,CONST_PERM_LE2BE
++	VPERM %v3,%v3,%v3,CONST_PERM_LE2BE
++	VPERM %v4,%v4,%v4,CONST_PERM_LE2BE
++
++	VX %v1,%v0,%v1			/* V1 ^= CRC */
++	AGHI %r3,64			/* BUF = BUF + 64 */
++	AGHI %r4,-64			/* LEN = LEN - 64 */
++
++	CGHI %r4,64
++	jl .Lless_than_64bytes
++
++.Lfold_64bytes_loop:
++	/* Load the next 64-byte data chunk into PART1 to PART4 */
++	VLM PART1,PART4,0(%r3)
++	VPERM PART1,PART1,PART1,CONST_PERM_LE2BE
++	VPERM PART2,PART2,PART2,CONST_PERM_LE2BE
++	VPERM PART3,PART3,PART3,CONST_PERM_LE2BE
++	VPERM PART4,PART4,PART4,CONST_PERM_LE2BE
++
++	/*
++	 * Perform a GF(2) multiplication of the doublewords in V1 with
++	 * the R1 and R2 reduction constants in V0.  The intermediate result
++	 * is then folded (accumulated) with the next data chunk in PART1 and
++	 * stored in V1.  Repeat this step for the register contents
++	 * in V2, V3, and V4 respectively.
++	 */
++	VGFMAG %v1,CONST_R2R1,%v1,PART1
++	VGFMAG %v2,CONST_R2R1,%v2,PART2
++	VGFMAG %v3,CONST_R2R1,%v3,PART3
++	VGFMAG %v4,CONST_R2R1,%v4,PART4
++
++	AGHI %r3,64			/* BUF = BUF + 64 */
++	AGHI %r4,-64			/* LEN = LEN - 64 */
++
++	CGHI %r4,64
++	jnl .Lfold_64bytes_loop
++
++.Lless_than_64bytes:
++	/*
++	 * Fold V1 to V4 into a single 128-bit value in V1.  Multiply V1 with
++	 * R3 and R4, accumulating the next 128-bit chunk, until a single
++	 * 128-bit value remains.
++	 */
++	VGFMAG %v1,CONST_R4R3,%v1,%v2
++	VGFMAG %v1,CONST_R4R3,%v1,%v3
++	VGFMAG %v1,CONST_R4R3,%v1,%v4
++
++	CGHI %r4,16
++	jl .Lfinal_fold
++
++.Lfold_16bytes_loop:
++
++	VL %v2,0(%r3)			/* Load next data chunk */
++	VPERM %v2,%v2,%v2,CONST_PERM_LE2BE
++	VGFMAG %v1,CONST_R4R3,%v1,%v2	/* Fold next data chunk */
++
++	AGHI %r3,16
++	AGHI %r4,-16
++
++	CGHI %r4,16
++	jnl .Lfold_16bytes_loop
++
++.Lfinal_fold:
++	/*
++	 * Set up a vector register for byte shifts.  The shift value must
++	 * be loaded in bits 1-4 of byte element 7 of a vector register.
++	 * Shift by 8 bytes: 0x40
++	 * Shift by 4 bytes: 0x20
++	 */
++	VLEIB SHIFTS,0x40,7
++
++	/*
++	 * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
++	 * to move R4 into the rightmost doubleword and set the leftmost
++	 * doubleword to 0x1.
++	 */
++	VSRLB %v0,CONST_R4R3,SHIFTS
++	VLEIG %v0,1,0
++
++	/*
++	 * Compute GF(2) product of V1 and V0.  The rightmost doubleword
++	 * of V1 is multiplied with R4.  The leftmost doubleword of V1 is
++	 * multiplied by 0x1 and is then XORed with the rightmost product.
++	 * Implicitly, the intermediate leftmost product becomes padded with zeroes.
++	 */
++	VGFMG %v1,%v0,%v1
++
++	/*
++	 * Now do the final 32-bit fold by multiplying the rightmost word
++	 * in V1 with R5 and XORing the result with the remaining bits in V1.
++	 *
++	 * To achieve this by a single VGFMAG, right shift V1 by a word
++	 * and store the result in V2 which is then accumulated.  Use the
++	 * vector unpack instruction to load the rightmost half of the
++	 * doubleword into the rightmost doubleword element of V1; the other
++	 * half is loaded in the leftmost doubleword.
++	 * The vector register with CONST_R5 contains the R5 constant in the
++	 * rightmost doubleword and the leftmost doubleword is zero to ignore
++	 * the leftmost product of V1.
++	 */
++	VLEIB SHIFTS,0x20,7		/* Shift by words */
++	VSRLB %v2,%v1,SHIFTS		/* Store remaining bits in V2 */
++	VUPLLF %v1,%v1			/* Split rightmost doubleword */
++	VGFMAG %v1,CONST_R5,%v1,%v2	/* V1 = (V1 * R5) XOR V2 */
++
++	/*
++	 * Apply a Barrett reduction to compute the final 32-bit CRC value.
++	 *
++	 * The input values to the Barrett reduction are the degree-63 polynomial
++	 * in V1 (R(x)), the degree-32 generator polynomial, and the reduction
++	 * constant u.  The Barrett reduction result is the CRC value of R(x) mod
++	 * P(x).
++	 *
++	 * The Barrett reduction algorithm is defined as:
++	 *
++	 *    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
++	 *    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
++	 *    3. C(x)  = R(x) XOR T2(x) mod x^32
++	 *
++	 * Note: The leftmost doubleword of the vector register containing
++	 * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
++	 * is zero and does not contribute to the final result.
++	 */
++
++	/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
++	VUPLLF %v2,%v1
++	VGFMG %v2,CONST_RU_POLY,%v2
++
++	/*
++	 * Compute the GF(2) product of the CRC polynomial with T1(x) in
++	 * V2 and XOR the intermediate result, T2(x), with the value in V1.
++	 * The final result is stored in word element 2 of V2.
++	 */
++	VUPLLF %v2,%v2
++	VGFMAG %v2,CONST_CRC_POLY,%v2,%v1
++
++.Ldone:
++	VLGVF %r2,%v2,2
++	BR %r14
++	.type crc32_le_vgfm_16, @function
++	.size crc32_le_vgfm_16, .-crc32_le_vgfm_16
++
++.previous
+diff --git a/crc32.c b/crc32.c
+index 34132ea..af5d3cd 100644
+--- a/crc32.c
++++ b/crc32.c
+@@ -252,12 +252,26 @@ unsigned long crc32_vpmsum(unsigned long, const unsigned char FAR *, z_size_t);
+ #endif
+ #endif
+ 
++#ifdef HAVE_S390X_VX
++#include <sys/auxv.h>
++
++local unsigned long crc32_s390_vx(unsigned long crc,
++                                  const unsigned char FAR *buf,
++                                  z_size_t len);
++#endif
++
+ /* due to a quirk of gnu_indirect_function - "local" (aka static) is applied to
+  * crc32_z which is not desired. crc32_z_ifunc is implicitly "local" */
+ #ifndef Z_IFUNC_ASM
+ local
+ #endif
+-unsigned long (*(crc32_z_ifunc(void)))(unsigned long, const unsigned char FAR *, z_size_t)
++unsigned long (*(crc32_z_ifunc(
++#ifdef __s390__
++unsigned long hwcap
++#else
++void
++#endif
++)))(unsigned long, const unsigned char FAR *, z_size_t)
+ {
+ #if _ARCH_PWR8==1
+ #if defined(__BUILTIN_CPU_SUPPORTS__)
+@@ -269,6 +283,11 @@ unsigned long (*(crc32_z_ifunc(void)))(unsigned long, const unsigned char FAR *,
+ #endif
+ #endif /* _ARCH_PWR8 */
+ 
++#ifdef HAVE_S390X_VX
++    if (hwcap & HWCAP_S390_VX)
++        return crc32_s390_vx;
++#endif
++
+ /* return a function pointer for optimized arches here */
+ 
+ #ifdef DYNAMIC_CRC_TABLE
+@@ -301,7 +320,11 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
+     static unsigned long ZEXPORT (*crc32_func)(unsigned long, const unsigned char FAR *, z_size_t) = NULL;
+ 
+     if (!crc32_func)
+-        crc32_func = crc32_z_ifunc();
++        crc32_func = crc32_z_ifunc(
++#ifdef __s390__
++            getauxval(AT_HWCAP)
++#endif
++            );
+     return (*crc32_func)(crc, buf, len);
+ }
+ 
+@@ -500,6 +523,45 @@ local uLong crc32_combine_(crc1, crc2, len2)
+     return crc1;
+ }
+ 
++#ifdef HAVE_S390X_VX
++#define VX_MIN_LEN 64
++#define VX_ALIGNMENT 16L
++#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
++
++unsigned int crc32_le_vgfm_16(unsigned int crc,
++                              unsigned char const *buf,
++                              size_t size);
++
++local unsigned long crc32_s390_vx(crc, buf, len)
++    unsigned long crc;
++    const unsigned char FAR *buf;
++    z_size_t len;
++{
++    unsigned long prealign, aligned, remaining;
++
++    if (buf == Z_NULL) return 0UL;
++
++    if (len < VX_MIN_LEN + VX_ALIGN_MASK)
++        return crc32_big(crc, buf, len);
++
++    if ((unsigned long)buf & VX_ALIGN_MASK) {
++        prealign = VX_ALIGNMENT - ((unsigned long)buf & VX_ALIGN_MASK);
++        len -= prealign;
++        crc = crc32_big(crc, buf, prealign);
++        buf = (void *)((unsigned long)buf + prealign);
++    }
++    aligned = len & ~VX_ALIGN_MASK;
++    remaining = len & VX_ALIGN_MASK;
++
++    crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, aligned) ^ 0xffffffff;
++
++    if (remaining)
++        crc = crc32_big(crc, buf + aligned, remaining);
++
++    return crc;
++}
++#endif
++
+ /* ========================================================================= */
+ uLong ZEXPORT crc32_combine(crc1, crc2, len2)
+     uLong crc1;
+-- 
+2.25.1
+
diff --git a/zlib.spec b/zlib.spec
index a5cb38f..0327f63 100644
--- a/zlib.spec
+++ b/zlib.spec
@@ -2,7 +2,7 @@
 
 Name: zlib
 Version: 1.2.11
-Release: 27%{?dist}
+Release: 28%{?dist}
 Summary: Compression and decompression library
 # /contrib/dotzlib/ has Boost license
 License: zlib and Boost
@@ -33,6 +33,8 @@ Patch14: zlib-1.2.11-inflateSyncPoint-return-value-fix.patch
 # fixed issues found by covscan for rhel-9
 # ref: https://github.com/madler/zlib/pull/554
 Patch15: zlib-1.2.11-covscan-issues-rhel9.patch
+# Vectorized CRC32 implementation for s390x
+Patch16: zlib-1.2.11-s390x-vectorize-crc32.patch
 
 BuildRequires: make
 BuildRequires: automake, autoconf, libtool
@@ -100,6 +102,7 @@ developing applications which use minizip.
 %patch13 -p1
 %patch14 -p1
 %patch15 -p1
+%patch16 -p1
 
 iconv -f iso-8859-2 -t utf-8 < ChangeLog > ChangeLog.tmp
 
@@ -181,6 +184,9 @@ find $RPM_BUILD_ROOT -name '*.la' -delete
 
 %changelog
+* Thu Jun 03 2021 Patrik Novotný - 1.2.11-28
+- IBM CRC32 optimization rhbz#1959423
+
 * Fri Apr 16 2021 Mohan Boddu - 1.2.11-27
 - Rebuilt for RHEL 9 BETA on Apr 15th 2021. Related: rhbz#1947937
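
For reference, the runtime dispatch that the crc32.c hunks above add boils down to the
following standalone sketch. This is an illustration, not code from the patch:
crc32_bitwise() is a plain bitreflected CRC-32 (polynomial 0xEDB88320) standing in for
zlib's table-driven crc32_big(), crc32_vx() is a hypothetical stand-in for the
assembler-backed crc32_s390_vx(), and the HWCAP_S390_VX fallback value (1 << 11)
mirrors the kernel's hwcap bit definition.

/*
 * Standalone sketch of the hwcap-gated CRC-32 dispatch added by the patch.
 * crc32_bitwise() is a bit-at-a-time stand-in for zlib's crc32_big();
 * crc32_vx() stands in for the vectorized crc32_s390_vx().
 */
#include <stddef.h>
#include <stdio.h>

#ifdef __s390__
#include <sys/auxv.h>          /* getauxval(), AT_HWCAP */
#endif
#ifndef HWCAP_S390_VX
#define HWCAP_S390_VX 2048     /* 1 << 11, per the kernel's hwcap bits */
#endif

typedef unsigned long (*crc32_fn)(unsigned long, const unsigned char *, size_t);

/* Bitreflected CRC-32 (P'(x) = 0xEDB88320), LSB first, zlib semantics:
 * pass crc = 0 on the first call. */
static unsigned long crc32_bitwise(unsigned long crc, const unsigned char *buf,
                                   size_t len)
{
    int k;

    crc = ~crc & 0xffffffffUL;
    while (len--) {
        crc ^= *buf++;
        for (k = 0; k < 8; k++)
            crc = (crc >> 1) ^ (0xedb88320UL & (0UL - (crc & 1)));
    }
    return ~crc & 0xffffffffUL;
}

/* Stand-in for crc32_s390_vx(): the real routine folds aligned 64-byte
 * chunks with VGFM and delegates unaligned head/tail bytes to the scalar
 * code, as shown in the patch. */
static unsigned long crc32_vx(unsigned long crc, const unsigned char *buf,
                              size_t len)
{
    return crc32_bitwise(crc, buf, len);
}

/* Resolve once, as crc32_z_ifunc() does: the vector routine is chosen only
 * when the kernel advertises VX support in AT_HWCAP, because the kernel
 * must save and restore the vector registers on context switch. */
static crc32_fn crc32_resolve(void)
{
#ifdef __s390__
    if (getauxval(AT_HWCAP) & HWCAP_S390_VX)
        return crc32_vx;
#endif
    return crc32_bitwise;
}

int main(void)
{
    static crc32_fn impl;
    const unsigned char msg[] = "123456789";

    if (impl == NULL)          /* lazy one-time init, as in crc32_z() */
        impl = crc32_resolve();
    /* Standard CRC-32 check value: "123456789" -> 0xcbf43926. */
    printf("crc=%08lx\n", impl(0UL, msg, sizeof msg - 1));
    return 0;
}

Threading the AT_HWCAP word into the resolver instead of calling getauxval()
inside it is what lets crc32_z_ifunc() double as a GNU ifunc resolver on s390,
where the dynamic linker passes the hwcap value as the resolver's argument;
this is why the patch gives crc32_z_ifunc() an hwcap parameter under __s390__.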