zlib/zlib-1.2.11-s390x-vectorize-crc32.patch

From 2dfdc5b7d6943c0ac60eef63e361e2a50f9da610 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Thu, 19 Mar 2020 11:52:03 +0100
Subject: [PATCH] s390x: vectorize crc32

Use vector extensions when compiling for s390x and binutils knows
about them. At runtime, check whether kernel supports vector
extensions (it has to be not just the CPU, but also the kernel) and
choose between the regular and the vectorized implementations.
---
 Makefile.in             |   9 ++
 configure               |  28 ++++++
 contrib/s390/crc32-vx.c | 195 ++++++++++++++++++++++++++++++++++++++++
 crc32.c                 |  55 +++++++++++-
 4 files changed, 285 insertions(+), 2 deletions(-)
 create mode 100644 contrib/s390/crc32-vx.c

diff --git a/Makefile.in b/Makefile.in
index 6070dcc..9e9743b 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -29,6 +29,7 @@ LDFLAGS=
 TEST_LDFLAGS=-L. libz.a
 LDSHARED=$(CC)
 CPP=$(CC) -E
+VGFMAFLAG=
 
 STATICLIB=libz.a
 SHAREDLIB=libz.so
@@ -179,6 +180,9 @@ crc32_power8.o: $(SRCDIR)contrib/power8-crc/vec_crc32.c
 crc32.o: $(SRCDIR)crc32.c
 	$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c
 
+crc32-vx.o: $(SRCDIR)contrib/s390/crc32-vx.c
+	$(CC) $(CFLAGS) $(VGFMAFLAG) $(ZINC) -c -o $@ $(SRCDIR)contrib/s390/crc32-vx.c
+
 deflate.o: $(SRCDIR)deflate.c
 	$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c
 
@@ -234,6 +238,11 @@ crc32.lo: $(SRCDIR)crc32.c
 	$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c
 	-@mv objs/crc32.o $@
 
+crc32-vx.lo: $(SRCDIR)contrib/s390/crc32-vx.c
+	-@mkdir objs 2>/dev/null || test -d objs
+	$(CC) $(SFLAGS) $(VGFMAFLAG) $(ZINC) -DPIC -c -o objs/crc32-vx.o $(SRCDIR)contrib/s390/crc32-vx.c
+	-@mv objs/crc32-vx.o $@
+
 deflate.lo: $(SRCDIR)deflate.c
 	-@mkdir objs 2>/dev/null || test -d objs
 	$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c
diff --git a/configure b/configure
index 70ed86b..7941f75 100755
--- a/configure
+++ b/configure
@@ -923,6 +923,32 @@ EOF
   fi
 fi
 
+# check if we are compiling for s390 and binutils supports vector extensions
+VGFMAFLAG=-march=z13
+cat > $test.c <<EOF
+#ifndef __s390__
+#error
+#endif
+EOF
+if try $CC -c $CFLAGS $VGFMAFLAG $test.c; then
+  CFLAGS="$CFLAGS -DHAVE_S390X_VX"
+  SFLAGS="$SFLAGS -DHAVE_S390X_VX"
+  OBJC="$OBJC crc32-vx.o"
+  PIC_OBJC="$PIC_OBJC crc32-vx.lo"
+  echo "Checking for s390 vector extensions... Yes." | tee -a configure.log
+
+  for flag in -mzarch -fzvector; do
+    if try $CC -c $CFLAGS $VGFMAFLAG $flag $test.c; then
+      VGFMAFLAG="$VGFMAFLAG $flag"
+      echo "Checking for $flag... Yes." | tee -a configure.log
+    else
+      echo "Checking for $flag... No." | tee -a configure.log
+    fi
+  done
+else
+  echo "Checking for s390 vector extensions... No." | tee -a configure.log
+fi
+
 # show the results in the log
 echo >> configure.log
 echo ALL = $ALL >> configure.log
@@ -955,6 +981,7 @@ echo mandir = $mandir >> configure.log
 echo prefix = $prefix >> configure.log
 echo sharedlibdir = $sharedlibdir >> configure.log
 echo uname = $uname >> configure.log
+echo VGFMAFLAG = $VGFMAFLAG >> configure.log
 
 # udpate Makefile with the configure results
 sed < ${SRCDIR}Makefile.in "
@@ -964,6 +991,7 @@ sed < ${SRCDIR}Makefile.in "
 /^LDFLAGS *=/s#=.*#=$LDFLAGS#
 /^LDSHARED *=/s#=.*#=$LDSHARED#
 /^CPP *=/s#=.*#=$CPP#
+/^VGFMAFLAG *=/s#=.*#=$VGFMAFLAG#
 /^STATICLIB *=/s#=.*#=$STATICLIB#
 /^SHAREDLIB *=/s#=.*#=$SHAREDLIB#
 /^SHAREDLIBV *=/s#=.*#=$SHAREDLIBV#
diff --git a/contrib/s390/crc32-vx.c b/contrib/s390/crc32-vx.c
new file mode 100644
index 0000000..fa5387c
--- /dev/null
+++ b/contrib/s390/crc32-vx.c
@@ -0,0 +1,195 @@
+/*
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
+ *
+ * Use the z/Architecture Vector Extension Facility to accelerate the
+ * computing of bitreflected CRC-32 checksums.
+ *
+ * This CRC-32 implementation algorithm is bitreflected and processes
+ * the least-significant bit first (Little-Endian).
+ *
+ * This code was originally written by Hendrik Brueckner
+ * <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
+ * relicensed under the zlib license.
+ */
+
+#include "../../zutil.h"
+
+#include <stdint.h>
+#include <vecintrin.h>
+
+typedef unsigned char uv16qi __attribute__((vector_size(16)));
+typedef unsigned int uv4si __attribute__((vector_size(16)));
+typedef unsigned long long uv2di __attribute__((vector_size(16)));
+
+uint32_t crc32_le_vgfm_16(uint32_t crc, const unsigned char *buf, size_t len) {
+    /*
+     * The CRC-32 constant block contains reduction constants to fold and
+     * process particular chunks of the input data stream in parallel.
+     *
+     * For the CRC-32 variants, the constants are precomputed according to
+     * these definitions:
+     *
+     *      R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
+     *      R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
+     *      R3 = [(x128+32 mod P'(x) << 32)]'   << 1
+     *      R4 = [(x128-32 mod P'(x) << 32)]'   << 1
+     *      R5 = [(x64 mod P'(x) << 32)]'       << 1
+     *      R6 = [(x32 mod P'(x) << 32)]'       << 1
+     *
+     *      The bitreflected Barrett reduction constant, u', is defined as
+     *      the bit reversal of floor(x**64 / P(x)).
+     *
+     *      where P(x) is the polynomial in the normal domain and the P'(x) is the
+     *      polynomial in the reversed (bitreflected) domain.
+     *
+     * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
+     *
+     *      P(x)  = 0x04C11DB7
+     *      P'(x) = 0xEDB88320
+     */
+    const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};  /* BE->LE mask */
+    const uv2di r2r1 = {0x1C6E41596, 0x154442BD4};                                     /* R2, R1 */
+    const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0};                                     /* R4, R3 */
+    const uv2di r5 = {0, 0x163CD6124};                                                 /* R5 */
+    const uv2di ru_poly = {0, 0x1F7011641};                                            /* u' */
+    const uv2di crc_poly = {0, 0x1DB710641};                                           /* P'(x) << 1 */
+
+    /*
+     * Load the initial CRC value.
+     *
+     * The CRC value is loaded into the rightmost word of the
+     * vector register and is later XORed with the LSB portion
+     * of the loaded input data.
+     */
+    uv2di v0 = {0, 0};
+    v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
+
+    /* Load a 64-byte data chunk and XOR with CRC */
+    uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
+    uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
+    uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
+    uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
+
+    v1 ^= v0;
+    buf += 64;
+    len -= 64;
+
+    while (len >= 64) {
+        /* Load the next 64-byte data chunk */
+        uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
+        uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
+        uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
+        uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
+
+        /*
+         * Perform a GF(2) multiplication of the doublewords in V1 with
+         * the R1 and R2 reduction constants in R2R1.  The intermediate result
+         * is then folded (accumulated) with the next data chunk in PART1 and
+         * stored in V1. Repeat this step for the register contents
+         * in V2, V3, and V4 respectively.
+         */
+        v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
+        v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
+        v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
+        v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
+
+        buf += 64;
+        len -= 64;
+    }
+
+    /*
+     * Fold V1 to V4 into a single 128-bit value in V1.  Multiply V1 with R3
+     * and R4 and accumulate the next 128-bit chunk until a single 128-bit
+     * value remains.
+     */
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
+
+    while (len >= 16) {
+        /* Load next data chunk */
+        v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
+
+        /* Fold next data chunk */
+        v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
+
+        buf += 16;
+        len -= 16;
+    }
+
+    /*
+     * Set up a vector register for byte shifts.  The shift value must
+     * be loaded in bits 1-4 in byte element 7 of a vector register.
+     * Shift by 8 bytes: 0x40
+     * Shift by 4 bytes: 0x20
+     */
+    uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    v9 = vec_insert((unsigned char)0x40, v9, 7);
+
+    /*
+     * Prepare V0 for the next GF(2) multiplication: shift R4R3 right by
+     * 8 bytes to move R4 into the rightmost doubleword of V0 and set the
+     * leftmost doubleword to 0x1.
+     */
+    v0 = vec_srb(r4r3, (uv2di)v9);
+    v0[0] = 1;
+
+    /*
+     * Compute GF(2) product of V1 and V0.  The rightmost doubleword
+     * of V1 is multiplied with R4.  The leftmost doubleword of V1 is
+     * multiplied by 0x1 and is then XORed with the rightmost product.
+     * Implicitly, the intermediate leftmost product becomes padded.
+     */
+    v1 = (uv2di)vec_gfmsum_128(v0, v1);
+
+    /*
+     * Now do the final 32-bit fold by multiplying the rightmost word
+     * in V1 with R5 and XOR the result with the remaining bits in V1.
+     *
+     * To achieve this by a single VGFMAG, right shift V1 by a word
+     * and store the result in V2 which is then accumulated.  Use the
+     * vector unpack instruction to load the rightmost half of the
+     * doubleword into the rightmost doubleword element of V1; the other
+     * half is loaded in the leftmost doubleword.
+     * The r5 vector register contains the R5 constant in the
+     * rightmost doubleword and the leftmost doubleword is zero to ignore
+     * the leftmost product of V1.
+     */
+    v9 = vec_insert((unsigned char)0x20, v9, 7);
+    v2 = vec_srb(v1, (uv2di)v9);
+    v1 = vec_unpackl((uv4si)v1);  /* Split rightmost doubleword */
+    v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
+
+    /*
+     * Apply a Barrett reduction to compute the final 32-bit CRC value.
+     *
+     * The input values to the Barrett reduction are the degree-63 polynomial
+     * in V1 (R(x)), degree-32 generator polynomial, and the reduction
+     * constant u.  The Barrett reduction result is the CRC value of R(x) mod
+     * P(x).
+     *
+     * The Barrett reduction algorithm is defined as:
+     *
+     *    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+     *    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+     *    3. C(x)  = R(x) XOR T2(x) mod x^32
+     *
+     *  Note: The leftmost doubleword of the ru_poly vector register
+     *  is zero and, thus, the intermediate GF(2) product
+     *  is zero and does not contribute to the final result.
+     */
+
+    /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
+    v2 = vec_unpackl((uv4si)v1);
+    v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
+
+    /*
+     * Compute the GF(2) product of the CRC polynomial with T1(x) in
+     * V2 and XOR the intermediate result, T2(x), with the value in V1.
+     * The final result is stored in word element 2 of V2.
+     */
+    v2 = vec_unpackl((uv4si)v2);
+    v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
+
+    return ((uv4si)v2)[2];
+}
diff --git a/crc32.c b/crc32.c
index 34132ea..dfa33ef 100644
--- a/crc32.c
+++ b/crc32.c
@@ -252,12 +252,54 @@ unsigned long crc32_vpmsum(unsigned long, const unsigned char FAR *, z_size_t);
 #endif
 #endif
 
+#ifdef HAVE_S390X_VX
+#include <sys/auxv.h>
+
+#define VX_MIN_LEN 64
+#define VX_ALIGNMENT 16L
+#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
+
+unsigned int crc32_le_vgfm_16(unsigned int crc, const unsigned char FAR *buf, z_size_t len);
+
+local unsigned long s390_crc32_vx(unsigned long crc, const unsigned char FAR *buf, z_size_t len)
+{
+    uint64_t prealign, aligned, remaining;
+
+    if (buf == Z_NULL) return 0UL;
+
+    if (len < VX_MIN_LEN + VX_ALIGN_MASK)
+        return crc32_big(crc, buf, len);
+
+    if ((uintptr_t)buf & VX_ALIGN_MASK) {
+        prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
+        len -= prealign;
+        crc = crc32_big(crc, buf, prealign);
+        buf += prealign;
+    }
+    aligned = len & ~VX_ALIGN_MASK;
+    remaining = len & VX_ALIGN_MASK;
+
+    crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, (size_t)aligned) ^ 0xffffffff;
+
+    if (remaining)
+        crc = crc32_big(crc, buf + aligned, remaining);
+
+    return crc;
+}
+#endif
+
 /* due to a quirk of gnu_indirect_function - "local" (aka static) is applied to
  * crc32_z which is not desired. crc32_z_ifunc is implictly "local" */
 #ifndef Z_IFUNC_ASM
 local
 #endif
-unsigned long (*(crc32_z_ifunc(void)))(unsigned long, const unsigned char FAR *, z_size_t)
+unsigned long (*(crc32_z_ifunc(
+#ifdef HAVE_S390X_VX /* same guard as the <sys/auxv.h> include; hwcap is only consulted by the VX check below */
+unsigned long hwcap
+#else
+void
+#endif
+)))(unsigned long, const unsigned char FAR *, z_size_t)
 {
 #if _ARCH_PWR8==1
 #if defined(__BUILTIN_CPU_SUPPORTS__)
@@ -269,6 +311,11 @@ unsigned long (*(crc32_z_ifunc(void)))(unsigned long, const unsigned char FAR *,
 #endif
 #endif /* _ARCH_PWR8 */
 
+#ifdef HAVE_S390X_VX
+    if (hwcap & HWCAP_S390_VX)
+        return s390_crc32_vx;
+#endif
+
 /* return a function pointer for optimized arches here */
 
 #ifdef DYNAMIC_CRC_TABLE
@@ -301,7 +348,11 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
     static unsigned long ZEXPORT (*crc32_func)(unsigned long, const unsigned char FAR *, z_size_t) = NULL;
 
     if (!crc32_func)
-        crc32_func = crc32_z_ifunc();
+        crc32_func = crc32_z_ifunc(
+#ifdef HAVE_S390X_VX /* getauxval()/AT_HWCAP are declared only when <sys/auxv.h> was included under this macro */
+            getauxval(AT_HWCAP)
+#endif
+        );
     return (*crc32_func)(crc, buf, len);
 }
     return (*crc32_func)(crc, buf, len);
 }
 
-- 
2.25.1
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`From 2dfdc5b7d6943c0ac60eef63e361e2a50f9da610 Mon Sep 17 00:00:00 2001`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`From: Ilya Leoshkevich <iii@linux.ibm.com>`
			`Date: Thu, 19 Mar 2020 11:52:03 +0100`
			`Subject: [PATCH] s390x: vectorize crc32`

			`Use vector extensions when compiling for s390x and binutils knows`
			`about them. At runtime, check whether kernel supports vector`
			`extensions (it has to be not just the CPU, but also the kernel) and`
			`choose between the regular and the vectorized implementations.`
			`---`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`Makefile.in \| 9 ++`
			`configure \| 28 ++++++`
			`contrib/s390/crc32-vx.c \| 195 ++++++++++++++++++++++++++++++++++++++++`
			`crc32.c \| 55 +++++++++++-`
			`4 files changed, 285 insertions(+), 2 deletions(-)`
			`create mode 100644 contrib/s390/crc32-vx.c`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00
			`diff --git a/Makefile.in b/Makefile.in`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`index 6070dcc..9e9743b 100644`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`--- a/Makefile.in`
			`+++ b/Makefile.in`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`@@ -29,6 +29,7 @@ LDFLAGS=`
			`TEST_LDFLAGS=-L. libz.a`
			`LDSHARED=$(CC)`
			`CPP=$(CC) -E`
			`+VGFMAFLAG=`

			`STATICLIB=libz.a`
			`SHAREDLIB=libz.so`
			`@@ -179,6 +180,9 @@ crc32_power8.o: $(SRCDIR)contrib/power8-crc/vec_crc32.c`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`crc32.o: $(SRCDIR)crc32.c`
			`$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c`

- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+crc32-vx.o: $(SRCDIR)contrib/s390/crc32-vx.c`
			`+ $(CC) $(CFLAGS) $(VGFMAFLAG) $(ZINC) -c -o $@ $(SRCDIR)contrib/s390/crc32-vx.c`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
			`deflate.o: $(SRCDIR)deflate.c`
			`$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c`

- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`@@ -234,6 +238,11 @@ crc32.lo: $(SRCDIR)crc32.c`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c`
			`-@mv objs/crc32.o $@`

- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+crc32-vx.lo: $(SRCDIR)contrib/s390/crc32-vx.c`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+ -@mkdir objs 2>/dev/null \|\| test -d objs`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+ $(CC) $(SFLAGS) $(VGFMAFLAG) $(ZINC) -DPIC -c -o objs/crc32-vx.o $(SRCDIR)contrib/s390/crc32-vx.c`
			`+ -@mv objs/crc32-vx.o $@`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
			`deflate.lo: $(SRCDIR)deflate.c`
			`-@mkdir objs 2>/dev/null \|\| test -d objs`
			`$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c`
			`diff --git a/configure b/configure`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`index 70ed86b..7941f75 100755`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`--- a/configure`
			`+++ b/configure`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`@@ -923,6 +923,32 @@ EOF`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`fi`
			`fi`

			`+# check if we are compiling for s390 and binutils support vector extensions`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+VGFMAFLAG=-march=z13`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+cat > $test.c <<EOF`
			`+#ifndef __s390__`
			`+#error`
			`+#endif`
			`+EOF`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+if try $CC -c $CFLAGS $VGFMAFLAG $test.c; then`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+ CFLAGS="$CFLAGS -DHAVE_S390X_VX"`
			`+ SFLAGS="$SFLAGS -DHAVE_S390X_VX"`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+ OBJC="$OBJC crc32-vx.o"`
			`+ PIC_OBJC="$PIC_OBJC crc32-vx.lo"`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+ echo "Checking for s390 vector extensions... Yes." \| tee -a configure.log`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+`
			`+ for flag in -mzarch -fzvector; do`
			`+ if try $CC -c $CFLAGS $VGFMAFLAG $flag $test.c; then`
			`+ VGFMAFLAG="$VGFMAFLAG $flag"`
			`+ echo "Checking for $flag... Yes." \| tee -a configure.log`
			`+ else`
			`+ echo "Checking for $flag... No." \| tee -a configure.log`
			`+ fi`
			`+ done`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+else`
			`+ echo "Checking for s390 vector extensions... No." \| tee -a configure.log`
			`+fi`
			`+`
			`# show the results in the log`
			`echo >> configure.log`
			`echo ALL = $ALL >> configure.log`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`@@ -955,6 +981,7 @@ echo mandir = $mandir >> configure.log`
			`echo prefix = $prefix >> configure.log`
			`echo sharedlibdir = $sharedlibdir >> configure.log`
			`echo uname = $uname >> configure.log`
			`+echo VGFMAFLAG = $VGFMAFLAG >> configure.log`

			`# udpate Makefile with the configure results`
			`sed < ${SRCDIR}Makefile.in "`
			`@@ -964,6 +991,7 @@ sed < ${SRCDIR}Makefile.in "`
			`/^LDFLAGS =/s#=.#=$LDFLAGS#`
			`/^LDSHARED =/s#=.#=$LDSHARED#`
			`/^CPP =/s#=.#=$CPP#`
			`+/^VGFMAFLAG =/s#=.#=$VGFMAFLAG#`
			`/^STATICLIB =/s#=.#=$STATICLIB#`
			`/^SHAREDLIB =/s#=.#=$SHAREDLIB#`
			`/^SHAREDLIBV =/s#=.#=$SHAREDLIBV#`
			`diff --git a/contrib/s390/crc32-vx.c b/contrib/s390/crc32-vx.c`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`new file mode 100644`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`index 0000000..fa5387c`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`--- /dev/null`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+++ b/contrib/s390/crc32-vx.c`
			`@@ -0,0 +1,195 @@`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+/*`
			`+ * Hardware-accelerated CRC-32 variants for Linux on z Systems`
			`+ *`
			`+ * Use the z/Architecture Vector Extension Facility to accelerate the`
			`+ * computing of bitreflected CRC-32 checksums.`
			`+ *`
			`+ * This CRC-32 implementation algorithm is bitreflected and processes`
			`+ * the least-significant bit first (Little-Endian).`
			`+ *`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+ * This code was originally written by Hendrik Brueckner`
			`+ * <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been`
			`+ * relicensed under the zlib license.`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+ */`
			`+`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+#include "../../zutil.h"`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+#include <stdint.h>`
			`+#include <vecintrin.h>`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+typedef unsigned char uv16qi __attribute__((vector_size(16)));`
			`+typedef unsigned int uv4si __attribute__((vector_size(16)));`
			`+typedef unsigned long long uv2di __attribute__((vector_size(16)));`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+uint32_t crc32_le_vgfm_16(uint32_t crc, const unsigned char *buf, size_t len) {`
			`+ /*`
			`+ * The CRC-32 constant block contains reduction constants to fold and`
			`+ * process particular chunks of the input data stream in parallel.`
			`+ *`
			`+ * For the CRC-32 variants, the constants are precomputed according to`
			`+ * these definitions:`
			`+ *`
			`+ * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1`
			`+ * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1`
			`+ * R3 = [(x128+32 mod P'(x) << 32)]' << 1`
			`+ * R4 = [(x128-32 mod P'(x) << 32)]' << 1`
			`+ * R5 = [(x64 mod P'(x) << 32)]' << 1`
			`+ * R6 = [(x32 mod P'(x) << 32)]' << 1`
			`+ *`
			`+ * The bitreflected Barret reduction constant, u', is defined as`
			`+ * the bit reversal of floor(x**64 / P(x)).`
			`+ *`
			`+ * where P(x) is the polynomial in the normal domain and the P'(x) is the`
			`+ * polynomial in the reversed (bitreflected) domain.`
			`+ *`
			`+ * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:`
			`+ *`
			`+ * P(x) = 0x04C11DB7`
			`+ * P'(x) = 0xEDB88320`
			`+ */`
			`+ const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */`
			`+ const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */`
			`+ const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */`
			`+ const uv2di r5 = {0, 0x163CD6124}; /* R5 */`
			`+ const uv2di ru_poly = {0, 0x1F7011641}; /* u' */`
			`+ const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
			`+ /*`
			`+ * Load the initial CRC value.`
			`+ *`
			`+ * The CRC value is loaded into the rightmost word of the`
			`+ * vector register and is later XORed with the LSB portion`
			`+ * of the loaded input data.`
			`+ */`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+ uv2di v0 = {0, 0};`
			`+ v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
			`+ /* Load a 64-byte data chunk and XOR with CRC */`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+ uv2di v1 = vec_perm(((uv2di )buf)[0], ((uv2di )buf)[0], perm_le2be);`
			`+ uv2di v2 = vec_perm(((uv2di )buf)[1], ((uv2di )buf)[1], perm_le2be);`
			`+ uv2di v3 = vec_perm(((uv2di )buf)[2], ((uv2di )buf)[2], perm_le2be);`
			`+ uv2di v4 = vec_perm(((uv2di )buf)[3], ((uv2di )buf)[3], perm_le2be);`
			`+`
			`+ v1 ^= v0;`
			`+ buf += 64;`
			`+ len -= 64;`
			`+`
			`+ while (len >= 64) {`
			`+ /* Load the next 64-byte data chunk */`
			`+ uv16qi part1 = vec_perm(((uv16qi )buf)[0], ((uv16qi )buf)[0], perm_le2be);`
			`+ uv16qi part2 = vec_perm(((uv16qi )buf)[1], ((uv16qi )buf)[1], perm_le2be);`
			`+ uv16qi part3 = vec_perm(((uv16qi )buf)[2], ((uv16qi )buf)[2], perm_le2be);`
			`+ uv16qi part4 = vec_perm(((uv16qi )buf)[3], ((uv16qi )buf)[3], perm_le2be);`
			`+`
			`+ /*`
			`+ * Perform a GF(2) multiplication of the doublewords in V1 with`
			`+ * the R1 and R2 reduction constants in V0. The intermediate result`
			`+ * is then folded (accumulated) with the next data chunk in PART1 and`
			`+ * stored in V1. Repeat this step for the register contents`
			`+ * in V2, V3, and V4 respectively.`
			`+ */`
			`+ v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);`
			`+ v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);`
			`+ v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);`
			`+ v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);`
			`+`
			`+ buf += 64;`
			`+ len -= 64;`
			`+ }`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
			`+ /*`
			`+ * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3`
			`+ * and R4 and accumulating the next 128-bit chunk until a single 128-bit`
			`+ * value remains.`
			`+ */`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);`
			`+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);`
			`+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+ while (len >= 16) {`
			`+ /* Load next data chunk */`
			`+ v2 = vec_perm((uv2di )buf, (uv2di )buf, perm_le2be);`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+ /* Fold next data chunk */`
			`+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+ buf += 16;`
			`+ len -= 16;`
			`+ }`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
			`+ /*`
			`+ * Set up a vector register for byte shifts. The shift value must`
			`+ * be loaded in bits 1-4 in byte element 7 of a vector register.`
			`+ * Shift by 8 bytes: 0x40`
			`+ * Shift by 4 bytes: 0x20`
			`+ */`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+ uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};`
			`+ v9 = vec_insert((unsigned char)0x40, v9, 7);`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
			`+ /*`
			`+ * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes`
			`+ * to move R4 into the rightmost doubleword and set the leftmost`
			`+ * doubleword to 0x1.`
			`+ */`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+ v0 = vec_srb(r4r3, (uv2di)v9);`
			`+ v0[0] = 1;`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
			`+ /*`
			`+ * Compute GF(2) product of V1 and V0. The rightmost doubleword`
			`+ * of V1 is multiplied with R4. The leftmost doubleword of V1 is`
			`+ * multiplied by 0x1 and is then XORed with rightmost product.`
			`+ * Implicitly, the intermediate leftmost product becomes padded`
			`+ */`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+ v1 = (uv2di)vec_gfmsum_128(v0, v1);`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
			`+ /*`
			`+ * Now do the final 32-bit fold by multiplying the rightmost word`
			`+ * in V1 with R5 and XOR the result with the remaining bits in V1.`
			`+ *`
			`+ * To achieve this by a single VGFMAG, right shift V1 by a word`
			`+ * and store the result in V2 which is then accumulated. Use the`
			`+ * vector unpack instruction to load the rightmost half of the`
			`+ * doubleword into the rightmost doubleword element of V1; the other`
			`+ * half is loaded in the leftmost doubleword.`
			`+ * The vector register with CONST_R5 contains the R5 constant in the`
			`+ * rightmost doubleword and the leftmost doubleword is zero to ignore`
			`+ * the leftmost product of V1.`
			`+ */`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+ v9 = vec_insert((unsigned char)0x20, v9, 7);`
			`+ v2 = vec_srb(v1, (uv2di)v9);`
			`+ v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */`
			`+ v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
			`+ /*`
			`+ * Apply a Barret reduction to compute the final 32-bit CRC value.`
			`+ *`
			`+ * The input values to the Barret reduction are the degree-63 polynomial`
			`+ * in V1 (R(x)), degree-32 generator polynomial, and the reduction`
			`+ * constant u. The Barret reduction result is the CRC value of R(x) mod`
			`+ * P(x).`
			`+ *`
			`+ * The Barret reduction algorithm is defined as:`
			`+ *`
			`+ * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u`
			`+ * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)`
			`+ * 3. C(x) = R(x) XOR T2(x) mod x^32`
			`+ *`
			`+ * Note: The leftmost doubleword of vector register containing`
			`+ * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product`
			`+ * is zero and does not contribute to the final result.`
			`+ */`
			`+`
			`+ /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+ v2 = vec_unpackl((uv4si)v1);`
			`+ v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
			`+ /*`
			`+ * Compute the GF(2) product of the CRC polynomial with T1(x) in`
			`+ * V2 and XOR the intermediate result, T2(x), with the value in V1.`
			`+ * The final result is stored in word element 2 of V2.`
			`+ */`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+ v2 = vec_unpackl((uv4si)v2);`
			`+ v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+ return ((uv4si)v2)[2];`
			`+}`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`diff --git a/crc32.c b/crc32.c`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`index 34132ea..dfa33ef 100644`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`--- a/crc32.c`
			`+++ b/crc32.c`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`@@ -252,12 +252,54 @@ unsigned long crc32_vpmsum(unsigned long, const unsigned char FAR *, z_size_t);`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`#endif`
			`#endif`

			`+#ifdef HAVE_S390X_VX`
			`+#include <sys/auxv.h>`
			`+`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+#define VX_MIN_LEN 64`
			`+#define VX_ALIGNMENT 16L`
			`+#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)`
			`+`
			`+unsigned int crc32_le_vgfm_16(unsigned int crc, const unsigned char FAR *buf, z_size_t len);`
			`+`
			`+local unsigned long s390_crc32_vx(unsigned long crc, const unsigned char FAR *buf, z_size_t len)`
			`+{`
			`+ uint64_t prealign, aligned, remaining;`
			`+`
			`+ if (buf == Z_NULL) return 0UL;`
			`+`
			`+ if (len < VX_MIN_LEN + VX_ALIGN_MASK)`
			`+ return crc32_big(crc, buf, len);`
			`+`
			`+ if ((uintptr_t)buf & VX_ALIGN_MASK) {`
			`+ prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);`
			`+ len -= prealign;`
			`+ crc = crc32_big(crc, buf, prealign);`
			`+ buf += prealign;`
			`+ }`
			`+ aligned = len & ~VX_ALIGN_MASK;`
			`+ remaining = len & VX_ALIGN_MASK;`
			`+`
			`+ crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, (size_t)aligned) ^ 0xffffffff;`
			`+`
			`+ if (remaining)`
			`+ crc = crc32_big(crc, buf + aligned, remaining);`
			`+`
			`+ return crc;`
			`+}`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+#endif`
			`+`
			`/* due to a quirk of gnu_indirect_function - "local" (aka static) is applied to`
			`* crc32_z which is not desired. crc32_z_ifunc is implictly "local" */`
			`#ifndef Z_IFUNC_ASM`
			`local`
			`#endif`
			`-unsigned long ((crc32_z_ifunc(void)))(unsigned long, const unsigned char FAR , z_size_t)`
			`+unsigned long (*(crc32_z_ifunc(`
			`+#ifdef __s390__`
			`+unsigned long hwcap`
			`+#else`
			`+void`
			`+#endif`
			`+)))(unsigned long, const unsigned char FAR *, z_size_t)`
			`{`
			`#if _ARCH_PWR8==1`
			`#if defined(__BUILTIN_CPU_SUPPORTS__)`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`@@ -269,6 +311,11 @@ unsigned long ((crc32_z_ifunc(void)))(unsigned long, const unsigned char FAR ,`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`#endif`
			`#endif /* _ARCH_PWR8 */`

			`+#ifdef HAVE_S390X_VX`
			`+ if (hwcap & HWCAP_S390_VX)`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`+ return s390_crc32_vx;`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`+#endif`
			`+`
			`/* return a function pointer for optimized arches here */`

			`#ifdef DYNAMIC_CRC_TABLE`
- Fix for IBM CRC32 optimalization rhbz#1959423 2021-07-29 10:30:59 +00:00			`@@ -301,7 +348,11 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)`
Add IBM CRC32 optimalization patch for s390x 2021-07-15 07:17:35 +00:00			`static unsigned long ZEXPORT (crc32_func)(unsigned long, const unsigned char FAR , z_size_t) = NULL;`

			`if (!crc32_func)`
			`- crc32_func = crc32_z_ifunc();`
			`+ crc32_func = crc32_z_ifunc(`
			`+#ifdef __s390__`
			`+ getauxval(AT_HWCAP)`
			`+#endif`
			`+ );`
			`return (*crc32_func)(crc, buf, len);`
			`}`

			`--`
			`2.25.1`