- Fix for IBM CRC32 optimization rhbz#1959423

2021-07-29 12:30:59 +02:00 · 2021-07-29 12:30:59 +02:00 · c739e57fe8
commit c739e57fe8
parent daa1e929c7
2 changed files with 194 additions and 253 deletions
--- a/zlib-1.2.11-s390x-vectorize-crc32.patch
+++ b/zlib-1.2.11-s390x-vectorize-crc32.patch
@ -1,4 +1,4 @@
-From 367e79caf76bda5fdb974420b72c6ddabdcd664e Mon Sep 17 00:00:00 2001
+From 2dfdc5b7d6943c0ac60eef63e361e2a50f9da610 Mon Sep 17 00:00:00 2001
 From: Ilya Leoshkevich <iii@linux.ibm.com>
 Date: Thu, 19 Mar 2020 11:52:03 +0100
 Subject: [PATCH] s390x: vectorize crc32
@ -8,59 +8,77 @@ about them. At runtime, check whether kernel supports vector
 extensions (it has to be not just the CPU, but also the kernel) and
 choose between the regular and the vectorized implementations.
 ---
- Makefile.in               |   8 ++
+ Makefile.in             |   9 ++
- configure                 |  16 +++
+ configure               |  28 ++++++
- contrib/s390/crc32le-vx.S | 273 ++++++++++++++++++++++++++++++++++++++
+ contrib/s390/crc32-vx.c | 195 ++++++++++++++++++++++++++++++++++++++++
- crc32.c                   |  66 ++++++++-
+ crc32.c                 |  55 +++++++++++-
- 4 files changed, 361 insertions(+), 2 deletions(-)
+ 4 files changed, 285 insertions(+), 2 deletions(-)
- create mode 100644 contrib/s390/crc32le-vx.S
+ create mode 100644 contrib/s390/crc32-vx.c
 diff --git a/Makefile.in b/Makefile.in
-index 6070dcc..23e8694 100644
+index 6070dcc..9e9743b 100644
 --- a/Makefile.in
 +++ b/Makefile.in
-@@ -179,6 +179,9 @@ crc32_power8.o: $(SRCDIR)contrib/power8-crc/vec_crc32.c
+@@ -29,6 +29,7 @@ LDFLAGS=
 TEST_LDFLAGS=-L. libz.a
 LDSHARED=$(CC)
 CPP=$(CC) -E
 +VGFMAFLAG=
 STATICLIB=libz.a
 SHAREDLIB=libz.so
@@ -179,6 +180,9 @@ crc32_power8.o: $(SRCDIR)contrib/power8-crc/vec_crc32.c
 crc32.o: $(SRCDIR)crc32.c
 	$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c
-+crc32le-vx.o: $(SRCDIR)contrib/s390/crc32le-vx.S
+crc32-vx.o: $(SRCDIR)contrib/s390/crc32-vx.c
-+	$(CC) $(CFLAGS) -march=z13 $(ZINC) -c -o $@ $(SRCDIR)contrib/s390/crc32le-vx.S
+	$(CC) $(CFLAGS) $(VGFMAFLAG) $(ZINC) -c -o $@ $(SRCDIR)contrib/s390/crc32-vx.c
 +
 deflate.o: $(SRCDIR)deflate.c
 	$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c
-@@ -234,6 +237,11 @@ crc32.lo: $(SRCDIR)crc32.c
+@@ -234,6 +238,11 @@ crc32.lo: $(SRCDIR)crc32.c
 	$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c
 	-@mv objs/crc32.o $@
-+crc32le-vx.lo: $(SRCDIR)contrib/s390/crc32le-vx.S
+crc32-vx.lo: $(SRCDIR)contrib/s390/crc32-vx.c
 +	-@mkdir objs 2>/dev/null || test -d objs
-+	$(CC) $(SFLAGS) -march=z13 $(ZINC) -DPIC -c -o objs/crc32le-vx.o $(SRCDIR)contrib/s390/crc32le-vx.S
+	$(CC) $(SFLAGS) $(VGFMAFLAG) $(ZINC) -DPIC -c -o objs/crc32-vx.o $(SRCDIR)contrib/s390/crc32-vx.c
-+	-@mv objs/crc32le-vx.o $@
+	-@mv objs/crc32-vx.o $@
 +
 deflate.lo: $(SRCDIR)deflate.c
 	-@mkdir objs 2>/dev/null || test -d objs
 	$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c
 diff --git a/configure b/configure
-index 70ed86b..e658039 100755
+index 70ed86b..7941f75 100755
 --- a/configure
 +++ b/configure
-@@ -923,6 +923,22 @@ EOF
+@@ -923,6 +923,32 @@ EOF
   fi
 fi
 +# check if we are compiling for s390 and binutils support vector extensions
 +VGFMAFLAG=-march=z13
 +cat > $test.c <<EOF
 +#ifndef __s390__
 +#error
 +#endif
 +EOF
-+if try $CC -c $CFLAGS -march=z13 $test.c; then
+if try $CC -c $CFLAGS $VGFMAFLAG $test.c; then
 +  CFLAGS="$CFLAGS -DHAVE_S390X_VX"
 +  SFLAGS="$SFLAGS -DHAVE_S390X_VX"
-+  OBJC="$OBJC crc32le-vx.o"
+  OBJC="$OBJC crc32-vx.o"
-+  PIC_OBJC="$PIC_OBJC crc32le-vx.lo"
+  PIC_OBJC="$PIC_OBJC crc32-vx.lo"
 +  echo "Checking for s390 vector extensions... Yes." | tee -a configure.log
 +
 +  for flag in -mzarch -fzvector; do
 +    if try $CC -c $CFLAGS $VGFMAFLAG $flag $test.c; then
 +      VGFMAFLAG="$VGFMAFLAG $flag"
 +      echo "Checking for $flag... Yes." | tee -a configure.log
 +    else
 +      echo "Checking for $flag... No." | tee -a configure.log
 +    fi
 +  done
 +else
 +  echo "Checking for s390 vector extensions... No." | tee -a configure.log
 +fi
@ -68,12 +86,28 @@ index 70ed86b..e658039 100755
 # show the results in the log
 echo >> configure.log
 echo ALL = $ALL >> configure.log
-diff --git a/contrib/s390/crc32le-vx.S b/contrib/s390/crc32le-vx.S
+@@ -955,6 +981,7 @@ echo mandir = $mandir >> configure.log
 echo prefix = $prefix >> configure.log
 echo sharedlibdir = $sharedlibdir >> configure.log
 echo uname = $uname >> configure.log
 +echo VGFMAFLAG = $VGFMAFLAG >> configure.log
 # udpate Makefile with the configure results
 sed < ${SRCDIR}Makefile.in "
@@ -964,6 +991,7 @@ sed < ${SRCDIR}Makefile.in "
 /^LDFLAGS *=/s#=.*#=$LDFLAGS#
 /^LDSHARED *=/s#=.*#=$LDSHARED#
 /^CPP *=/s#=.*#=$CPP#
 +/^VGFMAFLAG *=/s#=.*#=$VGFMAFLAG#
 /^STATICLIB *=/s#=.*#=$STATICLIB#
 /^SHAREDLIB *=/s#=.*#=$SHAREDLIB#
 /^SHAREDLIBV *=/s#=.*#=$SHAREDLIBV#
 diff --git a/contrib/s390/crc32-vx.c b/contrib/s390/crc32-vx.c
 new file mode 100644
-index 0000000..029cfff
+index 0000000..fa5387c
 --- /dev/null
-+++ b/contrib/s390/crc32le-vx.S
+++ b/contrib/s390/crc32-vx.c
-@@ -0,0 +1,273 @@
+@@ -0,0 +1,195 @@
 +/*
 + * Hardware-accelerated CRC-32 variants for Linux on z Systems
 + *
@ -83,113 +117,52 @@ index 0000000..029cfff
 + * This CRC-32 implementation algorithm is bitreflected and processes
 + * the least-significant bit first (Little-Endian).
 + *
-+ * This code has been originally written by Hendrik Brueckner
+ * This code was originally written by Hendrik Brueckner
-+ * <brueckner@linux.vnet.ibm.com> and included in the Linux kernel:
+ * <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
-+ *
+ * relicensed under the zlib license.
 + * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/s390/crypto/crc32le-vx.S?h=v5.5
 + *
 + * Hendrik Brueckner has allowed reusing it under zlib license.
 + *
 + * The following adjustments were made:
 + *
 + * - Reformatted in order to match the zlib code style.
 + * - Changed the vector register numbers in order to avoid clobbering the call-saved %v8-%v16.
 + * - Fixed clang compatibility.
 + * - Added 31-bit compatibility.
 + */
 +
-+#ifndef __clang__
+#include "../../zutil.h"
 +.machinemode zarch
 +#endif
 +
-+#define PART1                   %v16
+#include <stdint.h>
-+#define PART2                   %v17
+#include <vecintrin.h>
 +#define PART3                   %v18
 +#define PART4                   %v19
 +#define SHIFTS                  %v20
 +
-+/* Vector register range containing CRC-32 constants */
+typedef unsigned char uv16qi __attribute__((vector_size(16)));
-+#define CONST_PERM_LE2BE        %v21
+typedef unsigned int uv4si __attribute__((vector_size(16)));
-+#define CONST_R2R1              %v22
+typedef unsigned long long uv2di __attribute__((vector_size(16)));
 +#define CONST_R4R3              %v23
 +#define CONST_R5                %v24
 +#define CONST_RU_POLY           %v25
 +#define CONST_CRC_POLY          %v26
 +
-+#if defined(__s390x__)
+uint32_t crc32_le_vgfm_16(uint32_t crc, const unsigned char *buf, size_t len) {
-+#define AGHI aghi
+    /*
-+#define CGHI cghi
+     * The CRC-32 constant block contains reduction constants to fold and
-+#else
+     * process particular chunks of the input data stream in parallel.
-+#define AGHI ahi
+     *
-+#define CGHI chi
+     * For the CRC-32 variants, the constants are precomputed according to
-+#endif
+     * these definitions:
-+
+     *
-+.data
+     *      R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
-+.align 8
+     *      R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
-+
+     *      R3 = [(x128+32 mod P'(x) << 32)]'   << 1
-+/*
+     *      R4 = [(x128-32 mod P'(x) << 32)]'   << 1
-+ * The CRC-32 constant block contains reduction constants to fold and
+     *      R5 = [(x64 mod P'(x) << 32)]'       << 1
-+ * process particular chunks of the input data stream in parallel.
+     *      R6 = [(x32 mod P'(x) << 32)]'       << 1
-+ *
+     *
-+ * For the CRC-32 variants, the constants are precomputed according to
+     *      The bitreflected Barret reduction constant, u', is defined as
-+ * these definitions:
+     *      the bit reversal of floor(x**64 / P(x)).
-+ *
+     *
-+ *      R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
+     *      where P(x) is the polynomial in the normal domain and the P'(x) is the
-+ *      R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
+     *      polynomial in the reversed (bitreflected) domain.
-+ *      R3 = [(x128+32 mod P'(x) << 32)]'   << 1
+     *
-+ *      R4 = [(x128-32 mod P'(x) << 32)]'   << 1
+     * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
-+ *      R5 = [(x64 mod P'(x) << 32)]'       << 1
+     *
-+ *      R6 = [(x32 mod P'(x) << 32)]'       << 1
+     *      P(x)  = 0x04C11DB7
-+ *
+     *      P'(x) = 0xEDB88320
-+ *      The bitreflected Barret reduction constant, u', is defined as
+     */
-+ *      the bit reversal of floor(x**64 / P(x)).
+    const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};  /* BE->LE mask */
-+ *
+    const uv2di r2r1 = {0x1C6E41596, 0x154442BD4};                                     /* R2, R1 */
-+ *      where P(x) is the polynomial in the normal domain and the P'(x) is the
+    const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0};                                     /* R4, R3 */
-+ *      polynomial in the reversed (bitreflected) domain.
+    const uv2di r5 = {0, 0x163CD6124};                                                 /* R5 */
-+ *
+    const uv2di ru_poly = {0, 0x1F7011641};                                            /* u' */
-+ * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
+    const uv2di crc_poly = {0, 0x1DB710641};                                           /* P'(x) << 1 */
 + *
 + *      P(x)  = 0x04C11DB7
 + *      P'(x) = 0xEDB88320
 + */
 +
 +.Lconstants_CRC_32_LE:
 +    .octa           0x0F0E0D0C0B0A09080706050403020100      # BE->LE mask
 +    .quad           0x1c6e41596, 0x154442bd4                # R2, R1
 +    .quad           0x0ccaa009e, 0x1751997d0                # R4, R3
 +    .octa           0x163cd6124                             # R5
 +    .octa           0x1F7011641                             # u'
 +    .octa           0x1DB710641                             # P'(x) << 1
 +
 +.text
 +
 +/*
 + * The CRC-32 functions use these calling conventions:
 + *
 + * Parameters:
 + *
 + *      %r2:    Initial CRC value, typically ~0; and final CRC (return) value.
 + *      %r3:    Input buffer pointer, performance might be improved if the
 + *              buffer is on a doubleword boundary.
 + *      %r4:    Length of the buffer, must be 64 bytes or greater.
 + *
 + * Register usage:
 + *
 + *      %r5:      CRC-32 constant pool base pointer.
 + *      V0:       Initial CRC value and intermediate constants and results.
 + *      V1..V4:   Data for CRC computation.
 + *      V16..V19: Next data chunks that are fetched from the input buffer.
 + *      V20:      Constant for BE->LE conversion and shift operations
 + *
 + *      V21..V26: CRC-32 constants.
 + */
 +
 +    .globl crc32_le_vgfm_16
 +    .align 4, 0x07
 +crc32_le_vgfm_16:
 +    /* Load CRC-32 constants */
 +    larl    %r5,.Lconstants_CRC_32_LE
 +    VLM     CONST_PERM_LE2BE,CONST_CRC_POLY,0(%r5)
 +
 +    /*
 +     * Load the initial CRC value.
@ -198,90 +171,78 @@ index 0000000..029cfff
 +     * vector register and is later XORed with the LSB portion
 +     * of the loaded input data.
 +     */
-+    VZERO   %v0                     /* Clear V0 */
+    uv2di v0 = {0, 0};
-+    VLVGF   %v0,%r2,3               /* Load CRC into rightmost word */
+    v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
 +
 +    /* Load a 64-byte data chunk and XOR with CRC */
-+    VLM     %v1,%v4,0(%r3)          /* 64-bytes into V1..V4 */
+    uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
-+    VPERM   %v1,%v1,%v1,CONST_PERM_LE2BE
+    uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
-+    VPERM   %v2,%v2,%v2,CONST_PERM_LE2BE
+    uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
-+    VPERM   %v3,%v3,%v3,CONST_PERM_LE2BE
+    uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
 +    VPERM   %v4,%v4,%v4,CONST_PERM_LE2BE
 +
-+    VX      %v1,%v0,%v1             /* V1 ^= CRC */
+    v1 ^= v0;
-+    AGHI    %r3,64                  /* BUF = BUF + 64 */
+    buf += 64;
-+    AGHI    %r4,-64                 /* LEN = LEN - 64 */
+    len -= 64;
 +
-+    CGHI    %r4,64
+    while (len >= 64) {
-+    jl      .Lless_than_64bytes
+        /* Load the next 64-byte data chunk */
 +        uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
 +        uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
 +        uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
 +        uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
 +
-+.Lfold_64bytes_loop:
+        /*
-+    /* Load the next 64-byte data chunk into PART1 to PART4 */
+         * Perform a GF(2) multiplication of the doublewords in V1 with
-+    VLM     PART1,PART4,0(%r3)
+         * the R1 and R2 reduction constants in V0.  The intermediate result
-+    VPERM   PART1,PART1,PART1,CONST_PERM_LE2BE
+         * is then folded (accumulated) with the next data chunk in PART1 and
-+    VPERM   PART2,PART2,PART2,CONST_PERM_LE2BE
+         * stored in V1. Repeat this step for the register contents
-+    VPERM   PART3,PART3,PART3,CONST_PERM_LE2BE
+         * in V2, V3, and V4 respectively.
-+    VPERM   PART4,PART4,PART4,CONST_PERM_LE2BE
+         */
 +        v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
 +        v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
 +        v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
 +        v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
 +
-+    /*
+        buf += 64;
-+     * Perform a GF(2) multiplication of the doublewords in V1 with
+        len -= 64;
-+     * the R1 and R2 reduction constants in V0.  The intermediate result
+    }
 +     * is then folded (accumulated) with the next data chunk in PART1 and
 +     * stored in V1. Repeat this step for the register contents
 +     * in V2, V3, and V4 respectively.
 +     */
 +    VGFMAG  %v1,CONST_R2R1,%v1,PART1
 +    VGFMAG  %v2,CONST_R2R1,%v2,PART2
 +    VGFMAG  %v3,CONST_R2R1,%v3,PART3
 +    VGFMAG  %v4,CONST_R2R1,%v4,PART4
 +
 +    AGHI    %r3,64                  /* BUF = BUF + 64 */
 +    AGHI    %r4,-64                 /* LEN = LEN - 64 */
 +
 +    CGHI    %r4,64
 +    jnl     .Lfold_64bytes_loop
 +
 +.Lless_than_64bytes:
 +    /*
 +     * Fold V1 to V4 into a single 128-bit value in V1.  Multiply V1 with R3
 +     * and R4 and accumulating the next 128-bit chunk until a single 128-bit
 +     * value remains.
 +     */
-+    VGFMAG  %v1,CONST_R4R3,%v1,%v2
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
-+    VGFMAG  %v1,CONST_R4R3,%v1,%v3
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
-+    VGFMAG  %v1,CONST_R4R3,%v1,%v4
+    v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
 +
-+    CGHI    %r4,16
+    while (len >= 16) {
-+    jl      .Lfinal_fold
+        /* Load next data chunk */
 +        v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
 +
-+.Lfold_16bytes_loop:
+        /* Fold next data chunk */
 +        v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
 +
-+    VL      %v2,0(%r3)              /* Load next data chunk */
+        buf += 16;
-+    VPERM   %v2,%v2,%v2,CONST_PERM_LE2BE
+        len -= 16;
-+    VGFMAG  %v1,CONST_R4R3,%v1,%v2  /* Fold next data chunk */
+    }
 +
 +    AGHI    %r3,16
 +    AGHI    %r4,-16
 +
 +    CGHI    %r4,16
 +    jnl     .Lfold_16bytes_loop
 +
 +.Lfinal_fold:
 +    /*
 +     * Set up a vector register for byte shifts.  The shift value must
 +     * be loaded in bits 1-4 in byte element 7 of a vector register.
 +     * Shift by 8 bytes: 0x40
 +     * Shift by 4 bytes: 0x20
 +     */
-+    VLEIB   SHIFTS,0x40,7
+    uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 +    v9 = vec_insert((unsigned char)0x40, v9, 7);
 +
 +    /*
 +     * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
 +     * to move R4 into the rightmost doubleword and set the leftmost
 +     * doubleword to 0x1.
 +     */
-+    VSRLB   %v0,CONST_R4R3,SHIFTS
+    v0 = vec_srb(r4r3, (uv2di)v9);
-+    VLEIG   %v0,1,0
+    v0[0] = 1;
 +
 +    /*
 +     * Compute GF(2) product of V1 and V0.  The rightmost doubleword
@ -289,7 +250,7 @@ index 0000000..029cfff
 +     * multiplied by 0x1 and is then XORed with rightmost product.
 +     * Implicitly, the intermediate leftmost product becomes padded
 +     */
-+    VGFMG   %v1,%v0,%v1
+    v1 = (uv2di)vec_gfmsum_128(v0, v1);
 +
 +    /*
 +     * Now do the final 32-bit fold by multiplying the rightmost word
@ -304,10 +265,10 @@ index 0000000..029cfff
 +     * rightmost doubleword and the leftmost doubleword is zero to ignore
 +     * the leftmost product of V1.
 +     */
-+    VLEIB   SHIFTS,0x20,7             /* Shift by words */
+    v9 = vec_insert((unsigned char)0x20, v9, 7);
-+    VSRLB   %v2,%v1,SHIFTS            /* Store remaining bits in V2 */
+    v2 = vec_srb(v1, (uv2di)v9);
-+    VUPLLF  %v1,%v1                   /* Split rightmost doubleword */
+    v1 = vec_unpackl((uv4si)v1);  /* Split rightmost doubleword */
-+    VGFMAG  %v1,CONST_R5,%v1,%v2      /* V1 = (V1 * R5) XOR V2 */
+    v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
 +
 +    /*
 +     * Apply a Barret reduction to compute the final 32-bit CRC value.
@ -329,38 +290,61 @@ index 0000000..029cfff
 +     */
 +
 +    /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
-+    VUPLLF  %v2,%v1
+    v2 = vec_unpackl((uv4si)v1);
-+    VGFMG   %v2,CONST_RU_POLY,%v2
+    v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
 +
 +    /*
 +     * Compute the GF(2) product of the CRC polynomial with T1(x) in
 +     * V2 and XOR the intermediate result, T2(x), with the value in V1.
 +     * The final result is stored in word element 2 of V2.
 +     */
-+    VUPLLF  %v2,%v2
+    v2 = vec_unpackl((uv4si)v2);
-+    VGFMAG  %v2,CONST_CRC_POLY,%v2,%v1
+    v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
 +
-+.Ldone:
+    return ((uv4si)v2)[2];
-+    VLGVF   %r2,%v2,2
+}
 +    BR      %r14
 +    .type crc32_le_vgfm_16, @function
 +    .size crc32_le_vgfm_16, .-crc32_le_vgfm_16
 +
 +.previous
 diff --git a/crc32.c b/crc32.c
-index 34132ea..af5d3cd 100644
+index 34132ea..dfa33ef 100644
 --- a/crc32.c
 +++ b/crc32.c
-@@ -252,12 +252,26 @@ unsigned long crc32_vpmsum(unsigned long, const unsigned char FAR *, z_size_t);
+@@ -252,12 +252,54 @@ unsigned long crc32_vpmsum(unsigned long, const unsigned char FAR *, z_size_t);
 #endif
 #endif
 +#ifdef HAVE_S390X_VX
 +#include <sys/auxv.h>
 +
-+local unsigned long crc32_s390_vx(unsigned long crc,
+#define VX_MIN_LEN 64
-+                                  const unsigned char FAR *buf,
+#define VX_ALIGNMENT 16L
-+                                  z_size_t len);
+#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
 +
 +unsigned int crc32_le_vgfm_16(unsigned int crc, const unsigned char FAR *buf, z_size_t len);
 +
 +local unsigned long s390_crc32_vx(unsigned long crc, const unsigned char FAR *buf, z_size_t len)
 +{
 +    uint64_t prealign, aligned, remaining;
 +
 +    if (buf == Z_NULL) return 0UL;
 +
 +    if (len < VX_MIN_LEN + VX_ALIGN_MASK)
 +        return crc32_big(crc, buf, len);
 +
 +    if ((uintptr_t)buf & VX_ALIGN_MASK) {
 +        prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
 +        len -= prealign;
 +        crc = crc32_big(crc, buf, prealign);
 +        buf += prealign;
 +    }
 +    aligned = len & ~VX_ALIGN_MASK;
 +    remaining = len & VX_ALIGN_MASK;
 +
 +    crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, (size_t)aligned) ^ 0xffffffff;
 +
 +    if (remaining)
 +        crc = crc32_big(crc, buf + aligned, remaining);
 +
 +    return crc;
 +}
 +#endif
 +
 /* due to a quirk of gnu_indirect_function - "local" (aka static) is applied to
@ -379,19 +363,19 @@ index 34132ea..af5d3cd 100644
 {
 #if _ARCH_PWR8==1
 #if defined(__BUILTIN_CPU_SUPPORTS__)
-@@ -269,6 +283,11 @@ unsigned long (*(crc32_z_ifunc(void)))(unsigned long, const unsigned char FAR *,
+@@ -269,6 +311,11 @@ unsigned long (*(crc32_z_ifunc(void)))(unsigned long, const unsigned char FAR *,
 #endif
 #endif /* _ARCH_PWR8 */
 +#ifdef HAVE_S390X_VX
 +    if (hwcap & HWCAP_S390_VX)
-+        return crc32_s390_vx;
+        return s390_crc32_vx;
 +#endif
 +
 /* return a function pointer for optimized arches here */
 #ifdef DYNAMIC_CRC_TABLE
-@@ -301,7 +320,11 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
+@@ -301,7 +348,11 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
     static unsigned long ZEXPORT (*crc32_func)(unsigned long, const unsigned char FAR *, z_size_t) = NULL;
     if (!crc32_func)
@ -404,52 +388,6 @@ index 34132ea..af5d3cd 100644
     return (*crc32_func)(crc, buf, len);
 }
@@ -500,6 +523,45 @@ local uLong crc32_combine_(crc1, crc2, len2)
     return crc1;
 }
 +#ifdef HAVE_S390X_VX
 +#define VX_MIN_LEN 64
 +#define VX_ALIGNMENT 16L
 +#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
 +
 +unsigned int crc32_le_vgfm_16(unsigned int crc,
 +                              unsigned char const *buf,
 +                              size_t size);
 +
 +local unsigned long crc32_s390_vx(crc, buf, len)
 +    unsigned long crc;
 +    const unsigned char FAR *buf;
 +    z_size_t len;
 +{
 +    unsigned long prealign, aligned, remaining;
 +
 +    if (buf == Z_NULL) return 0UL;
 +
 +    if (len < VX_MIN_LEN + VX_ALIGN_MASK)
 +        return crc32_big(crc, buf, len);
 +
 +    if ((unsigned long)buf & VX_ALIGN_MASK) {
 +        prealign = VX_ALIGNMENT - ((unsigned long)buf & VX_ALIGN_MASK);
 +        len -= prealign;
 +        crc = crc32_big(crc, buf, prealign);
 +        buf = (void *)((unsigned long)buf + prealign);
 +    }
 +    aligned = len & ~VX_ALIGN_MASK;
 +    remaining = len & VX_ALIGN_MASK;
 +
 +    crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, aligned) ^ 0xffffffff;
 +
 +    if (remaining)
 +        crc = crc32_big(crc, buf + aligned, remaining);
 +
 +    return crc;
 +}
 +#endif
 +
 /* ========================================================================= */
 uLong ZEXPORT crc32_combine(crc1, crc2, len2)
     uLong crc1;
 -- 
 2.25.1
--- a/zlib.spec
+++ b/zlib.spec
@ -2,7 +2,7 @@
 Name:    zlib
 Version: 1.2.11
-Release: 28%{?dist}
+Release: 30%{?dist}
 Summary: Compression and decompression library
 # /contrib/dotzlib/ have Boost license
 License: zlib and Boost
@ -180,6 +180,9 @@ find $RPM_BUILD_ROOT -name '*.la' -delete
 %changelog
 * Thu Jul 29 2021 Dan Horák <dan[at]danny.cz> - 1.2.11-30
 - Fix for IBM CRC32 optimization rhbz#1959423
 * Fri Jul 23 2021 Fedora Release Engineering <releng@fedoraproject.org> - 1.2.11-28
 - Rebuilt for https://fedoraproject.org/wiki/Fedora_35_Mass_Rebuild