aarch64 optimizations
This commit is contained in:
parent
4d2785ec31
commit
25e9802713
157
0001-Neon-Optimized-hash-chain-rebase.patch
Normal file
157
0001-Neon-Optimized-hash-chain-rebase.patch
Normal file
@ -0,0 +1,157 @@
|
|||||||
|
From f849a23e0afc8b8a670fda64eec8b573fe62daa7 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
|
||||||
|
Date: Mon, 9 Apr 2018 13:52:17 -0700
|
||||||
|
Subject: [PATCH 1/3] Neon-Optimized hash chain rebase
|
||||||
|
|
||||||
|
This should help with compression of data, using NEON instructions
|
||||||
|
(therefore useful for ARMv7/ARMv8).
|
||||||
|
|
||||||
|
Original patch by Jun He.
|
||||||
|
---
|
||||||
|
CMakeLists.txt | 5 ++-
|
||||||
|
contrib/arm/neon_slide_hash.h | 84 +++++++++++++++++++++++++++++++++++++++++++
|
||||||
|
deflate.c | 7 ++++
|
||||||
|
3 files changed, 95 insertions(+), 1 deletion(-)
|
||||||
|
create mode 100644 contrib/arm/neon_slide_hash.h
|
||||||
|
|
||||||
|
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||||
|
index 98ee4dd..230ca6d 100644
|
||||||
|
--- a/CMakeLists.txt
|
||||||
|
+++ b/CMakeLists.txt
|
||||||
|
@@ -139,7 +139,10 @@ if(CMAKE_COMPILER_IS_GNUCC)
|
||||||
|
|
||||||
|
if(ARM_NEON)
|
||||||
|
list(REMOVE_ITEM ZLIB_SRCS inflate.c)
|
||||||
|
- set(ZLIB_ARM_NEON_HDRS contrib/arm/chunkcopy.h contrib/arm/inffast_chunk.h)
|
||||||
|
+ set(ZLIB_ARM_NEON_HDRS
|
||||||
|
+ contrib/arm/chunkcopy.h
|
||||||
|
+ contrib/arm/inffast_chunk.h
|
||||||
|
+ contrib/arm/neon_slide_hash.h)
|
||||||
|
set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c)
|
||||||
|
add_definitions(-DARM_NEON)
|
||||||
|
set(COMPILER ${CMAKE_C_COMPILER})
|
||||||
|
diff --git a/contrib/arm/neon_slide_hash.h b/contrib/arm/neon_slide_hash.h
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000..0daffa1
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/contrib/arm/neon_slide_hash.h
|
||||||
|
@@ -0,0 +1,84 @@
|
||||||
|
+/* Copyright (C) 1995-2011, 2016 Mark Adler
|
||||||
|
+ * Copyright (C) 2017 ARM Holdings Inc.
|
||||||
|
+ * Authors: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
|
||||||
|
+ * Jun He <jun.he@arm.com>
|
||||||
|
+ * This software is provided 'as-is', without any express or implied
|
||||||
|
+ * warranty. In no event will the authors be held liable for any damages
|
||||||
|
+ * arising from the use of this software.
|
||||||
|
+ * Permission is granted to anyone to use this software for any purpose,
|
||||||
|
+ * including commercial applications, and to alter it and redistribute it
|
||||||
|
+ * freely, subject to the following restrictions:
|
||||||
|
+ * 1. The origin of this software must not be misrepresented; you must not
|
||||||
|
+ * claim that you wrote the original software. If you use this software
|
||||||
|
+ * in a product, an acknowledgment in the product documentation would be
|
||||||
|
+ * appreciated but is not required.
|
||||||
|
+ * 2. Altered source versions must be plainly marked as such, and must not be
|
||||||
|
+ * misrepresented as being the original software.
|
||||||
|
+ * 3. This notice may not be removed or altered from any source distribution.
|
||||||
|
+ */
|
||||||
|
+#ifndef __NEON_SLIDE_HASH__
|
||||||
|
+#define __NEON_SLIDE_HASH__
|
||||||
|
+
|
||||||
|
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
|
||||||
|
+#include "deflate.h"
|
||||||
|
+#include <arm_neon.h>
|
||||||
|
+
|
||||||
|
+inline static void neon_slide_hash(deflate_state *s)
|
||||||
|
+{
|
||||||
|
+ /*
|
||||||
|
+ * This is ASIMD implementation for hash table rebase
|
||||||
|
+ * it assumes:
|
||||||
|
+ * 1. hash chain offset (Pos) is 2 bytes
|
||||||
|
+ * 2. hash table size is multiple*128 bytes
|
||||||
|
+ * #1 should be true as Pos is defined as "ush"
|
||||||
|
+ * #2 should be true as hash_bits are greater that 7
|
||||||
|
+ */
|
||||||
|
+ unsigned n, m;
|
||||||
|
+ unsigned short wsize = s->w_size;
|
||||||
|
+ uint16x8_t v, *p;
|
||||||
|
+ size_t size;
|
||||||
|
+
|
||||||
|
+ size = s->hash_size*sizeof(s->head[0]);
|
||||||
|
+ Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
|
||||||
|
+
|
||||||
|
+ Assert(sizeof(Pos) == 2, "Wrong Pos size");
|
||||||
|
+
|
||||||
|
+ /* slide s->head */
|
||||||
|
+ v = vdupq_n_u16(wsize);
|
||||||
|
+ p = (uint16x8_t *)(s->head);
|
||||||
|
+ n = size / (sizeof(uint16x8_t) * 8);
|
||||||
|
+ do {
|
||||||
|
+ p[0] = vqsubq_u16(p[0], v);
|
||||||
|
+ p[1] = vqsubq_u16(p[1], v);
|
||||||
|
+ p[2] = vqsubq_u16(p[2], v);
|
||||||
|
+ p[3] = vqsubq_u16(p[3], v);
|
||||||
|
+ p[4] = vqsubq_u16(p[4], v);
|
||||||
|
+ p[5] = vqsubq_u16(p[5], v);
|
||||||
|
+ p[6] = vqsubq_u16(p[6], v);
|
||||||
|
+ p[7] = vqsubq_u16(p[7], v);
|
||||||
|
+ p += 8;
|
||||||
|
+ } while (--n);
|
||||||
|
+#ifndef FASTEST
|
||||||
|
+ /* slide s->prev */
|
||||||
|
+ size = wsize*sizeof(s->prev[0]);
|
||||||
|
+
|
||||||
|
+ Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
|
||||||
|
+
|
||||||
|
+ p = (uint16x8_t *)(s->prev);
|
||||||
|
+ n = size / (sizeof(uint16x8_t) * 8);
|
||||||
|
+ do {
|
||||||
|
+ p[0] = vqsubq_u16(p[0], v);
|
||||||
|
+ p[1] = vqsubq_u16(p[1], v);
|
||||||
|
+ p[2] = vqsubq_u16(p[2], v);
|
||||||
|
+ p[3] = vqsubq_u16(p[3], v);
|
||||||
|
+ p[4] = vqsubq_u16(p[4], v);
|
||||||
|
+ p[5] = vqsubq_u16(p[5], v);
|
||||||
|
+ p[6] = vqsubq_u16(p[6], v);
|
||||||
|
+ p[7] = vqsubq_u16(p[7], v);
|
||||||
|
+ p += 8;
|
||||||
|
+ } while (--n);
|
||||||
|
+#endif
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+#endif
|
||||||
|
+#endif
|
||||||
|
diff --git a/deflate.c b/deflate.c
|
||||||
|
index 1ec7614..36f99ac 100644
|
||||||
|
--- a/deflate.c
|
||||||
|
+++ b/deflate.c
|
||||||
|
@@ -50,6 +50,9 @@
|
||||||
|
/* @(#) $Id$ */
|
||||||
|
|
||||||
|
#include "deflate.h"
|
||||||
|
+#if __ARM_NEON
|
||||||
|
+#include "contrib/arm/neon_slide_hash.h"
|
||||||
|
+#endif
|
||||||
|
|
||||||
|
const char deflate_copyright[] =
|
||||||
|
" deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler ";
|
||||||
|
@@ -201,6 +204,9 @@ local const config configuration_table[10] = {
|
||||||
|
local void slide_hash(s)
|
||||||
|
deflate_state *s;
|
||||||
|
{
|
||||||
|
+#if ARM_NEON
|
||||||
|
+ return neon_slide_hash(s);
|
||||||
|
+#else
|
||||||
|
unsigned n, m;
|
||||||
|
Posf *p;
|
||||||
|
uInt wsize = s->w_size;
|
||||||
|
@@ -222,6 +228,7 @@ local void slide_hash(s)
|
||||||
|
*/
|
||||||
|
} while (--n);
|
||||||
|
#endif
|
||||||
|
+#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ========================================================================= */
|
||||||
|
--
|
||||||
|
2.14.3
|
||||||
|
|
2268
0001-Porting-inflate-using-wider-loads-and-stores.patch
Normal file
2268
0001-Porting-inflate-using-wider-loads-and-stores.patch
Normal file
File diff suppressed because it is too large
Load Diff
147
0002-Port-Fix-InflateBack-corner-case.patch
Normal file
147
0002-Port-Fix-InflateBack-corner-case.patch
Normal file
@ -0,0 +1,147 @@
|
|||||||
|
From 267e6f20170edb9a00b11fc3a2ca7649ea1c4464 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
|
||||||
|
Date: Wed, 4 Apr 2018 15:14:57 -0700
|
||||||
|
Subject: [PATCH 2/2] Port Fix InflateBack corner case
|
||||||
|
|
||||||
|
This handles the case where a zlib user could rely on InflateBack
|
||||||
|
API to decompress content.
|
||||||
|
|
||||||
|
The NEON optimization assumes that it can perform wide stores, sometimes
|
||||||
|
overwriting data on the output pointer (but never overflowing the buffer
|
||||||
|
end as it has enough room for the write).
|
||||||
|
|
||||||
|
For infback there is no such guarantees (i.e. no extra wiggle room),
|
||||||
|
which can result in illegal operations. This patch fixes the potential
|
||||||
|
issue by falling back to the non-optimized code for such cases.
|
||||||
|
|
||||||
|
Also it adds some comments about the entry assumptions in inflate and
|
||||||
|
writes out a defined value at the write buffer to identify where
|
||||||
|
the real data has ended (helpful while debugging).
|
||||||
|
|
||||||
|
For reference, please see:
|
||||||
|
https://chromium.googlesource.com/chromium/src/+/0bb11040792edc5b28fcb710fc4c01fedd98c97c
|
||||||
|
|
||||||
|
Change-Id: Iffbda9eb5e08a661aa15c6e3d1c59b678cc23b2c
|
||||||
|
---
|
||||||
|
CMakeLists.txt | 5 ++---
|
||||||
|
contrib/arm/{inffast.c => inffast_chunk.c} | 10 +++++++---
|
||||||
|
contrib/arm/inffast_chunk.h | 12 ++++++++++++
|
||||||
|
contrib/arm/inflate.c | 14 ++++++++++++--
|
||||||
|
4 files changed, 33 insertions(+), 8 deletions(-)
|
||||||
|
rename contrib/arm/{inffast.c => inffast_chunk.c} (97%)
|
||||||
|
create mode 100644 contrib/arm/inffast_chunk.h
|
||||||
|
|
||||||
|
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||||
|
index 09bb3db..98ee4dd 100644
|
||||||
|
--- a/CMakeLists.txt
|
||||||
|
+++ b/CMakeLists.txt
|
||||||
|
@@ -139,9 +139,8 @@ if(CMAKE_COMPILER_IS_GNUCC)
|
||||||
|
|
||||||
|
if(ARM_NEON)
|
||||||
|
list(REMOVE_ITEM ZLIB_SRCS inflate.c)
|
||||||
|
- list(REMOVE_ITEM ZLIB_SRCS inffast.c)
|
||||||
|
- set(ZLIB_ARM_NEON_HDRS contrib/arm/chunkcopy.h)
|
||||||
|
- set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast.c)
|
||||||
|
+ set(ZLIB_ARM_NEON_HDRS contrib/arm/chunkcopy.h contrib/arm/inffast_chunk.h)
|
||||||
|
+ set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c)
|
||||||
|
add_definitions(-DARM_NEON)
|
||||||
|
set(COMPILER ${CMAKE_C_COMPILER})
|
||||||
|
# NEON is mandatory in ARMv8.
|
||||||
|
diff --git a/contrib/arm/inffast.c b/contrib/arm/inffast_chunk.c
|
||||||
|
similarity index 97%
|
||||||
|
rename from contrib/arm/inffast.c
|
||||||
|
rename to contrib/arm/inffast_chunk.c
|
||||||
|
index f7f5007..0c5c583 100644
|
||||||
|
--- a/contrib/arm/inffast.c
|
||||||
|
+++ b/contrib/arm/inffast_chunk.c
|
||||||
|
@@ -6,8 +6,8 @@
|
||||||
|
#include "zutil.h"
|
||||||
|
#include "inftrees.h"
|
||||||
|
#include "inflate.h"
|
||||||
|
-#include "inffast.h"
|
||||||
|
-#include "chunkcopy.h"
|
||||||
|
+#include "contrib/arm/inffast_chunk.h"
|
||||||
|
+#include "contrib/arm/chunkcopy.h"
|
||||||
|
|
||||||
|
#ifdef ASMINF
|
||||||
|
# pragma message("Assembler code may have bugs -- use at your own risk")
|
||||||
|
@@ -28,6 +28,10 @@
|
||||||
|
strm->avail_out >= 258
|
||||||
|
start >= strm->avail_out
|
||||||
|
state->bits < 8
|
||||||
|
+ strm->next_out[0..strm->avail_out] does not overlap with
|
||||||
|
+ strm->next_in[0..strm->avail_in]
|
||||||
|
+ strm->state->window is allocated with an additional
|
||||||
|
+ CHUNKCOPY_CHUNK_SIZE-1 bytes of padding beyond strm->state->wsize
|
||||||
|
|
||||||
|
On return, state->mode is one of:
|
||||||
|
|
||||||
|
@@ -48,7 +52,7 @@
|
||||||
|
requires strm->avail_out >= 258 for each loop to avoid checking for
|
||||||
|
output space.
|
||||||
|
*/
|
||||||
|
-void ZLIB_INTERNAL inflate_fast(strm, start)
|
||||||
|
+void ZLIB_INTERNAL inflate_fast_chunk(strm, start)
|
||||||
|
z_streamp strm;
|
||||||
|
unsigned start; /* inflate()'s starting value for strm->avail_out */
|
||||||
|
{
|
||||||
|
diff --git a/contrib/arm/inffast_chunk.h b/contrib/arm/inffast_chunk.h
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000..7839e1d
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/contrib/arm/inffast_chunk.h
|
||||||
|
@@ -0,0 +1,12 @@
|
||||||
|
+/* inffast.h -- header to use inffast.c
|
||||||
|
+ * Copyright (C) 1995-2003, 2010 Mark Adler
|
||||||
|
+ * Copyright (C) 2017 ARM, Inc.
|
||||||
|
+ * For conditions of distribution and use, see copyright notice in zlib.h
|
||||||
|
+ */
|
||||||
|
+
|
||||||
|
+/* WARNING: this file should *not* be used by applications. It is
|
||||||
|
+ part of the implementation of the compression library and is
|
||||||
|
+ subject to change. Applications should only use zlib.h.
|
||||||
|
+ */
|
||||||
|
+
|
||||||
|
+void ZLIB_INTERNAL inflate_fast_chunk OF((z_streamp strm, unsigned start));
|
||||||
|
diff --git a/contrib/arm/inflate.c b/contrib/arm/inflate.c
|
||||||
|
index 23e95f1..d860542 100644
|
||||||
|
--- a/contrib/arm/inflate.c
|
||||||
|
+++ b/contrib/arm/inflate.c
|
||||||
|
@@ -83,7 +83,7 @@
|
||||||
|
#include "zutil.h"
|
||||||
|
#include "inftrees.h"
|
||||||
|
#include "inflate.h"
|
||||||
|
-#include "inffast.h"
|
||||||
|
+#include "contrib/arm/inffast_chunk.h"
|
||||||
|
#include "contrib/arm/chunkcopy.h"
|
||||||
|
|
||||||
|
#ifdef MAKEFIXED
|
||||||
|
@@ -1056,7 +1056,7 @@ int flush;
|
||||||
|
case LEN:
|
||||||
|
if (have >= 6 && left >= 258) {
|
||||||
|
RESTORE();
|
||||||
|
- inflate_fast(strm, out);
|
||||||
|
+ inflate_fast_chunk(strm, out);
|
||||||
|
LOAD();
|
||||||
|
if (state->mode == TYPE)
|
||||||
|
state->back = -1;
|
||||||
|
@@ -1262,6 +1262,16 @@ int flush;
|
||||||
|
Note: a memory error from inflate() is non-recoverable.
|
||||||
|
*/
|
||||||
|
inf_leave:
|
||||||
|
+ /* We write a defined value in the unused space to help mark
|
||||||
|
+ * where the stream has ended. We don't use zeros as that can
|
||||||
|
+ * mislead clients relying on undefined behavior (i.e. assuming
|
||||||
|
+ * that the data is over when the buffer has a zero/null value).
|
||||||
|
+ */
|
||||||
|
+ if (left >= CHUNKCOPY_CHUNK_SIZE)
|
||||||
|
+ memset(put, 0x55, CHUNKCOPY_CHUNK_SIZE);
|
||||||
|
+ else
|
||||||
|
+ memset(put, 0x55, left);
|
||||||
|
+
|
||||||
|
RESTORE();
|
||||||
|
if (state->wsize || (out != strm->avail_out && state->mode < BAD &&
|
||||||
|
(state->mode < CHECK || flush != Z_FINISH)))
|
||||||
|
--
|
||||||
|
2.14.3
|
||||||
|
|
218
0002-Porting-optimized-longest_match.patch
Normal file
218
0002-Porting-optimized-longest_match.patch
Normal file
@ -0,0 +1,218 @@
|
|||||||
|
From 0ad56061ade1afe2896af1acffa5e15fbe5c98ed Mon Sep 17 00:00:00 2001
|
||||||
|
From: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
|
||||||
|
Date: Mon, 9 Apr 2018 15:14:19 -0700
|
||||||
|
Subject: [PATCH 2/3] Porting optimized longest_match
|
||||||
|
|
||||||
|
This patch was contributed to zlib-ng and features an improved longest_match
|
||||||
|
function using the most distant hash code to reduce number of checks
|
||||||
|
(see: http://www.gildor.org/en/projects/zlib).
|
||||||
|
|
||||||
|
Original patch by Jun He.
|
||||||
|
---
|
||||||
|
CMakeLists.txt | 3 +-
|
||||||
|
contrib/arm/arm_longest_match.h | 142 ++++++++++++++++++++++++++++++++++++++++
|
||||||
|
deflate.c | 11 +++-
|
||||||
|
3 files changed, 152 insertions(+), 4 deletions(-)
|
||||||
|
create mode 100644 contrib/arm/arm_longest_match.h
|
||||||
|
|
||||||
|
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||||
|
index 230ca6d..c330093 100644
|
||||||
|
--- a/CMakeLists.txt
|
||||||
|
+++ b/CMakeLists.txt
|
||||||
|
@@ -142,7 +142,8 @@ if(CMAKE_COMPILER_IS_GNUCC)
|
||||||
|
set(ZLIB_ARM_NEON_HDRS
|
||||||
|
contrib/arm/chunkcopy.h
|
||||||
|
contrib/arm/inffast_chunk.h
|
||||||
|
- contrib/arm/neon_slide_hash.h)
|
||||||
|
+ contrib/arm/neon_slide_hash.h
|
||||||
|
+ contrib/arm/arm_longest_match.h)
|
||||||
|
set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c)
|
||||||
|
add_definitions(-DARM_NEON)
|
||||||
|
set(COMPILER ${CMAKE_C_COMPILER})
|
||||||
|
diff --git a/contrib/arm/arm_longest_match.h b/contrib/arm/arm_longest_match.h
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000..9e7083f
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/contrib/arm/arm_longest_match.h
|
||||||
|
@@ -0,0 +1,142 @@
|
||||||
|
+/* Copyright (C) 1995-2011, 2016 Mark Adler
|
||||||
|
+ * Copyright (C) 2017 ARM Holdings Inc.
|
||||||
|
+ * Authors: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
|
||||||
|
+ * Jun He <jun.he@arm.com>
|
||||||
|
+ * This software is provided 'as-is', without any express or implied
|
||||||
|
+ * warranty. In no event will the authors be held liable for any damages
|
||||||
|
+ * arising from the use of this software.
|
||||||
|
+ * Permission is granted to anyone to use this software for any purpose,
|
||||||
|
+ * including commercial applications, and to alter it and redistribute it
|
||||||
|
+ * freely, subject to the following restrictions:
|
||||||
|
+ * 1. The origin of this software must not be misrepresented; you must not
|
||||||
|
+ * claim that you wrote the original software. If you use this software
|
||||||
|
+ * in a product, an acknowledgment in the product documentation would be
|
||||||
|
+ * appreciated but is not required.
|
||||||
|
+ * 2. Altered source versions must be plainly marked as such, and must not be
|
||||||
|
+ * misrepresented as being the original software.
|
||||||
|
+ * 3. This notice may not be removed or altered from any source distribution.
|
||||||
|
+ */
|
||||||
|
+#ifndef __ARM_LONGEST__MATCH__
|
||||||
|
+#define __ARM_LONGEST__MATCH__
|
||||||
|
+
|
||||||
|
+#if defined(ARM_NEON)
|
||||||
|
+#include "deflate.h"
|
||||||
|
+#include <stdint.h>
|
||||||
|
+static inline long get_match_len(const unsigned char *a, const unsigned char *b, long max)
|
||||||
|
+{
|
||||||
|
+ register int len = 0;
|
||||||
|
+ register unsigned long xor = 0;
|
||||||
|
+ register int check_loops = max/sizeof(unsigned long);
|
||||||
|
+ while(check_loops-- > 0) {
|
||||||
|
+ xor = (*(unsigned long *)(a+len)) ^ (*(unsigned long *)(b+len));
|
||||||
|
+ if (xor) break;
|
||||||
|
+ len += sizeof(unsigned long);
|
||||||
|
+ }
|
||||||
|
+ if (0 == xor) {
|
||||||
|
+ while (len < max) {
|
||||||
|
+ if (a[len] != b[len]) break;
|
||||||
|
+ len++;
|
||||||
|
+ }
|
||||||
|
+ return len;
|
||||||
|
+ }
|
||||||
|
+ xor = __builtin_ctzl(xor)>>3;
|
||||||
|
+ return len + xor;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+/*
|
||||||
|
+ * This implementation is based on algorithm described at:
|
||||||
|
+ * http://www.gildor.org/en/projects/zlib
|
||||||
|
+ * It uses the hash chain indexed by the most distant hash code to
|
||||||
|
+ * reduce number of checks.
|
||||||
|
+ * This also eliminates the those unnecessary check loops in legacy
|
||||||
|
+ * longest_match's do..while loop if the "most distant code" is out
|
||||||
|
+ * of search buffer
|
||||||
|
+ *
|
||||||
|
+ */
|
||||||
|
+static inline unsigned arm_longest_match(deflate_state *const s, IPos cur_match) {
|
||||||
|
+ unsigned chain_length = s->max_chain_length;/* max hash chain length */
|
||||||
|
+ unsigned char *scan = s->window + s->strstart; /* current string */
|
||||||
|
+ unsigned char *match; /* matched string */
|
||||||
|
+ unsigned int len; /* length of current match */
|
||||||
|
+ unsigned int best_len = s->prev_length; /* best match length so far */
|
||||||
|
+ unsigned int nice_match = s->nice_match; /* stop if match long enough */
|
||||||
|
+ IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
|
||||||
|
+ s->strstart - (IPos)MAX_DIST(s) : 0;
|
||||||
|
+ /* Stop when cur_match becomes <= limit. To simplify the code,
|
||||||
|
+ * we prevent matches with the string of window index 0.
|
||||||
|
+ */
|
||||||
|
+ int offset = 0; /* offset of the head[most_distant_hash] from IN cur_match */
|
||||||
|
+ Pos *prev = s->prev;
|
||||||
|
+ unsigned int wmask = s->w_mask;
|
||||||
|
+ unsigned char *scan_buf_base = s->window;
|
||||||
|
+
|
||||||
|
+ /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
|
||||||
|
+ * It is easy to get rid of this optimization if necessary.
|
||||||
|
+ */
|
||||||
|
+ Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
|
||||||
|
+
|
||||||
|
+ /* Do not look for matches beyond the end of the input. This is necessary
|
||||||
|
+ * to make deflate deterministic.
|
||||||
|
+ */
|
||||||
|
+ if ((unsigned int)nice_match > s->lookahead) nice_match = s->lookahead;
|
||||||
|
+
|
||||||
|
+ Assert((unsigned long)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
|
||||||
|
+
|
||||||
|
+ /* find most distant hash code for lazy_match */
|
||||||
|
+ if (best_len > MIN_MATCH) {
|
||||||
|
+ /* search for most distant hash code */
|
||||||
|
+ int i;
|
||||||
|
+ uint16_t hash = 0;
|
||||||
|
+ IPos pos;
|
||||||
|
+
|
||||||
|
+ UPDATE_HASH(s, hash, scan[1]);
|
||||||
|
+ UPDATE_HASH(s, hash, scan[2]);
|
||||||
|
+ for (i = 3; i <= best_len; i++) {
|
||||||
|
+ UPDATE_HASH(s, hash, scan[i]);
|
||||||
|
+ /* get head IPos of hash calced by scan[i-2..i] */
|
||||||
|
+ pos = s->head[hash];
|
||||||
|
+ /* compare it to current "farthest hash" IPos */
|
||||||
|
+ if (pos <= cur_match) {
|
||||||
|
+ /* we have a new "farthest hash" now */
|
||||||
|
+ offset = i - 2;
|
||||||
|
+ cur_match = pos;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ /* update variables to correspond offset */
|
||||||
|
+ limit += offset;
|
||||||
|
+ /*
|
||||||
|
+ * check if the most distant code's offset is out of search buffer
|
||||||
|
+ * if it is true, then this means scan[offset..offset+2] are not
|
||||||
|
+ * presented in the search buffer. So we just return best_len
|
||||||
|
+ * we've found.
|
||||||
|
+ */
|
||||||
|
+ if (cur_match < limit) return best_len;
|
||||||
|
+
|
||||||
|
+ scan_buf_base -= offset;
|
||||||
|
+ /* reduce hash search depth based on best_len */
|
||||||
|
+ chain_length /= best_len - MIN_MATCH;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ do {
|
||||||
|
+ Assert(cur_match < s->strstart, "no future");
|
||||||
|
+
|
||||||
|
+ /* Determine matched length at current pos */
|
||||||
|
+ match = scan_buf_base + cur_match;
|
||||||
|
+ len = get_match_len(match, scan, MAX_MATCH);
|
||||||
|
+
|
||||||
|
+ if (len > best_len) {
|
||||||
|
+ /* found longer string */
|
||||||
|
+ s->match_start = cur_match - offset;
|
||||||
|
+ best_len = len;
|
||||||
|
+ /* good enough? */
|
||||||
|
+ if (len >= nice_match) break;
|
||||||
|
+ }
|
||||||
|
+ /* move to prev pos in this hash chain */
|
||||||
|
+ } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length != 0);
|
||||||
|
+
|
||||||
|
+ return (best_len <= s->lookahead)? best_len : s->lookahead;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+#endif
|
||||||
|
+#endif
|
||||||
|
diff --git a/deflate.c b/deflate.c
|
||||||
|
index 36f99ac..4c42259 100644
|
||||||
|
--- a/deflate.c
|
||||||
|
+++ b/deflate.c
|
||||||
|
@@ -50,9 +50,6 @@
|
||||||
|
/* @(#) $Id$ */
|
||||||
|
|
||||||
|
#include "deflate.h"
|
||||||
|
-#if __ARM_NEON
|
||||||
|
-#include "contrib/arm/neon_slide_hash.h"
|
||||||
|
-#endif
|
||||||
|
|
||||||
|
const char deflate_copyright[] =
|
||||||
|
" deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler ";
|
||||||
|
@@ -196,6 +193,11 @@ local const config configuration_table[10] = {
|
||||||
|
s->head[s->hash_size-1] = NIL; \
|
||||||
|
zmemzero((Bytef *)s->head, (unsigned)(s->hash_size-1)*sizeof(*s->head));
|
||||||
|
|
||||||
|
+#if defined(ARM_NEON)
|
||||||
|
+#include "contrib/arm/arm_longest_match.h"
|
||||||
|
+#include "contrib/arm/neon_slide_hash.h"
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
/* ===========================================================================
|
||||||
|
* Slide the hash table when sliding the window down (could be avoided with 32
|
||||||
|
* bit values at the expense of memory usage). We slide even when level == 0 to
|
||||||
|
@@ -1244,6 +1246,9 @@ local uInt longest_match(s, cur_match)
|
||||||
|
deflate_state *s;
|
||||||
|
IPos cur_match; /* current match */
|
||||||
|
{
|
||||||
|
+#if defined(ARM_NEON)
|
||||||
|
+ return arm_longest_match(s, cur_match);
|
||||||
|
+#endif
|
||||||
|
unsigned chain_length = s->max_chain_length;/* max hash chain length */
|
||||||
|
register Bytef *scan = s->window + s->strstart; /* current string */
|
||||||
|
register Bytef *match; /* matched string */
|
||||||
|
--
|
||||||
|
2.14.3
|
||||||
|
|
136
0003-arm64-specific-build-patch.patch
Normal file
136
0003-arm64-specific-build-patch.patch
Normal file
@ -0,0 +1,136 @@
|
|||||||
|
From bd30e5ff76aab2668ebfd46e5dbadc44322960c1 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Jeremy Linton <jeremy.linton@arm.com>
|
||||||
|
Date: Fri, 6 Apr 2018 11:46:42 -0500
|
||||||
|
Subject: [PATCH 3/3] arm64 specific build patch
|
||||||
|
|
||||||
|
---
|
||||||
|
Makefile.in | 37 +++++++++++++++++++++++++++----------
|
||||||
|
configure | 2 +-
|
||||||
|
contrib/minizip/zip.c | 6 ++++--
|
||||||
|
3 files changed, 32 insertions(+), 13 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/Makefile.in b/Makefile.in
|
||||||
|
index 5a77949..1a1e452 100644
|
||||||
|
--- a/Makefile.in
|
||||||
|
+++ b/Makefile.in
|
||||||
|
@@ -57,11 +57,11 @@ SRCDIR=
|
||||||
|
ZINC=
|
||||||
|
ZINCOUT=-I.
|
||||||
|
|
||||||
|
-OBJZ = adler32.o crc32.o deflate.o infback.o inffast.o inflate.o inftrees.o trees.o zutil.o
|
||||||
|
+OBJZ = adler32.o crc32.o deflate.o infback.o arminffast.o inffast.o inflate.o inflate_chunk.o inftrees.o trees.o zutil.o
|
||||||
|
OBJG = compress.o uncompr.o gzclose.o gzlib.o gzread.o gzwrite.o
|
||||||
|
OBJC = $(OBJZ) $(OBJG)
|
||||||
|
|
||||||
|
-PIC_OBJZ = adler32.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo
|
||||||
|
+PIC_OBJZ = adler32.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inflate_chunk.lo inftrees.lo trees.lo zutil.lo
|
||||||
|
PIC_OBJG = compress.lo uncompr.lo gzclose.lo gzlib.lo gzread.lo gzwrite.lo
|
||||||
|
PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG)
|
||||||
|
|
||||||
|
@@ -163,16 +163,22 @@ crc32.o: $(SRCDIR)crc32.c
|
||||||
|
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c
|
||||||
|
|
||||||
|
deflate.o: $(SRCDIR)deflate.c
|
||||||
|
- $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c
|
||||||
|
+ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)deflate.c
|
||||||
|
|
||||||
|
infback.o: $(SRCDIR)infback.c
|
||||||
|
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)infback.c
|
||||||
|
|
||||||
|
inffast.o: $(SRCDIR)inffast.c
|
||||||
|
- $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)inffast.c
|
||||||
|
+ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)inffast.c
|
||||||
|
|
||||||
|
-inflate.o: $(SRCDIR)inflate.c
|
||||||
|
- $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)inflate.c
|
||||||
|
+arminffast.o: $(SRCDIR)contrib/arm/inffast_chunk.c
|
||||||
|
+ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)contrib/arm/inffast_chunk.c
|
||||||
|
+
|
||||||
|
+inflate.o: $(SRCDIR)contrib/arm/inflate.c
|
||||||
|
+ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)contrib/arm/inflate.c
|
||||||
|
+
|
||||||
|
+inflate_chunk.o: $(SRCDIR)contrib/arm/inffast_chunk.c
|
||||||
|
+ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)contrib/arm/inffast_chunk.c
|
||||||
|
|
||||||
|
inftrees.o: $(SRCDIR)inftrees.c
|
||||||
|
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)inftrees.c
|
||||||
|
@@ -214,7 +220,7 @@ crc32.lo: $(SRCDIR)crc32.c
|
||||||
|
|
||||||
|
deflate.lo: $(SRCDIR)deflate.c
|
||||||
|
-@mkdir objs 2>/dev/null || test -d objs
|
||||||
|
- $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c
|
||||||
|
+ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c
|
||||||
|
-@mv objs/deflate.o $@
|
||||||
|
|
||||||
|
infback.lo: $(SRCDIR)infback.c
|
||||||
|
@@ -222,16 +228,27 @@ infback.lo: $(SRCDIR)infback.c
|
||||||
|
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/infback.o $(SRCDIR)infback.c
|
||||||
|
-@mv objs/infback.o $@
|
||||||
|
|
||||||
|
+arminffast.lo: $(SRCDIR)contrib/arm/inffast_chunk.c $(SRCDIR)inffast.c
|
||||||
|
+ -@mkdir objs 2>/dev/null || test -d objs
|
||||||
|
+ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/arminffast.o $(SRCDIR)contrib/arm/inffast_chunk.c
|
||||||
|
+ -@mv objs/arminffast.o $@
|
||||||
|
+
|
||||||
|
inffast.lo: $(SRCDIR)inffast.c
|
||||||
|
-@mkdir objs 2>/dev/null || test -d objs
|
||||||
|
- $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/inffast.o $(SRCDIR)inffast.c
|
||||||
|
+ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/inffast.o $(SRCDIR)inffast.c
|
||||||
|
-@mv objs/inffast.o $@
|
||||||
|
|
||||||
|
-inflate.lo: $(SRCDIR)inflate.c
|
||||||
|
+inflate.lo: $(SRCDIR)contrib/arm/inflate.c
|
||||||
|
-@mkdir objs 2>/dev/null || test -d objs
|
||||||
|
- $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/inflate.o $(SRCDIR)inflate.c
|
||||||
|
+ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/inflate.o $(SRCDIR)contrib/arm/inflate.c
|
||||||
|
-@mv objs/inflate.o $@
|
||||||
|
|
||||||
|
+inflate_chunk.lo: $(SRCDIR)contrib/arm/inffast_chunk.c
|
||||||
|
+ -@mkdir objs 2>/dev/null || test -d objs
|
||||||
|
+ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/inflate_chunk.o $(SRCDIR)contrib/arm/inffast_chunk.c
|
||||||
|
+ -@mv objs/inflate_chunk.o $@
|
||||||
|
+
|
||||||
|
+
|
||||||
|
inftrees.lo: $(SRCDIR)inftrees.c
|
||||||
|
-@mkdir objs 2>/dev/null || test -d objs
|
||||||
|
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/inftrees.o $(SRCDIR)inftrees.c
|
||||||
|
diff --git a/configure b/configure
|
||||||
|
index e974d1f..0c5f837 100755
|
||||||
|
--- a/configure
|
||||||
|
+++ b/configure
|
||||||
|
@@ -23,7 +23,7 @@ SRCDIR=`dirname $0`
|
||||||
|
if test $SRCDIR = "."; then
|
||||||
|
ZINC=""
|
||||||
|
ZINCOUT="-I."
|
||||||
|
- SRCDIR=""
|
||||||
|
+ SRCDIR="./"
|
||||||
|
else
|
||||||
|
ZINC='-include zconf.h'
|
||||||
|
ZINCOUT='-I. -I$(SRCDIR)'
|
||||||
|
diff --git a/contrib/minizip/zip.c b/contrib/minizip/zip.c
|
||||||
|
index 44e88a9..0517930 100644
|
||||||
|
--- a/contrib/minizip/zip.c
|
||||||
|
+++ b/contrib/minizip/zip.c
|
||||||
|
@@ -519,15 +519,17 @@ local ZPOS64_T zip64local_SearchCentralDir(const zlib_filefunc64_32_def* pzlib_f
|
||||||
|
break;
|
||||||
|
|
||||||
|
for (i=(int)uReadSize-3; (i--)>0;)
|
||||||
|
+ {
|
||||||
|
if (((*(buf+i))==0x50) && ((*(buf+i+1))==0x4b) &&
|
||||||
|
((*(buf+i+2))==0x05) && ((*(buf+i+3))==0x06))
|
||||||
|
{
|
||||||
|
uPosFound = uReadPos+i;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
+ }
|
||||||
|
|
||||||
|
- if (uPosFound!=0)
|
||||||
|
- break;
|
||||||
|
+ if (uPosFound!=0)
|
||||||
|
+ break;
|
||||||
|
}
|
||||||
|
TRYFREE(buf);
|
||||||
|
return uPosFound;
|
||||||
|
--
|
||||||
|
2.14.3
|
||||||
|
|
18
zlib.spec
18
zlib.spec
@ -12,6 +12,12 @@ Source: http://www.zlib.net/zlib-%{version}.tar.xz
|
|||||||
Patch0: zlib-1.2.5-minizip-fixuncrypt.patch
|
Patch0: zlib-1.2.5-minizip-fixuncrypt.patch
|
||||||
# resolves: #805113
|
# resolves: #805113
|
||||||
Patch1: zlib-1.2.11-optimized-s390.patch
|
Patch1: zlib-1.2.11-optimized-s390.patch
|
||||||
|
# general aarch64 optimizations
|
||||||
|
Patch2: 0001-Porting-inflate-using-wider-loads-and-stores.patch
|
||||||
|
Patch3: 0002-Port-Fix-InflateBack-corner-case.patch
|
||||||
|
Patch4: 0001-Neon-Optimized-hash-chain-rebase.patch
|
||||||
|
Patch5: 0002-Porting-optimized-longest_match.patch
|
||||||
|
Patch6: 0003-arm64-specific-build-patch.patch
|
||||||
|
|
||||||
BuildRequires: automake, autoconf, libtool
|
BuildRequires: automake, autoconf, libtool
|
||||||
|
|
||||||
@ -64,6 +70,14 @@ developing applications which use minizip.
|
|||||||
%ifarch s390 s390x
|
%ifarch s390 s390x
|
||||||
%patch1 -p1 -b .optimized-deflate
|
%patch1 -p1 -b .optimized-deflate
|
||||||
%endif
|
%endif
|
||||||
|
%ifarch aarch64
|
||||||
|
%patch2 -p1 -b .optimize-aarch64
|
||||||
|
%patch3 -p1 -b .optimize-aarch64
|
||||||
|
%patch4 -p1 -b .optimize-aarch64
|
||||||
|
%patch5 -p1 -b .optimize-aarch64
|
||||||
|
%patch6 -p1 -b .optimize-aarch64
|
||||||
|
%endif
|
||||||
|
|
||||||
|
|
||||||
iconv -f iso-8859-2 -t utf-8 < ChangeLog > ChangeLog.tmp
|
iconv -f iso-8859-2 -t utf-8 < ChangeLog > ChangeLog.tmp
|
||||||
mv ChangeLog.tmp ChangeLog
|
mv ChangeLog.tmp ChangeLog
|
||||||
@ -73,6 +87,10 @@ export CFLAGS="$RPM_OPT_FLAGS"
|
|||||||
%ifarch ppc64
|
%ifarch ppc64
|
||||||
CFLAGS+=" -O3"
|
CFLAGS+=" -O3"
|
||||||
%endif
|
%endif
|
||||||
|
%ifarch aarch64
|
||||||
|
CFLAGS+=" -DARM_NEON -O3"
|
||||||
|
%endif
|
||||||
|
|
||||||
export LDFLAGS="$LDFLAGS -Wl,-z,relro -Wl,-z,now"
|
export LDFLAGS="$LDFLAGS -Wl,-z,relro -Wl,-z,now"
|
||||||
./configure --libdir=%{_libdir} --includedir=%{_includedir} --prefix=%{_prefix}
|
./configure --libdir=%{_libdir} --includedir=%{_includedir} --prefix=%{_prefix}
|
||||||
make %{?_smp_mflags}
|
make %{?_smp_mflags}
|
||||||
|
Loading…
Reference in New Issue
Block a user