From 25e9802713484882c27c1f979a6610a42414ee13 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Thu, 5 Apr 2018 16:20:56 -0500 Subject: [PATCH] aarch64 optimizations --- 0001-Neon-Optimized-hash-chain-rebase.patch | 157 ++ ...inflate-using-wider-loads-and-stores.patch | 2268 +++++++++++++++++ 0002-Port-Fix-InflateBack-corner-case.patch | 147 ++ 0002-Porting-optimized-longest_match.patch | 218 ++ 0003-arm64-specific-build-patch.patch | 136 + zlib.spec | 18 + 6 files changed, 2944 insertions(+) create mode 100644 0001-Neon-Optimized-hash-chain-rebase.patch create mode 100644 0001-Porting-inflate-using-wider-loads-and-stores.patch create mode 100644 0002-Port-Fix-InflateBack-corner-case.patch create mode 100644 0002-Porting-optimized-longest_match.patch create mode 100644 0003-arm64-specific-build-patch.patch diff --git a/0001-Neon-Optimized-hash-chain-rebase.patch b/0001-Neon-Optimized-hash-chain-rebase.patch new file mode 100644 index 0000000..bebec86 --- /dev/null +++ b/0001-Neon-Optimized-hash-chain-rebase.patch @@ -0,0 +1,157 @@ +From f849a23e0afc8b8a670fda64eec8b573fe62daa7 Mon Sep 17 00:00:00 2001 +From: Adenilson Cavalcanti +Date: Mon, 9 Apr 2018 13:52:17 -0700 +Subject: [PATCH 1/3] Neon-Optimized hash chain rebase + +This should help with compression of data, using NEON instructions +(therefore useful for ARMv7/ARMv8). + +Original patch by Jun He. +--- + CMakeLists.txt | 5 ++- + contrib/arm/neon_slide_hash.h | 84 +++++++++++++++++++++++++++++++++++++++++++ + deflate.c | 7 ++++ + 3 files changed, 95 insertions(+), 1 deletion(-) + create mode 100644 contrib/arm/neon_slide_hash.h + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 98ee4dd..230ca6d 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -139,7 +139,10 @@ if(CMAKE_COMPILER_IS_GNUCC) + + if(ARM_NEON) + list(REMOVE_ITEM ZLIB_SRCS inflate.c) +- set(ZLIB_ARM_NEON_HDRS contrib/arm/chunkcopy.h contrib/arm/inffast_chunk.h) ++ set(ZLIB_ARM_NEON_HDRS ++ contrib/arm/chunkcopy.h ++ contrib/arm/inffast_chunk.h ++ contrib/arm/neon_slide_hash.h) + set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c) + add_definitions(-DARM_NEON) + set(COMPILER ${CMAKE_C_COMPILER}) +diff --git a/contrib/arm/neon_slide_hash.h b/contrib/arm/neon_slide_hash.h +new file mode 100644 +index 0000000..0daffa1 +--- /dev/null ++++ b/contrib/arm/neon_slide_hash.h +@@ -0,0 +1,84 @@ ++/* Copyright (C) 1995-2011, 2016 Mark Adler ++ * Copyright (C) 2017 ARM Holdings Inc. ++ * Authors: Adenilson Cavalcanti ++ * Jun He ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
++ */ ++#ifndef __NEON_SLIDE_HASH__ ++#define __NEON_SLIDE_HASH__ ++ ++#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) ++#include "deflate.h" ++#include ++ ++inline static void neon_slide_hash(deflate_state *s) ++{ ++ /* ++ * This is ASIMD implementation for hash table rebase ++ * it assumes: ++ * 1. hash chain offset (Pos) is 2 bytes ++ * 2. hash table size is multiple*128 bytes ++ * #1 should be true as Pos is defined as "ush" ++ * #2 should be true as hash_bits are greater that 7 ++ */ ++ unsigned n, m; ++ unsigned short wsize = s->w_size; ++ uint16x8_t v, *p; ++ size_t size; ++ ++ size = s->hash_size*sizeof(s->head[0]); ++ Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err"); ++ ++ Assert(sizeof(Pos) == 2, "Wrong Pos size"); ++ ++ /* slide s->head */ ++ v = vdupq_n_u16(wsize); ++ p = (uint16x8_t *)(s->head); ++ n = size / (sizeof(uint16x8_t) * 8); ++ do { ++ p[0] = vqsubq_u16(p[0], v); ++ p[1] = vqsubq_u16(p[1], v); ++ p[2] = vqsubq_u16(p[2], v); ++ p[3] = vqsubq_u16(p[3], v); ++ p[4] = vqsubq_u16(p[4], v); ++ p[5] = vqsubq_u16(p[5], v); ++ p[6] = vqsubq_u16(p[6], v); ++ p[7] = vqsubq_u16(p[7], v); ++ p += 8; ++ } while (--n); ++#ifndef FASTEST ++ /* slide s->prev */ ++ size = wsize*sizeof(s->prev[0]); ++ ++ Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err"); ++ ++ p = (uint16x8_t *)(s->prev); ++ n = size / (sizeof(uint16x8_t) * 8); ++ do { ++ p[0] = vqsubq_u16(p[0], v); ++ p[1] = vqsubq_u16(p[1], v); ++ p[2] = vqsubq_u16(p[2], v); ++ p[3] = vqsubq_u16(p[3], v); ++ p[4] = vqsubq_u16(p[4], v); ++ p[5] = vqsubq_u16(p[5], v); ++ p[6] = vqsubq_u16(p[6], v); ++ p[7] = vqsubq_u16(p[7], v); ++ p += 8; ++ } while (--n); ++#endif ++} ++ ++#endif ++#endif +diff --git a/deflate.c b/deflate.c +index 1ec7614..36f99ac 100644 +--- a/deflate.c ++++ b/deflate.c +@@ -50,6 +50,9 @@ + /* @(#) $Id$ */ + + #include "deflate.h" ++#if __ARM_NEON ++#include "contrib/arm/neon_slide_hash.h" ++#endif + + const char deflate_copyright[] = + " deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler "; +@@ -201,6 +204,9 @@ local const config configuration_table[10] = { + local void slide_hash(s) + deflate_state *s; + { ++#if ARM_NEON ++ return neon_slide_hash(s); ++#else + unsigned n, m; + Posf *p; + uInt wsize = s->w_size; +@@ -222,6 +228,7 @@ local void slide_hash(s) + */ + } while (--n); + #endif ++#endif + } + + /* ========================================================================= */ +-- +2.14.3 + diff --git a/0001-Porting-inflate-using-wider-loads-and-stores.patch b/0001-Porting-inflate-using-wider-loads-and-stores.patch new file mode 100644 index 0000000..17c3e7c --- /dev/null +++ b/0001-Porting-inflate-using-wider-loads-and-stores.patch @@ -0,0 +1,2268 @@ +From 390b2713be0c7f682861264a74b202439caf9460 Mon Sep 17 00:00:00 2001 +From: Adenilson Cavalcanti +Date: Wed, 4 Apr 2018 12:10:35 -0700 +Subject: [PATCH 1/2] Porting inflate using wider loads and stores + +In inflate_fast() the output pointer always has plenty of room to write. This +means that so long as the target is capable, wide un-aligned loads and stores +can be used to transfer several bytes at once. When the reference distance is +too short simply unroll the data a little to increase the distance. + +For reference, please see: +https://chromium.googlesource.com/chromium/src/+/78104f4d73e3bbb4155fa804d00ed66682180556 +ps: this is still missing the fix for inflate_back corner case. 
+ +Change-Id: I5216424ab584e069b77ddf04000a313d5ca99839 +--- + CMakeLists.txt | 21 +- + contrib/arm/chunkcopy.h | 297 +++++++++ + contrib/arm/inffast.c | 307 +++++++++ + contrib/arm/inflate.c | 1572 +++++++++++++++++++++++++++++++++++++++++++++++ + 4 files changed, 2195 insertions(+), 2 deletions(-) + create mode 100644 contrib/arm/chunkcopy.h + create mode 100644 contrib/arm/inffast.c + create mode 100644 contrib/arm/inflate.c + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 0fe939d..09bb3db 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -7,6 +7,7 @@ set(VERSION "1.2.11") + + option(ASM686 "Enable building i686 assembly implementation") + option(AMD64 "Enable building amd64 assembly implementation") ++option(ARM_NEON "Enable building ARM NEON intrinsics implementation") + + set(INSTALL_BIN_DIR "${CMAKE_INSTALL_PREFIX}/bin" CACHE PATH "Installation directory for executables") + set(INSTALL_LIB_DIR "${CMAKE_INSTALL_PREFIX}/lib" CACHE PATH "Installation directory for libraries") +@@ -136,6 +137,22 @@ if(CMAKE_COMPILER_IS_GNUCC) + set(ZLIB_ASMS contrib/amd64/amd64-match.S) + endif () + ++ if(ARM_NEON) ++ list(REMOVE_ITEM ZLIB_SRCS inflate.c) ++ list(REMOVE_ITEM ZLIB_SRCS inffast.c) ++ set(ZLIB_ARM_NEON_HDRS contrib/arm/chunkcopy.h) ++ set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast.c) ++ add_definitions(-DARM_NEON) ++ set(COMPILER ${CMAKE_C_COMPILER}) ++ # NEON is mandatory in ARMv8. ++ if(${COMPILER} MATCHES "aarch64") ++ set_source_files_properties(${ZLIB_ARM_NEON} PROPERTIES LANGUAGE C COMPILE_FLAGS -march=armv8-a) ++ # But it was optional for ARMv7. ++ elseif(${COMPILER} MATCHES "arm") ++ set_source_files_properties(${ZLIB_ARM_NEON} PROPERTIES LANGUAGE C COMPILE_FLAGS -mfpu=neon) ++ endif() ++ endif() ++ + if(ZLIB_ASMS) + add_definitions(-DASMV) + set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE) +@@ -183,8 +200,8 @@ if(MINGW) + set(ZLIB_DLL_SRCS ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj) + endif(MINGW) + +-add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) +-add_library(zlibstatic STATIC ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) ++add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_ARM_NEON} ${ZLIB_ARM_NEON_HDRS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) ++add_library(zlibstatic STATIC ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_ARM_NEON} ${ZLIB_ARM_NEON_HDRS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) + set_target_properties(zlib PROPERTIES DEFINE_SYMBOL ZLIB_DLL) + set_target_properties(zlib PROPERTIES SOVERSION 1) + +diff --git a/contrib/arm/chunkcopy.h b/contrib/arm/chunkcopy.h +new file mode 100644 +index 0000000..d42995c +--- /dev/null ++++ b/contrib/arm/chunkcopy.h +@@ -0,0 +1,297 @@ ++/* chunkcopy.h -- fast copies and sets ++ * Copyright (C) 2017 ARM, Inc. ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++#ifndef CHUNKCOPY_H ++#define CHUNKCOPY_H ++ ++#include ++#include "zutil.h" ++ ++#if __STDC_VERSION__ >= 199901L ++#define Z_RESTRICT restrict ++#else ++#define Z_RESTRICT ++#endif ++ ++typedef uint8x16_t chunkcopy_chunk_t; ++#define CHUNKCOPY_CHUNK_SIZE sizeof(chunkcopy_chunk_t) ++ ++/* ++ Ask the compiler to perform a wide, unaligned load with an machine ++ instruction appropriate for the chunkcopy_chunk_t type. 
++ */ ++static inline chunkcopy_chunk_t loadchunk(const unsigned char FAR* s) { ++ chunkcopy_chunk_t c; ++ __builtin_memcpy(&c, s, sizeof(c)); ++ return c; ++} ++ ++/* ++ Ask the compiler to perform a wide, unaligned store with an machine ++ instruction appropriate for the chunkcopy_chunk_t type. ++ */ ++static inline void storechunk(unsigned char FAR* d, chunkcopy_chunk_t c) { ++ __builtin_memcpy(d, &c, sizeof(c)); ++} ++ ++/* ++ Perform a memcpy-like operation, but assume that length is non-zero and that ++ it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if ++ the length is shorter than this. ++ ++ It also guarantees that it will properly unroll the data if the distance ++ between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on ++ in chunkcopy_relaxed(). ++ ++ Aside from better memory bus utilisation, this means that short copies ++ (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop ++ without iteration, which will hopefully make the branch prediction more ++ reliable. ++ */ ++static inline unsigned char FAR* chunkcopy_core(unsigned char FAR* out, ++ const unsigned char FAR* from, ++ unsigned len) { ++ int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1; ++ storechunk(out, loadchunk(from)); ++ out += bump; ++ from += bump; ++ len /= CHUNKCOPY_CHUNK_SIZE; ++ while (len-- > 0) { ++ storechunk(out, loadchunk(from)); ++ out += CHUNKCOPY_CHUNK_SIZE; ++ from += CHUNKCOPY_CHUNK_SIZE; ++ } ++ return out; ++} ++ ++/* ++ Like chunkcopy_core, but avoid writing beyond of legal output. ++ ++ Accepts an additional pointer to the end of safe output. A generic safe ++ copy would use (out + len), but it's normally the case that the end of the ++ output buffer is beyond the end of the current copy, and this can still be ++ exploited. ++ */ ++static inline unsigned char FAR* chunkcopy_core_safe( ++ unsigned char FAR* out, ++ const unsigned char FAR* from, ++ unsigned len, ++ unsigned char FAR* limit) { ++ Assert(out + len <= limit, "chunk copy exceeds safety limit"); ++ if (limit - out < CHUNKCOPY_CHUNK_SIZE) { ++ const unsigned char FAR* Z_RESTRICT rfrom = from; ++ if (len & 8) { ++ __builtin_memcpy(out, rfrom, 8); ++ out += 8; ++ rfrom += 8; ++ } ++ if (len & 4) { ++ __builtin_memcpy(out, rfrom, 4); ++ out += 4; ++ rfrom += 4; ++ } ++ if (len & 2) { ++ __builtin_memcpy(out, rfrom, 2); ++ out += 2; ++ rfrom += 2; ++ } ++ if (len & 1) { ++ *out++ = *rfrom++; ++ } ++ return out; ++ } ++ return chunkcopy_core(out, from, len); ++} ++ ++/* ++ Perform short copies until distance can be rewritten as being at least ++ CHUNKCOPY_CHUNK_SIZE. ++ ++ This assumes that it's OK to overwrite at least the first ++ 2*CHUNKCOPY_CHUNK_SIZE bytes of output even if the copy is shorter than ++ this. This assumption holds within inflate_fast() which starts every ++ iteration with at least 258 bytes of output space available (258 being the ++ maximum length output from a single token; see inffast.c). 
++ */ ++static inline unsigned char FAR* chunkunroll_relaxed(unsigned char FAR* out, ++ unsigned FAR* dist, ++ unsigned FAR* len) { ++ const unsigned char FAR* from = out - *dist; ++ while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) { ++ storechunk(out, loadchunk(from)); ++ out += *dist; ++ *len -= *dist; ++ *dist += *dist; ++ } ++ return out; ++} ++ ++static inline uint8x16_t chunkset_vld1q_dup_u8x8( ++ const unsigned char FAR* Z_RESTRICT from) { ++#if defined(__clang__) || defined(__aarch64__) ++ return vreinterpretq_u8_u64(vld1q_dup_u64((void*)from)); ++#else ++ /* 32-bit GCC uses an alignment hint for vld1q_dup_u64, even when given a ++ * void pointer, so here's an alternate implementation. ++ */ ++ uint8x8_t h = vld1_u8(from); ++ return vcombine_u8(h, h); ++#endif ++} ++ ++/* ++ Perform an overlapping copy which behaves as a memset() operation, but ++ supporting periods other than one, and assume that length is non-zero and ++ that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output ++ even if the length is shorter than this. ++ */ ++static inline unsigned char FAR* chunkset_core(unsigned char FAR* out, ++ unsigned period, ++ unsigned len) { ++ uint8x16_t f; ++ int bump = ((len - 1) % sizeof(f)) + 1; ++ ++ switch (period) { ++ case 1: ++ f = vld1q_dup_u8(out - 1); ++ vst1q_u8(out, f); ++ out += bump; ++ len -= bump; ++ while (len > 0) { ++ vst1q_u8(out, f); ++ out += sizeof(f); ++ len -= sizeof(f); ++ } ++ return out; ++ case 2: ++ f = vreinterpretq_u8_u16(vld1q_dup_u16((void*)(out - 2))); ++ vst1q_u8(out, f); ++ out += bump; ++ len -= bump; ++ if (len > 0) { ++ f = vreinterpretq_u8_u16(vld1q_dup_u16((void*)(out - 2))); ++ do { ++ vst1q_u8(out, f); ++ out += sizeof(f); ++ len -= sizeof(f); ++ } while (len > 0); ++ } ++ return out; ++ case 4: ++ f = vreinterpretq_u8_u32(vld1q_dup_u32((void*)(out - 4))); ++ vst1q_u8(out, f); ++ out += bump; ++ len -= bump; ++ if (len > 0) { ++ f = vreinterpretq_u8_u32(vld1q_dup_u32((void*)(out - 4))); ++ do { ++ vst1q_u8(out, f); ++ out += sizeof(f); ++ len -= sizeof(f); ++ } while (len > 0); ++ } ++ return out; ++ case 8: ++ f = chunkset_vld1q_dup_u8x8(out - 8); ++ vst1q_u8(out, f); ++ out += bump; ++ len -= bump; ++ if (len > 0) { ++ f = chunkset_vld1q_dup_u8x8(out - 8); ++ do { ++ vst1q_u8(out, f); ++ out += sizeof(f); ++ len -= sizeof(f); ++ } while (len > 0); ++ } ++ return out; ++ } ++ out = chunkunroll_relaxed(out, &period, &len); ++ return chunkcopy_core(out, out - period, len); ++} ++ ++/* ++ Perform a memcpy-like operation, but assume that length is non-zero and that ++ it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if ++ the length is shorter than this. ++ ++ Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour ++ of overlapping buffers, regardless of the distance between the pointers. ++ This is reflected in the `restrict`-qualified pointers, allowing the ++ compiler to reorder loads and stores. ++ */ ++static inline unsigned char FAR* chunkcopy_relaxed( ++ unsigned char FAR* Z_RESTRICT out, ++ const unsigned char FAR* Z_RESTRICT from, ++ unsigned len) { ++ return chunkcopy_core(out, from, len); ++} ++ ++/* ++ Like chunkcopy_relaxed, but avoid writing beyond of legal output. ++ ++ Unlike chunkcopy_core_safe() above, no guarantee is made regarding the ++ behaviour of overlapping buffers, regardless of the distance between the ++ pointers. This is reflected in the `restrict`-qualified pointers, allowing ++ the compiler to reorder loads and stores. 
++ ++ Accepts an additional pointer to the end of safe output. A generic safe ++ copy would use (out + len), but it's normally the case that the end of the ++ output buffer is beyond the end of the current copy, and this can still be ++ exploited. ++ */ ++static inline unsigned char FAR* chunkcopy_safe( ++ unsigned char FAR* out, ++ const unsigned char FAR* Z_RESTRICT from, ++ unsigned len, ++ unsigned char FAR* limit) { ++ Assert(out + len <= limit, "chunk copy exceeds safety limit"); ++ return chunkcopy_core_safe(out, from, len, limit); ++} ++ ++/* ++ Perform chunky copy within the same buffer, where the source and destination ++ may potentially overlap. ++ ++ Assumes that len > 0 on entry, and that it's safe to write at least ++ CHUNKCOPY_CHUNK_SIZE*3 bytes to the output. ++ */ ++static inline unsigned char FAR* ++chunkcopy_lapped_relaxed(unsigned char FAR* out, unsigned dist, unsigned len) { ++ if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) { ++ return chunkset_core(out, dist, len); ++ } ++ return chunkcopy_core(out, out - dist, len); ++} ++ ++/* ++ Behave like chunkcopy_lapped_relaxed, but avoid writing beyond of legal ++ output. ++ ++ Accepts an additional pointer to the end of safe output. A generic safe ++ copy would use (out + len), but it's normally the case that the end of the ++ output buffer is beyond the end of the current copy, and this can still be ++ exploited. ++ */ ++static inline unsigned char FAR* chunkcopy_lapped_safe( ++ unsigned char FAR* out, ++ unsigned dist, ++ unsigned len, ++ unsigned char FAR* limit) { ++ Assert(out + len <= limit, "chunk copy exceeds safety limit"); ++ if (limit - out < CHUNKCOPY_CHUNK_SIZE * 3) { ++ /* TODO(cavalcantii): try harder to optimise this */ ++ while (len-- > 0) { ++ *out = *(out - dist); ++ out++; ++ } ++ return out; ++ } ++ return chunkcopy_lapped_relaxed(out, dist, len); ++} ++ ++#undef Z_RESTRICT ++ ++#endif /* CHUNKCOPY_H */ +diff --git a/contrib/arm/inffast.c b/contrib/arm/inffast.c +new file mode 100644 +index 0000000..f7f5007 +--- /dev/null ++++ b/contrib/arm/inffast.c +@@ -0,0 +1,307 @@ ++/* inffast.c -- fast decoding ++ * Copyright (C) 1995-2017 Mark Adler ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++#include "zutil.h" ++#include "inftrees.h" ++#include "inflate.h" ++#include "inffast.h" ++#include "chunkcopy.h" ++ ++#ifdef ASMINF ++# pragma message("Assembler code may have bugs -- use at your own risk") ++#else ++ ++/* ++ Decode literal, length, and distance codes and write out the resulting ++ literal and match bytes until either not enough input or output is ++ available, an end-of-block is encountered, or a data error is encountered. ++ When large enough input and output buffers are supplied to inflate(), for ++ example, a 16K input buffer and a 64K output buffer, more than 95% of the ++ inflate execution time is spent in this routine. ++ ++ Entry assumptions: ++ ++ state->mode == LEN ++ strm->avail_in >= 6 ++ strm->avail_out >= 258 ++ start >= strm->avail_out ++ state->bits < 8 ++ ++ On return, state->mode is one of: ++ ++ LEN -- ran out of enough output space or enough available input ++ TYPE -- reached end of block code, inflate() to interpret next block ++ BAD -- error in block data ++ ++ Notes: ++ ++ - The maximum input bits used by a length/distance pair is 15 bits for the ++ length code, 5 bits for the length extra, 15 bits for the distance code, ++ and 13 bits for the distance extra. This totals 48 bits, or six bytes. 
++ Therefore if strm->avail_in >= 6, then there is enough input to avoid ++ checking for available input while decoding. ++ ++ - The maximum bytes that a single length/distance pair can output is 258 ++ bytes, which is the maximum length that can be coded. inflate_fast() ++ requires strm->avail_out >= 258 for each loop to avoid checking for ++ output space. ++ */ ++void ZLIB_INTERNAL inflate_fast(strm, start) ++z_streamp strm; ++unsigned start; /* inflate()'s starting value for strm->avail_out */ ++{ ++ struct inflate_state FAR *state; ++ z_const unsigned char FAR *in; /* local strm->next_in */ ++ z_const unsigned char FAR *last; /* have enough input while in < last */ ++ unsigned char FAR *out; /* local strm->next_out */ ++ unsigned char FAR *beg; /* inflate()'s initial strm->next_out */ ++ unsigned char FAR *end; /* while out < end, enough space available */ ++ unsigned char FAR *limit; /* safety limit for chunky copies */ ++#ifdef INFLATE_STRICT ++ unsigned dmax; /* maximum distance from zlib header */ ++#endif ++ unsigned wsize; /* window size or zero if not using window */ ++ unsigned whave; /* valid bytes in the window */ ++ unsigned wnext; /* window write index */ ++ unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */ ++ unsigned long hold; /* local strm->hold */ ++ unsigned bits; /* local strm->bits */ ++ code const FAR *lcode; /* local strm->lencode */ ++ code const FAR *dcode; /* local strm->distcode */ ++ unsigned lmask; /* mask for first level of length codes */ ++ unsigned dmask; /* mask for first level of distance codes */ ++ code here; /* retrieved table entry */ ++ unsigned op; /* code bits, operation, extra bits, or */ ++ /* window position, window bytes to copy */ ++ unsigned len; /* match length, unused bytes */ ++ unsigned dist; /* match distance */ ++ unsigned char FAR *from; /* where to copy match from */ ++ ++ /* copy state to local variables */ ++ state = (struct inflate_state FAR *)strm->state; ++ in = strm->next_in; ++ last = in + (strm->avail_in - 5); ++ out = strm->next_out; ++ beg = out - (start - strm->avail_out); ++ end = out + (strm->avail_out - 257); ++ limit = out + strm->avail_out; ++#ifdef INFLATE_STRICT ++ dmax = state->dmax; ++#endif ++ wsize = state->wsize; ++ whave = state->whave; ++ wnext = (state->wnext == 0 && whave >= wsize) ? wsize : state->wnext; ++ window = state->window; ++ hold = state->hold; ++ bits = state->bits; ++ lcode = state->lencode; ++ dcode = state->distcode; ++ lmask = (1U << state->lenbits) - 1; ++ dmask = (1U << state->distbits) - 1; ++ ++ /* decode literals and length/distances until end-of-block or not enough ++ input data or output space */ ++ do { ++ if (bits < 15) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ } ++ here = lcode[hold & lmask]; ++ dolen: ++ op = (unsigned)(here.bits); ++ hold >>= op; ++ bits -= op; ++ op = (unsigned)(here.op); ++ if (op == 0) { /* literal */ ++ Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ? 
++ "inflate: literal '%c'\n" : ++ "inflate: literal 0x%02x\n", here.val)); ++ *out++ = (unsigned char)(here.val); ++ } ++ else if (op & 16) { /* length base */ ++ len = (unsigned)(here.val); ++ op &= 15; /* number of extra bits */ ++ if (op) { ++ if (bits < op) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ } ++ len += (unsigned)hold & ((1U << op) - 1); ++ hold >>= op; ++ bits -= op; ++ } ++ Tracevv((stderr, "inflate: length %u\n", len)); ++ if (bits < 15) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ } ++ here = dcode[hold & dmask]; ++ dodist: ++ op = (unsigned)(here.bits); ++ hold >>= op; ++ bits -= op; ++ op = (unsigned)(here.op); ++ if (op & 16) { /* distance base */ ++ dist = (unsigned)(here.val); ++ op &= 15; /* number of extra bits */ ++ if (bits < op) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ if (bits < op) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ } ++ } ++ dist += (unsigned)hold & ((1U << op) - 1); ++#ifdef INFLATE_STRICT ++ if (dist > dmax) { ++ strm->msg = (char *)"invalid distance too far back"; ++ state->mode = BAD; ++ break; ++ } ++#endif ++ hold >>= op; ++ bits -= op; ++ Tracevv((stderr, "inflate: distance %u\n", dist)); ++ op = (unsigned)(out - beg); /* max distance in output */ ++ if (dist > op) { /* see if copy from window */ ++ op = dist - op; /* distance back in window */ ++ if (op > whave) { ++ if (state->sane) { ++ strm->msg = ++ (char *)"invalid distance too far back"; ++ state->mode = BAD; ++ break; ++ } ++#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR ++ if (len <= op - whave) { ++ do { ++ *out++ = 0; ++ } while (--len); ++ continue; ++ } ++ len -= op - whave; ++ do { ++ *out++ = 0; ++ } while (--op > whave); ++ if (op == 0) { ++ from = out - dist; ++ do { ++ *out++ = *from++; ++ } while (--len); ++ continue; ++ } ++#endif ++ } ++ from = window; ++ if (wnext >= op) { /* contiguous in window */ ++ from += wnext - op; ++ } ++ else { /* wrap around window */ ++ op -= wnext; ++ from += wsize - op; ++ if (op < len) { /* some from end of window */ ++ len -= op; ++ out = chunkcopy_safe(out, from, op, limit); ++ from = window; /* more from start of window */ ++ op = wnext; ++ /* This (rare) case can create a situation where ++ the first chunkcopy below must be checked. ++ */ ++ } ++ } ++ if (op < len) { /* still need some from output */ ++ out = chunkcopy_safe(out, from, op, limit); ++ len -= op; ++ /* When dist is small the amount of data that can be ++ copied from the window is also small, and progress ++ towards the dangerous end of the output buffer is ++ also small. This means that for trivial memsets and ++ for chunkunroll_relaxed() a safety check is ++ unnecessary. However, these conditions may not be ++ entered at all, and in that case it's possible that ++ the main copy is near the end. ++ */ ++ out = chunkunroll_relaxed(out, &dist, &len); ++ out = chunkcopy_safe(out, out - dist, len, limit); ++ } else { ++ /* from points to window, so there is no risk of ++ overlapping pointers requiring memset-like behaviour ++ */ ++ out = chunkcopy_safe(out, from, len, limit); ++ } ++ } ++ else { ++ /* Whole reference is in range of current output. No ++ range checks are necessary because we start with room ++ for at least 258 bytes of output, so unroll and roundoff ++ operations can write beyond `out+len` so long as they ++ stay within 258 bytes of `out`. 
++ */ ++ out = chunkcopy_lapped_relaxed(out, dist, len); ++ } ++ } ++ else if ((op & 64) == 0) { /* 2nd level distance code */ ++ here = dcode[here.val + (hold & ((1U << op) - 1))]; ++ goto dodist; ++ } ++ else { ++ strm->msg = (char *)"invalid distance code"; ++ state->mode = BAD; ++ break; ++ } ++ } ++ else if ((op & 64) == 0) { /* 2nd level length code */ ++ here = lcode[here.val + (hold & ((1U << op) - 1))]; ++ goto dolen; ++ } ++ else if (op & 32) { /* end-of-block */ ++ Tracevv((stderr, "inflate: end of block\n")); ++ state->mode = TYPE; ++ break; ++ } ++ else { ++ strm->msg = (char *)"invalid literal/length code"; ++ state->mode = BAD; ++ break; ++ } ++ } while (in < last && out < end); ++ ++ /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ ++ len = bits >> 3; ++ in -= len; ++ bits -= len << 3; ++ hold &= (1U << bits) - 1; ++ ++ /* update state and return */ ++ strm->next_in = in; ++ strm->next_out = out; ++ strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last)); ++ strm->avail_out = (unsigned)(out < end ? ++ 257 + (end - out) : 257 - (out - end)); ++ state->hold = hold; ++ state->bits = bits; ++ return; ++} ++ ++/* ++ inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe): ++ - Using bit fields for code structure ++ - Different op definition to avoid & for extra bits (do & for table bits) ++ - Three separate decoding do-loops for direct, window, and wnext == 0 ++ - Special case for distance > 1 copies to do overlapped load and store copy ++ - Explicit branch predictions (based on measured branch probabilities) ++ - Deferring match copy and interspersed it with decoding subsequent codes ++ - Swapping literal/length else ++ - Swapping window/direct else ++ - Larger unrolled copy loops (three is about right) ++ - Moving len -= 3 statement into middle of loop ++ */ ++ ++#endif /* !ASMINF */ +diff --git a/contrib/arm/inflate.c b/contrib/arm/inflate.c +new file mode 100644 +index 0000000..23e95f1 +--- /dev/null ++++ b/contrib/arm/inflate.c +@@ -0,0 +1,1572 @@ ++/* inflate.c -- zlib decompression ++ * Copyright (C) 1995-2016 Mark Adler ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++/* ++ * Change history: ++ * ++ * 1.2.beta0 24 Nov 2002 ++ * - First version -- complete rewrite of inflate to simplify code, avoid ++ * creation of window when not needed, minimize use of window when it is ++ * needed, make inffast.c even faster, implement gzip decoding, and to ++ * improve code readability and style over the previous zlib inflate code ++ * ++ * 1.2.beta1 25 Nov 2002 ++ * - Use pointers for available input and output checking in inffast.c ++ * - Remove input and output counters in inffast.c ++ * - Change inffast.c entry and loop from avail_in >= 7 to >= 6 ++ * - Remove unnecessary second byte pull from length extra in inffast.c ++ * - Unroll direct copy to three copies per loop in inffast.c ++ * ++ * 1.2.beta2 4 Dec 2002 ++ * - Change external routine names to reduce potential conflicts ++ * - Correct filename to inffixed.h for fixed tables in inflate.c ++ * - Make hbuf[] unsigned char to match parameter type in inflate.c ++ * - Change strm->next_out[-state->offset] to *(strm->next_out - state->offset) ++ * to avoid negation problem on Alphas (64 bit) in inflate.c ++ * ++ * 1.2.beta3 22 Dec 2002 ++ * - Add comments on state->bits assertion in inffast.c ++ * - Add comments on op field in inftrees.h ++ * - Fix bug in reuse of allocated window after inflateReset() ++ * - Remove bit fields--back 
to byte structure for speed ++ * - Remove distance extra == 0 check in inflate_fast()--only helps for lengths ++ * - Change post-increments to pre-increments in inflate_fast(), PPC biased? ++ * - Add compile time option, POSTINC, to use post-increments instead (Intel?) ++ * - Make MATCH copy in inflate() much faster for when inflate_fast() not used ++ * - Use local copies of stream next and avail values, as well as local bit ++ * buffer and bit count in inflate()--for speed when inflate_fast() not used ++ * ++ * 1.2.beta4 1 Jan 2003 ++ * - Split ptr - 257 statements in inflate_table() to avoid compiler warnings ++ * - Move a comment on output buffer sizes from inffast.c to inflate.c ++ * - Add comments in inffast.c to introduce the inflate_fast() routine ++ * - Rearrange window copies in inflate_fast() for speed and simplification ++ * - Unroll last copy for window match in inflate_fast() ++ * - Use local copies of window variables in inflate_fast() for speed ++ * - Pull out common wnext == 0 case for speed in inflate_fast() ++ * - Make op and len in inflate_fast() unsigned for consistency ++ * - Add FAR to lcode and dcode declarations in inflate_fast() ++ * - Simplified bad distance check in inflate_fast() ++ * - Added inflateBackInit(), inflateBack(), and inflateBackEnd() in new ++ * source file infback.c to provide a call-back interface to inflate for ++ * programs like gzip and unzip -- uses window as output buffer to avoid ++ * window copying ++ * ++ * 1.2.beta5 1 Jan 2003 ++ * - Improved inflateBack() interface to allow the caller to provide initial ++ * input in strm. ++ * - Fixed stored blocks bug in inflateBack() ++ * ++ * 1.2.beta6 4 Jan 2003 ++ * - Added comments in inffast.c on effectiveness of POSTINC ++ * - Typecasting all around to reduce compiler warnings ++ * - Changed loops from while (1) or do {} while (1) to for (;;), again to ++ * make compilers happy ++ * - Changed type of window in inflateBackInit() to unsigned char * ++ * ++ * 1.2.beta7 27 Jan 2003 ++ * - Changed many types to unsigned or unsigned short to avoid warnings ++ * - Added inflateCopy() function ++ * ++ * 1.2.0 9 Mar 2003 ++ * - Changed inflateBack() interface to provide separate opaque descriptors ++ * for the in() and out() functions ++ * - Changed inflateBack() argument and in_func typedef to swap the length ++ * and buffer address return values for the input function ++ * - Check next_in and next_out for Z_NULL on entry to inflate() ++ * ++ * The history for versions after 1.2.0 are in ChangeLog in zlib distribution. 
++ */ ++ ++#include "zutil.h" ++#include "inftrees.h" ++#include "inflate.h" ++#include "inffast.h" ++#include "contrib/arm/chunkcopy.h" ++ ++#ifdef MAKEFIXED ++# ifndef BUILDFIXED ++# define BUILDFIXED ++# endif ++#endif ++ ++/* function prototypes */ ++local int inflateStateCheck OF((z_streamp strm)); ++local void fixedtables OF((struct inflate_state FAR *state)); ++local int updatewindow OF((z_streamp strm, const unsigned char FAR *end, ++ unsigned copy)); ++#ifdef BUILDFIXED ++ void makefixed OF((void)); ++#endif ++local unsigned syncsearch OF((unsigned FAR *have, const unsigned char FAR *buf, ++ unsigned len)); ++ ++local int inflateStateCheck(strm) ++z_streamp strm; ++{ ++ struct inflate_state FAR *state; ++ if (strm == Z_NULL || ++ strm->zalloc == (alloc_func)0 || strm->zfree == (free_func)0) ++ return 1; ++ state = (struct inflate_state FAR *)strm->state; ++ if (state == Z_NULL || state->strm != strm || ++ state->mode < HEAD || state->mode > SYNC) ++ return 1; ++ return 0; ++} ++ ++int ZEXPORT inflateResetKeep(strm) ++z_streamp strm; ++{ ++ struct inflate_state FAR *state; ++ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ strm->total_in = strm->total_out = state->total = 0; ++ strm->msg = Z_NULL; ++ if (state->wrap) /* to support ill-conceived Java test suite */ ++ strm->adler = state->wrap & 1; ++ state->mode = HEAD; ++ state->last = 0; ++ state->havedict = 0; ++ state->dmax = 32768U; ++ state->head = Z_NULL; ++ state->hold = 0; ++ state->bits = 0; ++ state->lencode = state->distcode = state->next = state->codes; ++ state->sane = 1; ++ state->back = -1; ++ Tracev((stderr, "inflate: reset\n")); ++ return Z_OK; ++} ++ ++int ZEXPORT inflateReset(strm) ++z_streamp strm; ++{ ++ struct inflate_state FAR *state; ++ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ state->wsize = 0; ++ state->whave = 0; ++ state->wnext = 0; ++ return inflateResetKeep(strm); ++} ++ ++int ZEXPORT inflateReset2(strm, windowBits) ++z_streamp strm; ++int windowBits; ++{ ++ int wrap; ++ struct inflate_state FAR *state; ++ ++ /* get the state */ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ ++ /* extract wrap request from windowBits parameter */ ++ if (windowBits < 0) { ++ wrap = 0; ++ windowBits = -windowBits; ++ } ++ else { ++ wrap = (windowBits >> 4) + 5; ++#ifdef GUNZIP ++ if (windowBits < 48) ++ windowBits &= 15; ++#endif ++ } ++ ++ /* set number of window bits, free window if different */ ++ if (windowBits && (windowBits < 8 || windowBits > 15)) ++ return Z_STREAM_ERROR; ++ if (state->window != Z_NULL && state->wbits != (unsigned)windowBits) { ++ ZFREE(strm, state->window); ++ state->window = Z_NULL; ++ } ++ ++ /* update state and reset the rest of it */ ++ state->wrap = wrap; ++ state->wbits = (unsigned)windowBits; ++ return inflateReset(strm); ++} ++ ++int ZEXPORT inflateInit2_(strm, windowBits, version, stream_size) ++z_streamp strm; ++int windowBits; ++const char *version; ++int stream_size; ++{ ++ int ret; ++ struct inflate_state FAR *state; ++ ++ if (version == Z_NULL || version[0] != ZLIB_VERSION[0] || ++ stream_size != (int)(sizeof(z_stream))) ++ return Z_VERSION_ERROR; ++ if (strm == Z_NULL) return Z_STREAM_ERROR; ++ strm->msg = Z_NULL; /* in case we return an error */ ++ if (strm->zalloc == (alloc_func)0) { ++#ifdef Z_SOLO ++ return Z_STREAM_ERROR; ++#else ++ strm->zalloc = zcalloc; ++ strm->opaque = (voidpf)0; 
++#endif ++ } ++ if (strm->zfree == (free_func)0) ++#ifdef Z_SOLO ++ return Z_STREAM_ERROR; ++#else ++ strm->zfree = zcfree; ++#endif ++ state = (struct inflate_state FAR *) ++ ZALLOC(strm, 1, sizeof(struct inflate_state)); ++ if (state == Z_NULL) return Z_MEM_ERROR; ++ Tracev((stderr, "inflate: allocated\n")); ++ strm->state = (struct internal_state FAR *)state; ++ state->strm = strm; ++ state->window = Z_NULL; ++ state->mode = HEAD; /* to pass state test in inflateReset2() */ ++ state->check = 1L; /* 1L is the result of adler32() zero length data */ ++ ret = inflateReset2(strm, windowBits); ++ if (ret != Z_OK) { ++ ZFREE(strm, state); ++ strm->state = Z_NULL; ++ } ++ return ret; ++} ++ ++int ZEXPORT inflateInit_(strm, version, stream_size) ++z_streamp strm; ++const char *version; ++int stream_size; ++{ ++ return inflateInit2_(strm, DEF_WBITS, version, stream_size); ++} ++ ++int ZEXPORT inflatePrime(strm, bits, value) ++z_streamp strm; ++int bits; ++int value; ++{ ++ struct inflate_state FAR *state; ++ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ if (bits < 0) { ++ state->hold = 0; ++ state->bits = 0; ++ return Z_OK; ++ } ++ if (bits > 16 || state->bits + (uInt)bits > 32) return Z_STREAM_ERROR; ++ value &= (1L << bits) - 1; ++ state->hold += (unsigned)value << state->bits; ++ state->bits += (uInt)bits; ++ return Z_OK; ++} ++ ++/* ++ Return state with length and distance decoding tables and index sizes set to ++ fixed code decoding. Normally this returns fixed tables from inffixed.h. ++ If BUILDFIXED is defined, then instead this routine builds the tables the ++ first time it's called, and returns those tables the first time and ++ thereafter. This reduces the size of the code by about 2K bytes, in ++ exchange for a little execution time. However, BUILDFIXED should not be ++ used for threaded applications, since the rewriting of the tables and virgin ++ may not be thread-safe. ++ */ ++local void fixedtables(state) ++struct inflate_state FAR *state; ++{ ++#ifdef BUILDFIXED ++ static int virgin = 1; ++ static code *lenfix, *distfix; ++ static code fixed[544]; ++ ++ /* build fixed huffman tables if first call (may not be thread safe) */ ++ if (virgin) { ++ unsigned sym, bits; ++ static code *next; ++ ++ /* literal/length table */ ++ sym = 0; ++ while (sym < 144) state->lens[sym++] = 8; ++ while (sym < 256) state->lens[sym++] = 9; ++ while (sym < 280) state->lens[sym++] = 7; ++ while (sym < 288) state->lens[sym++] = 8; ++ next = fixed; ++ lenfix = next; ++ bits = 9; ++ inflate_table(LENS, state->lens, 288, &(next), &(bits), state->work); ++ ++ /* distance table */ ++ sym = 0; ++ while (sym < 32) state->lens[sym++] = 5; ++ distfix = next; ++ bits = 5; ++ inflate_table(DISTS, state->lens, 32, &(next), &(bits), state->work); ++ ++ /* do this just once */ ++ virgin = 0; ++ } ++#else /* !BUILDFIXED */ ++# include "inffixed.h" ++#endif /* BUILDFIXED */ ++ state->lencode = lenfix; ++ state->lenbits = 9; ++ state->distcode = distfix; ++ state->distbits = 5; ++} ++ ++#ifdef MAKEFIXED ++#include ++ ++/* ++ Write out the inffixed.h that is #include'd above. Defining MAKEFIXED also ++ defines BUILDFIXED, so the tables are built on the fly. makefixed() writes ++ those tables to stdout, which would be piped to inffixed.h. 
A small program ++ can simply call makefixed to do this: ++ ++ void makefixed(void); ++ ++ int main(void) ++ { ++ makefixed(); ++ return 0; ++ } ++ ++ Then that can be linked with zlib built with MAKEFIXED defined and run: ++ ++ a.out > inffixed.h ++ */ ++void makefixed() ++{ ++ unsigned low, size; ++ struct inflate_state state; ++ ++ fixedtables(&state); ++ puts(" /* inffixed.h -- table for decoding fixed codes"); ++ puts(" * Generated automatically by makefixed()."); ++ puts(" */"); ++ puts(""); ++ puts(" /* WARNING: this file should *not* be used by applications."); ++ puts(" It is part of the implementation of this library and is"); ++ puts(" subject to change. Applications should only use zlib.h."); ++ puts(" */"); ++ puts(""); ++ size = 1U << 9; ++ printf(" static const code lenfix[%u] = {", size); ++ low = 0; ++ for (;;) { ++ if ((low % 7) == 0) printf("\n "); ++ printf("{%u,%u,%d}", (low & 127) == 99 ? 64 : state.lencode[low].op, ++ state.lencode[low].bits, state.lencode[low].val); ++ if (++low == size) break; ++ putchar(','); ++ } ++ puts("\n };"); ++ size = 1U << 5; ++ printf("\n static const code distfix[%u] = {", size); ++ low = 0; ++ for (;;) { ++ if ((low % 6) == 0) printf("\n "); ++ printf("{%u,%u,%d}", state.distcode[low].op, state.distcode[low].bits, ++ state.distcode[low].val); ++ if (++low == size) break; ++ putchar(','); ++ } ++ puts("\n };"); ++} ++#endif /* MAKEFIXED */ ++ ++/* ++ Update the window with the last wsize (normally 32K) bytes written before ++ returning. If window does not exist yet, create it. This is only called ++ when a window is already in use, or when output has been written during this ++ inflate call, but the end of the deflate stream has not been reached yet. ++ It is also called to create a window for dictionary data when a dictionary ++ is loaded. ++ ++ Providing output buffers larger than 32K to inflate() should provide a speed ++ advantage, since only the last 32K of output is copied to the sliding window ++ upon return from inflate(), and since all distances after the first 32K of ++ output will fall in the output data, making match copies simpler and faster. ++ The advantage may be dependent on the size of the processor's data caches. ++ */ ++local int updatewindow(strm, end, copy) ++z_streamp strm; ++const Bytef *end; ++unsigned copy; ++{ ++ struct inflate_state FAR *state; ++ unsigned dist; ++ ++ state = (struct inflate_state FAR *)strm->state; ++ ++ /* if it hasn't been done already, allocate space for the window */ ++ if (state->window == Z_NULL) { ++ unsigned wsize = 1U << state->wbits; ++ state->window = (unsigned char FAR *) ++ ZALLOC(strm, wsize + CHUNKCOPY_CHUNK_SIZE, ++ sizeof(unsigned char)); ++ if (state->window == Z_NULL) return 1; ++#ifdef INFLATE_CLEAR_UNUSED_UNDEFINED ++ /* Copies from the overflow portion of this buffer are undefined and ++ may cause analysis tools to raise a warning if we don't initialize ++ it. However, this undefined data overwrites other undefined data ++ and is subsequently either overwritten or left deliberately ++ undefined at the end of decode; so there's really no point. 
++ */ ++ memset(state->window + wsize, 0, CHUNKCOPY_CHUNK_SIZE); ++#endif ++ } ++ ++ /* if window not in use yet, initialize */ ++ if (state->wsize == 0) { ++ state->wsize = 1U << state->wbits; ++ state->wnext = 0; ++ state->whave = 0; ++ } ++ ++ /* copy state->wsize or less output bytes into the circular window */ ++ if (copy >= state->wsize) { ++ zmemcpy(state->window, end - state->wsize, state->wsize); ++ state->wnext = 0; ++ state->whave = state->wsize; ++ } ++ else { ++ dist = state->wsize - state->wnext; ++ if (dist > copy) dist = copy; ++ zmemcpy(state->window + state->wnext, end - copy, dist); ++ copy -= dist; ++ if (copy) { ++ zmemcpy(state->window, end - copy, copy); ++ state->wnext = copy; ++ state->whave = state->wsize; ++ } ++ else { ++ state->wnext += dist; ++ if (state->wnext == state->wsize) state->wnext = 0; ++ if (state->whave < state->wsize) state->whave += dist; ++ } ++ } ++ return 0; ++} ++ ++/* Macros for inflate(): */ ++ ++/* check function to use adler32() for zlib or crc32() for gzip */ ++#ifdef GUNZIP ++# define UPDATE(check, buf, len) \ ++ (state->flags ? crc32(check, buf, len) : adler32(check, buf, len)) ++#else ++# define UPDATE(check, buf, len) adler32(check, buf, len) ++#endif ++ ++/* check macros for header crc */ ++#ifdef GUNZIP ++# define CRC2(check, word) \ ++ do { \ ++ hbuf[0] = (unsigned char)(word); \ ++ hbuf[1] = (unsigned char)((word) >> 8); \ ++ check = crc32(check, hbuf, 2); \ ++ } while (0) ++ ++# define CRC4(check, word) \ ++ do { \ ++ hbuf[0] = (unsigned char)(word); \ ++ hbuf[1] = (unsigned char)((word) >> 8); \ ++ hbuf[2] = (unsigned char)((word) >> 16); \ ++ hbuf[3] = (unsigned char)((word) >> 24); \ ++ check = crc32(check, hbuf, 4); \ ++ } while (0) ++#endif ++ ++/* Load registers with state in inflate() for speed */ ++#define LOAD() \ ++ do { \ ++ put = strm->next_out; \ ++ left = strm->avail_out; \ ++ next = strm->next_in; \ ++ have = strm->avail_in; \ ++ hold = state->hold; \ ++ bits = state->bits; \ ++ } while (0) ++ ++/* Restore state from registers in inflate() */ ++#define RESTORE() \ ++ do { \ ++ strm->next_out = put; \ ++ strm->avail_out = left; \ ++ strm->next_in = next; \ ++ strm->avail_in = have; \ ++ state->hold = hold; \ ++ state->bits = bits; \ ++ } while (0) ++ ++/* Clear the input bit accumulator */ ++#define INITBITS() \ ++ do { \ ++ hold = 0; \ ++ bits = 0; \ ++ } while (0) ++ ++/* Get a byte of input into the bit accumulator, or return from inflate() ++ if there is no input available. */ ++#define PULLBYTE() \ ++ do { \ ++ if (have == 0) goto inf_leave; \ ++ have--; \ ++ hold += (unsigned long)(*next++) << bits; \ ++ bits += 8; \ ++ } while (0) ++ ++/* Assure that there are at least n bits in the bit accumulator. If there is ++ not enough available input to do that, then return from inflate(). */ ++#define NEEDBITS(n) \ ++ do { \ ++ while (bits < (unsigned)(n)) \ ++ PULLBYTE(); \ ++ } while (0) ++ ++/* Return the low n bits of the bit accumulator (n < 16) */ ++#define BITS(n) \ ++ ((unsigned)hold & ((1U << (n)) - 1)) ++ ++/* Remove n bits from the bit accumulator */ ++#define DROPBITS(n) \ ++ do { \ ++ hold >>= (n); \ ++ bits -= (unsigned)(n); \ ++ } while (0) ++ ++/* Remove zero to seven bits as needed to go to a byte boundary */ ++#define BYTEBITS() \ ++ do { \ ++ hold >>= bits & 7; \ ++ bits -= bits & 7; \ ++ } while (0) ++ ++/* ++ inflate() uses a state machine to process as much input data and generate as ++ much output data as possible before returning. 
The state machine is ++ structured roughly as follows: ++ ++ for (;;) switch (state) { ++ ... ++ case STATEn: ++ if (not enough input data or output space to make progress) ++ return; ++ ... make progress ... ++ state = STATEm; ++ break; ++ ... ++ } ++ ++ so when inflate() is called again, the same case is attempted again, and ++ if the appropriate resources are provided, the machine proceeds to the ++ next state. The NEEDBITS() macro is usually the way the state evaluates ++ whether it can proceed or should return. NEEDBITS() does the return if ++ the requested bits are not available. The typical use of the BITS macros ++ is: ++ ++ NEEDBITS(n); ++ ... do something with BITS(n) ... ++ DROPBITS(n); ++ ++ where NEEDBITS(n) either returns from inflate() if there isn't enough ++ input left to load n bits into the accumulator, or it continues. BITS(n) ++ gives the low n bits in the accumulator. When done, DROPBITS(n) drops ++ the low n bits off the accumulator. INITBITS() clears the accumulator ++ and sets the number of available bits to zero. BYTEBITS() discards just ++ enough bits to put the accumulator on a byte boundary. After BYTEBITS() ++ and a NEEDBITS(8), then BITS(8) would return the next byte in the stream. ++ ++ NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return ++ if there is no input available. The decoding of variable length codes uses ++ PULLBYTE() directly in order to pull just enough bytes to decode the next ++ code, and no more. ++ ++ Some states loop until they get enough input, making sure that enough ++ state information is maintained to continue the loop where it left off ++ if NEEDBITS() returns in the loop. For example, want, need, and keep ++ would all have to actually be part of the saved state in case NEEDBITS() ++ returns: ++ ++ case STATEw: ++ while (want < need) { ++ NEEDBITS(n); ++ keep[want++] = BITS(n); ++ DROPBITS(n); ++ } ++ state = STATEx; ++ case STATEx: ++ ++ As shown above, if the next state is also the next case, then the break ++ is omitted. ++ ++ A state may also return if there is not enough output space available to ++ complete that state. Those states are copying stored data, writing a ++ literal byte, and copying a matching string. ++ ++ When returning, a "goto inf_leave" is used to update the total counters, ++ update the check value, and determine whether any progress has been made ++ during that inflate() call in order to return the proper return code. ++ Progress is defined as a change in either strm->avail_in or strm->avail_out. ++ When there is a window, goto inf_leave will update the window with the last ++ output written. If a goto inf_leave occurs in the middle of decompression ++ and there is no window currently, goto inf_leave will create one and copy ++ output to the window for the next call of inflate(). ++ ++ In this implementation, the flush parameter of inflate() only affects the ++ return code (per zlib.h). inflate() always writes as much as possible to ++ strm->next_out, given the space available and the provided input--the effect ++ documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers ++ the allocation of and copying into a sliding window until necessary, which ++ provides the effect documented in zlib.h for Z_FINISH when the entire input ++ stream available. So the only thing the flush parameter actually does is: ++ when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it ++ will return Z_BUF_ERROR if it has not reached the end of the stream. 
++ */ ++ ++int ZEXPORT inflate(strm, flush) ++z_streamp strm; ++int flush; ++{ ++ struct inflate_state FAR *state; ++ z_const unsigned char FAR *next; /* next input */ ++ unsigned char FAR *put; /* next output */ ++ unsigned have, left; /* available input and output */ ++ unsigned long hold; /* bit buffer */ ++ unsigned bits; /* bits in bit buffer */ ++ unsigned in, out; /* save starting available input and output */ ++ unsigned copy; /* number of stored or match bytes to copy */ ++ unsigned char FAR *from; /* where to copy match bytes from */ ++ code here; /* current decoding table entry */ ++ code last; /* parent table entry */ ++ unsigned len; /* length to copy for repeats, bits to drop */ ++ int ret; /* return code */ ++#ifdef GUNZIP ++ unsigned char hbuf[4]; /* buffer for gzip header crc calculation */ ++#endif ++ static const unsigned short order[19] = /* permutation of code lengths */ ++ {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; ++ ++ if (inflateStateCheck(strm) || strm->next_out == Z_NULL || ++ (strm->next_in == Z_NULL && strm->avail_in != 0)) ++ return Z_STREAM_ERROR; ++ ++ state = (struct inflate_state FAR *)strm->state; ++ if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */ ++ LOAD(); ++ in = have; ++ out = left; ++ ret = Z_OK; ++ for (;;) ++ switch (state->mode) { ++ case HEAD: ++ if (state->wrap == 0) { ++ state->mode = TYPEDO; ++ break; ++ } ++ NEEDBITS(16); ++#ifdef GUNZIP ++ if ((state->wrap & 2) && hold == 0x8b1f) { /* gzip header */ ++ if (state->wbits == 0) ++ state->wbits = 15; ++ state->check = crc32(0L, Z_NULL, 0); ++ CRC2(state->check, hold); ++ INITBITS(); ++ state->mode = FLAGS; ++ break; ++ } ++ state->flags = 0; /* expect zlib header */ ++ if (state->head != Z_NULL) ++ state->head->done = -1; ++ if (!(state->wrap & 1) || /* check if zlib header allowed */ ++#else ++ if ( ++#endif ++ ((BITS(8) << 8) + (hold >> 8)) % 31) { ++ strm->msg = (char *)"incorrect header check"; ++ state->mode = BAD; ++ break; ++ } ++ if (BITS(4) != Z_DEFLATED) { ++ strm->msg = (char *)"unknown compression method"; ++ state->mode = BAD; ++ break; ++ } ++ DROPBITS(4); ++ len = BITS(4) + 8; ++ if (state->wbits == 0) ++ state->wbits = len; ++ if (len > 15 || len > state->wbits) { ++ strm->msg = (char *)"invalid window size"; ++ state->mode = BAD; ++ break; ++ } ++ state->dmax = 1U << len; ++ Tracev((stderr, "inflate: zlib header ok\n")); ++ strm->adler = state->check = adler32(0L, Z_NULL, 0); ++ state->mode = hold & 0x200 ? 
DICTID : TYPE; ++ INITBITS(); ++ break; ++#ifdef GUNZIP ++ case FLAGS: ++ NEEDBITS(16); ++ state->flags = (int)(hold); ++ if ((state->flags & 0xff) != Z_DEFLATED) { ++ strm->msg = (char *)"unknown compression method"; ++ state->mode = BAD; ++ break; ++ } ++ if (state->flags & 0xe000) { ++ strm->msg = (char *)"unknown header flags set"; ++ state->mode = BAD; ++ break; ++ } ++ if (state->head != Z_NULL) ++ state->head->text = (int)((hold >> 8) & 1); ++ if ((state->flags & 0x0200) && (state->wrap & 4)) ++ CRC2(state->check, hold); ++ INITBITS(); ++ state->mode = TIME; ++ case TIME: ++ NEEDBITS(32); ++ if (state->head != Z_NULL) ++ state->head->time = hold; ++ if ((state->flags & 0x0200) && (state->wrap & 4)) ++ CRC4(state->check, hold); ++ INITBITS(); ++ state->mode = OS; ++ case OS: ++ NEEDBITS(16); ++ if (state->head != Z_NULL) { ++ state->head->xflags = (int)(hold & 0xff); ++ state->head->os = (int)(hold >> 8); ++ } ++ if ((state->flags & 0x0200) && (state->wrap & 4)) ++ CRC2(state->check, hold); ++ INITBITS(); ++ state->mode = EXLEN; ++ case EXLEN: ++ if (state->flags & 0x0400) { ++ NEEDBITS(16); ++ state->length = (unsigned)(hold); ++ if (state->head != Z_NULL) ++ state->head->extra_len = (unsigned)hold; ++ if ((state->flags & 0x0200) && (state->wrap & 4)) ++ CRC2(state->check, hold); ++ INITBITS(); ++ } ++ else if (state->head != Z_NULL) ++ state->head->extra = Z_NULL; ++ state->mode = EXTRA; ++ case EXTRA: ++ if (state->flags & 0x0400) { ++ copy = state->length; ++ if (copy > have) copy = have; ++ if (copy) { ++ if (state->head != Z_NULL && ++ state->head->extra != Z_NULL) { ++ len = state->head->extra_len - state->length; ++ zmemcpy(state->head->extra + len, next, ++ len + copy > state->head->extra_max ? ++ state->head->extra_max - len : copy); ++ } ++ if ((state->flags & 0x0200) && (state->wrap & 4)) ++ state->check = crc32(state->check, next, copy); ++ have -= copy; ++ next += copy; ++ state->length -= copy; ++ } ++ if (state->length) goto inf_leave; ++ } ++ state->length = 0; ++ state->mode = NAME; ++ case NAME: ++ if (state->flags & 0x0800) { ++ if (have == 0) goto inf_leave; ++ copy = 0; ++ do { ++ len = (unsigned)(next[copy++]); ++ if (state->head != Z_NULL && ++ state->head->name != Z_NULL && ++ state->length < state->head->name_max) ++ state->head->name[state->length++] = (Bytef)len; ++ } while (len && copy < have); ++ if ((state->flags & 0x0200) && (state->wrap & 4)) ++ state->check = crc32(state->check, next, copy); ++ have -= copy; ++ next += copy; ++ if (len) goto inf_leave; ++ } ++ else if (state->head != Z_NULL) ++ state->head->name = Z_NULL; ++ state->length = 0; ++ state->mode = COMMENT; ++ case COMMENT: ++ if (state->flags & 0x1000) { ++ if (have == 0) goto inf_leave; ++ copy = 0; ++ do { ++ len = (unsigned)(next[copy++]); ++ if (state->head != Z_NULL && ++ state->head->comment != Z_NULL && ++ state->length < state->head->comm_max) ++ state->head->comment[state->length++] = (Bytef)len; ++ } while (len && copy < have); ++ if ((state->flags & 0x0200) && (state->wrap & 4)) ++ state->check = crc32(state->check, next, copy); ++ have -= copy; ++ next += copy; ++ if (len) goto inf_leave; ++ } ++ else if (state->head != Z_NULL) ++ state->head->comment = Z_NULL; ++ state->mode = HCRC; ++ case HCRC: ++ if (state->flags & 0x0200) { ++ NEEDBITS(16); ++ if ((state->wrap & 4) && hold != (state->check & 0xffff)) { ++ strm->msg = (char *)"header crc mismatch"; ++ state->mode = BAD; ++ break; ++ } ++ INITBITS(); ++ } ++ if (state->head != Z_NULL) { ++ state->head->hcrc = 
(int)((state->flags >> 9) & 1); ++ state->head->done = 1; ++ } ++ strm->adler = state->check = crc32(0L, Z_NULL, 0); ++ state->mode = TYPE; ++ break; ++#endif ++ case DICTID: ++ NEEDBITS(32); ++ strm->adler = state->check = ZSWAP32(hold); ++ INITBITS(); ++ state->mode = DICT; ++ case DICT: ++ if (state->havedict == 0) { ++ RESTORE(); ++ return Z_NEED_DICT; ++ } ++ strm->adler = state->check = adler32(0L, Z_NULL, 0); ++ state->mode = TYPE; ++ case TYPE: ++ if (flush == Z_BLOCK || flush == Z_TREES) goto inf_leave; ++ case TYPEDO: ++ if (state->last) { ++ BYTEBITS(); ++ state->mode = CHECK; ++ break; ++ } ++ NEEDBITS(3); ++ state->last = BITS(1); ++ DROPBITS(1); ++ switch (BITS(2)) { ++ case 0: /* stored block */ ++ Tracev((stderr, "inflate: stored block%s\n", ++ state->last ? " (last)" : "")); ++ state->mode = STORED; ++ break; ++ case 1: /* fixed block */ ++ fixedtables(state); ++ Tracev((stderr, "inflate: fixed codes block%s\n", ++ state->last ? " (last)" : "")); ++ state->mode = LEN_; /* decode codes */ ++ if (flush == Z_TREES) { ++ DROPBITS(2); ++ goto inf_leave; ++ } ++ break; ++ case 2: /* dynamic block */ ++ Tracev((stderr, "inflate: dynamic codes block%s\n", ++ state->last ? " (last)" : "")); ++ state->mode = TABLE; ++ break; ++ case 3: ++ strm->msg = (char *)"invalid block type"; ++ state->mode = BAD; ++ } ++ DROPBITS(2); ++ break; ++ case STORED: ++ BYTEBITS(); /* go to byte boundary */ ++ NEEDBITS(32); ++ if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) { ++ strm->msg = (char *)"invalid stored block lengths"; ++ state->mode = BAD; ++ break; ++ } ++ state->length = (unsigned)hold & 0xffff; ++ Tracev((stderr, "inflate: stored length %u\n", ++ state->length)); ++ INITBITS(); ++ state->mode = COPY_; ++ if (flush == Z_TREES) goto inf_leave; ++ case COPY_: ++ state->mode = COPY; ++ case COPY: ++ copy = state->length; ++ if (copy) { ++ if (copy > have) copy = have; ++ if (copy > left) copy = left; ++ if (copy == 0) goto inf_leave; ++ zmemcpy(put, next, copy); ++ have -= copy; ++ next += copy; ++ left -= copy; ++ put += copy; ++ state->length -= copy; ++ break; ++ } ++ Tracev((stderr, "inflate: stored end\n")); ++ state->mode = TYPE; ++ break; ++ case TABLE: ++ NEEDBITS(14); ++ state->nlen = BITS(5) + 257; ++ DROPBITS(5); ++ state->ndist = BITS(5) + 1; ++ DROPBITS(5); ++ state->ncode = BITS(4) + 4; ++ DROPBITS(4); ++#ifndef PKZIP_BUG_WORKAROUND ++ if (state->nlen > 286 || state->ndist > 30) { ++ strm->msg = (char *)"too many length or distance symbols"; ++ state->mode = BAD; ++ break; ++ } ++#endif ++ Tracev((stderr, "inflate: table sizes ok\n")); ++ state->have = 0; ++ state->mode = LENLENS; ++ case LENLENS: ++ while (state->have < state->ncode) { ++ NEEDBITS(3); ++ state->lens[order[state->have++]] = (unsigned short)BITS(3); ++ DROPBITS(3); ++ } ++ while (state->have < 19) ++ state->lens[order[state->have++]] = 0; ++ state->next = state->codes; ++ state->lencode = (const code FAR *)(state->next); ++ state->lenbits = 7; ++ ret = inflate_table(CODES, state->lens, 19, &(state->next), ++ &(state->lenbits), state->work); ++ if (ret) { ++ strm->msg = (char *)"invalid code lengths set"; ++ state->mode = BAD; ++ break; ++ } ++ Tracev((stderr, "inflate: code lengths ok\n")); ++ state->have = 0; ++ state->mode = CODELENS; ++ case CODELENS: ++ while (state->have < state->nlen + state->ndist) { ++ for (;;) { ++ here = state->lencode[BITS(state->lenbits)]; ++ if ((unsigned)(here.bits) <= bits) break; ++ PULLBYTE(); ++ } ++ if (here.val < 16) { ++ DROPBITS(here.bits); ++ state->lens[state->have++] = 
here.val; ++ } ++ else { ++ if (here.val == 16) { ++ NEEDBITS(here.bits + 2); ++ DROPBITS(here.bits); ++ if (state->have == 0) { ++ strm->msg = (char *)"invalid bit length repeat"; ++ state->mode = BAD; ++ break; ++ } ++ len = state->lens[state->have - 1]; ++ copy = 3 + BITS(2); ++ DROPBITS(2); ++ } ++ else if (here.val == 17) { ++ NEEDBITS(here.bits + 3); ++ DROPBITS(here.bits); ++ len = 0; ++ copy = 3 + BITS(3); ++ DROPBITS(3); ++ } ++ else { ++ NEEDBITS(here.bits + 7); ++ DROPBITS(here.bits); ++ len = 0; ++ copy = 11 + BITS(7); ++ DROPBITS(7); ++ } ++ if (state->have + copy > state->nlen + state->ndist) { ++ strm->msg = (char *)"invalid bit length repeat"; ++ state->mode = BAD; ++ break; ++ } ++ while (copy--) ++ state->lens[state->have++] = (unsigned short)len; ++ } ++ } ++ ++ /* handle error breaks in while */ ++ if (state->mode == BAD) break; ++ ++ /* check for end-of-block code (better have one) */ ++ if (state->lens[256] == 0) { ++ strm->msg = (char *)"invalid code -- missing end-of-block"; ++ state->mode = BAD; ++ break; ++ } ++ ++ /* build code tables -- note: do not change the lenbits or distbits ++ values here (9 and 6) without reading the comments in inftrees.h ++ concerning the ENOUGH constants, which depend on those values */ ++ state->next = state->codes; ++ state->lencode = (const code FAR *)(state->next); ++ state->lenbits = 9; ++ ret = inflate_table(LENS, state->lens, state->nlen, &(state->next), ++ &(state->lenbits), state->work); ++ if (ret) { ++ strm->msg = (char *)"invalid literal/lengths set"; ++ state->mode = BAD; ++ break; ++ } ++ state->distcode = (const code FAR *)(state->next); ++ state->distbits = 6; ++ ret = inflate_table(DISTS, state->lens + state->nlen, state->ndist, ++ &(state->next), &(state->distbits), state->work); ++ if (ret) { ++ strm->msg = (char *)"invalid distances set"; ++ state->mode = BAD; ++ break; ++ } ++ Tracev((stderr, "inflate: codes ok\n")); ++ state->mode = LEN_; ++ if (flush == Z_TREES) goto inf_leave; ++ case LEN_: ++ state->mode = LEN; ++ case LEN: ++ if (have >= 6 && left >= 258) { ++ RESTORE(); ++ inflate_fast(strm, out); ++ LOAD(); ++ if (state->mode == TYPE) ++ state->back = -1; ++ break; ++ } ++ state->back = 0; ++ for (;;) { ++ here = state->lencode[BITS(state->lenbits)]; ++ if ((unsigned)(here.bits) <= bits) break; ++ PULLBYTE(); ++ } ++ if (here.op && (here.op & 0xf0) == 0) { ++ last = here; ++ for (;;) { ++ here = state->lencode[last.val + ++ (BITS(last.bits + last.op) >> last.bits)]; ++ if ((unsigned)(last.bits + here.bits) <= bits) break; ++ PULLBYTE(); ++ } ++ DROPBITS(last.bits); ++ state->back += last.bits; ++ } ++ DROPBITS(here.bits); ++ state->back += here.bits; ++ state->length = (unsigned)here.val; ++ if ((int)(here.op) == 0) { ++ Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ? 
++ "inflate: literal '%c'\n" : ++ "inflate: literal 0x%02x\n", here.val)); ++ state->mode = LIT; ++ break; ++ } ++ if (here.op & 32) { ++ Tracevv((stderr, "inflate: end of block\n")); ++ state->back = -1; ++ state->mode = TYPE; ++ break; ++ } ++ if (here.op & 64) { ++ strm->msg = (char *)"invalid literal/length code"; ++ state->mode = BAD; ++ break; ++ } ++ state->extra = (unsigned)(here.op) & 15; ++ state->mode = LENEXT; ++ case LENEXT: ++ if (state->extra) { ++ NEEDBITS(state->extra); ++ state->length += BITS(state->extra); ++ DROPBITS(state->extra); ++ state->back += state->extra; ++ } ++ Tracevv((stderr, "inflate: length %u\n", state->length)); ++ state->was = state->length; ++ state->mode = DIST; ++ case DIST: ++ for (;;) { ++ here = state->distcode[BITS(state->distbits)]; ++ if ((unsigned)(here.bits) <= bits) break; ++ PULLBYTE(); ++ } ++ if ((here.op & 0xf0) == 0) { ++ last = here; ++ for (;;) { ++ here = state->distcode[last.val + ++ (BITS(last.bits + last.op) >> last.bits)]; ++ if ((unsigned)(last.bits + here.bits) <= bits) break; ++ PULLBYTE(); ++ } ++ DROPBITS(last.bits); ++ state->back += last.bits; ++ } ++ DROPBITS(here.bits); ++ state->back += here.bits; ++ if (here.op & 64) { ++ strm->msg = (char *)"invalid distance code"; ++ state->mode = BAD; ++ break; ++ } ++ state->offset = (unsigned)here.val; ++ state->extra = (unsigned)(here.op) & 15; ++ state->mode = DISTEXT; ++ case DISTEXT: ++ if (state->extra) { ++ NEEDBITS(state->extra); ++ state->offset += BITS(state->extra); ++ DROPBITS(state->extra); ++ state->back += state->extra; ++ } ++#ifdef INFLATE_STRICT ++ if (state->offset > state->dmax) { ++ strm->msg = (char *)"invalid distance too far back"; ++ state->mode = BAD; ++ break; ++ } ++#endif ++ Tracevv((stderr, "inflate: distance %u\n", state->offset)); ++ state->mode = MATCH; ++ case MATCH: ++ if (left == 0) goto inf_leave; ++ copy = out - left; ++ if (state->offset > copy) { /* copy from window */ ++ copy = state->offset - copy; ++ if (copy > state->whave) { ++ if (state->sane) { ++ strm->msg = (char *)"invalid distance too far back"; ++ state->mode = BAD; ++ break; ++ } ++#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR ++ Trace((stderr, "inflate.c too far\n")); ++ copy -= state->whave; ++ if (copy > state->length) copy = state->length; ++ if (copy > left) copy = left; ++ left -= copy; ++ state->length -= copy; ++ do { ++ *put++ = 0; ++ } while (--copy); ++ if (state->length == 0) state->mode = LEN; ++ break; ++#endif ++ } ++ if (copy > state->wnext) { ++ copy -= state->wnext; ++ from = state->window + (state->wsize - copy); ++ } ++ else ++ from = state->window + (state->wnext - copy); ++ if (copy > state->length) copy = state->length; ++ if (copy > left) copy = left; ++ put = chunkcopy_safe(put, from, copy, put + left); ++ } ++ else { /* copy from output */ ++ copy = state->length; ++ if (copy > left) copy = left; ++ put = chunkcopy_lapped_safe(put, state->offset, copy, put + left); ++ } ++ left -= copy; ++ state->length -= copy; ++ if (state->length == 0) state->mode = LEN; ++ break; ++ case LIT: ++ if (left == 0) goto inf_leave; ++ *put++ = (unsigned char)(state->length); ++ left--; ++ state->mode = LEN; ++ break; ++ case CHECK: ++ if (state->wrap) { ++ NEEDBITS(32); ++ out -= left; ++ strm->total_out += out; ++ state->total += out; ++ if ((state->wrap & 4) && out) ++ strm->adler = state->check = ++ UPDATE(state->check, put - out, out); ++ out = left; ++ if ((state->wrap & 4) && ( ++#ifdef GUNZIP ++ state->flags ? 
hold : ++#endif ++ ZSWAP32(hold)) != state->check) { ++ strm->msg = (char *)"incorrect data check"; ++ state->mode = BAD; ++ break; ++ } ++ INITBITS(); ++ Tracev((stderr, "inflate: check matches trailer\n")); ++ } ++#ifdef GUNZIP ++ state->mode = LENGTH; ++ case LENGTH: ++ if (state->wrap && state->flags) { ++ NEEDBITS(32); ++ if (hold != (state->total & 0xffffffffUL)) { ++ strm->msg = (char *)"incorrect length check"; ++ state->mode = BAD; ++ break; ++ } ++ INITBITS(); ++ Tracev((stderr, "inflate: length matches trailer\n")); ++ } ++#endif ++ state->mode = DONE; ++ case DONE: ++ ret = Z_STREAM_END; ++ goto inf_leave; ++ case BAD: ++ ret = Z_DATA_ERROR; ++ goto inf_leave; ++ case MEM: ++ return Z_MEM_ERROR; ++ case SYNC: ++ default: ++ return Z_STREAM_ERROR; ++ } ++ ++ /* ++ Return from inflate(), updating the total counts and the check value. ++ If there was no progress during the inflate() call, return a buffer ++ error. Call updatewindow() to create and/or update the window state. ++ Note: a memory error from inflate() is non-recoverable. ++ */ ++ inf_leave: ++ RESTORE(); ++ if (state->wsize || (out != strm->avail_out && state->mode < BAD && ++ (state->mode < CHECK || flush != Z_FINISH))) ++ if (updatewindow(strm, strm->next_out, out - strm->avail_out)) { ++ state->mode = MEM; ++ return Z_MEM_ERROR; ++ } ++ in -= strm->avail_in; ++ out -= strm->avail_out; ++ strm->total_in += in; ++ strm->total_out += out; ++ state->total += out; ++ if ((state->wrap & 4) && out) ++ strm->adler = state->check = ++ UPDATE(state->check, strm->next_out - out, out); ++ strm->data_type = (int)state->bits + (state->last ? 64 : 0) + ++ (state->mode == TYPE ? 128 : 0) + ++ (state->mode == LEN_ || state->mode == COPY_ ? 256 : 0); ++ if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK) ++ ret = Z_BUF_ERROR; ++ return ret; ++} ++ ++int ZEXPORT inflateEnd(strm) ++z_streamp strm; ++{ ++ struct inflate_state FAR *state; ++ if (inflateStateCheck(strm)) ++ return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ if (state->window != Z_NULL) ZFREE(strm, state->window); ++ ZFREE(strm, strm->state); ++ strm->state = Z_NULL; ++ Tracev((stderr, "inflate: end\n")); ++ return Z_OK; ++} ++ ++int ZEXPORT inflateGetDictionary(strm, dictionary, dictLength) ++z_streamp strm; ++Bytef *dictionary; ++uInt *dictLength; ++{ ++ struct inflate_state FAR *state; ++ ++ /* check state */ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ ++ /* copy dictionary */ ++ if (state->whave && dictionary != Z_NULL) { ++ zmemcpy(dictionary, state->window + state->wnext, ++ state->whave - state->wnext); ++ zmemcpy(dictionary + state->whave - state->wnext, ++ state->window, state->wnext); ++ } ++ if (dictLength != Z_NULL) ++ *dictLength = state->whave; ++ return Z_OK; ++} ++ ++int ZEXPORT inflateSetDictionary(strm, dictionary, dictLength) ++z_streamp strm; ++const Bytef *dictionary; ++uInt dictLength; ++{ ++ struct inflate_state FAR *state; ++ unsigned long dictid; ++ int ret; ++ ++ /* check state */ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ if (state->wrap != 0 && state->mode != DICT) ++ return Z_STREAM_ERROR; ++ ++ /* check for correct dictionary identifier */ ++ if (state->mode == DICT) { ++ dictid = adler32(0L, Z_NULL, 0); ++ dictid = adler32(dictid, dictionary, dictLength); ++ if (dictid != state->check) ++ return Z_DATA_ERROR; ++ } ++ ++ /* copy dictionary to window using updatewindow(), 
which will amend the ++ existing dictionary if appropriate */ ++ ret = updatewindow(strm, dictionary + dictLength, dictLength); ++ if (ret) { ++ state->mode = MEM; ++ return Z_MEM_ERROR; ++ } ++ state->havedict = 1; ++ Tracev((stderr, "inflate: dictionary set\n")); ++ return Z_OK; ++} ++ ++int ZEXPORT inflateGetHeader(strm, head) ++z_streamp strm; ++gz_headerp head; ++{ ++ struct inflate_state FAR *state; ++ ++ /* check state */ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ if ((state->wrap & 2) == 0) return Z_STREAM_ERROR; ++ ++ /* save header structure */ ++ state->head = head; ++ head->done = 0; ++ return Z_OK; ++} ++ ++/* ++ Search buf[0..len-1] for the pattern: 0, 0, 0xff, 0xff. Return when found ++ or when out of input. When called, *have is the number of pattern bytes ++ found in order so far, in 0..3. On return *have is updated to the new ++ state. If on return *have equals four, then the pattern was found and the ++ return value is how many bytes were read including the last byte of the ++ pattern. If *have is less than four, then the pattern has not been found ++ yet and the return value is len. In the latter case, syncsearch() can be ++ called again with more data and the *have state. *have is initialized to ++ zero for the first call. ++ */ ++local unsigned syncsearch(have, buf, len) ++unsigned FAR *have; ++const unsigned char FAR *buf; ++unsigned len; ++{ ++ unsigned got; ++ unsigned next; ++ ++ got = *have; ++ next = 0; ++ while (next < len && got < 4) { ++ if ((int)(buf[next]) == (got < 2 ? 0 : 0xff)) ++ got++; ++ else if (buf[next]) ++ got = 0; ++ else ++ got = 4 - got; ++ next++; ++ } ++ *have = got; ++ return next; ++} ++ ++int ZEXPORT inflateSync(strm) ++z_streamp strm; ++{ ++ unsigned len; /* number of bytes to look at or looked at */ ++ unsigned long in, out; /* temporary to save total_in and total_out */ ++ unsigned char buf[4]; /* to restore bit buffer to byte string */ ++ struct inflate_state FAR *state; ++ ++ /* check parameters */ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ if (strm->avail_in == 0 && state->bits < 8) return Z_BUF_ERROR; ++ ++ /* if first time, start search in bit buffer */ ++ if (state->mode != SYNC) { ++ state->mode = SYNC; ++ state->hold <<= state->bits & 7; ++ state->bits -= state->bits & 7; ++ len = 0; ++ while (state->bits >= 8) { ++ buf[len++] = (unsigned char)(state->hold); ++ state->hold >>= 8; ++ state->bits -= 8; ++ } ++ state->have = 0; ++ syncsearch(&(state->have), buf, len); ++ } ++ ++ /* search available input */ ++ len = syncsearch(&(state->have), strm->next_in, strm->avail_in); ++ strm->avail_in -= len; ++ strm->next_in += len; ++ strm->total_in += len; ++ ++ /* return no joy or set up to restart inflate() on a new block */ ++ if (state->have != 4) return Z_DATA_ERROR; ++ in = strm->total_in; out = strm->total_out; ++ inflateReset(strm); ++ strm->total_in = in; strm->total_out = out; ++ state->mode = TYPE; ++ return Z_OK; ++} ++ ++/* ++ Returns true if inflate is currently at the end of a block generated by ++ Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP ++ implementation to provide an additional safety check. PPP uses ++ Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored ++ block. When decompressing, PPP checks that at the end of input packet, ++ inflate is waiting for these length bytes. 
++ */ ++int ZEXPORT inflateSyncPoint(strm) ++z_streamp strm; ++{ ++ struct inflate_state FAR *state; ++ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ return state->mode == STORED && state->bits == 0; ++} ++ ++int ZEXPORT inflateCopy(dest, source) ++z_streamp dest; ++z_streamp source; ++{ ++ struct inflate_state FAR *state; ++ struct inflate_state FAR *copy; ++ unsigned char FAR *window; ++ unsigned wsize; ++ ++ /* check input */ ++ if (inflateStateCheck(source) || dest == Z_NULL) ++ return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)source->state; ++ ++ /* allocate space */ ++ copy = (struct inflate_state FAR *) ++ ZALLOC(source, 1, sizeof(struct inflate_state)); ++ if (copy == Z_NULL) return Z_MEM_ERROR; ++ window = Z_NULL; ++ if (state->window != Z_NULL) { ++ window = (unsigned char FAR *) ++ ZALLOC(source, 1U << state->wbits, sizeof(unsigned char)); ++ if (window == Z_NULL) { ++ ZFREE(source, copy); ++ return Z_MEM_ERROR; ++ } ++ } ++ ++ /* copy state */ ++ zmemcpy((voidpf)dest, (voidpf)source, sizeof(z_stream)); ++ zmemcpy((voidpf)copy, (voidpf)state, sizeof(struct inflate_state)); ++ copy->strm = dest; ++ if (state->lencode >= state->codes && ++ state->lencode <= state->codes + ENOUGH - 1) { ++ copy->lencode = copy->codes + (state->lencode - state->codes); ++ copy->distcode = copy->codes + (state->distcode - state->codes); ++ } ++ copy->next = copy->codes + (state->next - state->codes); ++ if (window != Z_NULL) { ++ wsize = 1U << state->wbits; ++ zmemcpy(window, state->window, wsize); ++ } ++ copy->window = window; ++ dest->state = (struct internal_state FAR *)copy; ++ return Z_OK; ++} ++ ++int ZEXPORT inflateUndermine(strm, subvert) ++z_streamp strm; ++int subvert; ++{ ++ struct inflate_state FAR *state; ++ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR ++ state->sane = !subvert; ++ return Z_OK; ++#else ++ (void)subvert; ++ state->sane = 1; ++ return Z_DATA_ERROR; ++#endif ++} ++ ++int ZEXPORT inflateValidate(strm, check) ++z_streamp strm; ++int check; ++{ ++ struct inflate_state FAR *state; ++ ++ if (inflateStateCheck(strm)) return Z_STREAM_ERROR; ++ state = (struct inflate_state FAR *)strm->state; ++ if (check) ++ state->wrap |= 4; ++ else ++ state->wrap &= ~4; ++ return Z_OK; ++} ++ ++long ZEXPORT inflateMark(strm) ++z_streamp strm; ++{ ++ struct inflate_state FAR *state; ++ ++ if (inflateStateCheck(strm)) ++ return -(1L << 16); ++ state = (struct inflate_state FAR *)strm->state; ++ return (long)(((unsigned long)((long)state->back)) << 16) + ++ (state->mode == COPY ? state->length : ++ (state->mode == MATCH ? 
state->was - state->length : 0)); ++} ++ ++unsigned long ZEXPORT inflateCodesUsed(strm) ++z_streamp strm; ++{ ++ struct inflate_state FAR *state; ++ if (inflateStateCheck(strm)) return (unsigned long)-1; ++ state = (struct inflate_state FAR *)strm->state; ++ return (unsigned long)(state->next - state->codes); ++} +-- +2.14.3 + diff --git a/0002-Port-Fix-InflateBack-corner-case.patch b/0002-Port-Fix-InflateBack-corner-case.patch new file mode 100644 index 0000000..6673d9d --- /dev/null +++ b/0002-Port-Fix-InflateBack-corner-case.patch @@ -0,0 +1,147 @@ +From 267e6f20170edb9a00b11fc3a2ca7649ea1c4464 Mon Sep 17 00:00:00 2001 +From: Adenilson Cavalcanti +Date: Wed, 4 Apr 2018 15:14:57 -0700 +Subject: [PATCH 2/2] Port Fix InflateBack corner case + +This handles the case where a zlib user could rely on InflateBack +API to decompress content. + +The NEON optimization assumes that it can perform wide stores, sometimes +overwriting data on the output pointer (but never overflowing the buffer +end as it has enough room for the write). + +For infback there is no such guarantees (i.e. no extra wiggle room), +which can result in illegal operations. This patch fixes the potential +issue by falling back to the non-optimized code for such cases. + +Also it adds some comments about the entry assumptions in inflate and +writes out a defined value at the write buffer to identify where +the real data has ended (helpful while debugging). + +For reference, please see: +https://chromium.googlesource.com/chromium/src/+/0bb11040792edc5b28fcb710fc4c01fedd98c97c + +Change-Id: Iffbda9eb5e08a661aa15c6e3d1c59b678cc23b2c +--- + CMakeLists.txt | 5 ++--- + contrib/arm/{inffast.c => inffast_chunk.c} | 10 +++++++--- + contrib/arm/inffast_chunk.h | 12 ++++++++++++ + contrib/arm/inflate.c | 14 ++++++++++++-- + 4 files changed, 33 insertions(+), 8 deletions(-) + rename contrib/arm/{inffast.c => inffast_chunk.c} (97%) + create mode 100644 contrib/arm/inffast_chunk.h + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 09bb3db..98ee4dd 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -139,9 +139,8 @@ if(CMAKE_COMPILER_IS_GNUCC) + + if(ARM_NEON) + list(REMOVE_ITEM ZLIB_SRCS inflate.c) +- list(REMOVE_ITEM ZLIB_SRCS inffast.c) +- set(ZLIB_ARM_NEON_HDRS contrib/arm/chunkcopy.h) +- set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast.c) ++ set(ZLIB_ARM_NEON_HDRS contrib/arm/chunkcopy.h contrib/arm/inffast_chunk.h) ++ set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c) + add_definitions(-DARM_NEON) + set(COMPILER ${CMAKE_C_COMPILER}) + # NEON is mandatory in ARMv8. 
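The padding assumption that separates inflate() from inflateBack() in the patch above can be illustrated with a minimal, stand-alone sketch in plain C. The name chunk_copy_sketch and the 16-byte CHUNK width are illustrative assumptions, not the actual contrib/arm/chunkcopy.h API: every block is transferred with a full-width copy, so the final store may spill up to CHUNK-1 bytes past the requested length. That is only safe when both buffers carry that much slack, which inflate() guarantees for its window and output buffers but inflateBack() does not, hence the fallback to the plain path and the 0x55 end-of-data marker added below.

    #include <stddef.h>
    #include <string.h>

    #define CHUNK 16  /* illustrative width; the real code uses NEON-sized stores */

    /* Copy len bytes in CHUNK-sized blocks.  The last block may read and write
     * up to CHUNK-1 bytes beyond src+len and dst+len, so the caller must
     * guarantee that much padding on both buffers. */
    static unsigned char *chunk_copy_sketch(unsigned char *dst,
                                            const unsigned char *src,
                                            size_t len)
    {
        while (len > 0) {
            size_t step = len < CHUNK ? len : CHUNK;
            memcpy(dst, src, CHUNK);      /* full-width store, may overshoot */
            dst += step;
            src += step;
            len -= step;
        }
        return dst;
    }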
+diff --git a/contrib/arm/inffast.c b/contrib/arm/inffast_chunk.c +similarity index 97% +rename from contrib/arm/inffast.c +rename to contrib/arm/inffast_chunk.c +index f7f5007..0c5c583 100644 +--- a/contrib/arm/inffast.c ++++ b/contrib/arm/inffast_chunk.c +@@ -6,8 +6,8 @@ + #include "zutil.h" + #include "inftrees.h" + #include "inflate.h" +-#include "inffast.h" +-#include "chunkcopy.h" ++#include "contrib/arm/inffast_chunk.h" ++#include "contrib/arm/chunkcopy.h" + + #ifdef ASMINF + # pragma message("Assembler code may have bugs -- use at your own risk") +@@ -28,6 +28,10 @@ + strm->avail_out >= 258 + start >= strm->avail_out + state->bits < 8 ++ strm->next_out[0..strm->avail_out] does not overlap with ++ strm->next_in[0..strm->avail_in] ++ strm->state->window is allocated with an additional ++ CHUNKCOPY_CHUNK_SIZE-1 bytes of padding beyond strm->state->wsize + + On return, state->mode is one of: + +@@ -48,7 +52,7 @@ + requires strm->avail_out >= 258 for each loop to avoid checking for + output space. + */ +-void ZLIB_INTERNAL inflate_fast(strm, start) ++void ZLIB_INTERNAL inflate_fast_chunk(strm, start) + z_streamp strm; + unsigned start; /* inflate()'s starting value for strm->avail_out */ + { +diff --git a/contrib/arm/inffast_chunk.h b/contrib/arm/inffast_chunk.h +new file mode 100644 +index 0000000..7839e1d +--- /dev/null ++++ b/contrib/arm/inffast_chunk.h +@@ -0,0 +1,12 @@ ++/* inffast.h -- header to use inffast.c ++ * Copyright (C) 1995-2003, 2010 Mark Adler ++ * Copyright (C) 2017 ARM, Inc. ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++/* WARNING: this file should *not* be used by applications. It is ++ part of the implementation of the compression library and is ++ subject to change. Applications should only use zlib.h. ++ */ ++ ++void ZLIB_INTERNAL inflate_fast_chunk OF((z_streamp strm, unsigned start)); +diff --git a/contrib/arm/inflate.c b/contrib/arm/inflate.c +index 23e95f1..d860542 100644 +--- a/contrib/arm/inflate.c ++++ b/contrib/arm/inflate.c +@@ -83,7 +83,7 @@ + #include "zutil.h" + #include "inftrees.h" + #include "inflate.h" +-#include "inffast.h" ++#include "contrib/arm/inffast_chunk.h" + #include "contrib/arm/chunkcopy.h" + + #ifdef MAKEFIXED +@@ -1056,7 +1056,7 @@ int flush; + case LEN: + if (have >= 6 && left >= 258) { + RESTORE(); +- inflate_fast(strm, out); ++ inflate_fast_chunk(strm, out); + LOAD(); + if (state->mode == TYPE) + state->back = -1; +@@ -1262,6 +1262,16 @@ int flush; + Note: a memory error from inflate() is non-recoverable. + */ + inf_leave: ++ /* We write a defined value in the unused space to help mark ++ * where the stream has ended. We don't use zeros as that can ++ * mislead clients relying on undefined behavior (i.e. assuming ++ * that the data is over when the buffer has a zero/null value). 
++ */ ++ if (left >= CHUNKCOPY_CHUNK_SIZE) ++ memset(put, 0x55, CHUNKCOPY_CHUNK_SIZE); ++ else ++ memset(put, 0x55, left); ++ + RESTORE(); + if (state->wsize || (out != strm->avail_out && state->mode < BAD && + (state->mode < CHECK || flush != Z_FINISH))) +-- +2.14.3 + diff --git a/0002-Porting-optimized-longest_match.patch b/0002-Porting-optimized-longest_match.patch new file mode 100644 index 0000000..790ae2a --- /dev/null +++ b/0002-Porting-optimized-longest_match.patch @@ -0,0 +1,218 @@ +From 0ad56061ade1afe2896af1acffa5e15fbe5c98ed Mon Sep 17 00:00:00 2001 +From: Adenilson Cavalcanti +Date: Mon, 9 Apr 2018 15:14:19 -0700 +Subject: [PATCH 2/3] Porting optimized longest_match + +This patch was contributed to zlib-ng and features an improved longest_match +function using the most distant hash code to reduce number of checks +(see: http://www.gildor.org/en/projects/zlib). + +Original patch by Jun He. +--- + CMakeLists.txt | 3 +- + contrib/arm/arm_longest_match.h | 142 ++++++++++++++++++++++++++++++++++++++++ + deflate.c | 11 +++- + 3 files changed, 152 insertions(+), 4 deletions(-) + create mode 100644 contrib/arm/arm_longest_match.h + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 230ca6d..c330093 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -142,7 +142,8 @@ if(CMAKE_COMPILER_IS_GNUCC) + set(ZLIB_ARM_NEON_HDRS + contrib/arm/chunkcopy.h + contrib/arm/inffast_chunk.h +- contrib/arm/neon_slide_hash.h) ++ contrib/arm/neon_slide_hash.h ++ contrib/arm/arm_longest_match.h) + set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c) + add_definitions(-DARM_NEON) + set(COMPILER ${CMAKE_C_COMPILER}) +diff --git a/contrib/arm/arm_longest_match.h b/contrib/arm/arm_longest_match.h +new file mode 100644 +index 0000000..9e7083f +--- /dev/null ++++ b/contrib/arm/arm_longest_match.h +@@ -0,0 +1,142 @@ ++/* Copyright (C) 1995-2011, 2016 Mark Adler ++ * Copyright (C) 2017 ARM Holdings Inc. ++ * Authors: Adenilson Cavalcanti ++ * Jun He ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
++ */ ++#ifndef __ARM_LONGEST__MATCH__ ++#define __ARM_LONGEST__MATCH__ ++ ++#if defined(ARM_NEON) ++#include "deflate.h" ++#include ++static inline long get_match_len(const unsigned char *a, const unsigned char *b, long max) ++{ ++ register int len = 0; ++ register unsigned long xor = 0; ++ register int check_loops = max/sizeof(unsigned long); ++ while(check_loops-- > 0) { ++ xor = (*(unsigned long *)(a+len)) ^ (*(unsigned long *)(b+len)); ++ if (xor) break; ++ len += sizeof(unsigned long); ++ } ++ if (0 == xor) { ++ while (len < max) { ++ if (a[len] != b[len]) break; ++ len++; ++ } ++ return len; ++ } ++ xor = __builtin_ctzl(xor)>>3; ++ return len + xor; ++} ++ ++/* ++ * This implementation is based on algorithm described at: ++ * http://www.gildor.org/en/projects/zlib ++ * It uses the hash chain indexed by the most distant hash code to ++ * reduce number of checks. ++ * This also eliminates the those unnecessary check loops in legacy ++ * longest_match's do..while loop if the "most distant code" is out ++ * of search buffer ++ * ++ */ ++static inline unsigned arm_longest_match(deflate_state *const s, IPos cur_match) { ++ unsigned chain_length = s->max_chain_length;/* max hash chain length */ ++ unsigned char *scan = s->window + s->strstart; /* current string */ ++ unsigned char *match; /* matched string */ ++ unsigned int len; /* length of current match */ ++ unsigned int best_len = s->prev_length; /* best match length so far */ ++ unsigned int nice_match = s->nice_match; /* stop if match long enough */ ++ IPos limit = s->strstart > (IPos)MAX_DIST(s) ? ++ s->strstart - (IPos)MAX_DIST(s) : 0; ++ /* Stop when cur_match becomes <= limit. To simplify the code, ++ * we prevent matches with the string of window index 0. ++ */ ++ int offset = 0; /* offset of the head[most_distant_hash] from IN cur_match */ ++ Pos *prev = s->prev; ++ unsigned int wmask = s->w_mask; ++ unsigned char *scan_buf_base = s->window; ++ ++ /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. ++ * It is easy to get rid of this optimization if necessary. ++ */ ++ Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); ++ ++ /* Do not look for matches beyond the end of the input. This is necessary ++ * to make deflate deterministic. ++ */ ++ if ((unsigned int)nice_match > s->lookahead) nice_match = s->lookahead; ++ ++ Assert((unsigned long)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead"); ++ ++ /* find most distant hash code for lazy_match */ ++ if (best_len > MIN_MATCH) { ++ /* search for most distant hash code */ ++ int i; ++ uint16_t hash = 0; ++ IPos pos; ++ ++ UPDATE_HASH(s, hash, scan[1]); ++ UPDATE_HASH(s, hash, scan[2]); ++ for (i = 3; i <= best_len; i++) { ++ UPDATE_HASH(s, hash, scan[i]); ++ /* get head IPos of hash calced by scan[i-2..i] */ ++ pos = s->head[hash]; ++ /* compare it to current "farthest hash" IPos */ ++ if (pos <= cur_match) { ++ /* we have a new "farthest hash" now */ ++ offset = i - 2; ++ cur_match = pos; ++ } ++ } ++ ++ /* update variables to correspond offset */ ++ limit += offset; ++ /* ++ * check if the most distant code's offset is out of search buffer ++ * if it is true, then this means scan[offset..offset+2] are not ++ * presented in the search buffer. So we just return best_len ++ * we've found. 
++ */ ++ if (cur_match < limit) return best_len; ++ ++ scan_buf_base -= offset; ++ /* reduce hash search depth based on best_len */ ++ chain_length /= best_len - MIN_MATCH; ++ } ++ ++ do { ++ Assert(cur_match < s->strstart, "no future"); ++ ++ /* Determine matched length at current pos */ ++ match = scan_buf_base + cur_match; ++ len = get_match_len(match, scan, MAX_MATCH); ++ ++ if (len > best_len) { ++ /* found longer string */ ++ s->match_start = cur_match - offset; ++ best_len = len; ++ /* good enough? */ ++ if (len >= nice_match) break; ++ } ++ /* move to prev pos in this hash chain */ ++ } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length != 0); ++ ++ return (best_len <= s->lookahead)? best_len : s->lookahead; ++} ++ ++#endif ++#endif +diff --git a/deflate.c b/deflate.c +index 36f99ac..4c42259 100644 +--- a/deflate.c ++++ b/deflate.c +@@ -50,9 +50,6 @@ + /* @(#) $Id$ */ + + #include "deflate.h" +-#if __ARM_NEON +-#include "contrib/arm/neon_slide_hash.h" +-#endif + + const char deflate_copyright[] = + " deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler "; +@@ -196,6 +193,11 @@ local const config configuration_table[10] = { + s->head[s->hash_size-1] = NIL; \ + zmemzero((Bytef *)s->head, (unsigned)(s->hash_size-1)*sizeof(*s->head)); + ++#if defined(ARM_NEON) ++#include "contrib/arm/arm_longest_match.h" ++#include "contrib/arm/neon_slide_hash.h" ++#endif ++ + /* =========================================================================== + * Slide the hash table when sliding the window down (could be avoided with 32 + * bit values at the expense of memory usage). We slide even when level == 0 to +@@ -1244,6 +1246,9 @@ local uInt longest_match(s, cur_match) + deflate_state *s; + IPos cur_match; /* current match */ + { ++#if defined(ARM_NEON) ++ return arm_longest_match(s, cur_match); ++#endif + unsigned chain_length = s->max_chain_length;/* max hash chain length */ + register Bytef *scan = s->window + s->strstart; /* current string */ + register Bytef *match; /* matched string */ +-- +2.14.3 + diff --git a/0003-arm64-specific-build-patch.patch b/0003-arm64-specific-build-patch.patch new file mode 100644 index 0000000..74502d9 --- /dev/null +++ b/0003-arm64-specific-build-patch.patch @@ -0,0 +1,136 @@ +From bd30e5ff76aab2668ebfd46e5dbadc44322960c1 Mon Sep 17 00:00:00 2001 +From: Jeremy Linton +Date: Fri, 6 Apr 2018 11:46:42 -0500 +Subject: [PATCH 3/3] arm64 specific build patch + +--- + Makefile.in | 37 +++++++++++++++++++++++++++---------- + configure | 2 +- + contrib/minizip/zip.c | 6 ++++-- + 3 files changed, 32 insertions(+), 13 deletions(-) + +diff --git a/Makefile.in b/Makefile.in +index 5a77949..1a1e452 100644 +--- a/Makefile.in ++++ b/Makefile.in +@@ -57,11 +57,11 @@ SRCDIR= + ZINC= + ZINCOUT=-I. 
+ +-OBJZ = adler32.o crc32.o deflate.o infback.o inffast.o inflate.o inftrees.o trees.o zutil.o ++OBJZ = adler32.o crc32.o deflate.o infback.o arminffast.o inffast.o inflate.o inflate_chunk.o inftrees.o trees.o zutil.o + OBJG = compress.o uncompr.o gzclose.o gzlib.o gzread.o gzwrite.o + OBJC = $(OBJZ) $(OBJG) + +-PIC_OBJZ = adler32.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo ++PIC_OBJZ = adler32.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inflate_chunk.lo inftrees.lo trees.lo zutil.lo + PIC_OBJG = compress.lo uncompr.lo gzclose.lo gzlib.lo gzread.lo gzwrite.lo + PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG) + +@@ -163,16 +163,22 @@ crc32.o: $(SRCDIR)crc32.c + $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c + + deflate.o: $(SRCDIR)deflate.c +- $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c ++ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)deflate.c + + infback.o: $(SRCDIR)infback.c + $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)infback.c + + inffast.o: $(SRCDIR)inffast.c +- $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)inffast.c ++ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)inffast.c + +-inflate.o: $(SRCDIR)inflate.c +- $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)inflate.c ++arminffast.o: $(SRCDIR)contrib/arm/inffast_chunk.c ++ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)contrib/arm/inffast_chunk.c ++ ++inflate.o: $(SRCDIR)contrib/arm/inflate.c ++ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)contrib/arm/inflate.c ++ ++inflate_chunk.o: $(SRCDIR)contrib/arm/inffast_chunk.c ++ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -c -o $@ $(SRCDIR)contrib/arm/inffast_chunk.c + + inftrees.o: $(SRCDIR)inftrees.c + $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)inftrees.c +@@ -214,7 +220,7 @@ crc32.lo: $(SRCDIR)crc32.c + + deflate.lo: $(SRCDIR)deflate.c + -@mkdir objs 2>/dev/null || test -d objs +- $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c ++ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c + -@mv objs/deflate.o $@ + + infback.lo: $(SRCDIR)infback.c +@@ -222,16 +228,27 @@ infback.lo: $(SRCDIR)infback.c + $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/infback.o $(SRCDIR)infback.c + -@mv objs/infback.o $@ + ++arminffast.lo: $(SRCDIR)contrib/arm/inffast_chunk.c $(SRCDIR)inffast.c ++ -@mkdir objs 2>/dev/null || test -d objs ++ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/arminffast.o $(SRCDIR)contrib/arm/inffast_chunk.c ++ -@mv objs/arminffast.o $@ ++ + inffast.lo: $(SRCDIR)inffast.c + -@mkdir objs 2>/dev/null || test -d objs +- $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/inffast.o $(SRCDIR)inffast.c ++ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/inffast.o $(SRCDIR)inffast.c + -@mv objs/inffast.o $@ + +-inflate.lo: $(SRCDIR)inflate.c ++inflate.lo: $(SRCDIR)contrib/arm/inflate.c + -@mkdir objs 2>/dev/null || test -d objs +- $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/inflate.o $(SRCDIR)inflate.c ++ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/inflate.o $(SRCDIR)contrib/arm/inflate.c + -@mv objs/inflate.o $@ + ++inflate_chunk.lo: $(SRCDIR)contrib/arm/inffast_chunk.c ++ -@mkdir objs 2>/dev/null || test -d objs ++ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -I$(SRCDIR)contrib/arm -DPIC -c -o objs/inflate_chunk.o $(SRCDIR)contrib/arm/inffast_chunk.c ++ -@mv objs/inflate_chunk.o $@ ++ 
++ + inftrees.lo: $(SRCDIR)inftrees.c + -@mkdir objs 2>/dev/null || test -d objs + $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/inftrees.o $(SRCDIR)inftrees.c +diff --git a/configure b/configure +index e974d1f..0c5f837 100755 +--- a/configure ++++ b/configure +@@ -23,7 +23,7 @@ SRCDIR=`dirname $0` + if test $SRCDIR = "."; then + ZINC="" + ZINCOUT="-I." +- SRCDIR="" ++ SRCDIR="./" + else + ZINC='-include zconf.h' + ZINCOUT='-I. -I$(SRCDIR)' +diff --git a/contrib/minizip/zip.c b/contrib/minizip/zip.c +index 44e88a9..0517930 100644 +--- a/contrib/minizip/zip.c ++++ b/contrib/minizip/zip.c +@@ -519,15 +519,17 @@ local ZPOS64_T zip64local_SearchCentralDir(const zlib_filefunc64_32_def* pzlib_f + break; + + for (i=(int)uReadSize-3; (i--)>0;) ++ { + if (((*(buf+i))==0x50) && ((*(buf+i+1))==0x4b) && + ((*(buf+i+2))==0x05) && ((*(buf+i+3))==0x06)) + { + uPosFound = uReadPos+i; + break; + } ++ } + +- if (uPosFound!=0) +- break; ++ if (uPosFound!=0) ++ break; + } + TRYFREE(buf); + return uPosFound; +-- +2.14.3 + diff --git a/zlib.spec b/zlib.spec index 0e67f2b..9087e98 100644 --- a/zlib.spec +++ b/zlib.spec @@ -12,6 +12,12 @@ Source: http://www.zlib.net/zlib-%{version}.tar.xz Patch0: zlib-1.2.5-minizip-fixuncrypt.patch # resolves: #805113 Patch1: zlib-1.2.11-optimized-s390.patch +# general aarch64 optimizations +Patch2: 0001-Porting-inflate-using-wider-loads-and-stores.patch +Patch3: 0002-Port-Fix-InflateBack-corner-case.patch +Patch4: 0001-Neon-Optimized-hash-chain-rebase.patch +Patch5: 0002-Porting-optimized-longest_match.patch +Patch6: 0003-arm64-specific-build-patch.patch BuildRequires: automake, autoconf, libtool @@ -64,6 +70,14 @@ developing applications which use minizip. %ifarch s390 s390x %patch1 -p1 -b .optimized-deflate %endif +%ifarch aarch64 +%patch2 -p1 -b .optimize-aarch64 +%patch3 -p1 -b .optimize-aarch64 +%patch4 -p1 -b .optimize-aarch64 +%patch5 -p1 -b .optimize-aarch64 +%patch6 -p1 -b .optimize-aarch64 +%endif + iconv -f iso-8859-2 -t utf-8 < ChangeLog > ChangeLog.tmp mv ChangeLog.tmp ChangeLog @@ -73,6 +87,10 @@ export CFLAGS="$RPM_OPT_FLAGS" %ifarch ppc64 CFLAGS+=" -O3" %endif +%ifarch aarch64 +CFLAGS+=" -DARM_NEON -O3" +%endif + export LDFLAGS="$LDFLAGS -Wl,-z,relro -Wl,-z,now" ./configure --libdir=%{_libdir} --includedir=%{_includedir} --prefix=%{_prefix} make %{?_smp_mflags}
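The word-at-a-time comparison that the longest_match patch above builds on (its get_match_len helper) can be summarized in a small stand-alone sketch. The name match_len_sketch, the fixed 64-bit word width, and the little-endian byte ordering are assumptions for illustration rather than the patched zlib code itself:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Length of the common prefix of a and b, at most max bytes.  Compares one
     * 64-bit word per step; on the first mismatching word, the count of trailing
     * zero bits in the XOR locates the first differing byte on a little-endian
     * target such as AArch64. */
    static size_t match_len_sketch(const unsigned char *a,
                                   const unsigned char *b, size_t max)
    {
        size_t len = 0;
        while (max - len >= sizeof(uint64_t)) {
            uint64_t wa, wb;
            memcpy(&wa, a + len, sizeof wa);   /* unaligned-safe wide loads */
            memcpy(&wb, b + len, sizeof wb);
            uint64_t x = wa ^ wb;
            if (x != 0)
                return len + ((size_t)__builtin_ctzll(x) >> 3);
            len += sizeof(uint64_t);
        }
        while (len < max && a[len] == b[len]) /* byte-wise tail */
            len++;
        return len;
    }

Compared with a byte-wise loop, a mismatch is resolved with one XOR and one count-trailing-zeros step instead of up to eight individual byte compares; __builtin_ctzll assumes a GCC or Clang toolchain, matching the __builtin_ctzl call used in the patch.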