From f0fd8c553fa024c599f4aff65d7c603ceeaa6a58 Mon Sep 17 00:00:00 2001
From: Adenilson Cavalcanti
Date: Mon, 9 Apr 2018 13:52:17 -0700
Subject: [PATCH 1/3] Neon-Optimized hash chain rebase

This should help with compression of data, using NEON instructions
(therefore useful for ARMv7/ARMv8).

Original patch by Jun He.
---
 CMakeLists.txt                | 18 ++++++++
 contrib/arm/neon_slide_hash.h | 84 +++++++++++++++++++++++++++++++++++
 deflate.c                     |  7 +++
 3 files changed, 109 insertions(+)
 create mode 100644 contrib/arm/neon_slide_hash.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0fe939d..e9a74e9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -136,6 +136,24 @@ if(CMAKE_COMPILER_IS_GNUCC)
         set(ZLIB_ASMS contrib/amd64/amd64-match.S)
     endif ()
 
+    if(ARM_NEON)
+        list(REMOVE_ITEM ZLIB_SRCS inflate.c)
+        set(ZLIB_ARM_NEON_HDRS
+            contrib/arm/chunkcopy.h
+            contrib/arm/inffast_chunk.h
+            contrib/arm/neon_slide_hash.h)
+        set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c)
+        add_definitions(-DARM_NEON)
+        set(COMPILER ${CMAKE_C_COMPILER})
+        # NEON is mandatory in ARMv8 (deflate.c also compiles NEON intrinsics).
+        if(${COMPILER} MATCHES "aarch64")
+            set_source_files_properties(${ZLIB_ARM_NEON} deflate.c PROPERTIES LANGUAGE C COMPILE_FLAGS -march=armv8-a)
+        # But it was optional for ARMv7, so it must be enabled explicitly.
+        elseif(${COMPILER} MATCHES "arm")
+            set_source_files_properties(${ZLIB_ARM_NEON} deflate.c PROPERTIES LANGUAGE C COMPILE_FLAGS -mfpu=neon)
+        endif()
+    endif()
+
     if(ZLIB_ASMS)
         add_definitions(-DASMV)
         set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE)
diff --git a/contrib/arm/neon_slide_hash.h b/contrib/arm/neon_slide_hash.h
new file mode 100644
index 0000000..0daffa1
--- /dev/null
+++ b/contrib/arm/neon_slide_hash.h
@@ -0,0 +1,84 @@
+/* Copyright (C) 1995-2011, 2016 Mark Adler
+ * Copyright (C) 2017 ARM Holdings Inc.
+ * Authors: Adenilson Cavalcanti
+ *          Jun He
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+#ifndef __NEON_SLIDE_HASH__
+#define __NEON_SLIDE_HASH__
+
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
+#include "deflate.h"
+#include <arm_neon.h>
+
+inline static void neon_slide_hash(deflate_state *s)
+{
+    /*
+     * NEON (ASIMD) hash table rebase: subtract w_size from every entry of
+     * s->head and s->prev with unsigned saturation (vqsubq_u16 clamps at 0).
+     * It assumes:
+     * 1. hash chain offset (Pos) is 2 bytes: Pos is defined as "ush";
+     * 2. hash table size is a multiple of 128 bytes (each loop iteration
+     *    handles 8 vectors of 16 bytes): hash_bits is greater than 7.
+     */
+    unsigned n;
+    unsigned short wsize = s->w_size;
+    uint16x8_t v, *p;
+    size_t size;
+
+    size = s->hash_size*sizeof(s->head[0]);
+    Assert((size % (sizeof(uint16x8_t) * 8) == 0), "hash table size err");
+
+    Assert(sizeof(Pos) == 2, "Wrong Pos size");
+
+    /* slide s->head */
+    v = vdupq_n_u16(wsize);
+    p = (uint16x8_t *)(s->head);
+    n = size / (sizeof(uint16x8_t) * 8);
+    do {
+        p[0] = vqsubq_u16(p[0], v);
+        p[1] = vqsubq_u16(p[1], v);
+        p[2] = vqsubq_u16(p[2], v);
+        p[3] = vqsubq_u16(p[3], v);
+        p[4] = vqsubq_u16(p[4], v);
+        p[5] = vqsubq_u16(p[5], v);
+        p[6] = vqsubq_u16(p[6], v);
+        p[7] = vqsubq_u16(p[7], v);
+        p += 8;
+    } while (--n);
+#ifndef FASTEST
+    /* slide s->prev */
+    size = wsize*sizeof(s->prev[0]);
+
+    Assert((size % (sizeof(uint16x8_t) * 8) == 0), "hash table size err");
+
+    p = (uint16x8_t *)(s->prev);
+    n = size / (sizeof(uint16x8_t) * 8);
+    do {
+        p[0] = vqsubq_u16(p[0], v);
+        p[1] = vqsubq_u16(p[1], v);
+        p[2] = vqsubq_u16(p[2], v);
+        p[3] = vqsubq_u16(p[3], v);
+        p[4] = vqsubq_u16(p[4], v);
+        p[5] = vqsubq_u16(p[5], v);
+        p[6] = vqsubq_u16(p[6], v);
+        p[7] = vqsubq_u16(p[7], v);
+        p += 8;
+    } while (--n);
+#endif
+}
+
+#endif
+#endif
diff --git a/deflate.c b/deflate.c
index 1ec7614..36f99ac 100644
--- a/deflate.c
+++ b/deflate.c
@@ -50,6 +50,9 @@
 /* @(#) $Id$ */
 
 #include "deflate.h"
+#if defined(ARM_NEON) && (defined(__ARM_NEON__) || defined(__ARM_NEON))
+#include "contrib/arm/neon_slide_hash.h"
+#endif
 
 const char deflate_copyright[] =
    " deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler ";
@@ -201,6 +204,9 @@ local const config configuration_table[10] = {
 local void slide_hash(s)
     deflate_state *s;
 {
+#if defined(ARM_NEON) && (defined(__ARM_NEON__) || defined(__ARM_NEON))
+    neon_slide_hash(s);
+#else
     unsigned n, m;
     Posf *p;
     uInt wsize = s->w_size;
@@ -222,6 +228,7 @@ local void slide_hash(s)
          */
     } while (--n);
 #endif
+#endif
 }
 
 /* ========================================================================= */
-- 
2.19.0