zlib/0001-Neon-Optimized-hash-ch...

171 lines
5.5 KiB
Diff

From f0fd8c553fa024c599f4aff65d7c603ceeaa6a58 Mon Sep 17 00:00:00 2001
From: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
Date: Mon, 9 Apr 2018 13:52:17 -0700
Subject: [PATCH 1/3] Neon-Optimized hash chain rebase
This should help with compression of data, using NEON instructions
(therefore useful for ARMv7/ARMv8).
Original patch by Jun He.
---
CMakeLists.txt | 18 ++++++++
contrib/arm/neon_slide_hash.h | 84 +++++++++++++++++++++++++++++++++++
deflate.c | 7 +++
3 files changed, 109 insertions(+)
create mode 100644 contrib/arm/neon_slide_hash.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0fe939d..e9a74e9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -136,6 +136,24 @@ if(CMAKE_COMPILER_IS_GNUCC)
set(ZLIB_ASMS contrib/amd64/amd64-match.S)
endif ()
+ if(ARM_NEON)
+ list(REMOVE_ITEM ZLIB_SRCS inflate.c)
+ set(ZLIB_ARM_NEON_HDRS
+ contrib/arm/chunkcopy.h
+ contrib/arm/inffast_chunk.h
+ contrib/arm/neon_slide_hash.h)
+ set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c)
+ add_definitions(-DARM_NEON)
+ set(COMPILER ${CMAKE_C_COMPILER})
+ # NEON is mandatory in ARMv8.
+ if(${COMPILER} MATCHES "aarch64")
+ set_source_files_properties(${ZLIB_ARM_NEON} PROPERTIES LANGUAGE C COMPILE_FLAGS -march=armv8-a)
+ # But it was optional for ARMv7.
+ elseif(${COMPILER} MATCHES "arm")
+ set_source_files_properties(${ZLIB_ARM_NEON} PROPERTIES LANGUAGE C COMPILE_FLAGS -mfpu=neon)
+ endif()
+ endif()
+
if(ZLIB_ASMS)
add_definitions(-DASMV)
set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE)
diff --git a/contrib/arm/neon_slide_hash.h b/contrib/arm/neon_slide_hash.h
new file mode 100644
index 0000000..0daffa1
--- /dev/null
+++ b/contrib/arm/neon_slide_hash.h
@@ -0,0 +1,84 @@
+/* Copyright (C) 1995-2011, 2016 Mark Adler
+ * Copyright (C) 2017 ARM Holdings Inc.
+ * Authors: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
+ * Jun He <jun.he@arm.com>
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+#ifndef __NEON_SLIDE_HASH__
+#define __NEON_SLIDE_HASH__
+
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
+#include "deflate.h"
+#include <arm_neon.h>
+
+inline static void neon_slide_hash(deflate_state *s)
+{
+ /*
+ * This is ASIMD implementation for hash table rebase
+ * it assumes:
+ * 1. hash chain offset (Pos) is 2 bytes
+ * 2. hash table size is multiple*128 bytes
+ * #1 should be true as Pos is defined as "ush"
+ * #2 should be true as hash_bits are greater that 7
+ */
+ unsigned n, m;
+ unsigned short wsize = s->w_size;
+ uint16x8_t v, *p;
+ size_t size;
+
+ size = s->hash_size*sizeof(s->head[0]);
+ Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
+
+ Assert(sizeof(Pos) == 2, "Wrong Pos size");
+
+ /* slide s->head */
+ v = vdupq_n_u16(wsize);
+ p = (uint16x8_t *)(s->head);
+ n = size / (sizeof(uint16x8_t) * 8);
+ do {
+ p[0] = vqsubq_u16(p[0], v);
+ p[1] = vqsubq_u16(p[1], v);
+ p[2] = vqsubq_u16(p[2], v);
+ p[3] = vqsubq_u16(p[3], v);
+ p[4] = vqsubq_u16(p[4], v);
+ p[5] = vqsubq_u16(p[5], v);
+ p[6] = vqsubq_u16(p[6], v);
+ p[7] = vqsubq_u16(p[7], v);
+ p += 8;
+ } while (--n);
+#ifndef FASTEST
+ /* slide s->prev */
+ size = wsize*sizeof(s->prev[0]);
+
+ Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
+
+ p = (uint16x8_t *)(s->prev);
+ n = size / (sizeof(uint16x8_t) * 8);
+ do {
+ p[0] = vqsubq_u16(p[0], v);
+ p[1] = vqsubq_u16(p[1], v);
+ p[2] = vqsubq_u16(p[2], v);
+ p[3] = vqsubq_u16(p[3], v);
+ p[4] = vqsubq_u16(p[4], v);
+ p[5] = vqsubq_u16(p[5], v);
+ p[6] = vqsubq_u16(p[6], v);
+ p[7] = vqsubq_u16(p[7], v);
+ p += 8;
+ } while (--n);
+#endif
+}
+
+#endif
+#endif
diff --git a/deflate.c b/deflate.c
index 1ec7614..36f99ac 100644
--- a/deflate.c
+++ b/deflate.c
@@ -50,6 +50,9 @@
/* @(#) $Id$ */
#include "deflate.h"
+#if __ARM_NEON
+#include "contrib/arm/neon_slide_hash.h"
+#endif
const char deflate_copyright[] =
" deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler ";
@@ -201,6 +204,9 @@ local const config configuration_table[10] = {
local void slide_hash(s)
deflate_state *s;
{
+#if ARM_NEON
+ return neon_slide_hash(s);
+#else
unsigned n, m;
Posf *p;
uInt wsize = s->w_size;
@@ -222,6 +228,7 @@ local void slide_hash(s)
*/
} while (--n);
#endif
+#endif
}
/* ========================================================================= */
--
2.19.0