From d064a7f405916941aea95d619c81e3efa0c41493 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Ondruch?= Date: Tue, 13 Oct 2020 17:02:57 +0200 Subject: [PATCH] Enable arm64 optimizations. The same already exists for power/x86. Resolves: rhbz#1884728 Related: rhbz#1947938 --- ...timizations-that-exist-for-power-x86.patch | 178 ++++++++++++++++++ ruby.spec | 5 + 2 files changed, 183 insertions(+) create mode 100644 ruby-3.0.0-preview1-Enable-arm64-optimizations-that-exist-for-power-x86.patch diff --git a/ruby-3.0.0-preview1-Enable-arm64-optimizations-that-exist-for-power-x86.patch b/ruby-3.0.0-preview1-Enable-arm64-optimizations-that-exist-for-power-x86.patch new file mode 100644 index 0000000..decb45f --- /dev/null +++ b/ruby-3.0.0-preview1-Enable-arm64-optimizations-that-exist-for-power-x86.patch @@ -0,0 +1,178 @@ +From c5806d668f84a86e9e6a522f84b8aa6cb4cdaae9 Mon Sep 17 00:00:00 2001 +From: Ali Saidi +Date: Wed, 5 Aug 2020 20:46:28 -0500 +Subject: [PATCH 1/3] Enable unaligned accesses on arm64 + +64-bit Arm platforms support unaligned accesses. 
+ +Running the string benchmarks this change improves performance +by an average of 1.04x, min .96x, max 1.21x, median 1.01x +--- + include/ruby/defines.h | 2 +- + regint.h | 2 +- + siphash.c | 2 +- + st.c | 2 +- + 4 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/include/ruby/defines.h b/include/ruby/defines.h +index 49f673ef936a..0193275e8b78 100644 +--- a/include/ruby/defines.h ++++ b/include/ruby/defines.h +@@ -485,7 +485,7 @@ + #ifndef UNALIGNED_WORD_ACCESS + # if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || \ +- defined(__powerpc64__) || \ ++ defined(__powerpc64__) || defined(__aarch64__) || \ + defined(__mc68020__) + # define UNALIGNED_WORD_ACCESS 1 + # else +diff --git a/regint.h b/regint.h +index a2f5bbba1d1f..0740429688bc 100644 +--- a/regint.h ++++ b/regint.h +@@ -52,7 +52,7 @@ + #ifndef UNALIGNED_WORD_ACCESS + # if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || \ +- defined(__powerpc64__) || \ ++ defined(__powerpc64__) || defined(__aarch64__) || \ + defined(__mc68020__) + # define UNALIGNED_WORD_ACCESS 1 + # else +diff --git a/siphash.c b/siphash.c +index 153d2c690ab9..ddf8ee245d81 100644 +--- a/siphash.c ++++ b/siphash.c +@@ -30,7 +30,7 @@ + #ifndef UNALIGNED_WORD_ACCESS + # if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || \ +- defined(__powerpc64__) || \ ++ defined(__powerpc64__) || defined(__aarch64__) || \ + defined(__mc68020__) + # define UNALIGNED_WORD_ACCESS 1 + # endif +diff --git a/st.c b/st.c +index c11535ef9779..8be466bf733f 100644 +--- a/st.c ++++ b/st.c +@@ -1815,7 +1815,7 @@ st_values_check(st_table *tab, st_data_t *values, st_index_t size, + #ifndef UNALIGNED_WORD_ACCESS + # if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + defined(__x86_64) || 
defined(__x86_64__) || defined(_M_AMD64) || \ +- defined(__powerpc64__) || \ ++ defined(__powerpc64__) || defined(__aarch64__) || \ + defined(__mc68020__) + # define UNALIGNED_WORD_ACCESS 1 + # endif + +From 79b7b9143fda0f33fc9375980cecc61eb42c6f66 Mon Sep 17 00:00:00 2001 +From: Ali Saidi +Date: Wed, 5 Aug 2020 21:04:37 -0500 +Subject: [PATCH 2/3] arm64 enable gc optimizations + +Similar to x86 and powerpc optimizations. + +| |compare-ruby|built-ruby| +|:------|-----------:|---------:| +|hash1 | 0.225| 0.237| +| | -| 1.05x| +|hash2 | 0.110| 0.110| +| | 1.00x| -| +--- + gc.c | 13 +++++++++++++ + gc.h | 2 ++ + 2 files changed, 15 insertions(+) + +diff --git a/gc.c b/gc.c +index 22972dfc806c..788f06f1586e 100644 +--- a/gc.c ++++ b/gc.c +@@ -1153,6 +1153,19 @@ tick(void) + return val; + } + ++#elif defined(__aarch64__) && defined(__GNUC__) ++typedef unsigned long tick_t; ++#define PRItick "lu" ++ ++static __inline__ tick_t ++tick(void) ++{ ++ unsigned long val; ++ __asm__ __volatile__ ("mrs %0, cntvct_el0" : "=r" (val)); ++ return val; ++} ++ ++ + #elif defined(_WIN32) && defined(_MSC_VER) + #include + typedef unsigned __int64 tick_t; +diff --git a/gc.h b/gc.h +index 6568079c54e5..47a4ca19a0c5 100644 +--- a/gc.h ++++ b/gc.h +@@ -8,6 +8,8 @@ + #define SET_MACHINE_STACK_END(p) __asm__ __volatile__ ("movl\t%%esp, %0" : "=r" (*(p))) + #elif defined(__powerpc64__) && defined(__GNUC__) + #define SET_MACHINE_STACK_END(p) __asm__ __volatile__ ("mr\t%0, %%r1" : "=r" (*(p))) ++#elif defined(__aarch64__) && defined(__GNUC__) ++#define SET_MACHINE_STACK_END(p) __asm__ __volatile__ ("mov\t%0, sp" : "=r" (*(p))) + #else + NOINLINE(void rb_gc_set_stack_end(VALUE **stack_end_p)); + #define SET_MACHINE_STACK_END(p) rb_gc_set_stack_end(p) + +From c985b8c6868a380e44e285368af4a4f414ce3309 Mon Sep 17 00:00:00 2001 +From: Ali Saidi +Date: Wed, 5 Aug 2020 21:15:55 -0500 +Subject: [PATCH 3/3] vm_exec.c: improve performance for arm64 + +| |compare-ruby|built-ruby| 
+|:------------------------------|-----------:|---------:| +|vm_array | 26.501M| 27.959M| +| | -| 1.06x| +|vm_attr_ivar | 21.606M| 31.429M| +| | -| 1.45x| +|vm_attr_ivar_set | 21.178M| 26.113M| +| | -| 1.23x| +|vm_backtrace | 6.621| 6.668| +| | -| 1.01x| +|vm_bigarray | 26.205M| 29.958M| +| | -| 1.14x| +|vm_bighash | 504.155k| 479.306k| +| | 1.05x| -| +|vm_block | 16.692M| 21.315M| +| | -| 1.28x| +|block_handler_type_iseq | 5.083| 7.004| +| | -| 1.38x| +--- + vm_exec.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/vm_exec.c b/vm_exec.c +index ce2e053ee745..7aa56f6ad620 100644 +--- a/vm_exec.c ++++ b/vm_exec.c +@@ -27,6 +27,9 @@ static void vm_insns_counter_count_insn(int insn) {} + #elif defined(__GNUC__) && defined(__powerpc64__) + #define DECL_SC_REG(type, r, reg) register type reg_##r __asm__("r" reg) + ++#elif defined(__GNUC__) && defined(__aarch64__) ++#define DECL_SC_REG(type, r, reg) register type reg_##r __asm__("x" reg) ++ + #else + #define DECL_SC_REG(type, r, reg) register type reg_##r + #endif +@@ -74,6 +77,11 @@ vm_exec_core(rb_execution_context_t *ec, VALUE initial) + DECL_SC_REG(rb_control_frame_t *, cfp, "15"); + #define USE_MACHINE_REGS 1 + ++#elif defined(__GNUC__) && defined(__aarch64__) ++ DECL_SC_REG(const VALUE *, pc, "19"); ++ DECL_SC_REG(rb_control_frame_t *, cfp, "20"); ++#define USE_MACHINE_REGS 1 ++ + #else + register rb_control_frame_t *reg_cfp; + const VALUE *reg_pc; diff --git a/ruby.spec b/ruby.spec index 93b4e27..d97238a 100644 --- a/ruby.spec +++ b/ruby.spec @@ -164,6 +164,10 @@ Patch18: rubygems-3.1.3-Fix-correctness-and-performance-regression-in-require.pa # Avoid possible timeout errors in TestBugReporter#test_bug_reporter_add. # https://bugs.ruby-lang.org/issues/16492 Patch19: ruby-2.7.1-Timeout-the-test_bug_reporter_add-witout-raising-err.patch +# Enable arm64 optimizations. 
+# https://bugzilla.redhat.com/show_bug.cgi?id=1884728 +# https://github.com/ruby/ruby/pull/3393 +Patch20: ruby-3.0.0-preview1-Enable-arm64-optimizations-that-exist-for-power-x86.patch Requires: %{name}-libs%{?_isa} = %{version}-%{release} Suggests: rubypick @@ -579,6 +583,7 @@ rm -rf ext/fiddle/libffi* %patch17 -p1 %patch18 -p1 %patch19 -p1 +%patch20 -p1 # Provide an example of usage of the tapset: cp -a %{SOURCE3} .