valgrind/0005-aarch64-frinta-and-frinta-vector-instructions.patch

From f5d1c336e9276dd5947ef94c9831d9d53673b75b Mon Sep 17 00:00:00 2001
From: Paul Floyd <pjfloyd@wanadoo.fr>
Date: Thu, 9 May 2024 21:01:52 +0200
Subject: [PATCH 05/11] aarch64 frinta and frintn vector instructions

The initial fix for Bug 484426 only corrected frinta and frintn
scalar instructions. This adds support for the vector variants.

(cherry picked from commit 7b66a5b58219ac1a4865da8e371edbdb8d765f32)
---
 NEWS                               |   1 +
 VEX/priv/guest_arm64_toIR.c        |  47 ++++++----
 none/tests/arm64/frinta_frintn.cpp | 141 +++++++++++++++++++++++++++++
 3 files changed, 171 insertions(+), 18 deletions(-)

diff --git a/NEWS b/NEWS
index b65f9206679b..adb52169dd87 100644
--- a/NEWS
+++ b/NEWS
@@ -8,6 +8,7 @@ The following bugs have been fixed or resolved on this branch.
 486180  [MIPS] 'VexGuestArchState' has no member named 'guest_IP_AT_SYSCALL'
 486293  memccpy false positives
 486569  linux inotify_init syscall wrapper missing POST entry in syscall_table
+n-i-bz  aarch64 frinta and frinta vector instructions

 To see details of a given bug, visit
   https://bugs.kde.org/show_bug.cgi?id=XXXXXX
diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c
index c7e395b4b63d..27d945d6328d 100644
--- a/VEX/priv/guest_arm64_toIR.c
+++ b/VEX/priv/guest_arm64_toIR.c
@@ -13821,46 +13821,57 @@ Bool dis_AdvSIMD_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn)
       /* -------- 1,1x,11000 (apparently unassigned)    (7) -------- */
       /* -------- 1,1x,11001 FRINTI 2d_2d, 4s_4s, 2s_2s (8) -------- */
       /* rm plan:
-         FRINTN: tieeven -- !! FIXME KLUDGED !!
+         FRINTN: tieeven
          FRINTM: -inf
          FRINTP: +inf
          FRINTZ: zero
-         FRINTA: tieaway -- !! FIXME KLUDGED !!
+         FRINTA: tieaway
          FRINTX: per FPCR + "exact = TRUE"
          FRINTI: per FPCR
       */
       Bool isD = (size & 1) == 1;
       if (bitQ == 0 && isD) return False; // implied 1d case

-      IRTemp irrmRM = mk_get_IR_rounding_mode();
-
-      UChar ch = '?';
-      IRTemp irrm = newTemp(Ity_I32);
+      UChar   ch = '?';
+      IROp    op = isD ? Iop_RoundF64toInt : Iop_RoundF32toInt;
+      Bool    isBinop = True;
+      IRExpr* irrmE = NULL;
       switch (ix) {
-         case 1: ch = 'n'; assign(irrm, mkU32(Irrm_NEAREST)); break;
-         case 2: ch = 'm'; assign(irrm, mkU32(Irrm_NegINF)); break;
-         case 3: ch = 'p'; assign(irrm, mkU32(Irrm_PosINF)); break;
-         case 4: ch = 'z'; assign(irrm, mkU32(Irrm_ZERO)); break;
+         case 1: ch = 'n'; isBinop = False; op = isD ? Iop_RoundF64toIntE : Iop_RoundF32toIntE; break;
+         case 2: ch = 'm'; irrmE = mkU32(Irrm_NegINF); break;
+         case 3: ch = 'p'; irrmE = mkU32(Irrm_PosINF); break;
+         case 4: ch = 'z'; irrmE = mkU32(Irrm_ZERO); break;
          // The following is a kludge.  Should be: Irrm_NEAREST_TIE_AWAY_0
-         case 5: ch = 'a'; assign(irrm, mkU32(Irrm_NEAREST)); break;
+         case 5: ch = 'a'; isBinop = False; op = isD ? Iop_RoundF64toIntA0 : Iop_RoundF32toIntA0; break;
          // I am unsure about the following, due to the "integral exact"
          // description in the manual.  What does it mean? (frintx, that is)
-         case 6: ch = 'x'; assign(irrm, mkexpr(irrmRM)); break;
-         case 8: ch = 'i'; assign(irrm, mkexpr(irrmRM)); break;
+         case 6: ch = 'x'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
+         case 8: ch = 'i'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
          default: vassert(0);
       }

-      IROp opRND = isD ? Iop_RoundF64toInt : Iop_RoundF32toInt;
       if (isD) {
          for (UInt i = 0; i < 2; i++) {
-            putQRegLane(dd, i, binop(opRND, mkexpr(irrm),
-                                            getQRegLane(nn, i, Ity_F64)));
+            if (isBinop) {
+               IRTemp irrm = newTemp(Ity_I32);
+               assign(irrm, irrmE);
+               putQRegLane(dd, i, binop(op, mkexpr(irrm),
+                                               getQRegLane(nn, i, Ity_F64)));
+            } else {
+                putQRegLane(dd, i, unop(op, getQRegLane(nn, i, Ity_F64)));
+            }
          }
       } else {
          UInt n = bitQ==1 ? 4 : 2;
          for (UInt i = 0; i < n; i++) {
-            putQRegLane(dd, i, binop(opRND, mkexpr(irrm),
-                                            getQRegLane(nn, i, Ity_F32)));
+            if (isBinop) {
+               IRTemp irrm = newTemp(Ity_I32);
+               assign(irrm, irrmE);
+               putQRegLane(dd, i, binop(op, mkexpr(irrm),
+                                               getQRegLane(nn, i, Ity_F32)));
+            } else {
+                putQRegLane(dd, i, unop(op, getQRegLane(nn, i, Ity_F32)));
+            }
          }
          if (bitQ == 0)
             putQRegLane(dd, 1, mkU64(0)); // zero out lanes 2 and 3
diff --git a/none/tests/arm64/frinta_frintn.cpp b/none/tests/arm64/frinta_frintn.cpp
index 8e13761eb966..c0803688f698 100644
--- a/none/tests/arm64/frinta_frintn.cpp
+++ b/none/tests/arm64/frinta_frintn.cpp
@@ -36,6 +36,55 @@ void test_frinta(T input, T expected)
    }
 }

+template<typename T>  // T = double selects the .2d form; any other T takes the .4s path
+void test_frinta_fullvec(T* input, T* expected)  // FRINTA on a full 128-bit vector loaded from input
+{
+    T result[2*sizeof(double)/sizeof(T)];  // 16 bytes: 2 elements for double, 4 for float
+    T* rp = result;
+    if constexpr (std::is_same_v<double, T> == true)
+    {
+     __asm__ __volatile__(
+         "ldr q23, [%1];\n"
+         "frinta v22.2d, v23.2d;\n"
+         "str q22, [%0];\n"
+         : /* no outputs: the result is stored through the pointer in %0 */
+         : "r" (rp), "r" (input)  /* %0 is a base register in [%0], so it must be "r", never "m" */
+         : "memory", "v22", "v23");  // "memory": result[] is written behind the compiler's back
+       assert(result[0] == expected[0]);
+       assert(result[1] == expected[1]);
+   }
+   else
+   {
+     __asm__ __volatile__(
+         "ldr q23, [%1];\n"
+         "frinta v22.4s, v23.4s;\n"
+         "str q22, [%0];\n"
+         : /* no outputs: the result is stored through the pointer in %0 */
+         : "r" (rp), "r" (input)  /* %0 is a base register in [%0], so it must be "r", never "m" */
+         : "memory", "v22", "v23");  // "memory": result[] is written behind the compiler's back
+       assert(result[0] == expected[0]);
+       assert(result[1] == expected[1]);
+       assert(result[2] == expected[2]);
+       assert(result[3] == expected[3]);
+   }
+}
+
+void test_frinta_halfvec(float* input, float* expected)  // FRINTA on the 64-bit .2s form
+{
+    float result[2];  // receives the two lanes stored from d22
+    float* rp = result;
+    __asm__ __volatile__(
+         "ldr d23, [%1];\n"
+         "frinta v22.2s, v23.2s;\n"
+         "str d22, [%0];\n"
+         : /* no outputs: the result is stored through the pointer in %0 */
+         : "r" (rp), "r" (input)  /* %0 is a base register in [%0], so it must be "r", never "m" */
+         : "memory", "v22", "v23");  // "memory": result[] is written behind the compiler's back
+   assert(result[0] == expected[0]);
+   assert(result[1] == expected[1]);
+}
+
+
 template<typename T>
 void test_frintn(T input, T expected)
 {
@@ -66,6 +115,54 @@ void test_frintn(T input, T expected)
    }
 }

+template<typename T>  // T = double selects the .2d form; any other T takes the .4s path
+void test_frintn_fullvec(T* input, T* expected)  // FRINTN on a full 128-bit vector loaded from input
+{
+    T result[2*sizeof(double)/sizeof(T)];  // 16 bytes: 2 elements for double, 4 for float
+    T* rp = result;
+    if constexpr (std::is_same_v<double, T> == true)
+    {
+     __asm__ __volatile__(
+         "ldr q23, [%1];\n"
+         "frintn v22.2d, v23.2d;\n"
+         "str q22, [%0];\n"
+         : /* no outputs: the result is stored through the pointer in %0 */
+         : "r" (rp), "r" (input)  /* %0 is a base register in [%0], so it must be "r", never "m" */
+         : "memory", "v22", "v23");  // "memory": result[] is written behind the compiler's back
+       assert(result[0] == expected[0]);
+       assert(result[1] == expected[1]);
+   }
+   else
+   {
+     __asm__ __volatile__(
+         "ldr q23, [%1];\n"
+         "frintn v22.4s, v23.4s;\n"
+         "str q22, [%0];\n"
+         : /* no outputs: the result is stored through the pointer in %0 */
+         : "r" (rp), "r" (input)  /* %0 is a base register in [%0], so it must be "r", never "m" */
+         : "memory", "v22", "v23");  // "memory": result[] is written behind the compiler's back
+       assert(result[0] == expected[0]);
+       assert(result[1] == expected[1]);
+       assert(result[2] == expected[2]);
+       assert(result[3] == expected[3]);
+   }
+}
+
+void test_frintn_halfvec(float* input, float* expected)  // FRINTN on the 64-bit .2s form
+{
+    float result[2];  // receives the two lanes stored from d22
+    float* rp = result;
+     __asm__ __volatile__(
+         "ldr d23, [%1];\n"
+         "frintn v22.2s, v23.2s;\n"
+         "str d22, [%0];\n"
+         : /* no outputs: the result is stored through the pointer in %0 */
+         : "r" (rp), "r" (input)  /* %0 is a base register in [%0], so it must be "r", never "m" */
+         : "memory", "v22", "v23");  // "memory": result[] is written behind the compiler's back
+   assert(result[0] == expected[0]);
+   assert(result[1] == expected[1]);
+}
+
 int main()
 {
     // round "away from zero"
@@ -78,6 +175,36 @@ int main()
     test_frinta(-1.5F, -2.0F);
     test_frinta(-2.5F, -3.0F);

+    double in1[] = {1.5, 1.5};  // vector forms: every lane carries the same tie value
+    double out1[] = {2.0, 2.0};
+    test_frinta_fullvec(in1, out1);
+    double in2[] = {2.5, 2.5};
+    double out2[] = {3.0, 3.0};
+    test_frinta_fullvec(in2, out2);
+    double in3[] = {-1.5, -1.5};
+    double out3[] = {-2.0, -2.0};
+    test_frinta_fullvec(in3, out3);
+    double in4[] = {-2.5, -2.5};
+    double out4[] = {-3.0, -3.0};
+    test_frinta_fullvec(in4, out4);
+
+    float in1f[] = {1.5F, 1.5F, 1.5F, 1.5F};  // halfvec calls compare only the first two lanes
+    float out1f[] = {2.0F, 2.0F, 2.0F, 2.0F};
+    test_frinta_fullvec(in1f, out1f);
+    test_frinta_halfvec(in1f, out1f);
+    float in2f[] = {2.5F, 2.5F, 2.5F, 2.5F};
+    float out2f[] = {3.0F, 3.0F, 3.0F, 3.0F};
+    test_frinta_fullvec(in2f, out2f);
+    test_frinta_halfvec(in2f, out2f);
+    float in3f[] = {-1.5F, -1.5F, -1.5F, -1.5F};
+    float out3f[] = {-2.0F, -2.0F, -2.0F, -2.0F};
+    test_frinta_fullvec(in3f, out3f);
+    test_frinta_halfvec(in3f, out3f);
+    float in4f[] = {-2.5F, -2.5F, -2.5F, -2.5F};
+    float out4f[] = {-3.0F, -3.0F, -3.0F, -3.0F};
+    test_frinta_fullvec(in4f, out4f);
+    test_frinta_halfvec(in4f, out4f);
+
     // round "to even"
     test_frintn(1.5, 2.0);
     test_frintn(2.5, 2.0);
@@ -87,5 +214,19 @@ int main()
     test_frintn(2.5F, 2.0F);
     test_frintn(-1.5F, -2.0F);
     test_frintn(-2.5F, -2.0F);
+
+    test_frintn_fullvec(in1, out1);  // 1.5 -> 2.0
+    test_frintn_fullvec(in2, out1);  // 2.5 ties to even -> 2.0, so out1 is reused
+    test_frintn_fullvec(in3, out3);  // -1.5 -> -2.0
+    test_frintn_fullvec(in4, out3);  // -2.5 ties to even -> -2.0, so out3 is reused
+
+    test_frintn_fullvec(in1f, out1f);  // float lanes: same expectations as the double cases
+    test_frintn_halfvec(in1f, out1f);
+    test_frintn_fullvec(in2f, out1f);  // 2.5F -> 2.0F
+    test_frintn_halfvec(in2f, out1f);
+    test_frintn_fullvec(in3f, out3f);
+    test_frintn_halfvec(in3f, out3f);
+    test_frintn_fullvec(in4f, out3f);  // -2.5F -> -2.0F
+    test_frintn_halfvec(in4f, out3f);
 }

--
2.45.2