From 81ddc792a605e75a4c87589d7d6247bef836e3b4 Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: Thu, 2 Dec 2021 16:44:48 +0000 Subject: [PATCH] import valgrind-3.18.1-6.el8 --- ...algrind-3.18.1-amd64-more-spec-rules.patch | 105 + .../valgrind-3.18.1-arm64-atomic-align.patch | 163 ++ ...valgrind-3.18.1-arm64-doubleword-cas.patch | 121 ++ .../valgrind-3.18.1-arm64-ldaxp-stlxp.patch | 1440 +++++++++++++ SOURCES/valgrind-3.18.1-condvar.patch | 284 +++ .../valgrind-3.18.1-demangle-namespace.patch | 35 + SOURCES/valgrind-3.18.1-dhat-tests-copy.patch | 20 + ...algrind-3.18.1-gdbserver_tests-hwcap.patch | 25 + SOURCES/valgrind-3.18.1-ppc-pstq-tests.patch | 1876 +++++++++++++++++ SOURCES/valgrind-3.18.1-ppc-pstq.patch | 47 + ...algrind-3.18.1-ppc64-lxsibzx-lxsihzx.patch | 60 + .../valgrind-3.18.1-rust-v0-demangle.patch | 137 ++ SOURCES/valgrind-3.18.1-s390x-EXRL.patch | 549 +++++ SPECS/valgrind.spec | 77 +- 14 files changed, 4938 insertions(+), 1 deletion(-) create mode 100644 SOURCES/valgrind-3.18.1-amd64-more-spec-rules.patch create mode 100644 SOURCES/valgrind-3.18.1-arm64-atomic-align.patch create mode 100644 SOURCES/valgrind-3.18.1-arm64-doubleword-cas.patch create mode 100644 SOURCES/valgrind-3.18.1-arm64-ldaxp-stlxp.patch create mode 100644 SOURCES/valgrind-3.18.1-condvar.patch create mode 100644 SOURCES/valgrind-3.18.1-demangle-namespace.patch create mode 100644 SOURCES/valgrind-3.18.1-dhat-tests-copy.patch create mode 100644 SOURCES/valgrind-3.18.1-gdbserver_tests-hwcap.patch create mode 100644 SOURCES/valgrind-3.18.1-ppc-pstq-tests.patch create mode 100644 SOURCES/valgrind-3.18.1-ppc-pstq.patch create mode 100644 SOURCES/valgrind-3.18.1-ppc64-lxsibzx-lxsihzx.patch create mode 100644 SOURCES/valgrind-3.18.1-rust-v0-demangle.patch create mode 100644 SOURCES/valgrind-3.18.1-s390x-EXRL.patch diff --git a/SOURCES/valgrind-3.18.1-amd64-more-spec-rules.patch b/SOURCES/valgrind-3.18.1-amd64-more-spec-rules.patch new file mode 100644 index 0000000..87794ee --- /dev/null +++ b/SOURCES/valgrind-3.18.1-amd64-more-spec-rules.patch @@ -0,0 +1,105 @@ +commit 595341b150312d2407bd43304449bf39ec3e1fa8 +Author: Julian Seward +Date: Sat Nov 13 19:59:07 2021 +0100 + + amd64 front end: add more spec rules: + + S after SHRQ + Z after SHLQ + NZ after SHLQ + Z after SHLL + S after SHLL + + The lack of at least one of these was observed to cause occasional false + positives in Memcheck. + + Plus add commented-out cases so as to complete the set of 12 rules + {Z,NZ,S,NS} after {SHRQ,SHLQ,SHLL}. The commented-out ones are commented + out because I so far didn't find any use cases for them. 
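As an illustration (a hypothetical example written for this note, not taken
from the patch), the pattern these rules serve is guest code that branches
directly on the flags left by a shift. The amd64-only program below forces
the exact "shrq followed by a sign-flag branch" pairing named by the new
"S after SHRQ" rule; with the rule in place, guest_amd64_spechelper can fold
the condition into a direct read of bit 63 of the shift result, which
Memcheck can then instrument precisely instead of approximating through the
generic flag helper.

    /* amd64-only sketch: a SHRQ whose sign flag feeds a branch. */
    #include <stdio.h>

    int main(void)
    {
       unsigned long x = 0x8000000000000001UL;
       int neg;
       __asm__ volatile(
          "shrq $1, %[v]  \n\t"   /* SHRQ: SF := bit 63 of the result */
          "movl $0, %[n]  \n\t"
          "jns  1f        \n\t"   /* conditional branch on S after the shift */
          "movl $1, %[n]  \n\t"
          "1:"
          : [v] "+r" (x), [n] "=&r" (neg)
          : /* no inputs */
          : "cc");
       printf("sign after shrq = %d\n", neg);   /* prints 0 */
       return 0;
    }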
+ +diff --git a/VEX/priv/guest_amd64_helpers.c b/VEX/priv/guest_amd64_helpers.c +index 9d61e7a0f..ba71c1b62 100644 +--- a/VEX/priv/guest_amd64_helpers.c ++++ b/VEX/priv/guest_amd64_helpers.c +@@ -1823,16 +1823,26 @@ IRExpr* guest_amd64_spechelper ( const HChar* function_name, + /*---------------- SHRQ ----------------*/ + + if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondZ)) { +- /* SHRQ, then Z --> test dep1 == 0 */ ++ /* SHRQ, then Z --> test result[63:0] == 0 */ + return unop(Iop_1Uto64, + binop(Iop_CmpEQ64, cc_dep1, mkU64(0))); + } + if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondNZ)) { +- /* SHRQ, then NZ --> test dep1 != 0 */ ++ /* SHRQ, then NZ --> test result[63:0] != 0 */ + return unop(Iop_1Uto64, + binop(Iop_CmpNE64, cc_dep1, mkU64(0))); + } + ++ if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondS)) { ++ /* SHRQ, then S --> (ULong)result[63] (result is in dep1) */ ++ return binop(Iop_Shr64, cc_dep1, mkU8(63)); ++ } ++ // No known test case for this, hence disabled: ++ //if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondNS)) { ++ // /* SHRQ, then NS --> (ULong) ~ result[63] */ ++ // vassert(0); ++ //} ++ + /*---------------- SHRL ----------------*/ + + if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondZ)) { +@@ -1881,6 +1891,52 @@ IRExpr* guest_amd64_spechelper ( const HChar* function_name, + // mkU32(0))); + //} + ++ /*---------------- SHLQ ----------------*/ ++ ++ if (isU64(cc_op, AMD64G_CC_OP_SHLQ) && isU64(cond, AMD64CondZ)) { ++ /* SHLQ, then Z --> test dep1 == 0 */ ++ return unop(Iop_1Uto64, ++ binop(Iop_CmpEQ64, cc_dep1, mkU64(0))); ++ } ++ if (isU64(cc_op, AMD64G_CC_OP_SHLQ) && isU64(cond, AMD64CondNZ)) { ++ /* SHLQ, then NZ --> test dep1 != 0 */ ++ return unop(Iop_1Uto64, ++ binop(Iop_CmpNE64, cc_dep1, mkU64(0))); ++ } ++ ++ //if (isU64(cc_op, AMD64G_CC_OP_SHLQ) && isU64(cond, AMD64CondS)) { ++ // /* SHLQ, then S --> (ULong)result[63] */ ++ // vassert(0); ++ //} ++ //if (isU64(cc_op, AMD64G_CC_OP_SHLQ) && isU64(cond, AMD64CondNS)) { ++ // /* SHLQ, then NS --> (ULong) ~ result[63] */ ++ // vassert(0); ++ //} ++ ++ /*---------------- SHLL ----------------*/ ++ ++ if (isU64(cc_op, AMD64G_CC_OP_SHLL) && isU64(cond, AMD64CondZ)) { ++ /* SHLL, then Z --> test result[31:0] == 0 */ ++ return unop(Iop_1Uto64, ++ binop(Iop_CmpEQ32, unop(Iop_64to32, cc_dep1), ++ mkU32(0))); ++ } ++ //if (isU64(cc_op, AMD64G_CC_OP_SHLL) && isU64(cond, AMD64CondNZ)) { ++ // /* SHLL, then NZ --> test dep1 != 0 */ ++ // vassert(0); ++ //} ++ ++ if (isU64(cc_op, AMD64G_CC_OP_SHLL) && isU64(cond, AMD64CondS)) { ++ /* SHLL, then S --> (ULong)result[31] */ ++ return binop(Iop_And64, ++ binop(Iop_Shr64, cc_dep1, mkU8(31)), ++ mkU64(1)); ++ } ++ //if (isU64(cc_op, AMD64G_CC_OP_SHLL) && isU64(cond, AMD64CondNS)) { ++ // /* SHLL, then NS --> (ULong) ~ result[31] */ ++ // vassert(0); ++ //} ++ + /*---------------- COPY ----------------*/ + /* This can happen, as a result of amd64 FP compares: "comisd ... ; + jbe" for example. */ diff --git a/SOURCES/valgrind-3.18.1-arm64-atomic-align.patch b/SOURCES/valgrind-3.18.1-arm64-atomic-align.patch new file mode 100644 index 0000000..8cce35f --- /dev/null +++ b/SOURCES/valgrind-3.18.1-arm64-atomic-align.patch @@ -0,0 +1,163 @@ +commit 2be719921e700a9ac9b85f470ed87cb8adf8151b +Author: Julian Seward +Date: Sat Nov 13 09:27:01 2021 +0100 + + Bug 445415 - arm64 front end: alignment checks missing for atomic instructions. 
+
+    For the arm64 front end, none of the atomic instructions have address
+    alignment checks included in their IR.  They all should.  The effect of
+    missing alignment checks in the IR is that, since this IR will in most cases
+    be translated back to atomic instructions in the back end, we will get
+    alignment traps (SIGBUS) on the host side and not on the guest side, which is
+    (very) incorrect behaviour of the simulation.
+
+
+diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c
+index ee018c6a9..16a7e075f 100644
+--- a/VEX/priv/guest_arm64_toIR.c
++++ b/VEX/priv/guest_arm64_toIR.c
+@@ -4833,6 +4833,34 @@ static IRTemp gen_zwidening_load ( UInt szB, IRTemp addr )
+ }
+
+
++/* Generate a SIGBUS followed by a restart of the current instruction if
++   `effective_addr` is not `align`-aligned.  This is required behaviour for
++   atomic instructions.  This assumes that guest_RIP_curr_instr is set correctly!
++
++   This is hardwired to generate SIGBUS because so far the only supported arm64
++   (arm64-linux) does that.  Should we need to later extend it to generate some
++   other signal, use the same scheme as with gen_SIGNAL_if_not_XX_aligned in
++   guest_amd64_toIR.c. */
++static
++void gen_SIGBUS_if_not_XX_aligned ( IRTemp effective_addr, ULong align )
++{
++   if (align == 1) {
++      return;
++   }
++   vassert(align == 16 || align == 8 || align == 4 || align == 2);
++   stmt(
++      IRStmt_Exit(
++         binop(Iop_CmpNE64,
++               binop(Iop_And64,mkexpr(effective_addr),mkU64(align-1)),
++               mkU64(0)),
++         Ijk_SigBUS,
++         IRConst_U64(guest_PC_curr_instr),
++         OFFB_PC
++      )
++   );
++}
++
++
+ /* Generate a "standard 7" name, from bitQ and size.  But also
+    allow ".1d" since that's occasionally useful.  */
+ static
+@@ -6670,7 +6698,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
+
+       IRTemp ea = newTemp(Ity_I64);
+       assign(ea, getIReg64orSP(nn));
+-      /* FIXME generate check that ea is szB-aligned */
++      gen_SIGBUS_if_not_XX_aligned(ea, szB);
+
+       if (isLD && ss == BITS5(1,1,1,1,1)) {
+          IRTemp res = newTemp(ty);
+@@ -6803,7 +6831,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
+
+       IRTemp ea = newTemp(Ity_I64);
+       assign(ea, getIReg64orSP(nn));
+-      /* FIXME generate check that ea is 2*elemSzB-aligned */
++      gen_SIGBUS_if_not_XX_aligned(ea, fullSzB);
+
+       if (isLD && ss == BITS5(1,1,1,1,1)) {
+          if (abiinfo->guest__use_fallback_LLSC) {
+@@ -7044,7 +7072,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
+
+       IRTemp ea = newTemp(Ity_I64);
+       assign(ea, getIReg64orSP(nn));
+-      /* FIXME generate check that ea is szB-aligned */
++      gen_SIGBUS_if_not_XX_aligned(ea, szB);
+
+       if (isLD) {
+          IRTemp res = newTemp(ty);
+@@ -7159,6 +7187,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
+
+       IRTemp ea = newTemp(Ity_I64);
+       assign(ea, getIReg64orSP(nn));
++      gen_SIGBUS_if_not_XX_aligned(ea, szB);
+
+       // Insert barrier before loading for acquire and acquire-release variants:
+       // A and AL.
+@@ -7266,6 +7295,10 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
+       IRType ty = integerIRTypeOfSize(szB);
+       Bool is64 = szB == 8;
+
++      IRTemp ea = newTemp(Ity_I64);
++      assign(ea, getIReg64orSP(nn));
++      gen_SIGBUS_if_not_XX_aligned(ea, szB);
++
+       IRExpr *exp = narrowFrom64(ty, getIReg64orZR(ss));
+       IRExpr *new = narrowFrom64(ty, getIReg64orZR(tt));
+
+@@ -7275,7 +7308,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
+       // Store the result back if LHS remains unchanged in memory.
+ IRTemp old = newTemp(ty); + stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old, +- Iend_LE, getIReg64orSP(nn), ++ Iend_LE, mkexpr(ea), + /*expdHi*/NULL, exp, + /*dataHi*/NULL, new)) ); + +@@ -7307,6 +7340,10 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn, + if ((ss & 0x1) || (tt & 0x1)) { + /* undefined; fall through */ + } else { ++ IRTemp ea = newTemp(Ity_I64); ++ assign(ea, getIReg64orSP(nn)); ++ gen_SIGBUS_if_not_XX_aligned(ea, is64 ? 16 : 8); ++ + IRExpr *expLo = getIRegOrZR(is64, ss); + IRExpr *expHi = getIRegOrZR(is64, ss + 1); + IRExpr *newLo = getIRegOrZR(is64, tt); +@@ -7318,7 +7355,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn, + stmt(IRStmt_MBE(Imbe_Fence)); + + stmt( IRStmt_CAS(mkIRCAS(oldHi, oldLo, +- Iend_LE, getIReg64orSP(nn), ++ Iend_LE, mkexpr(ea), + expHi, expLo, + newHi, newLo)) ); + +diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c +index b65e27db4..39c6aaa46 100644 +--- a/VEX/priv/host_arm64_defs.c ++++ b/VEX/priv/host_arm64_defs.c +@@ -4033,6 +4033,7 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, + case Ijk_FlushDCache: trcval = VEX_TRC_JMP_FLUSHDCACHE; break; + case Ijk_NoRedir: trcval = VEX_TRC_JMP_NOREDIR; break; + case Ijk_SigTRAP: trcval = VEX_TRC_JMP_SIGTRAP; break; ++ case Ijk_SigBUS: trcval = VEX_TRC_JMP_SIGBUS; break; + //case Ijk_SigSEGV: trcval = VEX_TRC_JMP_SIGSEGV; break; + case Ijk_Boring: trcval = VEX_TRC_JMP_BORING; break; + /* We don't expect to see the following being assisted. */ +diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c +index 094e7e74b..82cb2d78c 100644 +--- a/VEX/priv/host_arm64_isel.c ++++ b/VEX/priv/host_arm64_isel.c +@@ -4483,6 +4483,7 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt ) + case Ijk_InvalICache: + case Ijk_FlushDCache: + case Ijk_SigTRAP: ++ case Ijk_SigBUS: + case Ijk_Yield: { + HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst)); + addInstr(env, ARM64Instr_XAssisted(r, amPC, cc, +@@ -4576,8 +4577,8 @@ static void iselNext ( ISelEnv* env, + case Ijk_InvalICache: + case Ijk_FlushDCache: + case Ijk_SigTRAP: +- case Ijk_Yield: +- { ++ case Ijk_SigBUS: ++ case Ijk_Yield: { + HReg r = iselIntExpr_R(env, next); + ARM64AMode* amPC = mk_baseblock_64bit_access_amode(offsIP); + addInstr(env, ARM64Instr_XAssisted(r, amPC, ARM64cc_AL, jk)); diff --git a/SOURCES/valgrind-3.18.1-arm64-doubleword-cas.patch b/SOURCES/valgrind-3.18.1-arm64-doubleword-cas.patch new file mode 100644 index 0000000..7cf0bf5 --- /dev/null +++ b/SOURCES/valgrind-3.18.1-arm64-doubleword-cas.patch @@ -0,0 +1,121 @@ +commit 7dbe2fed72886874f2eaf57dc07929542ae55b58 +Author: Julian Seward +Date: Fri Nov 12 10:40:48 2021 +0100 + + Bug 445354 - arm64 backend: incorrect code emitted for doubleword CAS. + + The sequence of instructions emitted by the arm64 backend for doubleword + compare-and-swap is incorrect. This could lead to incorrect simulation of the + AArch8.1 atomic instructions (CASP, at least). It also causes failures in the + upcoming fix for v8.0 support for LD{,A}XP/ST{,L}XP in bug 444399, at least + when running with the fallback LL/SC implementation + (`--sim-hints=fallback-llsc`, or as autoselected at startup). In the worst + case it can cause segfaulting in the generated code, because it could jump + backwards unexpectedly far. + + The problem is the sequence emitted for ARM64in_CASP: + + * the jump offsets are incorrect, both for `bne out` (x 2) and `cbnz w1, loop`. 
+
+    * using w1 to hold the success indication of the stxp instruction trashes the
+      previous value in x1.  But the value in x1 is an output of ARM64in_CASP,
+      hence one of the two output registers is corrupted.  That confuses any code
+      downstream that wants to inspect those values to find out whether or not the
+      transaction succeeded.
+
+    The fixes are to
+
+    * fix the branch offsets
+
+    * use a different register to hold the stxp success indication.  w3 is a
+      convenient choice.
+
+diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c
+index 5dccc0495..5657bcab9 100644
+--- a/VEX/priv/host_arm64_defs.c
++++ b/VEX/priv/host_arm64_defs.c
+@@ -2271,6 +2271,7 @@ void getRegUsage_ARM64Instr ( HRegUsage* u, const ARM64Instr* i, Bool mode64 )
+          addHRegUse(u, HRmWrite, hregARM64_X1());
+          addHRegUse(u, HRmWrite, hregARM64_X9());
+          addHRegUse(u, HRmWrite, hregARM64_X8());
++         addHRegUse(u, HRmWrite, hregARM64_X3());
+          break;
+       case ARM64in_MFence:
+          return;
+@@ -4254,16 +4255,16 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc,
+
+          -- always:
+          cmp x0, x8   // EB08001F
+-         bne out      // 540000E1 (b.ne #28 <out>)
++         bne out      // 540000A1
+          cmp x1, x9   // EB09003F
+-         bne out      // 540000A1 (b.ne #20 <out>)
++         bne out      // 54000061
+
+          -- one of:
+-         stxp w1, x6, x7, [x2]   // C8211C46
+-         stxp w1, w6, w7, [x2]   // 88211C46
++         stxp w3, x6, x7, [x2]   // C8231C46
++         stxp w3, w6, w7, [x2]   // 88231C46
+
+          -- always:
+-         cbnz w1, loop  // 35FFFE81 (cbnz w1, #-48 <loop>)
++         cbnz w3, loop  // 35FFFF03
+          out:
+       */
+       switch (i->ARM64in.CASP.szB) {
+@@ -4277,15 +4278,15 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc,
+          default: vassert(0);
+       }
+       *p++ = 0xEB08001F;
+-      *p++ = 0x540000E1;
+-      *p++ = 0xEB09003F;
+       *p++ = 0x540000A1;
++      *p++ = 0xEB09003F;
++      *p++ = 0x54000061;
+       switch (i->ARM64in.CASP.szB) {
+-         case 8: *p++ = 0xC8211C46; break;
+-         case 4: *p++ = 0x88211C46; break;
++         case 8: *p++ = 0xC8231C46; break;
++         case 4: *p++ = 0x88231C46; break;
+          default: vassert(0);
+       }
+-      *p++ = 0x35FFFE81;
++      *p++ = 0x35FFFF03;
+       goto done;
+    }
+    case ARM64in_MFence: {
+diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h
+index f0737f2c6..01fb5708e 100644
+--- a/VEX/priv/host_arm64_defs.h
++++ b/VEX/priv/host_arm64_defs.h
+@@ -720,6 +720,7 @@ typedef
+          Int szB; /* 1, 2, 4 or 8 */
+       } StrEX;
+       /* x1 = CAS(x3(addr), x5(expected) -> x7(new)),
++         and trashes x8
+          where x1[8*szB-1 : 0] == x5[8*szB-1 : 0] indicates success,
+                x1[8*szB-1 : 0] != x5[8*szB-1 : 0] indicates failure.
+          Uses x8 as scratch (but that's not allocatable).
+@@ -738,7 +739,7 @@ typedef
+            -- if branch taken, failure; x1[[8*szB-1 : 0] holds old value
+            -- attempt to store
+            stxr w8, x7, [x3]
+-           -- if store successful, x1==0, so the eor is "x1 := x5"
++           -- if store successful, x8==0
+            -- if store failed, branch back and try again.
+            cbne w8, loop
+         after:
+      */
+      struct {
+         Int szB; /* 1, 2, 4 or 8 */
+      } CAS;
++   /* Doubleword CAS, 2 x 32 bit or 2 x 64 bit
++      x0(oldLSW),x1(oldMSW)
++         = DCAS(x2(addr), x4(expectedLSW),x5(expectedMSW)
++                -> x6(newLSW),x7(newMSW))
++      and trashes x8, x9 and x3
++   */
+      struct {
+         Int szB; /* 4 or 8 */
+      } CASP;
diff --git a/SOURCES/valgrind-3.18.1-arm64-ldaxp-stlxp.patch b/SOURCES/valgrind-3.18.1-arm64-ldaxp-stlxp.patch
new file mode 100644
index 0000000..d118cc6
--- /dev/null
+++ b/SOURCES/valgrind-3.18.1-arm64-ldaxp-stlxp.patch
@@ -0,0 +1,1440 @@
+commit 530df882b8f60ecacaf2b9b8a719f7ea1c1d1650
+Author: Julian Seward
+Date:   Fri Nov 12 12:13:45 2021 +0100
+
+    Bug 444399 - disInstr(arm64): unhandled instruction 0xC87F2D89 (LD{,A}XP and ST{,L}XP).
+
+    This is unfortunately a big and complex patch, to implement LD{,A}XP and
+    ST{,L}XP.  These were omitted from the original AArch64 v8.0 implementation
+    for unknown reasons.
+
+    (Background) the patch is made significantly more complex because for AArch64
+    we actually have two implementations of the underlying
+    Load-Linked/Store-Conditional (LL/SC) machinery: a "primary" implementation,
+    which translates LL/SC more or less directly into IR and re-emits them at the
+    back end, and a "fallback" implementation that implements LL/SC "manually", by
+    taking advantage of the fact that V serialises thread execution, so we can
+    "implement" LL/SC by simulating a reservation using fields LLSC_* in the guest
+    state, and invalidating the reservation at every thread switch.
+
+    (Background) the fallback scheme is needed because the primary scheme is in
+    violation of the ARMv8 semantics in that it can (easily) introduce extra
+    memory references between the LL and SC, hence on some hardware causing the
+    reservation to always fail and so the simulated program to wind up looping
+    forever.
+
+    For these instructions, big picture:
+
+    * for the primary implementation, we take advantage of the fact that
+      IRStmt_LLSC allows I128 bit transactions to be represented.  Hence we bundle
+      up the two 64-bit data elements into an I128 (or vice versa) and present a
+      single I128-typed IRStmt_LLSC in the IR.  In the backend, those are
+      re-emitted as LDXP/STXP respectively.  For LL/SC on 32-bit register pairs,
+      that bundling produces a single 64-bit item, and so the existing LL/SC
+      backend machinery handles it.  The effect is that a doubleword 32-bit LL/SC
+      in the front end translates into a single 64-bit LL/SC in the back end.
+      Overall, though, the implementation is straightforward.
+
+    * for the fallback implementation, it is necessary to extend the guest state
+      field `guest_LLSC_DATA` to represent a 128-bit transaction, by splitting it
+      into _DATA_LO64 and _DATA_HI64.  Then, the implementation is an exact
+      analogue of the fallback implementation for single-word LL/SC.  It takes
+      advantage of the fact that the backend already supports 128-bit CAS, as
+      fixed in bug 445354.  As with the primary implementation, doubleword 32-bit
+      LL/SC is bundled into a single 64-bit transaction.
+
+    Detailed changes:
+
+    * new arm64 guest state fields LLSC_DATA_LO64/LLSC_DATA_HI64 to replace
+      guest_LLSC_DATA
+
+    * (ridealong fix) arm64 front end: a fix to a minor and harmless decoding bug
+      for the single-word LDX/STX case.
+ + * arm64 front end: IR generation for LD{,A}XP/ST{,L}XP: tedious and + longwinded, but per comments above, an exact(ish) analogue of the singleword + case + + * arm64 backend: new insns ARM64Instr_LdrEXP / ARM64Instr_StrEXP to wrap up 2 + x 64 exclusive loads/stores. Per comments above, there's no need to handle + the 2 x 32 case. + + * arm64 isel: translate I128-typed IRStmt_LLSC into the above two insns + + * arm64 isel: some auxiliary bits and pieces needed to handle I128 values; + this is standard doubleword isel stuff + + * arm64 isel: (ridealong fix): Ist_CAS: check for endianness of the CAS! + + * arm64 isel: (ridealong) a couple of formatting fixes + + * IR infrastructure: add support for I128 constants, done the same as V128 + constants + + * memcheck: handle shadow loads and stores for I128 values + + * testcase: memcheck/tests/atomic_incs.c: on arm64, also test 128-bit atomic + addition, to check we really have atomicity right + + * testcase: new test none/tests/arm64/ldxp_stxp.c, tests operation but not + atomicity. (Smoke test). + +diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c +index 12a1c5978..ee018c6a9 100644 +--- a/VEX/priv/guest_arm64_toIR.c ++++ b/VEX/priv/guest_arm64_toIR.c +@@ -1184,9 +1184,10 @@ static IRExpr* narrowFrom64 ( IRType dstTy, IRExpr* e ) + #define OFFB_CMSTART offsetof(VexGuestARM64State,guest_CMSTART) + #define OFFB_CMLEN offsetof(VexGuestARM64State,guest_CMLEN) + +-#define OFFB_LLSC_SIZE offsetof(VexGuestARM64State,guest_LLSC_SIZE) +-#define OFFB_LLSC_ADDR offsetof(VexGuestARM64State,guest_LLSC_ADDR) +-#define OFFB_LLSC_DATA offsetof(VexGuestARM64State,guest_LLSC_DATA) ++#define OFFB_LLSC_SIZE offsetof(VexGuestARM64State,guest_LLSC_SIZE) ++#define OFFB_LLSC_ADDR offsetof(VexGuestARM64State,guest_LLSC_ADDR) ++#define OFFB_LLSC_DATA_LO64 offsetof(VexGuestARM64State,guest_LLSC_DATA_LO64) ++#define OFFB_LLSC_DATA_HI64 offsetof(VexGuestARM64State,guest_LLSC_DATA_HI64) + + + /* ---------------- Integer registers ---------------- */ +@@ -6652,7 +6653,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn, + (coregrind/m_scheduler/scheduler.c, run_thread_for_a_while() + has to do this bit) + */ +- if (INSN(29,23) == BITS7(0,0,1,0,0,0,0) ++ if (INSN(29,24) == BITS6(0,0,1,0,0,0) + && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,0) + && INSN(14,10) == BITS5(1,1,1,1,1)) { + UInt szBlg2 = INSN(31,30); +@@ -6678,7 +6679,8 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn, + // if it faults. 
+ IRTemp loaded_data64 = newTemp(Ity_I64); + assign(loaded_data64, widenUto64(ty, loadLE(ty, mkexpr(ea)))); +- stmt( IRStmt_Put( OFFB_LLSC_DATA, mkexpr(loaded_data64) )); ++ stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64, mkexpr(loaded_data64) )); ++ stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64, mkU64(0) )); + stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) )); + stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(szB) )); + putIReg64orZR(tt, mkexpr(loaded_data64)); +@@ -6729,7 +6731,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn, + )); + // Fail if the data doesn't match the LL data + IRTemp llsc_data64 = newTemp(Ity_I64); +- assign(llsc_data64, IRExpr_Get(OFFB_LLSC_DATA, Ity_I64)); ++ assign(llsc_data64, IRExpr_Get(OFFB_LLSC_DATA_LO64, Ity_I64)); + stmt( IRStmt_Exit( + binop(Iop_CmpNE64, widenUto64(ty, loadLE(ty, mkexpr(ea))), + mkexpr(llsc_data64)), +@@ -6771,6 +6773,257 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn, + /* else fall through */ + } + ++ /* -------------------- LD{,A}XP -------------------- */ ++ /* -------------------- ST{,L}XP -------------------- */ ++ /* 31 30 29 23 20 15 14 9 4 ++ 1 sz 001000 011 11111 0 t2 n t1 LDXP Rt1, Rt2, [Xn|SP] ++ 1 sz 001000 011 11111 1 t2 n t1 LDAXP Rt1, Rt2, [Xn|SP] ++ 1 sz 001000 001 s 0 t2 n t1 STXP Ws, Rt1, Rt2, [Xn|SP] ++ 1 sz 001000 001 s 1 t2 n t1 STLXP Ws, Rt1, Rt2, [Xn|SP] ++ */ ++ /* See just above, "LD{,A}X{R,RH,RB} / ST{,L}X{R,RH,RB}", for detailed ++ comments about this implementation. Note the 'sz' field here is only 1 ++ bit; above, it is 2 bits, and has a different encoding. ++ */ ++ if (INSN(31,31) == 1 ++ && INSN(29,24) == BITS6(0,0,1,0,0,0) ++ && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,1)) { ++ Bool elemIs64 = INSN(30,30) == 1; ++ Bool isLD = INSN(22,22) == 1; ++ Bool isAcqOrRel = INSN(15,15) == 1; ++ UInt ss = INSN(20,16); ++ UInt tt2 = INSN(14,10); ++ UInt nn = INSN(9,5); ++ UInt tt1 = INSN(4,0); ++ ++ UInt elemSzB = elemIs64 ? 8 : 4; ++ UInt fullSzB = 2 * elemSzB; ++ IRType elemTy = integerIRTypeOfSize(elemSzB); ++ IRType fullTy = integerIRTypeOfSize(fullSzB); ++ ++ IRTemp ea = newTemp(Ity_I64); ++ assign(ea, getIReg64orSP(nn)); ++ /* FIXME generate check that ea is 2*elemSzB-aligned */ ++ ++ if (isLD && ss == BITS5(1,1,1,1,1)) { ++ if (abiinfo->guest__use_fallback_LLSC) { ++ // Fallback implementation of LL. ++ // Do the load first so we don't update any guest state if it ++ // faults. Assumes little-endian guest. ++ if (fullTy == Ity_I64) { ++ vassert(elemSzB == 4); ++ IRTemp loaded_data64 = newTemp(Ity_I64); ++ assign(loaded_data64, loadLE(fullTy, mkexpr(ea))); ++ stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64, mkexpr(loaded_data64) )); ++ stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64, mkU64(0) )); ++ stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) )); ++ stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(8) )); ++ putIReg64orZR(tt1, unop(Iop_32Uto64, ++ unop(Iop_64to32, ++ mkexpr(loaded_data64)))); ++ putIReg64orZR(tt2, unop(Iop_32Uto64, ++ unop(Iop_64HIto32, ++ mkexpr(loaded_data64)))); ++ } else { ++ vassert(elemSzB == 8 && fullTy == Ity_I128); ++ IRTemp loaded_data128 = newTemp(Ity_I128); ++ // Hack: do the load as V128 rather than I128 so as to avoid ++ // having to implement I128 loads in the arm64 back end. 
++ assign(loaded_data128, unop(Iop_ReinterpV128asI128, ++ loadLE(Ity_V128, mkexpr(ea)))); ++ IRTemp loaded_data_lo64 = newTemp(Ity_I64); ++ IRTemp loaded_data_hi64 = newTemp(Ity_I64); ++ assign(loaded_data_lo64, unop(Iop_128to64, ++ mkexpr(loaded_data128))); ++ assign(loaded_data_hi64, unop(Iop_128HIto64, ++ mkexpr(loaded_data128))); ++ stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64, ++ mkexpr(loaded_data_lo64) )); ++ stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64, ++ mkexpr(loaded_data_hi64) )); ++ stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) )); ++ stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(16) )); ++ putIReg64orZR(tt1, mkexpr(loaded_data_lo64)); ++ putIReg64orZR(tt2, mkexpr(loaded_data_hi64)); ++ } ++ } else { ++ // Non-fallback implementation of LL. ++ IRTemp res = newTemp(fullTy); // I64 or I128 ++ stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), NULL/*LL*/)); ++ // Assuming a little-endian guest here. Rt1 goes at the lower ++ // address, so it must live in the least significant half of `res`. ++ IROp opGetLO = fullTy == Ity_I128 ? Iop_128to64 : Iop_64to32; ++ IROp opGetHI = fullTy == Ity_I128 ? Iop_128HIto64 : Iop_64HIto32; ++ putIReg64orZR(tt1, widenUto64(elemTy, unop(opGetLO, mkexpr(res)))); ++ putIReg64orZR(tt2, widenUto64(elemTy, unop(opGetHI, mkexpr(res)))); ++ } ++ if (isAcqOrRel) { ++ stmt(IRStmt_MBE(Imbe_Fence)); ++ } ++ DIP("ld%sxp %s, %s, [%s] %s\n", ++ isAcqOrRel ? (isLD ? "a" : "l") : "", ++ nameIRegOrZR(elemSzB == 8, tt1), ++ nameIRegOrZR(elemSzB == 8, tt2), ++ nameIReg64orSP(nn), ++ abiinfo->guest__use_fallback_LLSC ++ ? "(fallback implementation)" : ""); ++ return True; ++ } ++ if (!isLD) { ++ if (isAcqOrRel) { ++ stmt(IRStmt_MBE(Imbe_Fence)); ++ } ++ if (abiinfo->guest__use_fallback_LLSC) { ++ // Fallback implementation of SC. ++ // This is really ugly, since we don't have any way to do ++ // proper if-then-else. First, set up as if the SC failed, ++ // and jump forwards if it really has failed. ++ ++ // Continuation address ++ IRConst* nia = IRConst_U64(guest_PC_curr_instr + 4); ++ ++ // "the SC failed". Any non-zero value means failure. ++ putIReg64orZR(ss, mkU64(1)); ++ ++ IRTemp tmp_LLsize = newTemp(Ity_I64); ++ assign(tmp_LLsize, IRExpr_Get(OFFB_LLSC_SIZE, Ity_I64)); ++ stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) // "no transaction" ++ )); ++ // Fail if no or wrong-size transaction ++ vassert((fullSzB == 8 && fullTy == Ity_I64) ++ || (fullSzB == 16 && fullTy == Ity_I128)); ++ stmt( IRStmt_Exit( ++ binop(Iop_CmpNE64, mkexpr(tmp_LLsize), mkU64(fullSzB)), ++ Ijk_Boring, nia, OFFB_PC ++ )); ++ // Fail if the address doesn't match the LL address ++ stmt( IRStmt_Exit( ++ binop(Iop_CmpNE64, mkexpr(ea), ++ IRExpr_Get(OFFB_LLSC_ADDR, Ity_I64)), ++ Ijk_Boring, nia, OFFB_PC ++ )); ++ // The data to be stored. ++ IRTemp store_data = newTemp(fullTy); ++ if (fullTy == Ity_I64) { ++ assign(store_data, ++ binop(Iop_32HLto64, ++ narrowFrom64(Ity_I32, getIReg64orZR(tt2)), ++ narrowFrom64(Ity_I32, getIReg64orZR(tt1)))); ++ } else { ++ assign(store_data, ++ binop(Iop_64HLto128, ++ getIReg64orZR(tt2), getIReg64orZR(tt1))); ++ } ++ ++ if (fullTy == Ity_I64) { ++ // 64 bit (2x32 bit) path ++ // Fail if the data in memory doesn't match the data stashed by ++ // the LL. ++ IRTemp llsc_data_lo64 = newTemp(Ity_I64); ++ assign(llsc_data_lo64, ++ IRExpr_Get(OFFB_LLSC_DATA_LO64, Ity_I64)); ++ stmt( IRStmt_Exit( ++ binop(Iop_CmpNE64, loadLE(Ity_I64, mkexpr(ea)), ++ mkexpr(llsc_data_lo64)), ++ Ijk_Boring, nia, OFFB_PC ++ )); ++ // Try to CAS the new value in. 
++ IRTemp old = newTemp(Ity_I64); ++ IRTemp expd = newTemp(Ity_I64); ++ assign(expd, mkexpr(llsc_data_lo64)); ++ stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old, ++ Iend_LE, mkexpr(ea), ++ /*expdHi*/NULL, mkexpr(expd), ++ /*dataHi*/NULL, mkexpr(store_data) ++ ))); ++ // Fail if the CAS failed (viz, old != expd) ++ stmt( IRStmt_Exit( ++ binop(Iop_CmpNE64, mkexpr(old), mkexpr(expd)), ++ Ijk_Boring, nia, OFFB_PC ++ )); ++ } else { ++ // 128 bit (2x64 bit) path ++ // Fail if the data in memory doesn't match the data stashed by ++ // the LL. ++ IRTemp llsc_data_lo64 = newTemp(Ity_I64); ++ assign(llsc_data_lo64, ++ IRExpr_Get(OFFB_LLSC_DATA_LO64, Ity_I64)); ++ IRTemp llsc_data_hi64 = newTemp(Ity_I64); ++ assign(llsc_data_hi64, ++ IRExpr_Get(OFFB_LLSC_DATA_HI64, Ity_I64)); ++ IRTemp data_at_ea = newTemp(Ity_I128); ++ assign(data_at_ea, ++ unop(Iop_ReinterpV128asI128, ++ loadLE(Ity_V128, mkexpr(ea)))); ++ stmt( IRStmt_Exit( ++ binop(Iop_CmpNE64, ++ unop(Iop_128to64, mkexpr(data_at_ea)), ++ mkexpr(llsc_data_lo64)), ++ Ijk_Boring, nia, OFFB_PC ++ )); ++ stmt( IRStmt_Exit( ++ binop(Iop_CmpNE64, ++ unop(Iop_128HIto64, mkexpr(data_at_ea)), ++ mkexpr(llsc_data_hi64)), ++ Ijk_Boring, nia, OFFB_PC ++ )); ++ // Try to CAS the new value in. ++ IRTemp old_lo64 = newTemp(Ity_I64); ++ IRTemp old_hi64 = newTemp(Ity_I64); ++ IRTemp expd_lo64 = newTemp(Ity_I64); ++ IRTemp expd_hi64 = newTemp(Ity_I64); ++ IRTemp store_data_lo64 = newTemp(Ity_I64); ++ IRTemp store_data_hi64 = newTemp(Ity_I64); ++ assign(expd_lo64, mkexpr(llsc_data_lo64)); ++ assign(expd_hi64, mkexpr(llsc_data_hi64)); ++ assign(store_data_lo64, unop(Iop_128to64, mkexpr(store_data))); ++ assign(store_data_hi64, unop(Iop_128HIto64, mkexpr(store_data))); ++ stmt( IRStmt_CAS(mkIRCAS(old_hi64, old_lo64, ++ Iend_LE, mkexpr(ea), ++ mkexpr(expd_hi64), mkexpr(expd_lo64), ++ mkexpr(store_data_hi64), ++ mkexpr(store_data_lo64) ++ ))); ++ // Fail if the CAS failed (viz, old != expd) ++ stmt( IRStmt_Exit( ++ binop(Iop_CmpNE64, mkexpr(old_lo64), mkexpr(expd_lo64)), ++ Ijk_Boring, nia, OFFB_PC ++ )); ++ stmt( IRStmt_Exit( ++ binop(Iop_CmpNE64, mkexpr(old_hi64), mkexpr(expd_hi64)), ++ Ijk_Boring, nia, OFFB_PC ++ )); ++ } ++ // Otherwise we succeeded (!) ++ putIReg64orZR(ss, mkU64(0)); ++ } else { ++ // Non-fallback implementation of SC. ++ IRTemp res = newTemp(Ity_I1); ++ IRExpr* dataLO = narrowFrom64(elemTy, getIReg64orZR(tt1)); ++ IRExpr* dataHI = narrowFrom64(elemTy, getIReg64orZR(tt2)); ++ IROp opMerge = fullTy == Ity_I128 ? Iop_64HLto128 : Iop_32HLto64; ++ IRExpr* data = binop(opMerge, dataHI, dataLO); ++ // Assuming a little-endian guest here. Rt1 goes at the lower ++ // address, so it must live in the least significant half of `data`. ++ stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), data)); ++ /* IR semantics: res is 1 if store succeeds, 0 if it fails. ++ Need to set rS to 1 on failure, 0 on success. */ ++ putIReg64orZR(ss, binop(Iop_Xor64, unop(Iop_1Uto64, mkexpr(res)), ++ mkU64(1))); ++ } ++ DIP("st%sxp %s, %s, %s, [%s] %s\n", ++ isAcqOrRel ? (isLD ? "a" : "l") : "", ++ nameIRegOrZR(False, ss), ++ nameIRegOrZR(elemSzB == 8, tt1), ++ nameIRegOrZR(elemSzB == 8, tt2), ++ nameIReg64orSP(nn), ++ abiinfo->guest__use_fallback_LLSC ++ ? 
"(fallback implementation)" : ""); ++ return True; ++ } ++ /* else fall through */ ++ } ++ + /* ------------------ LDA{R,RH,RB} ------------------ */ + /* ------------------ STL{R,RH,RB} ------------------ */ + /* 31 29 23 20 14 9 4 +diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c +index 5657bcab9..b65e27db4 100644 +--- a/VEX/priv/host_arm64_defs.c ++++ b/VEX/priv/host_arm64_defs.c +@@ -1059,6 +1059,16 @@ ARM64Instr* ARM64Instr_StrEX ( Int szB ) { + vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1); + return i; + } ++ARM64Instr* ARM64Instr_LdrEXP ( void ) { ++ ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr)); ++ i->tag = ARM64in_LdrEXP; ++ return i; ++} ++ARM64Instr* ARM64Instr_StrEXP ( void ) { ++ ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr)); ++ i->tag = ARM64in_StrEXP; ++ return i; ++} + ARM64Instr* ARM64Instr_CAS ( Int szB ) { + ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr)); + i->tag = ARM64in_CAS; +@@ -1699,12 +1709,19 @@ void ppARM64Instr ( const ARM64Instr* i ) { + sz, i->ARM64in.StrEX.szB == 8 ? 'x' : 'w'); + return; + } ++ case ARM64in_LdrEXP: ++ vex_printf("ldxp x2, x3, [x4]"); ++ return; ++ case ARM64in_StrEXP: ++ vex_printf("stxp w0, x2, x3, [x4]"); ++ return; + case ARM64in_CAS: { + vex_printf("x1 = cas(%dbit)(x3, x5 -> x7)", 8 * i->ARM64in.CAS.szB); + return; + } + case ARM64in_CASP: { +- vex_printf("x0,x1 = casp(%dbit)(x2, x4,x5 -> x6,x7)", 8 * i->ARM64in.CASP.szB); ++ vex_printf("x0,x1 = casp(2x%dbit)(x2, x4,x5 -> x6,x7)", ++ 8 * i->ARM64in.CASP.szB); + return; + } + case ARM64in_MFence: +@@ -2253,6 +2270,17 @@ void getRegUsage_ARM64Instr ( HRegUsage* u, const ARM64Instr* i, Bool mode64 ) + addHRegUse(u, HRmWrite, hregARM64_X0()); + addHRegUse(u, HRmRead, hregARM64_X2()); + return; ++ case ARM64in_LdrEXP: ++ addHRegUse(u, HRmRead, hregARM64_X4()); ++ addHRegUse(u, HRmWrite, hregARM64_X2()); ++ addHRegUse(u, HRmWrite, hregARM64_X3()); ++ return; ++ case ARM64in_StrEXP: ++ addHRegUse(u, HRmRead, hregARM64_X4()); ++ addHRegUse(u, HRmWrite, hregARM64_X0()); ++ addHRegUse(u, HRmRead, hregARM64_X2()); ++ addHRegUse(u, HRmRead, hregARM64_X3()); ++ return; + case ARM64in_CAS: + addHRegUse(u, HRmRead, hregARM64_X3()); + addHRegUse(u, HRmRead, hregARM64_X5()); +@@ -2571,6 +2599,10 @@ void mapRegs_ARM64Instr ( HRegRemap* m, ARM64Instr* i, Bool mode64 ) + return; + case ARM64in_StrEX: + return; ++ case ARM64in_LdrEXP: ++ return; ++ case ARM64in_StrEXP: ++ return; + case ARM64in_CAS: + return; + case ARM64in_CASP: +@@ -4167,6 +4199,16 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, + } + goto bad; + } ++ case ARM64in_LdrEXP: { ++ // 820C7FC8 ldxp x2, x3, [x4] ++ *p++ = 0xC87F0C82; ++ goto done; ++ } ++ case ARM64in_StrEXP: { ++ // 820C20C8 stxp w0, x2, x3, [x4] ++ *p++ = 0xC8200C82; ++ goto done; ++ } + case ARM64in_CAS: { + /* This isn't simple. For an explanation see the comment in + host_arm64_defs.h on the definition of ARM64Instr case CAS. 
+diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h +index 01fb5708e..dc686dff7 100644 +--- a/VEX/priv/host_arm64_defs.h ++++ b/VEX/priv/host_arm64_defs.h +@@ -509,8 +509,10 @@ typedef + ARM64in_AddToSP, /* move SP by small, signed constant */ + ARM64in_FromSP, /* move SP to integer register */ + ARM64in_Mul, +- ARM64in_LdrEX, +- ARM64in_StrEX, ++ ARM64in_LdrEX, /* load exclusive, single register */ ++ ARM64in_StrEX, /* store exclusive, single register */ ++ ARM64in_LdrEXP, /* load exclusive, register pair, 2x64-bit only */ ++ ARM64in_StrEXP, /* store exclusive, register pair, 2x64-bit only */ + ARM64in_CAS, + ARM64in_CASP, + ARM64in_MFence, +@@ -719,6 +721,12 @@ typedef + struct { + Int szB; /* 1, 2, 4 or 8 */ + } StrEX; ++ /* LDXP x2, x3, [x4]. This is 2x64-bit only. */ ++ struct { ++ } LdrEXP; ++ /* STXP w0, x2, x3, [x4]. This is 2x64-bit only. */ ++ struct { ++ } StrEXP; + /* x1 = CAS(x3(addr), x5(expected) -> x7(new)), + and trashes x8 + where x1[8*szB-1 : 0] == x5[8*szB-1 : 0] indicates success, +@@ -1037,6 +1045,8 @@ extern ARM64Instr* ARM64Instr_Mul ( HReg dst, HReg argL, HReg argR, + ARM64MulOp op ); + extern ARM64Instr* ARM64Instr_LdrEX ( Int szB ); + extern ARM64Instr* ARM64Instr_StrEX ( Int szB ); ++extern ARM64Instr* ARM64Instr_LdrEXP ( void ); ++extern ARM64Instr* ARM64Instr_StrEXP ( void ); + extern ARM64Instr* ARM64Instr_CAS ( Int szB ); + extern ARM64Instr* ARM64Instr_CASP ( Int szB ); + extern ARM64Instr* ARM64Instr_MFence ( void ); +diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c +index 4b1d8c846..094e7e74b 100644 +--- a/VEX/priv/host_arm64_isel.c ++++ b/VEX/priv/host_arm64_isel.c +@@ -196,9 +196,9 @@ static HReg iselCondCode_R ( ISelEnv* env, IRExpr* e ); + static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ); + static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e ); + +-static void iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo, ++static void iselInt128Expr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, + ISelEnv* env, IRExpr* e ); +-static void iselInt128Expr ( /*OUT*/HReg* rHi, HReg* rLo, ++static void iselInt128Expr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, + ISelEnv* env, IRExpr* e ); + + static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e ); +@@ -1759,9 +1759,12 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) + + /* AND/OR/XOR(e1, e2) (for any e1, e2) */ + switch (e->Iex.Binop.op) { +- case Iop_And64: case Iop_And32: lop = ARM64lo_AND; goto log_binop; +- case Iop_Or64: case Iop_Or32: case Iop_Or16: lop = ARM64lo_OR; goto log_binop; +- case Iop_Xor64: case Iop_Xor32: lop = ARM64lo_XOR; goto log_binop; ++ case Iop_And64: case Iop_And32: ++ lop = ARM64lo_AND; goto log_binop; ++ case Iop_Or64: case Iop_Or32: case Iop_Or16: ++ lop = ARM64lo_OR; goto log_binop; ++ case Iop_Xor64: case Iop_Xor32: ++ lop = ARM64lo_XOR; goto log_binop; + log_binop: { + HReg dst = newVRegI(env); + HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1); +@@ -2013,6 +2016,11 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) + iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg); + return rHi; /* and abandon rLo */ + } ++ case Iop_128to64: { ++ HReg rHi, rLo; ++ iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg); ++ return rLo; /* and abandon rHi */ ++ } + case Iop_8Sto32: case Iop_8Sto64: { + IRExpr* arg = e->Iex.Unop.arg; + HReg src = iselIntExpr_R(env, arg); +@@ -2185,13 +2193,19 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) + } + return dst; + } ++ case Iop_64HIto32: { ++ HReg dst = newVRegI(env); ++ HReg src = 
iselIntExpr_R(env, e->Iex.Unop.arg); ++ addInstr(env, ARM64Instr_Shift(dst, src, ARM64RI6_I6(32), ++ ARM64sh_SHR)); ++ return dst; ++ } + case Iop_64to32: + case Iop_64to16: + case Iop_64to8: + case Iop_32to16: + /* These are no-ops. */ + return iselIntExpr_R(env, e->Iex.Unop.arg); +- + default: + break; + } +@@ -2335,6 +2349,43 @@ static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo, + vassert(e); + vassert(typeOfIRExpr(env->type_env,e) == Ity_I128); + ++ /* --------- TEMP --------- */ ++ if (e->tag == Iex_RdTmp) { ++ lookupIRTempPair(rHi, rLo, env, e->Iex.RdTmp.tmp); ++ return; ++ } ++ ++ /* --------- CONST --------- */ ++ if (e->tag == Iex_Const) { ++ IRConst* c = e->Iex.Const.con; ++ vassert(c->tag == Ico_U128); ++ if (c->Ico.U128 == 0) { ++ // The only case we need to handle (so far) ++ HReg zero = newVRegI(env); ++ addInstr(env, ARM64Instr_Imm64(zero, 0)); ++ *rHi = *rLo = zero; ++ return; ++ } ++ } ++ ++ /* --------- UNARY ops --------- */ ++ if (e->tag == Iex_Unop) { ++ switch (e->Iex.Unop.op) { ++ case Iop_ReinterpV128asI128: { ++ HReg dstHi = newVRegI(env); ++ HReg dstLo = newVRegI(env); ++ HReg src = iselV128Expr(env, e->Iex.Unop.arg); ++ addInstr(env, ARM64Instr_VXfromQ(dstHi, src, 1)); ++ addInstr(env, ARM64Instr_VXfromQ(dstLo, src, 0)); ++ *rHi = dstHi; ++ *rLo = dstLo; ++ return; ++ } ++ default: ++ break; ++ } ++ } ++ + /* --------- BINARY ops --------- */ + if (e->tag == Iex_Binop) { + switch (e->Iex.Binop.op) { +@@ -4086,6 +4137,14 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt ) + addInstr(env, ARM64Instr_VMov(8/*yes, really*/, dst, src)); + return; + } ++ if (ty == Ity_I128) { ++ HReg rHi, rLo, dstHi, dstLo; ++ iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data); ++ lookupIRTempPair( &dstHi, &dstLo, env, tmp); ++ addInstr(env, ARM64Instr_MovI(dstHi, rHi)); ++ addInstr(env, ARM64Instr_MovI(dstLo, rLo)); ++ return; ++ } + if (ty == Ity_V128) { + HReg src = iselV128Expr(env, stmt->Ist.WrTmp.data); + HReg dst = lookupIRTemp(env, tmp); +@@ -4183,42 +4242,67 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt ) + /* LL */ + IRTemp res = stmt->Ist.LLSC.result; + IRType ty = typeOfIRTemp(env->type_env, res); +- if (ty == Ity_I64 || ty == Ity_I32 ++ if (ty == Ity_I128 || ty == Ity_I64 || ty == Ity_I32 + || ty == Ity_I16 || ty == Ity_I8) { + Int szB = 0; +- HReg r_dst = lookupIRTemp(env, res); + HReg raddr = iselIntExpr_R(env, stmt->Ist.LLSC.addr); + switch (ty) { +- case Ity_I8: szB = 1; break; +- case Ity_I16: szB = 2; break; +- case Ity_I32: szB = 4; break; +- case Ity_I64: szB = 8; break; +- default: vassert(0); ++ case Ity_I8: szB = 1; break; ++ case Ity_I16: szB = 2; break; ++ case Ity_I32: szB = 4; break; ++ case Ity_I64: szB = 8; break; ++ case Ity_I128: szB = 16; break; ++ default: vassert(0); ++ } ++ if (szB == 16) { ++ HReg r_dstMSword = INVALID_HREG; ++ HReg r_dstLSword = INVALID_HREG; ++ lookupIRTempPair(&r_dstMSword, &r_dstLSword, env, res); ++ addInstr(env, ARM64Instr_MovI(hregARM64_X4(), raddr)); ++ addInstr(env, ARM64Instr_LdrEXP()); ++ addInstr(env, ARM64Instr_MovI(r_dstLSword, hregARM64_X2())); ++ addInstr(env, ARM64Instr_MovI(r_dstMSword, hregARM64_X3())); ++ } else { ++ vassert(szB != 0); ++ HReg r_dst = lookupIRTemp(env, res); ++ addInstr(env, ARM64Instr_MovI(hregARM64_X4(), raddr)); ++ addInstr(env, ARM64Instr_LdrEX(szB)); ++ addInstr(env, ARM64Instr_MovI(r_dst, hregARM64_X2())); + } +- addInstr(env, ARM64Instr_MovI(hregARM64_X4(), raddr)); +- addInstr(env, ARM64Instr_LdrEX(szB)); +- addInstr(env, ARM64Instr_MovI(r_dst, hregARM64_X2())); + return; 
+ } + goto stmt_fail; + } else { + /* SC */ + IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.LLSC.storedata); +- if (tyd == Ity_I64 || tyd == Ity_I32 ++ if (tyd == Ity_I128 || tyd == Ity_I64 || tyd == Ity_I32 + || tyd == Ity_I16 || tyd == Ity_I8) { + Int szB = 0; +- HReg rD = iselIntExpr_R(env, stmt->Ist.LLSC.storedata); + HReg rA = iselIntExpr_R(env, stmt->Ist.LLSC.addr); + switch (tyd) { +- case Ity_I8: szB = 1; break; +- case Ity_I16: szB = 2; break; +- case Ity_I32: szB = 4; break; +- case Ity_I64: szB = 8; break; +- default: vassert(0); ++ case Ity_I8: szB = 1; break; ++ case Ity_I16: szB = 2; break; ++ case Ity_I32: szB = 4; break; ++ case Ity_I64: szB = 8; break; ++ case Ity_I128: szB = 16; break; ++ default: vassert(0); ++ } ++ if (szB == 16) { ++ HReg rD_MSword = INVALID_HREG; ++ HReg rD_LSword = INVALID_HREG; ++ iselInt128Expr(&rD_MSword, ++ &rD_LSword, env, stmt->Ist.LLSC.storedata); ++ addInstr(env, ARM64Instr_MovI(hregARM64_X2(), rD_LSword)); ++ addInstr(env, ARM64Instr_MovI(hregARM64_X3(), rD_MSword)); ++ addInstr(env, ARM64Instr_MovI(hregARM64_X4(), rA)); ++ addInstr(env, ARM64Instr_StrEXP()); ++ } else { ++ vassert(szB != 0); ++ HReg rD = iselIntExpr_R(env, stmt->Ist.LLSC.storedata); ++ addInstr(env, ARM64Instr_MovI(hregARM64_X2(), rD)); ++ addInstr(env, ARM64Instr_MovI(hregARM64_X4(), rA)); ++ addInstr(env, ARM64Instr_StrEX(szB)); + } +- addInstr(env, ARM64Instr_MovI(hregARM64_X2(), rD)); +- addInstr(env, ARM64Instr_MovI(hregARM64_X4(), rA)); +- addInstr(env, ARM64Instr_StrEX(szB)); + } else { + goto stmt_fail; + } +@@ -4243,10 +4327,10 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt ) + + /* --------- ACAS --------- */ + case Ist_CAS: { +- if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) { ++ IRCAS* cas = stmt->Ist.CAS.details; ++ if (cas->oldHi == IRTemp_INVALID && cas->end == Iend_LE) { + /* "normal" singleton CAS */ + UChar sz; +- IRCAS* cas = stmt->Ist.CAS.details; + IRType ty = typeOfIRExpr(env->type_env, cas->dataLo); + switch (ty) { + case Ity_I64: sz = 8; break; +@@ -4281,10 +4365,9 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt ) + addInstr(env, ARM64Instr_MovI(rOld, rResult)); + return; + } +- else { ++ if (cas->oldHi != IRTemp_INVALID && cas->end == Iend_LE) { + /* Paired register CAS, i.e. 
CASP */ + UChar sz; +- IRCAS* cas = stmt->Ist.CAS.details; + IRType ty = typeOfIRExpr(env->type_env, cas->dataLo); + switch (ty) { + case Ity_I64: sz = 8; break; +diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c +index 25566c41c..2d82c41a1 100644 +--- a/VEX/priv/ir_defs.c ++++ b/VEX/priv/ir_defs.c +@@ -76,6 +76,7 @@ void ppIRConst ( const IRConst* con ) + case Ico_U16: vex_printf( "0x%x:I16", (UInt)(con->Ico.U16)); break; + case Ico_U32: vex_printf( "0x%x:I32", (UInt)(con->Ico.U32)); break; + case Ico_U64: vex_printf( "0x%llx:I64", (ULong)(con->Ico.U64)); break; ++ case Ico_U128: vex_printf( "I128{0x%04x}", (UInt)(con->Ico.U128)); break; + case Ico_F32: u.f32 = con->Ico.F32; + vex_printf( "F32{0x%x}", u.i32); + break; +@@ -2266,6 +2267,13 @@ IRConst* IRConst_U64 ( ULong u64 ) + c->Ico.U64 = u64; + return c; + } ++IRConst* IRConst_U128 ( UShort con ) ++{ ++ IRConst* c = LibVEX_Alloc_inline(sizeof(IRConst)); ++ c->tag = Ico_U128; ++ c->Ico.U128 = con; ++ return c; ++} + IRConst* IRConst_F32 ( Float f32 ) + { + IRConst* c = LibVEX_Alloc_inline(sizeof(IRConst)); +@@ -4230,6 +4238,7 @@ IRType typeOfIRConst ( const IRConst* con ) + case Ico_U16: return Ity_I16; + case Ico_U32: return Ity_I32; + case Ico_U64: return Ity_I64; ++ case Ico_U128: return Ity_I128; + case Ico_F32: return Ity_F32; + case Ico_F32i: return Ity_F32; + case Ico_F64: return Ity_F64; +@@ -5129,7 +5138,7 @@ void tcStmt ( const IRSB* bb, const IRStmt* stmt, IRType gWordTy ) + tyRes = typeOfIRTemp(tyenv, stmt->Ist.LLSC.result); + if (stmt->Ist.LLSC.storedata == NULL) { + /* it's a LL */ +- if (tyRes != Ity_I64 && tyRes != Ity_I32 ++ if (tyRes != Ity_I128 && tyRes != Ity_I64 && tyRes != Ity_I32 + && tyRes != Ity_I16 && tyRes != Ity_I8) + sanityCheckFail(bb,stmt,"Ist.LLSC(LL).result :: bogus"); + } else { +@@ -5137,7 +5146,7 @@ void tcStmt ( const IRSB* bb, const IRStmt* stmt, IRType gWordTy ) + if (tyRes != Ity_I1) + sanityCheckFail(bb,stmt,"Ist.LLSC(SC).result: not :: Ity_I1"); + tyData = typeOfIRExpr(tyenv, stmt->Ist.LLSC.storedata); +- if (tyData != Ity_I64 && tyData != Ity_I32 ++ if (tyData != Ity_I128 && tyData != Ity_I64 && tyData != Ity_I32 + && tyData != Ity_I16 && tyData != Ity_I8) + sanityCheckFail(bb,stmt, + "Ist.LLSC(SC).result :: storedata bogus"); +@@ -5385,6 +5394,7 @@ Int sizeofIRType ( IRType ty ) + IRType integerIRTypeOfSize ( Int szB ) + { + switch (szB) { ++ case 16: return Ity_I128; + case 8: return Ity_I64; + case 4: return Ity_I32; + case 2: return Ity_I16; +diff --git a/VEX/pub/libvex_guest_arm64.h b/VEX/pub/libvex_guest_arm64.h +index 39b6ecdc2..91d06bd75 100644 +--- a/VEX/pub/libvex_guest_arm64.h ++++ b/VEX/pub/libvex_guest_arm64.h +@@ -157,14 +157,18 @@ typedef + note of bits 23 and 22. */ + UInt guest_FPCR; + +- /* Fallback LL/SC support. See bugs 344524 and 369459. */ +- ULong guest_LLSC_SIZE; // 0==no current transaction, else 1,2,4 or 8. ++ /* Fallback LL/SC support. See bugs 344524 and 369459. _LO64 and _HI64 ++ contain the original contents of _ADDR+0 .. _ADDR+15, but only _SIZE ++ number of bytes of it. The remaining 16-_SIZE bytes of them must be ++ zero. */ ++ ULong guest_LLSC_SIZE; // 0==no current transaction, else 1,2,4,8 or 16. + ULong guest_LLSC_ADDR; // Address of transaction. +- ULong guest_LLSC_DATA; // Original value at _ADDR, zero-extended. ++ ULong guest_LLSC_DATA_LO64; // Original value at _ADDR+0. ++ ULong guest_LLSC_DATA_HI64; // Original value at _ADDR+8. 
+ + /* Padding to make it have an 16-aligned size */ + /* UInt pad_end_0; */ +- ULong pad_end_1; ++ /* ULong pad_end_1; */ + } + VexGuestARM64State; + +diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h +index deaa044c1..85805bb69 100644 +--- a/VEX/pub/libvex_ir.h ++++ b/VEX/pub/libvex_ir.h +@@ -269,6 +269,8 @@ typedef + Ico_U16, + Ico_U32, + Ico_U64, ++ Ico_U128, /* 128-bit restricted integer constant, ++ same encoding scheme as V128 */ + Ico_F32, /* 32-bit IEEE754 floating */ + Ico_F32i, /* 32-bit unsigned int to be interpreted literally + as a IEEE754 single value. */ +@@ -295,6 +297,7 @@ typedef + UShort U16; + UInt U32; + ULong U64; ++ UShort U128; + Float F32; + UInt F32i; + Double F64; +@@ -311,6 +314,7 @@ extern IRConst* IRConst_U8 ( UChar ); + extern IRConst* IRConst_U16 ( UShort ); + extern IRConst* IRConst_U32 ( UInt ); + extern IRConst* IRConst_U64 ( ULong ); ++extern IRConst* IRConst_U128 ( UShort ); + extern IRConst* IRConst_F32 ( Float ); + extern IRConst* IRConst_F32i ( UInt ); + extern IRConst* IRConst_F64 ( Double ); +diff --git a/memcheck/mc_machine.c b/memcheck/mc_machine.c +index 919c7fae8..176c8e5cb 100644 +--- a/memcheck/mc_machine.c ++++ b/memcheck/mc_machine.c +@@ -1115,9 +1115,10 @@ static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB ) + if (o == GOF(CMSTART) && sz == 8) return -1; // untracked + if (o == GOF(CMLEN) && sz == 8) return -1; // untracked + +- if (o == GOF(LLSC_SIZE) && sz == 8) return -1; // untracked +- if (o == GOF(LLSC_ADDR) && sz == 8) return o; +- if (o == GOF(LLSC_DATA) && sz == 8) return o; ++ if (o == GOF(LLSC_SIZE) && sz == 8) return -1; // untracked ++ if (o == GOF(LLSC_ADDR) && sz == 8) return o; ++ if (o == GOF(LLSC_DATA_LO64) && sz == 8) return o; ++ if (o == GOF(LLSC_DATA_HI64) && sz == 8) return o; + + VG_(printf)("MC_(get_otrack_shadow_offset)(arm64)(off=%d,sz=%d)\n", + offset,szB); +diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c +index c6fd2653f..72ccb3c8c 100644 +--- a/memcheck/mc_translate.c ++++ b/memcheck/mc_translate.c +@@ -5497,8 +5497,11 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce, + the address (shadow) to 'defined' following the test. */ + complainIfUndefined( mce, addr, guard ); + +- /* Now cook up a call to the relevant helper function, to read the +- data V bits from shadow memory. */ ++ /* Now cook up a call to the relevant helper function, to read the data V ++ bits from shadow memory. Note that I128 loads are done by pretending ++ we're doing a V128 load, and then converting the resulting V128 vbits ++ word to an I128, right at the end of this function -- see `castedToI128` ++ below. (It's only a minor hack :-) This pertains to bug 444399. */ + ty = shadowTypeV(ty); + + void* helper = NULL; +@@ -5511,6 +5514,7 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce, + hname = "MC_(helperc_LOADV256le)"; + ret_via_outparam = True; + break; ++ case Ity_I128: // fallthrough. See comment above. + case Ity_V128: helper = &MC_(helperc_LOADV128le); + hname = "MC_(helperc_LOADV128le)"; + ret_via_outparam = True; +@@ -5576,7 +5580,7 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce, + + /* We need to have a place to park the V bits we're just about to + read. */ +- IRTemp datavbits = newTemp(mce, ty, VSh); ++ IRTemp datavbits = newTemp(mce, ty == Ity_I128 ? Ity_V128 : ty, VSh); + + /* Here's the call. 
*/ + IRDirty* di; +@@ -5603,7 +5607,14 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce, + } + stmt( 'V', mce, IRStmt_Dirty(di) ); + +- return mkexpr(datavbits); ++ if (ty == Ity_I128) { ++ IRAtom* castedToI128 ++ = assignNew('V', mce, Ity_I128, ++ unop(Iop_ReinterpV128asI128, mkexpr(datavbits))); ++ return castedToI128; ++ } else { ++ return mkexpr(datavbits); ++ } + } + + +@@ -5631,6 +5642,7 @@ IRAtom* expr2vbits_Load ( MCEnv* mce, + case Ity_I16: + case Ity_I32: + case Ity_I64: ++ case Ity_I128: + case Ity_V128: + case Ity_V256: + return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard); +@@ -5928,6 +5940,7 @@ void do_shadow_Store ( MCEnv* mce, + c = IRConst_V256(V_BITS32_DEFINED); break; + case Ity_V128: // V128 weirdness -- used twice + c = IRConst_V128(V_BITS16_DEFINED); break; ++ case Ity_I128: c = IRConst_U128(V_BITS16_DEFINED); break; + case Ity_I64: c = IRConst_U64 (V_BITS64_DEFINED); break; + case Ity_I32: c = IRConst_U32 (V_BITS32_DEFINED); break; + case Ity_I16: c = IRConst_U16 (V_BITS16_DEFINED); break; +@@ -5948,6 +5961,7 @@ void do_shadow_Store ( MCEnv* mce, + switch (ty) { + case Ity_V256: /* we'll use the helper four times */ + case Ity_V128: /* we'll use the helper twice */ ++ case Ity_I128: /* we'll use the helper twice */ + case Ity_I64: helper = &MC_(helperc_STOREV64le); + hname = "MC_(helperc_STOREV64le)"; + break; +@@ -6051,9 +6065,9 @@ void do_shadow_Store ( MCEnv* mce, + stmt( 'V', mce, IRStmt_Dirty(diQ3) ); + + } +- else if (UNLIKELY(ty == Ity_V128)) { ++ else if (UNLIKELY(ty == Ity_V128 || ty == Ity_I128)) { + +- /* V128-bit case */ ++ /* V128/I128-bit case */ + /* See comment in next clause re 64-bit regparms */ + /* also, need to be careful about endianness */ + +@@ -6062,6 +6076,7 @@ void do_shadow_Store ( MCEnv* mce, + IRAtom *addrLo64, *addrHi64; + IRAtom *vdataLo64, *vdataHi64; + IRAtom *eBiasLo64, *eBiasHi64; ++ IROp opGetLO64, opGetHI64; + + if (end == Iend_LE) { + offLo64 = 0; +@@ -6071,9 +6086,17 @@ void do_shadow_Store ( MCEnv* mce, + offHi64 = 0; + } + ++ if (ty == Ity_V128) { ++ opGetLO64 = Iop_V128to64; ++ opGetHI64 = Iop_V128HIto64; ++ } else { ++ opGetLO64 = Iop_128to64; ++ opGetHI64 = Iop_128HIto64; ++ } ++ + eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64); + addrLo64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) ); +- vdataLo64 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vdata)); ++ vdataLo64 = assignNew('V', mce, Ity_I64, unop(opGetLO64, vdata)); + diLo64 = unsafeIRDirty_0_N( + 1/*regparms*/, + hname, VG_(fnptr_to_fnentry)( helper ), +@@ -6081,7 +6104,7 @@ void do_shadow_Store ( MCEnv* mce, + ); + eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64); + addrHi64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) ); +- vdataHi64 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vdata)); ++ vdataHi64 = assignNew('V', mce, Ity_I64, unop(opGetHI64, vdata)); + diHi64 = unsafeIRDirty_0_N( + 1/*regparms*/, + hname, VG_(fnptr_to_fnentry)( helper ), +@@ -6888,7 +6911,7 @@ static void do_shadow_LLSC ( MCEnv* mce, + /* Just treat this as a normal load, followed by an assignment of + the value to .result. 
*/ + /* Stay sane */ +- tl_assert(resTy == Ity_I64 || resTy == Ity_I32 ++ tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32 + || resTy == Ity_I16 || resTy == Ity_I8); + assign( 'V', mce, resTmp, + expr2vbits_Load( +@@ -6899,7 +6922,7 @@ static void do_shadow_LLSC ( MCEnv* mce, + /* Stay sane */ + IRType dataTy = typeOfIRExpr(mce->sb->tyenv, + stStoredata); +- tl_assert(dataTy == Ity_I64 || dataTy == Ity_I32 ++ tl_assert(dataTy == Ity_I128 || dataTy == Ity_I64 || dataTy == Ity_I32 + || dataTy == Ity_I16 || dataTy == Ity_I8); + do_shadow_Store( mce, stEnd, + stAddr, 0/* addr bias */, +@@ -7684,7 +7707,7 @@ static void schemeS ( MCEnv* mce, IRStmt* st ) + = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result); + IRExpr* vanillaLoad + = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr); +- tl_assert(resTy == Ity_I64 || resTy == Ity_I32 ++ tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32 + || resTy == Ity_I16 || resTy == Ity_I8); + assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result), + schemeE(mce, vanillaLoad)); +diff --git a/memcheck/tests/Makefile.am b/memcheck/tests/Makefile.am +index 449710020..2b43ef7d7 100644 +--- a/memcheck/tests/Makefile.am ++++ b/memcheck/tests/Makefile.am +@@ -90,6 +90,7 @@ EXTRA_DIST = \ + addressable.stderr.exp addressable.stdout.exp addressable.vgtest \ + atomic_incs.stderr.exp atomic_incs.vgtest \ + atomic_incs.stdout.exp-32bit atomic_incs.stdout.exp-64bit \ ++ atomic_incs.stdout.exp-64bit-and-128bit \ + badaddrvalue.stderr.exp \ + badaddrvalue.stdout.exp badaddrvalue.vgtest \ + exit_on_first_error.stderr.exp \ +diff --git a/memcheck/tests/atomic_incs.c b/memcheck/tests/atomic_incs.c +index f931750f4..1c738c530 100644 +--- a/memcheck/tests/atomic_incs.c ++++ b/memcheck/tests/atomic_incs.c +@@ -22,6 +22,17 @@ + #define NNN 3456987 + + #define IS_8_ALIGNED(_ptr) (0 == (((unsigned long)(_ptr)) & 7)) ++#define IS_16_ALIGNED(_ptr) (0 == (((unsigned long)(_ptr)) & 15)) ++ ++// U128 from libvex_basictypes.h is a 4-x-UInt array, which is a bit ++// inconvenient, hence: ++typedef ++ struct { ++ // assuming little-endianness ++ unsigned long long int lo64; ++ unsigned long long int hi64; ++ } ++ MyU128; + + + __attribute__((noinline)) void atomic_add_8bit ( char* p, int n ) +@@ -712,6 +723,40 @@ __attribute__((noinline)) void atomic_add_64bit ( long long int* p, int n ) + #endif + } + ++__attribute__((noinline)) void atomic_add_128bit ( MyU128* p, ++ unsigned long long int n ) ++{ ++#if defined(VGA_x86) || defined(VGA_ppc32) || defined(VGA_mips32) \ ++ || defined (VGA_nanomips) || defined(VGA_mips64) \ ++ || defined(VGA_amd64) \ ++ || defined(VGA_ppc64be) || defined(VGA_ppc64le) \ ++ || defined(VGA_arm) \ ++ || defined(VGA_s390x) ++ /* do nothing; is not supported */ ++#elif defined(VGA_arm64) ++ unsigned long long int block[3] ++ = { (unsigned long long int)p, (unsigned long long int)n, ++ 0xFFFFFFFFFFFFFFFFULL}; ++ do { ++ __asm__ __volatile__( ++ "mov x5, %0" "\n\t" // &block[0] ++ "ldr x9, [x5, #0]" "\n\t" // p ++ "ldr x10, [x5, #8]" "\n\t" // n ++ "ldxp x7, x8, [x9]" "\n\t" ++ "adds x7, x7, x10" "\n\t" ++ "adc x8, x8, xzr" "\n\t" ++ "stxp w4, x7, x8, [x9]" "\n\t" ++ "str x4, [x5, #16]" "\n\t" ++ : /*out*/ ++ : /*in*/ "r"(&block[0]) ++ : /*trash*/ "memory", "cc", "x5", "x7", "x8", "x9", "x10", "x4" ++ ); ++ } while (block[2] != 0); ++#else ++# error "Unsupported arch" ++#endif ++} ++ + int main ( int argc, char** argv ) + { + int i, status; +@@ -720,8 +765,12 @@ int main ( int argc, char** argv ) + short* p16; + int* p32; 
+ long long int* p64; ++ MyU128* p128; + pid_t child, p2; + ++ assert(sizeof(MyU128) == 16); ++ assert(sysconf(_SC_PAGESIZE) >= 4096); ++ + printf("parent, pre-fork\n"); + + page = mmap( 0, sysconf(_SC_PAGESIZE), +@@ -736,11 +785,13 @@ int main ( int argc, char** argv ) + p16 = (short*)(page+256); + p32 = (int*)(page+512); + p64 = (long long int*)(page+768); ++ p128 = (MyU128*)(page+1024); + + assert( IS_8_ALIGNED(p8) ); + assert( IS_8_ALIGNED(p16) ); + assert( IS_8_ALIGNED(p32) ); + assert( IS_8_ALIGNED(p64) ); ++ assert( IS_16_ALIGNED(p128) ); + + memset(page, 0, 1024); + +@@ -748,6 +799,7 @@ int main ( int argc, char** argv ) + *p16 = 0; + *p32 = 0; + *p64 = 0; ++ p128->lo64 = p128->hi64 = 0; + + child = fork(); + if (child == -1) { +@@ -763,6 +815,7 @@ int main ( int argc, char** argv ) + atomic_add_16bit(p16, 1); + atomic_add_32bit(p32, 1); + atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */ ++ atomic_add_128bit(p128, 0x1000000013374771ULL); // ditto re upper 64 + } + return 1; + /* NOTREACHED */ +@@ -778,6 +831,7 @@ int main ( int argc, char** argv ) + atomic_add_16bit(p16, 1); + atomic_add_32bit(p32, 1); + atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */ ++ atomic_add_128bit(p128, 0x1000000013374771ULL); // ditto re upper 64 + } + + p2 = waitpid(child, &status, 0); +@@ -788,11 +842,17 @@ int main ( int argc, char** argv ) + + printf("FINAL VALUES: 8 bit %d, 16 bit %d, 32 bit %d, 64 bit %lld\n", + (int)(*(signed char*)p8), (int)(*p16), *p32, *p64 ); ++ printf(" 128 bit 0x%016llx:0x%016llx\n", ++ p128->hi64, p128->lo64); + + if (-74 == (int)(*(signed char*)p8) + && 32694 == (int)(*p16) + && 6913974 == *p32 +- && (0LL == *p64 || 682858642110LL == *p64)) { ++ && (0LL == *p64 || 682858642110LL == *p64) ++ && ((0 == p128->hi64 && 0 == p128->lo64) ++ || (0x00000000000697fb == p128->hi64 ++ && 0x6007eb426316d956ULL == p128->lo64)) ++ ) { + printf("PASS\n"); + } else { + printf("FAIL -- see source code for expected values\n"); +diff --git a/memcheck/tests/atomic_incs.stdout.exp-32bit b/memcheck/tests/atomic_incs.stdout.exp-32bit +index c5b8781e5..55e5044b5 100644 +--- a/memcheck/tests/atomic_incs.stdout.exp-32bit ++++ b/memcheck/tests/atomic_incs.stdout.exp-32bit +@@ -3,5 +3,6 @@ child + parent, pre-fork + parent + FINAL VALUES: 8 bit -74, 16 bit 32694, 32 bit 6913974, 64 bit 0 ++ 128 bit 0x0000000000000000:0x0000000000000000 + PASS + parent exits +diff --git a/memcheck/tests/atomic_incs.stdout.exp-64bit b/memcheck/tests/atomic_incs.stdout.exp-64bit +index 82405c520..ca2f4fc97 100644 +--- a/memcheck/tests/atomic_incs.stdout.exp-64bit ++++ b/memcheck/tests/atomic_incs.stdout.exp-64bit +@@ -3,5 +3,6 @@ child + parent, pre-fork + parent + FINAL VALUES: 8 bit -74, 16 bit 32694, 32 bit 6913974, 64 bit 682858642110 ++ 128 bit 0x0000000000000000:0x0000000000000000 + PASS + parent exits +diff --git a/memcheck/tests/atomic_incs.stdout.exp-64bit-and-128bit b/memcheck/tests/atomic_incs.stdout.exp-64bit-and-128bit +new file mode 100644 +index 000000000..ef6580917 +--- /dev/null ++++ b/memcheck/tests/atomic_incs.stdout.exp-64bit-and-128bit +@@ -0,0 +1,8 @@ ++parent, pre-fork ++child ++parent, pre-fork ++parent ++FINAL VALUES: 8 bit -74, 16 bit 32694, 32 bit 6913974, 64 bit 682858642110 ++ 128 bit 0x00000000000697fb:0x6007eb426316d956 ++PASS ++parent exits +diff --git a/none/tests/arm64/Makefile.am b/none/tests/arm64/Makefile.am +index 00cbfa52c..9efb49b27 100644 +--- a/none/tests/arm64/Makefile.am ++++ b/none/tests/arm64/Makefile.am +@@ -12,7 +12,10 @@ EXTRA_DIST = 
\
+ 	atomics_v81.stdout.exp atomics_v81.stderr.exp atomics_v81.vgtest \
+ 	simd_v81.stdout.exp simd_v81.stderr.exp simd_v81.vgtest \
+ 	fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest \
+-	fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp fp_and_simd_v82.vgtest
++	fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp \
++	fp_and_simd_v82.vgtest \
++	ldxp_stxp.stdout.exp ldxp_stxp.stderr.exp \
++	ldxp_stxp_basisimpl.vgtest ldxp_stxp_fallbackimpl.vgtest
+ 
+ check_PROGRAMS = \
+ 	allexec \
+@@ -20,7 +23,8 @@ check_PROGRAMS = \
+ 	fp_and_simd \
+ 	integer \
+ 	memory \
+-	fmadd_sub
++	fmadd_sub \
++	ldxp_stxp
+ 
+ if BUILD_ARMV8_CRC_TESTS
+ check_PROGRAMS += crc32
+diff --git a/none/tests/arm64/ldxp_stxp.c b/none/tests/arm64/ldxp_stxp.c
+new file mode 100644
+index 000000000..b5f6ea121
+--- /dev/null
++++ b/none/tests/arm64/ldxp_stxp.c
+@@ -0,0 +1,93 @@
++
++/* Note, this is only a basic smoke test of LD{A}XP and ST{L}XP.  Their
++   atomicity properties are tested by memcheck/tests/atomic_incs.c. */
++
++#include <assert.h>
++#include <malloc.h>
++#include <stdio.h>
++#include <stdlib.h>
++
++typedef  unsigned int            UInt;
++typedef  unsigned long long int  ULong;
++
++
++void initBlock ( ULong* block )
++{
++   block[0] = 0x0001020304050607ULL;
++   block[1] = 0x1011121314151617ULL;
++   block[2] = 0x2021222324252627ULL;
++   block[3] = 0x3031323334353637ULL;
++   block[4] = 0x4041424344454647ULL;
++   block[5] = 0x5051525354555657ULL;
++}
++
++void printBlock ( const char* who,
++                  ULong* block, ULong rt1contents, ULong rt2contents,
++                  UInt zeroIfSuccess )
++{
++   printf("Block %s (%s)\n", who, zeroIfSuccess == 0 ? "success" : "FAILURE" );
++   for (int i = 0; i < 6; i++) {
++      printf("0x%016llx\n", block[i]);
++   }
++   printf("0x%016llx  rt1contents\n", rt1contents);
++   printf("0x%016llx  rt2contents\n", rt2contents);
++   printf("\n");
++}
++
++int main ( void )
++{
++   ULong* block = memalign(16, 6 * sizeof(ULong));
++   assert(block);
++
++   ULong rt1in, rt2in, rt1out, rt2out;
++   UInt scRes;
++
++   // Do ldxp then stxp with x-registers
++   initBlock(block);
++   rt1in  = 0x5555666677778888ULL;
++   rt2in  = 0xAAAA9999BBBB0000ULL;
++   rt1out = 0x1111222233334444ULL;
++   rt2out = 0xFFFFEEEEDDDDCCCCULL;
++   scRes  = 0x55555555;
++   __asm__ __volatile__(
++      "ldxp %1, %2, [%5]"       "\n\t"
++      "stxp %w0, %3, %4, [%5]"  "\n\t"
++      : /*OUT*/
++        "=&r"(scRes),   // %0
++        "=&r"(rt1out),  // %1
++        "=&r"(rt2out)   // %2
++      : /*IN*/
++        "r"(rt1in),     // %3
++        "r"(rt2in),     // %4
++        "r"(&block[2])  // %5
++      : /*TRASH*/
++        "memory","cc"
++   );
++   printBlock("after ldxp/stxp 2x64-bit", block, rt1out, rt2out, scRes);
++
++   // Do ldxp then stxp with w-registers
++   initBlock(block);
++   rt1in  = 0x5555666677778888ULL;
++   rt2in  = 0xAAAA9999BBBB0000ULL;
++   rt1out = 0x1111222233334444ULL;
++   rt2out = 0xFFFFEEEEDDDDCCCCULL;
++   scRes  = 0x55555555;
++   __asm__ __volatile__(
++      "ldxp %w1, %w2, [%5]"        "\n\t"
++      "stxp %w0, %w3, %w4, [%5]"   "\n\t"
++      : /*OUT*/
++        "=&r"(scRes),   // %0
++        "=&r"(rt1out),  // %1
++        "=&r"(rt2out)   // %2
++      : /*IN*/
++        "r"(rt1in),     // %3
++        "r"(rt2in),     // %4
++        "r"(&block[2])  // %5
++      : /*TRASH*/
++        "memory","cc"
++   );
++   printBlock("after ldxp/stxp 2x32-bit", block, rt1out, rt2out, scRes);
++
++   free(block);
++   return 0;
++}
+diff --git a/none/tests/arm64/ldxp_stxp_basisimpl.stderr.exp b/none/tests/arm64/ldxp_stxp_basisimpl.stderr.exp
+new file mode 100644
+index 000000000..e69de29bb
+diff --git a/none/tests/arm64/ldxp_stxp_basisimpl.stdout.exp b/none/tests/arm64/ldxp_stxp_basisimpl.stdout.exp
+new file mode 100644
+index 000000000..f269ecdcc
+--- /dev/null
++++ 
b/none/tests/arm64/ldxp_stxp_basisimpl.stdout.exp +@@ -0,0 +1,20 @@ ++Block after ldxp/stxp 2x64-bit (success) ++0x0001020304050607 ++0x1011121314151617 ++0x5555666677778888 ++0xaaaa9999bbbb0000 ++0x4041424344454647 ++0x5051525354555657 ++0x2021222324252627 rt1contents ++0x3031323334353637 rt2contents ++ ++Block after ldxp/stxp 2x32-bit (success) ++0x0001020304050607 ++0x1011121314151617 ++0xbbbb000077778888 ++0x3031323334353637 ++0x4041424344454647 ++0x5051525354555657 ++0x0000000024252627 rt1contents ++0x0000000020212223 rt2contents ++ +diff --git a/none/tests/arm64/ldxp_stxp_basisimpl.vgtest b/none/tests/arm64/ldxp_stxp_basisimpl.vgtest +new file mode 100644 +index 000000000..29133729a +--- /dev/null ++++ b/none/tests/arm64/ldxp_stxp_basisimpl.vgtest +@@ -0,0 +1,2 @@ ++prog: ldxp_stxp ++vgopts: -q +diff --git a/none/tests/arm64/ldxp_stxp_fallbackimpl.stderr.exp b/none/tests/arm64/ldxp_stxp_fallbackimpl.stderr.exp +new file mode 100644 +index 000000000..e69de29bb +diff --git a/none/tests/arm64/ldxp_stxp_fallbackimpl.stdout.exp b/none/tests/arm64/ldxp_stxp_fallbackimpl.stdout.exp +new file mode 100644 +index 000000000..f269ecdcc +--- /dev/null ++++ b/none/tests/arm64/ldxp_stxp_fallbackimpl.stdout.exp +@@ -0,0 +1,20 @@ ++Block after ldxp/stxp 2x64-bit (success) ++0x0001020304050607 ++0x1011121314151617 ++0x5555666677778888 ++0xaaaa9999bbbb0000 ++0x4041424344454647 ++0x5051525354555657 ++0x2021222324252627 rt1contents ++0x3031323334353637 rt2contents ++ ++Block after ldxp/stxp 2x32-bit (success) ++0x0001020304050607 ++0x1011121314151617 ++0xbbbb000077778888 ++0x3031323334353637 ++0x4041424344454647 ++0x5051525354555657 ++0x0000000024252627 rt1contents ++0x0000000020212223 rt2contents ++ +diff --git a/none/tests/arm64/ldxp_stxp_fallbackimpl.vgtest b/none/tests/arm64/ldxp_stxp_fallbackimpl.vgtest +new file mode 100644 +index 000000000..474282a03 +--- /dev/null ++++ b/none/tests/arm64/ldxp_stxp_fallbackimpl.vgtest +@@ -0,0 +1,2 @@ ++prog: ldxp_stxp ++vgopts: -q --sim-hints=fallback-llsc + +commit 0d38ca5dd6b446c70738031132d41f09de0f7a8a +Author: Julian Seward +Date: Fri Nov 12 13:08:45 2021 +0100 + + Bug 444399 - disInstr(arm64): unhandled instruction 0xC87F2D89 (LD{,A}XP and ST{,L}XP). FOLLOWUP FIX. + + This is an attempt to un-break 'make dist', as broken by the main commit for + this bug, which was 530df882b8f60ecacaf2b9b8a719f7ea1c1d1650. 
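For illustration only -- this note and sketch are not part of the upstream
patches. The ldxp/stxp loops above rely on load-exclusive/store-exclusive
semantics: stxp reports success (writes 0 to its status register) only if no
other agent touched the reservation between the paired ldxp and stxp, and the
test retries until that happens. The same retry pattern, written portably
with GCC's __atomic builtins on a 64-bit value for brevity (the 128-bit case
in atomic_incs.c works the same way):

    #include <stdio.h>

    /* Portable analogue of the test's ldxp/stxp retry loop. */
    static void atomic_add_u64(unsigned long long *p, unsigned long long n)
    {
       unsigned long long old = __atomic_load_n(p, __ATOMIC_RELAXED);
       /* On failure the builtin reloads 'old' with the current value,
          just as the test loops while stxp reports failure. */
       while (!__atomic_compare_exchange_n(p, &old, old + n,
                                           0 /*strong*/,
                                           __ATOMIC_SEQ_CST,
                                           __ATOMIC_RELAXED))
          ;
    }

    int main(void)
    {
       unsigned long long x = 1;
       atomic_add_u64(&x, 41);
       printf("%llu\n", x);   /* prints 42 */
       return 0;
    }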
+
+diff --git a/none/tests/arm64/Makefile.am b/none/tests/arm64/Makefile.am
+index 9efb49b27..4a06f0996 100644
+--- a/none/tests/arm64/Makefile.am
++++ b/none/tests/arm64/Makefile.am
+@@ -14,8 +14,10 @@ EXTRA_DIST = \
+ 	fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest \
+ 	fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp \
+ 	fp_and_simd_v82.vgtest \
+-	ldxp_stxp.stdout.exp ldxp_stxp.stderr.exp \
+-	ldxp_stxp_basisimpl.vgtest ldxp_stxp_fallbackimpl.vgtest
++	ldxp_stxp_basisimpl.stdout.exp ldxp_stxp_basisimpl.stderr.exp \
++	ldxp_stxp_basisimpl.vgtest \
++	ldxp_stxp_fallbackimpl.stdout.exp ldxp_stxp_fallbackimpl.stderr.exp \
++	ldxp_stxp_fallbackimpl.vgtest
+ 
+ check_PROGRAMS = \
+ 	allexec \
diff --git a/SOURCES/valgrind-3.18.1-condvar.patch b/SOURCES/valgrind-3.18.1-condvar.patch
new file mode 100644
index 0000000..e129326
--- /dev/null
+++ b/SOURCES/valgrind-3.18.1-condvar.patch
@@ -0,0 +1,284 @@
+commit 9abfed23c0d430aafb85de6397d171316c982792
+Author: Paul Floyd
+Date:   Fri Nov 19 08:34:53 2021 +0100
+
+    Bug 445504 Using C++ condition_variable results in bogus "mutex is locked simultaneously by two threads" warning
+
+    Add intercepts for pthread_cond_clockwait to DRD and Helgrind
+    Also a testcase from bugzilla done by Bart, with a configure check
+
+diff --git a/configure.ac b/configure.ac
+index e7381f205..cb836dbff 100755
+--- a/configure.ac
++++ b/configure.ac
+@@ -1989,6 +1989,27 @@ AC_LANG(C)
+ 
+ AM_CONDITIONAL(CXX_CAN_INCLUDE_THREAD_HEADER, test x$ac_cxx_can_include_thread_header = xyes)
+ 
++# Check whether compiler can process #include <condition_variable> without errors
++
++AC_MSG_CHECKING([that C++ compiler can include <condition_variable> header file])
++AC_LANG(C++)
++safe_CXXFLAGS=$CXXFLAGS
++CXXFLAGS=-std=c++0x
++
++AC_COMPILE_IFELSE([AC_LANG_SOURCE([
++#include <condition_variable>
++])],
++[
++ac_cxx_can_include_condition_variable_header=yes
++AC_MSG_RESULT([yes])
++], [
++ac_cxx_can_include_condition_variable_header=no
++AC_MSG_RESULT([no])
++])
++CXXFLAGS=$safe_CXXFLAGS
++AC_LANG(C)
++
++AM_CONDITIONAL(CXX_CAN_INCLUDE_CONDITION_VARIABLE_HEADER, test x$ac_cxx_can_include_condition_variable_header = xyes)
+ 
+ # On aarch64 before glibc 2.20 we would get the kernel user_pt_regs instead
+ # of the user_regs_struct from sys/user.h.  They are structurally the same
+diff --git a/drd/drd_pthread_intercepts.c b/drd/drd_pthread_intercepts.c
+index 8b4454364..95127b42c 100644
+--- a/drd/drd_pthread_intercepts.c
++++ b/drd/drd_pthread_intercepts.c
+@@ -1175,6 +1175,30 @@ PTH_FUNCS(int, condZureltimedwait, pthread_cond_timedwait_intercept,
+           (cond, mutex, timeout));
+ #endif /* VGO_solaris */
+ 
++
++static __always_inline
++int pthread_cond_clockwait_intercept(pthread_cond_t *cond,
++                                     pthread_mutex_t *mutex,
++                                     clockid_t clockid,
++                                     const struct timespec* abstime)
++{
++   int ret;
++   OrigFn fn;
++   VALGRIND_GET_ORIG_FN(fn);
++   VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__PRE_COND_WAIT,
++                                   cond, mutex, DRD_(mutex_type)(mutex), 0, 0);
++   CALL_FN_W_WWWW(ret, fn, cond, mutex, clockid, abstime);
++   VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__POST_COND_WAIT,
++                                   cond, mutex, 1, 0, 0);
++   return ret;
++}
++
++PTH_FUNCS(int, pthreadZucondZuclockwait, pthread_cond_clockwait_intercept,
++          (pthread_cond_t *cond, pthread_mutex_t *mutex,
++           clockid_t clockid, const struct timespec* abstime),
++          (cond, mutex, clockid, abstime));
++
++
+ // NOTE: be careful to intercept only pthread_cond_signal() and not Darwin's
+ // pthread_cond_signal_thread_np(). The former accepts one argument; the latter
+ // two. 
Intercepting all pthread_cond_signal* functions will cause only one
+diff --git a/drd/tests/Makefile.am b/drd/tests/Makefile.am
+index 4cb2f7f84..c804391e8 100755
+--- a/drd/tests/Makefile.am
++++ b/drd/tests/Makefile.am
+@@ -105,6 +105,8 @@ EXTRA_DIST = \
+ 	circular_buffer.vgtest \
+ 	concurrent_close.stderr.exp \
+ 	concurrent_close.vgtest \
++	condvar.stderr.exp \
++	condvar.vgtest \
+ 	custom_alloc.stderr.exp \
+ 	custom_alloc.vgtest \
+ 	custom_alloc_fiw.stderr.exp \
+@@ -458,6 +460,11 @@ check_PROGRAMS += \
+ endif
+ endif
+ 
++if CXX_CAN_INCLUDE_CONDITION_VARIABLE_HEADER
++check_PROGRAMS += \
++	condvar
++endif
++
+ if HAVE_OPENMP
+ check_PROGRAMS += omp_matinv omp_prime omp_printf
+ endif
+@@ -502,6 +509,8 @@ LDADD = -lpthread
+ 
+ 
+ bug322621_SOURCES         = bug322621.cpp
++condvar_SOURCES           = condvar.cpp
++condvar_CXXFLAGS          = $(AM_CXXFLAGS) -std=c++0x
+ concurrent_close_SOURCES  = concurrent_close.cpp
+ if !VGCONF_OS_IS_FREEBSD
+ dlopen_main_LDADD         = -ldl
+diff --git a/drd/tests/condvar.cpp b/drd/tests/condvar.cpp
+new file mode 100644
+index 000000000..18ecb3f8a
+--- /dev/null
++++ b/drd/tests/condvar.cpp
+@@ -0,0 +1,55 @@
++/* See also https://bugs.kde.org/show_bug.cgi?id=445504 */
++
++#include <chrono>
++#include <condition_variable>
++#include <future>
++#include <iostream>
++#include <mutex>
++#include <vector>
++
++using lock_guard = std::lock_guard<std::mutex>;
++using unique_lock = std::unique_lock<std::mutex>;
++
++struct state {
++  std::mutex m;
++  std::vector<int> v;
++  std::condition_variable cv;
++
++  state() {
++    // Call pthread_cond_init() explicitly to let DRD know about 'cv'.
++    pthread_cond_init(cv.native_handle(), NULL);
++  }
++};
++
++void other_thread(state *sp) {
++  state &s = *sp;
++  std::cerr << "Other thread: waiting for notify\n";
++  unique_lock l{s.m};
++  while (true) {
++    if (s.cv.wait_for(l, std::chrono::seconds(3)) !=
++        std::cv_status::timeout) {
++      std::cerr << "Other thread: notified\n";
++      break;
++    }
++  }
++  return;
++}
++
++
++int main() {
++  state s;
++  auto future = std::async(std::launch::async, other_thread, &s);
++
++  if (future.wait_for(std::chrono::seconds(1)) != std::future_status::timeout) {
++    std::cerr << "Main: other thread returned too early!\n";
++    return 2;
++  }
++
++  {
++    std::lock_guard<std::mutex> g{s.m};
++    s.v.push_back(1);
++    s.v.push_back(2);
++    s.cv.notify_all();
++  }
++  return 0;
++}
+diff --git a/drd/tests/condvar.stderr.exp b/drd/tests/condvar.stderr.exp
+new file mode 100644
+index 000000000..be1de9f97
+--- /dev/null
++++ b/drd/tests/condvar.stderr.exp
+@@ -0,0 +1,5 @@
++
++Other thread: waiting for notify
++Other thread: notified
++
++ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)
+diff --git a/drd/tests/condvar.vgtest b/drd/tests/condvar.vgtest
+new file mode 100644
+index 000000000..2e7d49f5a
+--- /dev/null
++++ b/drd/tests/condvar.vgtest
+@@ -0,0 +1,3 @@
++prereq: ./supported_libpthread && [ -e condvar ]
++vgopts: --check-stack-var=yes --read-var-info=yes
++prog: condvar
+diff --git a/helgrind/hg_intercepts.c b/helgrind/hg_intercepts.c
+index 866efdbaa..49c3ddcd9 100644
+--- a/helgrind/hg_intercepts.c
++++ b/helgrind/hg_intercepts.c
+@@ -1409,6 +1409,88 @@ static int pthread_cond_timedwait_WRK(pthread_cond_t* cond,
+ #  error "Unsupported OS"
+ #endif
+ 
++//-----------------------------------------------------------
++// glibc:  pthread_cond_clockwait
++//
++__attribute__((noinline))
++static int pthread_cond_clockwait_WRK(pthread_cond_t* cond,
++                                      pthread_mutex_t* mutex,
++                                      clockid_t clockid,
++                                      struct timespec* abstime,
++                                      int timeout_error)
++{
++   int ret;
++   OrigFn fn;
++   unsigned long mutex_is_valid;
++   Bool 
abstime_is_valid; ++ VALGRIND_GET_ORIG_FN(fn); ++ ++ if (TRACE_PTH_FNS) { ++ fprintf(stderr, "<< pthread_cond_clockwait %p %p %p", ++ cond, mutex, abstime); ++ fflush(stderr); ++ } ++ ++ /* Tell the tool a cond-wait is about to happen, so it can check ++ for bogus argument values. In return it tells us whether it ++ thinks the mutex is valid or not. */ ++ DO_CREQ_W_WW(mutex_is_valid, ++ _VG_USERREQ__HG_PTHREAD_COND_WAIT_PRE, ++ pthread_cond_t*,cond, pthread_mutex_t*,mutex); ++ assert(mutex_is_valid == 1 || mutex_is_valid == 0); ++ ++ abstime_is_valid = abstime->tv_nsec >= 0 && abstime->tv_nsec < 1000000000; ++ ++ /* Tell the tool we're about to drop the mutex. This reflects the ++ fact that in a cond_wait, we show up holding the mutex, and the ++ call atomically drops the mutex and waits for the cv to be ++ signalled. */ ++ if (mutex_is_valid && abstime_is_valid) { ++ DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_MUTEX_UNLOCK_PRE, ++ pthread_mutex_t*,mutex); ++ } ++ ++ CALL_FN_W_WWWW(ret, fn, cond,mutex,clockid,abstime); ++ ++ if (mutex_is_valid && !abstime_is_valid && ret != EINVAL) { ++ DO_PthAPIerror("Bug in libpthread: pthread_cond_clockwait " ++ "invalid abstime did not cause" ++ " EINVAL", ret); ++ } ++ ++ if (mutex_is_valid && abstime_is_valid) { ++ /* and now we have the mutex again if (ret == 0 || ret == timeout) */ ++ DO_CREQ_v_WW(_VG_USERREQ__HG_PTHREAD_MUTEX_LOCK_POST, ++ pthread_mutex_t *, mutex, ++ long, (ret == 0 || ret == timeout_error) ? True : False); ++ } ++ ++ DO_CREQ_v_WWWW(_VG_USERREQ__HG_PTHREAD_COND_WAIT_POST, ++ pthread_cond_t*,cond, pthread_mutex_t*,mutex, ++ long,ret == timeout_error, ++ long, (ret == 0 || ret == timeout_error) && mutex_is_valid ++ ? True : False); ++ ++ if (ret != 0 && ret != timeout_error) { ++ DO_PthAPIerror( "pthread_cond_clockwait", ret ); ++ } ++ ++ if (TRACE_PTH_FNS) { ++ fprintf(stderr, " cotimedwait -> %d >>\n", ret); ++ } ++ ++ return ret; ++} ++ ++#if defined(VGO_linux) ++ PTH_FUNC(int, pthreadZucondZuclockwait, // pthread_cond_clockwait ++ pthread_cond_t* cond, pthread_mutex_t* mutex, ++ clockid_t clockid, ++ struct timespec* abstime) { ++ return pthread_cond_clockwait_WRK(cond, mutex, clockid, abstime, ETIMEDOUT); ++ } ++#endif ++ + + //----------------------------------------------------------- + // glibc: pthread_cond_signal@GLIBC_2.0 diff --git a/SOURCES/valgrind-3.18.1-demangle-namespace.patch b/SOURCES/valgrind-3.18.1-demangle-namespace.patch new file mode 100644 index 0000000..25ddf92 --- /dev/null +++ b/SOURCES/valgrind-3.18.1-demangle-namespace.patch @@ -0,0 +1,35 @@ +commit 542447d4708d4418a08e678dcf467af92b90b7ad +Author: Mark Wielaard +Date: Mon Nov 22 13:07:59 2021 +0100 + + readdwarf3.c (parse_inl_DIE) inlined_subroutine can appear in namespaces + + This was broken by commit 75e3ef0f3 "readdwarf3: Skip units without + addresses when looking for inlined functions". Specifically by this + part: "Also use skip_DIE instead of read_DIE when not parsing + (skipping) children" + + rustc puts concrete function instances in namespaces (which is + allowed in DWARF since there is no strict separation between type + declarations and program scope entries in a DIE tree), the inline + parser didn't expect this and so skipped any DIE under a namespace + entry. This wasn't an issue before because "skipping" a DIE tree was + done by reading it, so it wasn't actually skipped. But now that we + really skip the DIE (sub)tree (which is faster than actually parsing + it) some entries were missed in the rustc case. 
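For illustration only -- not upstream code. The decision parse_inl_DIE makes
for each DIE reduces to the predicate sketched below; the one-line fix adds
the DW_TAG_namespace case, so that a rustc-style tree of
namespace -> subprogram -> inlined_subroutine is descended into rather than
skipped (the DW_TAG_* values are the ones assigned by the DWARF standard):

    #include <stdbool.h>
    #include <stdio.h>

    enum { DW_TAG_lexical_block      = 0x0b,
           DW_TAG_inlined_subroutine = 0x1d,
           DW_TAG_subprogram         = 0x2e,
           DW_TAG_namespace          = 0x39 };

    /* Mirrors the patched check: parse a DIE's children if the unit has
       addresses, or if this tag may enclose an inlined subroutine. */
    static bool parse_children(bool unit_has_addrs, int dtag)
    {
       return unit_has_addrs
              || dtag == DW_TAG_lexical_block
              || dtag == DW_TAG_subprogram
              || dtag == DW_TAG_inlined_subroutine
              || dtag == DW_TAG_namespace;   /* the fix */
    }

    int main(void)
    {
       printf("namespace DIE: %s\n",
              parse_children(false, DW_TAG_namespace) ? "descend" : "skip");
       return 0;
    }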
+ + https://bugs.kde.org/show_bug.cgi?id=445668 + +diff --git a/coregrind/m_debuginfo/readdwarf3.c b/coregrind/m_debuginfo/readdwarf3.c +index 18eecea9f..5489f8d13 100644 +--- a/coregrind/m_debuginfo/readdwarf3.c ++++ b/coregrind/m_debuginfo/readdwarf3.c +@@ -3358,7 +3358,7 @@ static Bool parse_inl_DIE ( + // might maybe contain a DW_TAG_inlined_subroutine: + Bool ret = (unit_has_addrs + || dtag == DW_TAG_lexical_block || dtag == DW_TAG_subprogram +- || dtag == DW_TAG_inlined_subroutine); ++ || dtag == DW_TAG_inlined_subroutine || dtag == DW_TAG_namespace); + return ret; + + bad_DIE: diff --git a/SOURCES/valgrind-3.18.1-dhat-tests-copy.patch b/SOURCES/valgrind-3.18.1-dhat-tests-copy.patch new file mode 100644 index 0000000..8e183b9 --- /dev/null +++ b/SOURCES/valgrind-3.18.1-dhat-tests-copy.patch @@ -0,0 +1,20 @@ +commit 33aba8eef68b1745d3de96b609ff8296b70d9a1c +Author: Paul Floyd +Date: Wed Oct 27 21:37:00 2021 +0200 + + Bug 444495 - dhat/tests/copy fails on s390x + + Add -fno-builtin to ensure that the copy functions get called and so dhat + can intercept and count them. + +diff --git a/dhat/tests/Makefile.am b/dhat/tests/Makefile.am +index 86a9b6d64..b86fc416d 100644 +--- a/dhat/tests/Makefile.am ++++ b/dhat/tests/Makefile.am +@@ -29,3 +29,6 @@ AM_CXXFLAGS += $(AM_FLAG_M3264_PRI) + # We don't care about uninitialized or unused malloc results + basic_CFLAGS = $(AM_CFLAGS) -Wno-uninitialized + big_CFLAGS = $(AM_CFLAGS) -Wno-unused-result ++ ++# Prevent the copying functions from being inlined ++copy_CFLAGS = $(AM_CFLAGS) -fno-builtin diff --git a/SOURCES/valgrind-3.18.1-gdbserver_tests-hwcap.patch b/SOURCES/valgrind-3.18.1-gdbserver_tests-hwcap.patch new file mode 100644 index 0000000..2d952cd --- /dev/null +++ b/SOURCES/valgrind-3.18.1-gdbserver_tests-hwcap.patch @@ -0,0 +1,25 @@ +commit 64ab89162906d5b9e2de6c3afe476fec861ef7ec +Author: Mark Wielaard +Date: Tue Nov 2 14:27:45 2021 +0100 + + gdbserver_tests: Filter out glibc hwcaps libc.so + + On some systems the gdbserver_tests would fail because the filter + for the optimized hwcaps subdir didn't match because the file is + called slightly differently, with the version number before .so + instead of after. For example: /lib64/glibc-hwcaps/power9/libc-2.28.so + + Add one extra filter for this pattern. + +diff --git a/gdbserver_tests/filter_gdb.in b/gdbserver_tests/filter_gdb.in +index d0c94f3f1..b753e0168 100755 +--- a/gdbserver_tests/filter_gdb.in ++++ b/gdbserver_tests/filter_gdb.in +@@ -134,6 +134,7 @@ s/in \(.__\)\{0,1\}select () from \/.*$/in syscall .../ + /^ from \/lib\/libc.so.*$/d + /^ from \/lib64\/libc.so.*$/d + /^ from \/lib64\/.*\/libc.so.*$/d ++/^ from \/lib64\/.*\/libc-.*.so/d + + # and yet another (gdb 7.0 way) to get a system call + s/in select ()$/in syscall .../ diff --git a/SOURCES/valgrind-3.18.1-ppc-pstq-tests.patch b/SOURCES/valgrind-3.18.1-ppc-pstq-tests.patch new file mode 100644 index 0000000..58498f2 --- /dev/null +++ b/SOURCES/valgrind-3.18.1-ppc-pstq-tests.patch @@ -0,0 +1,1876 @@ +commit 3950c5d661ee09526cddcf24daf5fc22bc83f70c +Author: Carl Love +Date: Mon Nov 1 11:18:32 2021 -0500 + + Valgrind Add powerpc R=1 tests + + Contributed by Will Schmidt + + This includes updates and adjustments as suggested by Carl. + + Add tests that exercise PCRelative instructions. + These instructions are encoded with R==1, which indicate that + the memory accessed by the instruction is at a location + relative to the currently executing instruction. 
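(Worked example, not part of the upstream commit message: with R==1 the
effective address of a prefixed load or store is EA = CIA + D, where CIA is
the address of the instruction itself and D is the 34-bit signed
displacement. With .text placed at 0x40000 and .bss at 0x20000, as the
Makefile changes below arrange, a store encoding D = -0x1f400 + 8 that
executes at a hypothetical CIA of 0x40100 writes to
EA = 0x40100 - 0x1f400 + 8 = 0x20d08, i.e. into the write-target buffer in
the bss section. The paddi tests likewise produce CIA-relative addresses,
which is why the harness subtracts TEXT_BSS_DELTA before range-checking the
result.)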
+
+    These tests are built using -Wl,-Ttext and -Wl,-Tbss
+    options to ensure the location of the target array is at a
+    location with a specific offset from the currently
+    executing instruction.
+
+    The write instructions are aimed at a large buffer in
+    the bss section, which is checked for updates at the
+    completion of each test.
+
+    In order to ensure consistent output across assorted
+    systems, the tests have been padded with ori, nop instructions
+    and align directives.
+
+    Detailed changes:
+    * Makefile.am: Add test_isa_3_1_R1_RT and test_isa_3_1_R1_XT tests.
+    * isa_3_1_helpers.h: Add identify_instruction_by_func_name() helper function
+      to indicate if the test is for R==1.
+      Add helpers to initialize and print changes to the pcrelative_write_target
+      array.
+      Add #define to help pad code with a series of eyecatcher ORI instructions.
+    * test_isa_3_1_R1_RT.c: New test.
+    * test_isa_3_1_R1_XT.c: New test.
+    * test_isa_3_1_R1_RT.stdout.exp: New expected output.
+    * test_isa_3_1_R1_XT.stdout.exp: New expected output.
+    * test_isa_3_1_R1_RT.stderr.exp: New expected output.
+    * test_isa_3_1_R1_XT.stderr.exp: New expected output.
+
+    * test_isa_3_1_R1_RT.vgtest: New test handler.
+    * test_isa_3_1_R1_XT.vgtest: New test handler.
+
+    * test_isa_3_1_common.c: Add indicators (updates_byte, updates_halfword,
+      updates_word) to control the output from the R==1 tests.
+      Add helper check for "_R1" to indicate if instruction is coded with R==1.
+      Add init and print helpers for the pcrelative_write_target array.
+
+diff --git a/none/tests/ppc64/Makefile.am b/none/tests/ppc64/Makefile.am
+index b709f3ef4..f8eab9fc0 100644
+--- a/none/tests/ppc64/Makefile.am
++++ b/none/tests/ppc64/Makefile.am
+@@ -61,6 +61,8 @@ EXTRA_DIST = \
+ 	test_isa_3_1_VRT.vgtest test_isa_3_1_VRT.stderr.exp test_isa_3_1_VRT.stdout.exp \
+ 	test_isa_3_1_Misc.vgtest test_isa_3_1_Misc.stderr.exp test_isa_3_1_Misc.stdout.exp \
+ 	test_isa_3_1_AT.vgtest test_isa_3_1_AT.stderr.exp test_isa_3_1_AT.stdout.exp \
++	test_isa_3_1_R1_RT.vgtest test_isa_3_1_R1_RT.stderr.exp test_isa_3_1_R1_RT.stdout.exp \
++	test_isa_3_1_R1_XT.vgtest test_isa_3_1_R1_XT.stderr.exp test_isa_3_1_R1_XT.stdout.exp \
+ 	subnormal_test.stderr.exp subnormal_test.stdout.exp \
+ 	subnormal_test.vgtest test_darn_inst.stderr.exp \
+ 	test_darn_inst.stdout.exp test_darn_inst.vgtest \
+@@ -68,8 +70,8 @@ EXTRA_DIST = \
+ 	test_copy_paste.stderr.exp test_copy_paste.stdout.exp \
+ 	test_copy_paste.vgtest \
+ 	test_mcrxrx.vgtest test_mcrxrx.stderr.exp test_mcrxrx.stdout.exp \
+-	test_lxvx_stxvx.vgtest test_lxvx_stxvx.stderr.exp test_lxvx_stxvx.stdout.exp-p8 test_lxvx_stxvx.stdout.exp-p9
+-
++	test_lxvx_stxvx.vgtest test_lxvx_stxvx.stderr.exp \
++	test_lxvx_stxvx.stdout.exp-p8 test_lxvx_stxvx.stdout.exp-p9
+ 
+ check_PROGRAMS = \
+ 	allexec \
+@@ -80,11 +82,12 @@ check_PROGRAMS = \
+ 	test_isa_3_0 test_mod_instructions \
+ 	test_isa_3_1_RT test_isa_3_1_XT test_isa_3_1_VRT \
+ 	test_isa_3_1_Misc test_isa_3_1_AT \
++	test_isa_3_1_R1_RT test_isa_3_1_R1_XT \
+ 	subnormal_test test_darn_inst test_copy_paste \
+ 	test_tm test_touch_tm data-cache-instructions \
+ 	std_reg_imm \
+ 	twi_tdi tw_td power6_bcmp scv_test \
+-	test_mcrxrx test_lxvx_stxvx
++	test_mcrxrx test_lxvx_stxvx
+ 
+ # lmw, stmw, lswi, lswx, stswi, stswx compile (and run) only on big endian.
+ 
+ if VGCONF_PLATFORMS_INCLUDE_PPC64BE_LINUX +@@ -106,6 +109,8 @@ test_isa_3_1_RT_SOURCES = test_isa_3_1_RT.c test_isa_3_1_common.c + test_isa_3_1_VRT_SOURCES = test_isa_3_1_VRT.c test_isa_3_1_common.c + test_isa_3_1_Misc_SOURCES = test_isa_3_1_Misc.c test_isa_3_1_common.c + test_isa_3_1_AT_SOURCES = test_isa_3_1_AT.c test_isa_3_1_common.c ++test_isa_3_1_R1_XT_SOURCES = test_isa_3_1_R1_XT.c test_isa_3_1_common.c ++test_isa_3_1_R1_RT_SOURCES = test_isa_3_1_R1_RT.c test_isa_3_1_common.c + test_darn_inst_SOURCES = test_darn_inst.c + + if HAS_ALTIVEC +@@ -224,6 +229,11 @@ test_isa_3_1_VRT_CFLAGS = $(test_isa_3_1_CFLAGS) + test_isa_3_1_Misc_CFLAGS = $(test_isa_3_1_CFLAGS) + test_isa_3_1_AT_CFLAGS = $(test_isa_3_1_CFLAGS) + ++# The _R1_foo tests exercise pc-relative instructions, so require the bss and text sections ++# exist at known offsets with respect to each other. ++test_isa_3_1_R1_RT_CFLAGS = $(test_isa_3_1_CFLAGS) -Wl,-Tbss,0x20000 -Wl,-Ttext,0x40000 ++test_isa_3_1_R1_XT_CFLAGS = $(test_isa_3_1_CFLAGS) -Wl,-Tbss,0x20000 -Wl,-Ttext,0x40000 ++ + subnormal_test_CFLAGS = $(AM_CFLAGS) -Winline -Wall -O -g -mregnames $(VSX_FLAG) $(ISA_2_06_FLAG) \ + @FLAG_M64@ $(ALTIVEC_FLAG) $(BUILD_FLAG_VSX) $(BUILD_FLAGS_ISA_2_06) + +diff --git a/none/tests/ppc64/isa_3_1_helpers.h b/none/tests/ppc64/isa_3_1_helpers.h +index 338f55526..716a6277b 100644 +--- a/none/tests/ppc64/isa_3_1_helpers.h ++++ b/none/tests/ppc64/isa_3_1_helpers.h +@@ -43,6 +43,9 @@ extern void debug_show_current_iteration(); + extern void debug_dump_buffer(); + + extern void identify_form_components(const char *, const char *); ++extern void identify_instruction_by_func_name(const char *); ++extern void init_pcrelative_write_target(); ++extern void print_pcrelative_write_target(); + extern void dump_vsxargs(); + extern void generic_prologue(); + extern void build_args_table(); +@@ -58,6 +61,21 @@ extern void initialize_source_registers(); + extern void set_up_iterators(); + extern void initialize_buffer(int); + ++/* This (TEXT_BSS_DELTA) is the relative distance between those ++ sections as set by the linker options for the R==1 tests. */ ++#define TEXT_BSS_DELTA 0x20000 ++#define RELOC_BUFFER_SIZE 0x1000 ++extern unsigned long long pcrelative_buff_addr(int); ++#define PAD_ORI \ ++ __asm__ __volatile__ ("ori 21,21,21"); \ ++ __asm__ __volatile__ ("ori 22,22,22");\ ++ __asm__ __volatile__ ("ori 23,23,23");\ ++ __asm__ __volatile__ ("ori 24,24,24");\ ++ __asm__ __volatile__ ("ori 25,25,25");\ ++ __asm__ __volatile__ ("ori 26,26,26");\ ++ __asm__ __volatile__ ("ori 27,27,27");\ ++ __asm__ __volatile__ ("ori 28,28,28"); ++ + extern int verbose; + #define debug_printf(X) if (verbose>0) printf(X); + #define debug_show_labels (verbose>0) +diff --git a/none/tests/ppc64/test_isa_3_1_R1_RT.c b/none/tests/ppc64/test_isa_3_1_R1_RT.c +new file mode 100644 +index 000000000..d73b84b10 +--- /dev/null ++++ b/none/tests/ppc64/test_isa_3_1_R1_RT.c +@@ -0,0 +1,624 @@ ++/* ++ * Valgrind testcase for PowerPC ISA 3.1 ++ * ++ * Copyright (C) 2019-2020 Will Schmidt ++ * ++ * 64bit build: ++ * gcc -Winline -Wall -g -O -mregnames -maltivec -m64 ++ */ ++ ++/* ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License as ++ * published by the Free Software Foundation; either version 2 of the ++ * License, or (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include ++#ifdef HAS_ISA_3_1 ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++/* Condition Register fields. ++ These are used to capture the condition register values immediately after ++ the instruction under test is executed. This is done to help prevent other ++ test overhead (switch statements, result compares, etc) from disturbing ++ the test case results. */ ++unsigned long current_cr; ++unsigned long current_fpscr; ++ ++struct test_list_t current_test; ++ ++#include "isa_3_1_helpers.h" ++ ++static void test_plxvp_off0_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plxvp 20, +0(0),1" ); ++ PAD_ORI ++} ++static void test_plxvp_off8_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plxvp 20, +8(0),1" ); ++ PAD_ORI ++} ++static void test_plxvp_off16_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plxvp 20, +16(0),1" ); ++ PAD_ORI ++} ++static void test_plxvp_off24_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plxvp 20, +24(0),1" ); ++ PAD_ORI ++} ++static void test_plxvp_off32_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plxvp 20, +32(0),1" ); ++ PAD_ORI ++} ++static void test_plbz_off0_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plbz %0, +0(0), 1" : "=r" (rt) ); ++ PAD_ORI ++} ++static void test_plbz_off8_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plbz %0, +8(0), 1" : "=r" (rt) ); ++ PAD_ORI ++} ++static void test_plbz_off16_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plbz %0, +16(0), 1" : "=r" (rt) ); ++ PAD_ORI ++} ++static void test_plbz_off32_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plbz %0, +32(0), 1" : "=r" (rt) ); ++ PAD_ORI ++} ++static void test_plbz_off64_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plbz %0, +64(0), 1" : "=r" (rt) ); ++ PAD_ORI ++ PAD_ORI ++} ++static void test_plhz_off0_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plhz %0, +0(0), 1" : "=r" (rt) ); ++ PAD_ORI ++} ++static void test_plhz_off8_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plhz %0, +8(0), 1" : "=r" (rt) ); ++ PAD_ORI ++} ++static void test_plhz_off16_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plhz %0, +16(0), 1" : "=r" (rt) ); ++ PAD_ORI ++} ++static void test_plhz_off32_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plhz %0, +32(0), 1" : "=r" (rt) ); ++ PAD_ORI ++} ++static void test_plhz_off64_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plhz %0, +64(0), 1" : "=r" (rt) ); ++ PAD_ORI ++ PAD_ORI ++} ++static void test_plha_off0_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plha %0, +0(0), 1" : "=r" (rt) ); ++ PAD_ORI ++} ++static void test_plha_off8_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plha %0, +8(0), 1" : "=r" (rt) ); ++ PAD_ORI ++} ++static void test_plha_off16_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plha %0, +16(0), 1" : "=r" (rt) ); ++ PAD_ORI ++} ++static void test_plha_off32_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plha %0, +32(0), 1" : "=r" (rt) ); ++ PAD_ORI ++} ++static void test_plha_off64_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plha %0, 
+64(0), 1" : "=r" (rt) ); ++ PAD_ORI ++ PAD_ORI ++} ++static void test_plwz_off0_R1 (void) { ++ __asm__ __volatile__ ("plwz %0, +0(0), 1" : "=r" (rt) ); ++} ++static void test_plwz_off8_R1 (void) { ++ __asm__ __volatile__ ("plwz %0, +8(0), 1" : "=r" (rt) ); ++} ++static void test_plwz_off16_R1 (void) { ++ __asm__ __volatile__ ("plwz %0, +16(0), 1" : "=r" (rt) ); ++} ++static void test_plwz_off32_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plwz %0, +32(0), 1" : "=r" (rt) ); ++ PAD_ORI ++} ++static void test_plwz_off64_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plwz %0, +64(0), 1" : "=r" (rt) ); ++ PAD_ORI ++ PAD_ORI ++} ++static void test_plwa_off0_R1 (void) { ++ __asm__ __volatile__ ("plwa %0, +0(0), 1" : "=r" (rt) ); ++} ++static void test_plwa_off8_R1 (void) { ++ __asm__ __volatile__ ("plwa %0, +8(0), 1" : "=r" (rt) ); ++} ++static void test_plwa_off16_R1 (void) { ++ __asm__ __volatile__ ("plwa %0, +16(0), 1" : "=r" (rt) ); ++} ++static void test_plwa_off32_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plwa %0, +32(0), 1" : "=r" (rt) ); ++ PAD_ORI ++} ++static void test_plwa_off64_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plwa %0, +64(0), 1" : "=r" (rt) ); ++ PAD_ORI ++ PAD_ORI ++} ++static void test_pld_off0_R1 (void) { ++ __asm__ __volatile__ ("pld %0, +0(0), 1" : "=r" (rt) ); ++} ++static void test_pld_off8_R1 (void) { ++ __asm__ __volatile__ ("pld %0, +8(0), 1" : "=r" (rt) ); ++} ++static void test_pld_off16_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("pld %0, +16(0), 1" : "=r" (rt) ); ++ PAD_ORI ++} ++static void test_pld_off32_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("pld %0, +32(0), 1" : "=r" (rt) ); ++ PAD_ORI ++} ++static void test_pld_off64_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("pld %0, +64(0), 1" : "=r" (rt) ); ++ PAD_ORI ++ PAD_ORI ++} ++static void test_pstb_off0_R1 (void) { ++ __asm__ __volatile__ ("pstb %0, -0x1f400+0(0), 1" :: "r" (rs) ); ++} ++static void test_pstb_off8_R1 (void) { ++ __asm__ __volatile__ ("pstb %0, -0x1f400+8(0), 1" :: "r" (rs) ); ++} ++static void test_pstb_off16_R1 (void) { ++ __asm__ __volatile__ ("pstb %0, -0x1f400+16(0), 1" :: "r" (rs) ); ++} ++static void test_pstb_off32_R1 (void) { ++ __asm__ __volatile__ ("pstb %0, -0x1f400+32(0), 1" :: "r" (rs) ); ++} ++static void test_psth_off0_R1 (void) { ++ __asm__ __volatile__ ("psth %0, -0x1f400+0(0), 1" :: "r" (rs) ); ++} ++static void test_psth_off8_R1 (void) { ++ __asm__ __volatile__ ("psth %0, -0x1f400+8(0), 1" :: "r" (rs) ); ++} ++static void test_psth_off16_R1 (void) { ++ __asm__ __volatile__ ("psth %0, -0x1f400+16(0), 1" :: "r" (rs) ); ++} ++static void test_psth_off32_R1 (void) { ++ __asm__ __volatile__ ("psth %0, -0x1f400+32(0), 1" :: "r" (rs) ); ++} ++static void test_pstw_off0_R1 (void) { ++ __asm__ __volatile__ ("pstw %0, -0x1f400+0(0), 1" :: "r" (rs) ); ++} ++static void test_pstw_off8_R1 (void) { ++ __asm__ __volatile__ ("pstw %0, -0x1f400+8(0), 1" :: "r" (rs) ); ++} ++static void test_pstw_off16_R1 (void) { ++ __asm__ __volatile__ ("pstw %0, -0x1f400+16(0), 1" :: "r" (rs) ); ++} ++static void test_pstw_off32_R1 (void) { ++ __asm__ __volatile__ ("pstw %0, -0x1f400+32(0), 1" :: "r" (rs) ); ++} ++static void test_pstd_off0_R1 (void) { ++ __asm__ __volatile__ ("pstd %0, -0x1f400+0(0), 1" :: "r" (rs) ); ++} ++static void test_pstd_off8_R1 (void) { ++ __asm__ __volatile__ ("pstd %0, -0x1f400+8(0), 1" :: "r" (rs) ); ++} ++static void test_pstd_off16_R1 (void) { ++ __asm__ __volatile__ ("pstd %0, -0x1f400+16(0), 1" :: "r" (rs) ); ++} ++static void 
test_pstd_off32_R1 (void) { ++ __asm__ __volatile__ ("pstd %0, -0x1f400+32(0), 1" :: "r" (rs) ); ++} ++ /* For the paddi tests; although we can get close to a read/write target ++ due to forcing where the .text and .bss sections are placed, there is ++ still enough codegen variability that having a raw value in the exp ++ file will not be determinative for these instructions. ++ Thus, compromise and just ensure that the generated value is an ++ address that lands within the reloc buffer, and use quasi magic ++ eyecatcher values in the return to indicate success. */ ++static void test_paddi_0_R1 (void) { ++ __asm__ __volatile__ ("paddi %0, 0, 0+0, 1" : "=r" (rt) ); ++ rt = rt - TEXT_BSS_DELTA; ++ if (rt > pcrelative_buff_addr(0) && ++ rt < pcrelative_buff_addr(RELOC_BUFFER_SIZE)) ++ rt = 0xffff0000; ++} ++static void test_paddi_12_R1 (void) { ++ __asm__ __volatile__ ("paddi %0, 0, 0+12, 1" : "=r" (rt) ); ++ rt = rt - TEXT_BSS_DELTA; ++ if (rt > pcrelative_buff_addr(0) && ++ rt < pcrelative_buff_addr(RELOC_BUFFER_SIZE)) ++ rt = 0xffff0012; ++} ++static void test_paddi_48_R1 (void) { ++ __asm__ __volatile__ ("paddi %0, 0, 0+48, 1" : "=r" (rt) ); ++ rt = rt - TEXT_BSS_DELTA; ++ if (rt > pcrelative_buff_addr(0) && ++ rt < pcrelative_buff_addr(RELOC_BUFFER_SIZE)) ++ rt = 0xffff0048; ++} ++static void test_paddi_98_R1 (void) { ++ __asm__ __volatile__ ("paddi %0, 0, 0+98, 1" : "=r" (rt) ); ++ rt = rt - TEXT_BSS_DELTA; ++ if (rt > pcrelative_buff_addr(0) && ++ rt < pcrelative_buff_addr(RELOC_BUFFER_SIZE)) ++ rt = 0xffff0098; ++} ++static void test_plq_off0_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plq 26, +0(0), 1" ); ++ PAD_ORI ++} ++static void test_plq_off8_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plq 26, +8(0), 1" ); ++ PAD_ORI ++} ++static void test_plq_off16_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plq 26, +16(0), 1" ); ++ PAD_ORI ++} ++static void test_plq_off32_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plq 26, +32(0), 1" ); ++ PAD_ORI ++} ++static void test_plq_off48_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plq 26, +48(0), 1" ); ++ PAD_ORI ++} ++static void test_plq_off64_R1 (void) { ++ PAD_ORI ++ __asm__ __volatile__ ("plq 26, +64(0), 1" ); ++ PAD_ORI ++ PAD_ORI ++} ++static void test_pstq_off0_R1 (void) { ++ __asm__ __volatile__ ("pstq 24, -0x1f400+0(0), 1" ); ++} ++static void test_pstq_off8_R1 (void) { ++ __asm__ __volatile__ ("pstq 24, -0x1f400+8(0), 1" ); ++} ++static void test_pstq_off16_R1 (void) { ++ __asm__ __volatile__ ("pstq 24, -0x1f400+16(0), 1" ); ++} ++static void test_pstq_off32_R1 (void) { ++ __asm__ __volatile__ ("pstq 24, -0x1f400+32(0), 1" ); ++} ++static void test_pstq_off64_R1 (void) { ++ __asm__ __volatile__ ("pstq 24, -0x1f400+64(0), 1" ); ++} ++ ++static test_list_t testgroup_generic[] = { ++ { &test_paddi_0_R1, "paddi 0_R1", "RT,RA,SI,R"}, /* bcwp */ ++ { &test_paddi_12_R1, "paddi 12_R1", "RT,RA,SI,R"}, /* bcwp */ ++ { &test_paddi_48_R1, "paddi 48_R1", "RT,RA,SI,R"}, /* bcwp */ ++ { &test_paddi_98_R1, "paddi 98_R1", "RT,RA,SI,R"}, /* bcwp */ ++ { &test_plbz_off0_R1, "plbz off0_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plbz_off8_R1, "plbz off8_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plbz_off16_R1, "plbz off16_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plbz_off32_R1, "plbz off32_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plbz_off64_R1, "plbz off64_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_pld_off0_R1, "pld off0_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_pld_off8_R1, "pld off8_R1", "RT,D(RA),R"}, /* bcwp */ ++ { 
&test_pld_off16_R1, "pld off16_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_pld_off32_R1, "pld off32_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_pld_off64_R1, "pld off64_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plha_off0_R1, "plha off0_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plha_off8_R1, "plha off8_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plha_off16_R1, "plha off16_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plha_off32_R1, "plha off32_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plha_off64_R1, "plha off64_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plhz_off0_R1, "plhz off0_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plhz_off8_R1, "plhz off8_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plhz_off16_R1, "plhz off16_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plhz_off32_R1, "plhz off32_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plhz_off64_R1, "plhz off64_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plq_off0_R1, "plq off0_R1", "RTp,D(RA),R"}, /* bcwp */ ++ { &test_plq_off8_R1, "plq off8_R1", "RTp,D(RA),R"}, /* bcwp */ ++ { &test_plq_off16_R1, "plq off16_R1", "RTp,D(RA),R"}, /* bcwp */ ++ { &test_plq_off32_R1, "plq off32_R1", "RTp,D(RA),R"}, /* bcwp */ ++ { &test_plq_off48_R1, "plq off48_R1", "RTp,D(RA),R"}, /* bcwp */ ++ { &test_plq_off64_R1, "plq off64_R1", "RTp,D(RA),R"}, /* bcwp */ ++ { &test_plwa_off0_R1, "plwa off0_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plwa_off8_R1, "plwa off8_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plwa_off16_R1, "plwa off16_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plwa_off32_R1, "plwa off32_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plwa_off64_R1, "plwa off64_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plwz_off0_R1, "plwz off0_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plwz_off8_R1, "plwz off8_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plwz_off16_R1, "plwz off16_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plwz_off32_R1, "plwz off32_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plwz_off64_R1, "plwz off64_R1", "RT,D(RA),R"}, /* bcwp */ ++ { &test_plxvp_off0_R1, "plxvp off0_R1", "XTp,D(RA),R"}, /* bcwp */ ++ { &test_plxvp_off8_R1, "plxvp off8_R1", "XTp,D(RA),R"}, /* bcwp */ ++ { &test_plxvp_off16_R1, "plxvp off16_R1", "XTp,D(RA),R"}, /* bcwp */ ++ { &test_plxvp_off24_R1, "plxvp off24_R1", "XTp,D(RA),R"}, /* bcwp */ ++ { &test_plxvp_off32_R1, "plxvp off32_R1", "XTp,D(RA),R"}, /* bcwp */ ++ { &test_pstb_off0_R1, "pstb off0_R1", "RS,D(RA),R"}, /* bcwp */ ++ { &test_pstb_off8_R1, "pstb off8_R1", "RS,D(RA),R"}, /* bcwp */ ++ { &test_pstb_off16_R1, "pstb off16_R1", "RS,D(RA),R"}, /* bcwp */ ++ { &test_pstb_off32_R1, "pstb off32_R1", "RS,D(RA),R"}, /* bcwp */ ++ { &test_pstd_off0_R1, "pstd off0_R1", "RS,D(RA),R"}, /* bcwp */ ++ { &test_pstd_off8_R1, "pstd off8_R1", "RS,D(RA),R"}, /* bcwp */ ++ { &test_pstd_off16_R1, "pstd off16_R1", "RS,D(RA),R"}, /* bcwp */ ++ { &test_pstd_off32_R1, "pstd off32_R1", "RS,D(RA),R"}, /* bcwp */ ++ { &test_psth_off0_R1, "psth off0_R1", "RS,D(RA),R"}, /* bcwp */ ++ { &test_psth_off8_R1, "psth off8_R1", "RS,D(RA),R"}, /* bcwp */ ++ { &test_psth_off16_R1, "psth off16_R1", "RS,D(RA),R"}, /* bcwp */ ++ { &test_psth_off32_R1, "psth off32_R1", "RS,D(RA),R"}, /* bcwp */ ++ { &test_pstq_off0_R1, "pstq off0_R1", "RSp,D(RA),R"}, /* bcwp */ ++ { &test_pstq_off8_R1, "pstq off8_R1", "RSp,D(RA),R"}, /* bcwp */ ++ { &test_pstq_off16_R1, "pstq off16_R1", "RSp,D(RA),R"}, /* bcwp */ ++ { &test_pstq_off32_R1, "pstq off32_R1", "RSp,D(RA),R"}, /* bcwp */ ++ { &test_pstq_off64_R1, "pstq off64_R1", "RSp,D(RA),R"}, /* bcwp */ ++ { &test_pstw_off0_R1, "pstw off0_R1", "RS,D(RA),R"}, /* bcwp */ ++ { 
&test_pstw_off8_R1, "pstw off8_R1", "RS,D(RA),R"}, /* bcwp */ ++ { &test_pstw_off16_R1, "pstw off16_R1", "RS,D(RA),R"}, /* bcwp */ ++ { &test_pstw_off32_R1, "pstw off32_R1", "RS,D(RA),R"}, /* bcwp */ ++ { NULL, NULL }, ++}; ++ ++/* Allow skipping of tests. */ ++unsigned long test_count=0xffff; ++unsigned long skip_count=0; ++unsigned long setup_only=0; ++ ++/* Set up a setjmp/longjmp to gently handle our SIGILLs and SIGSEGVs. */ ++static jmp_buf mybuf; ++ ++/* This (testfunction_generic) is meant to handle all of the instruction ++ variations. The helpers set up the register and iterator values ++ as is appropriate for the instruction being tested. */ ++static void testfunction_generic (const char* instruction_name, ++ test_func_t test_function, ++ unsigned int ignore_flags, ++ char * cur_form) { ++ ++ identify_form_components (instruction_name , cur_form); ++ debug_show_form (instruction_name, cur_form); ++ set_up_iterators (); ++ debug_show_iter_ranges (); ++ initialize_buffer (0); ++ init_pcrelative_write_target (); ++ debug_dump_buffer (); ++ ++ for (vrai = a_start; vrai < a_iters ; vrai+=a_inc) { ++ for (vrbi = b_start; vrbi < b_iters ; vrbi+=b_inc) { ++ for (vrci = c_start; vrci < c_iters ; vrci+=c_inc) { ++ for (vrmi = m_start; (vrmi < m_iters) ; vrmi+=m_inc) { ++ CHECK_OVERRIDES ++ debug_show_current_iteration (); ++ // Be sure to initialize the target registers first. ++ initialize_target_registers (); ++ initialize_source_registers (); ++ printf ("%s", instruction_name); ++ print_register_header (); ++ printf( " =>"); fflush (stdout); ++ if (!setup_only) { ++ if (enable_setjmp) { ++ if ( setjmp ( mybuf ) ) { ++ printf("signal tripped. (FIXME)\n"); ++ continue; ++ } ++ } ++ (*test_function) (); ++ } ++ print_register_footer (); ++ print_result_buffer (); ++ print_pcrelative_write_target (); ++ printf ("\n"); ++ } ++ } ++ } ++ } ++} ++ ++void mykillhandler ( int x ) { longjmp (mybuf, 1); } ++void mysegvhandler ( int x ) { longjmp (mybuf, 1); } ++ ++static void do_tests ( void ) ++{ ++ int groupcount; ++ char * cur_form; ++ test_group_t group_function = &testfunction_generic; ++ test_list_t *tests = testgroup_generic; ++ ++ struct sigaction kill_action, segv_action; ++ struct sigaction old_kill_action, old_segv_action; ++ if (enable_setjmp) { ++ kill_action.sa_handler = mykillhandler; ++ segv_action.sa_handler = mysegvhandler; ++ sigemptyset ( &kill_action.sa_mask ); ++ sigemptyset ( &segv_action.sa_mask ); ++ kill_action.sa_flags = SA_NODEFER; ++ segv_action.sa_flags = SA_NODEFER; ++ sigaction ( SIGILL, &kill_action, &old_kill_action); ++ sigaction ( SIGSEGV, &segv_action, &old_segv_action); ++ } ++ ++ for (groupcount = 0; tests[groupcount].name != NULL; groupcount++) { ++ cur_form = strdup(tests[groupcount].form); ++ current_test = tests[groupcount]; ++ identify_instruction_by_func_name (current_test.name); ++ if (groupcount < skip_count) continue; ++ if (verbose) printf("Test #%d ,", groupcount); ++ if (verbose > 1) printf(" instruction %s (v=%d)", current_test.name, verbose); ++ (*group_function) (current_test.name, current_test.func, 0, cur_form ); ++ printf ("\n"); ++ if (groupcount >= (skip_count+test_count)) break; ++ } ++ if (debug_show_labels) printf("\n"); ++ printf ("All done. 
Tested %d different instruction groups\n", groupcount); ++} ++ ++static void usage (void) ++{ ++ fprintf(stderr, ++ "Usage: test_isa_XXX [OPTIONS]\n" ++ "\t-h: display this help and exit\n" ++ "\t-v: increase verbosity\n" ++ "\t-a : limit number of a-iterations to \n" ++ "\t-b : limit number of b-iterations to \n" ++ "\t-c : limit number of c-iterations to \n" ++ "\t-n : limit to this number of tests.\n" ++ "\t-r : run only test # \n" ++ "\t\n" ++ "\t-j :enable setjmp to recover from illegal insns. \n" ++ "\t-m :(dev only?) lock VRM value to zero.\n" ++ "\t-z :(dev only?) lock MC value to zero.\n" ++ "\t-p :(dev only?) disable prefix instructions\n" ++ "\t-s : skip tests \n" ++ "\t-c : stop after running # of tests \n" ++ "\t-f : Do the test setup but do not actually execute the test instruction. \n" ++ ); ++} ++ ++int main (int argc, char **argv) ++{ ++ int c; ++ while ((c = getopt(argc, argv, "dhjvmpfzs:a:b:c:n:r:")) != -1) { ++ switch (c) { ++ case 'h': ++ usage(); ++ return 0; ++ ++ case 'v': ++ verbose++; ++ break; ++ ++ /* Options related to limiting the test iterations. */ ++ case 'a': ++ a_limit=atoi (optarg); ++ printf ("limiting a-iters to %ld.\n", a_limit); ++ break; ++ case 'b': ++ b_limit=atoi (optarg); ++ printf ("limiting b-iters to %ld.\n", b_limit); ++ break; ++ case 'c': ++ c_limit=atoi (optarg); ++ printf ("limiting c-iters to %ld.\n", c_limit); ++ break; ++ case 'n': // run this number of tests. ++ test_count=atoi (optarg); ++ printf ("limiting to %ld tests\n", test_count); ++ break; ++ case 'r': // run just test #. ++ skip_count=atoi (optarg); ++ test_count=0; ++ if (verbose) printf("Running test number %ld\n", skip_count); ++ break; ++ case 's': // skip this number of tests. ++ skip_count=atoi (optarg); ++ printf ("skipping %ld tests\n", skip_count); ++ break; ++ ++ /* debug options. 
*/ ++ case 'd': ++ dump_tables=1; ++ printf("DEBUG:dump_tables.\n"); ++ break; ++ case 'f': ++ setup_only=1; ++ printf("DEBUG:setup_only.\n"); ++ break; ++ case 'j': ++ enable_setjmp=1; ++ printf ("DEBUG:setjmp enabled.\n"); ++ break; ++ case 'm': ++ vrm_override=1; ++ printf ("DEBUG:vrm override enabled.\n"); ++ break; ++ case 'p': ++ prefix_override=1; ++ printf ("DEBUG:prefix override enabled.\n"); ++ break; ++ case 'z': ++ mc_override=1; ++ printf ("DEBUG:MC override enabled.\n"); ++ break; ++ default: ++ usage(); ++ fprintf(stderr, "Unknown argument: '%c'\n", c); ++ } ++ } ++ ++ generic_prologue (); ++ build_vsx_table (); ++ build_args_table (); ++ build_float_vsx_tables (); ++ ++ if (dump_tables) { ++ dump_float_vsx_tables (); ++ dump_vsxargs (); ++ } ++ ++ do_tests (); ++ ++ return 0; ++} ++ ++#else // HAS_ISA_3_1 ++int main (int argc, char **argv) ++{ ++ printf("NO ISA 3.1 SUPPORT\n"); ++ return 0; ++} ++#endif +diff --git a/none/tests/ppc64/test_isa_3_1_R1_RT.stderr.exp b/none/tests/ppc64/test_isa_3_1_R1_RT.stderr.exp +new file mode 100644 +index 000000000..139597f9c +--- /dev/null ++++ b/none/tests/ppc64/test_isa_3_1_R1_RT.stderr.exp +@@ -0,0 +1,2 @@ ++ ++ +diff --git a/none/tests/ppc64/test_isa_3_1_R1_RT.stdout.exp b/none/tests/ppc64/test_isa_3_1_R1_RT.stdout.exp +new file mode 100644 +index 000000000..87594748f +--- /dev/null ++++ b/none/tests/ppc64/test_isa_3_1_R1_RT.stdout.exp +@@ -0,0 +1,138 @@ ++paddi 0_R1 => ffff0000 ++ ++paddi 12_R1 => ffff0012 ++ ++paddi 48_R1 => ffff0048 ++ ++paddi 98_R1 => ffff0098 ++ ++plbz off0_R1 => 1a ++ ++plbz off8_R1 => 1f ++ ++plbz off16_R1 => 1f ++ ++plbz off32_R1 => 1b ++ ++plbz off64_R1 => 1b ++ ++pld off0_R1 => e740000004100000 ++ ++pld off8_R1 => 4e800020 ++ ++pld off16_R1 => 6318001862f7001f ++ ++pld off32_R1 => 639c001c637b001b ++ ++pld off64_R1 => 639c001c637b001b ++ ++plha off0_R1 => 1a ++ ++plha off8_R1 => 1f ++ ++plha off16_R1 => 1f ++ ++plha off32_R1 => 1b ++ ++plha off64_R1 => 1b ++ ++plhz off0_R1 => 1a ++ ++plhz off8_R1 => 1f ++ ++plhz off16_R1 => 1f ++ ++plhz off32_R1 => 1b ++ ++plhz off64_R1 => 1b ++ ++plq off0_R1 => e34000000410001a 62d6001662b5001f ++ ++plq off8_R1 => 62d6001662b5001f 6318001862f7001f ++ ++plq off16_R1 => 6318001862f7001f 635a001a6339001b ++ ++plq off32_R1 => 639c001c637b001b 4e80003b ++ ++plq off48_R1 => 1a 62d6001662b5001f ++ ++plq off64_R1 => 639c001c637b001b 4e80003b ++ ++plwa off0_R1 => 4100000 ++ ++plwa off8_R1 => 4e800020 ++ ++plwa off16_R1 => 0 ++ ++plwa off32_R1 => 637b001b ++ ++plwa off64_R1 => 637b001b ++ ++plwz off0_R1 => 6100000 ++ ++plwz off8_R1 => 4e800020 ++ ++plwz off16_R1 => 0 ++ ++plwz off32_R1 => 637b001b ++ ++plwz off64_R1 => 637b001b ++ ++plxvp off0_R1 => 6318001862f70017 635a001a63390019 ea80000004100000 62d6001662b50015 ++ ++plxvp off8_R1 => 635a001a63390019 639c001c637b001b 62d6001662b50015 6318001862f70017 ++ ++plxvp off16_R1 => 639c001c637b001b 000000004e800020 6318001862f70017 635a001a63390019 ++ ++plxvp off24_R1 => 000000004e800020 0000000000000000 635a001a63390019 639c001c637b001b ++ ++plxvp off32_R1 => 0000000000000000 62d6001662b50015 639c001c637b001b 000000004e800020 ++ ++pstb off0_R1 102030405060708 => 08 ++ ++pstb off8_R1 102030405060708 => 08 ++ ++pstb off16_R1 102030405060708 => 08 ++ ++pstb off32_R1 102030405060708 => 08 ++ ++pstd off0_R1 102030405060708 => 0102030405060708 ++ ++pstd off8_R1 102030405060708 => 0102030405060708 ++ ++pstd off16_R1 102030405060708 => 0102030405060708 ++ ++pstd off32_R1 102030405060708 => 0102030405060708 ++ ++psth off0_R1 102030405060708 => 
0708 ++ ++psth off8_R1 102030405060708 => 0708 ++ ++psth off16_R1 102030405060708 => 0708 ++ ++psth off32_R1 102030405060708 => 0708 ++ ++pstq off0_R1 102030405060708 a5b4c3d2e1f00918 => 0102030405060708 a5b4c3d2e1f00918 ++pstq off0_R1 102030405060708 a5b4c3d2e1f00918 => 0102030405060708 a5b4c3d2e1f00918 ++ ++pstq off8_R1 102030405060708 a5b4c3d2e1f00918 => 0102030405060708 a5b4c3d2e1f00918 ++pstq off8_R1 102030405060708 a5b4c3d2e1f00918 => 0102030405060708 a5b4c3d2e1f00918 ++ ++pstq off16_R1 102030405060708 a5b4c3d2e1f00918 => 0102030405060708 a5b4c3d2e1f00918 ++pstq off16_R1 102030405060708 a5b4c3d2e1f00918 => 0102030405060708 a5b4c3d2e1f00918 ++ ++pstq off32_R1 102030405060708 a5b4c3d2e1f00918 => 0102030405060708 a5b4c3d2e1f00918 ++pstq off32_R1 102030405060708 a5b4c3d2e1f00918 => 0102030405060708 a5b4c3d2e1f00918 ++ ++pstq off64_R1 102030405060708 a5b4c3d2e1f00918 => 0102030405060708 a5b4c3d2e1f00918 ++pstq off64_R1 102030405060708 a5b4c3d2e1f00918 => 0102030405060708 a5b4c3d2e1f00918 ++ ++pstw off0_R1 102030405060708 => 05060708 ++ ++pstw off8_R1 102030405060708 => 05060708 ++ ++pstw off16_R1 102030405060708 => 05060708 ++ ++pstw off32_R1 102030405060708 => 05060708 ++ ++All done. Tested 66 different instruction groups +diff --git a/none/tests/ppc64/test_isa_3_1_R1_RT.vgtest b/none/tests/ppc64/test_isa_3_1_R1_RT.vgtest +new file mode 100644 +index 000000000..61d7f65a1 +--- /dev/null ++++ b/none/tests/ppc64/test_isa_3_1_R1_RT.vgtest +@@ -0,0 +1,2 @@ ++prereq: ../../../tests/check_ppc64_auxv_cap arch_3_1 ++prog: test_isa_3_1_R1_RT +diff --git a/none/tests/ppc64/test_isa_3_1_R1_XT.c b/none/tests/ppc64/test_isa_3_1_R1_XT.c +new file mode 100644 +index 000000000..58885b8d3 +--- /dev/null ++++ b/none/tests/ppc64/test_isa_3_1_R1_XT.c +@@ -0,0 +1,534 @@ ++/* ++ * Valgrind testcase for PowerPC ISA 3.1 ++ * ++ * Copyright (C) 2019-2020 Will Schmidt ++ * ++ * 64bit build: ++ * gcc -Winline -Wall -g -O -mregnames -maltivec -m64 ++ */ ++ ++/* ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License as ++ * published by the Free Software Foundation; either version 2 of the ++ * License, or (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include ++#ifdef HAS_ISA_3_1 ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++/* Condition Register fields. ++ These are used to capture the condition register values immediately after ++ the instruction under test is executed. This is done to help prevent other ++ test overhead (switch statements, result compares, etc) from disturbing ++ the test case results. 
*/ ++unsigned long current_cr; ++unsigned long current_fpscr; ++ ++struct test_list_t current_test; ++ ++#include "isa_3_1_helpers.h" ++static void test_pstxvp_off0_R1 (void) { ++ __asm__ __volatile__ ("pstxvp 20, -0x1f400+0(0),1"); ++} ++static void test_pstxvp_off16_R1 (void) { ++ __asm__ __volatile__ ("pstxvp 20, -0x1f400+16(0),1"); ++} ++static void test_pstxvp_off32_R1 (void) { ++ __asm__ __volatile__ ("pstxvp 20, -0x1f400+32(0),1"); ++} ++static void test_pstxvp_off48_R1 (void) { ++ __asm__ __volatile__ ("pstxvp 20, -0x1f400+48(0),1"); ++} ++static void test_plfd_64_R1 (void) { ++ __asm__ __volatile__ ("plfd 28, +64(0), 1"); ++ PAD_ORI ++ PAD_ORI ++} ++static void test_plfd_32_R1 (void) { ++ __asm__ __volatile__ ("plfd 28, +32(0), 1"); ++ PAD_ORI ++} ++static void test_plfd_16_R1 (void) { ++ __asm__ __volatile__ ("plfd 28, +16(0), 1"); ++ PAD_ORI ++} ++static void test_plfd_8_R1 (void) { ++ __asm__ __volatile__ ("plfd 28, +8(0), 1"); ++ PAD_ORI ++} ++static void test_plfd_4_R1 (void) { ++ __asm__ __volatile__ ("plfd 28, +4(0), 1"); ++ PAD_ORI ++} ++static void test_plfd_0_R1 (void) { ++ __asm__ __volatile__ ("plfd 28, +0(0), 1"); ++ PAD_ORI ++} ++static void test_plfs_64_R1 (void) { ++ __asm__ __volatile__ ("plfs 28, +64(0), 1"); ++ PAD_ORI ++ PAD_ORI ++} ++static void test_plfs_32_R1 (void) { ++ __asm__ __volatile__ ("plfs 28, +32(0), 1"); ++ PAD_ORI ++} ++static void test_plfs_16_R1 (void) { ++ __asm__ __volatile__ ("plfs 28, +16(0), 1"); ++ PAD_ORI ++} ++static void test_plfs_8_R1 (void) { ++ __asm__ __volatile__ ("plfs 28, +8(0), 1"); ++ PAD_ORI ++} ++static void test_plfs_4_R1 (void) { ++ __asm__ __volatile__ ("plfs 28, +4(0), 1"); ++ PAD_ORI ++} ++static void test_plfs_0_R1 (void) { ++ __asm__ __volatile__ ("plfs 28, +0(0), 1"); ++ PAD_ORI ++} ++static void test_pstfd_32_R1 (void) { ++ __asm__ __volatile__ ("pstfd 26, -0x1f400+32(0), 1"); ++} ++static void test_pstfd_16_R1 (void) { ++ __asm__ __volatile__ ("pstfd 26, -0x1f400+16(0), 1"); ++} ++static void test_pstfd_8_R1 (void) { ++ __asm__ __volatile__ ("pstfd 26, -0x1f400+8(0), 1"); ++} ++static void test_pstfd_4_R1 (void) { ++ __asm__ __volatile__ ("pstfd 26, -0x1f400+4(0), 1"); ++} ++static void test_pstfd_0_R1 (void) { ++ __asm__ __volatile__ ("pstfd 26, -0x1f400+0(0), 1"); ++} ++static void test_pstfs_32_R1 (void) { ++ __asm__ __volatile__ ("pstfs 26, -0x1f400+32(0), 1"); ++} ++static void test_pstfs_16_R1 (void) { ++ __asm__ __volatile__ ("pstfs 26, -0x1f400+16(0), 1"); ++} ++static void test_pstfs_8_R1 (void) { ++ __asm__ __volatile__ ("pstfs 26, -0x1f400+8(0), 1"); ++} ++static void test_pstfs_4_R1 (void) { ++ __asm__ __volatile__ ("pstfs 26, -0x1f400+4(0), 1"); ++} ++static void test_pstfs_0_R1 (void) { ++ __asm__ __volatile__ ("pstfs 26, -0x1f400+0(0), 1"); ++} ++static void test_plxsd_64_R1 (void) { ++ __asm__ __volatile__ ("plxsd %0, +64(0), 1" : "=v" (vrt) ); ++ PAD_ORI ++ PAD_ORI ++} ++static void test_plxsd_32_R1 (void) { ++ __asm__ __volatile__ (".align 2 ; plxsd %0, +32(0), 1" : "=v" (vrt) ); ++ PAD_ORI ++} ++static void test_plxsd_16_R1 (void) { ++ __asm__ __volatile__ ("plxsd %0, +16(0), 1; pnop;pnop;pnop; " : "=v" (vrt) ); ++ PAD_ORI ++} ++static void test_plxsd_8_R1 (void) { ++ __asm__ __volatile__ ("plxsd %0, +8(0), 1; pnop;pnop;pnop; " : "=v" (vrt) ); ++ PAD_ORI ++} ++static void test_plxsd_4_R1 (void) { ++ __asm__ __volatile__ ("plxsd %0, +4(0), 1; pnop;pnop;pnop; " : "=v" (vrt) ); ++ PAD_ORI ++} ++static void test_plxsd_0_R1 (void) { ++ __asm__ __volatile__ ("plxsd %0, +0(0), 1; pnop;pnop;pnop; " : 
"=v" (vrt) ); ++ PAD_ORI ++} ++static void test_plxssp_64_R1 (void) { ++ __asm__ __volatile__ ("plxssp %0, +64(0), 1; pnop;pnop;pnop; " : "=v" (vrt) ); ++ PAD_ORI ++ PAD_ORI ++} ++static void test_plxssp_32_R1 (void) { ++ __asm__ __volatile__ ("plxssp %0, +32(0), 1; pnop; " : "=v" (vrt) ); ++ PAD_ORI ++} ++static void test_plxssp_16_R1 (void) { ++ __asm__ __volatile__ ("plxssp %0, +16(0), 1; pnop;pnop;pnop; " : "=v" (vrt) ); ++ PAD_ORI ++} ++static void test_plxssp_8_R1 (void) { ++ __asm__ __volatile__ ("plxssp %0, +8(0), 1; pnop;pnop;pnop; " : "=v" (vrt) ); ++ PAD_ORI ++} ++static void test_plxssp_4_R1 (void) { ++ __asm__ __volatile__ ("plxssp %0, +4(0), 1; pnop;pnop;pnop; " : "=v" (vrt) ); ++ PAD_ORI ++} ++static void test_plxssp_0_R1 (void) { ++ __asm__ __volatile__ ("plxssp %0, +0(0), 1; pnop;pnop;pnop; " : "=v" (vrt) ); ++ PAD_ORI ++} ++/* Follow the short-range plxv instructions with nop in order to ++ pad out subsequent instructions. When written there are found ++ to be fluctuations in the instructions to store the result back ++ into the target variable. (pla,pstxv...). ++ */ ++static void test_plxv_16_R1 (void) { ++ __asm__ __volatile__ ("plxv %x0, +16(0), 1; pnop;pnop;pnop;" : "=wa" (vec_xt) ); ++ PAD_ORI ++} ++static void test_plxv_8_R1 (void) { ++ __asm__ __volatile__ ("plxv %x0, +8(0), 1; pnop;pnop;pnop;" : "=wa" (vec_xt) ); ++ PAD_ORI ++} ++static void test_plxv_4_R1 (void) { ++ __asm__ __volatile__ ("plxv %x0, +4(0), 1; pnop;pnop;pnop;" : "=wa" (vec_xt) ); ++ PAD_ORI ++} ++static void test_plxv_0_R1 (void) { ++ __asm__ __volatile__ ("plxv %x0, +0(0), 1; pnop;pnop;pnop; " : "=wa" (vec_xt) ); ++ PAD_ORI ++} ++static void test_pstxsd_64_R1 (void) { ++ __asm__ __volatile__ (".align 2 ; pstxsd 22, -0x1f400+64(0), 1" ); ++} ++static void test_pstxsd_32_R1 (void) { ++ __asm__ __volatile__ (".align 2 ; pstxsd 22, -0x1f400+32(0), 1" ); ++} ++static void test_pstxsd_16_R1 (void) { ++ __asm__ __volatile__ (".align 2 ; pstxsd 22, -0x1f400+16(0), 1" ); ++} ++static void test_pstxsd_8_R1 (void) { ++ __asm__ __volatile__ (".align 2 ; pstxsd 22, -0x1f400+8(0), 1" ); ++} ++static void test_pstxsd_4_R1 (void) { ++ __asm__ __volatile__ (".align 2 ; pstxsd 22, -0x1f400+4(0), 1" ); ++} ++static void test_pstxsd_0_R1 (void) { ++ __asm__ __volatile__ (".align 2 ; pstxsd 22, -0x1f400+0(0), 1" ); ++} ++static void test_pstxssp_64_R1 (void) { ++ __asm__ __volatile__ ("pstxssp 22, -0x1f400+64(0), 1" ); ++} ++static void test_pstxssp_32_R1 (void) { ++ __asm__ __volatile__ ("pstxssp 22, -0x1f400+32(0), 1"); ++} ++static void test_pstxssp_16_R1 (void) { ++ __asm__ __volatile__ ("pstxssp 22, -0x1f400+16(0), 1"); ++} ++static void test_pstxssp_8_R1 (void) { ++ __asm__ __volatile__ ("pstxssp 22, -0x1f400+8(0), 1"); ++} ++static void test_pstxssp_4_R1 (void) { ++ __asm__ __volatile__ ("pstxssp 22, -0x1f400+4(0), 1"); ++} ++static void test_pstxssp_0_R1 (void) { ++ __asm__ __volatile__ ("pstxssp 22, -0x1f400+0(0), 1"); ++} ++static void test_pstxv_16_R1 (void) { ++ __asm__ __volatile__ ("pstxv %x0, -0x1f400+16(0), 1" :: "wa" (vec_xs)); ++} ++static void test_pstxv_8_R1 (void) { ++ __asm__ __volatile__ ("pstxv %x0, -0x1f400+8(0), 1" :: "wa" (vec_xs)); ++} ++static void test_pstxv_4_R1 (void) { ++ __asm__ __volatile__ ("pstxv %x0, -0x1f400+4(0), 1" :: "wa" (vec_xs)); ++} ++static void test_pstxv_0_R1 (void) { ++ __asm__ __volatile__ ("pstxv %x0, -0x1f400+0(0), 1" :: "wa" (vec_xs)); ++} ++ ++static test_list_t testgroup_generic[] = { ++ { &test_plfd_0_R1, "plfd 0_R1", "FRT,D(RA),R"}, /* bcwp */ ++ { 
&test_plfd_4_R1, "plfd 4_R1", "FRT,D(RA),R"}, /* bcwp */ ++ { &test_plfd_8_R1, "plfd 8_R1", "FRT,D(RA),R"}, /* bcwp */ ++ { &test_plfd_16_R1, "plfd 16_R1", "FRT,D(RA),R"}, /* bcwp */ ++ { &test_plfd_32_R1, "plfd 32_R1", "FRT,D(RA),R"}, /* bcwp */ ++ { &test_plfd_64_R1, "plfd 64_R1", "FRT,D(RA),R"}, /* bcwp */ ++ { &test_plfs_0_R1, "plfs 0_R1", "FRT,D(RA),R"}, /* bcwp */ ++ { &test_plfs_4_R1, "plfs 4_R1", "FRT,D(RA),R"}, /* bcwp */ ++ { &test_plfs_8_R1, "plfs 8_R1", "FRT,D(RA),R"}, /* bcwp */ ++ { &test_plfs_16_R1, "plfs 16_R1", "FRT,D(RA),R"}, /* bcwp */ ++ { &test_plfs_32_R1, "plfs 32_R1", "FRT,D(RA),R"}, /* bcwp */ ++ { &test_plfs_64_R1, "plfs 64_R1", "FRT,D(RA),R"}, /* bcwp */ ++ { &test_plxsd_0_R1, "plxsd 0_R1", "VRT,D(RA),R", 0b00110000}, /* bcwp */ ++ { &test_plxsd_4_R1, "plxsd 4_R1", "VRT,D(RA),R", 0b00110000}, /* bcwp */ ++ { &test_plxsd_8_R1, "plxsd 8_R1", "VRT,D(RA),R", 0b00110000}, /* bcwp */ ++ { &test_plxsd_16_R1, "plxsd 16_R1", "VRT,D(RA),R", 0b00110000}, /* bcwp */ ++ { &test_plxsd_32_R1, "plxsd 32_R1", "VRT,D(RA),R", 0b00110000}, /* bcwp */ ++ { &test_plxsd_64_R1, "plxsd 64_R1", "VRT,D(RA),R", 0b00110000}, /* bcwp */ ++ { &test_plxssp_0_R1, "plxssp 0_R1", "VRT,D(RA),R", 0b00001111}, /* bcwp */ ++ { &test_plxssp_4_R1, "plxssp 4_R1", "VRT,D(RA),R", 0b00001111}, /* bcwp */ ++ { &test_plxssp_8_R1, "plxssp 8_R1", "VRT,D(RA),R", 0b00001111}, /* bcwp */ ++ { &test_plxssp_16_R1, "plxssp 16_R1", "VRT,D(RA),R", 0b00001111}, /* bcwp */ ++ { &test_plxssp_32_R1, "plxssp 32_R1", "VRT,D(RA),R", 0b00001111}, /* bcwp */ ++ { &test_plxssp_64_R1, "plxssp 64_R1", "VRT,D(RA),R", 0b00001111}, /* bcwp */ ++ { &test_plxv_0_R1, "plxv 0_R1", "XT,D(RA),R"}, /* bcwp */ ++ { &test_plxv_4_R1, "plxv 4_R1", "XT,D(RA),R"}, /* bcwp */ ++ { &test_plxv_8_R1, "plxv 8_R1", "XT,D(RA),R"}, /* bcwp */ ++ { &test_plxv_16_R1, "plxv 16_R1", "XT,D(RA),R"}, /* bcwp */ ++ { &test_pstfd_0_R1, "pstfd 0_R1", "FRS,D(RA),R", 0b00110000}, /* bcwp */ ++ { &test_pstfd_4_R1, "pstfd 4_R1", "FRS,D(RA),R", 0b00110000}, /* bcwp */ ++ { &test_pstfd_8_R1, "pstfd 8_R1", "FRS,D(RA),R", 0b00110000}, /* bcwp */ ++ { &test_pstfd_16_R1, "pstfd 16_R1", "FRS,D(RA),R", 0b00110000}, /* bcwp */ ++ { &test_pstfd_32_R1, "pstfd 32_R1", "FRS,D(RA),R", 0b00110000}, /* bcwp */ ++ { &test_pstfs_0_R1, "pstfs 0_R1", "FRS,D(RA),R", 0b00001111}, /* bcwp */ ++ { &test_pstfs_4_R1, "pstfs 4_R1", "FRS,D(RA),R", 0b00001111}, /* bcwp */ ++ { &test_pstfs_8_R1, "pstfs 8_R1", "FRS,D(RA),R", 0b00001111}, /* bcwp */ ++ { &test_pstfs_16_R1, "pstfs 16_R1", "FRS,D(RA),R", 0b00001111}, /* bcwp */ ++ { &test_pstfs_32_R1, "pstfs 32_R1", "FRS,D(RA),R", 0b00001111}, /* bcwp */ ++ { &test_pstxsd_0_R1, "pstxsd 0_R1", "VRS,D(RA),R"}, /* bcwp */ ++ { &test_pstxsd_4_R1, "pstxsd 4_R1", "VRS,D(RA),R"}, /* bcwp */ ++ { &test_pstxsd_8_R1, "pstxsd 8_R1", "VRS,D(RA),R"}, /* bcwp */ ++ { &test_pstxsd_16_R1, "pstxsd 16_R1", "VRS,D(RA),R"}, /* bcwp */ ++ { &test_pstxsd_32_R1, "pstxsd 32_R1", "VRS,D(RA),R"}, /* bcwp */ ++ { &test_pstxsd_64_R1, "pstxsd 64_R1", "VRS,D(RA),R"}, /* bcwp */ ++ { &test_pstxssp_0_R1, "pstxssp 0_R1", "VRS,D(RA),R"}, /* bcwp */ ++ { &test_pstxssp_4_R1, "pstxssp 4_R1", "VRS,D(RA),R"}, /* bcwp */ ++ { &test_pstxssp_8_R1, "pstxssp 8_R1", "VRS,D(RA),R"}, /* bcwp */ ++ { &test_pstxssp_16_R1, "pstxssp 16_R1", "VRS,D(RA),R"}, /* bcwp */ ++ { &test_pstxssp_32_R1, "pstxssp 32_R1", "VRS,D(RA),R"}, /* bcwp */ ++ { &test_pstxssp_64_R1, "pstxssp 64_R1", "VRS,D(RA),R"}, /* bcwp */ ++ { &test_pstxvp_off0_R1, "pstxvp off0_R1", "XSp,D(RA),R"}, /* bcwp */ ++ { &test_pstxvp_off16_R1, 
"pstxvp off16_R1", "XSp,D(RA),R"}, /* bcwp */ ++ { &test_pstxvp_off32_R1, "pstxvp off32_R1", "XSp,D(RA),R"}, /* bcwp */ ++ { &test_pstxvp_off48_R1, "pstxvp off48_R1", "XSp,D(RA),R"}, /* bcwp */ ++ { &test_pstxv_0_R1, "pstxv 0_R1", "XS,D(RA),R"}, /* bcwp */ ++ { &test_pstxv_4_R1, "pstxv 4_R1", "XS,D(RA),R"}, /* bcwp */ ++ { &test_pstxv_8_R1, "pstxv 8_R1", "XS,D(RA),R"}, /* bcwp */ ++ { &test_pstxv_16_R1, "pstxv 16_R1", "XS,D(RA),R"}, /* bcwp */ ++ { NULL, NULL }, ++}; ++ ++/* Allow skipping of tests. */ ++unsigned long test_count=0xffff; ++unsigned long skip_count=0; ++unsigned long setup_only=0; ++ ++/* Set up a setjmp/longjmp to gently handle our SIGILLs and SIGSEGVs. */ ++static jmp_buf mybuf; ++ ++/* This (testfunction_generic) is meant to handle all of the instruction ++ variations. The helpers set up the register and iterator values ++ as is appropriate for the instruction being tested. */ ++static void testfunction_generic (const char* instruction_name, ++ test_func_t test_function, ++ unsigned int ignore_flags, ++ char * cur_form) { ++ ++ identify_form_components (instruction_name , cur_form); ++ debug_show_form (instruction_name, cur_form); ++ set_up_iterators (); ++ debug_show_iter_ranges (); ++ initialize_buffer (0); ++ init_pcrelative_write_target (); ++ debug_dump_buffer (); ++ ++ for (vrai = a_start; vrai < a_iters ; vrai+=a_inc) { ++ for (vrbi = b_start; vrbi < b_iters ; vrbi+=b_inc) { ++ for (vrci = c_start; vrci < c_iters ; vrci+=c_inc) { ++ for (vrmi = m_start; (vrmi < m_iters) ; vrmi+=m_inc) { ++ CHECK_OVERRIDES ++ debug_show_current_iteration (); ++ // Be sure to initialize the target registers first. ++ initialize_target_registers (); ++ initialize_source_registers (); ++ vec_xa[0]=0x1234; ++ vec_xa[1]=0x4567; ++ printf ("%s", instruction_name); ++ print_register_header (); ++ printf( " =>"); fflush (stdout); ++ if (!setup_only) { ++ if (enable_setjmp) { ++ if ( setjmp ( mybuf ) ) { ++ printf("signal tripped. (FIXME)\n"); ++ continue; ++ } ++ } ++ (*test_function) (); ++ } ++ print_register_footer (); ++ print_result_buffer (); ++ print_pcrelative_write_target (); ++ printf ("\n"); ++ } ++ } ++ } ++ } ++} ++ ++void mykillhandler ( int x ) { longjmp (mybuf, 1); } ++void mysegvhandler ( int x ) { longjmp (mybuf, 1); } ++ ++static void do_tests ( void ) ++{ ++ int groupcount; ++ char * cur_form; ++ test_group_t group_function = &testfunction_generic; ++ test_list_t *tests = testgroup_generic; ++ ++ struct sigaction kill_action, segv_action; ++ struct sigaction old_kill_action, old_segv_action; ++ if (enable_setjmp) { ++ kill_action.sa_handler = mykillhandler; ++ segv_action.sa_handler = mysegvhandler; ++ sigemptyset ( &kill_action.sa_mask ); ++ sigemptyset ( &segv_action.sa_mask ); ++ kill_action.sa_flags = SA_NODEFER; ++ segv_action.sa_flags = SA_NODEFER; ++ sigaction ( SIGILL, &kill_action, &old_kill_action); ++ sigaction ( SIGSEGV, &segv_action, &old_segv_action); ++ } ++ ++ for (groupcount = 0; tests[groupcount].name != NULL; groupcount++) { ++ cur_form = strdup(tests[groupcount].form); ++ current_test = tests[groupcount]; ++ identify_instruction_by_func_name (current_test.name); ++ if (groupcount < skip_count) continue; ++ if (verbose) printf("Test #%d ,", groupcount); ++ if (verbose > 1) printf(" instruction %s (v=%d)", current_test.name, verbose); ++ (*group_function) (current_test.name, current_test.func, 0, cur_form ); ++ printf ("\n"); ++ if (groupcount >= (skip_count+test_count)) break; ++ } ++ if (debug_show_labels) printf("\n"); ++ printf ("All done. 
Tested %d different instruction groups\n", groupcount);
++}
++
++static void usage (void)
++{
++   fprintf(stderr,
++      "Usage: test_isa_XXX [OPTIONS]\n"
++      "\t-h: display this help and exit\n"
++      "\t-v: increase verbosity\n"
++      "\t-a <num>: limit number of a-iterations to <num>\n"
++      "\t-b <num>: limit number of b-iterations to <num>\n"
++      "\t-c <num>: limit number of c-iterations to <num>\n"
++      "\t-n <num>: limit to this number of tests.\n"
++      "\t-r <num>: run only test #<num> \n"
++      "\t\n"
++      "\t-j :enable setjmp to recover from illegal insns. \n"
++      "\t-m :(dev only?) lock VRM value to zero.\n"
++      "\t-z :(dev only?) lock MC value to zero.\n"
++      "\t-p :(dev only?) disable prefix instructions\n"
++      "\t-s <num>: skip <num> tests \n"
++      "\t-c <num>: stop after running <num> # of tests \n"
++      "\t-f : Do the test setup but do not actually execute the test instruction. \n"
++      );
++}
++
++int main (int argc, char **argv)
++{
++   int c;
++   while ((c = getopt(argc, argv, "dhjvmpfzs:a:b:c:n:r:")) != -1) {
++      switch (c) {
++         case 'h':
++            usage();
++            return 0;
++
++         case 'v':
++            verbose++;
++            break;
++
++         /* Options related to limiting the test iterations. */
++         case 'a':
++            a_limit=atoi (optarg);
++            printf ("limiting a-iters to %ld.\n", a_limit);
++            break;
++         case 'b':
++            b_limit=atoi (optarg);
++            printf ("limiting b-iters to %ld.\n", b_limit);
++            break;
++         case 'c':
++            c_limit=atoi (optarg);
++            printf ("limiting c-iters to %ld.\n", c_limit);
++            break;
++         case 'n': // run this number of tests.
++            test_count=atoi (optarg);
++            printf ("limiting to %ld tests\n", test_count);
++            break;
++         case 'r': // run just test #.
++            skip_count=atoi (optarg);
++            test_count=0;
++            if (verbose) printf("Running test number %ld\n", skip_count);
++            break;
++         case 's': // skip this number of tests.
++            skip_count=atoi (optarg);
++            printf ("skipping %ld tests\n", skip_count);
++            break;
++
++         /* debug options.
*/ ++ case 'd': ++ dump_tables=1; ++ printf("DEBUG:dump_tables.\n"); ++ break; ++ case 'f': ++ setup_only=1; ++ printf("DEBUG:setup_only.\n"); ++ break; ++ case 'j': ++ enable_setjmp=1; ++ printf ("DEBUG:setjmp enabled.\n"); ++ break; ++ case 'm': ++ vrm_override=1; ++ printf ("DEBUG:vrm override enabled.\n"); ++ break; ++ case 'p': ++ prefix_override=1; ++ printf ("DEBUG:prefix override enabled.\n"); ++ break; ++ case 'z': ++ mc_override=1; ++ printf ("DEBUG:MC override enabled.\n"); ++ break; ++ default: ++ usage(); ++ fprintf(stderr, "Unknown argument: '%c'\n", c); ++ } ++ } ++ ++ generic_prologue (); ++ build_vsx_table (); ++ build_args_table (); ++ build_float_vsx_tables (); ++ ++ if (dump_tables) { ++ dump_float_vsx_tables (); ++ dump_vsxargs (); ++ } ++ ++ do_tests (); ++ ++ return 0; ++} ++ ++#else // HAS_ISA_3_1 ++int main (int argc, char **argv) ++{ ++ printf("NO ISA 3.1 SUPPORT\n"); ++ return 0; ++} ++#endif +diff --git a/none/tests/ppc64/test_isa_3_1_R1_XT.stderr.exp b/none/tests/ppc64/test_isa_3_1_R1_XT.stderr.exp +new file mode 100644 +index 000000000..139597f9c +--- /dev/null ++++ b/none/tests/ppc64/test_isa_3_1_R1_XT.stderr.exp +@@ -0,0 +1,2 @@ ++ ++ +diff --git a/none/tests/ppc64/test_isa_3_1_R1_XT.stdout.exp b/none/tests/ppc64/test_isa_3_1_R1_XT.stdout.exp +new file mode 100644 +index 000000000..48d591f4d +--- /dev/null ++++ b/none/tests/ppc64/test_isa_3_1_R1_XT.stdout.exp +@@ -0,0 +1,127 @@ ++plfd 0_R1 =>_ -4.903986e+55 _ cb80000006100000, 0 ++ ++plfd 4_R1 =>_ 3.095878e+167 _ 62b50015cb800004, 0 ++ ++plfd 8_R1 =>_ 1.297320e+168 _ 62d6001662b50015, 0 ++ ++plfd 16_R1 =>_ 2.264413e+169 _ 6318001862f70017, 0 ++ ++plfd 32_R1 =>_ 6.763045e+171 _ 639c001c637b001b, 0 ++ ++plfd 64_R1 =>_ 6.763045e+171 _ 639c001c637b001b, 0 ++ ++plfs 0_R1 =>_ 2.708339e-35 _ 38c2000000000000, 0 ++ ++plfs 4_R1 =>_ -2.560001e+02 _ c070000080000000, 0 ++ ++plfs 8_R1 =>_ 1.669433e+21 _ 4456a002a0000000, 0 ++ ++plfs 16_R1 =>_ 2.278176e+21 _ 445ee002e0000000, 0 ++ ++plfs 32_R1 =>_ 4.630140e+21 _ 446f600360000000, 0 ++ ++plfs 64_R1 =>_ 4.630140e+21 _ 446f600360000000, 0 ++ ++plxsd 0_R1 => a800000004100000,0000000000000000 -5.07588375e-116 +Zero ++ ++plxsd 4_R1 => 7000000a8000004,0000000000000000 5.77662562e-275 +Zero ++ ++plxsd 8_R1 => 700000060000000,0000000000000000 5.77662407e-275 +Zero ++ ++plxsd 16_R1 => 7000000,0000000000000000 +Den +Zero ++ ++plxsd 32_R1 => 6339001963180018,0000000000000000 9.43505226e+169 +Zero ++ ++plxsd 64_R1 => 6339001963180018,0000000000000000 9.43505226e+169 +Zero ++ ++plxssp 0_R1 => 3882000000000000,0000000000000000 6.19888e-05 +Zero +Zero +Zero ++ ++plxssp 4_R1 => bd80000080000000,0000000000000000 -6.25000e-02 -Zero +Zero +Zero ++ ++plxssp 8_R1 => 38e0000000000000,0000000000000000 1.06812e-04 +Zero +Zero +Zero ++ ++plxssp 16_R1 => 38e0000000000000,0000000000000000 1.06812e-04 +Zero +Zero +Zero ++ ++plxssp 32_R1 => 445ac002c0000000,0000000000000000 8.75000e+02 -2.00000e+00 +Zero +Zero ++ ++plxssp 64_R1 => 446b400340000000,0000000000000000 9.41000e+02 2.00000e+00 +Zero +Zero ++ ++plxv 0_R1 => c800000004100000 7000000 ++ ++plxv 4_R1 => 7000000c8000004 700000000000000 ++ ++plxv 8_R1 => 7000000 7000000 ++ ++plxv 16_R1 => 7000000 7000000 ++ ++pstfd 0_R1 43dfe000003fe000 43eff000000ff000 => e000003fe00043df ++pstfd 0_R1 43eff000000ff000 43efefffffcff000 => f000000ff00043ef ++ ++pstfd 4_R1 43dfe000003fe000 43eff000000ff000 => e000003f e00043df ++pstfd 4_R1 43eff000000ff000 43efefffffcff000 => f000000f f00043ef ++ ++pstfd 8_R1 43dfe000003fe000 43eff000000ff000 => e000003fe00043df 
++pstfd 8_R1 43eff000000ff000 43efefffffcff000 => f000000ff00043ef ++ ++pstfd 16_R1 43dfe000003fe000 43eff000000ff000 => e000003fe00043df ++pstfd 16_R1 43eff000000ff000 43efefffffcff000 => f000000ff00043ef ++ ++pstfd 32_R1 43dfe000003fe000 43eff000000ff000 => e000003fe00043df ++pstfd 32_R1 43eff000000ff000 43efefffffcff000 => f000000ff00043ef ++ ++pstfs 0_R1 000000005eff0000 000000005f7f8000 => 00005eff ++pstfs 0_R1 000000005f7f8000 000000005f7f8000 => 80005f7f ++ ++pstfs 4_R1 000000005eff0000 000000005f7f8000 => 00005eff ++pstfs 4_R1 000000005f7f8000 000000005f7f8000 => 80005f7f ++ ++pstfs 8_R1 000000005eff0000 000000005f7f8000 => 00005eff ++pstfs 8_R1 000000005f7f8000 000000005f7f8000 => 80005f7f ++ ++pstfs 16_R1 000000005eff0000 000000005f7f8000 => 00005eff ++pstfs 16_R1 000000005f7f8000 000000005f7f8000 => 80005f7f ++ ++pstfs 32_R1 000000005eff0000 000000005f7f8000 => 00005eff ++pstfs 32_R1 000000005f7f8000 000000005f7f8000 => 80005f7f ++ ++pstxsd 0_R1 => 0000000000000000 ++ ++pstxsd 4_R1 => 00000000 00000000 ++ ++pstxsd 8_R1 => 0000000000000000 ++ ++pstxsd 16_R1 => 0000000000000000 ++ ++pstxsd 32_R1 => 0000000000000000 ++ ++pstxsd 64_R1 => 0000000000000000 ++ ++pstxssp 0_R1 => 00000000 ++ ++pstxssp 4_R1 => 00000000 ++ ++pstxssp 8_R1 => 00000000 ++ ++pstxssp 16_R1 => 00000000 ++ ++pstxssp 32_R1 => 00000000 ++ ++pstxssp 64_R1 => 00000000 ++ ++pstxvp off0_R1 0180055e0180077e 0080000e8080000e ff7ffffe7f7ffffe ff8000007f800000 => fffe7f7ffffeff7f 00007f800000ff80 077e0180055e0180 000e8080000e0080 ++ ++pstxvp off16_R1 0180055e0180077e 0080000e8080000e ff7ffffe7f7ffffe ff8000007f800000 => fffe7f7ffffeff7f 00007f800000ff80 077e0180055e0180 000e8080000e0080 ++ ++pstxvp off32_R1 0180055e0180077e 0080000e8080000e ff7ffffe7f7ffffe ff8000007f800000 => fffe7f7ffffeff7f 00007f800000ff80 077e0180055e0180 000e8080000e0080 ++ ++pstxvp off48_R1 0180055e0180077e 0080000e8080000e ff7ffffe7f7ffffe ff8000007f800000 => fffe7f7ffffeff7f 00007f800000ff80 077e0180055e0180 000e8080000e0080 ++ ++pstxv 0_R1 ff7ffffe7f7ffffe,ff8000007f800000 => fffe7f7ffffeff7f 00007f800000ff80 ++ ++pstxv 4_R1 ff7ffffe7f7ffffe,ff8000007f800000 => fffe7f7ffffeff7f 00007f800000ff80 ++ ++pstxv 8_R1 ff7ffffe7f7ffffe,ff8000007f800000 => fffe7f7ffffeff7f 00007f800000ff80 ++ ++pstxv 16_R1 ff7ffffe7f7ffffe,ff8000007f800000 => fffe7f7f fffeff7f00007f80 0000ff80 ++ ++All done. Tested 58 different instruction groups +diff --git a/none/tests/ppc64/test_isa_3_1_R1_XT.vgtest b/none/tests/ppc64/test_isa_3_1_R1_XT.vgtest +new file mode 100644 +index 000000000..7331aafad +--- /dev/null ++++ b/none/tests/ppc64/test_isa_3_1_R1_XT.vgtest +@@ -0,0 +1,2 @@ ++prereq: ../../../tests/check_ppc64_auxv_cap arch_3_1 ++prog: test_isa_3_1_R1_XT +diff --git a/none/tests/ppc64/test_isa_3_1_common.c b/none/tests/ppc64/test_isa_3_1_common.c +index 7c3dc6f00..b3320277b 100644 +--- a/none/tests/ppc64/test_isa_3_1_common.c ++++ b/none/tests/ppc64/test_isa_3_1_common.c +@@ -134,11 +134,13 @@ bool uses_acc_vsrs; + bool uses_pmsk; + bool uses_buffer; // Buffer related. + bool uses_load_buffer, uses_store_buffer, uses_any_buffer; ++bool updates_byte, updates_halfword, updates_word; // output helpers. + bool uses_quad; + unsigned long output_mask; // Output field special handling. 
+ bool instruction_is_sp, instruction_is_sp_estimate; + bool instruction_is_dp, instruction_is_dp_estimate; + bool instruction_is_b16; ++bool instruction_is_relative; + + unsigned long long min (unsigned long long a, unsigned long long b) { + if ( a < b ) +@@ -236,6 +238,18 @@ void identify_form_components (const char *instruction_name, + (strncmp (instruction_name, "pmst", 4) == 0) || + (strncmp (instruction_name, "pst", 3) == 0) || + (strncmp (instruction_name, "st", 2) == 0)); ++ updates_byte = ( ++ (strncmp (instruction_name, "pstb", 4) == 0) ); ++ updates_halfword = ( ++ (strncmp (instruction_name, "psth", 4) == 0) || ++ (strncmp (instruction_name, "pstfs", 4) == 0) || ++ (strncmp (instruction_name, "pstxsd", 4) == 0) || ++ (strncmp (instruction_name, "pstxssp", 4) == 0) || ++ (strncmp (instruction_name, "pstxv", 4) == 0) || ++ (strncmp (instruction_name, "psfs", 4) == 0) ); ++ updates_word = ( ++ (strncmp (instruction_name, "pstw", 4) == 0) ); ++ + uses_any_buffer = (strstr (cur_form, "(RA)") != NULL); + uses_buffer = uses_any_buffer||uses_load_buffer||uses_store_buffer; + +@@ -268,6 +282,15 @@ void identify_form_components (const char *instruction_name, + instruction_is_b16 = ( current_test.mask & B16_MASK ); + } + ++/* Parse the provided function name to set assorted values. ++ In particular, set an indicator when the instruction test has ++ indicated it will run with R==1 that indicates it is a PC-relative ++ instruction. Those tests should all have "_R1" as part of ++ the function name. */ ++void identify_instruction_by_func_name(const char * function_name) { ++ instruction_is_relative = ( (strstr (function_name, "R1") != NULL)); ++} ++ + void display_form_components (char * cur_form) { + printf (" %s\n", cur_form); + printf ("Instruction form elements: "); +@@ -288,7 +311,7 @@ void display_form_components (char * cur_form) { + if (has_frbp) printf ("frbp "); + if (has_frs) printf ("frs "); + if (has_frsp) printf ("frsp "); +- if (has_frt) printf ("frt "); ++ if (has_frt) printf ("frt%s ",(instruction_is_relative)?"-raw":""); + if (has_frtp) printf ("frtp "); + if (has_xa) printf ("xa "); + if (has_xap) printf ("xap "); +@@ -298,6 +321,7 @@ void display_form_components (char * cur_form) { + if (has_xsp) printf ("xsp "); + if (has_xt) printf ("xt "); + if (has_xtp) printf ("xtp "); ++ if (instruction_is_relative) printf ("R==1 "); + if (uses_acc_src) printf ("AS "); + if (uses_acc_dest) printf ("AT "); + printf ("\n"); +@@ -991,6 +1015,107 @@ if (debug_show_values) printf (" buffer:"); + } + } + ++/* **** Reloc Buffer **************************************** */ ++/* Create a large buffer to be the destination for pc-relative ++ * writes. This test is built with linker hints in order ++ * to ensure our buffer, stored in the .bss section, is at a ++ * mostly known offset from the instructions being exercised, ++ * so a hardcoded offset from the PC (pc-relative) will be ++ * on-target. ++ * If there are significant reworks to the code, the bss or ++ * text sections, or the offsets used may need to change. ++ * ++ * The linker hints are specifically -Tbss and -Ttext. ++ * gcc foo.c test_isa_3_1_common.c -I../../../ -Wl,-Tbss 0x20000 -Wl,-Ttext 0x40000 ++ */ ++ /* RELOC_BUFFER_SIZE is defined to 0x1000 in isa_3_1_helpers.h */ ++#define RELOC_BUFFER_PATTERN 0x0001000100010001 ++volatile unsigned long long pcrelative_write_target[RELOC_BUFFER_SIZE]; ++ ++/* Initialize the buffer to known values. 
*/
++void init_pcrelative_write_target() {
++   int i;
++   for (i=0;i<RELOC_BUFFER_SIZE;i++)
++      pcrelative_write_target[i] = RELOC_BUFFER_PATTERN;
++}
++
++/* Print the contents of the pcrelative_write_target buffer, showing
++   only the tokens that changed from the initialized pattern. */
++void print_pcrelative_write_target () {
++   int i,z;
++   int rshift;
++   unsigned long long ref_value, curr_value;
++   unsigned long long init_token, curr_token;
++   ref_value = RELOC_BUFFER_PATTERN;
++   for (i=0;i<RELOC_BUFFER_SIZE;i++) {
++      curr_value = pcrelative_write_target[i];
++      if (curr_value != ref_value) {
++         if (verbose) printf("i:%d %llx -> %llx\n",i,ref_value,curr_value);
++         if (updates_byte) {
++            for (z=0;z<8;z++) {
++               rshift=z*8;
++               if (verbose) printf("z:%d ",z);
++               init_token = (ref_value>>rshift) & 0xff;
++               curr_token = (curr_value>>rshift) & 0xff;
++               if (verbose)
++                  printf("wms byte:: %llx -> %llx \n",init_token,curr_token);
++               if (init_token == curr_token && (updates_byte||updates_halfword||updates_word) ) {
++                  printf("%2s"," ");
++               } else {
++                  printf("%02llx",curr_token);
++               }
++            }
++         }
++         else if (updates_halfword) {
++            for (z=0;z<4;z++) {
++               rshift=z*16;
++               if (verbose) printf("z:%d ",z);
++               init_token = (ref_value>>rshift) & 0xffff;
++               curr_token = (curr_value>>rshift) & 0xffff;
++               if (verbose)
++                  printf("wms half:: %llx -> %llx \n",init_token,curr_token);
++               if (init_token == curr_token) {
++                  printf("%2s"," ");
++               } else {
++                  printf("%04llx",curr_token);
++               }
++            }
++         }
++         else if (updates_word) {
++            for (z=0;z<2;z++) {
++               rshift=z*32;
++               if (verbose) printf("z:%d ",z);
++               init_token = (ref_value>>rshift) & 0xffffffff;
++               curr_token = (curr_value>>rshift) & 0xffffffff;
++               if (verbose)
++                  printf("wms word:: %llx -> %llx \n",init_token,curr_token);
++               if (init_token == curr_token ) {
++                  printf("%2s"," ");
++               } else {
++                  printf("%08llx",curr_token);
++               }
++            }
++         }
++         else {
++            printf("%016llx ",curr_value);
++         }
++      }
++   }
++}
++
++/* Helper that returns the address of the pcrelative_write_target buffer.
++   Due to variances in where the sections land in memory, this value is
++   used to normalize the results. (see paddi tests for usage). */
++unsigned long long pcrelative_buff_addr(int x) {
++   /* Return the base address of the array.  The base address will be
++      a function of the code load address. */
++   return (unsigned long long) &pcrelative_write_target[x];
++}
++
+ void print_undefined () {
+    if (debug_show_values)
+       printf (" [Undef]");
+@@ -1339,7 +1464,7 @@ void print_frt () {
+    /* If the result is a dfp128 value, the dfp128 value is
+       contained in the frt, frtp values which are split across
+       a pair of VSRs. */
+-   if (uses_dfp128_output) {
++   if (!instruction_is_relative && uses_dfp128_output) {
+       if (verbose) print_vsr (28);
+       if (verbose) print_vsr (29);
+       value1 = get_vsrhd_vs28 ();
+@@ -1347,7 +1472,12 @@
+       dissect_dfp128_float (value1, value3);
+    } else {
+       if (debug_show_raw_values) generic_print_float_as_hex (frt);
+-      printf (" %e", frt);
++      if (instruction_is_relative) {
++         printf ("_ %e _ ", frt);
++         print_vsr (28);
++      } else {
++         printf (" %e", frt);
++      }
+       if (has_frtp) {
+          if (debug_show_raw_values) generic_print_float_as_hex (frtp);
+          printf (" %e", frtp);
+@@ -1652,7 +1782,15 @@ void print_all() {
+ void print_register_header () {
+    post_test = 0;
+    if (debug_show_all_regs) print_all();
+-   if (has_ra) print_ra ();
++
++   if (has_ra) {
++      /* Suppress the print of RA if the instruction has
++         R==1, since the ra value must be zero for the
++         instruction to be valid. */
++      if (!instruction_is_relative)
++         print_ra();
++   }
++
+    if (has_rb) print_rb ();
+    if (has_rc) print_rc ();
+    if (has_rs) print_rs();
+@@ -1894,6 +2032,11 @@ void set_up_iterators () {
+    } else {
+       a_start=0; b_start=0; c_start=0; m_start=0;
+    }
++   /* Special casing for R==1 tests. */
++   if (instruction_is_relative) {
++      a_iters = 1;
++      m_start=3; m_iters=4;
++   }
+    if ((has_vra+has_vrb+has_vrc+has_vrm+has_xa+has_xb+uses_MC > 2) &&
+        (!debug_enable_all_iters)) {
+       /* Instruction tests using multiple fields will generate a lot of
+@@ -2196,15 +2339,12 @@ void initialize_source_registers () {
+       vrb[0] = vsxargs[ (vrbi  ) % isr_modulo];
+       vrb[1] = vsxargs[ (vrbi+1) % isr_modulo];
+    }
+-
+-   if (has_xa) {
+-      vec_xa[0] = vsxargs[ (vrai  ) % isr_modulo];
+-      vec_xa[1] = vsxargs[ (vrai+1) % isr_modulo];
+-   }
+-   if (has_xb) {
+-      vec_xb[0] = vsxargs[ (vrbi  ) % isr_modulo];
+-      vec_xb[1] = vsxargs[ (vrbi+1) % isr_modulo];
+-   }
++
++   if (instruction_is_relative) {
++      /* for pstxsd and friends using R=1 */
++      vec_xa[0] = vsxargs[ (vrai+2 ) % isr_modulo];
++      vec_xa[1] = vsxargs[ (vrai+3 ) % isr_modulo];
++   }
+
+    // xap 'shares' with the second half of an xa-pair.
+    if (has_xap ) {
diff --git a/SOURCES/valgrind-3.18.1-ppc-pstq.patch b/SOURCES/valgrind-3.18.1-ppc-pstq.patch
new file mode 100644
index 0000000..2e23d18
--- /dev/null
+++ b/SOURCES/valgrind-3.18.1-ppc-pstq.patch
@@ -0,0 +1,47 @@
+commit ae8c6de01417023e78763de145b1c0e6ddd87277
+Author: Carl Love
+Date: Wed Oct 20 20:40:13 2021 +0000
+
+    Fix for the prefixed stq instruction in PC relative mode.
+
+    The pstq instruction for R=1 was not using the correct effective address.
+    The EA_hi and EA_lo should have been based on the value of EA as calculated
+    by the function calculate_prefix_EA.  Unfortunately, the EA_hi and EA_lo
+    addresses were still using the previous code (not PC relative) to calculate
+    the address from the contents of RA plus the offset.
+
+diff --git a/VEX/priv/guest_ppc_toIR.c b/VEX/priv/guest_ppc_toIR.c
+index 8afd77490..543fa9574 100644
+--- a/VEX/priv/guest_ppc_toIR.c
++++ b/VEX/priv/guest_ppc_toIR.c
+@@ -9838,23 +9838,24 @@ static Bool dis_int_store_ds_prefix ( UInt prefix,
+       if (host_endness == VexEndnessBE) {
+
+          /* upper 64-bits */
+-         assign( EA_hi, ea_rAor0_simm( rA_addr, immediate_val ) );
++         assign( EA_hi, mkexpr(EA));
+
+          /* lower 64-bits */
+-         assign( EA_lo, ea_rAor0_simm( rA_addr, immediate_val+8 ) );
++         assign( EA_lo, binop(Iop_Add64, mkexpr(EA), mkU64(8)));
++
+       } else {
+          /* upper 64-bits */
+-         assign( EA_hi, ea_rAor0_simm( rA_addr, immediate_val+8 ) );
++         assign( EA_hi, binop(Iop_Add64, mkexpr(EA), mkU64(8)));
+
+          /* lower 64-bits */
+-         assign( EA_lo, ea_rAor0_simm( rA_addr, immediate_val ) );
++         assign( EA_lo, mkexpr(EA));
+       }
+    } else {
+       /* upper half of upper 64-bits */
+-      assign( EA_hi, ea_rAor0_simm( rA_addr, immediate_val+4 ) );
++      assign( EA_hi, binop(Iop_Add32, mkexpr(EA), mkU32(4)));
+
+       /* lower half of upper 64-bits */
+-      assign( EA_lo, ea_rAor0_simm( rA_addr, immediate_val+12 ) );
++      assign( EA_lo, binop(Iop_Add32, mkexpr(EA), mkU32(12)));
+    }
+
+    /* Note, the store order for stq instruction is the same for BE
diff --git a/SOURCES/valgrind-3.18.1-ppc64-lxsibzx-lxsihzx.patch b/SOURCES/valgrind-3.18.1-ppc64-lxsibzx-lxsihzx.patch
new file mode 100644
index 0000000..bb36c80
--- /dev/null
+++ b/SOURCES/valgrind-3.18.1-ppc64-lxsibzx-lxsihzx.patch
@@ -0,0 +1,60 @@
+commit 6e08ee95f7f1b1c3fd434fa380cc5b2cc3e3f7c7
+Author: Carl Love
+Date: Fri Oct 29 16:30:33 2021 -0500
+
+    Bug 444571 - PPC, fix the lxsibzx and lxsihzx so they only load their respective sized data.
+
+    The lxsibzx was doing a 64-bit load.  The result was initializing
+    additional bytes in the register that should not have been initialized.
+    The memcheck/tests/linux/dlclose_leak test detected the issue.
The + code generation uses lxsibzx and stxsibx with -mcpu=power9. Previously + the lbz and stb instructions were generated. + + The same issue was noted and fixed with the lxsihzx instruction. The + memcheck/tests/linux/badrw test now passes as well. + + https://bugs.kde.org/show_bug.cgi?id=444571 + +diff --git a/VEX/priv/guest_ppc_toIR.c b/VEX/priv/guest_ppc_toIR.c +index d90d566ed..8afd77490 100644 +--- a/VEX/priv/guest_ppc_toIR.c ++++ b/VEX/priv/guest_ppc_toIR.c +@@ -25359,19 +25359,17 @@ dis_vx_load ( UInt prefix, UInt theInstr ) + + else + irx_addr = mkexpr( EA ); +- +- byte = load( Ity_I64, irx_addr ); ++ /* byte load */ ++ byte = load( Ity_I8, irx_addr ); + putVSReg( XT, binop( Iop_64HLtoV128, +- binop( Iop_And64, +- byte, +- mkU64( 0xFF ) ), ++ unop( Iop_8Uto64, byte ), + mkU64( 0 ) ) ); + break; + } + + case 0x32D: // lxsihzx + { +- IRExpr *byte; ++ IRExpr *hword; + IRExpr* irx_addr; + + DIP("lxsihzx %u,r%u,r%u\n", (UInt)XT, rA_addr, rB_addr); +@@ -25382,11 +25380,10 @@ dis_vx_load ( UInt prefix, UInt theInstr ) + else + irx_addr = mkexpr( EA ); + +- byte = load( Ity_I64, irx_addr ); ++ hword = load( Ity_I16, irx_addr ); + putVSReg( XT, binop( Iop_64HLtoV128, +- binop( Iop_And64, +- byte, +- mkU64( 0xFFFF ) ), ++ unop( Iop_16Uto64, ++ hword ), + mkU64( 0 ) ) ); + break; + } diff --git a/SOURCES/valgrind-3.18.1-rust-v0-demangle.patch b/SOURCES/valgrind-3.18.1-rust-v0-demangle.patch new file mode 100644 index 0000000..e48a106 --- /dev/null +++ b/SOURCES/valgrind-3.18.1-rust-v0-demangle.patch @@ -0,0 +1,137 @@ +commit 4831385c6706b377851284adc4c4545fff4c6564 +Author: Nicholas Nethercote +Date: Tue Nov 9 12:30:07 2021 +1100 + + Fix Rust v0 demangling. + + It's currently broken due to a silly test that prevents the v0 + demangling code from even running. + + The commit also adds a test, to avoid such problems in the future. + +diff --git a/coregrind/m_demangle/demangle.c b/coregrind/m_demangle/demangle.c +index 16161da2a..3fd7cb75f 100644 +--- a/coregrind/m_demangle/demangle.c ++++ b/coregrind/m_demangle/demangle.c +@@ -118,8 +118,13 @@ void VG_(demangle) ( Bool do_cxx_demangling, Bool do_z_demangling, + } + + /* Possibly undo (1) */ ++ // - C++ mangled symbols start with "_Z" (possibly with exceptions?) ++ // - Rust "legacy" mangled symbols start with "_Z". ++ // - Rust "v0" mangled symbols start with "_R". ++ // XXX: the Java/Rust/Ada demangling here probably doesn't work. See ++ // https://bugs.kde.org/show_bug.cgi?id=445235 for details. + if (do_cxx_demangling && VG_(clo_demangle) +- && orig != NULL && orig[0] == '_' && orig[1] == 'Z') { ++ && orig != NULL && orig[0] == '_' && (orig[1] == 'Z' || orig[1] == 'R')) { + /* !!! vvv STATIC vvv !!! */ + static HChar* demangled = NULL; + /* !!! ^^^ STATIC ^^^ !!! */ +diff --git a/memcheck/tests/demangle-rust.c b/memcheck/tests/demangle-rust.c +new file mode 100644 +index 000000000..f2a458b2a +--- /dev/null ++++ b/memcheck/tests/demangle-rust.c +@@ -0,0 +1,31 @@ ++// Valgrind supports demangling Rust symbols (both the "v0" and "legacy" ++// mangling schemes), but we don't want to add a dependency on the Rust ++// compiler for a single test. So this is a C program with function names that ++// are mangled Rust symbols. In the output, they become demangled Rust names. ++// It's a hack, but a useful one. 
++
++#include <stdlib.h>
++
++// A v0 symbol that demangles to: <rustc_middle::ty::PredicateKind as rustc_middle::ty::fold::TypeFoldable>::fold_with::<rustc_infer::infer::resolve::OpportunisticVarResolver>
++int _RINvYNtNtCs4uGc65yWeeX_12rustc_middle2ty13PredicateKindNtNtB5_4fold12TypeFoldable9fold_withNtNtNtCsgI90OQiJWEs_11rustc_infer5infer7resolve24OpportunisticVarResolverECsdozMG8X9FIu_21rustc_trait_selection(int *p)
++{
++    return *p ? 1 : 2;
++}
++
++// A v0 symbol that demangles to: rustc_expand::mbe::macro_parser::parse_tt
++int _RNvNtNtCsaqSe1lZGvEL_12rustc_expand3mbe12macro_parser8parse_tt(int* p)
++{
++    return _RINvYNtNtCs4uGc65yWeeX_12rustc_middle2ty13PredicateKindNtNtB5_4fold12TypeFoldable9fold_withNtNtNtCsgI90OQiJWEs_11rustc_infer5infer7resolve24OpportunisticVarResolverECsdozMG8X9FIu_21rustc_trait_selection(p);
++}
++
++// A legacy symbol that demangles to: core::str::lossy::Utf8Lossy::from_bytes
++int _ZN4core3str5lossy9Utf8Lossy10from_bytes17heb1677c8cb728b0bE(int* p)
++{
++    return _RNvNtNtCsaqSe1lZGvEL_12rustc_expand3mbe12macro_parser8parse_tt(p);
++}
++
++int main(void)
++{
++    return _ZN4core3str5lossy9Utf8Lossy10from_bytes17heb1677c8cb728b0bE(malloc(sizeof(int)));
++}
++
+diff --git a/memcheck/tests/demangle-rust.stderr.exp b/memcheck/tests/demangle-rust.stderr.exp
+new file mode 100644
+index 000000000..f04bb625b
+--- /dev/null
++++ b/memcheck/tests/demangle-rust.stderr.exp
+@@ -0,0 +1,6 @@
++Conditional jump or move depends on uninitialised value(s)
++   at 0x........: <rustc_middle::ty::PredicateKind as rustc_middle::ty::fold::TypeFoldable>::fold_with::<rustc_infer::infer::resolve::OpportunisticVarResolver> (demangle-rust.c:12)
++   by 0x........: rustc_expand::mbe::macro_parser::parse_tt (demangle-rust.c:18)
++   by 0x........: core::str::lossy::Utf8Lossy::from_bytes (demangle-rust.c:24)
++   by 0x........: main (demangle-rust.c:29)
++
+diff --git a/memcheck/tests/demangle-rust.vgtest b/memcheck/tests/demangle-rust.vgtest
+new file mode 100644
+index 000000000..d726c6b2e
+--- /dev/null
++++ b/memcheck/tests/demangle-rust.vgtest
+@@ -0,0 +1,2 @@
++prog: demangle-rust
++vgopts: -q
+
+commit c1bfa115f985633722f25922d2996c231e8c9d8d
+Author: Mark Wielaard
+Date: Wed Nov 10 09:02:36 2021 +0100
+
+    Add demangle-rust.vgtest demangle-rust.stderr.exp to EXTRA_DIST
+
+diff --git a/memcheck/tests/Makefile.am b/memcheck/tests/Makefile.am
+index 4d0476e2d..7837d87c7 100644
+--- a/memcheck/tests/Makefile.am
++++ b/memcheck/tests/Makefile.am
+@@ -281,6 +281,7 @@ EXTRA_DIST = \
+ 	realloc3.stderr.exp realloc3.vgtest \
+ 	recursive-merge.stderr.exp recursive-merge.vgtest \
+ 	resvn_stack.stderr.exp resvn_stack.vgtest \
++	demangle-rust.vgtest demangle-rust.stderr.exp \
+ 	sbfragment.stdout.exp sbfragment.stderr.exp sbfragment.vgtest \
+ 	sem.stderr.exp sem.vgtest \
+ 	sendmsg.stderr.exp sendmsg.stderr.exp-solaris sendmsg.vgtest \
+
+commit d151907e5d8ff393f4fef126c8ae445ea8813661
+Author: Mark Wielaard
+Date: Thu Nov 11 18:02:09 2021 +0100
+
+    Add demangle-rust to check_PROGRAMS
+
+    The demangle-rust.vgtest would fail because the demangle-rust binary
+    wasn't built by default.  Add it to check_PROGRAMS and define
+    demangle_rust_SOURCES to make sure it is always built.
+
+ +diff --git a/memcheck/tests/Makefile.am b/memcheck/tests/Makefile.am +index 7837d87c7..449710020 100644 +--- a/memcheck/tests/Makefile.am ++++ b/memcheck/tests/Makefile.am +@@ -392,6 +392,7 @@ check_PROGRAMS = \ + custom_alloc \ + custom-overlap \ + demangle \ ++ demangle-rust \ + big_debuginfo_symbol \ + deep-backtrace \ + describe-block \ +@@ -505,6 +506,7 @@ endif + leak_cpp_interior_SOURCES = leak_cpp_interior.cpp + + demangle_SOURCES = demangle.cpp ++demangle_rust_SOURCES = demangle-rust.c + + # Suppress various gcc warnings which are correct, but for things + # we are actually testing for at runtime. diff --git a/SOURCES/valgrind-3.18.1-s390x-EXRL.patch b/SOURCES/valgrind-3.18.1-s390x-EXRL.patch new file mode 100644 index 0000000..6927cc3 --- /dev/null +++ b/SOURCES/valgrind-3.18.1-s390x-EXRL.patch @@ -0,0 +1,549 @@ +commit b77dbefe72e4a5c7bcf1576a02c909010bd56991 +Author: Andreas Arnez +Date: Fri Oct 22 19:55:12 2021 +0200 + + Bug 444242 - s390x: Sign-extend "relative long" offset in EXRL + + In s390_irgen_EXRL, the offset is zero-extended instead of sign-extended, + typically causing Valgrind to crash when a negative offset occurs. + + Fix this with a new helper function that calculates a "relative long" + address from a 32-bit offset. Replace other calculations of "relative + long" addresses by invocations of this function as well. And for + consistency, do the same with "relative" (short) addresses. + +diff --git a/VEX/priv/guest_s390_toIR.c b/VEX/priv/guest_s390_toIR.c +index 72222ab04..fffc563d4 100644 +--- a/VEX/priv/guest_s390_toIR.c ++++ b/VEX/priv/guest_s390_toIR.c +@@ -399,6 +399,22 @@ mkF64i(ULong value) + return IRExpr_Const(IRConst_F64i(value)); + } + ++/* Return the 64-bit address with the given 32-bit "relative long" offset from ++ the current guest instruction being translated. */ ++static __inline__ Addr64 ++addr_rel_long(UInt offset) ++{ ++ return guest_IA_curr_instr + ((Addr64)(Long)(Int)offset << 1); ++} ++ ++/* Return the 64-bit address with the given 16-bit "relative" offset from the ++ current guest instruction being translated. */ ++static __inline__ Addr64 ++addr_relative(UShort offset) ++{ ++ return guest_IA_curr_instr + ((Addr64)(Long)(Short)offset << 1); ++} ++ + /* Little helper function for my sanity. 
ITE = if-then-else */ + static IRExpr * + mkite(IRExpr *condition, IRExpr *iftrue, IRExpr *iffalse) +@@ -5516,7 +5532,7 @@ static const HChar * + s390_irgen_BRAS(UChar r1, UShort i2) + { + put_gpr_dw0(r1, mkU64(guest_IA_curr_instr + 4ULL)); +- call_function_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); ++ call_function_and_chase(addr_relative(i2)); + + return "bras"; + } +@@ -5525,7 +5541,7 @@ static const HChar * + s390_irgen_BRASL(UChar r1, UInt i2) + { + put_gpr_dw0(r1, mkU64(guest_IA_curr_instr + 6ULL)); +- call_function_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Int)i2 << 1)); ++ call_function_and_chase(addr_rel_long(i2)); + + return "brasl"; + } +@@ -5538,12 +5554,11 @@ s390_irgen_BRC(UChar r1, UShort i2) + if (r1 == 0) { + } else { + if (r1 == 15) { +- always_goto_and_chase( +- guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); ++ always_goto_and_chase(addr_relative(i2)); + } else { + assign(cond, s390_call_calculate_cond(r1)); + if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), +- guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); ++ addr_relative(i2)); + + } + } +@@ -5561,11 +5576,11 @@ s390_irgen_BRCL(UChar r1, UInt i2) + if (r1 == 0) { + } else { + if (r1 == 15) { +- always_goto_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Int)i2 << 1)); ++ always_goto_and_chase(addr_rel_long(i2)); + } else { + assign(cond, s390_call_calculate_cond(r1)); + if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), +- guest_IA_curr_instr + ((ULong)(Long)(Int)i2 << 1)); ++ addr_rel_long(i2)); + } + } + if (UNLIKELY(vex_traceflags & VEX_TRACE_FE)) +@@ -5579,7 +5594,7 @@ s390_irgen_BRCT(UChar r1, UShort i2) + { + put_gpr_w1(r1, binop(Iop_Sub32, get_gpr_w1(r1), mkU32(1))); + if_condition_goto(binop(Iop_CmpNE32, get_gpr_w1(r1), mkU32(0)), +- guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); ++ addr_relative(i2)); + + return "brct"; + } +@@ -5589,7 +5604,7 @@ s390_irgen_BRCTH(UChar r1, UInt i2) + { + put_gpr_w0(r1, binop(Iop_Sub32, get_gpr_w0(r1), mkU32(1))); + if_condition_goto(binop(Iop_CmpNE32, get_gpr_w0(r1), mkU32(0)), +- guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); ++ addr_relative(i2)); + + return "brcth"; + } +@@ -5599,7 +5614,7 @@ s390_irgen_BRCTG(UChar r1, UShort i2) + { + put_gpr_dw0(r1, binop(Iop_Sub64, get_gpr_dw0(r1), mkU64(1))); + if_condition_goto(binop(Iop_CmpNE64, get_gpr_dw0(r1), mkU64(0)), +- guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); ++ addr_relative(i2)); + + return "brctg"; + } +@@ -5612,7 +5627,7 @@ s390_irgen_BRXH(UChar r1, UChar r3, UShort i2) + assign(value, get_gpr_w1(r3 | 1)); + put_gpr_w1(r1, binop(Iop_Add32, get_gpr_w1(r1), get_gpr_w1(r3))); + if_condition_goto(binop(Iop_CmpLT32S, mkexpr(value), get_gpr_w1(r1)), +- guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); ++ addr_relative(i2)); + + return "brxh"; + } +@@ -5625,7 +5640,7 @@ s390_irgen_BRXHG(UChar r1, UChar r3, UShort i2) + assign(value, get_gpr_dw0(r3 | 1)); + put_gpr_dw0(r1, binop(Iop_Add64, get_gpr_dw0(r1), get_gpr_dw0(r3))); + if_condition_goto(binop(Iop_CmpLT64S, mkexpr(value), get_gpr_dw0(r1)), +- guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); ++ addr_relative(i2)); + + return "brxhg"; + } +@@ -5638,7 +5653,7 @@ s390_irgen_BRXLE(UChar r1, UChar r3, UShort i2) + assign(value, get_gpr_w1(r3 | 1)); + put_gpr_w1(r1, binop(Iop_Add32, get_gpr_w1(r1), get_gpr_w1(r3))); + if_condition_goto(binop(Iop_CmpLE32S, get_gpr_w1(r1), mkexpr(value)), +- guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); ++ addr_relative(i2)); + + return "brxle"; 
+ } +@@ -5651,7 +5666,7 @@ s390_irgen_BRXLG(UChar r1, UChar r3, UShort i2) + assign(value, get_gpr_dw0(r3 | 1)); + put_gpr_dw0(r1, binop(Iop_Add64, get_gpr_dw0(r1), get_gpr_dw0(r3))); + if_condition_goto(binop(Iop_CmpLE64S, get_gpr_dw0(r1), mkexpr(value)), +- guest_IA_curr_instr + ((ULong)(Long)(Short)i2 << 1)); ++ addr_relative(i2)); + + return "brxlg"; + } +@@ -5782,8 +5797,7 @@ s390_irgen_CRL(UChar r1, UInt i2) + IRTemp op2 = newTemp(Ity_I32); + + assign(op1, get_gpr_w1(r1)); +- assign(op2, load(Ity_I32, mkU64(guest_IA_curr_instr + ((ULong)(Long)(Int) +- i2 << 1)))); ++ assign(op2, load(Ity_I32, mkU64(addr_rel_long(i2)))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + + return "crl"; +@@ -5796,8 +5810,7 @@ s390_irgen_CGRL(UChar r1, UInt i2) + IRTemp op2 = newTemp(Ity_I64); + + assign(op1, get_gpr_dw0(r1)); +- assign(op2, load(Ity_I64, mkU64(guest_IA_curr_instr + ((ULong)(Long)(Int) +- i2 << 1)))); ++ assign(op2, load(Ity_I64, mkU64(addr_rel_long(i2)))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + + return "cgrl"; +@@ -5810,8 +5823,7 @@ s390_irgen_CGFRL(UChar r1, UInt i2) + IRTemp op2 = newTemp(Ity_I64); + + assign(op1, get_gpr_dw0(r1)); +- assign(op2, unop(Iop_32Sto64, load(Ity_I32, mkU64(guest_IA_curr_instr + +- ((ULong)(Long)(Int)i2 << 1))))); ++ assign(op2, unop(Iop_32Sto64, load(Ity_I32, mkU64(addr_rel_long(i2))))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + + return "cgfrl"; +@@ -5875,15 +5887,14 @@ s390_irgen_CRJ(UChar r1, UChar r2, UShort i4, UChar m3) + if (m3 == 0) { + } else { + if (m3 == 14) { +- always_goto_and_chase( +- guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); ++ always_goto_and_chase(addr_relative(i4)); + } else { + assign(op1, get_gpr_w1(r1)); + assign(op2, get_gpr_w1(r2)); + assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, + op1, op2)); + if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), +- guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); ++ addr_relative(i4)); + + } + } +@@ -5901,15 +5912,14 @@ s390_irgen_CGRJ(UChar r1, UChar r2, UShort i4, UChar m3) + if (m3 == 0) { + } else { + if (m3 == 14) { +- always_goto_and_chase( +- guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); ++ always_goto_and_chase(addr_relative(i4)); + } else { + assign(op1, get_gpr_dw0(r1)); + assign(op2, get_gpr_dw0(r2)); + assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, + op1, op2)); + if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), +- guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); ++ addr_relative(i4)); + + } + } +@@ -5975,14 +5985,14 @@ s390_irgen_CIJ(UChar r1, UChar m3, UShort i4, UChar i2) + if (m3 == 0) { + } else { + if (m3 == 14) { +- always_goto_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); ++ always_goto_and_chase(addr_relative(i4)); + } else { + assign(op1, get_gpr_w1(r1)); + op2 = (Int)(Char)i2; + assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_SIGNED_COMPARE, op1, + mktemp(Ity_I32, mkU32((UInt)op2)))); + if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), +- guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); ++ addr_relative(i4)); + + } + } +@@ -6000,14 +6010,14 @@ s390_irgen_CGIJ(UChar r1, UChar m3, UShort i4, UChar i2) + if (m3 == 0) { + } else { + if (m3 == 14) { +- always_goto_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); ++ always_goto_and_chase(addr_relative(i4)); + } else { + assign(op1, get_gpr_dw0(r1)); + op2 = (Long)(Char)i2; + assign(cond, s390_call_calculate_icc(m3, 
S390_CC_OP_SIGNED_COMPARE, op1, + mktemp(Ity_I64, mkU64((ULong)op2)))); + if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), +- guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); ++ addr_relative(i4)); + + } + } +@@ -6131,8 +6141,7 @@ s390_irgen_CHRL(UChar r1, UInt i2) + IRTemp op2 = newTemp(Ity_I32); + + assign(op1, get_gpr_w1(r1)); +- assign(op2, unop(Iop_16Sto32, load(Ity_I16, mkU64(guest_IA_curr_instr + +- ((ULong)(Long)(Int)i2 << 1))))); ++ assign(op2, unop(Iop_16Sto32, load(Ity_I16, mkU64(addr_rel_long(i2))))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + + return "chrl"; +@@ -6145,8 +6154,7 @@ s390_irgen_CGHRL(UChar r1, UInt i2) + IRTemp op2 = newTemp(Ity_I64); + + assign(op1, get_gpr_dw0(r1)); +- assign(op2, unop(Iop_16Sto64, load(Ity_I16, mkU64(guest_IA_curr_instr + +- ((ULong)(Long)(Int)i2 << 1))))); ++ assign(op2, unop(Iop_16Sto64, load(Ity_I16, mkU64(addr_rel_long(i2))))); + s390_cc_thunk_putSS(S390_CC_OP_SIGNED_COMPARE, op1, op2); + + return "cghrl"; +@@ -6401,8 +6409,7 @@ s390_irgen_CLRL(UChar r1, UInt i2) + IRTemp op2 = newTemp(Ity_I32); + + assign(op1, get_gpr_w1(r1)); +- assign(op2, load(Ity_I32, mkU64(guest_IA_curr_instr + ((ULong)(Long)(Int) +- i2 << 1)))); ++ assign(op2, load(Ity_I32, mkU64(addr_rel_long(i2)))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + + return "clrl"; +@@ -6415,8 +6422,7 @@ s390_irgen_CLGRL(UChar r1, UInt i2) + IRTemp op2 = newTemp(Ity_I64); + + assign(op1, get_gpr_dw0(r1)); +- assign(op2, load(Ity_I64, mkU64(guest_IA_curr_instr + ((ULong)(Long)(Int) +- i2 << 1)))); ++ assign(op2, load(Ity_I64, mkU64(addr_rel_long(i2)))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + + return "clgrl"; +@@ -6429,8 +6435,7 @@ s390_irgen_CLGFRL(UChar r1, UInt i2) + IRTemp op2 = newTemp(Ity_I64); + + assign(op1, get_gpr_dw0(r1)); +- assign(op2, unop(Iop_32Uto64, load(Ity_I32, mkU64(guest_IA_curr_instr + +- ((ULong)(Long)(Int)i2 << 1))))); ++ assign(op2, unop(Iop_32Uto64, load(Ity_I32, mkU64(addr_rel_long(i2))))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + + return "clgfrl"; +@@ -6443,8 +6448,7 @@ s390_irgen_CLHRL(UChar r1, UInt i2) + IRTemp op2 = newTemp(Ity_I32); + + assign(op1, get_gpr_w1(r1)); +- assign(op2, unop(Iop_16Uto32, load(Ity_I16, mkU64(guest_IA_curr_instr + +- ((ULong)(Long)(Int)i2 << 1))))); ++ assign(op2, unop(Iop_16Uto32, load(Ity_I16, mkU64(addr_rel_long(i2))))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + + return "clhrl"; +@@ -6457,8 +6461,7 @@ s390_irgen_CLGHRL(UChar r1, UInt i2) + IRTemp op2 = newTemp(Ity_I64); + + assign(op1, get_gpr_dw0(r1)); +- assign(op2, unop(Iop_16Uto64, load(Ity_I16, mkU64(guest_IA_curr_instr + +- ((ULong)(Long)(Int)i2 << 1))))); ++ assign(op2, unop(Iop_16Uto64, load(Ity_I16, mkU64(addr_rel_long(i2))))); + s390_cc_thunk_putZZ(S390_CC_OP_UNSIGNED_COMPARE, op1, op2); + + return "clghrl"; +@@ -6730,14 +6733,14 @@ s390_irgen_CLRJ(UChar r1, UChar r2, UShort i4, UChar m3) + if (m3 == 0) { + } else { + if (m3 == 14) { +- always_goto_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); ++ always_goto_and_chase(addr_relative(i4)); + } else { + assign(op1, get_gpr_w1(r1)); + assign(op2, get_gpr_w1(r2)); + assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_UNSIGNED_COMPARE, + op1, op2)); + if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), +- guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); ++ addr_relative(i4)); + + } + } +@@ -6755,14 +6758,14 @@ s390_irgen_CLGRJ(UChar r1, UChar r2, UShort i4, UChar 
m3) + if (m3 == 0) { + } else { + if (m3 == 14) { +- always_goto_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); ++ always_goto_and_chase(addr_relative(i4)); + } else { + assign(op1, get_gpr_dw0(r1)); + assign(op2, get_gpr_dw0(r2)); + assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_UNSIGNED_COMPARE, + op1, op2)); + if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), +- guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); ++ addr_relative(i4)); + + } + } +@@ -6828,14 +6831,14 @@ s390_irgen_CLIJ(UChar r1, UChar m3, UShort i4, UChar i2) + if (m3 == 0) { + } else { + if (m3 == 14) { +- always_goto_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); ++ always_goto_and_chase(addr_relative(i4)); + } else { + assign(op1, get_gpr_w1(r1)); + op2 = (UInt)i2; + assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_UNSIGNED_COMPARE, op1, + mktemp(Ity_I32, mkU32(op2)))); + if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), +- guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); ++ addr_relative(i4)); + + } + } +@@ -6853,14 +6856,14 @@ s390_irgen_CLGIJ(UChar r1, UChar m3, UShort i4, UChar i2) + if (m3 == 0) { + } else { + if (m3 == 14) { +- always_goto_and_chase(guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); ++ always_goto_and_chase(addr_relative(i4)); + } else { + assign(op1, get_gpr_dw0(r1)); + op2 = (ULong)i2; + assign(cond, s390_call_calculate_icc(m3, S390_CC_OP_UNSIGNED_COMPARE, op1, + mktemp(Ity_I64, mkU64(op2)))); + if_condition_goto(binop(Iop_CmpNE32, mkexpr(cond), mkU32(0)), +- guest_IA_curr_instr + ((ULong)(Long)(Short)i4 << 1)); ++ addr_relative(i4)); + + } + } +@@ -7539,8 +7542,7 @@ s390_irgen_LGFI(UChar r1, UInt i2) + static const HChar * + s390_irgen_LRL(UChar r1, UInt i2) + { +- put_gpr_w1(r1, load(Ity_I32, mkU64(guest_IA_curr_instr + ((ULong)(Long)(Int) +- i2 << 1)))); ++ put_gpr_w1(r1, load(Ity_I32, mkU64(addr_rel_long(i2)))); + + return "lrl"; + } +@@ -7548,8 +7550,7 @@ s390_irgen_LRL(UChar r1, UInt i2) + static const HChar * + s390_irgen_LGRL(UChar r1, UInt i2) + { +- put_gpr_dw0(r1, load(Ity_I64, mkU64(guest_IA_curr_instr + ((ULong)(Long)(Int) +- i2 << 1)))); ++ put_gpr_dw0(r1, load(Ity_I64, mkU64(addr_rel_long(i2)))); + + return "lgrl"; + } +@@ -7557,8 +7558,7 @@ s390_irgen_LGRL(UChar r1, UInt i2) + static const HChar * + s390_irgen_LGFRL(UChar r1, UInt i2) + { +- put_gpr_dw0(r1, unop(Iop_32Sto64, load(Ity_I32, mkU64(guest_IA_curr_instr + +- ((ULong)(Long)(Int)i2 << 1))))); ++ put_gpr_dw0(r1, unop(Iop_32Sto64, load(Ity_I32, mkU64(addr_rel_long(i2))))); + + return "lgfrl"; + } +@@ -7598,7 +7598,7 @@ s390_irgen_LAEY(UChar r1, IRTemp op2addr) + static const HChar * + s390_irgen_LARL(UChar r1, UInt i2) + { +- put_gpr_dw0(r1, mkU64(guest_IA_curr_instr + ((ULong)(Long)(Int)i2 << 1))); ++ put_gpr_dw0(r1, mkU64(addr_rel_long(i2))); + + return "larl"; + } +@@ -8038,8 +8038,7 @@ s390_irgen_LGHI(UChar r1, UShort i2) + static const HChar * + s390_irgen_LHRL(UChar r1, UInt i2) + { +- put_gpr_w1(r1, unop(Iop_16Sto32, load(Ity_I16, mkU64(guest_IA_curr_instr + +- ((ULong)(Long)(Int)i2 << 1))))); ++ put_gpr_w1(r1, unop(Iop_16Sto32, load(Ity_I16, mkU64(addr_rel_long(i2))))); + + return "lhrl"; + } +@@ -8047,8 +8046,7 @@ s390_irgen_LHRL(UChar r1, UInt i2) + static const HChar * + s390_irgen_LGHRL(UChar r1, UInt i2) + { +- put_gpr_dw0(r1, unop(Iop_16Sto64, load(Ity_I16, mkU64(guest_IA_curr_instr + +- ((ULong)(Long)(Int)i2 << 1))))); ++ put_gpr_dw0(r1, unop(Iop_16Sto64, load(Ity_I16, mkU64(addr_rel_long(i2))))); + + return "lghrl"; + } +@@ 
-8088,8 +8086,7 @@ s390_irgen_LLGF(UChar r1, IRTemp op2addr) + static const HChar * + s390_irgen_LLGFRL(UChar r1, UInt i2) + { +- put_gpr_dw0(r1, unop(Iop_32Uto64, load(Ity_I32, mkU64(guest_IA_curr_instr + +- ((ULong)(Long)(Int)i2 << 1))))); ++ put_gpr_dw0(r1, unop(Iop_32Uto64, load(Ity_I32, mkU64(addr_rel_long(i2))))); + + return "llgfrl"; + } +@@ -8169,8 +8166,7 @@ s390_irgen_LLGH(UChar r1, IRTemp op2addr) + static const HChar * + s390_irgen_LLHRL(UChar r1, UInt i2) + { +- put_gpr_w1(r1, unop(Iop_16Uto32, load(Ity_I16, mkU64(guest_IA_curr_instr + +- ((ULong)(Long)(Int)i2 << 1))))); ++ put_gpr_w1(r1, unop(Iop_16Uto32, load(Ity_I16, mkU64(addr_rel_long(i2))))); + + return "llhrl"; + } +@@ -8178,8 +8174,7 @@ s390_irgen_LLHRL(UChar r1, UInt i2) + static const HChar * + s390_irgen_LLGHRL(UChar r1, UInt i2) + { +- put_gpr_dw0(r1, unop(Iop_16Uto64, load(Ity_I16, mkU64(guest_IA_curr_instr + +- ((ULong)(Long)(Int)i2 << 1))))); ++ put_gpr_dw0(r1, unop(Iop_16Uto64, load(Ity_I16, mkU64(addr_rel_long(i2))))); + + return "llghrl"; + } +@@ -10064,8 +10059,7 @@ s390_irgen_STG(UChar r1, IRTemp op2addr) + static const HChar * + s390_irgen_STRL(UChar r1, UInt i2) + { +- store(mkU64(guest_IA_curr_instr + ((ULong)(Long)(Int)i2 << 1)), +- get_gpr_w1(r1)); ++ store(mkU64(addr_rel_long(i2)), get_gpr_w1(r1)); + + return "strl"; + } +@@ -10073,8 +10067,7 @@ s390_irgen_STRL(UChar r1, UInt i2) + static const HChar * + s390_irgen_STGRL(UChar r1, UInt i2) + { +- store(mkU64(guest_IA_curr_instr + ((ULong)(Long)(Int)i2 << 1)), +- get_gpr_dw0(r1)); ++ store(mkU64(addr_rel_long(i2)), get_gpr_dw0(r1)); + + return "stgrl"; + } +@@ -10203,8 +10196,7 @@ s390_irgen_STHY(UChar r1, IRTemp op2addr) + static const HChar * + s390_irgen_STHRL(UChar r1, UInt i2) + { +- store(mkU64(guest_IA_curr_instr + ((ULong)(Long)(Int)i2 << 1)), +- get_gpr_hw3(r1)); ++ store(mkU64(addr_rel_long(i2)), get_gpr_hw3(r1)); + + return "sthrl"; + } +@@ -13282,7 +13274,7 @@ static const HChar * + s390_irgen_EXRL(UChar r1, UInt offset) + { + IRTemp addr = newTemp(Ity_I64); +- Addr64 bytes_addr = guest_IA_curr_instr + offset * 2UL; ++ Addr64 bytes_addr = addr_rel_long(offset); + UChar *bytes = (UChar *)(HWord)bytes_addr; + /* we might save one round trip because we know the target */ + if (!last_execute_target) +diff --git a/none/tests/s390x/exrl.c b/none/tests/s390x/exrl.c +index 2c99602d8..e669e484f 100644 +--- a/none/tests/s390x/exrl.c ++++ b/none/tests/s390x/exrl.c +@@ -54,6 +54,17 @@ int main(void) + printf("|\n"); + printf("\n"); + ++ printf("------- EXRL with negative offset\n"); ++ asm volatile( "j 2f\n\t" ++ "1:\n\t" ++ "mvc 2(1,%0),0(%0)\n\t" ++ "2:\n\t" ++ "lghi 1,8\n\t" ++ ".insn ril,0xc60000000000,1,1b\n\t" // exrl 1, 1b ++ : : "a" (target) ++ : "1", "2", "3", "4"); ++ printf(" target = |%s|\n", target); ++ + return 0; + } + +diff --git a/none/tests/s390x/exrl.stdout.exp b/none/tests/s390x/exrl.stdout.exp +index 520919e92..30dcde829 100644 +--- a/none/tests/s390x/exrl.stdout.exp ++++ b/none/tests/s390x/exrl.stdout.exp +@@ -11,3 +11,5 @@ after: target = |0123456789aXXXXX| + ------- EXRL to OR in the syscall number (writes out target) + target = |0123456789aXXXXX| + ++------- EXRL with negative offset ++ target = |01010101010XXXXX| diff --git a/SPECS/valgrind.spec b/SPECS/valgrind.spec index 2d07d53..9bd1b51 100644 --- a/SPECS/valgrind.spec +++ b/SPECS/valgrind.spec @@ -3,7 +3,7 @@ Summary: Tool for finding memory management bugs in programs Name: %{?scl_prefix}valgrind Version: 3.18.1 -Release: 1%{?dist} +Release: 6%{?dist} Epoch: 1 License: 
GPLv2+ URL: http://www.valgrind.org/ @@ -86,6 +86,49 @@ Patch4: valgrind-3.16.0-some-stack-protector.patch # Add some -Wl,z,now. Patch5: valgrind-3.16.0-some-Wl-z-now.patch +# KDE#444495 dhat/tests/copy fails on s390x +Patch6: valgrind-3.18.1-dhat-tests-copy.patch + +# KDE#444242 s390x: Sign-extend "relative long" offset in EXRL +Patch7: valgrind-3.18.1-s390x-EXRL.patch + +# KDE#444571 - PPC, fix lxsibzx and lxsihzx +Patch8: valgrind-3.18.1-ppc64-lxsibzx-lxsihzx.patch + +# commit ae8c6de01417023e78763de145b1c0e6ddd87277 +# commit 3950c5d661ee09526cddcf24daf5fc22bc83f70c +# Fix for the prefixed stq instruction in PC relative mode. +# KDE#444836 pstq instruction for R=1 is not storing to the correct address +Patch9: valgrind-3.18.1-ppc-pstq.patch +Patch10: valgrind-3.18.1-ppc-pstq-tests.patch + +# commit 64ab89162906d5b9e2de6c3afe476fec861ef7ec +# gdbserver_tests: Filter out glibc hwcaps libc.so +Patch11: valgrind-3.18.1-gdbserver_tests-hwcap.patch + +# KDE#445184 Rust v0 symbol demangling is broken +Patch12: valgrind-3.18.1-rust-v0-demangle.patch + +# KDE#445354 arm64 backend: incorrect code emitted for doubleword CAS +Patch13: valgrind-3.18.1-arm64-doubleword-cas.patch + +# KDE#444399 arm64: unhandled instruction LD{,A}XP and ST{,L}XP +Patch14: valgrind-3.18.1-arm64-ldaxp-stlxp.patch + +# KDE#445415 arm64 front end: alignment checks missing for atomic instructions. +Patch15: valgrind-3.18.1-arm64-atomic-align.patch + +# commit 595341b150312d2407bd43304449bf39ec3e1fa8 +# amd64 front end: add more spec rules +Patch16: valgrind-3.18.1-amd64-more-spec-rules.patch + +# KDE#445504 Using C++ condition_variable results in bogus +# "mutex is locked simultaneously by two threads" warning +Patch17: valgrind-3.18.1-condvar.patch + +# KDE#445668 Inline stack frame generation is broken for Rust binaries +Patch18: valgrind-3.18.1-demangle-namespace.patch + BuildRequires: make BuildRequires: glibc-devel @@ -226,6 +269,20 @@ Valgrind User Manual for details. %patch5 -p1 %endif +%patch6 -p1 +%patch7 -p1 +%patch8 -p1 +%patch9 -p1 +%patch10 -p1 +%patch11 -p1 +%patch12 -p1 +%patch13 -p1 +%patch14 -p1 +%patch15 -p1 +%patch16 -p1 +%patch17 -p1 +%patch18 -p1 + %build # Some patches (might) touch Makefile.am or configure.ac files. @@ -450,6 +507,24 @@ fi %endif %changelog +* Tue Nov 30 2021 Mark Wielaard - 3.18.1-6 +- Rebuild against fresh toolchain + +* Wed Nov 24 2021 Mark Wielaard - 3.18.1-5 +- Add valgrind-3.18.1-dhat-tests-copy.patch +- Add valgrind-3.18.1-s390x-EXRL.patch +- Add valgrind-3.18.1-ppc64-lxsibzx-lxsihzx.patch +- Add valgrind-3.18.1-ppc-pstq.patch +- Add valgrind-3.18.1-ppc-pstq-tests.patch +- Add valgrind-3.18.1-gdbserver_tests-hwcap.patch +- Add valgrind-3.18.1-rust-v0-demangle.patch +- Add valgrind-3.18.1-arm64-doubleword-cas.patch +- Add valgrind-3.18.1-arm64-ldaxp-stlxp.patch +- Add valgrind-3.18.1-arm64-atomic-align.patch +- Add valgrind-3.18.1-amd64-more-spec-rules.patch +- Add valgrind-3.18.1-condvar.patch +- Add valgrind-3.18.1-demangle-namespace.patch + * Wed Oct 20 2021 Mark Wielaard - 3.18.1-1 - Update to upstream 3.18.1 final