commit e221eca26be6b2396e3fcbf4117e630fc22e79f6
Author: Julian Seward <jseward@acm.org>
Date:   Tue Nov 20 11:28:42 2018 +0100

Add Memcheck support for IROps added in 42719898.

memcheck/mc_translate.c:

Add mkRight{32,64} as right-travelling analogues to mkLeft{32,64}.

doCmpORD: for the cases of a signed comparison against zero, compute
definedness of the 3 result bits (lt,gt,eq) separately, and, for the lt and eq
bits, do it exactly accurately.

expensiveCountTrailingZeroes: no functional change.  Re-analyse/verify and add
comments.

expensiveCountLeadingZeroes: add.  Very similar to
expensiveCountTrailingZeroes.

Add some comments to mark unary ops which are self-shadowing.

Route Iop_Ctz{,Nat}{32,64} through expensiveCountTrailingZeroes.
Route Iop_Clz{,Nat}{32,64} through expensiveCountLeadingZeroes.

Add instrumentation for Iop_PopCount{32,64} and Iop_Reverse8sIn32_x1.

memcheck/tests/vbit-test/irops.c

Add dummy new entries for all new IROps, just enough to make it compile and
run.
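As background for the doCmpORD change described above: Iop_CmpORD{32,64}S produce a 3-bit result in which, per the masks and shifts used in the patch below, bit 3 means "less than", bit 2 "greater than" and bit 1 "equal".  The following standalone C sketch (not part of the commit; the helper names and the simplified equality rule are illustrative only) models, on concrete integers, how the new code derives the definedness of each of those bits for a signed comparison against zero, with vx holding 1-bits for the undefined bits of x.

#include <stdint.h>

/* Result encoding of Iop_CmpORD32S against zero:
   bit 3 = lt, bit 2 = gt, bit 1 = eq. */
static uint32_t cmpORD32S_with_zero(int32_t x)
{
   return x < 0 ? (1u << 3) : x > 0 ? (1u << 2) : (1u << 1);
}

/* Shadow (V-bit) result for the same comparison; a 1 bit means "undefined".
   - lt (bit 3): exactly the definedness of x's sign bit.
   - gt (bit 2): pessimistic -- undefined if any bit of x is undefined.
   - eq (bit 1): simplified stand-in for expensiveCmpEQorNE -- the result
     is defined if all of x is defined, or if some *defined* bit of x is 1
     (x is then nonzero regardless of the undefined bits). */
static uint32_t cmpORD32S_with_zero_vbits(uint32_t x, uint32_t vx)
{
   uint32_t v_lt = (vx >> 31) << 3;
   uint32_t v_gt = (vx != 0 ? 1u : 0u) << 2;
   uint32_t v_eq = ((vx == 0 || (x & ~vx) != 0) ? 0u : 1u) << 1;
   return v_lt | v_gt | v_eq;
}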
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index 68a2ab3..c24db91 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -737,6 +737,34 @@ static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
    return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
 }
 
+/* --------- The Right-family of operations. --------- */
+
+/* Unfortunately these are a lot more expensive then their Left
+   counterparts.  Fortunately they are only very rarely used -- only for
+   count-leading-zeroes instrumentation. */
+
+static IRAtom* mkRight32 ( MCEnv* mce, IRAtom* a1 )
+{
+   for (Int i = 1; i <= 16; i *= 2) {
+      // a1 |= (a1 >>u i)
+      IRAtom* tmp
+         = assignNew('V', mce, Ity_I32, binop(Iop_Shr32, a1, mkU8(i)));
+      a1 = assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, tmp));
+   }
+   return a1;
+}
+
+static IRAtom* mkRight64 ( MCEnv* mce, IRAtom* a1 )
+{
+   for (Int i = 1; i <= 32; i *= 2) {
+      // a1 |= (a1 >>u i)
+      IRAtom* tmp
+         = assignNew('V', mce, Ity_I64, binop(Iop_Shr64, a1, mkU8(i)));
+      a1 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, tmp));
+   }
+   return a1;
+}
+
 /* --------- 'Improvement' functions for AND/OR. --------- */
 
 /* ImproveAND(data, vbits) = data OR vbits.  Defined (0) data 0s give
@@ -1280,20 +1308,18 @@ static IRAtom* doCmpORD ( MCEnv* mce,
                           IRAtom* xxhash, IRAtom* yyhash,
                           IRAtom* xx, IRAtom* yy )
 {
-   Bool m64    = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
-   Bool syned  = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
-   IROp opOR   = m64 ? Iop_Or64  : Iop_Or32;
-   IROp opAND  = m64 ? Iop_And64 : Iop_And32;
-   IROp opSHL  = m64 ? Iop_Shl64 : Iop_Shl32;
-   IROp opSHR  = m64 ? Iop_Shr64 : Iop_Shr32;
-   IRType ty   = m64 ? Ity_I64   : Ity_I32;
-   Int width   = m64 ? 64        : 32;
+   Bool m64       = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
+   Bool syned     = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
+   IROp opOR      = m64 ? Iop_Or64   : Iop_Or32;
+   IROp opAND     = m64 ? Iop_And64  : Iop_And32;
+   IROp opSHL     = m64 ? Iop_Shl64  : Iop_Shl32;
+   IROp opSHR     = m64 ? Iop_Shr64  : Iop_Shr32;
+   IROp op1UtoWS  = m64 ? Iop_1Uto64 : Iop_1Uto32;
+   IRType ty      = m64 ? Ity_I64    : Ity_I32;
+   Int width      = m64 ? 64         : 32;
 
    Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;
 
-   IRAtom* threeLeft1 = NULL;
-   IRAtom* sevenLeft1 = NULL;
-
    tl_assert(isShadowAtom(mce,xxhash));
    tl_assert(isShadowAtom(mce,yyhash));
    tl_assert(isOriginalAtom(mce,xx));
@@ -1312,30 +1338,55 @@ static IRAtom* doCmpORD ( MCEnv* mce,
       /* fancy interpretation */
       /* if yy is zero, then it must be fully defined (zero#). */
       tl_assert(isZero(yyhash));
-      threeLeft1 = m64 ? mkU64(3<<1) : mkU32(3<<1);
+      // This is still inaccurate, but I don't think it matters, since
+      // nobody writes code of the form
+      // "is <partially-undefined-value> signedly greater than zero?".
+      // We therefore simply declare "x >s 0" to be undefined if any bit in
+      // x is undefined.  That's clearly suboptimal in some cases.  Eg, if
+      // the highest order bit is a defined 1 then x is negative so it
+      // doesn't matter whether the remaining bits are defined or not.
+      IRAtom* t_0_gt_0_0
+         = assignNew(
+              'V', mce,ty,
+              binop(
+                 opAND,
+                 mkPCastTo(mce,ty, xxhash),
+                 m64 ? mkU64(1<<2) : mkU32(1<<2)
+              ));
+      // For "x <s 0", we can just copy the definedness of the top bit of x
+      // and we have a precise result.
+      IRAtom* t_lt_0_0_0
+         = assignNew(
+              'V', mce,ty,
+              binop(
+                 opSHL,
+                 assignNew(
+                    'V', mce,ty,
+                    binop(opSHR, xxhash, mkU8(width-1))),
+                 mkU8(3)
+              ));
+      // For "x == 0" we can hand the problem off to expensiveCmpEQorNE.
+      IRAtom* t_0_0_eq_0
+         = assignNew(
+              'V', mce,ty,
+              binop(
+                 opSHL,
+                 assignNew('V', mce,ty,
+                    unop(
+                       op1UtoWS,
+                       expensiveCmpEQorNE(mce, ty, xxhash, yyhash, xx, yy))
+                 ),
+                 mkU8(1)
+              ));
       return
          binop(
             opOR,
-            assignNew(
-               'V', mce,ty,
-               binop(
-                  opAND,
-                  mkPCastTo(mce,ty, xxhash),
-                  threeLeft1
-               )),
-            assignNew(
-               'V', mce,ty,
-               binop(
-                  opSHL,
-                  assignNew(
-                     'V', mce,ty,
-                     binop(opSHR, xxhash, mkU8(width-1))),
-                  mkU8(3)
-               ))
-         );
+            assignNew('V', mce,ty, binop(opOR, t_lt_0_0_0, t_0_gt_0_0)),
+            t_0_0_eq_0
+         );
    } else {
       /* standard interpretation */
-      sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
+      IRAtom* sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
       return
          binop(
            opAND,
@@ -2211,14 +2262,14 @@ IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
    tl_assert(sameKindedAtoms(atom,vatom));
 
    switch (czop) {
-      case Iop_Ctz32:
+      case Iop_Ctz32: case Iop_CtzNat32:
         ty = Ity_I32;
         xorOp = Iop_Xor32;
         subOp = Iop_Sub32;
         andOp = Iop_And32;
         one = mkU32(1);
         break;
-      case Iop_Ctz64:
+      case Iop_Ctz64: case Iop_CtzNat64:
         ty = Ity_I64;
         xorOp = Iop_Xor64;
         subOp = Iop_Sub64;
@@ -2232,8 +2283,30 @@ IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
 
    // improver = atom ^ (atom - 1)
    //
-   // That is, improver has its low ctz(atom) bits equal to one;
-   // higher bits (if any) equal to zero.
+   // That is, improver has its low ctz(atom)+1 bits equal to one;
+   // higher bits (if any) equal to zero.  So it's exactly the right
+   // mask to use to remove the irrelevant undefined input bits.
+   /* Here are some examples:
+         atom   = U...U 1 0...0
+         atom-1 = U...U 0 1...1
+         ^ed    = 0...0 1 11111, which correctly describes which bits of |atom|
+                                 actually influence the result
+      A boundary case
+         atom   = 0...0
+         atom-1 = 1...1
+         ^ed    = 11111, also a correct mask for the input: all input bits
+                         are relevant
+      Another boundary case
+         atom   = 1..1 1
+         atom-1 = 1..1 0
+         ^ed    = 0..0 1, also a correct mask: only the rightmost input bit
+                          is relevant
+      Now with misc U bits interspersed:
+         atom   = U...U 1 0 U...U 0 1 0...0
+         atom-1 = U...U 1 0 U...U 0 0 1...1
+         ^ed    = 0...0 0 0 0...0 0 1 1...1, also correct
+      (Per re-check/analysis of 14 Nov 2018)
+   */
    improver = assignNew('V', mce,ty,
                         binop(xorOp,
                               atom,
@@ -2242,8 +2315,96 @@ IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
 
    // improved = vatom & improver
    //
-   // That is, treat any V bits above the first ctz(atom) bits as
-   // "defined".
+   // That is, treat any V bits to the left of the rightmost ctz(atom)+1
+   // bits as "defined".
+   improved = assignNew('V', mce, ty,
+                        binop(andOp, vatom, improver));
+
+   // Return pessimizing cast of improved.
+   return mkPCastTo(mce, ty, improved);
+}
+
+static
+IRAtom* expensiveCountLeadingZeroes ( MCEnv* mce, IROp czop,
+                                      IRAtom* atom, IRAtom* vatom )
+{
+   IRType ty;
+   IROp shrOp, notOp, andOp;
+   IRAtom* (*mkRight)(MCEnv*, IRAtom*);
+   IRAtom *improver, *improved;
+   tl_assert(isShadowAtom(mce,vatom));
+   tl_assert(isOriginalAtom(mce,atom));
+   tl_assert(sameKindedAtoms(atom,vatom));
+
+   switch (czop) {
+      case Iop_Clz32: case Iop_ClzNat32:
+         ty = Ity_I32;
+         shrOp = Iop_Shr32;
+         notOp = Iop_Not32;
+         andOp = Iop_And32;
+         mkRight = mkRight32;
+         break;
+      case Iop_Clz64: case Iop_ClzNat64:
+         ty = Ity_I64;
+         shrOp = Iop_Shr64;
+         notOp = Iop_Not64;
+         andOp = Iop_And64;
+         mkRight = mkRight64;
+         break;
+      default:
+         ppIROp(czop);
+         VG_(tool_panic)("memcheck:expensiveCountLeadingZeroes");
+   }
+
+   // This is in principle very similar to how expensiveCountTrailingZeroes
+   // works.  That function computed an "improver", which it used to mask
+   // off all but the rightmost 1-bit and the zeroes to the right of it,
+   // hence removing irrelevant bits from the input.  Here, we play the
+   // exact same game but with the left-vs-right roles interchanged.
+   // Unfortunately calculation of the improver in this case is
+   // significantly more expensive.
+   //
+   // improver = ~(RIGHT(atom) >>u 1)
+   //
+   // That is, improver has its upper clz(atom)+1 bits equal to one;
+   // lower bits (if any) equal to zero.  So it's exactly the right
+   // mask to use to remove the irrelevant undefined input bits.
+   /* Here are some examples:
+         atom             = 0...0 1 U...U
+         R(atom)          = 0...0 1 1...1
+         R(atom) >>u 1    = 0...0 0 1...1
+         ~(R(atom) >>u 1) = 1...1 1 0...0
+                            which correctly describes which bits of |atom|
+                            actually influence the result
+      A boundary case
+         atom             = 0...0
+         R(atom)          = 0...0
+         R(atom) >>u 1    = 0...0
+         ~(R(atom) >>u 1) = 1...1
+                            also a correct mask for the input: all input bits
+                            are relevant
+      Another boundary case
+         atom             = 1 1..1
+         R(atom)          = 1 1..1
+         R(atom) >>u 1    = 0 1..1
+         ~(R(atom) >>u 1) = 1 0..0
+                            also a correct mask: only the leftmost input bit
+                            is relevant
+      Now with misc U bits interspersed:
+         atom             = 0...0 1 U...U 0 1 U...U
+         R(atom)          = 0...0 1 1...1 1 1 1...1
+         R(atom) >>u 1    = 0...0 0 1...1 1 1 1...1
+         ~(R(atom) >>u 1) = 1...1 1 0...0 0 0 0...0, also correct
+      (Per initial implementation of 15 Nov 2018)
+   */
+   improver = mkRight(mce, atom);
+   improver = assignNew('V', mce, ty, binop(shrOp, improver, mkU8(1)));
+   improver = assignNew('V', mce, ty, unop(notOp, improver));
+
+   // improved = vatom & improver
+   //
+   // That is, treat any V bits to the right of the leftmost clz(atom)+1
+   // bits as "defined".
    improved = assignNew('V', mce, ty,
                         binop(andOp, vatom, improver));
 
@@ -4705,6 +4866,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_RecipEst32F0x4:
          return unary32F0x4(mce, vatom);
 
+      // These are self-shadowing.
       case Iop_32UtoV128:
       case Iop_64UtoV128:
       case Iop_Dup8x16:
@@ -4745,6 +4907,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_MulI128by10Carry:
       case Iop_F16toF64x2:
       case Iop_F64toF16x2:
+         // FIXME JRS 2018-Nov-15. This is surely not correct!
          return vatom;
 
       case Iop_I32StoF128: /* signed I32 -> F128 */
@@ -4770,7 +4933,6 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_RoundF64toF64_NegINF:
       case Iop_RoundF64toF64_PosINF:
       case Iop_RoundF64toF64_ZERO:
-      case Iop_Clz64:
       case Iop_D32toD64:
       case Iop_I32StoD64:
       case Iop_I32UtoD64:
@@ -4785,17 +4947,32 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_D64toD128:
         return mkPCastTo(mce, Ity_I128, vatom);
 
-      case Iop_Clz32:
       case Iop_TruncF64asF32:
       case Iop_NegF32:
       case Iop_AbsF32:
       case Iop_F16toF32:
         return mkPCastTo(mce, Ity_I32, vatom);
 
-      case Iop_Ctz32:
-      case Iop_Ctz64:
+      case Iop_Ctz32: case Iop_CtzNat32:
+      case Iop_Ctz64: case Iop_CtzNat64:
         return expensiveCountTrailingZeroes(mce, op, atom, vatom);
 
+      case Iop_Clz32: case Iop_ClzNat32:
+      case Iop_Clz64: case Iop_ClzNat64:
+        return expensiveCountLeadingZeroes(mce, op, atom, vatom);
+
+      // PopCount32: this is slightly pessimistic.  It is true that the
+      // result depends on all input bits, so that aspect of the PCast is
+      // correct.  However, regardless of the input, only the lowest 5 bits
+      // out of the output can ever be undefined.  So we could actually
+      // "improve" the results here by marking the top 27 bits of output as
+      // defined.  A similar comment applies for PopCount64.
+      case Iop_PopCount32:
+        return mkPCastTo(mce, Ity_I32, vatom);
+      case Iop_PopCount64:
+        return mkPCastTo(mce, Ity_I64, vatom);
+
+      // These are self-shadowing.
       case Iop_1Uto64:
       case Iop_1Sto64:
       case Iop_8Uto64:
@@ -4821,6 +4998,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_V256to64_2: case Iop_V256to64_3:
          return assignNew('V', mce, Ity_I64, unop(op, vatom));
 
+      // These are self-shadowing.
       case Iop_64to32:
       case Iop_64HIto32:
       case Iop_1Uto32:
@@ -4830,8 +5008,10 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_16Sto32:
       case Iop_8Sto32:
       case Iop_V128to32:
+      case Iop_Reverse8sIn32_x1:
          return assignNew('V', mce, Ity_I32, unop(op, vatom));
 
+      // These are self-shadowing.
       case Iop_8Sto16:
       case Iop_8Uto16:
       case Iop_32to16:
@@ -4840,6 +5020,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_GetMSBs8x16:
          return assignNew('V', mce, Ity_I16, unop(op, vatom));
 
+      // These are self-shadowing.
       case Iop_1Uto8:
       case Iop_1Sto8:
       case Iop_16to8:
@@ -4868,6 +5049,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_Not16:
       case Iop_Not8:
       case Iop_Not1:
+         // FIXME JRS 2018-Nov-15. This is surely not correct!
          return vatom;
 
       case Iop_CmpNEZ8x8:
@@ -4929,6 +5111,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_Ctz64x2:
         return mkPCast64x2(mce, vatom);
 
+      // This is self-shadowing.
       case Iop_PwBitMtxXpose64x2:
         return assignNew('V', mce, Ity_V128, unop(op, vatom));
 
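Before the test-harness changes, a standalone sketch (not part of the commit) of the two "improver" masks used in the instrumentation above, expressed on plain 32-bit integers.  mask_for_ctz corresponds to atom ^ (atom - 1); mask_for_clz corresponds to ~(RIGHT(atom) >>u 1), where smear_right plays the role of mkRight32; all names here are illustrative.  A set bit in each mask marks an input bit that can actually influence the count.

#include <stdint.h>

/* Propagate each 1 bit rightwards, as mkRight32 does (OR in shifted copies). */
static uint32_t smear_right(uint32_t x)
{
   for (int i = 1; i <= 16; i *= 2)
      x |= x >> i;
   return x;
}

/* Bits that can affect ctz(x): the lowest set bit and everything below it.
   For x == 0 this is all ones, since every input bit is then relevant. */
static uint32_t mask_for_ctz(uint32_t x)
{
   return x ^ (x - 1);
}

/* Bits that can affect clz(x): the highest set bit and everything above it.
   Again all ones for x == 0. */
static uint32_t mask_for_clz(uint32_t x)
{
   return ~(smear_right(x) >> 1);
}

In the instrumentation these masks are the "improver" values: they are ANDed with vatom before the final pessimising cast, so undefined bits outside the mask cannot make the count undefined.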
diff --git a/memcheck/tests/vbit-test/irops.c b/memcheck/tests/vbit-test/irops.c
index bfd82fc..e8bf67d 100644
--- a/memcheck/tests/vbit-test/irops.c
+++ b/memcheck/tests/vbit-test/irops.c
@@ -111,6 +111,12 @@ static irop_t irops[] = {
   { DEFOP(Iop_Clz32, UNDEF_ALL),  .s390x = 0, .amd64 = 0, .x86 = 1, .arm = 1, .ppc64 = 1, .ppc32 = 1, .mips32 =1, .mips64 = 1 },
   { DEFOP(Iop_Ctz64, UNDEF_ALL),  .s390x = 0, .amd64 = 1, .x86 = 0, .arm = 0, .ppc64 = 0, .ppc32 = 0, .mips32 =0, .mips64 = 0 },
   { DEFOP(Iop_Ctz32, UNDEF_ALL),  .s390x = 0, .amd64 = 0, .x86 = 1, .arm = 0, .ppc64 = 0, .ppc32 = 0, .mips32 =0, .mips64 = 0 },
+  { DEFOP(Iop_ClzNat64, UNDEF_ALL),  .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 0, .mips32 =0, .mips64 = 0 }, // ppc32 asserts
+  { DEFOP(Iop_ClzNat32, UNDEF_ALL),  .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 1, .mips32 =0, .mips64 = 0 },
+  { DEFOP(Iop_CtzNat64, UNDEF_ALL),  .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 0, .mips32 =0, .mips64 = 0 },
+  { DEFOP(Iop_CtzNat32, UNDEF_ALL),  .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 0, .ppc32 = 1, .mips32 =0, .mips64 = 0 },
+  { DEFOP(Iop_PopCount64, UNDEF_ALL),  .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 0, .mips32 =0, .mips64 = 0 },
+  { DEFOP(Iop_PopCount32, UNDEF_ALL),  .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 1, .mips32 =0, .mips64 = 0 },
   { DEFOP(Iop_CmpLT32S, UNDEF_ALL),  .s390x = 1, .amd64 = 1, .x86 = 1, .arm = 1, .ppc64 = 1, .ppc32 = 1, .mips32 =1, .mips64 = 1 },
   { DEFOP(Iop_CmpLT64S, UNDEF_ALL),  .s390x = 1, .amd64 = 1, .x86 = 0, .arm = 0, .ppc64 = 0, .ppc32 = 0, .mips32 =0, .mips64 = 1 }, // ppc, mips assert
   { DEFOP(Iop_CmpLE32S, UNDEF_ALL),  .s390x = 1, .amd64 = 1, .x86 = 1, .arm = 1, .ppc64 = 1, .ppc32 = 1, .mips32 =1, .mips64 = 1 },
@@ -336,6 +342,7 @@ static irop_t irops[] = {
   { DEFOP(Iop_Sad8Ux4, UNDEF_UNKNOWN), },
   { DEFOP(Iop_CmpNEZ16x2, UNDEF_UNKNOWN), },
   { DEFOP(Iop_CmpNEZ8x4, UNDEF_UNKNOWN), },
+  { DEFOP(Iop_Reverse8sIn32_x1, UNDEF_UNKNOWN) },
   /* ------------------ 64-bit SIMD FP ------------------------ */
   { DEFOP(Iop_I32UtoFx2, UNDEF_UNKNOWN), },
   { DEFOP(Iop_I32StoFx2, UNDEF_UNKNOWN), },