3.14.0-4 gcc ppc64le inlined memcmp vs memcheck (#1652926)

- Add valgrind-3.14.0-get_otrack_shadow_offset_wrk-ppc.patch,
  valgrind-3.14.0-new-strlen-IROps.patch,
  valgrind-3.14.0-ppc-instr-new-IROps.patch,
  valgrind-3.14.0-memcheck-new-IROps.patch,
  valgrind-3.14.0-ppc-frontend-new-IROps.patch,
  valgrind-3.14.0-transform-popcount64-ctznat64.patch and
  valgrind-3.14.0-enable-ppc-Iop_Sar_Shr8.patch (#1652926)
Mark Wielaard 2018-11-23 22:31:07 +01:00
parent 06ef44fd1a
commit b3eda9b80b
8 changed files with 1422 additions and 1 deletion

valgrind-3.14.0-enable-ppc-Iop_Sar_Shr8.patch

@@ -0,0 +1,18 @@
commit 27fe22378da38424102c5292b782cacdd9d7b9e4
Author: Julian Seward <jseward@acm.org>
Date: Tue Nov 20 12:09:03 2018 +0100
Add support for Iop_{Sar,Shr}8 on ppc. --expensive-definedness-checks=yes needs them.
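As a plain-C aside (not from the patch; helper names invented): Iop_Shr8 denotes a logical 8-bit right shift (zero fill), Iop_Sar8 an arithmetic one (sign fill).

#include <stdint.h>
#include <stdio.h>

/* Sketch of the two 8-bit shifts.  Note: >> on a negative signed value
   is implementation-defined in ISO C, but is an arithmetic shift on the
   compilers Valgrind builds with. */
static uint8_t shr8(uint8_t x, uint8_t amt) { return x >> amt; }
static uint8_t sar8(uint8_t x, uint8_t amt) {
   return (uint8_t)((int8_t)x >> amt);
}

int main(void) {
   printf("Shr8(0x80,1) = 0x%02x\n", shr8(0x80, 1)); /* 0x40, zero fill */
   printf("Sar8(0x80,1) = 0x%02x\n", sar8(0x80, 1)); /* 0xc0, sign fill */
   return 0;
}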
diff --git a/VEX/priv/host_ppc_isel.c b/VEX/priv/host_ppc_isel.c
index 5242176..750cf8d 100644
--- a/VEX/priv/host_ppc_isel.c
+++ b/VEX/priv/host_ppc_isel.c
@@ -1528,7 +1528,6 @@ static HReg iselWordExpr_R_wrk ( ISelEnv* env, const IRExpr* e,
True/*32bit shift*/,
tmp, tmp, amt));
r_srcL = tmp;
- vassert(0); /* AWAITING TEST CASE */
}
}
/* Only 64 expressions need 64bit shifts,

valgrind-3.14.0-get_otrack_shadow_offset_wrk-ppc.patch

@@ -0,0 +1,81 @@
commit 7f1dd9d5aec1f1fd4eb0ae3a311358a914f1d73f
Author: Julian Seward <jseward@acm.org>
Date: Tue Nov 20 10:18:29 2018 +0100
get_otrack_shadow_offset_wrk for ppc32 and ppc64: add missing cases for XER_OV32, XER_CA32 and C_FPCC.
The missing cases were discovered whilst testing fixes for bug 386945, but are
otherwise unrelated to that bug.
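In outline (a toy C sketch with invented offsets; the real code uses the GOF/SZB macros over the ppc guest state): the function maps a guest-state (offset, size) pair to the offset of its shadow slot, with the byte-sized XER/CR pieces parked at offset 8 inside the 16-byte VSR shadow slots, and -1 meaning the piece carries no origin tracking.

/* Toy model only -- all offsets are made up for illustration. */
enum { XER_OV32_OFF = 1000, XER_CA32_OFF = 1001,
       VSR20_OFF = 2000, VSR21_OFF = 2016 };

static int shadow_offset(int o, int sz)
{
   if (o == XER_OV32_OFF && sz == 1) return 8 + VSR20_OFF;
   if (o == XER_CA32_OFF && sz == 1) return 8 + VSR21_OFF;
   return -1; /* no origin tracking for this piece */
}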
diff --git a/memcheck/mc_machine.c b/memcheck/mc_machine.c
index 5ed101f..4ce746e 100644
--- a/memcheck/mc_machine.c
+++ b/memcheck/mc_machine.c
@@ -120,11 +120,11 @@ static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB )
Int o = offset;
tl_assert(sz > 0);
-#if defined(VGA_ppc64be)
+# if defined(VGA_ppc64be)
tl_assert(host_is_big_endian());
-#elif defined(VGA_ppc64le)
+# elif defined(VGA_ppc64le)
tl_assert(host_is_little_endian());
-#endif
+# endif
if (sz == 8 || sz == 4) {
/* The point of this is to achieve
@@ -132,11 +132,11 @@ static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB )
return GOF(GPRn);
by testing ox instead of o, and setting ox back 4 bytes when sz == 4.
*/
-#if defined(VGA_ppc64le)
+# if defined(VGA_ppc64le)
Int ox = o;
-#else
+# else
Int ox = sz == 8 ? o : (o - 4);
-#endif
+# endif
if (ox == GOF(GPR0)) return ox;
if (ox == GOF(GPR1)) return ox;
if (ox == GOF(GPR2)) return ox;
@@ -240,11 +240,13 @@ static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB )
if (o == GOF(VSR31) && sz == 8) return o;
/* For the various byte sized XER/CR pieces, use offset 8
- in VSR0 .. VSR19. */
+ in VSR0 .. VSR21. */
tl_assert(SZB(VSR0) == 16);
if (o == GOF(XER_SO) && sz == 1) return 8 +GOF(VSR0);
if (o == GOF(XER_OV) && sz == 1) return 8 +GOF(VSR1);
+ if (o == GOF(XER_OV32) && sz == 1) return 8 +GOF(VSR20);
if (o == GOF(XER_CA) && sz == 1) return 8 +GOF(VSR2);
+ if (o == GOF(XER_CA32) && sz == 1) return 8 +GOF(VSR21);
if (o == GOF(XER_BC) && sz == 1) return 8 +GOF(VSR3);
if (o == GOF(CR0_321) && sz == 1) return 8 +GOF(VSR4);
@@ -388,6 +390,7 @@ static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB )
if (o == GOF(IP_AT_SYSCALL) && sz == 4) return -1; /* slot unused */
if (o == GOF(FPROUND) && sz == 1) return -1;
if (o == GOF(DFPROUND) && sz == 1) return -1;
+ if (o == GOF(C_FPCC) && sz == 1) return -1;
if (o == GOF(VRSAVE) && sz == 4) return -1;
if (o == GOF(EMNOTE) && sz == 4) return -1;
if (o == GOF(CMSTART) && sz == 4) return -1;
@@ -440,11 +443,13 @@ static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB )
if (o == GOF(VSR31) && sz == 8) return o;
/* For the various byte sized XER/CR pieces, use offset 8
- in VSR0 .. VSR19. */
+ in VSR0 .. VSR21. */
tl_assert(SZB(VSR0) == 16);
if (o == GOF(XER_SO) && sz == 1) return 8 +GOF(VSR0);
if (o == GOF(XER_OV) && sz == 1) return 8 +GOF(VSR1);
+ if (o == GOF(XER_OV32) && sz == 1) return 8 +GOF(VSR20);
if (o == GOF(XER_CA) && sz == 1) return 8 +GOF(VSR2);
+ if (o == GOF(XER_CA32) && sz == 1) return 8 +GOF(VSR21);
if (o == GOF(XER_BC) && sz == 1) return 8 +GOF(VSR3);
if (o == GOF(CR0_321) && sz == 1) return 8 +GOF(VSR4);

valgrind-3.14.0-memcheck-new-IROps.patch

@@ -0,0 +1,453 @@
commit e221eca26be6b2396e3fcbf4117e630fc22e79f6
Author: Julian Seward <jseward@acm.org>
Date: Tue Nov 20 11:28:42 2018 +0100
Add Memcheck support for IROps added in 42719898.
memcheck/mc_translate.c:
Add mkRight{32,64} as right-travelling analogues to mkLeft{32,64}.
doCmpORD: for the cases of a signed comparison against zero, compute
definedness of the 3 result bits (lt,gt,eq) separately, and, for the lt and eq
bits, do it exactly accurately.
expensiveCountTrailingZeroes: no functional change. Re-analyse/verify and add
comments.
expensiveCountLeadingZeroes: add. Very similar to
expensiveCountTrailingZeroes.
Add some comments to mark unary ops which are self-shadowing.
Route Iop_Ctz{,Nat}{32,64} through expensiveCountTrailingZeroes.
Route Iop_Clz{,Nat}{32,64} through expensiveCountLeadingZeroes.
Add instrumentation for Iop_PopCount{32,64} and Iop_Reverse8sIn32_x1.
memcheck/tests/vbit-test/irops.c
Add dummy new entries for all new IROps, just enough to make it compile and
run.
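Before the diff, a plain-C picture (not patch code) of what the new mkRight{32,64} helpers compute: a shift-or cascade that smears the most significant 1 bit of a word rightwards, the mirror image of the Left family, which smears the least significant 1 bit leftwards.

#include <stdint.h>

/* After the loop, every bit at or below the highest 1 bit of x is 1,
   mirroring the Iop_Shr32/Iop_Or32 cascade in mkRight32 below.
   e.g. right_smear32(0x00100000) == 0x001FFFFF */
static uint32_t right_smear32(uint32_t x)
{
   for (int i = 1; i <= 16; i *= 2)
      x |= x >> i;
   return x;
}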
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index 68a2ab3..c24db91 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -737,6 +737,34 @@ static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
}
+/* --------- The Right-family of operations. --------- */
+
+/* Unfortunately these are a lot more expensive than their Left
+ counterparts. Fortunately they are only very rarely used -- only for
+ count-leading-zeroes instrumentation. */
+
+static IRAtom* mkRight32 ( MCEnv* mce, IRAtom* a1 )
+{
+ for (Int i = 1; i <= 16; i *= 2) {
+ // a1 |= (a1 >>u i)
+ IRAtom* tmp
+ = assignNew('V', mce, Ity_I32, binop(Iop_Shr32, a1, mkU8(i)));
+ a1 = assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, tmp));
+ }
+ return a1;
+}
+
+static IRAtom* mkRight64 ( MCEnv* mce, IRAtom* a1 )
+{
+ for (Int i = 1; i <= 32; i *= 2) {
+ // a1 |= (a1 >>u i)
+ IRAtom* tmp
+ = assignNew('V', mce, Ity_I64, binop(Iop_Shr64, a1, mkU8(i)));
+ a1 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, tmp));
+ }
+ return a1;
+}
+
/* --------- 'Improvement' functions for AND/OR. --------- */
/* ImproveAND(data, vbits) = data OR vbits. Defined (0) data 0s give
@@ -1280,20 +1308,18 @@ static IRAtom* doCmpORD ( MCEnv* mce,
IRAtom* xxhash, IRAtom* yyhash,
IRAtom* xx, IRAtom* yy )
{
- Bool m64 = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
- Bool syned = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
- IROp opOR = m64 ? Iop_Or64 : Iop_Or32;
- IROp opAND = m64 ? Iop_And64 : Iop_And32;
- IROp opSHL = m64 ? Iop_Shl64 : Iop_Shl32;
- IROp opSHR = m64 ? Iop_Shr64 : Iop_Shr32;
- IRType ty = m64 ? Ity_I64 : Ity_I32;
- Int width = m64 ? 64 : 32;
+ Bool m64 = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
+ Bool syned = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
+ IROp opOR = m64 ? Iop_Or64 : Iop_Or32;
+ IROp opAND = m64 ? Iop_And64 : Iop_And32;
+ IROp opSHL = m64 ? Iop_Shl64 : Iop_Shl32;
+ IROp opSHR = m64 ? Iop_Shr64 : Iop_Shr32;
+ IROp op1UtoWS = m64 ? Iop_1Uto64 : Iop_1Uto32;
+ IRType ty = m64 ? Ity_I64 : Ity_I32;
+ Int width = m64 ? 64 : 32;
Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;
- IRAtom* threeLeft1 = NULL;
- IRAtom* sevenLeft1 = NULL;
-
tl_assert(isShadowAtom(mce,xxhash));
tl_assert(isShadowAtom(mce,yyhash));
tl_assert(isOriginalAtom(mce,xx));
@@ -1312,30 +1338,55 @@ static IRAtom* doCmpORD ( MCEnv* mce,
/* fancy interpretation */
/* if yy is zero, then it must be fully defined (zero#). */
tl_assert(isZero(yyhash));
- threeLeft1 = m64 ? mkU64(3<<1) : mkU32(3<<1);
+ // This is still inaccurate, but I don't think it matters, since
+ // nobody writes code of the form
+ // "is <partially-undefined-value> signedly greater than zero?".
+ // We therefore simply declare "x >s 0" to be undefined if any bit in
+ // x is undefined. That's clearly suboptimal in some cases. Eg, if
+ // the highest order bit is a defined 1 then x is negative so it
+ // doesn't matter whether the remaining bits are defined or not.
+ IRAtom* t_0_gt_0_0
+ = assignNew(
+ 'V', mce,ty,
+ binop(
+ opAND,
+ mkPCastTo(mce,ty, xxhash),
+ m64 ? mkU64(1<<2) : mkU32(1<<2)
+ ));
+ // For "x <s 0", we can just copy the definedness of the top bit of x
+ // and we have a precise result.
+ IRAtom* t_lt_0_0_0
+ = assignNew(
+ 'V', mce,ty,
+ binop(
+ opSHL,
+ assignNew(
+ 'V', mce,ty,
+ binop(opSHR, xxhash, mkU8(width-1))),
+ mkU8(3)
+ ));
+ // For "x == 0" we can hand the problem off to expensiveCmpEQorNE.
+ IRAtom* t_0_0_eq_0
+ = assignNew(
+ 'V', mce,ty,
+ binop(
+ opSHL,
+ assignNew('V', mce,ty,
+ unop(
+ op1UtoWS,
+ expensiveCmpEQorNE(mce, ty, xxhash, yyhash, xx, yy))
+ ),
+ mkU8(1)
+ ));
return
binop(
opOR,
- assignNew(
- 'V', mce,ty,
- binop(
- opAND,
- mkPCastTo(mce,ty, xxhash),
- threeLeft1
- )),
- assignNew(
- 'V', mce,ty,
- binop(
- opSHL,
- assignNew(
- 'V', mce,ty,
- binop(opSHR, xxhash, mkU8(width-1))),
- mkU8(3)
- ))
- );
+ assignNew('V', mce,ty, binop(opOR, t_lt_0_0_0, t_0_gt_0_0)),
+ t_0_0_eq_0
+ );
} else {
/* standard interpretation */
- sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
+ IRAtom* sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
return
binop(
opAND,
@@ -2211,14 +2262,14 @@ IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
tl_assert(sameKindedAtoms(atom,vatom));
switch (czop) {
- case Iop_Ctz32:
+ case Iop_Ctz32: case Iop_CtzNat32:
ty = Ity_I32;
xorOp = Iop_Xor32;
subOp = Iop_Sub32;
andOp = Iop_And32;
one = mkU32(1);
break;
- case Iop_Ctz64:
+ case Iop_Ctz64: case Iop_CtzNat64:
ty = Ity_I64;
xorOp = Iop_Xor64;
subOp = Iop_Sub64;
@@ -2232,8 +2283,30 @@ IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
// improver = atom ^ (atom - 1)
//
- // That is, improver has its low ctz(atom) bits equal to one;
- // higher bits (if any) equal to zero.
+ // That is, improver has its low ctz(atom)+1 bits equal to one;
+ // higher bits (if any) equal to zero. So it's exactly the right
+ // mask to use to remove the irrelevant undefined input bits.
+ /* Here are some examples:
+ atom = U...U 1 0...0
+ atom-1 = U...U 0 1...1
+ ^ed = 0...0 1 11111, which correctly describes which bits of |atom|
+ actually influence the result
+ A boundary case
+ atom = 0...0
+ atom-1 = 1...1
+ ^ed = 11111, also a correct mask for the input: all input bits
+ are relevant
+ Another boundary case
+ atom = 1..1 1
+ atom-1 = 1..1 0
+ ^ed = 0..0 1, also a correct mask: only the rightmost input bit
+ is relevant
+ Now with misc U bits interspersed:
+ atom = U...U 1 0 U...U 0 1 0...0
+ atom-1 = U...U 1 0 U...U 0 0 1...1
+ ^ed = 0...0 0 0 0...0 0 1 1...1, also correct
+ (Per re-check/analysis of 14 Nov 2018)
+ */
improver = assignNew('V', mce,ty,
binop(xorOp,
atom,
@@ -2242,8 +2315,96 @@ IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
// improved = vatom & improver
//
- // That is, treat any V bits above the first ctz(atom) bits as
- // "defined".
+ // That is, treat any V bits to the left of the rightmost ctz(atom)+1
+ // bits as "defined".
+ improved = assignNew('V', mce, ty,
+ binop(andOp, vatom, improver));
+
+ // Return pessimizing cast of improved.
+ return mkPCastTo(mce, ty, improved);
+}
+
+static
+IRAtom* expensiveCountLeadingZeroes ( MCEnv* mce, IROp czop,
+ IRAtom* atom, IRAtom* vatom )
+{
+ IRType ty;
+ IROp shrOp, notOp, andOp;
+ IRAtom* (*mkRight)(MCEnv*, IRAtom*);
+ IRAtom *improver, *improved;
+ tl_assert(isShadowAtom(mce,vatom));
+ tl_assert(isOriginalAtom(mce,atom));
+ tl_assert(sameKindedAtoms(atom,vatom));
+
+ switch (czop) {
+ case Iop_Clz32: case Iop_ClzNat32:
+ ty = Ity_I32;
+ shrOp = Iop_Shr32;
+ notOp = Iop_Not32;
+ andOp = Iop_And32;
+ mkRight = mkRight32;
+ break;
+ case Iop_Clz64: case Iop_ClzNat64:
+ ty = Ity_I64;
+ shrOp = Iop_Shr64;
+ notOp = Iop_Not64;
+ andOp = Iop_And64;
+ mkRight = mkRight64;
+ break;
+ default:
+ ppIROp(czop);
+ VG_(tool_panic)("memcheck:expensiveCountLeadingZeroes");
+ }
+
+ // This is in principle very similar to how expensiveCountTrailingZeroes
+ // works. That function computed an "improver", which it used to mask
+ // off all but the rightmost 1-bit and the zeroes to the right of it,
+ // hence removing irrelevant bits from the input. Here, we play the
+ // exact same game but with the left-vs-right roles interchanged.
+ // Unfortunately calculation of the improver in this case is
+ // significantly more expensive.
+ //
+ // improver = ~(RIGHT(atom) >>u 1)
+ //
+ // That is, improver has its upper clz(atom)+1 bits equal to one;
+ // lower bits (if any) equal to zero. So it's exactly the right
+ // mask to use to remove the irrelevant undefined input bits.
+ /* Here are some examples:
+ atom = 0...0 1 U...U
+ R(atom) = 0...0 1 1...1
+ R(atom) >>u 1 = 0...0 0 1...1
+ ~(R(atom) >>u 1) = 1...1 1 0...0
+ which correctly describes which bits of |atom|
+ actually influence the result
+ A boundary case
+ atom = 0...0
+ R(atom) = 0...0
+ R(atom) >>u 1 = 0...0
+ ~(R(atom) >>u 1) = 1...1
+ also a correct mask for the input: all input bits
+ are relevant
+ Another boundary case
+ atom = 1 1..1
+ R(atom) = 1 1..1
+ R(atom) >>u 1 = 0 1..1
+ ~(R(atom) >>u 1) = 1 0..0
+ also a correct mask: only the leftmost input bit
+ is relevant
+ Now with misc U bits interspersed:
+ atom = 0...0 1 U...U 0 1 U...U
+ R(atom) = 0...0 1 1...1 1 1 1...1
+ R(atom) >>u 1 = 0...0 0 1...1 1 1 1...1
+ ~(R(atom) >>u 1) = 1...1 1 0...0 0 0 0...0, also correct
+ (Per initial implementation of 15 Nov 2018)
+ */
+ improver = mkRight(mce, atom);
+ improver = assignNew('V', mce, ty, binop(shrOp, improver, mkU8(1)));
+ improver = assignNew('V', mce, ty, unop(notOp, improver));
+
+ // improved = vatom & improver
+ //
+ // That is, treat any V bits to the right of the leftmost clz(atom)+1
+ // bits as "defined".
improved = assignNew('V', mce, ty,
binop(andOp, vatom, improver));
@@ -4705,6 +4866,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
case Iop_RecipEst32F0x4:
return unary32F0x4(mce, vatom);
+ // These are self-shadowing.
case Iop_32UtoV128:
case Iop_64UtoV128:
case Iop_Dup8x16:
@@ -4745,6 +4907,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
case Iop_MulI128by10Carry:
case Iop_F16toF64x2:
case Iop_F64toF16x2:
+ // FIXME JRS 2018-Nov-15. This is surely not correct!
return vatom;
case Iop_I32StoF128: /* signed I32 -> F128 */
@@ -4770,7 +4933,6 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
case Iop_RoundF64toF64_NegINF:
case Iop_RoundF64toF64_PosINF:
case Iop_RoundF64toF64_ZERO:
- case Iop_Clz64:
case Iop_D32toD64:
case Iop_I32StoD64:
case Iop_I32UtoD64:
@@ -4785,17 +4947,32 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
case Iop_D64toD128:
return mkPCastTo(mce, Ity_I128, vatom);
- case Iop_Clz32:
case Iop_TruncF64asF32:
case Iop_NegF32:
case Iop_AbsF32:
case Iop_F16toF32:
return mkPCastTo(mce, Ity_I32, vatom);
- case Iop_Ctz32:
- case Iop_Ctz64:
+ case Iop_Ctz32: case Iop_CtzNat32:
+ case Iop_Ctz64: case Iop_CtzNat64:
return expensiveCountTrailingZeroes(mce, op, atom, vatom);
+ case Iop_Clz32: case Iop_ClzNat32:
+ case Iop_Clz64: case Iop_ClzNat64:
+ return expensiveCountLeadingZeroes(mce, op, atom, vatom);
+
+ // PopCount32: this is slightly pessimistic. It is true that the
+ // result depends on all input bits, so that aspect of the PCast is
+ // correct. However, regardless of the input, only the lowest 5 bits
+ // out of the output can ever be undefined. So we could actually
+ // "improve" the results here by marking the top 27 bits of output as
+ // defined. A similar comment applies for PopCount64.
+ case Iop_PopCount32:
+ return mkPCastTo(mce, Ity_I32, vatom);
+ case Iop_PopCount64:
+ return mkPCastTo(mce, Ity_I64, vatom);
+
+ // These are self-shadowing.
case Iop_1Uto64:
case Iop_1Sto64:
case Iop_8Uto64:
@@ -4821,6 +4998,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
case Iop_V256to64_2: case Iop_V256to64_3:
return assignNew('V', mce, Ity_I64, unop(op, vatom));
+ // These are self-shadowing.
case Iop_64to32:
case Iop_64HIto32:
case Iop_1Uto32:
@@ -4830,8 +5008,10 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
case Iop_16Sto32:
case Iop_8Sto32:
case Iop_V128to32:
+ case Iop_Reverse8sIn32_x1:
return assignNew('V', mce, Ity_I32, unop(op, vatom));
+ // These are self-shadowing.
case Iop_8Sto16:
case Iop_8Uto16:
case Iop_32to16:
@@ -4840,6 +5020,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
case Iop_GetMSBs8x16:
return assignNew('V', mce, Ity_I16, unop(op, vatom));
+ // These are self-shadowing.
case Iop_1Uto8:
case Iop_1Sto8:
case Iop_16to8:
@@ -4868,6 +5049,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
case Iop_Not16:
case Iop_Not8:
case Iop_Not1:
+ // FIXME JRS 2018-Nov-15. This is surely not correct!
return vatom;
case Iop_CmpNEZ8x8:
@@ -4929,6 +5111,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
case Iop_Ctz64x2:
return mkPCast64x2(mce, vatom);
+ // This is self-shadowing.
case Iop_PwBitMtxXpose64x2:
return assignNew('V', mce, Ity_V128, unop(op, vatom));
diff --git a/memcheck/tests/vbit-test/irops.c b/memcheck/tests/vbit-test/irops.c
index bfd82fc..e8bf67d 100644
--- a/memcheck/tests/vbit-test/irops.c
+++ b/memcheck/tests/vbit-test/irops.c
@@ -111,6 +111,12 @@ static irop_t irops[] = {
{ DEFOP(Iop_Clz32, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 1, .arm = 1, .ppc64 = 1, .ppc32 = 1, .mips32 =1, .mips64 = 1 },
{ DEFOP(Iop_Ctz64, UNDEF_ALL), .s390x = 0, .amd64 = 1, .x86 = 0, .arm = 0, .ppc64 = 0, .ppc32 = 0, .mips32 =0, .mips64 = 0 },
{ DEFOP(Iop_Ctz32, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 1, .arm = 0, .ppc64 = 0, .ppc32 = 0, .mips32 =0, .mips64 = 0 },
+ { DEFOP(Iop_ClzNat64, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 0, .mips32 =0, .mips64 = 0 }, // ppc32 asserts
+ { DEFOP(Iop_ClzNat32, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 1, .mips32 =0, .mips64 = 0 },
+ { DEFOP(Iop_CtzNat64, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 0, .mips32 =0, .mips64 = 0 },
+ { DEFOP(Iop_CtzNat32, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 0, .ppc32 = 1, .mips32 =0, .mips64 = 0 },
+ { DEFOP(Iop_PopCount64, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 0, .mips32 =0, .mips64 = 0 },
+ { DEFOP(Iop_PopCount32, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 1, .mips32 =0, .mips64 = 0 },
{ DEFOP(Iop_CmpLT32S, UNDEF_ALL), .s390x = 1, .amd64 = 1, .x86 = 1, .arm = 1, .ppc64 = 1, .ppc32 = 1, .mips32 =1, .mips64 = 1 },
{ DEFOP(Iop_CmpLT64S, UNDEF_ALL), .s390x = 1, .amd64 = 1, .x86 = 0, .arm = 0, .ppc64 = 0, .ppc32 = 0, .mips32 =0, .mips64 = 1 }, // ppc, mips assert
{ DEFOP(Iop_CmpLE32S, UNDEF_ALL), .s390x = 1, .amd64 = 1, .x86 = 1, .arm = 1, .ppc64 = 1, .ppc32 = 1, .mips32 =1, .mips64 = 1 },
@@ -336,6 +342,7 @@ static irop_t irops[] = {
{ DEFOP(Iop_Sad8Ux4, UNDEF_UNKNOWN), },
{ DEFOP(Iop_CmpNEZ16x2, UNDEF_UNKNOWN), },
{ DEFOP(Iop_CmpNEZ8x4, UNDEF_UNKNOWN), },
+ { DEFOP(Iop_Reverse8sIn32_x1, UNDEF_UNKNOWN) },
/* ------------------ 64-bit SIMD FP ------------------------ */
{ DEFOP(Iop_I32UtoFx2, UNDEF_UNKNOWN), },
{ DEFOP(Iop_I32StoFx2, UNDEF_UNKNOWN), },
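To make the two "improver" masks above concrete, a self-contained check (plain C, helper name invented): x ^ (x-1) keeps exactly the rightmost 1 bit of x and everything below it, while ~(smear(x) >> 1) keeps the leftmost 1 bit and everything above it.

#include <assert.h>
#include <stdint.h>

static uint32_t right_smear32(uint32_t x)
{
   for (int i = 1; i <= 16; i *= 2) x |= x >> i;
   return x;
}

int main(void)
{
   uint32_t x = 0x00B0A100;                      /* arbitrary example   */
   uint32_t ctz_mask = x ^ (x - 1);              /* low ctz(x)+1 bits   */
   uint32_t clz_mask = ~(right_smear32(x) >> 1); /* high clz(x)+1 bits  */
   assert(ctz_mask == 0x000001FF);               /* lowest 1 is bit 8   */
   assert(clz_mask == 0xFF800000);               /* highest 1 is bit 23 */
   return 0;
}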

valgrind-3.14.0-new-strlen-IROps.patch

@@ -0,0 +1,124 @@
commit 4271989815b5fc933c1e29bc75507c2726dc3738
Author: Julian Seward <jseward@acm.org>
Date: Tue Nov 20 10:52:33 2018 +0100
Add some new IROps to support improved Memcheck analysis of strlen etc.
This is part of the fix for bug 386945. It adds the following IROps, plus
their supporting type- and printing- fragments:
Iop_Reverse8sIn32_x1: 32-bit byteswap. A fancy name, but it is consistent
with naming for the other swapping IROps that already exist.
Iop_PopCount64, Iop_PopCount32: population count
Iop_ClzNat64, Iop_ClzNat32, Iop_CtzNat64, Iop_CtzNat32: counting leading and
trailing zeroes, with "natural" (Nat) semantics for a zero input, meaning, in
the case of zero input, return the number of bits in the word. These
functionally overlap with the existing Iop_Clz64, Iop_Clz32, Iop_Ctz64,
Iop_Ctz32. The existing operations are undefined in case of a zero input.
Adding these new variants avoids the complexity of having to change the
declared semantics of the existing operations. Instead they are deprecated
but still available for use.
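In C terms (a sketch using GCC/Clang builtins; not patch code) the Nat variants behave like this:

#include <stdint.h>

/* "Natural" semantics: a zero input yields the word size, matching
   what the ppc cntlz/cnttz instructions return. */
static uint32_t clz_nat32(uint32_t x) { return x ? (uint32_t)__builtin_clz(x)   : 32; }
static uint32_t ctz_nat32(uint32_t x) { return x ? (uint32_t)__builtin_ctz(x)   : 32; }
static uint32_t clz_nat64(uint64_t x) { return x ? (uint32_t)__builtin_clzll(x) : 64; }
static uint32_t ctz_nat64(uint64_t x) { return x ? (uint32_t)__builtin_ctzll(x) : 64; }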
diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c
index 823b6be..3221033 100644
--- a/VEX/priv/ir_defs.c
+++ b/VEX/priv/ir_defs.c
@@ -194,6 +194,14 @@ void ppIROp ( IROp op )
case Iop_Ctz64: vex_printf("Ctz64"); return;
case Iop_Ctz32: vex_printf("Ctz32"); return;
+ case Iop_ClzNat64: vex_printf("ClzNat64"); return;
+ case Iop_ClzNat32: vex_printf("ClzNat32"); return;
+ case Iop_CtzNat64: vex_printf("CtzNat64"); return;
+ case Iop_CtzNat32: vex_printf("CtzNat32"); return;
+
+ case Iop_PopCount64: vex_printf("PopCount64"); return;
+ case Iop_PopCount32: vex_printf("PopCount32"); return;
+
case Iop_CmpLT32S: vex_printf("CmpLT32S"); return;
case Iop_CmpLE32S: vex_printf("CmpLE32S"); return;
case Iop_CmpLT32U: vex_printf("CmpLT32U"); return;
@@ -395,6 +403,7 @@ void ppIROp ( IROp op )
case Iop_CmpNEZ16x2: vex_printf("CmpNEZ16x2"); return;
case Iop_CmpNEZ8x4: vex_printf("CmpNEZ8x4"); return;
+ case Iop_Reverse8sIn32_x1: vex_printf("Reverse8sIn32_x1"); return;
case Iop_CmpF64: vex_printf("CmpF64"); return;
@@ -2719,6 +2728,7 @@ void typeOfPrimop ( IROp op,
UNARY(Ity_I16, Ity_I16);
case Iop_Not32:
case Iop_CmpNEZ16x2: case Iop_CmpNEZ8x4:
+ case Iop_Reverse8sIn32_x1:
UNARY(Ity_I32, Ity_I32);
case Iop_Not64:
@@ -2782,9 +2792,13 @@ void typeOfPrimop ( IROp op,
BINARY(Ity_I64,Ity_I64, Ity_I128);
case Iop_Clz32: case Iop_Ctz32:
+ case Iop_ClzNat32: case Iop_CtzNat32:
+ case Iop_PopCount32:
UNARY(Ity_I32, Ity_I32);
case Iop_Clz64: case Iop_Ctz64:
+ case Iop_ClzNat64: case Iop_CtzNat64:
+ case Iop_PopCount64:
UNARY(Ity_I64, Ity_I64);
case Iop_DivU32: case Iop_DivS32: case Iop_DivU32E: case Iop_DivS32E:
diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h
index 17bcb55..93fa5ac 100644
--- a/VEX/pub/libvex_ir.h
+++ b/VEX/pub/libvex_ir.h
@@ -452,12 +452,21 @@ typedef
Iop_MullS8, Iop_MullS16, Iop_MullS32, Iop_MullS64,
Iop_MullU8, Iop_MullU16, Iop_MullU32, Iop_MullU64,
- /* Wierdo integer stuff */
+ /* Counting bits */
+ /* Ctz64/Ctz32/Clz64/Clz32 are UNDEFINED when given arguments of zero.
+ You must ensure they are never given a zero argument. As of
+ 2018-Nov-14 they are deprecated. Try to use the Nat variants
+ immediately below, if you can.
+ */
Iop_Clz64, Iop_Clz32, /* count leading zeroes */
Iop_Ctz64, Iop_Ctz32, /* count trailing zeros */
- /* Ctz64/Ctz32/Clz64/Clz32 are UNDEFINED when given arguments of
- zero. You must ensure they are never given a zero argument.
- */
+ /* Count leading/trailing zeroes, with "natural" semantics for the
+ case where the input is zero: then the result is the number of bits
+ in the word. */
+ Iop_ClzNat64, Iop_ClzNat32,
+ Iop_CtzNat64, Iop_CtzNat32,
+ /* Population count -- compute the number of 1 bits in the argument. */
+ Iop_PopCount64, Iop_PopCount32,
/* Standard integer comparisons */
Iop_CmpLT32S, Iop_CmpLT64S,
@@ -831,6 +840,9 @@ typedef
/* MISC (vector integer cmp != 0) */
Iop_CmpNEZ16x2, Iop_CmpNEZ8x4,
+ /* Byte swap in a 32-bit word */
+ Iop_Reverse8sIn32_x1,
+
/* ------------------ 64-bit SIMD FP ------------------------ */
/* Convertion to/from int */
@@ -1034,8 +1046,9 @@ typedef
Iop_Slice64, // (I64, I64, I8) -> I64
/* REVERSE the order of chunks in vector lanes. Chunks must be
- smaller than the vector lanes (obviously) and so may be 8-,
- 16- and 32-bit in size. */
+ smaller than the vector lanes (obviously) and so may be 8-, 16- and
+ 32-bit in size. Note that the degenerate case,
+ Iop_Reverse8sIn64_x1, is simply a vanilla byte-swap. */
/* Examples:
Reverse8sIn16_x4([a,b,c,d,e,f,g,h]) = [b,a,d,c,f,e,h,g]
Reverse8sIn32_x2([a,b,c,d,e,f,g,h]) = [d,c,b,a,h,g,f,e]

valgrind-3.14.0-ppc-frontend-new-IROps.patch

@@ -0,0 +1,381 @@
commit 81d9832226d6e3d1ee78ee3133189d7b520e7eea
Author: Julian Seward <jseward@acm.org>
Date: Tue Nov 20 11:36:53 2018 +0100
ppc front end: use new IROps added in 42719898.
This pertains to bug 386945.
VEX/priv/guest_ppc_toIR.c:
gen_POPCOUNT: use Iop_PopCount{32,64} where possible.
gen_vpopcntd_mode32: use Iop_PopCount32.
for cntlz{w,d}, use Iop_CtzNat{32,64}.
gen_byterev32: use Iop_Reverse8sIn32_x1 instead of lengthy sequence.
verbose_Clz32: remove (was unused anyway).
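For reference, the two forms of gen_byterev32 compute the same 32-bit byteswap; in plain C (my rendering; the builtin stands in for the single IROp):

#include <stdint.h>

/* The shift-and-mask sequence the front end used to emit ... */
static uint32_t byterev32_long(uint32_t t)
{
   return (t << 24)
        | ((t << 8)  & 0x00FF0000u)
        | ((t >> 8)  & 0x0000FF00u)
        | ((t >> 24) & 0x000000FFu);
}

/* ... and the one-op form now expressed as Iop_Reverse8sIn32_x1. */
static uint32_t byterev32_short(uint32_t t)
{
   return __builtin_bswap32(t);
}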
diff --git a/VEX/priv/guest_ppc_toIR.c b/VEX/priv/guest_ppc_toIR.c
index cb1cae1..8977d4f 100644
--- a/VEX/priv/guest_ppc_toIR.c
+++ b/VEX/priv/guest_ppc_toIR.c
@@ -1595,7 +1595,8 @@ typedef enum {
/* Generate an IR sequence to do a popcount operation on the supplied
IRTemp, and return a new IRTemp holding the result. 'ty' may be
Ity_I32 or Ity_I64 only. */
-static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src, _popcount_data_type data_type )
+static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src,
+ _popcount_data_type data_type )
{
/* Do count across 2^data_type bits,
byte: data_type = 3
@@ -1611,6 +1612,22 @@ static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src, _popcount_data_type data_typ
vassert(ty == Ity_I64 || ty == Ity_I32);
+ // Use a single IROp in cases where we can.
+
+ if (ty == Ity_I64 && data_type == DWORD) {
+ IRTemp res = newTemp(Ity_I64);
+ assign(res, unop(Iop_PopCount64, mkexpr(src)));
+ return res;
+ }
+
+ if (ty == Ity_I32 && data_type == WORD) {
+ IRTemp res = newTemp(Ity_I32);
+ assign(res, unop(Iop_PopCount32, mkexpr(src)));
+ return res;
+ }
+
+ // For the rest, we have to do it the slow way.
+
if (ty == Ity_I32) {
for (idx = 0; idx < WORD; idx++) {
@@ -1638,7 +1655,7 @@ static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src, _popcount_data_type data_typ
return nyu;
}
-// else, ty == Ity_I64
+ // else, ty == Ity_I64
vassert(mode64);
for (i = 0; i < DWORD; i++) {
@@ -1670,52 +1687,15 @@ static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src, _popcount_data_type data_typ
*/
static IRTemp gen_vpopcntd_mode32 ( IRTemp src1, IRTemp src2 )
{
- Int i, shift[6];
- IRTemp mask[6];
- IRTemp old = IRTemp_INVALID;
- IRTemp nyu1 = IRTemp_INVALID;
- IRTemp nyu2 = IRTemp_INVALID;
IRTemp retval = newTemp(Ity_I64);
vassert(!mode64);
- for (i = 0; i < WORD; i++) {
- mask[i] = newTemp(Ity_I32);
- shift[i] = 1 << i;
- }
- assign(mask[0], mkU32(0x55555555));
- assign(mask[1], mkU32(0x33333333));
- assign(mask[2], mkU32(0x0F0F0F0F));
- assign(mask[3], mkU32(0x00FF00FF));
- assign(mask[4], mkU32(0x0000FFFF));
- old = src1;
- for (i = 0; i < WORD; i++) {
- nyu1 = newTemp(Ity_I32);
- assign(nyu1,
- binop(Iop_Add32,
- binop(Iop_And32,
- mkexpr(old),
- mkexpr(mask[i])),
- binop(Iop_And32,
- binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
- mkexpr(mask[i]))));
- old = nyu1;
- }
-
- old = src2;
- for (i = 0; i < WORD; i++) {
- nyu2 = newTemp(Ity_I32);
- assign(nyu2,
- binop(Iop_Add32,
- binop(Iop_And32,
- mkexpr(old),
- mkexpr(mask[i])),
- binop(Iop_And32,
- binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
- mkexpr(mask[i]))));
- old = nyu2;
- }
- assign(retval, unop(Iop_32Uto64, binop(Iop_Add32, mkexpr(nyu1), mkexpr(nyu2))));
+ assign(retval,
+ unop(Iop_32Uto64,
+ binop(Iop_Add32,
+ unop(Iop_PopCount32, mkexpr(src1)),
+ unop(Iop_PopCount32, mkexpr(src2)))));
return retval;
}
@@ -5715,7 +5695,7 @@ static Bool dis_modulo_int ( UInt theInstr )
rA_address, rS_address);
assign( rS, getIReg( rS_address ) );
- assign( result, unop( Iop_Ctz32,
+ assign( result, unop( Iop_CtzNat32,
unop( Iop_64to32, mkexpr( rS ) ) ) );
assign( rA, binop( Iop_32HLto64, mkU32( 0 ), mkexpr( result ) ) );
@@ -5746,7 +5726,7 @@ static Bool dis_modulo_int ( UInt theInstr )
rA_address, rS_address);
assign( rS, getIReg( rS_address ) );
- assign( rA, unop( Iop_Ctz64, mkexpr( rS ) ) );
+ assign( rA, unop( Iop_CtzNat64, mkexpr( rS ) ) );
if ( flag_rC == 1 )
set_CR0( mkexpr( rA ) );
@@ -6307,7 +6287,6 @@ static Bool dis_int_logic ( UInt theInstr )
IRTemp rS = newTemp(ty);
IRTemp rA = newTemp(ty);
IRTemp rB = newTemp(ty);
- IRExpr* irx;
Bool do_rc = False;
assign( rS, getIReg(rS_addr) );
@@ -6404,26 +6383,16 @@ static Bool dis_int_logic ( UInt theInstr )
break;
case 0x01A: { // cntlzw (Count Leading Zeros Word, PPC32 p371)
- IRExpr* lo32;
if (rB_addr!=0) {
vex_printf("dis_int_logic(ppc)(cntlzw,rB_addr)\n");
return False;
}
- DIP("cntlzw%s r%u,r%u\n",
- flag_rC ? ".":"", rA_addr, rS_addr);
+ DIP("cntlzw%s r%u,r%u\n", flag_rC ? ".":"", rA_addr, rS_addr);
// mode64: count in low word only
- lo32 = mode64 ? unop(Iop_64to32, mkexpr(rS)) : mkexpr(rS);
-
- // Iop_Clz32 undefined for arg==0, so deal with that case:
- irx = binop(Iop_CmpNE32, lo32, mkU32(0));
- assign(rA, mkWidenFrom32(ty,
- IRExpr_ITE( irx,
- unop(Iop_Clz32, lo32),
- mkU32(32)),
- False));
-
- // TODO: alternatively: assign(rA, verbose_Clz32(rS));
+ IRExpr* lo32 = mode64 ? unop(Iop_64to32, mkexpr(rS)) : mkexpr(rS);
+ IRExpr* res32 = unop(Iop_ClzNat32, lo32);
+ assign(rA, mode64 ? unop(Iop_32Uto64, res32) : res32);
break;
}
@@ -6521,14 +6490,8 @@ static Bool dis_int_logic ( UInt theInstr )
vex_printf("dis_int_logic(ppc)(cntlzd,rB_addr)\n");
return False;
}
- DIP("cntlzd%s r%u,r%u\n",
- flag_rC ? ".":"", rA_addr, rS_addr);
- // Iop_Clz64 undefined for arg==0, so deal with that case:
- irx = binop(Iop_CmpNE64, mkexpr(rS), mkU64(0));
- assign(rA, IRExpr_ITE( irx,
- unop(Iop_Clz64, mkexpr(rS)),
- mkU64(64) ));
- // TODO: alternatively: assign(rA, verbose_Clz64(rS));
+ DIP("cntlzd%s r%u,r%u\n", flag_rC ? ".":"", rA_addr, rS_addr);
+ assign(rA, unop(Iop_ClzNat64, mkexpr(rS)));
break;
case 0x1FC: // cmpb (Power6: compare bytes)
@@ -6574,8 +6537,9 @@ static Bool dis_int_logic ( UInt theInstr )
putFReg( rS_addr, mkexpr(frA));
return True;
}
- case 0x1FA: // popcntd (population count doubleword
+ case 0x1FA: // popcntd (population count doubleword)
{
+ vassert(mode64);
DIP("popcntd r%u,r%u\n", rA_addr, rS_addr);
IRTemp result = gen_POPCOUNT(ty, rS, DWORD);
putIReg( rA_addr, mkexpr(result) );
@@ -9154,18 +9118,7 @@ static Bool dis_int_shift ( UInt theInstr )
static IRExpr* /* :: Ity_I32 */ gen_byterev32 ( IRTemp t )
{
vassert(typeOfIRTemp(irsb->tyenv, t) == Ity_I32);
- return
- binop(Iop_Or32,
- binop(Iop_Shl32, mkexpr(t), mkU8(24)),
- binop(Iop_Or32,
- binop(Iop_And32, binop(Iop_Shl32, mkexpr(t), mkU8(8)),
- mkU32(0x00FF0000)),
- binop(Iop_Or32,
- binop(Iop_And32, binop(Iop_Shr32, mkexpr(t), mkU8(8)),
- mkU32(0x0000FF00)),
- binop(Iop_And32, binop(Iop_Shr32, mkexpr(t), mkU8(24)),
- mkU32(0x000000FF) )
- )));
+ return unop(Iop_Reverse8sIn32_x1, mkexpr(t));
}
/* Generates code to swap the byte order in the lower half of an Ity_I32,
@@ -9225,6 +9178,10 @@ static Bool dis_int_ldst_rev ( UInt theInstr )
case 0x214: // ldbrx (Load Doubleword Byte-Reverse Indexed)
{
+ // JRS FIXME:
+ // * is the host_endness conditional below actually necessary?
+ // * can we just do a 64-bit load followed by Iop_Reverse8sIn64_x1?
+ // That would be a lot more efficient.
IRExpr * nextAddr;
IRTemp w3 = newTemp( Ity_I32 );
IRTemp w4 = newTemp( Ity_I32 );
@@ -17056,8 +17013,8 @@ dis_av_count_bitTranspose ( UInt theInstr, UInt opc2 )
case 0x7C3: // vpopcntd
{
if (mode64) {
- /* Break vector into 64-bit double words and do the population count
- * on each double word.
+ /* Break vector into 64-bit double words and do the population
+ count on each double word.
*/
IRType ty = Ity_I64;
IRTemp bits0_63 = newTemp(Ity_I64);
@@ -17077,15 +17034,16 @@ dis_av_count_bitTranspose ( UInt theInstr, UInt opc2 )
mkexpr( cnt_bits0_63 ) ) );
} else {
/* Break vector into 32-bit words and do the population count
- * on each doubleword.
+ on each 32-bit word.
*/
IRTemp bits0_31, bits32_63, bits64_95, bits96_127;
bits0_31 = bits32_63 = bits64_95 = bits96_127 = IRTemp_INVALID;
- IRTemp cnt_bits0_63 = newTemp(Ity_I64);
+ IRTemp cnt_bits0_63 = newTemp(Ity_I64);
IRTemp cnt_bits64_127 = newTemp(Ity_I64);
DIP("vpopcntd v%d,v%d\n", vRT_addr, vRB_addr);
- breakV128to4x32(mkexpr( vB), &bits96_127, &bits64_95, &bits32_63, &bits0_31 );
+ breakV128to4x32(mkexpr( vB), &bits96_127, &bits64_95,
+ &bits32_63, &bits0_31 );
cnt_bits0_63 = gen_vpopcntd_mode32(bits0_31, bits32_63);
cnt_bits64_127 = gen_vpopcntd_mode32(bits64_95, bits96_127);
@@ -29103,10 +29061,12 @@ DisResult disInstr_PPC_WRK (
/* Miscellaneous ISA 2.06 instructions */
case 0x1FA: // popcntd
+ if (!mode64) goto decode_failure;
+ /* else fallthru */
case 0x17A: // popcntw
case 0x7A: // popcntb
- if (dis_int_logic( theInstr )) goto decode_success;
- goto decode_failure;
+ if (dis_int_logic( theInstr )) goto decode_success;
+ goto decode_failure;
case 0x0FC: // bpermd
if (!mode64) goto decode_failure;
@@ -29669,94 +29629,6 @@ DisResult disInstr_PPC ( IRSB* irsb_IN,
return dres;
}
-
-/*------------------------------------------------------------*/
-/*--- Unused stuff ---*/
-/*------------------------------------------------------------*/
-
-///* A potentially more memcheck-friendly implementation of Clz32, with
-// the boundary case Clz32(0) = 32, which is what ppc requires. */
-//
-//static IRExpr* /* :: Ity_I32 */ verbose_Clz32 ( IRTemp arg )
-//{
-// /* Welcome ... to SSA R Us. */
-// IRTemp n1 = newTemp(Ity_I32);
-// IRTemp n2 = newTemp(Ity_I32);
-// IRTemp n3 = newTemp(Ity_I32);
-// IRTemp n4 = newTemp(Ity_I32);
-// IRTemp n5 = newTemp(Ity_I32);
-// IRTemp n6 = newTemp(Ity_I32);
-// IRTemp n7 = newTemp(Ity_I32);
-// IRTemp n8 = newTemp(Ity_I32);
-// IRTemp n9 = newTemp(Ity_I32);
-// IRTemp n10 = newTemp(Ity_I32);
-// IRTemp n11 = newTemp(Ity_I32);
-// IRTemp n12 = newTemp(Ity_I32);
-//
-// /* First, propagate the most significant 1-bit into all lower
-// positions in the word. */
-// /* unsigned int clz ( unsigned int n )
-// {
-// n |= (n >> 1);
-// n |= (n >> 2);
-// n |= (n >> 4);
-// n |= (n >> 8);
-// n |= (n >> 16);
-// return bitcount(~n);
-// }
-// */
-// assign(n1, mkexpr(arg));
-// assign(n2, binop(Iop_Or32, mkexpr(n1), binop(Iop_Shr32, mkexpr(n1), mkU8(1))));
-// assign(n3, binop(Iop_Or32, mkexpr(n2), binop(Iop_Shr32, mkexpr(n2), mkU8(2))));
-// assign(n4, binop(Iop_Or32, mkexpr(n3), binop(Iop_Shr32, mkexpr(n3), mkU8(4))));
-// assign(n5, binop(Iop_Or32, mkexpr(n4), binop(Iop_Shr32, mkexpr(n4), mkU8(8))));
-// assign(n6, binop(Iop_Or32, mkexpr(n5), binop(Iop_Shr32, mkexpr(n5), mkU8(16))));
-// /* This gives a word of the form 0---01---1. Now invert it, giving
-// a word of the form 1---10---0, then do a population-count idiom
-// (to count the 1s, which is the number of leading zeroes, or 32
-// if the original word was 0. */
-// assign(n7, unop(Iop_Not32, mkexpr(n6)));
-//
-// /* unsigned int bitcount ( unsigned int n )
-// {
-// n = n - ((n >> 1) & 0x55555555);
-// n = (n & 0x33333333) + ((n >> 2) & 0x33333333);
-// n = (n + (n >> 4)) & 0x0F0F0F0F;
-// n = n + (n >> 8);
-// n = (n + (n >> 16)) & 0x3F;
-// return n;
-// }
-// */
-// assign(n8,
-// binop(Iop_Sub32,
-// mkexpr(n7),
-// binop(Iop_And32,
-// binop(Iop_Shr32, mkexpr(n7), mkU8(1)),
-// mkU32(0x55555555))));
-// assign(n9,
-// binop(Iop_Add32,
-// binop(Iop_And32, mkexpr(n8), mkU32(0x33333333)),
-// binop(Iop_And32,
-// binop(Iop_Shr32, mkexpr(n8), mkU8(2)),
-// mkU32(0x33333333))));
-// assign(n10,
-// binop(Iop_And32,
-// binop(Iop_Add32,
-// mkexpr(n9),
-// binop(Iop_Shr32, mkexpr(n9), mkU8(4))),
-// mkU32(0x0F0F0F0F)));
-// assign(n11,
-// binop(Iop_Add32,
-// mkexpr(n10),
-// binop(Iop_Shr32, mkexpr(n10), mkU8(8))));
-// assign(n12,
-// binop(Iop_Add32,
-// mkexpr(n11),
-// binop(Iop_Shr32, mkexpr(n11), mkU8(16))));
-// return
-// binop(Iop_And32, mkexpr(n12), mkU32(0x3F));
-//}
-
/*--------------------------------------------------------------------*/
/*--- end guest_ppc_toIR.c ---*/
/*--------------------------------------------------------------------*/
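The bitcount() idiom quoted in the deleted verbose_Clz32 comment above, as a standalone C function (the same masks gen_vpopcntd_mode32 used before this commit):

#include <stdint.h>

static uint32_t popcount32_slow(uint32_t n)
{
   n = n - ((n >> 1) & 0x55555555u);
   n = (n & 0x33333333u) + ((n >> 2) & 0x33333333u);
   n = (n + (n >> 4)) & 0x0F0F0F0Fu;
   n = n + (n >> 8);
   n = (n + (n >> 16)) & 0x3Fu;
   return n;
}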

valgrind-3.14.0-ppc-instr-new-IROps.patch

@@ -0,0 +1,257 @@
commit 97d336b79e36f6c99d8b07f49ebc9b780e6df84e
Author: Julian Seward <jseward@acm.org>
Date: Tue Nov 20 11:07:37 2018 +0100
Add ppc host-side isel and instruction support for IROps added in previous commit.
VEX/priv/host_ppc_defs.c, VEX/priv/host_ppc_defs.h:
Don't emit cnttz{w,d}; we may be running on a target which doesn't support
them. Instead we can generate a fairly reasonable alternative sequence using
cntlz{w,d}.
Add support for emitting popcnt{w,d}.
VEX/priv/host_ppc_isel.c
Add support for: Iop_ClzNat32 Iop_ClzNat64
Redo support for: Iop_Ctz{32,64} and their Nat equivalents, so as to not use
cnttz{w,d}, as mentioned above.
Add support for: Iop_PopCount64 Iop_PopCount32 Iop_Reverse8sIn32_x1
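The Ctz replacement leans on the identity ctz(x) = wordsize - clz(~x & (x-1)), spelled out in the isel comment below; a quick C check (sketch; GCC/Clang builtin assumed):

#include <assert.h>
#include <stdint.h>

/* ctz via clz, matching the six-instruction sequence generated below:
   ~x & (x-1) has ones exactly below the lowest set bit of x. */
static uint32_t ctz_via_clz64(uint64_t x)
{
   uint64_t m   = ~x & (x - 1);
   uint32_t clz = (m == 0) ? 64 : (uint32_t)__builtin_clzll(m);
   return 64 - clz;
}

int main(void)
{
   assert(ctz_via_clz64(1)    == 0);
   assert(ctz_via_clz64(0x10) == 4);
   assert(ctz_via_clz64(0)    == 64); /* CtzNat semantics */
   return 0;
}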
diff --git a/VEX/priv/host_ppc_defs.c b/VEX/priv/host_ppc_defs.c
index b073c1d..f4b52e4 100644
--- a/VEX/priv/host_ppc_defs.c
+++ b/VEX/priv/host_ppc_defs.c
@@ -501,9 +501,9 @@ const HChar* showPPCUnaryOp ( PPCUnaryOp op ) {
case Pun_NEG: return "neg";
case Pun_CLZ32: return "cntlzw";
case Pun_CLZ64: return "cntlzd";
- case Pun_CTZ32: return "cnttzw";
- case Pun_CTZ64: return "cnttzd";
case Pun_EXTSW: return "extsw";
+ case Pun_POP32: return "popcntw";
+ case Pun_POP64: return "popcntd";
default: vpanic("showPPCUnaryOp");
}
}
@@ -4265,20 +4265,19 @@ Int emit_PPCInstr ( /*MB_MOD*/Bool* is_profInc,
vassert(mode64);
p = mkFormX(p, 31, r_src, r_dst, 0, 58, 0, endness_host);
break;
- case Pun_CTZ32: // cnttzw r_dst, r_src
- /* Note oder of src and dst is backwards from normal */
- p = mkFormX(p, 31, r_src, r_dst, 0, 538, 0, endness_host);
- break;
- case Pun_CTZ64: // cnttzd r_dst, r_src
- /* Note oder of src and dst is backwards from normal */
- vassert(mode64);
- p = mkFormX(p, 31, r_src, r_dst, 0, 570, 0, endness_host);
- break;
case Pun_EXTSW: // extsw r_dst, r_src
vassert(mode64);
p = mkFormX(p, 31, r_src, r_dst, 0, 986, 0, endness_host);
break;
- default: goto bad;
+ case Pun_POP32: // popcntw r_dst, r_src
+ p = mkFormX(p, 31, r_src, r_dst, 0, 378, 0, endness_host);
+ break;
+ case Pun_POP64: // popcntd r_dst, r_src
+ vassert(mode64);
+ p = mkFormX(p, 31, r_src, r_dst, 0, 506, 0, endness_host);
+ break;
+ default:
+ goto bad;
}
goto done;
}
diff --git a/VEX/priv/host_ppc_defs.h b/VEX/priv/host_ppc_defs.h
index 17baff5..321fba9 100644
--- a/VEX/priv/host_ppc_defs.h
+++ b/VEX/priv/host_ppc_defs.h
@@ -291,9 +291,9 @@ typedef
Pun_NOT,
Pun_CLZ32,
Pun_CLZ64,
- Pun_CTZ32,
- Pun_CTZ64,
- Pun_EXTSW
+ Pun_EXTSW,
+ Pun_POP32, // popcntw
+ Pun_POP64 // popcntd
}
PPCUnaryOp;
diff --git a/VEX/priv/host_ppc_isel.c b/VEX/priv/host_ppc_isel.c
index 6bdb5f7..5242176 100644
--- a/VEX/priv/host_ppc_isel.c
+++ b/VEX/priv/host_ppc_isel.c
@@ -2065,12 +2065,15 @@ static HReg iselWordExpr_R_wrk ( ISelEnv* env, const IRExpr* e,
return r_dst;
}
break;
- case Iop_Clz32:
- case Iop_Clz64: {
+
+ case Iop_Clz32: case Iop_ClzNat32:
+ case Iop_Clz64: case Iop_ClzNat64: {
+ // cntlz is available even in the most basic (earliest) ppc
+ // variants, so it's safe to generate it unconditionally.
HReg r_src, r_dst;
- PPCUnaryOp op_clz = (op_unop == Iop_Clz32) ? Pun_CLZ32 :
- Pun_CLZ64;
- if (op_unop == Iop_Clz64 && !mode64)
+ PPCUnaryOp op_clz = (op_unop == Iop_Clz32 || op_unop == Iop_ClzNat32)
+ ? Pun_CLZ32 : Pun_CLZ64;
+ if ((op_unop == Iop_Clz64 || op_unop == Iop_ClzNat64) && !mode64)
goto irreducible;
/* Count leading zeroes. */
r_dst = newVRegI(env);
@@ -2079,18 +2082,133 @@ static HReg iselWordExpr_R_wrk ( ISelEnv* env, const IRExpr* e,
return r_dst;
}
- case Iop_Ctz32:
- case Iop_Ctz64: {
- HReg r_src, r_dst;
- PPCUnaryOp op_clz = (op_unop == Iop_Ctz32) ? Pun_CTZ32 :
- Pun_CTZ64;
- if (op_unop == Iop_Ctz64 && !mode64)
- goto irreducible;
- /* Count trailing zeroes. */
- r_dst = newVRegI(env);
- r_src = iselWordExpr_R(env, e->Iex.Unop.arg, IEndianess);
- addInstr(env, PPCInstr_Unary(op_clz,r_dst,r_src));
- return r_dst;
+ //case Iop_Ctz32:
+ case Iop_CtzNat32:
+ //case Iop_Ctz64:
+ case Iop_CtzNat64:
+ {
+ // Generate code using Clz, because we can't assume the host has
+ // Ctz. In particular, part of the fix for bug 386945 involves
+ // creating a Ctz in ir_opt.c from smaller fragments.
+ PPCUnaryOp op_clz = Pun_CLZ64;
+ Int WS = 64;
+ if (op_unop == Iop_Ctz32 || op_unop == Iop_CtzNat32) {
+ op_clz = Pun_CLZ32;
+ WS = 32;
+ }
+ /* Compute ctz(arg) = wordsize - clz(~arg & (arg - 1)), thusly:
+ t1 = arg - 1
+ t2 = not arg
+ t2 = t2 & t1
+ t2 = clz t2
+ t1 = WS
+ t2 = t1 - t2
+ // result in t2
+ */
+ HReg arg = iselWordExpr_R(env, e->Iex.Unop.arg, IEndianess);
+ HReg t1 = newVRegI(env);
+ HReg t2 = newVRegI(env);
+ addInstr(env, PPCInstr_Alu(Palu_SUB, t1, arg, PPCRH_Imm(True, 1)));
+ addInstr(env, PPCInstr_Unary(Pun_NOT, t2, arg));
+ addInstr(env, PPCInstr_Alu(Palu_AND, t2, t2, PPCRH_Reg(t1)));
+ addInstr(env, PPCInstr_Unary(op_clz, t2, t2));
+ addInstr(env, PPCInstr_LI(t1, WS, False/*!64-bit imm*/));
+ addInstr(env, PPCInstr_Alu(Palu_SUB, t2, t1, PPCRH_Reg(t2)));
+ return t2;
+ }
+
+ case Iop_PopCount64: {
+ // popcnt{w,d} is only available in later arch revs (ISA 3.0,
+ // maybe) so it's not really correct to emit it here without a caps
+ // check for the host.
+ if (mode64) {
+ HReg r_dst = newVRegI(env);
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg, IEndianess);
+ addInstr(env, PPCInstr_Unary(Pun_POP64, r_dst, r_src));
+ return r_dst;
+ }
+ // We don't expect to be required to handle this in 32-bit mode.
+ break;
+ }
+
+ case Iop_PopCount32: {
+ // Similar comment as for Ctz just above applies -- we really
+ // should have a caps check here.
+
+ HReg r_dst = newVRegI(env);
+ // This actually generates popcntw, which in 64 bit mode does a
+ // 32-bit count individually for both low and high halves of the
+ // word. Per the comment at the top of iselIntExpr_R, in the 64
+ // bit mode case, the user of this result is required to ignore
+ // the upper 32 bits of the result. In 32 bit mode this is all
+ // moot. It is however unclear from the PowerISA 3.0 docs that
+ // the instruction exists in 32 bit mode; however our own front
+ // end (guest_ppc_toIR.c) accepts it, so I guess it does exist.
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg, IEndianess);
+ addInstr(env, PPCInstr_Unary(Pun_POP32, r_dst, r_src));
+ return r_dst;
+ }
+
+ case Iop_Reverse8sIn32_x1: {
+ // A bit of a mouthful, but simply .. 32-bit byte swap.
+ // This is pretty rubbish code. We could do vastly better if
+ // rotates, and better, rotate-inserts, were allowed. Note that
+ // even on a 64 bit target, the right shifts must be done as 32-bit
+ // so as to introduce zero bits in the right places. So it seems
+ // simplest to do the whole sequence in 32-bit insns.
+ /*
+ r = <argument> // working temporary, initial byte order ABCD
+ Mask = 00FF00FF
+ nMask = not Mask
+ tHi = and r, Mask
+ tHi = shl tHi, 8
+ tLo = and r, nMask
+ tLo = shr tLo, 8
+ r = or tHi, tLo // now r has order BADC
+ and repeat for 16 bit chunks ..
+ Mask = 0000FFFF
+ nMask = not Mask
+ tHi = and r, Mask
+ tHi = shl tHi, 16
+ tLo = and r, nMask
+ tLo = shr tLo, 16
+ r = or tHi, tLo // now r has order DCBA
+ */
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg, IEndianess);
+ HReg rr = newVRegI(env);
+ HReg rMask = newVRegI(env);
+ HReg rnMask = newVRegI(env);
+ HReg rtHi = newVRegI(env);
+ HReg rtLo = newVRegI(env);
+ // Copy r_src since we need to modify it
+ addInstr(env, mk_iMOVds_RR(rr, r_src));
+ // Swap within 16-bit lanes
+ addInstr(env, PPCInstr_LI(rMask, 0x00FF00FFULL,
+ False/* !64bit imm*/));
+ addInstr(env, PPCInstr_Unary(Pun_NOT, rnMask, rMask));
+ addInstr(env, PPCInstr_Alu(Palu_AND, rtHi, rr, PPCRH_Reg(rMask)));
+ addInstr(env, PPCInstr_Shft(Pshft_SHL, True/*32 bit shift*/,
+ rtHi, rtHi,
+ PPCRH_Imm(False/*!signed imm*/, 8)));
+ addInstr(env, PPCInstr_Alu(Palu_AND, rtLo, rr, PPCRH_Reg(rnMask)));
+ addInstr(env, PPCInstr_Shft(Pshft_SHR, True/*32 bit shift*/,
+ rtLo, rtLo,
+ PPCRH_Imm(False/*!signed imm*/, 8)));
+ addInstr(env, PPCInstr_Alu(Palu_OR, rr, rtHi, PPCRH_Reg(rtLo)));
+ // And now swap the two 16-bit chunks
+ addInstr(env, PPCInstr_LI(rMask, 0x0000FFFFULL,
+ False/* !64bit imm*/));
+ addInstr(env, PPCInstr_Unary(Pun_NOT, rnMask, rMask));
+ addInstr(env, PPCInstr_Alu(Palu_AND, rtHi, rr, PPCRH_Reg(rMask)));
+ addInstr(env, PPCInstr_Shft(Pshft_SHL, True/*32 bit shift*/,
+ rtHi, rtHi,
+ PPCRH_Imm(False/*!signed imm*/, 16)));
+ addInstr(env, PPCInstr_Alu(Palu_AND, rtLo, rr, PPCRH_Reg(rnMask)));
+ addInstr(env, PPCInstr_Shft(Pshft_SHR, True/*32 bit shift*/,
+ rtLo, rtLo,
+ PPCRH_Imm(False/*!signed imm*/, 16)));
+ addInstr(env, PPCInstr_Alu(Palu_OR, rr, rtHi, PPCRH_Reg(rtLo)));
+ return rr;
}
case Iop_Left8:

valgrind-3.14.0-transform-popcount64-ctznat64.patch

@@ -0,0 +1,82 @@
commit cb5d7e047598bff6d0f1d707a70d9fb1a1c7f0e2
Author: Julian Seward <jseward@acm.org>
Date: Tue Nov 20 11:46:55 2018 +0100
VEX/priv/ir_opt.c
fold_Expr: transform PopCount64(And64(Add64(x,-1),Not64(x))) into CtzNat64(x).
This is part of the fix for bug 386945.
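Why the fold is sound, including at x == 0 (a C spot-check, not patch code):

#include <assert.h>
#include <stdint.h>

int main(void)
{
   /* (x-1) & ~x sets exactly the bits below the lowest 1 bit of x, so
      its population count is ctz(x).  For x == 0 it is all-ones, giving
      64 -- precisely the CtzNat64 semantics. */
   for (uint64_t x = 0; x < 4096; x++) {
      uint64_t ctznat = x ? (uint64_t)__builtin_ctzll(x) : 64;
      assert((uint64_t)__builtin_popcountll((x - 1) & ~x) == ctznat);
   }
   return 0;
}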
diff --git a/VEX/priv/ir_opt.c b/VEX/priv/ir_opt.c
index f40870b..23964be 100644
--- a/VEX/priv/ir_opt.c
+++ b/VEX/priv/ir_opt.c
@@ -1377,6 +1377,8 @@ static IRExpr* fold_Expr ( IRExpr** env, IRExpr* e )
case Iex_Unop:
/* UNARY ops */
if (e->Iex.Unop.arg->tag == Iex_Const) {
+
+ /* cases where the arg is a const */
switch (e->Iex.Unop.op) {
case Iop_1Uto8:
e2 = IRExpr_Const(IRConst_U8(toUChar(
@@ -1690,8 +1692,56 @@ static IRExpr* fold_Expr ( IRExpr** env, IRExpr* e )
default:
goto unhandled;
- }
- }
+ } // switch (e->Iex.Unop.op)
+
+ } else {
+
+ /* other cases (identities, etc) */
+ switch (e->Iex.Unop.op) {
+ case Iop_PopCount64: {
+ // PopCount64( And64( Add64(x,-1), Not64(x) ) ) ==> CtzNat64(x)
+ // bindings:
+ // a1:And64( a11:Add64(a111:x,a112:-1), a12:Not64(a121:x) )
+ IRExpr* a1 = chase(env, e->Iex.Unop.arg);
+ if (!a1)
+ goto nomatch;
+ if (a1->tag != Iex_Binop || a1->Iex.Binop.op != Iop_And64)
+ goto nomatch;
+ // a1 is established
+ IRExpr* a11 = chase(env, a1->Iex.Binop.arg1);
+ if (!a11)
+ goto nomatch;
+ if (a11->tag != Iex_Binop || a11->Iex.Binop.op != Iop_Add64)
+ goto nomatch;
+ // a11 is established
+ IRExpr* a12 = chase(env, a1->Iex.Binop.arg2);
+ if (!a12)
+ goto nomatch;
+ if (a12->tag != Iex_Unop || a12->Iex.Unop.op != Iop_Not64)
+ goto nomatch;
+ // a12 is established
+ IRExpr* a111 = a11->Iex.Binop.arg1;
+ IRExpr* a112 = chase(env, a11->Iex.Binop.arg2);
+ IRExpr* a121 = a12->Iex.Unop.arg;
+ if (!a111 || !a112 || !a121)
+ goto nomatch;
+ // a111 and a121 need to be the same temp.
+ if (!eqIRAtom(a111, a121))
+ goto nomatch;
+ // Finally, a112 must be a 64-bit version of -1.
+ if (!isOnesU(a112))
+ goto nomatch;
+ // Match established. Transform.
+ e2 = IRExpr_Unop(Iop_CtzNat64, a111);
+ break;
+ nomatch:
+ break;
+ }
+ default:
+ break;
+ } // switch (e->Iex.Unop.op)
+
+ } // if (e->Iex.Unop.arg->tag == Iex_Const)
break;
case Iex_Binop:

valgrind.spec

@@ -3,7 +3,7 @@
Summary: Tool for finding memory management bugs in programs
Name: %{?scl_prefix}valgrind
Version: 3.14.0
-Release: 3%{?dist}
+Release: 4%{?dist}
Epoch: 1
License: GPLv2+
URL: http://www.valgrind.org/
@@ -119,6 +119,15 @@ Patch8: valgrind-3.14.0-s390x-vec-float-point-tests.patch
# KDE#401277 More bugs in z13 support
Patch9: valgrind-3.14.0-s390z-more-z13-fixes.patch
+# KDE#386945 Bogus memcheck errors on ppc64(le) when using strcmp
+Patch10: valgrind-3.14.0-get_otrack_shadow_offset_wrk-ppc.patch
+Patch11: valgrind-3.14.0-new-strlen-IROps.patch
+Patch12: valgrind-3.14.0-ppc-instr-new-IROps.patch
+Patch13: valgrind-3.14.0-memcheck-new-IROps.patch
+Patch14: valgrind-3.14.0-ppc-frontend-new-IROps.patch
+Patch15: valgrind-3.14.0-transform-popcount64-ctznat64.patch
+Patch16: valgrind-3.14.0-enable-ppc-Iop_Sar_Shr8.patch
%if %{build_multilib}
# Ensure glibc{,-devel} is installed for both multilib arches
BuildRequires: /lib/libc.so.6 /usr/lib/libc.so /lib64/libc.so.6 /usr/lib64/libc.so
@@ -260,6 +269,13 @@ Valgrind User Manual for details.
%patch7 -p1
%patch8 -p1
%patch9 -p1
+%patch10 -p1
+%patch11 -p1
+%patch12 -p1
+%patch13 -p1
+%patch14 -p1
+%patch15 -p1
+%patch16 -p1
%build
CC=gcc
@@ -494,6 +510,15 @@ fi
%endif
%changelog
+* Fri Nov 23 2018 Mark Wielaard <mjw@fedoraproject.org> - 3.14.0-4
+- Add valgrind-3.14.0-get_otrack_shadow_offset_wrk-ppc.patch,
+  valgrind-3.14.0-new-strlen-IROps.patch,
+  valgrind-3.14.0-ppc-instr-new-IROps.patch,
+  valgrind-3.14.0-memcheck-new-IROps.patch,
+  valgrind-3.14.0-ppc-frontend-new-IROps.patch,
+  valgrind-3.14.0-transform-popcount64-ctznat64.patch and
+  valgrind-3.14.0-enable-ppc-Iop_Sar_Shr8.patch (#1652926)
* Wed Nov 21 2018 Mark Wielaard <mjw@fedoraproject.org> - 3.14.0-3
- Add valgrind-3.14.0-s390z-more-z13-fixes.patch.