From: <sv...@va...> - 2005-04-26 23:49:30
Author: sewardj
Date: 2005-04-27 00:49:24 +0100 (Wed, 27 Apr 2005)
New Revision: 3572
Modified:
trunk/memcheck/mc_translate.c
Log:
* Modify the instrumenter to use the new primops introduced in
  vex rev 1144.

* Observe that mkLazy2 generates IR which often turns into
  long and slow code sequences in the back end, primarily because
  PCast operations are expensive.  Add a couple of special
  cases which give noticeably better performance when handling
  FP-intensive code on x86.
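
To see why the special cases pay off, here is a small standalone C model
of the V-bit arithmetic involved -- a sketch only, with made-up names
(pcast_64to32 and friends stand in for IR-level PCasts, they are not
Memcheck functions).  A V bit of 1 means "undefined", UifU
("undefined if either undefined") is bitwise OR, and a pessimising cast
turns "any bit undefined" into "all bits undefined":

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Pessimising casts: if any input bit is undefined (1), every
   output bit becomes undefined. */
static uint32_t pcast_64to32 ( uint64_t v ) { return v ? ~0U : 0U; }
static uint64_t pcast_32to64 ( uint32_t v ) { return v ? ~0ULL : 0ULL; }
static uint64_t pcast_64to64 ( uint64_t v ) { return v ? ~0ULL : 0ULL; }

int main ( void )
{
   uint64_t va1 = 0x100; /* shadow of operand 1: one undefined bit */
   uint64_t va2 = 0;     /* shadow of operand 2: fully defined */

   /* General path: PCast each operand to I32, UifU (OR) them,
      then PCast to the final type. */
   uint64_t general = pcast_32to64( pcast_64to32(va1) | pcast_64to32(va2) );

   /* Special-cased I64 x I64 -> I64 path: UifU first, then a
      single PCast. */
   uint64_t special = pcast_64to64( va1 | va2 );

   assert(general == special);  /* same all-or-nothing verdict */
   printf("general=%016llx special=%016llx\n",
          (unsigned long long)general, (unsigned long long)special);
   return 0;
}

Both routes give the same result, but the special-cased route performs a
single pessimising cast where the general route casts each operand and
then the combined result.  This is the equivalence the new mkLazy2
special cases below rely on.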
Modified: trunk/memcheck/mc_translate.c
===================================================================
--- trunk/memcheck/mc_translate.c 2005-04-26 08:13:24 UTC (rev 3571)
+++ trunk/memcheck/mc_translate.c 2005-04-26 23:49:24 UTC (rev 3572)
@@ -338,8 +338,7 @@
    return assignNew(mce, Ity_I8,
                     binop(Iop_Or8, a1,
                           assignNew(mce, Ity_I8,
-                                    /* unop(Iop_Neg8, a1)))); */
-                                    binop(Iop_Sub8, mkU8(0), a1) )));
+                                    unop(Iop_Neg8, a1))));
 }
 
 static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
@@ -348,8 +347,7 @@
    return assignNew(mce, Ity_I16,
                     binop(Iop_Or16, a1,
                           assignNew(mce, Ity_I16,
-                                    /* unop(Iop_Neg16, a1)))); */
-                                    binop(Iop_Sub16, mkU16(0), a1) )));
+                                    unop(Iop_Neg16, a1))));
 }
 
 static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
@@ -358,8 +356,7 @@
    return assignNew(mce, Ity_I32,
                     binop(Iop_Or32, a1,
                           assignNew(mce, Ity_I32,
-                                    /* unop(Iop_Neg32, a1)))); */
-                                    binop(Iop_Sub32, mkU32(0), a1) )));
+                                    unop(Iop_Neg32, a1))));
 }
 
 static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
@@ -368,8 +365,7 @@
    return assignNew(mce, Ity_I64,
                     binop(Iop_Or64, a1,
                           assignNew(mce, Ity_I64,
-                                    /* unop(Iop_Neg32, a1)))); */
-                                    binop(Iop_Sub64, mkU64(0), a1) )));
+                                    unop(Iop_Neg64, a1))));
 }
 
 /* --------- 'Improvement' functions for AND/OR. --------- */
@@ -496,16 +492,16 @@
          tmp1 = vbits;
          break;
       case Ity_I8:
-         tmp1 = assignNew(mce, Ity_I1, binop(Iop_CmpNE8, vbits, mkU8(0)));
+         tmp1 = assignNew(mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
          break;
       case Ity_I16:
-         tmp1 = assignNew(mce, Ity_I1, binop(Iop_CmpNE16, vbits, mkU16(0)));
+         tmp1 = assignNew(mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
          break;
       case Ity_I32:
-         tmp1 = assignNew(mce, Ity_I1, binop(Iop_CmpNE32, vbits, mkU32(0)));
+         tmp1 = assignNew(mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
          break;
       case Ity_I64:
-         tmp1 = assignNew(mce, Ity_I1, binop(Iop_CmpNE64, vbits, mkU64(0)));
+         tmp1 = assignNew(mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
          break;
       case Ity_I128: {
          /* Gah.  Chop it in half, OR the halves together, and compare
@@ -514,7 +510,7 @@
          IRAtom* tmp3 = assignNew(mce, Ity_I64, unop(Iop_128to64, vbits));
          IRAtom* tmp4 = assignNew(mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
          tmp1 = assignNew(mce, Ity_I1,
-                          binop(Iop_CmpNE64, tmp4, mkU64(0)));
+                          unop(Iop_CmpNEZ64, tmp4));
          break;
       }
       default:
@@ -601,7 +597,7 @@
          opNOT = Iop_Not64;
          opXOR = Iop_Xor64;
          opCMP = Iop_CmpEQ64;
-         top = mkU64(0xFFFFFFFFFFFFFFFF);
+         top = mkU64(0xFFFFFFFFFFFFFFFFULL);
          break;
       default:
          VG_(tool_panic)("expensiveCmpEQorNE");
@@ -901,10 +897,43 @@
 static
 IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
 {
-   /* force everything via 32-bit intermediaries. */
    IRAtom* at;
+   IRType t1 = typeOfIRExpr(mce->bb->tyenv, va1);
+   IRType t2 = typeOfIRExpr(mce->bb->tyenv, va2);
    tl_assert(isShadowAtom(mce,va1));
    tl_assert(isShadowAtom(mce,va2));
+
+   /* The general case is inefficient because PCast is an expensive
+      operation.  Here are some special cases which use PCast only
+      once rather than twice. */
+
+   /* I64 x I64 -> I64 */
+   if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I64) {
+      if (0) VG_(printf)("mkLazy2: I64 x I64 -> I64\n");
+      at = mkUifU(mce, Ity_I64, va1, va2);
+      at = mkPCastTo(mce, Ity_I64, at);
+      return at;
+   }
+
+   /* I64 x I64 -> I32 */
+   if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I32) {
+      if (0) VG_(printf)("mkLazy2: I64 x I64 -> I32\n");
+      at = mkUifU(mce, Ity_I64, va1, va2);
+      at = mkPCastTo(mce, Ity_I32, at);
+      return at;
+   }
+
+   if (0) {
+      VG_(printf)("mkLazy2 ");
+      ppIRType(t1);
+      VG_(printf)("_");
+      ppIRType(t2);
+      VG_(printf)("_");
+      ppIRType(finalVty);
+      VG_(printf)("\n");
+   }
+
+   /* General case: force everything via 32-bit intermediaries. */
    at = mkPCastTo(mce, Ity_I32, va1);
    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
    at = mkPCastTo(mce, finalVty, at);
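
As an aside on the mkLeft* hunks above: Iop_Neg8/16/32/64 are among the
primops added in vex rev 1144, and negation is exactly the
subtract-from-zero it replaces.  A small illustrative sketch (left8 is a
made-up name for what mkLeft8 computes on a shadow byte, not a Memcheck
function):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* OR a value with its own negation: every bit at or above the
   lowest set bit comes out set. */
static uint8_t left8 ( uint8_t a1 )
{
   return a1 | (uint8_t)(-a1);
}

int main ( void )
{
   /* In two's complement, 0 - x and -x are the same value, which is
      why binop(Iop_Sub8, mkU8(0), a1) could become
      unop(Iop_Neg8, a1). */
   for (unsigned x = 0; x < 256; x++)
      assert( (uint8_t)(0u - x) == (uint8_t)(-x) );

   printf("left8(0x04) = 0x%02x\n", left8(0x04)); /* 0xfc */
   printf("left8(0x50) = 0x%02x\n", left8(0x50)); /* 0xf0 */
   return 0;
}

Applied to shadow values, this smears an undefined bit leftwards through
the result, which is what the mkLeft operations are for.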
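And the "chop it in half, OR the halves together" trick in the Ity_I128
case has this plain-C equivalent (U128 and nonzero128 are illustrative
names, not VEX types):

#include <stdint.h>
#include <stdio.h>

/* A 128-bit value as two 64-bit halves, standing in for what
   Iop_128HIto64 and Iop_128to64 extract. */
typedef struct { uint64_t hi; uint64_t lo; } U128;

/* OR the halves, then one 64-bit compare against zero -- now a
   single Iop_CmpNEZ64 in the IR. */
static int nonzero128 ( U128 v )
{
   return (v.hi | v.lo) != 0;
}

int main ( void )
{
   U128 a = { 0ULL, 1ULL };
   U128 b = { 0ULL, 0ULL };
   printf("a nonzero: %d, b nonzero: %d\n", nonzero128(a), nonzero128(b));
   return 0;
}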