|
From: <sv...@va...> - 2012-03-26 09:44:47
|
sewardj 2012-03-26 10:44:39 +0100 (Mon, 26 Mar 2012)
New Revision: 2268
Log:
gcc seems to have taken to generating "orl $0xFFFFFFFF, %reg32" to get
-1 (32-bit) into a register. [Is this wise? Does the processor know
that this generates no dependency on the previous value of the
register?] Teach the constant folder about such cases, therefore.
Modified files:
trunk/priv/ir_opt.c
Modified: trunk/priv/ir_opt.c (+41 -6)
===================================================================
--- trunk/priv/ir_opt.c 2012-03-25 17:17:18 +01:00 (rev 2267)
+++ trunk/priv/ir_opt.c 2012-03-26 10:44:39 +01:00 (rev 2268)
@@ -1027,6 +1027,14 @@
&& e->Iex.Const.con->Ico.U32 == 0);
}
+/* Is this literally IRExpr_Const(IRConst_U32(1---1)) ? */
+static Bool isOnesU32 ( IRExpr* e )
+{
+ return toBool( e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_U32
+ && e->Iex.Const.con->Ico.U32 == 0xFFFFFFFF );
+}
+
/* Is this literally IRExpr_Const(IRConst_U64(0)) ? */
static Bool isZeroU64 ( IRExpr* e )
{
@@ -1039,7 +1047,6 @@
static Bool isZeroU ( IRExpr* e )
{
if (e->tag != Iex_Const) return False;
-
switch (e->Iex.Const.con->tag) {
case Ico_U1: return toBool( e->Iex.Const.con->Ico.U1 == 0);
case Ico_U8: return toBool( e->Iex.Const.con->Ico.U8 == 0);
@@ -1050,6 +1057,21 @@
}
}
+/* Is this an integer constant with value 1---1b ? */
+static Bool isOnesU ( IRExpr* e )
+{
+ if (e->tag != Iex_Const) return False;
+ switch (e->Iex.Const.con->tag) {
+ case Ico_U8: return toBool( e->Iex.Const.con->Ico.U8 == 0xFF);
+ case Ico_U16: return toBool( e->Iex.Const.con->Ico.U16 == 0xFFFF);
+ case Ico_U32: return toBool( e->Iex.Const.con->Ico.U32
+ == 0xFFFFFFFF);
+ case Ico_U64: return toBool( e->Iex.Const.con->Ico.U64
+ == 0xFFFFFFFFFFFFFFFFULL);
+ default: ppIRExpr(e); vpanic("isOnesU");
+ }
+}
+
static Bool notBool ( Bool b )
{
if (b == True) return False;
@@ -1080,11 +1102,19 @@
switch (op) {
case Iop_CmpEQ64:
return IRExpr_Const(IRConst_U1(toBool(1)));
+ case Iop_Or8:
+ return IRExpr_Const(IRConst_U8(0xFF));
+ case Iop_Or16:
+ return IRExpr_Const(IRConst_U16(0xFFFF));
+ case Iop_Or32:
+ return IRExpr_Const(IRConst_U32(0xFFFFFFFF));
case Iop_CmpEQ8x8:
+ case Iop_Or64:
return IRExpr_Const(IRConst_U64(0xFFFFFFFFFFFFFFFFULL));
case Iop_CmpEQ8x16:
return IRExpr_Const(IRConst_V128(0xFFFF));
default:
+ ppIROp(op);
vpanic("mkOnesOfPrimopResultType: bad primop");
}
}
@@ -1730,17 +1760,23 @@
case Iop_Or32:
case Iop_Or64:
case Iop_Max32U:
- /* Or8/Or16/Or32/Max32U(x,0) ==> x */
+ /* Or8/Or16/Or32/Or64/Max32U(x,0) ==> x */
if (isZeroU(e->Iex.Binop.arg2)) {
e2 = e->Iex.Binop.arg1;
break;
}
- /* Or8/Or16/Or32/Max32U(0,x) ==> x */
+ /* Or8/Or16/Or32/Or64/Max32U(0,x) ==> x */
if (isZeroU(e->Iex.Binop.arg1)) {
e2 = e->Iex.Binop.arg2;
break;
}
- /* Or8/Or16/Or32/Max32U(t,t) ==> t, for some IRTemp t */
+ /* Or8/Or16/Or32/Or64/Max32U(x,1---1b) ==> 1---1b */
+ /* Or8/Or16/Or32/Or64/Max32U(1---1b,x) ==> 1---1b */
+ if (isOnesU(e->Iex.Binop.arg1) || isOnesU(e->Iex.Binop.arg2)) {
+ e2 = mkOnesOfPrimopResultType(e->Iex.Binop.op);
+ break;
+ }
+ /* Or8/Or16/Or32/Or64/Max32U(t,t) ==> t, for some IRTemp t */
if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
e2 = e->Iex.Binop.arg1;
break;
@@ -1797,8 +1833,7 @@
case Iop_And32:
/* And32(x,0xFFFFFFFF) ==> x */
- if (e->Iex.Binop.arg2->tag == Iex_Const
- && e->Iex.Binop.arg2->Iex.Const.con->Ico.U32 == 0xFFFFFFFF) {
+ if (isOnesU32(e->Iex.Binop.arg2)) {
e2 = e->Iex.Binop.arg1;
break;
}
|
|
From: John R. <jr...@bi...> - 2012-03-26 14:29:48
|
> [VEX] New Revision: 2268
> gcc seems to have taken to generating "orl $0xFFFFFFFF, %reg32" to get
> -1 (32-bit) into a register. [Is this wise? Does the processor know
> that this generates no dependency on the previous value of the
> register?] Teach the constant folder about such cases, therefore.

Is this really new? The advantage is code size: "orl $~0,%reg32" takes 3 bytes using opcode 0x83 (the immediate byte is sign extended first), whereas "movl $~0,%reg32" takes 5 bytes.

However, no current CPU understands that there is no dependency on the previous value in %reg32. The only cases of suppressing the dependency are for SUB or XOR with the same register as source and destination. (This can be detected by the decoding of 2 adjacent instruction bytes, and costs almost nothing in the decoder: two AND in the 16-input PLA.)

Therefore, using "orl $~0,%reg32" must wait for the previous write of %reg32. Of course, the value written by any previous write into %reg32 is dead, so any implied dataflow dependency represents a lost opportunity for optimization (don't compute the previous value), or a deliberate choice to avoid a don't-care branch at the start of a new dataflow whose value resides in %reg32.

Most code is control-dominated. Also, the previous write of %reg32 is likely to be a fast instruction (1 cycle or less; not a multiply, not a divide, not a load from memory, ...). So in practice the delay is likely to be zero. At most, in practice the dependency might inhibit simultaneous execution in the same cycle; but the next cycle might have a free slot, so the net loss over 2 cycles also might be zero.

--
|