|
From: <sv...@va...> - 2010-09-08 08:35:01
|
Author: sewardj
Date: 2010-09-08 09:34:52 +0100 (Wed, 08 Sep 2010)
New Revision: 2030
Log:
Minor amd64 instruction selection improvements, leading to a
1% generated code size reduction for perf/bz2.c running on
Memcheck:
- reduce the amount of pointless cast-of-a-cast code by
rewriting it out at tree-creation time in ir_opt.c
- generate movslq for 32Sto64
- generate movzbq for 8Uto64(LD(...)), ditto movzwq for 16-bit loads
Modified:
trunk/priv/host_amd64_defs.c
trunk/priv/host_amd64_defs.h
trunk/priv/host_amd64_isel.c
trunk/priv/ir_opt.c
Modified: trunk/priv/host_amd64_defs.c
===================================================================
--- trunk/priv/host_amd64_defs.c 2010-09-03 23:37:02 UTC (rev 2029)
+++ trunk/priv/host_amd64_defs.c 2010-09-08 08:34:52 UTC (rev 2030)
@@ -737,11 +737,12 @@
vassert(cond != Acc_ALWAYS);
return i;
}
-AMD64Instr* AMD64Instr_MovZLQ ( HReg src, HReg dst ) {
- AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
- i->tag = Ain_MovZLQ;
- i->Ain.MovZLQ.src = src;
- i->Ain.MovZLQ.dst = dst;
+AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_MovxLQ;
+ i->Ain.MovxLQ.syned = syned;
+ i->Ain.MovxLQ.src = src;
+ i->Ain.MovxLQ.dst = dst;
return i;
}
AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
@@ -1138,11 +1139,11 @@
vex_printf(",");
ppHRegAMD64(i->Ain.CMov64.dst);
return;
- case Ain_MovZLQ:
- vex_printf("movzlq ");
- ppHRegAMD64_lo32(i->Ain.MovZLQ.src);
+ case Ain_MovxLQ:
+ vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
+ ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
vex_printf(",");
- ppHRegAMD64(i->Ain.MovZLQ.dst);
+ ppHRegAMD64(i->Ain.MovxLQ.dst);
return;
case Ain_LoadEX:
if (i->Ain.LoadEX.szSmall==4 && !i->Ain.LoadEX.syned) {
@@ -1510,9 +1511,9 @@
addRegUsage_AMD64RM(u, i->Ain.CMov64.src, HRmRead);
addHRegUse(u, HRmModify, i->Ain.CMov64.dst);
return;
- case Ain_MovZLQ:
- addHRegUse(u, HRmRead, i->Ain.MovZLQ.src);
- addHRegUse(u, HRmWrite, i->Ain.MovZLQ.dst);
+ case Ain_MovxLQ:
+ addHRegUse(u, HRmRead, i->Ain.MovxLQ.src);
+ addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
return;
case Ain_LoadEX:
addRegUsage_AMD64AMode(u, i->Ain.LoadEX.src);
@@ -1740,9 +1741,9 @@
mapRegs_AMD64RM(m, i->Ain.CMov64.src);
mapReg(m, &i->Ain.CMov64.dst);
return;
- case Ain_MovZLQ:
- mapReg(m, &i->Ain.MovZLQ.src);
- mapReg(m, &i->Ain.MovZLQ.dst);
+ case Ain_MovxLQ:
+ mapReg(m, &i->Ain.MovxLQ.src);
+ mapReg(m, &i->Ain.MovxLQ.dst);
return;
case Ain_LoadEX:
mapRegs_AMD64AMode(m, i->Ain.LoadEX.src);
@@ -2830,13 +2831,22 @@
}
break;
- case Ain_MovZLQ:
- /* Produce a 32-bit reg-reg move, since the implicit zero-extend
- does what we want. */
- *p++ = clearWBit (
- rexAMode_R(i->Ain.MovZLQ.src, i->Ain.MovZLQ.dst));
- *p++ = 0x89;
- p = doAMode_R(p, i->Ain.MovZLQ.src, i->Ain.MovZLQ.dst);
+ case Ain_MovxLQ:
+ /* Arg order differs by opcode: 0x63 (movslq) is reg <- r/m, so
+ dst is the reg operand; 0x89 (movl) is r/m <- reg, so src is. */
+ if (i->Ain.MovxLQ.syned) {
+ /* Need REX.W = 1 here, but rexAMode_R does that for us. */
+ *p++ = rexAMode_R(i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
+ *p++ = 0x63;
+ p = doAMode_R(p, i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
+ } else {
+ /* Produce a 32-bit reg-reg move, since the implicit
+ zero-extend does what we want. */
+ *p++ = clearWBit (
+ rexAMode_R(i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst));
+ *p++ = 0x89;
+ p = doAMode_R(p, i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst);
+ }
goto done;
case Ain_LoadEX:
Modified: trunk/priv/host_amd64_defs.h
===================================================================
--- trunk/priv/host_amd64_defs.h 2010-09-03 23:37:02 UTC (rev 2029)
+++ trunk/priv/host_amd64_defs.h 2010-09-08 08:34:52 UTC (rev 2030)
@@ -366,7 +366,7 @@
Ain_Call, /* call to address in register */
Ain_Goto, /* conditional/unconditional jmp to dst */
Ain_CMov64, /* conditional move */
- Ain_MovZLQ, /* reg-reg move, zeroing out top half */
+ Ain_MovxLQ, /* reg-reg move, zx-ing/sx-ing top half */
Ain_LoadEX, /* mov{s,z}{b,w,l}q from mem to reg */
Ain_Store, /* store 32/16/8 bit value in memory */
Ain_Set64, /* convert condition code to 64-bit value */
@@ -493,11 +493,12 @@
AMD64RM* src;
HReg dst;
} CMov64;
- /* reg-reg move, zeroing out top half */
+ /* reg-reg move, sx-ing/zx-ing top half */
struct {
+ Bool syned;
HReg src;
HReg dst;
- } MovZLQ;
+ } MovxLQ;
/* Sign/Zero extending loads. Dst size is always 64 bits. */
struct {
UChar szSmall; /* only 1, 2 or 4 */
@@ -684,7 +685,7 @@
extern AMD64Instr* AMD64Instr_Call ( AMD64CondCode, Addr64, Int );
extern AMD64Instr* AMD64Instr_Goto ( IRJumpKind, AMD64CondCode cond, AMD64RI* dst );
extern AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode, AMD64RM* src, HReg dst );
-extern AMD64Instr* AMD64Instr_MovZLQ ( HReg src, HReg dst );
+extern AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst );
extern AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
AMD64AMode* src, HReg dst );
extern AMD64Instr* AMD64Instr_Store ( UChar sz, HReg src, AMD64AMode* dst );
Modified: trunk/priv/host_amd64_isel.c
===================================================================
--- trunk/priv/host_amd64_isel.c 2010-09-03 23:37:02 UTC (rev 2029)
+++ trunk/priv/host_amd64_isel.c 2010-09-08 08:34:52 UTC (rev 2030)
@@ -417,7 +417,7 @@
&& e->Iex.Unop.op == Iop_32Uto64
&& e->Iex.Unop.arg->tag == Iex_RdTmp) {
HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
- return AMD64Instr_MovZLQ(src, dst);
+ return AMD64Instr_MovxLQ(False, src, dst);
}
if (0) { ppIRExpr(e); vex_printf("\n"); }
@@ -858,8 +858,9 @@
Bool second_is_UInt;
MatchInfo mi;
- DECLARE_PATTERN(p_8Uto64);
DECLARE_PATTERN(p_1Uto8_64to1);
+ DECLARE_PATTERN(p_LDle8_then_8Uto64);
+ DECLARE_PATTERN(p_LDle16_then_16Uto64);
IRType ty = typeOfIRExpr(env->type_env,e);
vassert(ty == Ity_I32 || Ity_I16 || Ity_I8);
@@ -977,7 +978,7 @@
Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
break;
case Iop_Shr32:
- addInstr(env, AMD64Instr_MovZLQ(dst,dst));
+ addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
break;
case Iop_Sar8:
addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
@@ -988,8 +989,7 @@
addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
break;
case Iop_Sar32:
- addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, dst));
- addInstr(env, AMD64Instr_Sh64(Ash_SAR, 32, dst));
+ addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
break;
default:
ppIROp(e->Iex.Binop.op);
@@ -1159,7 +1159,7 @@
HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
if (second_is_UInt)
- addInstr(env, AMD64Instr_MovZLQ(argR, argR));
+ addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2 ));
@@ -1204,8 +1204,8 @@
addInstr(env, mk_iMOVsd_RR(left64, rax));
addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
- addInstr(env, AMD64Instr_MovZLQ(rdx,rdx));
- addInstr(env, AMD64Instr_MovZLQ(rax,rax));
+ addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
+ addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
addInstr(env, mk_iMOVsd_RR(rax, dst));
addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
@@ -1220,7 +1220,7 @@
addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
- addInstr(env, AMD64Instr_MovZLQ(lo32,lo32));
+ addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
addInstr(env, AMD64Instr_Alu64R(
Aalu_OR, AMD64RMI_Reg(lo32), hi32));
return hi32;
@@ -1377,59 +1377,55 @@
/* --------- UNARY OP --------- */
case Iex_Unop: {
- /* 32Uto64(8Uto32(expr8)) */
- DEFINE_PATTERN(p_8Uto64,
- unop(Iop_32Uto64, unop(Iop_8Uto32, bind(0)) ) );
- if (matchIRExpr(&mi,p_8Uto64,e)) {
- IRExpr* expr8 = mi.bindee[0];
- HReg dst = newVRegI(env);
- HReg src = iselIntExpr_R(env, expr8);
- addInstr(env, mk_iMOVsd_RR(src,dst) );
- addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
- addInstr(env, AMD64Instr_Sh64(Ash_SHR, 56, dst));
- return dst;
- }
/* 1Uto8(64to1(expr64)) */
- DEFINE_PATTERN( p_1Uto8_64to1,
- unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
- if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
- IRExpr* expr64 = mi.bindee[0];
- HReg dst = newVRegI(env);
- HReg src = iselIntExpr_R(env, expr64);
- addInstr(env, mk_iMOVsd_RR(src,dst) );
- addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
- AMD64RMI_Imm(1), dst));
- return dst;
+ {
+ DEFINE_PATTERN( p_1Uto8_64to1,
+ unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
+ if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
+ IRExpr* expr64 = mi.bindee[0];
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, expr64);
+ addInstr(env, mk_iMOVsd_RR(src,dst) );
+ addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
+ AMD64RMI_Imm(1), dst));
+ return dst;
+ }
}
-//.. /* 16Uto32(LDle(expr32)) */
-//.. {
-//.. DECLARE_PATTERN(p_LDle16_then_16Uto32);
-//.. DEFINE_PATTERN(p_LDle16_then_16Uto32,
-//.. unop(Iop_16Uto32,IRExpr_LDle(Ity_I16,bind(0))) );
-//.. if (matchIRExpr(&mi,p_LDle16_then_16Uto32,e)) {
-//.. HReg dst = newVRegI(env);
-//.. X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
-//.. addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
-//.. return dst;
-//.. }
-//.. }
+ /* 8Uto64(LDle(expr64)) */
+ {
+ DEFINE_PATTERN(p_LDle8_then_8Uto64,
+ unop(Iop_8Uto64,
+ IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
+ if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
+ HReg dst = newVRegI(env);
+ AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
+ addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
+ return dst;
+ }
+ }
- switch (e->Iex.Unop.op) {
- case Iop_32Uto64: {
+ /* 16Uto64(LDle(expr64)) */
+ {
+ DEFINE_PATTERN(p_LDle16_then_16Uto64,
+ unop(Iop_16Uto64,
+ IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
+ if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
HReg dst = newVRegI(env);
- HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
- addInstr(env, AMD64Instr_MovZLQ(src,dst) );
+ AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
+ addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
return dst;
}
+ }
+
+ switch (e->Iex.Unop.op) {
+ case Iop_32Uto64:
case Iop_32Sto64: {
HReg dst = newVRegI(env);
HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
- UInt amt = 32;
- addInstr(env, mk_iMOVsd_RR(src,dst) );
- addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
- addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
+ addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
+ src, dst) );
return dst;
}
case Iop_128HIto64: {
@@ -1566,7 +1562,7 @@
HReg dst = newVRegI(env);
HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
addInstr(env, mk_iMOVsd_RR(pre,src));
- addInstr(env, AMD64Instr_MovZLQ(src,src));
+ addInstr(env, AMD64Instr_MovxLQ(False, src, src));
addInstr(env, mk_iMOVsd_RR(src,dst));
addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
@@ -1741,7 +1737,7 @@
if (e->Iex.CCall.retty == Ity_I64)
addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
else
- addInstr(env, AMD64Instr_MovZLQ(hregAMD64_RAX(), dst));
+ addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
return dst;
}
@@ -2179,7 +2175,7 @@
HReg r1 = iselIntExpr_R(env, e->Iex.Unop.arg);
HReg tmp = newVRegI(env);
AMD64RMI* rmi2 = AMD64RMI_Imm(0);
- addInstr(env, AMD64Instr_MovZLQ(r1,tmp));
+ addInstr(env, AMD64Instr_MovxLQ(False, r1, tmp));
addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,tmp));
return Acc_NZ;
}
Modified: trunk/priv/ir_opt.c
===================================================================
--- trunk/priv/ir_opt.c 2010-09-03 23:37:02 UTC (rev 2029)
+++ trunk/priv/ir_opt.c 2010-09-08 08:34:52 UTC (rev 2030)
@@ -3975,8 +3975,20 @@
/* 64to32( 32Uto64 ( x )) --> x */
if (is_Unop(aa, Iop_32Uto64))
return aa->Iex.Unop.arg;
+ /* 64to32( 8Uto64 ( x )) --> 8Uto32(x) */
+ if (is_Unop(aa, Iop_8Uto64))
+ return IRExpr_Unop(Iop_8Uto32, aa->Iex.Unop.arg);
break;
+ case Iop_32Uto64:
+ /* 32Uto64( 8Uto32( x )) --> 8Uto64(x) */
+ if (is_Unop(aa, Iop_8Uto32))
+ return IRExpr_Unop(Iop_8Uto64, aa->Iex.Unop.arg);
+ /* 32Uto64( 16Uto32( x )) --> 16Uto64(x) */
+ if (is_Unop(aa, Iop_16Uto32))
+ return IRExpr_Unop(Iop_16Uto64, aa->Iex.Unop.arg);
+ break;
+
case Iop_1Sto32:
/* 1Sto32( CmpNEZ8( 32to8( 1Uto32( CmpNEZ32( x ))))) -> CmpwNEZ32(x) */
if (is_Unop(aa, Iop_CmpNEZ8)
|