From: <sv...@va...> - 2005-05-11 16:13:39
Author: sewardj
Date: 2005-05-11 17:13:37 +0100 (Wed, 11 May 2005)
New Revision: 1186
Modified:
trunk/priv/host-amd64/hdefs.c
trunk/priv/host-amd64/isel.c
Log:
Make the amd64 back end capable of dealing with the stuff memcheck
generates for 128-bit vector primops.
Modified: trunk/priv/host-amd64/hdefs.c
===================================================================
--- trunk/priv/host-amd64/hdefs.c 2005-05-11 15:37:50 UTC (rev 1185)
+++ trunk/priv/host-amd64/hdefs.c 2005-05-11 16:13:37 UTC (rev 1186)
@@ -1603,9 +1603,12 @@
i->Ain.Sse64FLo.dst);
return;
case Ain_SseReRg:
- if (i->Ain.SseReRg.op == Asse_XOR
- && i->Ain.SseReRg.src == i->Ain.SseReRg.dst) {
- /* reg-alloc needs to understand 'xor r,r' as a write of r */
+ if ( (i->Ain.SseReRg.op == Asse_XOR
+ || i->Ain.SseReRg.op == Asse_CMPEQ32)
+ && i->Ain.SseReRg.src == i->Ain.SseReRg.dst) {
+ /* reg-alloc needs to understand 'xor r,r' and 'cmpeqd
+ r,r' as a write of a value to r, and independent of any
+ previous value in r */
/* (as opposed to a rite of passage :-) */
addHRegUse(u, HRmWrite, i->Ain.SseReRg.dst);
} else {
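
For what it's worth, the property being taught to the register allocator
here -- that 'xor r,r' and 'pcmpeqd r,r' fully define r regardless of its
previous contents -- is easy to demonstrate outside VEX. A minimal sketch
using SSE2 compiler intrinsics (not part of this change):

   #include <stdint.h>
   #include <stdio.h>
   #include <emmintrin.h>   /* SSE2 */

   int main ( void )
   {
      /* Start from arbitrary junk, as if r were uninitialised. */
      __m128i r = _mm_set_epi32(0x12345678, -1, 0, 42);
      __m128i z = _mm_xor_si128(r, r);     /* 'pxor r,r'    -> all zeroes */
      __m128i o = _mm_cmpeq_epi32(r, r);   /* 'pcmpeqd r,r' -> all ones   */
      uint32_t zw[4], ow[4];
      _mm_storeu_si128((__m128i*)zw, z);
      _mm_storeu_si128((__m128i*)ow, o);
      /* Prints lane0: zeroes=0x00000000 ones=0xffffffff, whatever r held. */
      printf("lane0: zeroes=0x%08x ones=0x%08x\n", zw[0], ow[0]);
      return 0;
   }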
Modified: trunk/priv/host-amd64/isel.c
===================================================================
--- trunk/priv/host-amd64/isel.c 2005-05-11 15:37:50 UTC (rev 1185)
+++ trunk/priv/host-amd64/isel.c 2005-05-11 16:13:37 UTC (rev 1186)
@@ -698,17 +698,31 @@
}
 
 
-/* Generate !src into a new vector register. Amazing that there isn't
- a less crappy way to do this.
+/* Generate all-zeroes into a new vector register.
*/
-static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
+static HReg generate_zeroes_V128 ( ISelEnv* env )
{
HReg dst = newVRegV(env);
- /* Set dst to zero. Not strictly necessary. */
addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
- /* And now make it all 1s ... */
+ return dst;
+}
+
+/* Generate all-ones into a new vector register.
+*/
+static HReg generate_ones_V128 ( ISelEnv* env )
+{
+ HReg dst = newVRegV(env);
addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
- /* Finally, xor 'src' into it. */
+ return dst;
+}
+
+
+/* Generate !src into a new vector register. Amazing that there isn't
+ a less crappy way to do this.
+*/
+static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
+{
+ HReg dst = generate_ones_V128(env);
addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
return dst;
}
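
An intrinsics rendering of the rewritten do_sse_NotV128 idiom, as a
sanity check (hypothetical helper, using SSE2 compiler intrinsics rather
than VEX's instruction constructors):

   #include <emmintrin.h>

   /* Build all-ones via a self-compare, then XOR the operand into it:
      ones ^ src == ~src for every bit. */
   static __m128i not_v128 ( __m128i src )
   {
      __m128i ones = _mm_cmpeq_epi32(src, src);  /* pcmpeqd: all ones */
      return _mm_xor_si128(ones, src);           /* invert src */
   }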
@@ -3053,7 +3067,7 @@
HReg dst = newVRegV(env);
vassert(e->Iex.Const.con->tag == Ico_V128);
if (e->Iex.Const.con->Ico.V128 == 0x0000) {
- addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
+ dst = generate_zeroes_V128(env);
return dst;
} else
if (e->Iex.Const.con->Ico.V128 == 0x00FF) {
@@ -3108,9 +3122,8 @@
(2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
*/
HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
- HReg tmp = newVRegV(env);
+ HReg tmp = generate_zeroes_V128(env);
HReg dst = newVRegV(env);
- addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, tmp));
addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
tmp = do_sse_NotV128(env, tmp);
addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
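
(Checking the shuffle immediate in that comment: (2 << 6) | (3 << 4)
| (0 << 2) | (1 << 0) = 128 + 48 + 0 + 1 = 0xB1, a pshufd immediate
selecting lanes (1,0,3,2) and hence swapping the two 32-bit halves of
each 64-bit lane.)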
@@ -3118,59 +3131,20 @@
return dst;
}
 
-//.. case Iop_CmpNEZ32x4: {
-//.. /* Sigh, we have to generate lousy code since this has to
-//.. work on SSE1 hosts */
-//.. /* basically, the idea is: for each lane:
-//.. movl lane, %r ; negl %r (now CF = lane==0 ? 0 : 1)
-//.. sbbl %r, %r (now %r = 1Sto32(CF))
-//.. movl %r, lane
-//.. */
-//.. Int i;
-//.. X86AMode* am;
-//.. X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
-//.. HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
-//.. HReg dst = newVRegV(env);
-//.. HReg r32 = newVRegI(env);
-//.. sub_from_esp(env, 16);
-//.. addInstr(env, X86Instr_SseLdSt(False/*store*/, arg, esp0));
-//.. for (i = 0; i < 4; i++) {
-//.. am = X86AMode_IR(i*4, hregX86_ESP());
-//.. addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), r32));
-//.. addInstr(env, X86Instr_Unary32(Xun_NEG, X86RM_Reg(r32)));
-//.. addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(r32), r32));
-//.. addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r32), am));
-//.. }
-//.. addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
-//.. add_to_esp(env, 16);
-//.. return dst;
-//.. }
-//..
-//.. case Iop_CmpNEZ8x16:
-//.. case Iop_CmpNEZ16x8: {
-//.. /* We can use SSE2 instructions for this. */
-//.. HReg arg;
-//.. HReg vec0 = newVRegV(env);
-//.. HReg vec1 = newVRegV(env);
-//.. HReg dst = newVRegV(env);
-//.. X86SseOp cmpOp
-//.. = e->Iex.Unop.op==Iop_CmpNEZ16x8 ? Xsse_CMPEQ16
-//.. : Xsse_CMPEQ8;
-//.. REQUIRE_SSE2;
-//.. addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec0, vec0));
-//.. addInstr(env, mk_vMOVsd_RR(vec0, vec1));
-//.. addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, vec1, vec1));
-//.. /* defer arg computation to here so as to give CMPEQF as long
-//.. as possible to complete */
-//.. arg = iselVecExpr(env, e->Iex.Unop.arg);
-//.. /* vec0 is all 0s; vec1 is all 1s */
-//.. addInstr(env, mk_vMOVsd_RR(arg, dst));
-//.. /* 16x8 or 8x16 comparison == */
-//.. addInstr(env, X86Instr_SseReRg(cmpOp, vec0, dst));
-//.. /* invert result */
-//.. addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec1, dst));
-//.. return dst;
-//.. }
+ case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
+ case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
+ case Iop_CmpNEZ8x16: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
+ do_CmpNEZ_vector:
+ {
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg tmp = newVRegV(env);
+ HReg zero = generate_zeroes_V128(env);
+ HReg dst;
+ addInstr(env, mk_vMOVsd_RR(arg, tmp));
+ addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
+ dst = do_sse_NotV128(env, tmp);
+ return dst;
+ }
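
The rewritten lowering computes, per lane, not(lane == 0): compare the
lane against zero with the relevant CMPEQ, then invert. A scalar model
of one 32-bit lane, just to pin down the semantics (hypothetical helper,
not from the patch):

   #include <stdint.h>
   #include <stdio.h>

   /* tmp = (arg == 0) ? ~0 : 0, mirroring 'pcmpeqd zero,tmp' above;
      the final answer is ~tmp, i.e. all-ones iff the lane is nonzero. */
   static uint32_t cmpNEZ32_lane ( uint32_t arg )
   {
      uint32_t tmp = (arg == 0) ? 0xFFFFFFFFu : 0;
      return ~tmp;
   }

   int main ( void )
   {
      /* Prints 0 -> 0x00000000, 7 -> 0xffffffff */
      printf("0 -> 0x%08x, 7 -> 0x%08x\n",
             cmpNEZ32_lane(0), cmpNEZ32_lane(7));
      return 0;
   }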
 
case Iop_Recip32Fx4: op = Asse_RCPF; goto do_32Fx4_unary;
case Iop_RSqrt32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;