|
From: <sv...@va...> - 2005-05-05 21:34:08
|
Author: sewardj
Date: 2005-05-05 22:34:02 +0100 (Thu, 05 May 2005)
New Revision: 1163
Modified:
trunk/priv/guest-amd64/toIR.c
trunk/priv/host-amd64/hdefs.c
trunk/priv/host-amd64/hdefs.h
trunk/priv/host-amd64/isel.c
Log:
Implement a whole bunch more SSE instructions on amd64.
Modified: trunk/priv/guest-amd64/toIR.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- trunk/priv/guest-amd64/toIR.c 2005-05-05 12:05:54 UTC (rev 1162)
+++ trunk/priv/guest-amd64/toIR.c 2005-05-05 21:34:02 UTC (rev 1163)
@@ -2163,7 +2163,7 @@
by the caller. This is needed to make sense of %rip-relative
addresses. Note that the value that *len is set to is only the
length of the amode itself and does not include the value supplied
- in xtra_bytes.
+ in extra_bytes.
*/
=20
static IRTemp disAMode_copy2tmp ( IRExpr* addr64 )
@@ -7443,41 +7443,41 @@
}
=20
=20
-//.. /* SSE integer binary operation:
-//.. G =3D G `op` E (eLeft =3D=3D False)
-//.. G =3D E `op` G (eLeft =3D=3D True)
-//.. */
-//.. static UInt dis_SSEint_E_to_G(=20
-//.. UChar sorb, ULong delta,=20
-//.. HChar* opname, IROp op,
-//.. Bool eLeft
-//.. )
-//.. {
-//.. HChar dis_buf[50];
-//.. Int alen;
-//.. IRTemp addr;
-//.. UChar rm =3D getUChar(delta);
-//.. IRExpr* gpart =3D getXMMReg(gregOfRM(rm));
-//.. IRExpr* epart =3D NULL;
-//.. if (epartIsReg(rm)) {
-//.. epart =3D getXMMReg(eregOfRM(rm));
-//.. DIP("%s %s,%s\n", opname,
-//.. nameXMMReg(eregOfRM(rm)),
-//.. nameXMMReg(gregOfRM(rm)) );
-//.. delta +=3D 1;
-//.. } else {
-//.. addr =3D disAMode ( &alen, sorb, delta, dis_buf );
-//.. epart =3D loadLE(Ity_V128, mkexpr(addr));
-//.. DIP("%s %s,%s\n", opname,
-//.. dis_buf,
-//.. nameXMMReg(gregOfRM(rm)) );
-//.. delta +=3D alen;
-//.. }
-//.. putXMMReg( gregOfRM(rm),=20
-//.. eLeft ? binop(op, epart, gpart)
-//.. : binop(op, gpart, epart) );
-//.. return delta;
-//.. }
+/* SSE integer binary operation:
+ G =3D G `op` E (eLeft =3D=3D False)
+ G =3D E `op` G (eLeft =3D=3D True)
+ E is an xmm register or a V128 loaded from memory; returns the updated delta. */
+static ULong dis_SSEint_E_to_G(=20
+ Prefix pfx, ULong delta,=20
+ HChar* opname, IROp op,
+ Bool eLeft
+ )
+{
+ HChar dis_buf[50];
+ Int alen;
+ IRTemp addr;
+ UChar rm =3D getUChar(delta); /* byte at delta, decoded below as the rm byte */
+ IRExpr* gpart =3D getXMMReg(gregOfRexRM(pfx,rm)); /* current value of G */
+ IRExpr* epart =3D NULL;
+ if (epartIsReg(rm)) { /* E is an xmm register */
+ epart =3D getXMMReg(eregOfRexRM(pfx,rm));
+ DIP("%s %s,%s\n", opname,
+ nameXMMReg(eregOfRexRM(pfx,rm)),
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ delta +=3D 1; /* step over the rm byte only */
+ } else { /* E is a V128 loaded from the address disAMode decodes */
+ addr =3D disAMode ( &alen, pfx, delta, dis_buf, 0 );
+ epart =3D loadLE(Ity_V128, mkexpr(addr));
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ delta +=3D alen; /* step over the amode, length as reported by disAMode */
+ }
+ putXMMReg( gregOfRexRM(pfx,rm),=20
+ eLeft ? binop(op, epart, gpart) /* E `op` G */
+ : binop(op, gpart, epart) ); /* G `op` E */
+ return delta;
+}
=20
=20
/* Helper for doing SSE FP comparisons. */
@@ -7659,62 +7659,63 @@
//.. putXMMReg( gregOfRM(rm), mkexpr(g1) );
//.. return delta;
//.. }
-//..=20
-//..=20
-//.. /* Vector by scalar shift of E by an immediate byte. */
-//..=20
-//.. static=20
-//.. UInt dis_SSE_shiftE_imm ( ULong delta, HChar* opname, IROp op )
-//.. {
-//.. Bool shl, shr, sar;
-//.. UChar rm =3D getUChar(delta);
-//.. IRTemp e0 =3D newTemp(Ity_V128);
-//.. IRTemp e1 =3D newTemp(Ity_V128);
-//.. UChar amt, size;
-//.. vassert(epartIsReg(rm));
-//.. vassert(gregOfRM(rm) =3D=3D 2=20
-//.. || gregOfRM(rm) =3D=3D 4 || gregOfRM(rm) =3D=3D 6);
-//.. amt =3D (Int)(getUChar(delta+1));
-//.. delta +=3D 2;
-//.. DIP("%s $%d,%s\n", opname,
-//.. (Int)amt,
-//.. nameXMMReg(eregOfRM(rm)) );
-//.. assign( e0, getXMMReg(eregOfRM(rm)) );
-//..=20
-//.. shl =3D shr =3D sar =3D False;
-//.. size =3D 0;
-//.. switch (op) {
-//.. case Iop_ShlN16x8: shl =3D True; size =3D 16; break;
-//.. case Iop_ShlN32x4: shl =3D True; size =3D 32; break;
-//.. case Iop_ShlN64x2: shl =3D True; size =3D 64; break;
-//.. case Iop_SarN16x8: sar =3D True; size =3D 16; break;
-//.. case Iop_SarN32x4: sar =3D True; size =3D 32; break;
-//.. case Iop_ShrN16x8: shr =3D True; size =3D 16; break;
-//.. case Iop_ShrN32x4: shr =3D True; size =3D 32; break;
-//.. case Iop_ShrN64x2: shr =3D True; size =3D 64; break;
-//.. default: vassert(0);
-//.. }
-//..=20
-//.. if (shl || shr) {
-//.. assign( e1, amt >=3D size=20
-//.. ? mkV128(0x0000)
-//.. : binop(op, mkexpr(e0), mkU8(amt))
-//.. );
-//.. } else=20
-//.. if (sar) {
-//.. assign( e1, amt >=3D size=20
-//.. ? binop(op, mkexpr(e0), mkU8(size-1))
-//.. : binop(op, mkexpr(e0), mkU8(amt))
-//.. );
-//.. } else {
-//.. vassert(0);
-//.. }
-//..=20
-//.. putXMMReg( eregOfRM(rm), mkexpr(e1) );
-//.. return delta;
-//.. }
=20
=20
+/* Vector by scalar shift of E by an immediate byte. */
+/* E must be a register; the shifted value is written back to E and the updated delta returned. */
+static=20
+ULong dis_SSE_shiftE_imm ( Prefix pfx,=20
+ ULong delta, HChar* opname, IROp op )
+{
+ Bool shl, shr, sar;
+ UChar rm =3D getUChar(delta);
+ IRTemp e0 =3D newTemp(Ity_V128);
+ IRTemp e1 =3D newTemp(Ity_V128);
+ UChar amt, size;
+ vassert(epartIsReg(rm)); /* only the register form of E is handled here */
+ vassert(gregLO3ofRM(rm) =3D=3D 2=20
+ || gregLO3ofRM(rm) =3D=3D 4 || gregLO3ofRM(rm) =3D=3D 6);
+ amt =3D (Int)(getUChar(delta+1)); /* shift amount: the byte following rm */
+ delta +=3D 2; /* rm byte plus the immediate byte */
+ DIP("%s $%d,%s\n", opname,
+ (Int)amt,
+ nameXMMReg(eregOfRexRM(pfx,rm)) );
+ assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
+
+ shl =3D shr =3D sar =3D False;
+ size =3D 0; /* set per op below; tracks the N in the Iop name (presumably lane width in bits) */
+ switch (op) {
+ case Iop_ShlN16x8: shl =3D True; size =3D 16; break;
+ case Iop_ShlN32x4: shl =3D True; size =3D 32; break;
+ case Iop_ShlN64x2: shl =3D True; size =3D 64; break;
+ case Iop_SarN16x8: sar =3D True; size =3D 16; break;
+ case Iop_SarN32x4: sar =3D True; size =3D 32; break;
+ case Iop_ShrN16x8: shr =3D True; size =3D 16; break;
+ case Iop_ShrN32x4: shr =3D True; size =3D 32; break;
+ case Iop_ShrN64x2: shr =3D True; size =3D 64; break;
+ default: vassert(0);
+ }
+
+ if (shl || shr) { /* Shl/Shr ops: an amount of size or more yields mkV128(0x0000) */
+ assign( e1, amt >=3D size=20
+ ? mkV128(0x0000)
+ : binop(op, mkexpr(e0), mkU8(amt))
+ );
+ } else=20
+ if (sar) { /* Sar ops: an amount of size or more is replaced by size-1 */
+ assign( e1, amt >=3D size=20
+ ? binop(op, mkexpr(e0), mkU8(size-1))
+ : binop(op, mkexpr(e0), mkU8(amt))
+ );
+ } else {
+ vassert(0);
+ }
+
+ putXMMReg( eregOfRexRM(pfx,rm), mkexpr(e1) ); /* result overwrites E */
+ return delta;
+}
+
+
/* Get the current SSE rounding mode. */
=20
static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
@@ -7832,7 +7833,7 @@
/*OUT*/ Addr64* whereNext )
{
IRType ty;
- IRTemp addr, /* t0, */ t1, t2, t3, t4 /*, t5, t6 */;
+ IRTemp addr, t0, t1, t2, t3, t4 /*, t5, t6 */;
Int alen;
UChar opc, modrm, /*abyte,*/ pre;
Long d64;
@@ -7860,7 +7861,7 @@
vassert(guest_rip_next_assumed =3D=3D 0);
vassert(guest_rip_next_mustcheck =3D=3D False);
=20
- addr =3D /* t0 =3D */ t1 =3D t2 =3D t3 =3D t4 =3D /* t5 =3D t6 =3D */=
IRTemp_INVALID;=20
+ addr =3D t0 =3D t1 =3D t2 =3D t3 =3D t4 =3D /* t5 =3D t6 =3D */ IRTem=
p_INVALID;=20
=20
DIP("\t0x%llx: ", guest_rip_bbstart+delta);
=20
@@ -7966,15 +7967,49 @@
=20
=20
/* ---------------------------------------------------- */
- /* --- The SSE decoder. --- */
+ /* --- The SSE/SSE2 decoder. --- */
/* ---------------------------------------------------- */
=20
/* What did I do to deserve SSE ? Perhaps I was really bad in a
previous life? */
=20
- /* Note, this doesn't handle SSE2 or SSE3. That is handled in a
- later section, further on. */
+ /* Note, this doesn't handle SSE3 right now. All amd64s support
+ SSE2 as a minimum so there is no point distinguishing SSE1 vs
+ SSE2. */
=20
+ /* There are just so many damn SSE insns, and amongst them are a
+ large number of data-move insns, many of which seem almost
+ identical. Here's a statement of the behaviour of MOVQ, MOVSD,
+ MOVD, MOVSS. It doesn't help that the Intel manuals are less
+ than accurate about these. The AMD docs seem OK tho.=20
+
+ The following is true for both x86 and amd64. MOVQ and MOVSD
+ shunt 64-bit things around. r is an xmm register and m is
+ memory.
+
+ MOVQ r <- r lo64 moved; hi64 set to zero
+ MOVQ m <- r lo64 moved
+ MOVQ r <- m lo64 moved; hi64 set to zero
+
+ MOVSD r <- r lo64 moved; hi64 unchanged
+ MOVSD m <- r lo64 moved
+ MOVSD r <- m lo64 moved; hi64 set to zero
+
+ MOVD and MOVSS shunt 32-bit things around, and are exactly
+ analogous:
+
+ MOVD r <- r lo32 moved; hi96 set to zero
+ MOVD m <- r lo32 moved
+ MOVD r <- m lo32 moved; hi96 set to zero
+
+ MOVSS r <- r lo32 moved; hi96 unchanged
+ MOVSS m <- r lo32 moved
+ MOVSS r <- m lo32 moved; hi96 set to zero
+
+ For MOVQ and MOVD, the r <- r rules apply even if the source r
+ is instead an integer register.
+ */
+
insn =3D (UChar*)&guest_code[delta];
=20
//.. /* Treat fxsave specially. It should be doable even on an SSE0
@@ -9244,39 +9279,39 @@
goto decode_success;
}
=20
-//.. /* F3 0F E6 =3D CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm t=
o 2 x
-//.. F64 in xmm(G) */
-//.. if (insn[0] =3D=3D 0xF3 && insn[1] =3D=3D 0x0F && insn[2] =3D=3D=
0xE6) {
-//.. IRTemp arg64 =3D newTemp(Ity_I64);
-//.. vassert(sz =3D=3D 4);
-//..=20
-//.. modrm =3D getUChar(delta+3);
-//.. if (epartIsReg(modrm)) {
-//.. assign( arg64, getXMMRegLane64(eregOfRM(modrm), 0) );
-//.. delta +=3D 3+1;
-//.. DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
-//.. nameXMMReg(gregOfRM(modrm)));
-//.. } else {
-//.. addr =3D disAMode ( &alen, sorb, delta+3, dis_buf );
-//.. assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
-//.. delta +=3D 3+alen;
-//.. DIP("cvtdq2pd %s,%s\n", dis_buf,
-//.. nameXMMReg(gregOfRM(modrm)) );
-//.. }
-//..=20
-//.. putXMMRegLane64F(=20
-//.. gregOfRM(modrm), 0,
-//.. unop(Iop_I32toF64, unop(Iop_64to32, mkexpr(arg64)))
-//.. );
-//..=20
-//.. putXMMRegLane64F(
-//.. gregOfRM(modrm), 1,=20
-//.. unop(Iop_I32toF64, unop(Iop_64HIto32, mkexpr(arg64)))
-//.. );
-//..=20
-//.. goto decode_success;
-//.. }
-//..=20
+ /* F3 0F E6 =3D CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
+ F64 in xmm(G) */
+ if (haveF3no66noF2(pfx) && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xE6=
) {
+ IRTemp arg64 =3D newTemp(Ity_I64);
+ if (sz !=3D 4) goto decode_failure;
+
+ modrm =3D getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ assign( arg64, getXMMRegLane64(eregOfRexRM(pfx,modrm), 0) );
+ delta +=3D 2+1;
+ DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr =3D disAMode ( &alen, pfx, delta+2, dis_buf, 0 );
+ assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
+ delta +=3D 2+alen;
+ DIP("cvtdq2pd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ }
+
+ putXMMRegLane64F(=20
+ gregOfRexRM(pfx,modrm), 0,
+ unop(Iop_I32toF64, unop(Iop_64to32, mkexpr(arg64)))
+ );
+
+ putXMMRegLane64F(
+ gregOfRexRM(pfx,modrm), 1,=20
+ unop(Iop_I32toF64, unop(Iop_64HIto32, mkexpr(arg64)))
+ );
+
+ goto decode_success;
+ }
+
//.. /* 0F 5B =3D CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 i=
n
//.. xmm(G) */
//.. if (sz =3D=3D 4 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0x5B) {
@@ -9315,52 +9350,50 @@
//..=20
//.. goto decode_success;
//.. }
-//..=20
-//.. /* F2 0F E6 =3D CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I3=
2 in
-//.. lo half xmm(G), and zero upper half */
-//.. if (insn[0] =3D=3D 0xF2 && insn[1] =3D=3D 0x0F && insn[2] =3D=3D=
0xE6) {
-//.. IRTemp argV =3D newTemp(Ity_V128);
-//.. IRTemp rmode =3D newTemp(Ity_I32);
-//.. vassert(sz =3D=3D 4);
-//..=20
-//.. modrm =3D getUChar(delta+3);
-//.. if (epartIsReg(modrm)) {
-//.. assign( argV, getXMMReg(eregOfRM(modrm)) );
-//.. delta +=3D 3+1;
-//.. DIP("cvtpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
-//.. nameXMMReg(gregOfRM(modrm)));
-//.. } else {
-//.. addr =3D disAMode ( &alen, sorb, delta+3, dis_buf );
-//.. assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
-//.. delta +=3D 3+alen;
-//.. DIP("cvtpd2dq %s,%s\n", dis_buf,
-//.. nameXMMReg(gregOfRM(modrm)) );
-//.. }
-//.. =20
-//.. assign( rmode, get_sse_roundingmode() );
-//.. t0 =3D newTemp(Ity_F64);
-//.. t1 =3D newTemp(Ity_F64);
-//.. assign( t0, unop(Iop_ReinterpI64asF64,=20
-//.. unop(Iop_128to64, mkexpr(argV))) );
-//.. assign( t1, unop(Iop_ReinterpI64asF64,=20
-//.. unop(Iop_128HIto64, mkexpr(argV))) );
-//.. =20
-#if 0 /* stop gcc multi-line comment warning */
-/.. # define CVT(_t) binop( Iop_F64toI32, \
-/.. mkexpr(rmode), \
-/.. mkexpr(_t) )
-#endif /* stop gcc multi-line comment warning */
-//.. =20
-//.. putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
-//.. putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
-//.. putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
-//.. putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
-//..=20
-//.. # undef CVT
-//..=20
-//.. goto decode_success;
-//.. }
-//..=20
+
+ /* F2 0F E6 =3D CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
+ lo half xmm(G), and zero upper half */
+ if (haveF2no66noF3(pfx) && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xE6=
) {
+ IRTemp argV =3D newTemp(Ity_V128);
+ IRTemp rmode =3D newTemp(Ity_I32);
+ if (sz !=3D 4) goto decode_failure;
+
+ modrm =3D getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta +=3D 2+1;
+ DIP("cvtpd2dq %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr =3D disAMode ( &alen, pfx, delta+2, dis_buf, 0 );
+ assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta +=3D 2+alen;
+ DIP("cvtpd2dq %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ }
+ =20
+ assign( rmode, get_sse_roundingmode() );
+ t0 =3D newTemp(Ity_F64);
+ t1 =3D newTemp(Ity_F64);
+ assign( t0, unop(Iop_ReinterpI64asF64,=20
+ unop(Iop_V128to64, mkexpr(argV))) );
+ assign( t1, unop(Iop_ReinterpI64asF64,=20
+ unop(Iop_V128HIto64, mkexpr(argV))) );
+ =20
+# define CVT(_t) binop( Iop_F64toI32, \
+ mkexpr(rmode), \
+ mkexpr(_t) )
+ =20
+ putXMMRegLane32( gregOfRexRM(pfx,modrm), 3, mkU32(0) );
+ putXMMRegLane32( gregOfRexRM(pfx,modrm), 2, mkU32(0) );
+ putXMMRegLane32( gregOfRexRM(pfx,modrm), 1, CVT(t1) );
+ putXMMRegLane32( gregOfRexRM(pfx,modrm), 0, CVT(t0) );
+
+# undef CVT
+
+ goto decode_success;
+ }
+
//.. /* 66 0F 2D =3D CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
//.. I32 in mmx, according to prevailing SSE rounding mode */
//.. /* 66 0F 2C =3D CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
@@ -9909,66 +9942,95 @@
}
}
=20
-//.. /* 66 0F 6E =3D MOVD from r/m32 to xmm, zeroing high 3/4 of xmm.=
*/
-//.. if (sz =3D=3D 2 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0x6E) {
-//.. modrm =3D getUChar(delta+2);
-//.. if (epartIsReg(modrm)) {
-//.. delta +=3D 2+1;
-//.. putXMMReg(
-//.. gregOfRM(modrm),
-//.. unop( Iop_32Uto128, getIReg(4, eregOfRM(modrm)) )=20
-//.. );
-//.. DIP("movd %s, %s\n",=20
-//.. nameIReg(4,eregOfRM(modrm)), nameXMMReg(gregOfRM(modrm=
)));
-//.. } else {
-//.. addr =3D disAMode( &alen, sorb, delta+2, dis_buf );
-//.. delta +=3D 2+alen;
-//.. putXMMReg(
-//.. gregOfRM(modrm),
-//.. unop( Iop_32Uto128,loadLE(Ity_I32, mkexpr(addr)) )=20
-//.. );
-//.. DIP("movd %s, %s\n", dis_buf, nameXMMReg(gregOfRM(modrm)))=
;
-//.. }
-//.. goto decode_success;
-//.. }
-//..=20
-//.. /* 66 0F 7E =3D MOVD from xmm low 1/4 to r/m32. */
-//.. if (sz =3D=3D 2 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0x7E) {
-//.. modrm =3D getUChar(delta+2);
-//.. if (epartIsReg(modrm)) {
-//.. delta +=3D 2+1;
-//.. putIReg( 4, eregOfRM(modrm),
-//.. getXMMRegLane32(gregOfRM(modrm), 0) );
-//.. DIP("movd %s, %s\n",=20
-//.. nameXMMReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm=
)));
-//.. } else {
-//.. addr =3D disAMode( &alen, sorb, delta+2, dis_buf );
-//.. delta +=3D 2+alen;
-//.. storeLE( mkexpr(addr),
-//.. getXMMRegLane32(gregOfRM(modrm), 0) );
-//.. DIP("movd %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf)=
;
-//.. }
-//.. goto decode_success;
-//.. }
-//..=20
-//.. /* 66 0F 7F =3D MOVDQA -- move from G (xmm) to E (mem or xmm). *=
/
-//.. if (sz =3D=3D 2 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0x7F) {
-//.. modrm =3D getUChar(delta+2);
-//.. if (epartIsReg(modrm)) {
-//.. delta +=3D 2+1;
-//.. putXMMReg( eregOfRM(modrm),
-//.. getXMMReg(gregOfRM(modrm)) );
-//.. DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)),=20
-//.. nameXMMReg(eregOfRM(modrm)));
-//.. } else {
-//.. addr =3D disAMode( &alen, sorb, delta+2, dis_buf );
-//.. delta +=3D 2+alen;
-//.. storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
-//.. DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_bu=
f);
-//.. }
-//.. goto decode_success;
-//.. }
-//..=20
+ /* 66 0F 6E =3D MOVD from ireg32/m32 to xmm lo 1/4, zeroing high 3/4 =
of xmm. */
+ /* or from ireg64/m64 to xmm lo 1/2, zeroing high 1/2 of=
xmm. */
+ if (have66noF2noF3(pfx) && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0x6E=
) {
+ vassert(sz =3D=3D 4 || sz =3D=3D 8);
+ modrm =3D getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ delta +=3D 2+1;
+ if (sz =3D=3D 4) {
+ goto decode_failure; /* awaiting test case */
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )=20
+ );
+ DIP("movd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),=20
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )=20
+ );
+ DIP("movq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),=20
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+ } else {
+ addr =3D disAMode( &alen, pfx, delta+2, dis_buf, 0 );
+ delta +=3D 2+alen;
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ sz =3D=3D 4=20
+ ? unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )=20
+ : unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)) )
+ );
+ DIP("mov%c %s, %s\n", sz =3D=3D 4 ? 'd' : 'q', dis_buf,=20
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+ goto decode_success;
+ }
+
+ /* 66 0F 7E =3D MOVD from xmm low 1/4 to ireg32 or m32. */
+ /* or from xmm low 1/2 to ireg64 or m64. */
+ if (have66noF2noF3(pfx) && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0x7E=
) {
+ if (sz =3D=3D 2) sz =3D 4;
+ vassert(sz =3D=3D 4 || sz =3D=3D 8);
+ modrm =3D getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ delta +=3D 2+1;
+ if (sz =3D=3D 4) {
+ putIReg32( eregOfRexRM(pfx,modrm),
+ getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
+ DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),=20
+ nameIReg32(eregOfRexRM(pfx,modrm)));
+ } else {
+ putIReg64( eregOfRexRM(pfx,modrm),
+ getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
+ DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),=20
+ nameIReg64(eregOfRexRM(pfx,modrm)));
+ }
+ } else {
+ addr =3D disAMode( &alen, pfx, delta+2, dis_buf, 0 );
+ delta +=3D 2+alen;
+ storeLE( mkexpr(addr),
+ sz =3D=3D 4
+ ? getXMMRegLane32(gregOfRexRM(pfx,modrm),0)
+ : getXMMRegLane64(gregOfRexRM(pfx,modrm),0) );
+ DIP("mov%c %s, %s\n", sz =3D=3D 4 ? 'd' : 'q',
+ nameXMMReg(gregOfRexRM(pfx,modrm)), dis_b=
uf);
+ }
+ goto decode_success;
+ }
+
+ /* 66 0F 7F =3D MOVDQA -- move from G (xmm) to E (mem or xmm). */
+ if (have66noF2noF3(pfx) && sz =3D=3D 2=20
+ && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0x7F) {
+ modrm =3D getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ delta +=3D 2+1;
+ putXMMReg( eregOfRexRM(pfx,modrm),
+ getXMMReg(gregOfRexRM(pfx,modrm)) );
+ DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),=20
+ nameXMMReg(eregOfRexRM(pfx,modrm)));
+ } else {
+ addr =3D disAMode( &alen, pfx, delta+2, dis_buf, 0 );
+ delta +=3D 2+alen;
+ storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
+ DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_=
buf);
+ }
+ goto decode_success;
+ }
+
//.. /* F3 0F 6F =3D MOVDQU -- move from E (mem or xmm) to G (xmm). *=
/
//.. /* Unfortunately can't simply use the MOVDQA case since the
//.. prefix lengths are different (66 vs F3) */
@@ -10259,55 +10321,56 @@
goto decode_success;
}
=20
-//.. /* 66 0F C6 /r ib =3D SHUFPD -- shuffle packed F64s */
-//.. if (sz =3D=3D 2 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xC6) {
-//.. Int select;
-//.. IRTemp sV =3D newTemp(Ity_V128);
-//.. IRTemp dV =3D newTemp(Ity_V128);
-//.. IRTemp s1 =3D newTemp(Ity_I64);
-//.. IRTemp s0 =3D newTemp(Ity_I64);
-//.. IRTemp d1 =3D newTemp(Ity_I64);
-//.. IRTemp d0 =3D newTemp(Ity_I64);
-//..=20
-//.. modrm =3D insn[2];
-//.. assign( dV, getXMMReg(gregOfRM(modrm)) );
-//..=20
-//.. if (epartIsReg(modrm)) {
-//.. assign( sV, getXMMReg(eregOfRM(modrm)) );
-//.. select =3D (Int)insn[3];
-//.. delta +=3D 2+2;
-//.. DIP("shufpd $%d,%s,%s\n", select,=20
-//.. nameXMMReg(eregOfRM(modrm)),
-//.. nameXMMReg(gregOfRM(modrm)));
-//.. } else {
-//.. addr =3D disAMode ( &alen, sorb, delta+2, dis_buf );
-//.. assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
-//.. select =3D (Int)insn[2+alen];
-//.. delta +=3D 3+alen;
-//.. DIP("shufpd $%d,%s,%s\n", select,=20
-//.. dis_buf,
-//.. nameXMMReg(gregOfRM(modrm)));
-//.. }
-//..=20
-//.. assign( d1, unop(Iop_128HIto64, mkexpr(dV)) );
-//.. assign( d0, unop(Iop_128to64, mkexpr(dV)) );
-//.. assign( s1, unop(Iop_128HIto64, mkexpr(sV)) );
-//.. assign( s0, unop(Iop_128to64, mkexpr(sV)) );
-//..=20
-//.. # define SELD(n) mkexpr((n)=3D=3D0 ? d0 : d1)
-//.. # define SELS(n) mkexpr((n)=3D=3D0 ? s0 : s1)
-//..=20
-//.. putXMMReg(
-//.. gregOfRM(modrm),=20
-//.. binop(Iop_64HLto128, SELS((select>>1)&1), SELD((select>>0)=
&1) )
-//.. );
-//..=20
-//.. # undef SELD
-//.. # undef SELS
-//..=20
-//.. goto decode_success;
-//.. }
-//..=20
+ /* 66 0F C6 /r ib =3D SHUFPD -- shuffle packed F64s */
+ if (have66noF2noF3(pfx) && sz =3D=3D 2=20
+ && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xC6) {
+ Int select;
+ IRTemp sV =3D newTemp(Ity_V128);
+ IRTemp dV =3D newTemp(Ity_V128);
+ IRTemp s1 =3D newTemp(Ity_I64);
+ IRTemp s0 =3D newTemp(Ity_I64);
+ IRTemp d1 =3D newTemp(Ity_I64);
+ IRTemp d0 =3D newTemp(Ity_I64);
+
+ modrm =3D insn[2];
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ select =3D (Int)insn[3];
+ delta +=3D 2+2;
+ DIP("shufpd $%d,%s,%s\n", select,=20
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr =3D disAMode ( &alen, pfx, delta+2, dis_buf, 1 );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ select =3D (Int)insn[2+alen];
+ delta +=3D 3+alen;
+ DIP("shufpd $%d,%s,%s\n", select,=20
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( d0, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( s0, unop(Iop_V128to64, mkexpr(sV)) );
+
+# define SELD(n) mkexpr((n)=3D=3D0 ? d0 : d1)
+# define SELS(n) mkexpr((n)=3D=3D0 ? s0 : s1)
+
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),=20
+ binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) =
)
+ );
+
+# undef SELD
+# undef SELS
+
+ goto decode_success;
+ }
+
//.. /* 66 0F 51 =3D SQRTPD -- approx sqrt 64Fx2 from R/M to R */
//.. if (sz =3D=3D 2 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0x51) {
//.. delta =3D dis_SSE_E_to_G_unary_all( sorb, delta+2,=20
@@ -10434,14 +10497,15 @@
//.. sorb, delta+2, insn[1], "paddq", False );
//.. goto decode_success;
//.. }
-//..=20
-//.. /* 66 0F D4 =3D PADDQ */
-//.. if (sz =3D=3D 2 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xD4) {
-//.. delta =3D dis_SSEint_E_to_G( sorb, delta+2,=20
-//.. "paddq", Iop_Add64x2, False );
-//.. goto decode_success;
-//.. }
-//..=20
+
+ /* 66 0F D4 =3D PADDQ */
+ if (have66noF2noF3(pfx) && sz =3D=3D 2=20
+ && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xD4) {
+ delta =3D dis_SSEint_E_to_G( pfx, delta+2,=20
+ "paddq", Iop_Add64x2, False );
+ goto decode_success;
+ }
+
//.. /* 66 0F FD =3D PADDW */
//.. if (sz =3D=3D 2 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xFD) {
//.. delta =3D dis_SSEint_E_to_G( sorb, delta+2,=20
@@ -10476,13 +10540,14 @@
//.. "paddusw", Iop_QAdd16Ux8, False );
//.. goto decode_success;
//.. }
-//..=20
-//.. /* 66 0F DB =3D PAND */
-//.. if (sz =3D=3D 2 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xDB) {
-//.. delta =3D dis_SSE_E_to_G_all( sorb, delta+2, "pand", Iop_And1=
28 );
-//.. goto decode_success;
-//.. }
-//..=20
+
+ /* 66 0F DB =3D PAND */
+ if (have66noF2noF3(pfx) && sz =3D=3D 2=20
+ && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xDB) {
+ delta =3D dis_SSE_E_to_G_all( pfx, delta+2, "pand", Iop_AndV128 );
+ goto decode_success;
+ }
+
//.. /* 66 0F DF =3D PANDN */
//.. if (sz =3D=3D 2 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xDF) {
//.. delta =3D dis_SSE_E_to_G_all_invG( sorb, delta+2, "pandn", Io=
p_And128 );
@@ -10745,13 +10810,14 @@
//.. putXMMRegLane64( gregOfRM(modrm), 1, mkexpr(t1) );
//.. goto decode_success;
//.. }
-//..=20
-//.. /* 66 0F EB =3D POR */
-//.. if (sz =3D=3D 2 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xEB) {
-//.. delta =3D dis_SSE_E_to_G_all( sorb, delta+2, "por", Iop_Or128=
);
-//.. goto decode_success;
-//.. }
-//..=20
+
+ /* 66 0F EB =3D POR */
+ if (have66noF2noF3(pfx) && sz =3D=3D 2=20
+ && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xEB) {
+ delta =3D dis_SSE_E_to_G_all( pfx, delta+2, "por", Iop_OrV128 );
+ goto decode_success;
+ }
+
//.. /* 66 0F 70 =3D PSHUFD -- rearrange 4x32 from E(xmm or mem) to G=
(xmm) */
//.. if (sz =3D=3D 2 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0x70) {
//.. Int order;
@@ -11081,15 +11147,16 @@
//.. putXMMReg(reg, mkexpr(dV));
//.. goto decode_success;
//.. }
-//..=20
-//.. /* 66 0F 73 /2 ib =3D PSRLQ by immediate */
-//.. if (sz =3D=3D 2 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0x73
-//.. && epartIsReg(insn[2])
-//.. && gregOfRM(insn[2]) =3D=3D 2) {
-//.. delta =3D dis_SSE_shiftE_imm( delta+2, "psrlq", Iop_ShrN64x2 =
);
-//.. goto decode_success;
-//.. }
-//..=20
+
+ /* 66 0F 73 /2 ib =3D PSRLQ by immediate */
+ if (have66noF2noF3(pfx) && sz =3D=3D 2=20
+ && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0x73
+ && epartIsReg(insn[2])
+ && gregLO3ofRM(insn[2]) =3D=3D 2) {
+ delta =3D dis_SSE_shiftE_imm( pfx, delta+2, "psrlq", Iop_ShrN64x2 =
);
+ goto decode_success;
+ }
+
//.. /* 66 0F D3 =3D PSRLQ by E */
//.. if (sz =3D=3D 2 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xD3) {
//.. delta =3D dis_SSE_shiftG_byE( sorb, delta+2, "psrlq", Iop_Shr=
N64x2 );
@@ -11132,14 +11199,15 @@
//.. sorb, delta+2, insn[1], "psubq", False );
//.. goto decode_success;
//.. }
-//..=20
-//.. /* 66 0F FB =3D PSUBQ */
-//.. if (sz =3D=3D 2 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xFB) {
-//.. delta =3D dis_SSEint_E_to_G( sorb, delta+2,=20
-//.. "psubq", Iop_Sub64x2, False );
-//.. goto decode_success;
-//.. }
-//..=20
+
+ /* 66 0F FB =3D PSUBQ */
+ if (have66noF2noF3(pfx) && sz =3D=3D 2=20
+ && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xFB) {
+ delta =3D dis_SSEint_E_to_G( pfx, delta+2,=20
+ "psubq", Iop_Sub64x2, False );
+ goto decode_success;
+ }
+
//.. /* 66 0F F9 =3D PSUBW */
//.. if (sz =3D=3D 2 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xF9) {
//.. delta =3D dis_SSEint_E_to_G( sorb, delta+2,=20
@@ -11238,14 +11306,14 @@
//.. Iop_InterleaveLO16x8, True );
//.. goto decode_success;
//.. }
-//..=20
-//.. /* 66 0F EF =3D PXOR */
-//.. if (sz =3D=3D 2 && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xEF) {
-//.. delta =3D dis_SSE_E_to_G_all( sorb, delta+2, "pxor", Iop_Xor1=
28 );
-//.. goto decode_success;
-//.. }
-//..=20
-//..=20
+
+ /* 66 0F EF =3D PXOR */
+ if (have66noF2noF3(pfx) && sz =3D=3D 2=20
+ && insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xEF) {
+ delta =3D dis_SSE_E_to_G_all( pfx, delta+2, "pxor", Iop_XorV128 );
+ goto decode_success;
+ }
+
//.. //-- /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. =
*/
//.. //-- if (insn[0] =3D=3D 0x0F && insn[1] =3D=3D 0xAE=20
//.. //-- && (!epartIsReg(insn[2]))
Modified: trunk/priv/host-amd64/hdefs.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- trunk/priv/host-amd64/hdefs.c 2005-05-05 12:05:54 UTC (rev 1162)
+++ trunk/priv/host-amd64/hdefs.c 2005-05-05 21:34:02 UTC (rev 1163)
@@ -606,7 +606,7 @@
//.. case Xsse_ADD8: return "paddb";
//.. case Xsse_ADD16: return "paddw";
//.. case Xsse_ADD32: return "paddd";
-//.. case Xsse_ADD64: return "paddq";
+ case Asse_ADD64: return "paddq";
//.. case Xsse_QADD8U: return "paddusb";
//.. case Xsse_QADD16U: return "paddusw";
//.. case Xsse_QADD8S: return "paddsb";
@@ -614,7 +614,7 @@
//.. case Xsse_SUB8: return "psubb";
//.. case Xsse_SUB16: return "psubw";
//.. case Xsse_SUB32: return "psubd";
-//.. case Xsse_SUB64: return "psubq";
+ case Asse_SUB64: return "psubq";
//.. case Xsse_QSUB8U: return "psubusb";
//.. case Xsse_QSUB16U: return "psubusw";
//.. case Xsse_QSUB8S: return "psubsb";
@@ -630,7 +630,7 @@
//.. case Xsse_MIN8U: return "pminub";
//.. case Xsse_CMPEQ8: return "pcmpeqb";
//.. case Xsse_CMPEQ16: return "pcmpeqw";
-//.. case Xsse_CMPEQ32: return "pcmpeqd";
+ case Asse_CMPEQ32: return "pcmpeqd";
//.. case Xsse_CMPGT8S: return "pcmpgtb";
//.. case Xsse_CMPGT16S: return "pcmpgtw";
//.. case Xsse_CMPGT32S: return "pcmpgtd";
@@ -639,7 +639,7 @@
//.. case Xsse_SHL64: return "psllq";
//.. case Xsse_SHR16: return "psrlw";
//.. case Xsse_SHR32: return "psrld";
-//.. case Xsse_SHR64: return "psrlq";
+ case Asse_SHR64: return "psrlq";
//.. case Xsse_SAR16: return "psraw";
//.. case Xsse_SAR32: return "psrad";
//.. case Xsse_PACKSSD: return "packssdw";
@@ -1001,15 +1001,15 @@
vassert(cond !=3D Acc_ALWAYS);
return i;
}
-//.. AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
-//.. AMD64Instr* i =3D LibVEX_Alloc(sizeof(AMD64Instr));
-//.. i->tag =3D Xin_SseShuf;
-//.. i->Xin.SseShuf.order =3D order;
-//.. i->Xin.SseShuf.src =3D src;
-//.. i->Xin.SseShuf.dst =3D dst;
-//.. vassert(order >=3D 0 && order <=3D 0xFF);
-//.. return i;
-//.. }
+AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) { /* build an Ain_SseShuf node (printed as pshufd) */
+ AMD64Instr* i =3D LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag =3D Ain_SseShuf;
+ i->Ain.SseShuf.order =3D order; /* later printed and emitted as the immediate byte */
+ i->Ain.SseShuf.src =3D src;
+ i->Ain.SseShuf.dst =3D dst;
+ vassert(order >=3D 0 && order <=3D 0xFF); /* order must fit in a single byte */
+ return i;
+}
=20
void ppAMD64Instr ( AMD64Instr* i )=20
{
@@ -1303,12 +1303,12 @@
vex_printf(",");
ppHRegAMD64(i->Ain.SseCMov.dst);
return;
-//.. case Xin_SseShuf:
-//.. vex_printf("pshufd $0x%x,", i->Xin.SseShuf.order);
-//.. ppHRegAMD64(i->Xin.SseShuf.src);
-//.. vex_printf(",");
-//.. ppHRegAMD64(i->Xin.SseShuf.dst);
-//.. return;
+ case Ain_SseShuf:
+ vex_printf("pshufd $0x%x,", i->Ain.SseShuf.order);
+ ppHRegAMD64(i->Ain.SseShuf.src);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.SseShuf.dst);
+ return;
=20
default:
vpanic("ppAMD64Instr");
@@ -1572,10 +1572,10 @@
addHRegUse(u, HRmRead, i->Ain.SseCMov.src);
addHRegUse(u, HRmModify, i->Ain.SseCMov.dst);
return;
-//.. case Xin_SseShuf:
-//.. addHRegUse(u, HRmRead, i->Xin.SseShuf.src);
-//.. addHRegUse(u, HRmWrite, i->Xin.SseShuf.dst);
-//.. return;
+ case Ain_SseShuf:
+ addHRegUse(u, HRmRead, i->Ain.SseShuf.src);
+ addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
+ return;
default:
ppAMD64Instr(i);
vpanic("getRegUsage_AMD64Instr");
@@ -1737,10 +1737,10 @@
mapReg(m, &i->Ain.SseCMov.src);
mapReg(m, &i->Ain.SseCMov.dst);
return;
-//.. case Xin_SseShuf:
-//.. mapReg(m, &i->Xin.SseShuf.src);
-//.. mapReg(m, &i->Xin.SseShuf.dst);
-//.. return;
+ case Ain_SseShuf:
+ mapReg(m, &i->Ain.SseShuf.src);
+ mapReg(m, &i->Ain.SseShuf.dst);
+ return;
default:
ppAMD64Instr(i);
vpanic("mapRegs_AMD64Instr");
@@ -3203,7 +3203,7 @@
//.. case Xsse_ADD8: XX(0x66); XX(rex); XX(0x0F); XX(0xFC);=
break;
//.. case Xsse_ADD16: XX(0x66); XX(rex); XX(0x0F); XX(0xFD);=
break;
//.. case Xsse_ADD32: XX(0x66); XX(rex); XX(0x0F); XX(0xFE);=
break;
-//.. case Xsse_ADD64: XX(0x66); XX(rex); XX(0x0F); XX(0xD4);=
break;
+ case Asse_ADD64: XX(0x66); XX(rex); XX(0x0F); XX(0xD4); brea=
k;
//.. case Xsse_QADD8S: XX(0x66); XX(rex); XX(0x0F); XX(0xEC);=
break;
//.. case Xsse_QADD16S: XX(0x66); XX(rex); XX(0x0F); XX(0xED);=
break;
//.. case Xsse_QADD8U: XX(0x66); XX(rex); XX(0x0F); XX(0xDC);=
break;
@@ -3212,7 +3212,7 @@
//.. case Xsse_AVG16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE3);=
break;
//.. case Xsse_CMPEQ8: XX(0x66); XX(rex); XX(0x0F); XX(0x74);=
break;
//.. case Xsse_CMPEQ16: XX(0x66); XX(rex); XX(0x0F); XX(0x75);=
break;
-//.. case Xsse_CMPEQ32: XX(0x66); XX(rex); XX(0x0F); XX(0x76);=
break;
+ case Asse_CMPEQ32: XX(0x66); XX(rex); XX(0x0F); XX(0x76); brea=
k;
//.. case Xsse_CMPGT8S: XX(0x66); XX(rex); XX(0x0F); XX(0x64);=
break;
//.. case Xsse_CMPGT16S: XX(0x66); XX(rex); XX(0x0F); XX(0x65);=
break;
//.. case Xsse_CMPGT32S: XX(0x66); XX(rex); XX(0x0F); XX(0x66);=
break;
@@ -3230,11 +3230,11 @@
//.. case Xsse_SAR32: XX(0x66); XX(rex); XX(0x0F); XX(0xE2);=
break;
//.. case Xsse_SHR16: XX(0x66); XX(rex); XX(0x0F); XX(0xD1);=
break;
//.. case Xsse_SHR32: XX(0x66); XX(rex); XX(0x0F); XX(0xD2);=
break;
-//.. case Xsse_SHR64: XX(0x66); XX(rex); XX(0x0F); XX(0xD3);=
break;
+ case Asse_SHR64: XX(0x66); XX(rex); XX(0x0F); XX(0xD3); break;
//.. case Xsse_SUB8: XX(0x66); XX(rex); XX(0x0F); XX(0xF8);=
break;
//.. case Xsse_SUB16: XX(0x66); XX(rex); XX(0x0F); XX(0xF9);=
break;
//.. case Xsse_SUB32: XX(0x66); XX(rex); XX(0x0F); XX(0xFA);=
break;
-//.. case Xsse_SUB64: XX(0x66); XX(rex); XX(0x0F); XX(0xFB);=
break;
+ case Asse_SUB64: XX(0x66); XX(rex); XX(0x0F); XX(0xFB); break;
//.. case Xsse_QSUB8S: XX(0x66); XX(rex); XX(0x0F); XX(0xE8);=
break;
//.. case Xsse_QSUB16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE9);=
break;
//.. case Xsse_QSUB8U: XX(0x66); XX(rex); XX(0x0F); XX(0xD8);=
break;
@@ -3273,15 +3273,18 @@
*(ptmp-1) = toUChar(p - ptmp);
goto done;
=20
-//.. case Xin_SseShuf:
-//.. *p++ =3D 0x66;=20
-//.. *p++ =3D 0x0F;=20
-//.. *p++ =3D 0x70;=20
-//.. p =3D doAMode_R(p, fake(vregNo(i->Xin.SseShuf.dst)),
-//.. fake(vregNo(i->Xin.SseShuf.src)) );
-//.. *p++ =3D (UChar)(i->Xin.SseShuf.order);
-//.. goto done;
-//..=20
+ case Ain_SseShuf:
+ *p++ =3D 0x66;=20
+ *p++ =3D clearWBit(
+ rexAMode_R( vreg2ireg(i->Ain.SseShuf.dst),
+ vreg2ireg(i->Ain.SseShuf.src) ));
+ *p++ =3D 0x0F;=20
+ *p++ =3D 0x70;=20
+ p =3D doAMode_R(p, vreg2ireg(i->Ain.SseShuf.dst),
+ vreg2ireg(i->Ain.SseShuf.src) );
+ *p++ =3D (UChar)(i->Ain.SseShuf.order);
+ goto done;
+
default:=20
goto bad;
}
Modified: trunk/priv/host-amd64/hdefs.h
===================================================================
--- trunk/priv/host-amd64/hdefs.h 2005-05-05 12:05:54 UTC (rev 1162)
+++ trunk/priv/host-amd64/hdefs.h 2005-05-05 21:34:02 UTC (rev 1163)
@@ -322,10 +322,12 @@
/* Bitwise */
Asse_AND, Asse_OR, Asse_XOR, Asse_ANDN,
//.. /* Integer binary */
-//.. Xsse_ADD8, Xsse_ADD16, Xsse_ADD32, Xsse_ADD64,
+//.. Xsse_ADD8, Xsse_ADD16, Xsse_ADD32,
+ Asse_ADD64,
//.. Xsse_QADD8U, Xsse_QADD16U,
//.. Xsse_QADD8S, Xsse_QADD16S,
-//.. Xsse_SUB8, Xsse_SUB16, Xsse_SUB32, Xsse_SUB64,
+//.. Xsse_SUB8, Xsse_SUB16, Xsse_SUB32,
+ Asse_SUB64,
//.. Xsse_QSUB8U, Xsse_QSUB16U,
//.. Xsse_QSUB8S, Xsse_QSUB16S,
//.. Xsse_MUL16,
@@ -336,10 +338,12 @@
//.. Xsse_MAX8U,
//.. Xsse_MIN16S,
//.. Xsse_MIN8U,
-//.. Xsse_CMPEQ8, Xsse_CMPEQ16, Xsse_CMPEQ32,
+//.. Xsse_CMPEQ8, Xsse_CMPEQ16, =20
+ Asse_CMPEQ32,
//.. Xsse_CMPGT8S, Xsse_CMPGT16S, Xsse_CMPGT32S,
//.. Xsse_SHL16, Xsse_SHL32, Xsse_SHL64,
-//.. Xsse_SHR16, Xsse_SHR32, Xsse_SHR64,
+//.. Xsse_SHR16, Xsse_SHR32,=20
+ Asse_SHR64,
//.. Xsse_SAR16, Xsse_SAR32,=20
//.. Xsse_PACKSSD, Xsse_PACKSSW, Xsse_PACKUSW,
//.. Xsse_UNPCKHB, Xsse_UNPCKHW, Xsse_UNPCKHD, Xsse_UNPCKHQ,
@@ -397,7 +401,7 @@
Ain_Sse64FLo, /* SSE binary, 64F in lowest lane only */
Ain_SseReRg, /* SSE binary general reg-reg, Re, Rg */
Ain_SseCMov, /* SSE conditional move */
-//.. Xin_SseShuf /* SSE2 shuffle (pshufd) */
+ Ain_SseShuf /* SSE2 shuffle (pshufd) */
}
AMD64InstrTag;
=20
@@ -642,11 +646,11 @@
HReg src;
HReg dst;
} SseCMov;
-//.. struct {
-//.. Int order; /* 0 <=3D order <=3D 0xFF */
-//.. HReg src;
-//.. HReg dst;
-//.. } SseShuf;
+ struct {
+ Int order; /* 0 <= order <= 0xFF */
+ HReg src;
+ HReg dst;
+ } SseShuf;
=20
} Ain;
}
@@ -695,7 +699,7 @@
extern AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp, HReg, HReg );
extern AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp, HReg, HReg );
extern AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode, HReg src, HReg dst );
-//.. extern AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg=
dst );
+extern AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst );
=20
=20
extern void ppAMD64Instr ( AMD64Instr* );
Modified: trunk/priv/host-amd64/isel.c
===================================================================
--- trunk/priv/host-amd64/isel.c 2005-05-05 12:05:54 UTC (rev 1162)
+++ trunk/priv/host-amd64/isel.c 2005-05-05 21:34:02 UTC (rev 1163)
@@ -699,19 +699,16 @@
//.. }
=20
=20
-/* Generate !src into a new vector register, and be sure that the code
- is SSE1 compatible. Amazing that Intel doesn't offer a less crappy
- way to do this.=20
+/* Generate !src into a new vector register. Amazing that there isn't
+ a less crappy way to do this.
*/
static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
{
HReg dst = newVRegV(env);
- /* Set dst to zero. Not strictly necessary, but the idea of doing
- a FP comparison on whatever junk happens to be floating around
- in it is just too scary. */
+ /* Set dst to zero. Not strictly necessary. */
addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
/* And now make it all 1s ... */
- addInstr(env, AMD64Instr_Sse32Fx4(Asse_CMPEQF, dst, dst));
+ addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
/* Finally, xor 'src' into it. */
addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
return dst;
@@ -3049,34 +3046,33 @@
return do_sse_NotV128(env, arg);
}
=20
-//.. case Iop_CmpNEZ64x2: {
-//.. /* We can use SSE2 instructions for this. */
-//.. /* Ideally, we want to do a 64Ix2 comparison against zero =
of
-//.. the operand. Problem is no such insn exists. Solution
-//.. therefore is to do a 32Ix4 comparison instead, and bitw=
ise-
-//.. negate (NOT) the result. Let a,b,c,d be 32-bit lanes, =
and=20
-//.. let the not'd result of this initial comparison be a:b:=
c:d.
-//.. What we need to compute is (a|b):(a|b):(c|d):(c|d). So=
, use
-//.. pshufd to create a value b:a:d:c, and OR that with a:b:=
c:d,
-//.. giving the required result.
-//..=20
-//.. The required selection sequence is 2,3,0,1, which
-//.. according to Intel's documentation means the pshufd
-//.. literal value is 0xB1, that is,=20
-//.. (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)=20
-//.. */
-//.. HReg arg =3D iselVecExpr(env, e->Iex.Unop.arg);
-//.. HReg tmp =3D newVRegV(env);
-//.. HReg dst =3D newVRegV(env);
-//.. REQUIRE_SSE2;
-//.. addInstr(env, X86Instr_SseReRg(Xsse_XOR, tmp, tmp));
-//.. addInstr(env, X86Instr_SseReRg(Xsse_CMPEQ32, arg, tmp));
-//.. tmp =3D do_sse_Not128(env, tmp);
-//.. addInstr(env, X86Instr_SseShuf(0xB1, tmp, dst));
-//.. addInstr(env, X86Instr_SseReRg(Xsse_OR, tmp, dst));
-//.. return dst;
-//.. }
-//..=20
+ case Iop_CmpNEZ64x2: {
+ /* We can use SSE2 instructions for this. */
+ /* Ideally, we want to do a 64Ix2 comparison against zero of
+ the operand. Problem is no such insn exists. Solution
+ therefore is to do a 32Ix4 comparison instead, and bitwise-
+ negate (NOT) the result. Let a,b,c,d be 32-bit lanes, and=20
+ let the not'd result of this initial comparison be a:b:c:d.
+ What we need to compute is (a|b):(a|b):(c|d):(c|d). So, use
+ pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
+ giving the required result.
+
+ The required selection sequence is 2,3,0,1, which
+ according to Intel's documentation means the pshufd
+ literal value is 0xB1, that is,=20
+ (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)=20
+ */
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg tmp = newVRegV(env);
+ HReg dst = newVRegV(env);
+ addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, tmp));
+ addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
+ tmp = do_sse_NotV128(env, tmp);
+ addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
+ addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
+ return dst;
+ }
+
//.. case Iop_CmpNEZ32x4: {
//.. /* Sigh, we have to generate lousy code since this has to
//.. work on SSE1 hosts */
@@ -3355,7 +3351,7 @@
//.. case Iop_Add8x16: op =3D Xsse_ADD8; goto do_SseReRg;
//.. case Iop_Add16x8: op =3D Xsse_ADD16; goto do_SseReRg;
//.. case Iop_Add32x4: op =3D Xsse_ADD32; goto do_SseReRg;
-//.. case Iop_Add64x2: op =3D Xsse_ADD64; goto do_SseReRg;
+ case Iop_Add64x2: op = Asse_ADD64; goto do_SseReRg;
//.. case Iop_QAdd8Sx16: op =3D Xsse_QADD8S; goto do_SseReRg;
//.. case Iop_QAdd16Sx8: op =3D Xsse_QADD16S; goto do_SseReRg;
//.. case Iop_QAdd8Ux16: op =3D Xsse_QADD8U; goto do_SseReRg;
@@ -3378,7 +3374,7 @@
//.. case Iop_Sub8x16: op =3D Xsse_SUB8; goto do_SseReRg;
//.. case Iop_Sub16x8: op =3D Xsse_SUB16; goto do_SseReRg;
//.. case Iop_Sub32x4: op =3D Xsse_SUB32; goto do_SseReRg;
-//.. case Iop_Sub64x2: op =3D Xsse_SUB64; goto do_SseReRg;
+ case Iop_Sub64x2: op = Asse_SUB64; goto do_SseReRg;
//.. case Iop_QSub8Sx16: op =3D Xsse_QSUB8S; goto do_SseReRg;
//.. case Iop_QSub16Sx8: op =3D Xsse_QSUB16S; goto do_SseReRg;
//.. case Iop_QSub8Ux16: op =3D Xsse_QSUB8U; goto do_SseReRg;
@@ -3405,24 +3401,21 @@
//.. case Iop_SarN32x4: op =3D Xsse_SAR32; goto do_SseShift;
//.. case Iop_ShrN16x8: op =3D Xsse_SHR16; goto do_SseShift;
//.. case Iop_ShrN32x4: op =3D Xsse_SHR32; goto do_SseShift;
-//.. case Iop_ShrN64x2: op =3D Xsse_SHR64; goto do_SseShift;
-//.. do_SseShift: {
-//.. HReg greg =3D iselVecExpr(env, e->Iex.Binop.arg1);
-//.. X86RMI* rmi =3D iselIntExpr_RMI(env, e->Iex.Binop.arg2)=
;
-//.. X86AMode* esp0 =3D X86AMode_IR(0, hregX86_ESP());
-//.. HReg ereg =3D newVRegV(env);
-//.. HReg dst =3D newVRegV(env);
-//.. REQUIRE_SSE2;
-//.. addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
-//.. addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
-//.. addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
-//.. addInstr(env, X86Instr_Push(rmi));
-//.. addInstr(env, X86Instr_SseLdSt(True/*load*/, ereg, esp0));
-//.. addInstr(env, mk_vMOVsd_RR(greg, dst));
-//.. addInstr(env, X86Instr_SseReRg(op, ereg, dst));
-//.. add_to_esp(env, 16);
-//.. return dst;
-//.. }
+ case Iop_ShrN64x2: op = Asse_SHR64; goto do_SseShift;
+ do_SseShift: {
+ HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
+ AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
+ AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
+ HReg ereg = newVRegV(env);
+ HReg dst = newVRegV(env);
+ addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
+ addInstr(env, AMD64Instr_Push(rmi));
+ addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
+ addInstr(env, mk_vMOVsd_RR(greg, dst));
+ addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
+ add_to_rsp(env, 16);
+ return dst;
+ }
=20
default:
break;
|