|
[Valgrind-developers] vex: r2379: Make a start at implementing
256-bit AVX instructions generated by
From: <sv...@va...> - 2012-06-12 08:45:56
|
sewardj 2012-06-12 09:45:39 +0100 (Tue, 12 Jun 2012)
New Revision: 2379
Log:
Make a start at implementing 256-bit AVX instructions generated by
"gcc-4.7.0 -mavx -O3":
VMOVUPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 10 /r
VMOVUPS xmm2/m128, xmm1 = VEX.128.0F.WIG 10 /r
VUNPCKHPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 15 /r
VUNPCKLPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 14 /r
VUNPCKHPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 15 /r
VADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 58 /r
VADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 58 /r
VADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 58 /r
VMULPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 59 /r
VMULPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 59 /r
VMULPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 59 /r
VSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5C /r
VSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5C /r
VSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5C /r
VDIVPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5E /r
VDIVPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5E /r
VPSRLQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /2 ib
VPCMPEQQ = VEX.NDS.128.66.0F38.WIG 29 /r
VPCMPGTQ = VEX.NDS.128.66.0F38.WIG 37 /r
VPEXTRQ = VEX.128.66.0F3A.W1 16 /r ib
Modified files:
trunk/priv/guest_amd64_toIR.c
trunk/priv/host_amd64_isel.c
trunk/priv/ir_defs.c
trunk/pub/libvex_ir.h
Modified: trunk/priv/guest_amd64_toIR.c (+323 -71)
===================================================================
--- trunk/priv/guest_amd64_toIR.c 2012-06-11 22:54:58 +01:00 (rev 2378)
+++ trunk/priv/guest_amd64_toIR.c 2012-06-12 09:45:39 +01:00 (rev 2379)
@@ -9751,21 +9751,39 @@
/* FIXME: why not just use InterleaveLO / InterleaveHI ?? */
-static IRTemp math_UNPCKxPS_128 ( IRTemp sV, IRTemp dV, UChar opc )
+/* Does the maths for 128 bit versions of UNPCKLPS and UNPCKHPS */
+static IRTemp math_UNPCKxPS_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
+/* xIsH==True  -> result = s3:d3:s2:d2 (UNPCKHPS, high 32-bit lanes);
+   xIsH==False -> result = s1:d1:s0:d0 (UNPCKLPS, low 32-bit lanes).
+   sV is the E (rightmost) operand, dV the G/V operand. */
{
IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
- Bool hi = toBool(opc == 0x15);
- vassert(opc == 0x15/*UNPCKLPS*/ || opc == 0x14/*UNPCKHPS*/);
s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
breakup128to32s( dV, &d3, &d2, &d1, &d0 );
breakup128to32s( sV, &s3, &s2, &s1, &s0 );
IRTemp res = newTemp(Ity_V128);
- assign(res, hi ? mk128from32s( s3, d3, s2, d2 )
- : mk128from32s( s1, d1, s0, d0 ));
+ assign(res, xIsH ? mk128from32s( s3, d3, s2, d2 )
+ : mk128from32s( s1, d1, s0, d0 ));
return res;
}
+/* FIXME: why not just use InterleaveLO / InterleaveHI ?? */
+/* Does the maths for 128 bit versions of UNPCKLPD and UNPCKHPD */
+/* xIsH==True  -> result = sV[127:64] : dV[127:64]  (UNPCKHPD)
+   xIsH==False -> result = sV[63:0]   : dV[63:0]    (UNPCKLPD)
+   sV is the E (rightmost) operand, dV the G/V operand. */
+static IRTemp math_UNPCKxPD_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
+{
+ IRTemp s1 = newTemp(Ity_I64);
+ IRTemp s0 = newTemp(Ity_I64);
+ IRTemp d1 = newTemp(Ity_I64);
+ IRTemp d0 = newTemp(Ity_I64);
+ assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( d0, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( s0, unop(Iop_V128to64, mkexpr(sV)) );
+ IRTemp res = newTemp(Ity_V128);
+ assign(res, xIsH ? binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1))
+ : binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)));
+ return res;
+}
+
+
static IRTemp math_SHUFPS ( IRTemp sV, IRTemp dV, UInt imm8 )
{
IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
@@ -10135,7 +10153,7 @@
DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
dis_buf, nameXMMReg(rG));
}
- IRTemp res = math_UNPCKxPS_128( sV, dV, opc );
+ IRTemp res = math_UNPCKxPS_128( sV, dV, hi );
putXMMReg( rG, mkexpr(res) );
goto decode_success;
}
@@ -10144,45 +10162,27 @@
/* These just appear to be special cases of SHUFPS */
if (have66noF2noF3(pfx)
&& sz == 2 /* could be 8 if rex also present */) {
- IRTemp s1 = newTemp(Ity_I64);
- IRTemp s0 = newTemp(Ity_I64);
- IRTemp d1 = newTemp(Ity_I64);
- IRTemp d0 = newTemp(Ity_I64);
+ Bool hi = toBool(opc == 0x15);
IRTemp sV = newTemp(Ity_V128);
IRTemp dV = newTemp(Ity_V128);
- Bool hi = toBool(opc == 0x15);
-
modrm = getUChar(delta);
- assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
-
+ UInt rG = gregOfRexRM(pfx,modrm);
+ assign( dV, getXMMReg(rG) );
if (epartIsReg(modrm)) {
- assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ UInt rE = eregOfRexRM(pfx,modrm);
+ assign( sV, getXMMReg(rE) );
delta += 1;
DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
- nameXMMReg(eregOfRexRM(pfx,modrm)),
- nameXMMReg(gregOfRexRM(pfx,modrm)));
+ nameXMMReg(rE), nameXMMReg(rG));
} else {
addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
delta += alen;
DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
- dis_buf,
- nameXMMReg(gregOfRexRM(pfx,modrm)));
+ dis_buf, nameXMMReg(rG));
}
-
- assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
- assign( d0, unop(Iop_V128to64, mkexpr(dV)) );
- assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
- assign( s0, unop(Iop_V128to64, mkexpr(sV)) );
-
- if (hi) {
- putXMMReg( gregOfRexRM(pfx,modrm),
- binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
- } else {
- putXMMReg( gregOfRexRM(pfx,modrm),
- binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
- }
-
+ IRTemp res = math_UNPCKxPD_128( sV, dV, hi );
+ putXMMReg( rG, mkexpr(res) );
goto decode_success;
}
break;
@@ -15579,6 +15579,54 @@
}
+/* Decode [V]PEXTRQ: extract the imm8-selected 64-bit lane of the
+   G (xmm) register into a 64-bit GPR or a 64-bit memory location.
+   isAvx only selects the "v" prefix in the disassembly text; the IR
+   generated is identical for the SSE4 and AVX forms.  The caller must
+   have established REX.W==1.  Returns the updated 'delta'. */
+static Long dis_PEXTRQ ( VexAbiInfo* vbi, Prefix pfx,
+ Long delta, Bool isAvx )
+{
+ IRTemp addr = IRTemp_INVALID;
+ UChar modrm = 0;
+ Int alen = 0;
+ HChar dis_buf[50];
+
+ Int imm8_0;
+ IRTemp xmm_vec = newTemp(Ity_V128);
+ IRTemp src_qword = newTemp(Ity_I64);
+ HChar* mbV = isAvx ? "v" : "";
+
+ vassert(1==getRexW(pfx)); /* ensured by caller */
+ modrm = getUChar(delta);
+ assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
+
+ /* Only bit 0 of imm8 matters: it selects lane 0 (low) or 1 (high). */
+ if ( epartIsReg( modrm ) ) {
+ imm8_0 = (Int)(getUChar(delta+1) & 1);
+ } else {
+ /* final arg 1: tell disAMode an imm8 byte follows the amode,
+    hence the imm8 read at delta+alen below */
+ addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
+ imm8_0 = (Int)(getUChar(delta+alen) & 1);
+ }
+
+ switch ( imm8_0 ) {
+ case 0: assign( src_qword, unop(Iop_V128to64, mkexpr(xmm_vec)) );
+ break;
+ case 1: assign( src_qword, unop(Iop_V128HIto64, mkexpr(xmm_vec)) );
+ break;
+ default: vassert(0);
+ }
+
+ /* Write the selected lane to the E operand (GPR or memory). */
+ if ( epartIsReg( modrm ) ) {
+ putIReg64( eregOfRexRM(pfx,modrm), mkexpr(src_qword) );
+ delta += 1+1; /* modrm byte + imm8 byte */
+ DIP( "%spextrq $%d, %s,%s\n", mbV, imm8_0,
+ nameXMMReg( gregOfRexRM(pfx, modrm) ),
+ nameIReg64( eregOfRexRM(pfx, modrm) ) );
+ } else {
+ storeLE( mkexpr(addr), mkexpr(src_qword) );
+ delta += alen+1; /* amode + imm8 byte */
+ DIP( "%spextrq $%d, %s,%s\n", mbV,
+ imm8_0, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
+ }
+ return delta;
+}
+
+
/* This can fail, in which case it returns the original (unchanged)
delta. */
static Long dis_PCMPxSTRx ( VexAbiInfo* vbi, Prefix pfx,
@@ -16261,41 +16309,7 @@
here the REX.W bit is present */
if (have66noF2noF3(pfx)
&& sz == 8 /* REX.W is present */) {
-
- Int imm8_0;
- IRTemp xmm_vec = newTemp(Ity_V128);
- IRTemp src_qword = newTemp(Ity_I64);
-
- modrm = getUChar(delta);
- assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
-
- if ( epartIsReg( modrm ) ) {
- imm8_0 = (Int)(getUChar(delta+1) & 1);
- } else {
- addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
- imm8_0 = (Int)(getUChar(delta+alen) & 1);
- }
- switch ( imm8_0 ) {
- case 0: assign( src_qword, unop(Iop_V128to64, mkexpr(xmm_vec)) );
- break;
- case 1: assign( src_qword, unop(Iop_V128HIto64, mkexpr(xmm_vec)) );
- break;
- default: vassert(0);
- }
-
- if ( epartIsReg( modrm ) ) {
- putIReg64( eregOfRexRM(pfx,modrm), mkexpr(src_qword) );
- delta += 1+1;
- DIP( "pextrq $%d, %s,%s\n", imm8_0,
- nameXMMReg( gregOfRexRM(pfx, modrm) ),
- nameIReg64( eregOfRexRM(pfx, modrm) ) );
- } else {
- storeLE( mkexpr(addr), mkexpr(src_qword) );
- delta += alen+1;
- DIP( "pextrq $%d, %s,%s\n",
- imm8_0, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
- }
-
+ delta = dis_PEXTRQ( vbi, pfx, delta, False/*!isAvx*/);
goto decode_success;
}
break;
@@ -19316,6 +19330,7 @@
/*--- ---*/
/*------------------------------------------------------------*/
+/* FIXME: common up with the _256_ version below? */
static
Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG (
/*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
@@ -19436,7 +19451,7 @@
//case Iop_SarN32x4: sar = True; size = 32; break;
case Iop_ShrN16x8: shr = True; size = 16; break;
case Iop_ShrN32x4: shr = True; size = 32; break;
- //case Iop_ShrN64x2: shr = True; size = 64; break;
+ case Iop_ShrN64x2: shr = True; size = 64; break;
default: vassert(0);
}
@@ -19594,7 +19609,7 @@
/* All-lanes AVX128 binary operation:
- G[127:0] = V127:0] `op` E[127:0]
+ G[127:0] = V[127:0] `op` E[127:0]
G[255:128] = 0.
*/
static Long dis_AVX128_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
@@ -19752,6 +19767,80 @@
}
+/* FIXME: common up with the _128_ version above? */
+/* Handles a VEX.NDS-encoded 256-bit binary operation of the form
+      ymmG = ymmV `op` (ymmE or m256)
+   where V comes from VEX.vvvv.  Exactly one of 'op' / 'opFn' must be
+   supplied (the other being Iop_INVALID / NULL).  swapArgs reverses
+   the operand order; invertLeftArg must currently be False, since
+   Iop_NotV256 does not exist yet (see vassert below).  Always sets
+   *uses_vvvv and returns the updated delta. */
+static
+Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG (
+ /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
+ Prefix pfx, Long delta, HChar* name,
+ /* The actual operation. Use either 'op' or 'opFn',
+ but not both. */
+ IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
+ Bool invertLeftArg,
+ Bool swapArgs
+ )
+{
+ UChar modrm = getUChar(delta);
+ UInt rD = gregOfRexRM(pfx, modrm);
+ UInt rSL = getVexNvvvv(pfx);
+ IRTemp tSL = newTemp(Ity_V256);
+ IRTemp tSR = newTemp(Ity_V256);
+ IRTemp addr = IRTemp_INVALID;
+ HChar dis_buf[50];
+ Int alen = 0;
+ vassert(1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*WIG?*/);
+
+ // Hmm. we don't actually have Iop_NotV256 (yet). Hence kludge:
+ vassert(!invertLeftArg);
+ assign(tSL, /* invertLeftArg ? unop(Iop_NotV256, getYMMReg(rSL))
+ : */ getYMMReg(rSL));
+
+ /* Right-hand source: ymm register or 256-bit memory load. */
+ if (epartIsReg(modrm)) {
+ UInt rSR = eregOfRexRM(pfx, modrm);
+ delta += 1;
+ assign(tSR, getYMMReg(rSR));
+ DIP("%s %s,%s,%s\n",
+ name, nameYMMReg(rSR), nameYMMReg(rSL), nameYMMReg(rD));
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
+ delta += alen;
+ assign(tSR, loadLE(Ity_V256, mkexpr(addr)));
+ DIP("%s %s,%s,%s\n",
+ name, dis_buf, nameYMMReg(rSL), nameYMMReg(rD));
+ }
+
+ IRTemp res = IRTemp_INVALID;
+ if (op != Iop_INVALID) {
+ vassert(opFn == NULL);
+ res = newTemp(Ity_V256);
+ assign(res, swapArgs ? binop(op, mkexpr(tSR), mkexpr(tSL))
+ : binop(op, mkexpr(tSL), mkexpr(tSR)));
+ } else {
+ vassert(opFn != NULL);
+ res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
+ }
+
+ putYMMReg(rD, mkexpr(res));
+
+ *uses_vvvv = True;
+ return delta;
+}
+
+
+/* All-lanes AVX256 binary operation:
+ G[255:0] = V[255:0] `op` E[255:0]
+ Thin wrapper: dispatches with no arg inversion and no arg swap.
+*/
+static Long dis_AVX256_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
+ VexAbiInfo* vbi,
+ Prefix pfx, Long delta,
+ HChar* opname, IROp op )
+{
+ return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
+ uses_vvvv, vbi, pfx, delta, opname, op,
+ NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
+ );
+}
+
+
__attribute__((noinline))
static
Long dis_ESC_0F__VEX (
@@ -19810,6 +19899,23 @@
delta += alen;
goto decode_success;
}
+ /* VMOVUPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 10 /r */
+ if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
+ UChar modrm = getUChar(delta);
+ UInt rG = gregOfRexRM(pfx, modrm);
+ if (epartIsReg(modrm)) {
+ UInt rE = eregOfRexRM(pfx,modrm);
+ putYMMRegLoAndZU( rG, getXMMReg( rE ));
+ DIP("vmovupd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
+ delta += 1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("vmovupd %s,%s\n", dis_buf, nameXMMReg(rG));
+ delta += alen;
+ }
+ goto decode_success;
+ }
/* VMOVUPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 10 /r */
if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
UChar modrm = getUChar(delta);
@@ -19827,6 +19933,23 @@
}
goto decode_success;
}
+ /* VMOVUPS xmm2/m128, xmm1 = VEX.128.0F.WIG 10 /r */
+ if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
+ UChar modrm = getUChar(delta);
+ UInt rG = gregOfRexRM(pfx, modrm);
+ if (epartIsReg(modrm)) {
+ UInt rE = eregOfRexRM(pfx,modrm);
+ putYMMRegLoAndZU( rG, getXMMReg( rE ));
+ DIP("vmovups %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
+ delta += 1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("vmovups %s,%s\n", dis_buf, nameXMMReg(rG));
+ delta += alen;
+ }
+ goto decode_success;
+ }
break;
case 0x11:
@@ -19935,7 +20058,9 @@
break;
case 0x14:
+ case 0x15:
/* VUNPCKLPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 14 /r */
+ /* VUNPCKHPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 15 /r */
if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
Bool hi = opc == 0x15;
UChar modrm = getUChar(delta);
@@ -19957,11 +20082,39 @@
DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
dis_buf, nameXMMReg(rG));
}
- IRTemp res = math_UNPCKxPS_128( eV, vV, opc );
+ IRTemp res = math_UNPCKxPS_128( eV, vV, hi );
putYMMRegLoAndZU( rG, mkexpr(res) );
*uses_vvvv = True;
goto decode_success;
}
+ /* VUNPCKLPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 14 /r */
+ /* VUNPCKHPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 15 /r */
+ if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
+ Bool hi = opc == 0x15;
+ UChar modrm = getUChar(delta);
+ UInt rG = gregOfRexRM(pfx,modrm);
+ UInt rV = getVexNvvvv(pfx);
+ IRTemp eV = newTemp(Ity_V128);
+ IRTemp vV = newTemp(Ity_V128);
+ assign( vV, getXMMReg(rV) );
+ if (epartIsReg(modrm)) {
+ UInt rE = eregOfRexRM(pfx,modrm);
+ assign( eV, getXMMReg(rE) );
+ delta += 1;
+ DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
+ nameXMMReg(rE), nameXMMReg(rG));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += alen;
+ DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
+ dis_buf, nameXMMReg(rG));
+ }
+ IRTemp res = math_UNPCKxPD_128( eV, vV, hi );
+ putYMMRegLoAndZU( rG, mkexpr(res) );
+ *uses_vvvv = True;
+ goto decode_success;
+ }
break;
case 0x16:
@@ -20386,6 +20539,24 @@
uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx4 );
goto decode_success;
}
+ /* VADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 58 /r */
+ if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
+ delta = dis_AVX256_E_V_to_G(
+ uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx8 );
+ goto decode_success;
+ }
+ /* VADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 58 /r */
+ if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
+ delta = dis_AVX128_E_V_to_G(
+ uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx2 );
+ goto decode_success;
+ }
+ /* VADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 58 /r */
+ if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
+ delta = dis_AVX256_E_V_to_G(
+ uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx4 );
+ goto decode_success;
+ }
break;
case 0x59:
@@ -20407,6 +20578,24 @@
uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx4 );
goto decode_success;
}
+ /* VMULPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 59 /r */
+ if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
+ delta = dis_AVX256_E_V_to_G(
+ uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx8 );
+ goto decode_success;
+ }
+ /* VMULPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 59 /r */
+ if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
+ delta = dis_AVX128_E_V_to_G(
+ uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx2 );
+ goto decode_success;
+ }
+ /* VMULPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 59 /r */
+ if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
+ delta = dis_AVX256_E_V_to_G(
+ uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx4 );
+ goto decode_success;
+ }
break;
case 0x5A:
@@ -20506,6 +20695,24 @@
uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx4 );
goto decode_success;
}
+ /* VSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5C /r */
+ if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
+ delta = dis_AVX256_E_V_to_G(
+ uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx8 );
+ goto decode_success;
+ }
+ /* VSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5C /r */
+ if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
+ delta = dis_AVX128_E_V_to_G(
+ uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx2 );
+ goto decode_success;
+ }
+ /* VSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5C /r */
+ if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
+ delta = dis_AVX256_E_V_to_G(
+ uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx4 );
+ goto decode_success;
+ }
break;
case 0x5D:
@@ -20542,6 +20749,18 @@
uses_vvvv, vbi, pfx, delta, "vdivss", Iop_Div32F0x4 );
goto decode_success;
}
+ /* VDIVPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5E /r */
+ if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
+ delta = dis_AVX256_E_V_to_G(
+ uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx8 );
+ goto decode_success;
+ }
+ /* VDIVPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5E /r */
+ if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
+ delta = dis_AVX256_E_V_to_G(
+ uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx4 );
+ goto decode_success;
+ }
break;
case 0x5F:
@@ -20827,6 +21046,7 @@
case 0x73:
/* VPSRLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /3 ib */
/* VPSLLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /7 ib */
+ /* VPSRLQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /2 ib */
if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
&& epartIsReg(getUChar(delta))) {
Int rS = eregOfRexRM(pfx,getUChar(delta));
@@ -20850,6 +21070,12 @@
*uses_vvvv = True;
goto decode_success;
}
+ if (gregLO3ofRM(getUChar(delta)) == 2) {
+ delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
+ "vpsrlq", Iop_ShrN64x2 );
+ *uses_vvvv = True;
+ goto decode_success;
+ }
/* else fall through */
}
break;
@@ -21319,6 +21545,16 @@
}
break;
+ case 0x29:
+ /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
+ /* VPCMPEQQ = VEX.NDS.128.66.0F38.WIG 29 /r */
+ if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
+ delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
+ uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x2 );
+ goto decode_success;
+ }
+ break;
+
case 0x30:
/* VPMOVZXBW xmm2/m64, xmm1 */
/* VPMOVZXBW = VEX.128.66.0F38.WIG 30 /r */
@@ -21337,6 +21573,16 @@
}
break;
+ case 0x37:
+ /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
+ /* VPCMPGTQ = VEX.NDS.128.66.0F38.WIG 37 /r */
+ if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
+ delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
+ uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx2 );
+ goto decode_success;
+ }
+ break;
+
case 0x39:
/* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
/* VPMINSD = VEX.NDS.128.66.0F38.WIG 39 /r */
@@ -21407,6 +21653,12 @@
delta = dis_PEXTRD( vbi, pfx, delta, True/*isAvx*/ );
goto decode_success;
}
+ /* VPEXTRQ = VEX.128.66.0F3A.W1 16 /r ib */
+ if (have66noF2noF3(pfx)
+ && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
+ delta = dis_PEXTRQ( vbi, pfx, delta, True/*isAvx*/ );
+ goto decode_success;
+ }
break;
case 0x18:
Modified: trunk/priv/ir_defs.c (+20 -2)
===================================================================
--- trunk/priv/ir_defs.c 2012-06-11 22:54:58 +01:00 (rev 2378)
+++ trunk/priv/ir_defs.c 2012-06-12 09:45:39 +01:00 (rev 2379)
@@ -982,8 +982,16 @@
case Iop_V256to64_2: vex_printf("V256to64_2"); return;
case Iop_V256to64_3: vex_printf("V256to64_3"); return;
case Iop_64x4toV256: vex_printf("64x4toV256"); return;
- case Iop_DPBtoBCD: vex_printf("Iop_DPBtoBCD"); return;
- case Iop_BCDtoDPB: vex_printf("Iop_BCDtoDPB"); return;
+ case Iop_DPBtoBCD: vex_printf("DPBtoBCD"); return;
+ case Iop_BCDtoDPB: vex_printf("BCDtoDPB"); return;
+ case Iop_Add64Fx4: vex_printf("Add64Fx4"); return;
+ case Iop_Sub64Fx4: vex_printf("Sub64Fx4"); return;
+ case Iop_Mul64Fx4: vex_printf("Mul64Fx4"); return;
+ case Iop_Div64Fx4: vex_printf("Div64Fx4"); return;
+ case Iop_Add32Fx8: vex_printf("Add32Fx8"); return;
+ case Iop_Sub32Fx8: vex_printf("Sub32Fx8"); return;
+ case Iop_Mul32Fx8: vex_printf("Mul32Fx8"); return;
+ case Iop_Div32Fx8: vex_printf("Div32Fx8"); return;
default: vpanic("ppIROp(1)");
}
@@ -2783,6 +2791,16 @@
case Iop_64x4toV256:
QUATERNARY(Ity_I64, Ity_I64, Ity_I64, Ity_I64, Ity_V256);
+ case Iop_Add64Fx4:
+ case Iop_Sub64Fx4:
+ case Iop_Mul64Fx4:
+ case Iop_Div64Fx4:
+ case Iop_Add32Fx8:
+ case Iop_Sub32Fx8:
+ case Iop_Mul32Fx8:
+ case Iop_Div32Fx8:
+ BINARY(Ity_V256,Ity_V256, Ity_V256);
+
default:
ppIROp(op);
vpanic("typeOfPrimop");
Modified: trunk/pub/libvex_ir.h (+11 -1)
===================================================================
--- trunk/pub/libvex_ir.h 2012-06-11 22:54:58 +01:00 (rev 2378)
+++ trunk/pub/libvex_ir.h 2012-06-12 09:45:39 +01:00 (rev 2379)
@@ -1428,8 +1428,18 @@
Iop_V256to64_2,
Iop_V256to64_3, // V256 -> I64, extract most significant lane
- Iop_64x4toV256 // (I64,I64,I64,I64)->V256
+ Iop_64x4toV256, // (I64,I64,I64,I64)->V256
// first arg is most significant lane
+
+ /* ------------------ 256-bit SIMD FP. ------------------ */
+ Iop_Add64Fx4,
+ Iop_Sub64Fx4,
+ Iop_Mul64Fx4,
+ Iop_Div64Fx4,
+ Iop_Add32Fx8,
+ Iop_Sub32Fx8,
+ Iop_Mul32Fx8,
+ Iop_Div32Fx8
}
IROp;
Modified: trunk/priv/host_amd64_isel.c (+52 -2)
===================================================================
--- trunk/priv/host_amd64_isel.c 2012-06-11 22:54:58 +01:00 (rev 2378)
+++ trunk/priv/host_amd64_isel.c 2012-06-12 09:45:39 +01:00 (rev 2379)
@@ -3368,7 +3368,7 @@
/*--- ISEL: SIMD (V256) expressions, into 2 XMM regs. --*/
/*---------------------------------------------------------*/
-static void iselDVecExpr ( /*OUT*/HReg* rHi, HReg* rLo,
+static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
ISelEnv* env, IRExpr* e )
{
iselDVecExpr_wrk( rHi, rLo, env, e );
@@ -3383,13 +3383,15 @@
/* DO NOT CALL THIS DIRECTLY */
-static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
+static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
ISelEnv* env, IRExpr* e )
{
vassert(e);
IRType ty = typeOfIRExpr(env->type_env,e);
vassert(ty == Ity_V256);
+ AMD64SseOp op = Asse_INVALID;
+
/* read 256-bit IRTemp */
if (e->tag == Iex_RdTmp) {
lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
@@ -3422,6 +3424,54 @@
return;
}
+ if (e->tag == Iex_Binop) {
+ switch (e->Iex.Binop.op) {
+
+ case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4;
+ case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4;
+ case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4;
+ case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4;
+ do_64Fx4:
+ {
+ HReg argLhi, argLlo, argRhi, argRlo;
+ iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
+ iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
+ HReg dstHi = newVRegV(env);
+ HReg dstLo = newVRegV(env);
+ addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
+ addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
+ addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
+ addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
+ *rHi = dstHi;
+ *rLo = dstLo;
+ return;
+ }
+
+ case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8;
+ case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8;
+ case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8;
+ case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8;
+ do_32Fx8:
+ {
+ HReg argLhi, argLlo, argRhi, argRlo;
+ iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
+ iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
+ HReg dstHi = newVRegV(env);
+ HReg dstLo = newVRegV(env);
+ addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
+ addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
+ addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
+ addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
+ *rHi = dstHi;
+ *rLo = dstLo;
+ return;
+ }
+
+ default:
+ break;
+ } /* switch (e->Iex.Binop.op) */
+ } /* if (e->tag == Iex_Binop) */
+
if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
HReg rsp = hregAMD64_RSP();
HReg vHi = newVRegV(env);
|