|
From: <sv...@va...> - 2012-06-20 11:46:37
|
sewardj 2012-06-20 12:46:19 +0100 (Wed, 20 Jun 2012)
New Revision: 2395
Log:
Implement
VPERMILPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0C /r
VPERMILPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0C /r
VPERMILPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0D /r
VPERMILPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0D /r
Modified files:
trunk/priv/guest_amd64_toIR.c
trunk/priv/host_amd64_isel.c
trunk/priv/host_generic_simd128.c
trunk/priv/host_generic_simd128.h
trunk/priv/ir_defs.c
trunk/pub/libvex_ir.h
Modified: trunk/priv/ir_defs.c (+2 -1)
===================================================================
--- trunk/priv/ir_defs.c 2012-06-20 11:21:05 +01:00 (rev 2394)
+++ trunk/priv/ir_defs.c 2012-06-20 12:46:19 +01:00 (rev 2395)
@@ -925,6 +925,7 @@
case Iop_ExtractV128: vex_printf("ExtractV128"); return;
case Iop_Perm8x16: vex_printf("Perm8x16"); return;
+ case Iop_Perm32x4: vex_printf("Perm32x4"); return;
case Iop_Reverse16_8x16: vex_printf("Reverse16_8x16"); return;
case Iop_Reverse32_8x16: vex_printf("Reverse32_8x16"); return;
case Iop_Reverse32_16x8: vex_printf("Reverse32_16x8"); return;
@@ -2579,7 +2580,7 @@
case Iop_InterleaveOddLanes8x16: case Iop_InterleaveEvenLanes8x16:
case Iop_InterleaveOddLanes16x8: case Iop_InterleaveEvenLanes16x8:
case Iop_InterleaveOddLanes32x4: case Iop_InterleaveEvenLanes32x4:
- case Iop_Perm8x16:
+ case Iop_Perm8x16: case Iop_Perm32x4:
case Iop_Recps32Fx4:
case Iop_Rsqrts32Fx4:
BINARY(Ity_V128,Ity_V128, Ity_V128);
Modified: trunk/priv/host_amd64_isel.c (+2 -0)
===================================================================
--- trunk/priv/host_amd64_isel.c 2012-06-20 11:21:05 +01:00 (rev 2394)
+++ trunk/priv/host_amd64_isel.c 2012-06-20 12:46:19 +01:00 (rev 2395)
@@ -3243,6 +3243,8 @@
goto do_SseAssistedBinary;
case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
goto do_SseAssistedBinary;
+ case Iop_Perm32x4: fn = (HWord)h_generic_calc_Perm32x4;
+ goto do_SseAssistedBinary;
case Iop_QNarrowBin32Sto16Ux8:
fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
goto do_SseAssistedBinary;
Modified: trunk/pub/libvex_ir.h (+1 -0)
===================================================================
--- trunk/pub/libvex_ir.h 2012-06-20 11:21:05 +01:00 (rev 2394)
+++ trunk/pub/libvex_ir.h 2012-06-20 12:46:19 +01:00 (rev 2395)
@@ -1415,6 +1415,7 @@
argR[i] values may only be in the range 0 .. 15, else behaviour
is undefined. */
Iop_Perm8x16,
+ Iop_Perm32x4, /* ditto, except argR values are restricted to 0 .. 3 */
/* Vector Reciprocal Estimate and Vector Reciprocal Square Root Estimate
See floating-point equiwalents for details. */
Modified: trunk/priv/host_generic_simd128.h (+3 -0)
===================================================================
--- trunk/priv/host_generic_simd128.h 2012-06-20 11:21:05 +01:00 (rev 2394)
+++ trunk/priv/host_generic_simd128.h 2012-06-20 12:46:19 +01:00 (rev 2395)
@@ -83,6 +83,9 @@
void h_generic_calc_NarrowBin32to16x8
( /*OUT*/V128*, V128*, V128* );
+extern VEX_REGPARM(3) /* generic helper for the 4 x 32-bit lane permute */
+ void h_generic_calc_Perm32x4 ( /*OUT*/V128*, V128*, V128* ); /* args: result, data lanes, per-lane selectors */
+
#endif /* ndef __VEX_HOST_GENERIC_SIMD128_H */
/*---------------------------------------------------------------*/
Modified: trunk/priv/host_generic_simd128.c (+10 -0)
===================================================================
--- trunk/priv/host_generic_simd128.c 2012-06-20 11:21:05 +01:00 (rev 2394)
+++ trunk/priv/host_generic_simd128.c 2012-06-20 12:46:19 +01:00 (rev 2395)
@@ -358,7 +358,17 @@
res->w16[7] = narrow32to16(argL->w32[3]);
}
+void VEX_REGPARM(3) /* 4 x 32-bit lane permute: res lane i = the argL lane selected by argR lane i */
+ h_generic_calc_Perm32x4 ( /*OUT*/V128* res,
+ V128* argL, V128* argR ) /* argL supplies the data lanes, argR the per-lane selectors */
+{
+ res->w32[0] = argL->w32[ argR->w32[0] & 3 ]; /* the "& 3" keeps every index inside w32[0..3] */
+ res->w32[1] = argL->w32[ argR->w32[1] & 3 ];
+ res->w32[2] = argL->w32[ argR->w32[2] & 3 ];
+ res->w32[3] = argL->w32[ argR->w32[3] & 3 ];
+}
+
/*---------------------------------------------------------------*/
/*--- end host_generic_simd128.c ---*/
/*---------------------------------------------------------------*/
Modified: trunk/priv/guest_amd64_toIR.c (+188 -0)
===================================================================
--- trunk/priv/guest_amd64_toIR.c 2012-06-20 11:21:05 +01:00 (rev 2394)
+++ trunk/priv/guest_amd64_toIR.c 2012-06-20 12:46:19 +01:00 (rev 2395)
@@ -8986,6 +8986,20 @@
assign(*t0, unop(Iop_V256toV128_0, mkexpr(t256)));
}
+/* Break a V128-bit value up into two 64-bit ints: *t1 receives the high half, *t0 the low half. */

+static void breakupV128to64s ( IRTemp t128,
+ /*OUTs*/
+ IRTemp* t1, IRTemp* t0 )
+{
+ vassert(t0 && *t0 == IRTemp_INVALID); /* each output must be non-NULL and not yet bound to a temp */
+ vassert(t1 && *t1 == IRTemp_INVALID);
+ *t0 = newTemp(Ity_I64); /* fresh I64 temps are created here and handed back to the caller */
+ *t1 = newTemp(Ity_I64);
+ assign( *t0, unop(Iop_V128to64, mkexpr(t128)) ); /* low 64 bits */
+ assign( *t1, unop(Iop_V128HIto64, mkexpr(t128)) ); /* high 64 bits */
+}
+
/* Helper for the SSSE3 (not SSE3) PMULHRSW insns. Given two 64-bit
values (aa,bb), computes, for each of the 4 16-bit lanes:
@@ -23015,6 +23029,66 @@
/*--- ---*/
/*------------------------------------------------------------*/
+static IRTemp math_PERMILPS_VAR_128 ( IRTemp dataV, IRTemp ctrlV ) /* variable-control VPERMILPS on one V128; returns a new V128 temp */
+{
+ /* In the control vector, zero out all but the bottom two bits of
+ each 32-bit lane. */
+ IRExpr* cv1 = binop(Iop_ShrN32x4,
+ binop(Iop_ShlN32x4, mkexpr(ctrlV), mkU8(30)), /* shifting left by 30 discards bits 31..2 of each lane */
+ mkU8(30)); /* shifting back right by 30 leaves a selector in 0 .. 3 */
+ /* And use the resulting cleaned-up control vector as steering
+ in a Perm operation. */
+ IRTemp res = newTemp(Ity_V128);
+ assign(res, binop(Iop_Perm32x4, mkexpr(dataV), cv1)); /* dataV is the permuted data, cv1 the steering vector */
+ return res;
+}
+
+static IRTemp math_PERMILPS_VAR_256 ( IRTemp dataV, IRTemp ctrlV ) /* 256-bit variable VPERMILPS, built from two 128-bit permutes */
+{
+ IRTemp dHi, dLo, cHi, cLo;
+ dHi = dLo = cHi = cLo = IRTemp_INVALID; /* the breakup helper is handed temps that are still IRTemp_INVALID */
+ breakupV256toV128s( dataV, &dHi, &dLo );
+ breakupV256toV128s( ctrlV, &cHi, &cLo );
+ IRTemp rHi = math_PERMILPS_VAR_128( dHi, cHi ); /* each half is steered only by its own control half */
+ IRTemp rLo = math_PERMILPS_VAR_128( dLo, cLo );
+ IRTemp res = newTemp(Ity_V256);
+ assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo))); /* glue the two permuted halves back together */
+ return res;
+}
+
+static IRTemp math_PERMILPD_VAR_128 ( IRTemp dataV, IRTemp ctrlV ) /* variable-control VPERMILPD on one V128; returns a new V128 temp */
+{
+ /* No cleverness here .. each 64-bit result half is chosen by bit 1 of the matching control half. */
+ IRTemp dHi, dLo, cHi, cLo;
+ dHi = dLo = cHi = cLo = IRTemp_INVALID;
+ breakupV128to64s( dataV, &dHi, &dLo );
+ breakupV128to64s( ctrlV, &cHi, &cLo );
+ IRExpr* rHi
+ = IRExpr_Mux0X( unop(Iop_64to8,
+ binop(Iop_And64, mkexpr(cHi), mkU64(2))), /* isolate bit 1 of the high control word */
+ mkexpr(dLo), mkexpr(dHi) ); /* presumably dLo when the condition is zero, else dHi -- confirm against IRExpr_Mux0X */
+ IRExpr* rLo
+ = IRExpr_Mux0X( unop(Iop_64to8,
+ binop(Iop_And64, mkexpr(cLo), mkU64(2))), /* isolate bit 1 of the low control word */
+ mkexpr(dLo), mkexpr(dHi) ); /* same two candidates as for rHi: either data half may land in either result half */
+ IRTemp res = newTemp(Ity_V128);
+ assign(res, binop(Iop_64HLtoV128, rHi, rLo));
+ return res;
+}
+
+static IRTemp math_PERMILPD_VAR_256 ( IRTemp dataV, IRTemp ctrlV ) /* 256-bit variable VPERMILPD, built from two 128-bit permutes */
+{
+ IRTemp dHi, dLo, cHi, cLo;
+ dHi = dLo = cHi = cLo = IRTemp_INVALID; /* the breakup helper is handed temps that are still IRTemp_INVALID */
+ breakupV256toV128s( dataV, &dHi, &dLo );
+ breakupV256toV128s( ctrlV, &cHi, &cLo );
+ IRTemp rHi = math_PERMILPD_VAR_128( dHi, cHi ); /* each half is steered only by its own control half */
+ IRTemp rLo = math_PERMILPD_VAR_128( dLo, cLo );
+ IRTemp res = newTemp(Ity_V256);
+ assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo))); /* glue the two permuted halves back together */
+ return res;
+}
+
__attribute__((noinline))
static
Long dis_ESC_0F38__VEX (
@@ -23048,6 +23122,120 @@
}
break;
+ case 0x0C:
+ /* VPERMILPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0C /r */
+ if (have66noF2noF3(pfx)
+ && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
+ UChar modrm = getUChar(delta);
+ UInt rG = gregOfRexRM(pfx, modrm);
+ UInt rV = getVexNvvvv(pfx);
+ IRTemp ctrlV = newTemp(Ity_V128);
+ if (epartIsReg(modrm)) {
+ UInt rE = eregOfRexRM(pfx, modrm);
+ delta += 1;
+ DIP("vpermilps %s,%s,%s\n",
+ nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
+ assign(ctrlV, getXMMReg(rE));
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
+ delta += alen;
+ DIP("vpermilps %s,%s,%s\n",
+ dis_buf, nameXMMReg(rV), nameXMMReg(rG));
+ assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
+ }
+ IRTemp dataV = newTemp(Ity_V128);
+ assign(dataV, getXMMReg(rV));
+ IRTemp resV = math_PERMILPS_VAR_128(dataV, ctrlV);
+ putYMMRegLoAndZU(rG, mkexpr(resV));
+ *uses_vvvv = True;
+ goto decode_success;
+ }
+ /* VPERMILPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0C /r */
+ if (have66noF2noF3(pfx)
+ && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
+ UChar modrm = getUChar(delta);
+ UInt rG = gregOfRexRM(pfx, modrm);
+ UInt rV = getVexNvvvv(pfx);
+ IRTemp ctrlV = newTemp(Ity_V256);
+ if (epartIsReg(modrm)) {
+ UInt rE = eregOfRexRM(pfx, modrm);
+ delta += 1;
+ DIP("vpermilps %s,%s,%s\n",
+ nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
+ assign(ctrlV, getYMMReg(rE));
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
+ delta += alen;
+ DIP("vpermilps %s,%s,%s\n",
+ dis_buf, nameYMMReg(rV), nameYMMReg(rG));
+ assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
+ }
+ IRTemp dataV = newTemp(Ity_V256);
+ assign(dataV, getYMMReg(rV));
+ IRTemp resV = math_PERMILPS_VAR_256(dataV, ctrlV);
+ putYMMReg(rG, mkexpr(resV));
+ *uses_vvvv = True;
+ goto decode_success;
+ }
+ break;
+
+ case 0x0D:
+ /* VPERMILPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0D /r */
+ if (have66noF2noF3(pfx)
+ && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
+ UChar modrm = getUChar(delta);
+ UInt rG = gregOfRexRM(pfx, modrm);
+ UInt rV = getVexNvvvv(pfx);
+ IRTemp ctrlV = newTemp(Ity_V128);
+ if (epartIsReg(modrm)) {
+ UInt rE = eregOfRexRM(pfx, modrm);
+ delta += 1;
+ DIP("vpermilpd %s,%s,%s\n",
+ nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
+ assign(ctrlV, getXMMReg(rE));
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
+ delta += alen;
+ DIP("vpermilpd %s,%s,%s\n",
+ dis_buf, nameXMMReg(rV), nameXMMReg(rG));
+ assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
+ }
+ IRTemp dataV = newTemp(Ity_V128);
+ assign(dataV, getXMMReg(rV));
+ IRTemp resV = math_PERMILPD_VAR_128(dataV, ctrlV);
+ putYMMRegLoAndZU(rG, mkexpr(resV));
+ *uses_vvvv = True;
+ goto decode_success;
+ }
+ /* VPERMILPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0D /r */
+ if (have66noF2noF3(pfx)
+ && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
+ UChar modrm = getUChar(delta);
+ UInt rG = gregOfRexRM(pfx, modrm);
+ UInt rV = getVexNvvvv(pfx);
+ IRTemp ctrlV = newTemp(Ity_V256);
+ if (epartIsReg(modrm)) {
+ UInt rE = eregOfRexRM(pfx, modrm);
+ delta += 1;
+ DIP("vpermilpd %s,%s,%s\n",
+ nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
+ assign(ctrlV, getYMMReg(rE));
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
+ delta += alen;
+ DIP("vpermilpd %s,%s,%s\n",
+ dis_buf, nameYMMReg(rV), nameYMMReg(rG));
+ assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
+ }
+ IRTemp dataV = newTemp(Ity_V256);
+ assign(dataV, getYMMReg(rV));
+ IRTemp resV = math_PERMILPD_VAR_256(dataV, ctrlV);
+ putYMMReg(rG, mkexpr(resV));
+ *uses_vvvv = True;
+ goto decode_success;
+ }
+ break;
+
case 0x18:
/* VBROADCASTSS m32, xmm1 = VEX.128.66.0F38.WIG 18 /r */
if (have66noF2noF3(pfx)
|