|
From: <sv...@va...> - 2008-02-06 11:42:47
|
Author: sewardj
Date: 2008-02-06 11:42:45 +0000 (Wed, 06 Feb 2008)
New Revision: 1808
Log:
Add SSSE3 support. Currently only for 64-bit. TODO:
* Check through IR generation
* For 128-bit variants accessing memory, generate an exception
if effective address is not 128-bit aligned
* Change CPUID output to be Core-2 like
* Enable for 32-bit code too.
* Make Memcheck handle the new IROps
* Commit test cases
Modified:
trunk/priv/guest-amd64/toIR.c
trunk/priv/host-amd64/isel.c
trunk/priv/host-generic/h_generic_simd64.c
trunk/priv/host-generic/h_generic_simd64.h
trunk/priv/ir/irdefs.c
trunk/pub/libvex_ir.h
Modified: trunk/priv/guest-amd64/toIR.c
===================================================================
--- trunk/priv/guest-amd64/toIR.c 2008-01-04 01:22:41 UTC (rev 1807)
+++ trunk/priv/guest-amd64/toIR.c 2008-02-06 11:42:45 UTC (rev 1808)
@@ -8309,6 +8309,165 @@
}
+/* Helper for the SSSE3 (not SSE3) PMULHRSW insns. Given two 64-bit
+ values (aa,bb), computes, for each of the 4 16-bit lanes:
+
+ (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
+
+ The four lanes are handled as two groups (the "hi" and "lo"
+ temporaries below). Each group is widened to a 32x2 value, the
+ multiply/shift/round sequence is done on 32-bit lanes, and the two
+ 32x2 results rHi and rLo are merged back into a single 64-bit
+ value with Iop_CatEvenLanes16x4.
+
+ NOTE(review): the widening interleaves a value with itself and
+ then shifts each 32-bit lane right arithmetically by 16, which
+ looks like a sign extension of each 16-bit lane -- confirm against
+ the definitions of Iop_InterleaveHI16x4 / Iop_InterleaveLO16x4.
+
+ NOTE(review): presumably the unsigned (>>u) shifts are adequate
+ even though the product is signed, because the final
+ Iop_CatEvenLanes16x4 keeps only the even-numbered 16-bit lanes of
+ rHi and rLo -- confirm.
+*/
+static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
+{
+ IRTemp aa = newTemp(Ity_I64);
+ IRTemp bb = newTemp(Ity_I64);
+ IRTemp aahi32s = newTemp(Ity_I64);
+ IRTemp aalo32s = newTemp(Ity_I64);
+ IRTemp bbhi32s = newTemp(Ity_I64);
+ IRTemp bblo32s = newTemp(Ity_I64);
+ IRTemp rHi = newTemp(Ity_I64);
+ IRTemp rLo = newTemp(Ity_I64);
+ IRTemp one32x2 = newTemp(Ity_I64);
+ assign(aa, aax); /* bind the operands to temps: each is used twice below */
+ assign(bb, bbx);
+ assign( aahi32s, /* aa, "hi" group, widened to 32x2 */
+ binop(Iop_SarN32x2,
+ binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
+ mkU8(16) ));
+ assign( aalo32s, /* aa, "lo" group, widened to 32x2 */
+ binop(Iop_SarN32x2,
+ binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
+ mkU8(16) ));
+ assign( bbhi32s, /* bb, "hi" group, widened to 32x2 */
+ binop(Iop_SarN32x2,
+ binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
+ mkU8(16) ));
+ assign( bblo32s, /* bb, "lo" group, widened to 32x2 */
+ binop(Iop_SarN32x2,
+ binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
+ mkU8(16) ));
+ assign(one32x2, mkU64( (1ULL << 32) + 1 )); /* the value 1 in each 32-bit lane */
+ assign(
+ rHi, /* ((aahi * bbhi) >>u 14) + 1) >>u 1, per 32-bit lane */
+ binop(
+ Iop_ShrN32x2,
+ binop(
+ Iop_Add32x2,
+ binop(
+ Iop_ShrN32x2,
+ binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
+ mkU8(14)
+ ),
+ mkexpr(one32x2)
+ ),
+ mkU8(1)
+ )
+ );
+ assign(
+ rLo, /* the same computation for the "lo" group */
+ binop(
+ Iop_ShrN32x2,
+ binop(
+ Iop_Add32x2,
+ binop(
+ Iop_ShrN32x2,
+ binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
+ mkU8(14)
+ ),
+ mkexpr(one32x2)
+ ),
+ mkU8(1)
+ )
+ );
+ return
+ binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
+}
+
+/* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns. Given two 64-bit
+ values (aa,bb), computes, for each lane:
+
+ if aa_lane < 0 then - bb_lane
+ else if aa_lane > 0 then bb_lane
+ else 0
+
+ laneszB is the lane size in bytes and selects the 8x8, 16x4 or
+ 32x2 forms of the subtract and signed-greater-than ops; any value
+ other than 1, 2 or 4 hits vassert(0).
+
+ The result is (bb & posMask) | (-bb & negMask), where posMask and
+ negMask come from signed compares of aa against zero. A lane in
+ which aa is zero satisfies neither compare.
+
+ NOTE(review): this assumes the CmpGT ops yield all-ones in lanes
+ where the comparison holds and all-zeroes elsewhere, so that
+ zero lanes of aa produce a zero result -- confirm.
+*/
+static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
+{
+ IRTemp aa = newTemp(Ity_I64);
+ IRTemp bb = newTemp(Ity_I64);
+ IRTemp zero = newTemp(Ity_I64);
+ IRTemp bbNeg = newTemp(Ity_I64);
+ IRTemp negMask = newTemp(Ity_I64);
+ IRTemp posMask = newTemp(Ity_I64);
+ IROp opSub = Iop_INVALID;
+ IROp opCmpGTS = Iop_INVALID;
+
+ switch (laneszB) {
+ case 1: opSub = Iop_Sub8x8; opCmpGTS = Iop_CmpGT8Sx8; break;
+ case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
+ case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
+ default: vassert(0);
+ }
+
+ assign( aa, aax );
+ assign( bb, bbx );
+ assign( zero, mkU64(0) );
+ assign( bbNeg, binop(opSub, mkexpr(zero), mkexpr(bb)) ); /* 0 - bb, per lane */
+ assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) ); /* 0 > aa, per lane */
+ assign( posMask, binop(opCmpGTS, mkexpr(aa), mkexpr(zero)) ); /* aa > 0, per lane */
+
+ return
+ binop(Iop_Or64,
+ binop(Iop_And64, mkexpr(bb), mkexpr(posMask)),
+ binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
+
+}
+
+/* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns. Given a 64-bit
+ value aa, computes, for each lane
+
+ if aa < 0 then -aa else aa
+
+ Note that the result is interpreted as unsigned, so that the
+ absolute value of the most negative signed input can be
+ represented.
+
+ laneszB is the lane size in bytes and selects the 8x8, 16x4 or
+ 32x2 forms of the subtract and shift ops; any value other than 1,
+ 2 or 4 hits vassert(0).
+
+ The result is (aa & posMask) | (-aa & negMask), where negMask is
+ aa shifted right by (lane width in bits - 1) and posMask is its
+ bitwise complement.
+
+ NOTE(review): this relies on the SarN ops being arithmetic shifts,
+ so that negMask is all-ones in negative lanes and all-zeroes in
+ the others -- confirm.
+*/
+static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
+{
+ IRTemp aa = newTemp(Ity_I64);
+ IRTemp zero = newTemp(Ity_I64);
+ IRTemp aaNeg = newTemp(Ity_I64);
+ IRTemp negMask = newTemp(Ity_I64);
+ IRTemp posMask = newTemp(Ity_I64);
+ IROp opSub = Iop_INVALID;
+ IROp opSarN = Iop_INVALID;
+
+ switch (laneszB) {
+ case 1: opSub = Iop_Sub8x8; opSarN = Iop_SarN8x8; break;
+ case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
+ case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
+ default: vassert(0);
+ }
+
+ assign( aa, aax );
+ assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) ); /* shift by lane width - 1 */
+ assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
+ assign( zero, mkU64(0) );
+ assign( aaNeg, binop(opSub, mkexpr(zero), mkexpr(aa)) ); /* 0 - aa, per lane */
+ return
+ binop(Iop_Or64,
+ binop(Iop_And64, mkexpr(aa), mkexpr(posMask)),
+ binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
+}
+/* Helper for the XMM form of PALIGNR. Given two 64-bit temporaries
+ and a byte count in the range 1 .. 7 (asserted), returns
+
+ (hi64 << 8*(8-byteShift)) | (lo64 >> 8*byteShift)
+
+ that is, 64 bits taken from the concatenation hi64:lo64 starting
+ byteShift bytes above its least significant end. Shifts by 0 or
+ by 8 bytes are not handled here; callers must deal with those
+ cases themselves.
+*/
+static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
+ IRTemp lo64, Long byteShift )
+{
+ vassert(byteShift >= 1 && byteShift <= 7);
+ return
+ binop(Iop_Or64,
+ binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
+ binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
+ );
+}
+
+
/* Helper for deciding whether a given insn (starting at the opcode
byte) may validly be used with a LOCK prefix. The following insns
may be used with LOCK when their destination operand is in memory.
@@ -12455,6 +12614,830 @@
/* --- end of the SSE3 decoder. --- */
/* ---------------------------------------------------- */
+ /* ---------------------------------------------------- */
+ /* --- start of the SSSE3 decoder. --- */
+ /* ---------------------------------------------------- */
+
+ /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
+ Unsigned Bytes (MMX) */
+ if (haveNo66noF2noF3(pfx)
+ && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+ IRTemp sVoddsSX = newTemp(Ity_I64);
+ IRTemp sVevensSX = newTemp(Ity_I64);
+ IRTemp dVoddsZX = newTemp(Ity_I64);
+ IRTemp dVevensZX = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+ delta += 3+1;
+ DIP("pmaddubsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pmaddubsw %s,%s\n", dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ /* compute dV unsigned x sV signed */
+ assign( sVoddsSX,
+ binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
+ assign( sVevensSX,
+ binop(Iop_SarN16x4,
+ binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
+ mkU8(8)) );
+ assign( dVoddsZX,
+ binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
+ assign( dVevensZX,
+ binop(Iop_ShrN16x4,
+ binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
+ mkU8(8)) );
+
+ putMMXReg(
+ gregLO3ofRM(modrm),
+ binop(Iop_QAdd16Sx4,
+ binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
+ binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
+ Unsigned Bytes (XMM) */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /*redundant REX.W*/ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sVoddsSX = newTemp(Ity_V128);
+ IRTemp sVevensSX = newTemp(Ity_V128);
+ IRTemp dVoddsZX = newTemp(Ity_V128);
+ IRTemp dVevensZX = newTemp(Ity_V128);
+
+ modrm = insn[3];
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 3+1;
+ DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+ /* FIXME: generate trap if addr is not 16-aligned */
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pmaddubsw %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ /* compute dV unsigned x sV signed */
+ assign( sVoddsSX,
+ binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
+ assign( sVevensSX,
+ binop(Iop_SarN16x8,
+ binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
+ mkU8(8)) );
+ assign( dVoddsZX,
+ binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
+ assign( dVevensZX,
+ binop(Iop_ShrN16x8,
+ binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
+ mkU8(8)) );
+
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ binop(Iop_QAdd16Sx8,
+ binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
+ binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
+ )
+ );
+ goto decode_success;
+ }
+
+ /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
+ /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
+ mmx) and G to G (mmx). */
+ /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
+ mmx) and G to G (mmx). */
+ /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
+ to G (mmx). */
+ /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
+ to G (mmx). */
+ /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
+ to G (mmx). */
+ /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
+ to G (mmx). */
+
+ if (haveNo66noF2noF3(pfx)
+ && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
+ || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
+ HChar* str = "???";
+ IROp opV64 = Iop_INVALID;
+ IROp opCatO = Iop_CatOddLanes16x4;
+ IROp opCatE = Iop_CatEvenLanes16x4;
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+
+ modrm = insn[3];
+
+ switch (insn[2]) {
+ case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
+ case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
+ case 0x01: opV64 = Iop_Add16x4; str = "addw"; break;
+ case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break;
+ case 0x02: opV64 = Iop_Add32x2; str = "addd"; break;
+ case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break;
+ default: vassert(0);
+ }
+ if (insn[2] == 0x02 || insn[2] == 0x06) {
+ opCatO = Iop_InterleaveHI32x2;
+ opCatE = Iop_InterleaveLO32x2;
+ }
+
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+ delta += 3+1;
+ DIP("ph%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("ph%s %s,%s\n", str, dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ putMMXReg(
+ gregLO3ofRM(modrm),
+ binop(opV64,
+ binop(opCatE,mkexpr(sV),mkexpr(dV)),
+ binop(opCatO,mkexpr(sV),mkexpr(dV))
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
+ xmm) and G to G (xmm). */
+ /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
+ xmm) and G to G (xmm). */
+ /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
+ G to G (xmm). */
+ /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
+ G to G (xmm). */
+ /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
+ G to G (xmm). */
+ /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
+ G to G (xmm). */
+
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /*redundant REX.W*/ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
+ || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
+ HChar* str = "???";
+ IROp opV64 = Iop_INVALID;
+ IROp opCatO = Iop_CatOddLanes16x4;
+ IROp opCatE = Iop_CatEvenLanes16x4;
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+
+ modrm = insn[3];
+
+ switch (insn[2]) {
+ case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
+ case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
+ case 0x01: opV64 = Iop_Add16x4; str = "addw"; break;
+ case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break;
+ case 0x02: opV64 = Iop_Add32x2; str = "addd"; break;
+ case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break;
+ default: vassert(0);
+ }
+ if (insn[2] == 0x02 || insn[2] == 0x06) {
+ opCatO = Iop_InterleaveHI32x2;
+ opCatE = Iop_InterleaveLO32x2;
+ }
+
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) );
+ DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 3+1;
+ } else {
+ addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+ /* FIXME: generate trap if addr is not 16-aligned */
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("ph%s %s,%s\n", str, dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 3+alen;
+ }
+
+ assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+ /* This isn't a particularly efficient way to compute the
+ result, but at least it avoids a proliferation of IROps,
+ hence avoids complicating all the backends. */
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ binop(Iop_64HLtoV128,
+ binop(opV64,
+ binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
+ binop(opCatO,mkexpr(sHi),mkexpr(sLo))
+ ),
+ binop(opV64,
+ binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
+ binop(opCatO,mkexpr(dHi),mkexpr(dLo))
+ )
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
+ (MMX) */
+ if (haveNo66noF2noF3(pfx)
+ && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+ delta += 3+1;
+ DIP("pmulhrsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pmulhrsw %s,%s\n", dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ putMMXReg(
+ gregLO3ofRM(modrm),
+ dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
+ Scale (XMM) */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /*redundant REX.W*/ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 3+1;
+ DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+ /* FIXME: generate trap if addr is not 16-aligned */
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pmulhrsw %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ binop(Iop_64HLtoV128,
+ dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
+ dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 0F 38 08 = PSIGNB -- Packed Sign 8x8 (MMX) */
+ /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
+ /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
+ if (haveNo66noF2noF3(pfx)
+ && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+ HChar* str = "???";
+ Int laneszB = 0;
+
+ switch (insn[2]) {
+ case 0x08: laneszB = 1; str = "b"; break;
+ case 0x09: laneszB = 2; str = "w"; break;
+ case 0x0A: laneszB = 4; str = "d"; break;
+ default: vassert(0);
+ }
+
+ modrm = insn[3];
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+ delta += 3+1;
+ DIP("psign%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("psign%s %s,%s\n", str, dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ putMMXReg(
+ gregLO3ofRM(modrm),
+ dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
+ /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
+ /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /*redundant REX.W*/ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+ HChar* str = "???";
+ Int laneszB = 0;
+
+ switch (insn[2]) {
+ case 0x08: laneszB = 1; str = "b"; break;
+ case 0x09: laneszB = 2; str = "w"; break;
+ case 0x0A: laneszB = 4; str = "d"; break;
+ default: vassert(0);
+ }
+
+ modrm = insn[3];
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 3+1;
+ DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+ /* FIXME: generate trap if addr is not 16-aligned */
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("psign%s %s,%s\n", str, dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ binop(Iop_64HLtoV128,
+ dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
+ dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8 (MMX) */
+ /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
+ /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
+ if (haveNo66noF2noF3(pfx)
+ && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
+ IRTemp sV = newTemp(Ity_I64);
+ HChar* str = "???";
+ Int laneszB = 0;
+
+ switch (insn[2]) {
+ case 0x1C: laneszB = 1; str = "b"; break;
+ case 0x1D: laneszB = 2; str = "w"; break;
+ case 0x1E: laneszB = 4; str = "d"; break;
+ default: vassert(0);
+ }
+
+ modrm = insn[3];
+ do_MMX_preamble();
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+ delta += 3+1;
+ DIP("pabs%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pabs%s %s,%s\n", str, dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ putMMXReg(
+ gregLO3ofRM(modrm),
+ dis_PABS_helper( mkexpr(sV), laneszB )
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
+ /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
+ /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /*redundant REX.W*/ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ HChar* str = "???";
+ Int laneszB = 0;
+
+ switch (insn[2]) {
+ case 0x1C: laneszB = 1; str = "b"; break;
+ case 0x1D: laneszB = 2; str = "w"; break;
+ case 0x1E: laneszB = 4; str = "d"; break;
+ default: vassert(0);
+ }
+
+ modrm = insn[3];
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 3+1;
+ DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+ /* FIXME: generate trap if addr is not 16-aligned */
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pabs%s %s,%s\n", str, dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ binop(Iop_64HLtoV128,
+ dis_PABS_helper( mkexpr(sHi), laneszB ),
+ dis_PABS_helper( mkexpr(sLo), laneszB )
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
+ /* Treats dV:sV (G register as the upper 8 bytes, E operand as the
+ lower 8 bytes) as one 16-byte value, shifts it right by imm8
+ bytes and writes the low 8 bytes of the result to G. Since the
+ shift count is known at decode time, each range of counts gets
+ its own, simpler, IR. */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+ IRTemp res = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+ /* imm8 is the byte directly after the modrm byte */
+ d64 = (Long)insn[3+1];
+ delta += 3+1+1;
+ DIP("palignr $%d,%s,%s\n", (Int)d64,
+ nameMMXReg(eregLO3ofRM(modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ /* imm8 is the byte directly after the alen bytes of the amode */
+ d64 = (Long)insn[3+alen];
+ delta += 3+alen+1;
+ /* Same "$imm,src,dst" layout as the register form above. */
+ DIP("palignr $%d,%s,%s\n", (Int)d64,
+ dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ if (d64 == 0) {
+ /* no shift: the result is the E operand unchanged */
+ assign( res, mkexpr(sV) );
+ }
+ else if (d64 >= 1 && d64 <= 7) {
+ /* upper bytes of sV, topped up with the low bytes of dV */
+ assign(res,
+ binop(Iop_Or64,
+ binop(Iop_Shr64, mkexpr(sV), mkU8(8*d64)),
+ binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d64))
+ )));
+ }
+ else if (d64 == 8) {
+ /* sV is shifted out completely: the result is dV unchanged */
+ assign( res, mkexpr(dV) );
+ }
+ else if (d64 >= 9 && d64 <= 15) {
+ /* only the upper bytes of dV remain, zero-filled from above */
+ assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d64-8))) );
+ }
+ else if (d64 >= 16 && d64 <= 255) {
+ /* everything is shifted out */
+ assign( res, mkU64(0) );
+ }
+ else
+ vassert(0);
+
+ putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
+ goto decode_success;
+ }
+
+ /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /*redundant REX.W*/ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+ IRTemp rHi = newTemp(Ity_I64);
+ IRTemp rLo = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ d64 = (Long)insn[3+1];
+ delta += 3+1+1;
+ DIP("palignr $%d,%s,%s\n", (Int)d64,
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+ /* FIXME: generate trap if addr is not 16-aligned */
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ d64 = (Long)insn[3+alen];
+ delta += 3+alen+1;
+ DIP("palignr $%d,%s,%s\n", (Int)d64,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+ if (d64 == 0) {
+ assign( rHi, mkexpr(sHi) );
+ assign( rLo, mkexpr(sLo) );
+ }
+ else if (d64 >= 1 && d64 <= 7) {
+ assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d64) );
+ assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d64) );
+ }
+ else if (d64 == 8) {
+ assign( rHi, mkexpr(dLo) );
+ assign( rLo, mkexpr(sHi) );
+ }
+ else if (d64 >= 9 && d64 <= 15) {
+ assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d64-8) );
+ assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d64-8) );
+ }
+ else if (d64 == 16) {
+ assign( rHi, mkexpr(dHi) );
+ assign( rLo, mkexpr(dLo) );
+ }
+ else if (d64 >= 17 && d64 <= 23) {
+ assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d64-16))) );
+ assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d64-16) );
+ }
+ else if (d64 == 24) {
+ assign( rHi, mkU64(0) );
+ assign( rLo, mkexpr(dHi) );
+ }
+ else if (d64 >= 25 && d64 <= 31) {
+ assign( rHi, mkU64(0) );
+ assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d64-24))) );
+ }
+ else if (d64 >= 32 && d64 <= 255) {
+ assign( rHi, mkU64(0) );
+ assign( rLo, mkU64(0) );
+ }
+ else
+ vassert(0);
+
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
+ );
+ goto decode_success;
+ }
+
+ /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
+ if (haveNo66noF2noF3(pfx)
+ && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+ delta += 3+1;
+ DIP("pshufb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pshufb %s,%s\n", dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ putMMXReg(
+ gregLO3ofRM(modrm),
+ binop(
+ Iop_And64,
+ /* permute the lanes */
+ binop(
+ Iop_Perm8x8,
+ mkexpr(dV),
+ binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
+ ),
+ /* mask off lanes which have (index & 0x80) == 0x80 */
+ unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /*redundant REX.W*/ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+ IRTemp rHi = newTemp(Ity_I64);
+ IRTemp rLo = newTemp(Ity_I64);
+ IRTemp sevens = newTemp(Ity_I64);
+ IRTemp mask0x80hi = newTemp(Ity_I64);
+ IRTemp mask0x80lo = newTemp(Ity_I64);
+ IRTemp maskBit3hi = newTemp(Ity_I64);
+ IRTemp maskBit3lo = newTemp(Ity_I64);
+ IRTemp sAnd7hi = newTemp(Ity_I64);
+ IRTemp sAnd7lo = newTemp(Ity_I64);
+ IRTemp permdHi = newTemp(Ity_I64);
+ IRTemp permdLo = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 3+1;
+ DIP("pshufb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+ /* FIXME: generate trap if addr is not 16-aligned */
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pshufb %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+ assign( sevens, mkU64(0x0707070707070707ULL) );
+
+ /*
+ mask0x80hi = Not(SarN8x8(sHi,7))
+ maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
+ sAnd7hi = And(sHi,sevens)
+ permdHi = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
+ And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
+ rHi = And(permdHi,mask0x80hi)
+ */
+ assign(
+ mask0x80hi,
+ unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
+
+ assign(
+ maskBit3hi,
+ binop(Iop_SarN8x8,
+ binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
+ mkU8(7)));
+
+ assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
+
+ assign(
+ permdHi,
+ binop(
+ Iop_Or64,
+ binop(Iop_And64,
+ binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
+ mkexpr(maskBit3hi)),
+ binop(Iop_And64,
+ binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
+ unop(Iop_Not64,mkexpr(maskBit3hi))) ));
+
+ assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
+
+ /* And the same for the lower half of the result. What fun. */
+
+ assign(
+ mask0x80lo,
+ unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
+
+ assign(
+ maskBit3lo,
+ binop(Iop_SarN8x8,
+ binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
+ mkU8(7)));
+
+ assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
+
+ assign(
+ permdLo,
+ binop(
+ Iop_Or64,
+ binop(Iop_And64,
+ binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
+ mkexpr(maskBit3lo)),
+ binop(Iop_And64,
+ binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
+ unop(Iop_Not64,mkexpr(maskBit3lo))) ));
+
+ assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
+
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
+ );
+ goto decode_success;
+ }
+
+ /* ---------------------------------------------------- */
+ /* --- end of the SSSE3 decoder. --- */
+ /* ---------------------------------------------------- */
+
/*after_sse_decoders:*/
/* Get the primary opcode. */
@@ -14699,11 +15682,13 @@
decode_failure:
/* All decode failures end up here. */
vex_printf("vex amd64->IR: unhandled instruction bytes: "
- "0x%x 0x%x 0x%x 0x%x\n",
+ "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
(Int)getUChar(delta_start+0),
(Int)getUChar(delta_start+1),
(Int)getUChar(delta_start+2),
- (Int)getUChar(delta_start+3) );
+ (Int)getUChar(delta_start+3),
+ (Int)getUChar(delta_start+4),
+ (Int)getUChar(delta_start+5) );
/* Tell the dispatcher that this insn cannot be decoded, and so has
not been executed, and (is currently) the next to be executed.
Modified: trunk/priv/host-amd64/isel.c
===================================================================
--- trunk/priv/host-amd64/isel.c 2008-01-04 01:22:41 UTC (rev 1807)
+++ trunk/priv/host-amd64/isel.c 2008-02-06 11:42:45 UTC (rev 1808)
@@ -1038,6 +1038,12 @@
fn = (HWord)h_generic_calc_InterleaveHI32x2; break;
case Iop_InterleaveLO32x2:
fn = (HWord)h_generic_calc_InterleaveLO32x2; break;
+ case Iop_CatOddLanes16x4:
+ fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
+ case Iop_CatEvenLanes16x4:
+ fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
+ case Iop_Perm8x8:
+ fn = (HWord)h_generic_calc_Perm8x8; break;
case Iop_Max8Ux8:
fn = (HWord)h_generic_calc_Max8Ux8; break;
@@ -1050,6 +1056,8 @@
case Iop_Mul16x4:
fn = (HWord)h_generic_calc_Mul16x4; break;
+ case Iop_Mul32x2:
+ fn = (HWord)h_generic_calc_Mul32x2; break;
case Iop_MulHi16Sx4:
fn = (HWord)h_generic_calc_MulHi16Sx4; break;
case Iop_MulHi16Ux4:
@@ -1095,6 +1103,10 @@
fn = (HWord)h_generic_calc_ShlN16x4;
second_is_UInt = True;
break;
+ case Iop_ShlN8x8:
+ fn = (HWord)h_generic_calc_ShlN8x8;
+ second_is_UInt = True;
+ break;
case Iop_ShrN32x2:
fn = (HWord)h_generic_calc_ShrN32x2;
second_is_UInt = True;
Modified: trunk/priv/host-generic/h_generic_simd64.c
===================================================================
--- trunk/priv/host-generic/h_generic_simd64.c 2008-01-04 01:22:41 UTC (rev 1807)
+++ trunk/priv/host-generic/h_generic_simd64.c 2008-02-06 11:42:45 UTC (rev 1808)
@@ -142,7 +142,12 @@
return toUChar(0xFF & (lo32 >> 0));
}
+/* Select byte number (ix & 7) of w64, where byte number 0 is the
+ least significant byte. */
+static inline UChar index8x8 ( ULong w64, UChar ix ) {
+ const ULong shifted = w64 >> (8 * (ix & 7));
+ return toUChar(shifted & 0xFF);
+}
+
/* Scalar helpers. */
static inline Short qadd16S ( Short xx, Short yy )
@@ -213,6 +218,12 @@
return (Short)t;
}
+static inline Int mul32 ( Int xx, Int yy )
+{
+ UInt t = ((UInt)xx) * ((UInt)yy); /* unsigned multiply wraps; a signed one overflowing is undefined */
+ return (Int)t; /* low 32 bits of the product */
+}
+
static inline Short mulhi16S ( Short xx, Short yy )
{
Int t = ((Int)xx) * ((Int)yy);
@@ -299,6 +310,11 @@
/* shifts: we don't care about out-of-range ones, since
that is dealt with at a higher level. */
+static inline UChar shl8 ( UChar v, UInt n )
+{
+ return toUChar(v << n); /* v is promoted to int for the shift; h_generic_calc_ShlN8x8 masks n to 0 .. 7 */
+}
+
static inline UChar sar8 ( UChar v, UInt n )
{
return toUChar(((Char)v) >> n);
@@ -555,6 +571,14 @@
);
}
+ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
+{
+ return mk32x2( /* multiply matching 32-bit lanes of xx and yy via mul32, which returns Int */
+ mul32( sel32x2_1(xx), sel32x2_1(yy) ),
+ mul32( sel32x2_0(xx), sel32x2_0(yy) )
+ );
+}
+
ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
return mk16x4(
@@ -799,7 +823,43 @@
);
}
+/* ------------ Concatenation ------------ */
+ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
+{
+ return mk16x4( /* 16-bit lanes 3 and 1 of aa, followed by lanes 3 and 1 of bb */
+ sel16x4_3(aa),
+ sel16x4_1(aa),
+ sel16x4_3(bb),
+ sel16x4_1(bb)
+ );
+}
+
+ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
+{
+ return mk16x4( /* 16-bit lanes 2 and 0 of aa, followed by lanes 2 and 0 of bb */
+ sel16x4_2(aa),
+ sel16x4_0(aa),
+ sel16x4_2(bb),
+ sel16x4_0(bb)
+ );
+}
+
+/* Byte permute: result[i] = aa[ bb[i] & 7 ] for i in 0 .. 7. (Misc hack looking for a proper home.) */
+ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
+{
+ return mk8x8(
+ index8x8(aa, sel8x8_7(bb)),
+ index8x8(aa, sel8x8_6(bb)),
+ index8x8(aa, sel8x8_5(bb)),
+ index8x8(aa, sel8x8_4(bb)),
+ index8x8(aa, sel8x8_3(bb)),
+ index8x8(aa, sel8x8_2(bb)),
+ index8x8(aa, sel8x8_1(bb)),
+ index8x8(aa, sel8x8_0(bb))
+ );
+}
+
/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
equals or exceeds the lane width, the shift amount is masked so
@@ -829,6 +889,22 @@
);
}
+ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
+{
+ /* vassert(nn < 8); -- not asserted; an out-of-range amount is masked instead */
+ nn &= 7; /* keep the shift below the 8-bit lane width */
+ return mk8x8( /* every 8-bit lane of xx is shifted left by the same nn */
+ shl8( sel8x8_7(xx), nn ),
+ shl8( sel8x8_6(xx), nn ),
+ shl8( sel8x8_5(xx), nn ),
+ shl8( sel8x8_4(xx), nn ),
+ shl8( sel8x8_3(xx), nn ),
+ shl8( sel8x8_2(xx), nn ),
+ shl8( sel8x8_1(xx), nn ),
+ shl8( sel8x8_0(xx), nn )
+ );
+}
+
ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
/* vassert(nn < 32); */
Modified: trunk/priv/host-generic/h_generic_simd64.h
===================================================================
--- trunk/priv/host-generic/h_generic_simd64.h 2008-01-04 01:22:41 UTC (rev 1807)
+++ trunk/priv/host-generic/h_generic_simd64.h 2008-02-06 11:42:45 UTC (rev 1808)
@@ -83,6 +83,7 @@
extern ULong h_generic_calc_QSub8Ux8 ( ULong, ULong );
extern ULong h_generic_calc_Mul16x4 ( ULong, ULong );
+extern ULong h_generic_calc_Mul32x2 ( ULong, ULong );
extern ULong h_generic_calc_MulHi16Sx4 ( ULong, ULong );
extern ULong h_generic_calc_MulHi16Ux4 ( ULong, ULong );
@@ -108,6 +109,11 @@
extern ULong h_generic_calc_InterleaveHI32x2 ( ULong, ULong );
extern ULong h_generic_calc_InterleaveLO32x2 ( ULong, ULong );
+extern ULong h_generic_calc_CatOddLanes16x4 ( ULong, ULong );
+extern ULong h_generic_calc_CatEvenLanes16x4 ( ULong, ULong );
+extern ULong h_generic_calc_Perm8x8 ( ULong, ULong );
+
+extern ULong h_generic_calc_ShlN8x8 ( ULong, UInt );
extern ULong h_generic_calc_ShlN16x4 ( ULong, UInt );
extern ULong h_generic_calc_ShlN32x2 ( ULong, UInt );
Modified: trunk/priv/ir/irdefs.c
===================================================================
--- trunk/priv/ir/irdefs.c 2008-01-04 01:22:41 UTC (rev 1807)
+++ trunk/priv/ir/irdefs.c 2008-02-06 11:42:45 UTC (rev 1808)
@@ -326,6 +326,7 @@
case Iop_QSub8Sx8: vex_printf("QSub8Sx8"); return;
case Iop_QSub16Sx4: vex_printf("QSub16Sx4"); return;
case Iop_Mul16x4: vex_printf("Mul16x4"); return;
+ case Iop_Mul32x2: vex_printf("Mul32x2"); return;
case Iop_MulHi16Ux4: vex_printf("MulHi16Ux4"); return;
case Iop_MulHi16Sx4: vex_printf("MulHi16Sx4"); return;
case Iop_Avg8Ux8: vex_printf("Avg8Ux8"); return;
@@ -340,6 +341,7 @@
case Iop_CmpGT8Sx8: vex_printf("CmpGT8Sx8"); return;
case Iop_CmpGT16Sx4: vex_printf("CmpGT16Sx4"); return;
case Iop_CmpGT32Sx2: vex_printf("CmpGT32Sx2"); return;
+ case Iop_ShlN8x8: vex_printf("ShlN8x8"); return;
case Iop_ShlN16x4: vex_printf("ShlN16x4"); return;
case Iop_ShlN32x2: vex_printf("ShlN32x2"); return;
case Iop_ShrN16x4: vex_printf("ShrN16x4"); return;
@@ -356,6 +358,9 @@
case Iop_InterleaveLO8x8: vex_printf("InterleaveLO8x8"); return;
case Iop_InterleaveLO16x4: vex_printf("InterleaveLO16x4"); return;
case Iop_InterleaveLO32x2: vex_printf("InterleaveLO32x2"); return;
+ case Iop_CatOddLanes16x4: vex_printf("CatOddLanes16x4"); return;
+ case Iop_CatEvenLanes16x4: vex_printf("CatEvenLanes16x4"); return;
+ case Iop_Perm8x8: vex_printf("Perm8x8"); return;
case Iop_CmpNEZ32x2: vex_printf("CmpNEZ32x2"); return;
case Iop_CmpNEZ16x4: vex_printf("CmpNEZ16x4"); return;
@@ -1506,9 +1511,12 @@
case Iop_InterleaveHI8x8: case Iop_InterleaveLO8x8:
case Iop_InterleaveHI16x4: case Iop_InterleaveLO16x4:
case Iop_InterleaveHI32x2: case Iop_InterleaveLO32x2:
+ case Iop_CatOddLanes16x4: case Iop_CatEvenLanes16x4:
+ case Iop_Perm8x8:
case Iop_Max8Ux8: case Iop_Max16Sx4:
case Iop_Min8Ux8: case Iop_Min16Sx4:
- case Iop_Mul16x4: case Iop_MulHi16Sx4: case Iop_MulHi16Ux4:
+ case Iop_Mul16x4: case Iop_Mul32x2:
+ case Iop_MulHi16Sx4: case Iop_MulHi16Ux4:
case Iop_QAdd8Sx8: case Iop_QAdd16Sx4:
case Iop_QAdd8Ux8: case Iop_QAdd16Ux4:
case Iop_QNarrow32Sx2:
@@ -1518,7 +1526,7 @@
case Iop_QSub8Ux8: case Iop_QSub16Ux4:
BINARY(Ity_I64,Ity_I64, Ity_I64);
- case Iop_ShlN32x2: case Iop_ShlN16x4:
+ case Iop_ShlN32x2: case Iop_ShlN16x4: case Iop_ShlN8x8:
case Iop_ShrN32x2: case Iop_ShrN16x4:
case Iop_SarN32x2: case Iop_SarN16x4: case Iop_SarN8x8:
BINARY(Ity_I64,Ity_I8, Ity_I64);
Modified: trunk/pub/libvex_ir.h
===================================================================
--- trunk/pub/libvex_ir.h 2008-01-04 01:22:41 UTC (rev 1807)
+++ trunk/pub/libvex_ir.h 2008-02-06 11:42:45 UTC (rev 1808)
@@ -658,7 +658,7 @@
Iop_QSub8Sx8, Iop_QSub16Sx4,
/* MULTIPLICATION (normal / high half of signed/unsigned) */
- Iop_Mul16x4,
+ Iop_Mul16x4, Iop_Mul32x2,
Iop_MulHi16Ux4,
Iop_MulHi16Sx4,
@@ -677,7 +677,7 @@
Iop_CmpGT8Sx8, Iop_CmpGT16Sx4, Iop_CmpGT32Sx2,
/* VECTOR x SCALAR SHIFT (shift amt :: Ity_I8) */
- Iop_ShlN16x4, Iop_ShlN32x2,
+ Iop_ShlN8x8, Iop_ShlN16x4, Iop_ShlN32x2,
Iop_ShrN16x4, Iop_ShrN32x2,
Iop_SarN8x8, Iop_SarN16x4, Iop_SarN32x2,
@@ -692,6 +692,19 @@
Iop_InterleaveHI8x8, Iop_InterleaveHI16x4, Iop_InterleaveHI32x2,
Iop_InterleaveLO8x8, Iop_InterleaveLO16x4, Iop_InterleaveLO32x2,
+ /* CONCATENATION -- build a new value by concatenating either
+ the even or odd lanes of both operands. Note that
+ Cat{Odd,Even}Lanes32x2 are identical to Interleave{HI,LO}32x2
+ and so are omitted. */
+ Iop_CatOddLanes16x4, Iop_CatEvenLanes16x4,
+
+ /* PERMUTING -- copy src bytes to dst,
+ as indexed by control vector bytes:
+ for i in 0 .. 7 . result[i] = argL[ argR[i] ]
+ argR[i] values may only be in the range 0 .. 7, else behaviour
+ is undefined. */
+ Iop_Perm8x8,
+
/* ------------------ 128-bit SIMD FP. ------------------ */
/* --- 32x4 vector FP --- */
|