You can subscribe to this list here.
| 2002 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
(1) |
Oct
(122) |
Nov
(152) |
Dec
(69) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2003 |
Jan
(6) |
Feb
(25) |
Mar
(73) |
Apr
(82) |
May
(24) |
Jun
(25) |
Jul
(10) |
Aug
(11) |
Sep
(10) |
Oct
(54) |
Nov
(203) |
Dec
(182) |
| 2004 |
Jan
(307) |
Feb
(305) |
Mar
(430) |
Apr
(312) |
May
(187) |
Jun
(342) |
Jul
(487) |
Aug
(637) |
Sep
(336) |
Oct
(373) |
Nov
(441) |
Dec
(210) |
| 2005 |
Jan
(385) |
Feb
(480) |
Mar
(636) |
Apr
(544) |
May
(679) |
Jun
(625) |
Jul
(810) |
Aug
(838) |
Sep
(634) |
Oct
(521) |
Nov
(965) |
Dec
(543) |
| 2006 |
Jan
(494) |
Feb
(431) |
Mar
(546) |
Apr
(411) |
May
(406) |
Jun
(322) |
Jul
(256) |
Aug
(401) |
Sep
(345) |
Oct
(542) |
Nov
(308) |
Dec
(481) |
| 2007 |
Jan
(427) |
Feb
(326) |
Mar
(367) |
Apr
(255) |
May
(244) |
Jun
(204) |
Jul
(223) |
Aug
(231) |
Sep
(354) |
Oct
(374) |
Nov
(497) |
Dec
(362) |
| 2008 |
Jan
(322) |
Feb
(482) |
Mar
(658) |
Apr
(422) |
May
(476) |
Jun
(396) |
Jul
(455) |
Aug
(267) |
Sep
(280) |
Oct
(253) |
Nov
(232) |
Dec
(304) |
| 2009 |
Jan
(486) |
Feb
(470) |
Mar
(458) |
Apr
(423) |
May
(696) |
Jun
(461) |
Jul
(551) |
Aug
(575) |
Sep
(134) |
Oct
(110) |
Nov
(157) |
Dec
(102) |
| 2010 |
Jan
(226) |
Feb
(86) |
Mar
(147) |
Apr
(117) |
May
(107) |
Jun
(203) |
Jul
(193) |
Aug
(238) |
Sep
(300) |
Oct
(246) |
Nov
(23) |
Dec
(75) |
| 2011 |
Jan
(133) |
Feb
(195) |
Mar
(315) |
Apr
(200) |
May
(267) |
Jun
(293) |
Jul
(353) |
Aug
(237) |
Sep
(278) |
Oct
(611) |
Nov
(274) |
Dec
(260) |
| 2012 |
Jan
(303) |
Feb
(391) |
Mar
(417) |
Apr
(441) |
May
(488) |
Jun
(655) |
Jul
(590) |
Aug
(610) |
Sep
(526) |
Oct
(478) |
Nov
(359) |
Dec
(372) |
| 2013 |
Jan
(467) |
Feb
(226) |
Mar
(391) |
Apr
(281) |
May
(299) |
Jun
(252) |
Jul
(311) |
Aug
(352) |
Sep
(481) |
Oct
(571) |
Nov
(222) |
Dec
(231) |
| 2014 |
Jan
(185) |
Feb
(329) |
Mar
(245) |
Apr
(238) |
May
(281) |
Jun
(399) |
Jul
(382) |
Aug
(500) |
Sep
(579) |
Oct
(435) |
Nov
(487) |
Dec
(256) |
| 2015 |
Jan
(338) |
Feb
(357) |
Mar
(330) |
Apr
(294) |
May
(191) |
Jun
(108) |
Jul
(142) |
Aug
(261) |
Sep
(190) |
Oct
(54) |
Nov
(83) |
Dec
(22) |
| 2016 |
Jan
(49) |
Feb
(89) |
Mar
(33) |
Apr
(50) |
May
(27) |
Jun
(34) |
Jul
(53) |
Aug
(53) |
Sep
(98) |
Oct
(206) |
Nov
(93) |
Dec
(53) |
| 2017 |
Jan
(65) |
Feb
(82) |
Mar
(102) |
Apr
(86) |
May
(187) |
Jun
(67) |
Jul
(23) |
Aug
(93) |
Sep
(65) |
Oct
(45) |
Nov
(35) |
Dec
(17) |
| 2018 |
Jan
(26) |
Feb
(35) |
Mar
(38) |
Apr
(32) |
May
(8) |
Jun
(43) |
Jul
(27) |
Aug
(30) |
Sep
(43) |
Oct
(42) |
Nov
(38) |
Dec
(67) |
| 2019 |
Jan
(32) |
Feb
(37) |
Mar
(53) |
Apr
(64) |
May
(49) |
Jun
(18) |
Jul
(14) |
Aug
(53) |
Sep
(25) |
Oct
(30) |
Nov
(49) |
Dec
(31) |
| 2020 |
Jan
(87) |
Feb
(45) |
Mar
(37) |
Apr
(51) |
May
(99) |
Jun
(36) |
Jul
(11) |
Aug
(14) |
Sep
(20) |
Oct
(24) |
Nov
(40) |
Dec
(23) |
| 2021 |
Jan
(14) |
Feb
(53) |
Mar
(85) |
Apr
(15) |
May
(19) |
Jun
(3) |
Jul
(14) |
Aug
(1) |
Sep
(57) |
Oct
(73) |
Nov
(56) |
Dec
(22) |
| 2022 |
Jan
(3) |
Feb
(22) |
Mar
(6) |
Apr
(55) |
May
(46) |
Jun
(39) |
Jul
(15) |
Aug
(9) |
Sep
(11) |
Oct
(34) |
Nov
(20) |
Dec
(36) |
| 2023 |
Jan
(79) |
Feb
(41) |
Mar
(99) |
Apr
(169) |
May
(48) |
Jun
(16) |
Jul
(16) |
Aug
(57) |
Sep
(19) |
Oct
|
Nov
|
Dec
|
| S | M | T | W | T | F | S |
|---|---|---|---|---|---|---|
|
|
|
1
(9) |
2
(13) |
3
(12) |
4
(4) |
5
(5) |
|
6
(5) |
7
(16) |
8
(9) |
9
(14) |
10
(15) |
11
(7) |
12
(2) |
|
13
(7) |
14
(9) |
15
(8) |
16
(6) |
17
|
18
(2) |
19
(4) |
|
20
|
21
(7) |
22
(12) |
23
(6) |
24
(2) |
25
(3) |
26
(2) |
|
27
(2) |
28
(5) |
29
(7) |
30
(10) |
|
|
|
|
From: <sv...@va...> - 2010-06-18 08:18:47
|
Author: sewardj
Date: 2010-06-18 09:18:38 +0100 (Fri, 18 Jun 2010)
New Revision: 11181
Log:
Valgrind-side changes needed to go with vex r1984 (Implement SSE4
insns: PCMPGTQ PMAXUD PMINUD PMAXSB PMINSB PMULLD)
Modified:
trunk/Makefile.vex.am
trunk/memcheck/mc_translate.c
trunk/none/tests/amd64/sse4-64.c
Modified: trunk/Makefile.vex.am
===================================================================
--- trunk/Makefile.vex.am 2010-06-15 14:55:28 UTC (rev 11180)
+++ trunk/Makefile.vex.am 2010-06-18 08:18:38 UTC (rev 11181)
@@ -40,6 +40,7 @@
priv/guest_arm_defs.h \
priv/host_generic_regs.h \
priv/host_generic_simd64.h \
+ priv/host_generic_simd128.h \
priv/host_x86_defs.h \
priv/host_amd64_defs.h \
priv/host_ppc_defs.h \
@@ -90,6 +91,7 @@
priv/guest_arm_toIR.c \
priv/host_generic_regs.c \
priv/host_generic_simd64.c \
+ priv/host_generic_simd128.c \
priv/host_generic_reg_alloc2.c \
priv/host_x86_defs.c \
priv/host_x86_isel.c \
Modified: trunk/memcheck/mc_translate.c
===================================================================
--- trunk/memcheck/mc_translate.c 2010-06-15 14:55:28 UTC (rev 11180)
+++ trunk/memcheck/mc_translate.c 2010-06-18 08:18:38 UTC (rev 11181)
@@ -2279,10 +2279,12 @@
case Iop_Max32Sx4:
case Iop_Min32Ux4:
case Iop_Min32Sx4:
+ case Iop_Mul32x4:
return binary32Ix4(mce, vatom1, vatom2);
case Iop_Sub64x2:
case Iop_Add64x2:
+ case Iop_CmpGT64Sx2:
return binary64Ix2(mce, vatom1, vatom2);
case Iop_QNarrow32Sx4:
Modified: trunk/none/tests/amd64/sse4-64.c
===================================================================
--- trunk/none/tests/amd64/sse4-64.c 2010-06-15 14:55:28 UTC (rev 11180)
+++ trunk/none/tests/amd64/sse4-64.c 2010-06-18 08:18:38 UTC (rev 11181)
@@ -58,6 +58,14 @@
}
RMArgs;
+static void do64HLtoV128 ( /*OUT*/V128* res, ULong wHi, ULong wLo )
+{
+ // try to sidestep strict-aliasing snafus by memcpying explicitly
+ UChar* p = (UChar*)res;
+ memcpy(&p[8], (UChar*)&wHi, 8);
+ memcpy(&p[0], (UChar*)&wLo, 8);
+}
+
static UChar randUChar ( void )
{
static UInt seed = 80021;
@@ -2059,12 +2067,40 @@
}
+void test_PCMPGTQ ( void )
+{
+ V128 spec[7];
+ do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0xffffffffffffffffULL );
+ do64HLtoV128( &spec[1], 0x0000000000000001ULL, 0xfffffffffffffffeULL );
+ do64HLtoV128( &spec[2], 0x7fffffffffffffffULL, 0x8000000000000001ULL );
+ do64HLtoV128( &spec[3], 0x8000000000000000ULL, 0x8000000000000000ULL );
+ do64HLtoV128( &spec[4], 0x8000000000000001ULL, 0x7fffffffffffffffULL );
+ do64HLtoV128( &spec[5], 0xfffffffffffffffeULL, 0x0000000000000001ULL );
+ do64HLtoV128( &spec[6], 0xffffffffffffffffULL, 0x0000000000000000ULL );
+ V128 src, dst;
+ Int i, j;
+ for (i = 0; i < 10; i++) {
+ randV128(&src);
+ randV128(&dst);
+ DO_mandr_r("pcmpgtq", src, dst);
+ }
+ for (i = 0; i < 7; i++) {
+ for (j = 0; j < 7; j++) {
+ memcpy(&src, &spec[i], 16);
+ memcpy(&dst, &spec[j], 16);
+ DO_mandr_r("pcmpgtq", src, dst);
+ }
+ }
+}
+
+
int main ( int argc, char** argv )
{
#if 1
+ // ------ SSE 4.1 ------
test_BLENDPD(); // done Apr.01.2010
test_BLENDPS(); // done Apr.02.2010
//test_PBLENDW();
@@ -2088,14 +2124,14 @@
//test_PINSRW(); // todo
//test_PINSRB(); // todo
//test_PHMINPOSUW();
- //test_PMAXSB();
+ test_PMAXSB();
test_PMAXSD(); // done Apr.09.2010
test_PMAXUD(); // done Apr.16.2010
- //test_PMAXUW();
- //test_PMINSB();
+ test_PMAXUW();
+ test_PMINSB();
test_PMINSD(); // done Apr.09.2010
test_PMINUD();
- //test_PMINUW();
+ test_PMINUW();
test_PMOVSXBW(); // done Apr.02.2010
test_PMOVSXBD(); // done Mar.30.2010
test_PMOVSXBQ(); // done Mar.30.2010
@@ -2112,13 +2148,16 @@
test_POPCNTL();
test_POPCNTQ();
//test_PMULDQ();
- //test_PMULLD();
+ test_PMULLD();
// PTEST
// ROUNDPD
// ROUNDPS
// ROUNDSD
// ROUNDSS
+ // ------ SSE 4.2 ------
+ test_PCMPGTQ();
#else
+ test_PMAXSB();
#endif
return 0;
|
|
From: <sv...@va...> - 2010-06-18 08:17:51
|
Author: sewardj
Date: 2010-06-18 09:17:41 +0100 (Fri, 18 Jun 2010)
New Revision: 1984
Log:
Implement SSE4 instructions: PCMPGTQ PMAXUD PMINUD PMAXSB PMINSB PMULLD
I believe this covers everything that gcc-4.4 and gcc-4.5 will generate
with "-O3 -msse4.2". Note, this commit changes the set of IR ops and so
requires a from-scratch rebuild of the tree.
Added:
trunk/priv/host_generic_simd128.c
trunk/priv/host_generic_simd128.h
Modified:
trunk/Makefile-gcc
trunk/priv/guest_amd64_toIR.c
trunk/priv/host_amd64_isel.c
trunk/priv/ir_defs.c
trunk/priv/main_main.c
trunk/pub/libvex_ir.h
Modified: trunk/Makefile-gcc
===================================================================
--- trunk/Makefile-gcc 2010-06-14 21:29:35 UTC (rev 1983)
+++ trunk/Makefile-gcc 2010-06-18 08:17:41 UTC (rev 1984)
@@ -17,6 +17,7 @@
priv/host_ppc_defs.h \
priv/host_generic_regs.h \
priv/host_generic_simd64.h \
+ priv/host_generic_simd128.h \
priv/main_globals.h \
priv/main_util.h \
priv/guest_generic_x87.h \
@@ -44,6 +45,7 @@
priv/host_ppc_isel.o \
priv/host_generic_regs.o \
priv/host_generic_simd64.o \
+ priv/host_generic_simd128.o \
priv/host_generic_reg_alloc2.o \
priv/guest_generic_x87.o \
priv/guest_generic_bb_to_IR.o \
@@ -262,6 +264,10 @@
$(CC) $(CCFLAGS) $(ALL_INCLUDES) -o priv/host_generic_simd64.o \
-c priv/host_generic_simd64.c
+priv/host_generic_simd128.o: $(ALL_HEADERS) priv/host_generic_simd128.c
+ $(CC) $(CCFLAGS) $(ALL_INCLUDES) -o priv/host_generic_simd128.o \
+ -c priv/host_generic_simd128.c
+
priv/host_generic_reg_alloc2.o: $(ALL_HEADERS) priv/host_generic_reg_alloc2.c
$(CC) $(CCFLAGS) $(ALL_INCLUDES) -o priv/host_generic_reg_alloc2.o \
-c priv/host_generic_reg_alloc2.c
Modified: trunk/priv/guest_amd64_toIR.c
===================================================================
--- trunk/priv/guest_amd64_toIR.c 2010-06-14 21:29:35 UTC (rev 1983)
+++ trunk/priv/guest_amd64_toIR.c 2010-06-18 08:17:41 UTC (rev 1984)
@@ -14406,122 +14406,86 @@
goto decode_success;
}
+ /* 66 0F 38 37 = PCMPGTQ
+ 64x2 comparison (signed, presumably; the Intel docs don't say :-)
+ */
+ if ( have66noF2noF3( pfx ) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x37) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+3,
+ "pcmpgtq", Iop_CmpGT64Sx2, False );
+ goto decode_success;
+ }
/* 66 0F 38 3D /r = PMAXSD xmm1, xmm2/m128
Maximum of Packed Signed Double Word Integers (XMM)
- --
66 0F 38 39 /r = PMINSD xmm1, xmm2/m128
Minimum of Packed Signed Double Word Integers (XMM) */
- if ( have66noF2noF3( pfx )
- && sz == 2
+ if ( have66noF2noF3( pfx ) && sz == 2
&& insn[0] == 0x0F && insn[1] == 0x38
- && ( (insn[2] == 0x3D) || (insn[2] == 0x39) ) ) {
-
- IRTemp reg_vec = newTemp(Ity_V128);
- IRTemp rom_vec = newTemp(Ity_V128);
- IRTemp mask_vec = newTemp(Ity_V128);
-
- Bool isPMAX = (insn[2] == 0x3D) ? True : False;
-
- HChar* str = isPMAX ? "pmaxsd" : "pminsd";
-
- modrm = insn[3];
- assign( reg_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
-
- if ( epartIsReg( modrm ) ) {
- assign( rom_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
- delta += 3+1;
- DIP( "%s %s,%s\n", str,
- nameXMMReg( eregOfRexRM(pfx, modrm) ),
- nameXMMReg( gregOfRexRM(pfx, modrm) ) );
- } else {
- addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
- assign( rom_vec, loadLE( Ity_V128, mkexpr(addr) ) );
- delta += 3+alen;
- DIP( "%s %s,%s\n", str, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
- }
-
- assign( mask_vec, binop( Iop_CmpGT32Sx4, mkexpr(reg_vec), mkexpr(rom_vec) ) );
-
- IRTemp max_min_vec = newTemp(Ity_V128);
- if ( isPMAX ) {
- assign( max_min_vec,
- binop( Iop_OrV128,
- binop( Iop_AndV128, mkexpr(rom_vec),
- unop( Iop_NotV128, mkexpr(mask_vec) ) ),
- binop( Iop_AndV128, mkexpr(reg_vec), mkexpr(mask_vec) ) ) );
- } else {
- assign( max_min_vec,
- binop( Iop_OrV128,
- binop( Iop_AndV128, mkexpr(reg_vec),
- unop( Iop_NotV128, mkexpr(mask_vec) ) ),
- binop( Iop_AndV128, mkexpr(rom_vec), mkexpr(mask_vec) ) ) );
- }
-
- putXMMReg( gregOfRexRM(pfx, modrm), mkexpr(max_min_vec) );
-
+ && (insn[2] == 0x3D || insn[2] == 0x39)) {
+ Bool isMAX = insn[2] == 0x3D;
+ delta = dis_SSEint_E_to_G(
+ vbi, pfx, delta+3,
+ isMAX ? "pmaxsd" : "pminsd",
+ isMAX ? Iop_Max32Sx4 : Iop_Min32Sx4,
+ False
+ );
goto decode_success;
}
-
/* 66 0F 38 3F /r = PMAXUD xmm1, xmm2/m128
Maximum of Packed Unsigned Doubleword Integers (XMM)
66 0F 38 3B /r = PMINUD xmm1, xmm2/m128
Minimum of Packed Unsigned Doubleword Integers (XMM) */
- if ( have66noF2noF3( pfx )
- && sz == 2
+ if ( have66noF2noF3( pfx ) && sz == 2
&& insn[0] == 0x0F && insn[1] == 0x38
&& (insn[2] == 0x3F || insn[2] == 0x3B)) {
+ Bool isMAX = insn[2] == 0x3F;
+ delta = dis_SSEint_E_to_G(
+ vbi, pfx, delta+3,
+ isMAX ? "pmaxud" : "pminud",
+ isMAX ? Iop_Max32Ux4 : Iop_Min32Ux4,
+ False
+ );
+ goto decode_success;
+ }
- Bool is_max = insn[2] == 0x3F;
- IRTemp reg_vec = newTemp(Ity_V128);
- IRTemp rom_vec = newTemp(Ity_V128);
- IRTemp mask_vec = newTemp(Ity_V128);
- IRTemp and_vec = newTemp(Ity_V128);
- IRTemp not_vec = newTemp(Ity_V128);
+ /* 66 0F 38 3E /r = PMAXUW xmm1, xmm2/m128
+ Maximum of Packed Unsigned Word Integers (XMM)
+ 66 0F 38 3A /r = PMINUW xmm1, xmm2/m128
+ Minimum of Packed Unsigned Word Integers (XMM)
+ */
+ if ( have66noF2noF3( pfx ) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x3E || insn[2] == 0x3A)) {
+ Bool isMAX = insn[2] == 0x3E;
+ delta = dis_SSEint_E_to_G(
+ vbi, pfx, delta+3,
+ isMAX ? "pmaxuw" : "pminuw",
+ isMAX ? Iop_Max16Ux8 : Iop_Min16Ux8,
+ False
+ );
+ goto decode_success;
+ }
- modrm = insn[3];
- assign( reg_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
-
- if ( epartIsReg( modrm ) ) {
- assign( rom_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
- delta += 3+1;
- DIP( "p%sud %s,%s\n",
- is_max ? "max" : "min",
- nameXMMReg( eregOfRexRM(pfx, modrm) ),
- nameXMMReg( gregOfRexRM(pfx, modrm) ) );
- } else {
- addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
- assign( rom_vec, loadLE( Ity_V128, mkexpr(addr) ) );
- delta += 3+alen;
- DIP( "p%sd %s,%s\n",
- is_max ? "max" : "min",
- dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
- }
-
- /* the foll. simulates Iop_CmpGT32Ux4 (not implemented)
- c.f. Hacker's Delight, S2-11, p.23 */
- assign( mask_vec,
- binop( Iop_XorV128,
- binop( Iop_XorV128,
- binop( Iop_CmpGT32Sx4, mkexpr(reg_vec), mkexpr(rom_vec) ),
- binop( Iop_SarN32x4, mkexpr(reg_vec), mkU8(31) ) ),
- binop( Iop_SarN32x4, mkexpr(rom_vec), mkU8(31) ) ) );
-
- assign( and_vec,
- binop( Iop_AndV128, mkexpr(is_max ? reg_vec : rom_vec),
- mkexpr(mask_vec) ) );
- assign( not_vec,
- binop( Iop_AndV128, mkexpr(is_max ? rom_vec : reg_vec),
- unop( Iop_NotV128, mkexpr(mask_vec) ) ) );
-
- putXMMReg( gregOfRexRM(pfx, modrm),
- binop( Iop_OrV128, mkexpr(not_vec), mkexpr(and_vec) ) );
-
+ /* 66 0F 38 3C /r = PMAXSB xmm1, xmm2/m128
+ 8Sx16 (signed) max
+ 66 0F 38 38 /r = PMINSB xmm1, xmm2/m128
+ 8Sx16 (signed) min
+ */
+ if ( have66noF2noF3( pfx ) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x3C || insn[2] == 0x38)) {
+ Bool isMAX = insn[2] == 0x3C;
+ delta = dis_SSEint_E_to_G(
+ vbi, pfx, delta+3,
+ isMAX ? "pmaxsb" : "pminsb",
+ isMAX ? Iop_Max8Sx16 : Iop_Min8Sx16,
+ False
+ );
goto decode_success;
}
-
/* 66 0f 38 20 /r = PMOVSXBW xmm1, xmm2/m64
Packed Move with Sign Extend from Byte to Word (XMM) */
if ( have66noF2noF3( pfx )
@@ -14964,6 +14928,40 @@
}
+ /* 66 0f 38 40 /r = PMULLD xmm1, xmm2/m128
+ 32x4 integer multiply from xmm2/m128 to xmm1 */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x40 ) {
+
+ modrm = insn[3];
+
+ IRTemp argL = newTemp(Ity_V128);
+ IRTemp argR = newTemp(Ity_V128);
+
+ if ( epartIsReg(modrm) ) {
+ assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1;
+ DIP( "pmulld %s,%s\n",
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
+ delta += 3+alen;
+ DIP( "pmulld %s,%s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));
+
+ putXMMReg( gregOfRexRM(pfx, modrm),
+ binop( Iop_Mul32x4, mkexpr(argL), mkexpr(argR)) );
+
+ goto decode_success;
+ }
+
+
/* F3 0F B8 = POPCNT{W,L,Q}
Count the number of 1 bits in a register
*/
Modified: trunk/priv/host_amd64_isel.c
===================================================================
--- trunk/priv/host_amd64_isel.c 2010-06-14 21:29:35 UTC (rev 1983)
+++ trunk/priv/host_amd64_isel.c 2010-06-18 08:17:41 UTC (rev 1984)
@@ -42,6 +42,7 @@
#include "main_globals.h"
#include "host_generic_regs.h"
#include "host_generic_simd64.h"
+#include "host_generic_simd128.h"
#include "host_amd64_defs.h"
@@ -3158,7 +3159,8 @@
/* DO NOT CALL THIS DIRECTLY */
static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
{
- Bool arg1isEReg = False;
+ HWord fn = 0; /* address of helper fn, if required */
+ Bool arg1isEReg = False;
AMD64SseOp op = Asse_INVALID;
IRType ty = typeOfIRExpr(env->type_env,e);
vassert(e);
@@ -3614,6 +3616,73 @@
return dst;
}
+ case Iop_Mul32x4: fn = (HWord)h_generic_calc_Mul32x4;
+ goto do_SseAssistedBinary;
+ case Iop_Max32Sx4: fn = (HWord)h_generic_calc_Max32Sx4;
+ goto do_SseAssistedBinary;
+ case Iop_Min32Sx4: fn = (HWord)h_generic_calc_Min32Sx4;
+ goto do_SseAssistedBinary;
+ case Iop_Max32Ux4: fn = (HWord)h_generic_calc_Max32Ux4;
+ goto do_SseAssistedBinary;
+ case Iop_Min32Ux4: fn = (HWord)h_generic_calc_Min32Ux4;
+ goto do_SseAssistedBinary;
+ case Iop_Max16Ux8: fn = (HWord)h_generic_calc_Max16Ux8;
+ goto do_SseAssistedBinary;
+ case Iop_Min16Ux8: fn = (HWord)h_generic_calc_Min16Ux8;
+ goto do_SseAssistedBinary;
+ case Iop_Max8Sx16: fn = (HWord)h_generic_calc_Max8Sx16;
+ goto do_SseAssistedBinary;
+ case Iop_Min8Sx16: fn = (HWord)h_generic_calc_Min8Sx16;
+ goto do_SseAssistedBinary;
+ case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
+ goto do_SseAssistedBinary;
+ do_SseAssistedBinary: {
+ /* RRRufff! RRRufff code is what we're generating here. Oh
+ well. */
+ vassert(fn != 0);
+ HReg dst = newVRegV(env);
+ HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg argp = newVRegI(env);
+ /* subq $112, %rsp -- make a space*/
+ sub_from_rsp(env, 112);
+ /* leaq 48(%rsp), %r_argp -- point into it */
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
+ argp));
+ /* andq $-16, %r_argp -- 16-align the pointer */
+ addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
+ AMD64RMI_Imm( ~(UInt)15 ),
+ argp));
+ /* Prepare 3 arg regs:
+ leaq 0(%r_argp), %rdi
+ leaq 16(%r_argp), %rsi
+ leaq 32(%r_argp), %rdx
+ */
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
+ hregAMD64_RDI()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
+ hregAMD64_RSI()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
+ hregAMD64_RDX()));
+ /* Store the two args, at (%rsi) and (%rdx):
+ movupd %argL, 0(%rsi)
+ movupd %argR, 0(%rdx)
+ */
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
+ AMD64AMode_IR(0, hregAMD64_RSI())));
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
+ AMD64AMode_IR(0, hregAMD64_RDX())));
+ /* call the helper */
+ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3 ));
+ /* fetch the result from memory, using %r_argp, which the
+ register allocator will keep alive across the call. */
+ addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
+ AMD64AMode_IR(0, argp)));
+ /* and finally, clear the space */
+ add_to_rsp(env, 112);
+ return dst;
+ }
+
default:
break;
} /* switch (e->Iex.Binop.op) */
Added: trunk/priv/host_generic_simd128.c
===================================================================
--- trunk/priv/host_generic_simd128.c (rev 0)
+++ trunk/priv/host_generic_simd128.c 2010-06-18 08:17:41 UTC (rev 1984)
@@ -0,0 +1,220 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_generic_simd128.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2010-2010 OpenWorks GbR
+ in...@op...
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+/* Generic helper functions for doing 128-bit SIMD arithmetic in cases
+ where the instruction selectors cannot generate code in-line.
+ These are purely back-end entities and cannot be seen/referenced
+ from IR. */
+
+#include "libvex_basictypes.h"
+#include "host_generic_simd128.h"
+
+
+/* Primitive helpers always take args of the real type (signed vs
+ unsigned) but return an unsigned result, so there's no conversion
+ weirdness when stuffing results back in the V128 union fields,
+ which are all unsigned. */
+
+static inline UInt mul32 ( Int xx, Int yy )
+{
+ Int t = ((Int)xx) * ((Int)yy);
+ return toUInt(t);
+}
+
+static inline UInt max32S ( Int xx, Int yy )
+{
+ return toUInt((xx > yy) ? xx : yy);
+}
+
+static inline UInt min32S ( Int xx, Int yy )
+{
+ return toUInt((xx < yy) ? xx : yy);
+}
+
+static inline UInt max32U ( UInt xx, UInt yy )
+{
+ return toUInt((xx > yy) ? xx : yy);
+}
+
+static inline UInt min32U ( UInt xx, UInt yy )
+{
+ return toUInt((xx < yy) ? xx : yy);
+}
+
+static inline UShort max16U ( UShort xx, UShort yy )
+{
+ return toUShort((xx > yy) ? xx : yy);
+}
+
+static inline UShort min16U ( UShort xx, UShort yy )
+{
+ return toUShort((xx < yy) ? xx : yy);
+}
+
+static inline UChar max8S ( Char xx, Char yy )
+{
+ return toUChar((xx > yy) ? xx : yy);
+}
+
+static inline UChar min8S ( Char xx, Char yy )
+{
+ return toUChar((xx < yy) ? xx : yy);
+}
+
+static inline ULong cmpGT64S ( Long xx, Long yy )
+{
+ return (((Long)xx) > ((Long)yy))
+ ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
+}
+
+void h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w32[0] = mul32(argL->w32[0], argR->w32[0]);
+ res->w32[1] = mul32(argL->w32[1], argR->w32[1]);
+ res->w32[2] = mul32(argL->w32[2], argR->w32[2]);
+ res->w32[3] = mul32(argL->w32[3], argR->w32[3]);
+}
+
+void h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w32[0] = max32S(argL->w32[0], argR->w32[0]);
+ res->w32[1] = max32S(argL->w32[1], argR->w32[1]);
+ res->w32[2] = max32S(argL->w32[2], argR->w32[2]);
+ res->w32[3] = max32S(argL->w32[3], argR->w32[3]);
+}
+
+void h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w32[0] = min32S(argL->w32[0], argR->w32[0]);
+ res->w32[1] = min32S(argL->w32[1], argR->w32[1]);
+ res->w32[2] = min32S(argL->w32[2], argR->w32[2]);
+ res->w32[3] = min32S(argL->w32[3], argR->w32[3]);
+}
+
+void h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w32[0] = max32U(argL->w32[0], argR->w32[0]);
+ res->w32[1] = max32U(argL->w32[1], argR->w32[1]);
+ res->w32[2] = max32U(argL->w32[2], argR->w32[2]);
+ res->w32[3] = max32U(argL->w32[3], argR->w32[3]);
+}
+
+void h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w32[0] = min32U(argL->w32[0], argR->w32[0]);
+ res->w32[1] = min32U(argL->w32[1], argR->w32[1]);
+ res->w32[2] = min32U(argL->w32[2], argR->w32[2]);
+ res->w32[3] = min32U(argL->w32[3], argR->w32[3]);
+}
+
+void h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w16[0] = max16U(argL->w16[0], argR->w16[0]);
+ res->w16[1] = max16U(argL->w16[1], argR->w16[1]);
+ res->w16[2] = max16U(argL->w16[2], argR->w16[2]);
+ res->w16[3] = max16U(argL->w16[3], argR->w16[3]);
+ res->w16[4] = max16U(argL->w16[4], argR->w16[4]);
+ res->w16[5] = max16U(argL->w16[5], argR->w16[5]);
+ res->w16[6] = max16U(argL->w16[6], argR->w16[6]);
+ res->w16[7] = max16U(argL->w16[7], argR->w16[7]);
+}
+
+void h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w16[0] = min16U(argL->w16[0], argR->w16[0]);
+ res->w16[1] = min16U(argL->w16[1], argR->w16[1]);
+ res->w16[2] = min16U(argL->w16[2], argR->w16[2]);
+ res->w16[3] = min16U(argL->w16[3], argR->w16[3]);
+ res->w16[4] = min16U(argL->w16[4], argR->w16[4]);
+ res->w16[5] = min16U(argL->w16[5], argR->w16[5]);
+ res->w16[6] = min16U(argL->w16[6], argR->w16[6]);
+ res->w16[7] = min16U(argL->w16[7], argR->w16[7]);
+}
+
+void h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w8[ 0] = max8S(argL->w8[ 0], argR->w8[ 0]);
+ res->w8[ 1] = max8S(argL->w8[ 1], argR->w8[ 1]);
+ res->w8[ 2] = max8S(argL->w8[ 2], argR->w8[ 2]);
+ res->w8[ 3] = max8S(argL->w8[ 3], argR->w8[ 3]);
+ res->w8[ 4] = max8S(argL->w8[ 4], argR->w8[ 4]);
+ res->w8[ 5] = max8S(argL->w8[ 5], argR->w8[ 5]);
+ res->w8[ 6] = max8S(argL->w8[ 6], argR->w8[ 6]);
+ res->w8[ 7] = max8S(argL->w8[ 7], argR->w8[ 7]);
+ res->w8[ 8] = max8S(argL->w8[ 8], argR->w8[ 8]);
+ res->w8[ 9] = max8S(argL->w8[ 9], argR->w8[ 9]);
+ res->w8[10] = max8S(argL->w8[10], argR->w8[10]);
+ res->w8[11] = max8S(argL->w8[11], argR->w8[11]);
+ res->w8[12] = max8S(argL->w8[12], argR->w8[12]);
+ res->w8[13] = max8S(argL->w8[13], argR->w8[13]);
+ res->w8[14] = max8S(argL->w8[14], argR->w8[14]);
+ res->w8[15] = max8S(argL->w8[15], argR->w8[15]);
+}
+
+void h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w8[ 0] = min8S(argL->w8[ 0], argR->w8[ 0]);
+ res->w8[ 1] = min8S(argL->w8[ 1], argR->w8[ 1]);
+ res->w8[ 2] = min8S(argL->w8[ 2], argR->w8[ 2]);
+ res->w8[ 3] = min8S(argL->w8[ 3], argR->w8[ 3]);
+ res->w8[ 4] = min8S(argL->w8[ 4], argR->w8[ 4]);
+ res->w8[ 5] = min8S(argL->w8[ 5], argR->w8[ 5]);
+ res->w8[ 6] = min8S(argL->w8[ 6], argR->w8[ 6]);
+ res->w8[ 7] = min8S(argL->w8[ 7], argR->w8[ 7]);
+ res->w8[ 8] = min8S(argL->w8[ 8], argR->w8[ 8]);
+ res->w8[ 9] = min8S(argL->w8[ 9], argR->w8[ 9]);
+ res->w8[10] = min8S(argL->w8[10], argR->w8[10]);
+ res->w8[11] = min8S(argL->w8[11], argR->w8[11]);
+ res->w8[12] = min8S(argL->w8[12], argR->w8[12]);
+ res->w8[13] = min8S(argL->w8[13], argR->w8[13]);
+ res->w8[14] = min8S(argL->w8[14], argR->w8[14]);
+ res->w8[15] = min8S(argL->w8[15], argR->w8[15]);
+}
+
+void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w64[0] = cmpGT64S(argL->w64[0], argR->w64[0]);
+ res->w64[1] = cmpGT64S(argL->w64[1], argR->w64[1]);
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- end host_generic_simd128.c ---*/
+/*---------------------------------------------------------------*/
Added: trunk/priv/host_generic_simd128.h
===================================================================
--- trunk/priv/host_generic_simd128.h (rev 0)
+++ trunk/priv/host_generic_simd128.h 2010-06-18 08:17:41 UTC (rev 1984)
@@ -0,0 +1,79 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_generic_simd128.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2010-2010 OpenWorks GbR
+ in...@op...
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+/* Generic helper functions for doing 128-bit SIMD arithmetic in cases
+ where the instruction selectors cannot generate code in-line.
+ These are purely back-end entities and cannot be seen/referenced
+ as clean helper functions from IR.
+
+ These will get called from generated code and therefore should be
+ well behaved -- no floating point or mmx insns, just straight
+ integer code.
+
+ Each function implements the correspondingly-named IR primop.
+*/
+
+#ifndef __VEX_HOST_GENERIC_SIMD128_H
+#define __VEX_HOST_GENERIC_SIMD128_H
+
+/* A union for doing 128-bit primitives conveniently. It is not
+ public and so not placed in pub/. */
+typedef
+ union {
+ UChar w8[16];
+ UShort w16[8];
+ UInt w32[4];
+ ULong w64[2];
+ }
+ V128;
+
+
+#include "libvex_basictypes.h"
+
+/* DO NOT MAKE THESE INTO REGPARM FNS! THIS WILL BREAK CALLING
+ SEQUENCES GENERATED BY host-x86/isel.c. */
+
+extern void h_generic_calc_Mul32x4 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max32Sx4 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min32Sx4 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max32Ux4 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min32Ux4 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max16Ux8 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min16Ux8 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max8Sx16 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min8Sx16 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128*, V128*, V128* );
+
+
+#endif /* ndef __VEX_HOST_GENERIC_SIMD128_H */
+
+/*---------------------------------------------------------------*/
+/*--- end host_generic_simd128.h ---*/
+/*---------------------------------------------------------------*/
Modified: trunk/priv/ir_defs.c
===================================================================
--- trunk/priv/ir_defs.c 2010-06-14 21:29:35 UTC (rev 1983)
+++ trunk/priv/ir_defs.c 2010-06-18 08:17:41 UTC (rev 1984)
@@ -336,6 +336,7 @@
case Iop_QSub16Sx4: vex_printf("QSub16Sx4"); return;
case Iop_Mul16x4: vex_printf("Mul16x4"); return;
case Iop_Mul32x2: vex_printf("Mul32x2"); return;
+ case Iop_Mul32x4: vex_printf("Mul32x4"); return;
case Iop_MulHi16Ux4: vex_printf("MulHi16Ux4"); return;
case Iop_MulHi16Sx4: vex_printf("MulHi16Sx4"); return;
case Iop_Avg8Ux8: vex_printf("Avg8Ux8"); return;
@@ -525,6 +526,7 @@
case Iop_CmpGT8Sx16: vex_printf("CmpGT8Sx16"); return;
case Iop_CmpGT16Sx8: vex_printf("CmpGT16Sx8"); return;
case Iop_CmpGT32Sx4: vex_printf("CmpGT32Sx4"); return;
+ case Iop_CmpGT64Sx2: vex_printf("CmpGT64Sx2"); return;
case Iop_CmpGT8Ux16: vex_printf("CmpGT8Ux16"); return;
case Iop_CmpGT16Ux8: vex_printf("CmpGT16Ux8"); return;
case Iop_CmpGT32Ux4: vex_printf("CmpGT32Ux4"); return;
@@ -1899,7 +1901,7 @@
case Iop_Sub32x4: case Iop_Sub64x2:
case Iop_QSub8Ux16: case Iop_QSub16Ux8: case Iop_QSub32Ux4:
case Iop_QSub8Sx16: case Iop_QSub16Sx8: case Iop_QSub32Sx4:
- case Iop_Mul16x8:
+ case Iop_Mul16x8: case Iop_Mul32x4:
case Iop_MulHi16Ux8: case Iop_MulHi32Ux4:
case Iop_MulHi16Sx8: case Iop_MulHi32Sx4:
case Iop_MullEven8Ux16: case Iop_MullEven16Ux8:
@@ -1912,6 +1914,7 @@
case Iop_Min8Ux16: case Iop_Min16Ux8: case Iop_Min32Ux4:
case Iop_CmpEQ8x16: case Iop_CmpEQ16x8: case Iop_CmpEQ32x4:
case Iop_CmpGT8Sx16: case Iop_CmpGT16Sx8: case Iop_CmpGT32Sx4:
+ case Iop_CmpGT64Sx2:
case Iop_CmpGT8Ux16: case Iop_CmpGT16Ux8: case Iop_CmpGT32Ux4:
case Iop_Shl8x16: case Iop_Shl16x8: case Iop_Shl32x4:
case Iop_Shr8x16: case Iop_Shr16x8: case Iop_Shr32x4:
Modified: trunk/priv/main_main.c
===================================================================
--- trunk/priv/main_main.c 2010-06-14 21:29:35 UTC (rev 1983)
+++ trunk/priv/main_main.c 2010-06-18 08:17:41 UTC (rev 1984)
@@ -57,7 +57,9 @@
#include "guest_arm_defs.h"
#include "guest_ppc_defs.h"
+#include "host_generic_simd128.h"
+
/* This file contains the top level interface to the library. */
/* --------- fwds ... --------- */
@@ -141,6 +143,7 @@
vassert(4 == sizeof(Addr32));
vassert(8 == sizeof(Addr64));
vassert(16 == sizeof(U128));
+ vassert(16 == sizeof(V128));
vassert(sizeof(void*) == 4 || sizeof(void*) == 8);
vassert(sizeof(void*) == sizeof(int*));
Modified: trunk/pub/libvex_ir.h
===================================================================
--- trunk/pub/libvex_ir.h 2010-06-14 21:29:35 UTC (rev 1983)
+++ trunk/pub/libvex_ir.h 2010-06-18 08:17:41 UTC (rev 1984)
@@ -834,7 +834,7 @@
Iop_QSub8Sx16, Iop_QSub16Sx8, Iop_QSub32Sx4,
/* MULTIPLICATION (normal / high half of signed/unsigned) */
- Iop_Mul16x8,
+ Iop_Mul16x8, Iop_Mul32x4,
Iop_MulHi16Ux8, Iop_MulHi32Ux4,
Iop_MulHi16Sx8, Iop_MulHi32Sx4,
/* (widening signed/unsigned of even lanes, with lowest lane=zero) */
@@ -853,7 +853,7 @@
/* COMPARISON */
Iop_CmpEQ8x16, Iop_CmpEQ16x8, Iop_CmpEQ32x4,
- Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4,
+ Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, Iop_CmpGT64Sx2,
Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4,
/* VECTOR x SCALAR SHIFT (shift amt :: Ity_I8) */
|