From: Julian S. <se...@so...> - 2018-12-22 18:03:34
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=3b2f8bf69ea11f13357468d28cebc88d41be9199 commit 3b2f8bf69ea11f13357468d28cebc88d41be9199 Author: Julian Seward <js...@ac...> Date: Sat Dec 22 19:01:50 2018 +0100 amd64 back end: generate improved SIMD64 code. For most SIMD operations that happen on 64-bit values (as would arise from MMX instructions, for example, such as Add16x4, CmpEQ32x2, etc), generate code that performs the operation using SSE/SSE2 instructions on values in the low halves of XMM registers. This is much more efficient than the previous scheme of calling out to helper functions written in C. There are still a few SIMD64 operations done via helpers, though. Diff: --- VEX/priv/host_amd64_isel.c | 383 ++++++++++++++++++++++++++------------------- 1 file changed, 219 insertions(+), 164 deletions(-) diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c index e67edc5..faddc68 100644 --- a/VEX/priv/host_amd64_isel.c +++ b/VEX/priv/host_amd64_isel.c @@ -923,10 +923,6 @@ static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e ) /* DO NOT CALL THIS DIRECTLY ! */ static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e ) { - /* Used for unary/binary SIMD64 ops. */ - HWord fn = 0; - Bool second_is_UInt; - MatchInfo mi; DECLARE_PATTERN(p_1Uto8_64to1); DECLARE_PATTERN(p_LDle8_then_8Uto64); @@ -1089,164 +1085,7 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e ) return dst; } - /* Deal with 64-bit SIMD binary ops */ - second_is_UInt = False; - switch (e->Iex.Binop.op) { - case Iop_Add8x8: - fn = (HWord)h_generic_calc_Add8x8; break; - case Iop_Add16x4: - fn = (HWord)h_generic_calc_Add16x4; break; - case Iop_Add32x2: - fn = (HWord)h_generic_calc_Add32x2; break; - - case Iop_Avg8Ux8: - fn = (HWord)h_generic_calc_Avg8Ux8; break; - case Iop_Avg16Ux4: - fn = (HWord)h_generic_calc_Avg16Ux4; break; - - case Iop_CmpEQ8x8: - fn = (HWord)h_generic_calc_CmpEQ8x8; break; - case Iop_CmpEQ16x4: - fn = (HWord)h_generic_calc_CmpEQ16x4; break; - case Iop_CmpEQ32x2: - fn = (HWord)h_generic_calc_CmpEQ32x2; break; - - case Iop_CmpGT8Sx8: - fn = (HWord)h_generic_calc_CmpGT8Sx8; break; - case Iop_CmpGT16Sx4: - fn = (HWord)h_generic_calc_CmpGT16Sx4; break; - case Iop_CmpGT32Sx2: - fn = (HWord)h_generic_calc_CmpGT32Sx2; break; - - case Iop_InterleaveHI8x8: - fn = (HWord)h_generic_calc_InterleaveHI8x8; break; - case Iop_InterleaveLO8x8: - fn = (HWord)h_generic_calc_InterleaveLO8x8; break; - case Iop_InterleaveHI16x4: - fn = (HWord)h_generic_calc_InterleaveHI16x4; break; - case Iop_InterleaveLO16x4: - fn = (HWord)h_generic_calc_InterleaveLO16x4; break; - case Iop_InterleaveHI32x2: - fn = (HWord)h_generic_calc_InterleaveHI32x2; break; - case Iop_InterleaveLO32x2: - fn = (HWord)h_generic_calc_InterleaveLO32x2; break; - case Iop_CatOddLanes16x4: - fn = (HWord)h_generic_calc_CatOddLanes16x4; break; - case Iop_CatEvenLanes16x4: - fn = (HWord)h_generic_calc_CatEvenLanes16x4; break; - case Iop_PermOrZero8x8: - fn = (HWord)h_generic_calc_PermOrZero8x8; break; - - case Iop_Max8Ux8: - fn = (HWord)h_generic_calc_Max8Ux8; break; - case Iop_Max16Sx4: - fn = (HWord)h_generic_calc_Max16Sx4; break; - case Iop_Min8Ux8: - fn = (HWord)h_generic_calc_Min8Ux8; break; - case Iop_Min16Sx4: - fn = (HWord)h_generic_calc_Min16Sx4; break; - - case Iop_Mul16x4: - fn = (HWord)h_generic_calc_Mul16x4; break; - case Iop_Mul32x2: - fn = (HWord)h_generic_calc_Mul32x2; break; - case Iop_MulHi16Sx4: - fn = (HWord)h_generic_calc_MulHi16Sx4; break; - case Iop_MulHi16Ux4: - fn = 
(HWord)h_generic_calc_MulHi16Ux4; break; - - case Iop_QAdd8Sx8: - fn = (HWord)h_generic_calc_QAdd8Sx8; break; - case Iop_QAdd16Sx4: - fn = (HWord)h_generic_calc_QAdd16Sx4; break; - case Iop_QAdd8Ux8: - fn = (HWord)h_generic_calc_QAdd8Ux8; break; - case Iop_QAdd16Ux4: - fn = (HWord)h_generic_calc_QAdd16Ux4; break; - - case Iop_QNarrowBin32Sto16Sx4: - fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break; - case Iop_QNarrowBin16Sto8Sx8: - fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break; - case Iop_QNarrowBin16Sto8Ux8: - fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break; - case Iop_NarrowBin16to8x8: - fn = (HWord)h_generic_calc_NarrowBin16to8x8; break; - case Iop_NarrowBin32to16x4: - fn = (HWord)h_generic_calc_NarrowBin32to16x4; break; - - case Iop_QSub8Sx8: - fn = (HWord)h_generic_calc_QSub8Sx8; break; - case Iop_QSub16Sx4: - fn = (HWord)h_generic_calc_QSub16Sx4; break; - case Iop_QSub8Ux8: - fn = (HWord)h_generic_calc_QSub8Ux8; break; - case Iop_QSub16Ux4: - fn = (HWord)h_generic_calc_QSub16Ux4; break; - - case Iop_Sub8x8: - fn = (HWord)h_generic_calc_Sub8x8; break; - case Iop_Sub16x4: - fn = (HWord)h_generic_calc_Sub16x4; break; - case Iop_Sub32x2: - fn = (HWord)h_generic_calc_Sub32x2; break; - - case Iop_ShlN32x2: - fn = (HWord)h_generic_calc_ShlN32x2; - second_is_UInt = True; - break; - case Iop_ShlN16x4: - fn = (HWord)h_generic_calc_ShlN16x4; - second_is_UInt = True; - break; - case Iop_ShlN8x8: - fn = (HWord)h_generic_calc_ShlN8x8; - second_is_UInt = True; - break; - case Iop_ShrN32x2: - fn = (HWord)h_generic_calc_ShrN32x2; - second_is_UInt = True; - break; - case Iop_ShrN16x4: - fn = (HWord)h_generic_calc_ShrN16x4; - second_is_UInt = True; - break; - case Iop_SarN32x2: - fn = (HWord)h_generic_calc_SarN32x2; - second_is_UInt = True; - break; - case Iop_SarN16x4: - fn = (HWord)h_generic_calc_SarN16x4; - second_is_UInt = True; - break; - case Iop_SarN8x8: - fn = (HWord)h_generic_calc_SarN8x8; - second_is_UInt = True; - break; - - default: - fn = (HWord)0; break; - } - if (fn != (HWord)0) { - /* Note: the following assumes all helpers are of signature - ULong fn ( ULong, ULong ), and they are - not marked as regparm functions. - */ - HReg dst = newVRegI(env); - HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1); - HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2); - if (second_is_UInt) - addInstr(env, AMD64Instr_MovxLQ(False, argR, argR)); - addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) ); - addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) ); - addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2, - mk_RetLoc_simple(RLPri_Int) )); - addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst)); - return dst; - } - - /* Handle misc other ops. */ - + /* Handle misc other scalar ops. */ if (e->Iex.Binop.op == Iop_Max32U) { HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1); HReg dst = newVRegI(env); @@ -1380,6 +1219,221 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e ) return dst; } + /* Deal with 64-bit SIMD binary ops. For the most part these are doable + by using the equivalent 128-bit operation and ignoring the upper half + of the result. */ + AMD64SseOp op = Asse_INVALID; + Bool arg1isEReg = False; + Bool preShift32R = False; + switch (e->Iex.Binop.op) { + // The following 3 could be done with 128 bit insns too, but + // first require the inputs to be reformatted. 
+ //case Iop_QNarrowBin32Sto16Sx4: + //op = Asse_PACKSSD; arg1isEReg = True; break; + //case Iop_QNarrowBin16Sto8Sx8: + //op = Asse_PACKSSW; arg1isEReg = True; break; + //case Iop_QNarrowBin16Sto8Ux8: + //op = Asse_PACKUSW; arg1isEReg = True; break; + + case Iop_InterleaveHI8x8: + op = Asse_UNPCKLB; arg1isEReg = True; preShift32R = True; + break; + case Iop_InterleaveHI16x4: + op = Asse_UNPCKLW; arg1isEReg = True; preShift32R = True; + break; + case Iop_InterleaveHI32x2: + op = Asse_UNPCKLD; arg1isEReg = True; preShift32R = True; + break; + case Iop_InterleaveLO8x8: + op = Asse_UNPCKLB; arg1isEReg = True; + break; + case Iop_InterleaveLO16x4: + op = Asse_UNPCKLW; arg1isEReg = True; + break; + case Iop_InterleaveLO32x2: + op = Asse_UNPCKLD; arg1isEReg = True; + break; + + case Iop_Add8x8: op = Asse_ADD8; break; + case Iop_Add16x4: op = Asse_ADD16; break; + case Iop_Add32x2: op = Asse_ADD32; break; + case Iop_QAdd8Sx8: op = Asse_QADD8S; break; + case Iop_QAdd16Sx4: op = Asse_QADD16S; break; + case Iop_QAdd8Ux8: op = Asse_QADD8U; break; + case Iop_QAdd16Ux4: op = Asse_QADD16U; break; + case Iop_Avg8Ux8: op = Asse_AVG8U; break; + case Iop_Avg16Ux4: op = Asse_AVG16U; break; + case Iop_CmpEQ8x8: op = Asse_CMPEQ8; break; + case Iop_CmpEQ16x4: op = Asse_CMPEQ16; break; + case Iop_CmpEQ32x2: op = Asse_CMPEQ32; break; + case Iop_CmpGT8Sx8: op = Asse_CMPGT8S; break; + case Iop_CmpGT16Sx4: op = Asse_CMPGT16S; break; + case Iop_CmpGT32Sx2: op = Asse_CMPGT32S; break; + case Iop_Max16Sx4: op = Asse_MAX16S; break; + case Iop_Max8Ux8: op = Asse_MAX8U; break; + case Iop_Min16Sx4: op = Asse_MIN16S; break; + case Iop_Min8Ux8: op = Asse_MIN8U; break; + case Iop_MulHi16Ux4: op = Asse_MULHI16U; break; + case Iop_MulHi16Sx4: op = Asse_MULHI16S; break; + case Iop_Mul16x4: op = Asse_MUL16; break; + case Iop_Sub8x8: op = Asse_SUB8; break; + case Iop_Sub16x4: op = Asse_SUB16; break; + case Iop_Sub32x2: op = Asse_SUB32; break; + case Iop_QSub8Sx8: op = Asse_QSUB8S; break; + case Iop_QSub16Sx4: op = Asse_QSUB16S; break; + case Iop_QSub8Ux8: op = Asse_QSUB8U; break; + case Iop_QSub16Ux4: op = Asse_QSUB16U; break; + default: break; + } + if (op != Asse_INVALID) { + /* This isn't pretty, but .. move each arg to the low half of an XMM + register, do the operation on the whole register, and move the + result back to an integer register. 
*/ + const IRExpr* arg1 = e->Iex.Binop.arg1; + const IRExpr* arg2 = e->Iex.Binop.arg2; + vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64); + vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64); + HReg iarg1 = iselIntExpr_R(env, arg1); + HReg iarg2 = iselIntExpr_R(env, arg2); + HReg varg1 = newVRegV(env); + HReg varg2 = newVRegV(env); + HReg idst = newVRegI(env); + addInstr(env, AMD64Instr_SseMOVQ(iarg1, varg1, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseMOVQ(iarg2, varg2, True/*toXMM*/)); + if (arg1isEReg) { + if (preShift32R) { + addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg1)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg2)); + } + addInstr(env, AMD64Instr_SseReRg(op, varg1, varg2)); + addInstr(env, AMD64Instr_SseMOVQ(idst, varg2, False/*!toXMM*/)); + } else { + vassert(!preShift32R); + addInstr(env, AMD64Instr_SseReRg(op, varg2, varg1)); + addInstr(env, AMD64Instr_SseMOVQ(idst, varg1, False/*!toXMM*/)); + } + return idst; + } + + UInt laneBits = 0; + op = Asse_INVALID; + switch (e->Iex.Binop.op) { + case Iop_ShlN16x4: laneBits = 16; op = Asse_SHL16; break; + case Iop_ShlN32x2: laneBits = 32; op = Asse_SHL32; break; + case Iop_SarN16x4: laneBits = 16; op = Asse_SAR16; break; + case Iop_SarN32x2: laneBits = 32; op = Asse_SAR32; break; + case Iop_ShrN16x4: laneBits = 16; op = Asse_SHR16; break; + case Iop_ShrN32x2: laneBits = 32; op = Asse_SHR32; break; + default: break; + } + if (op != Asse_INVALID) { + const IRExpr* arg1 = e->Iex.Binop.arg1; + const IRExpr* arg2 = e->Iex.Binop.arg2; + vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64); + vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I8); + HReg igreg = iselIntExpr_R(env, arg1); + HReg vgreg = newVRegV(env); + HReg idst = newVRegI(env); + addInstr(env, AMD64Instr_SseMOVQ(igreg, vgreg, True/*toXMM*/)); + /* If it's a shift by an in-range immediate, generate a single + instruction. */ + if (arg2->tag == Iex_Const) { + IRConst* c = arg2->Iex.Const.con; + vassert(c->tag == Ico_U8); + UInt shift = c->Ico.U8; + if (shift < laneBits) { + addInstr(env, AMD64Instr_SseShiftN(op, shift, vgreg)); + addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/)); + return idst; + } + } + /* Otherwise we have to do it the longwinded way. 
*/ + HReg ishift = iselIntExpr_R(env, arg2); + HReg vshift = newVRegV(env); + addInstr(env, AMD64Instr_SseMOVQ(ishift, vshift, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseReRg(op, vshift, vgreg)); + addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/)); + return idst; + } + + if (e->Iex.Binop.op == Iop_Mul32x2) { + const IRExpr* arg1 = e->Iex.Binop.arg1; + const IRExpr* arg2 = e->Iex.Binop.arg2; + vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64); + vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64); + HReg s1 = iselIntExpr_R(env, arg1); + HReg s2 = iselIntExpr_R(env, arg2); + HReg resLo = newVRegI(env); + // resLo = (s1 *64 s2) & 0xFFFF'FFFF + addInstr(env, mk_iMOVsd_RR(s1, resLo)); + addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(s2), resLo)); + addInstr(env, AMD64Instr_MovxLQ(False, resLo, resLo)); + + // resHi = ((s1 >>u 32) *64 (s2 >>u 32)) << 32; + HReg resHi = newVRegI(env); + addInstr(env, mk_iMOVsd_RR(s1, resHi)); + addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, resHi)); + HReg tmp = newVRegI(env); + addInstr(env, mk_iMOVsd_RR(s2, tmp)); + addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, tmp)); + addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(tmp), resHi)); + addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, resHi)); + + // final result = resHi | resLo + addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(resHi), resLo)); + return resLo; + } + + // A few remaining SIMD64 ops require helper functions, at least for + // now. + Bool second_is_UInt = False; + HWord fn = 0; + switch (e->Iex.Binop.op) { + case Iop_CatOddLanes16x4: + fn = (HWord)h_generic_calc_CatOddLanes16x4; break; + case Iop_CatEvenLanes16x4: + fn = (HWord)h_generic_calc_CatEvenLanes16x4; break; + case Iop_PermOrZero8x8: + fn = (HWord)h_generic_calc_PermOrZero8x8; break; + + case Iop_QNarrowBin32Sto16Sx4: + fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break; + case Iop_QNarrowBin16Sto8Sx8: + fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break; + case Iop_QNarrowBin16Sto8Ux8: + fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break; + + case Iop_NarrowBin16to8x8: + fn = (HWord)h_generic_calc_NarrowBin16to8x8; break; + case Iop_NarrowBin32to16x4: + fn = (HWord)h_generic_calc_NarrowBin32to16x4; break; + + case Iop_SarN8x8: + fn = (HWord)h_generic_calc_SarN8x8; + second_is_UInt = True; + break; + + default: + fn = (HWord)0; break; + } + if (fn != (HWord)0) { + /* Note: the following assumes all helpers are of signature + ULong fn ( ULong, ULong ), and they are + not marked as regparm functions. 
+ */ + HReg dst = newVRegI(env); + HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1); + HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2); + if (second_is_UInt) + addInstr(env, AMD64Instr_MovxLQ(False, argR, argR)); + addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) ); + addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) ); + addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2, + mk_RetLoc_simple(RLPri_Int) )); + addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst)); + return dst; + } + break; } @@ -1710,7 +1764,7 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e ) */ HReg dst = newVRegI(env); HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg); - fn = (HWord)h_generic_calc_GetMSBs8x8; + HWord fn = (HWord)h_generic_calc_GetMSBs8x8; addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) ); addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1, mk_RetLoc_simple(RLPri_Int) )); @@ -1730,7 +1784,7 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e ) HReg dst = newVRegI(env); HReg vec = iselVecExpr(env, e->Iex.Unop.arg); HReg rsp = hregAMD64_RSP(); - fn = (HWord)h_generic_calc_GetMSBs8x16; + HWord fn = (HWord)h_generic_calc_GetMSBs8x16; AMD64AMode* m8_rsp = AMD64AMode_IR( -8, rsp); AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); addInstr(env, AMD64Instr_SseLdSt(False/*store*/, @@ -1759,6 +1813,7 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e ) } /* Deal with unary 64-bit SIMD ops. */ + HWord fn = 0; switch (e->Iex.Unop.op) { case Iop_CmpNEZ32x2: fn = (HWord)h_generic_calc_CmpNEZ32x2; break; |
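
The low-half-of-XMM idea is easy to demonstrate outside of VEX. Below is a minimal sketch using SSE2 intrinsics rather than VEX's instruction constructors -- the function name and test values are illustrative only, and an x86-64 host is assumed -- mirroring the movq / paddw / movq sequence the new isel emits for Iop_Add16x4:

    /* A sketch, not VEX code: do an MMX-style Add16x4 with SSE2 on the
       low half of an XMM register, as the new back end does. */
    #include <emmintrin.h>   /* SSE2 intrinsics */
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t add16x4_sse(uint64_t a, uint64_t b)
    {
       __m128i va = _mm_cvtsi64_si128((int64_t)a); /* movq: a in xmm[63:0], upper half zeroed */
       __m128i vb = _mm_cvtsi64_si128((int64_t)b);
       __m128i r  = _mm_add_epi16(va, vb);         /* paddw on all 8 lanes; upper 4 are zeros */
       return (uint64_t)_mm_cvtsi128_si64(r);      /* movq: keep only the low 64 bits */
    }

    int main(void)
    {
       uint64_t a = 0x0004000300020001ULL;  /* 16-bit lanes 4,3,2,1 */
       uint64_t b = 0x0008000700060005ULL;  /* 16-bit lanes 8,7,6,5 */
       printf("%016llx\n", (unsigned long long)add16x4_sse(a, b));
       /* prints 000c000a00080006, i.e. lanes 12,10,8,6 */
       return 0;
    }

Because a lane-wise 16-bit add never carries across lane boundaries, performing it on the full 128-bit register and keeping only the low 64 bits yields exactly the MMX-style result; that is what makes the single-instruction replacement for the old C helpers valid.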
From: Julian S. <se...@so...> - 2018-12-22 17:05:53
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=b17d5ffdb844cf081c86d7df9489f61b4392ca47 commit b17d5ffdb844cf081c86d7df9489f61b4392ca47 Author: Julian Seward <js...@ac...> Date: Sat Dec 22 18:04:42 2018 +0100 amd64 back end: generate better code for 2x64<-->V128 and 4x64<-->V256 transfers .. .. by adding support for MOVQ xmm/ireg and using that to implement 64HLtoV128, 4x64toV256 and their inverses. This reduces the number of instructions, removes the use of memory as an intermediary, and avoids store-forwarding stalls. Diff: --- VEX/priv/host_amd64_defs.c | 49 ++++++++++++++ VEX/priv/host_amd64_defs.h | 7 ++ VEX/priv/host_amd64_isel.c | 159 +++++++++++++++++++++++++++++---------------- 3 files changed, 158 insertions(+), 57 deletions(-) diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c index e3a2c72..8e55197 100644 --- a/VEX/priv/host_amd64_defs.c +++ b/VEX/priv/host_amd64_defs.c @@ -1020,6 +1020,16 @@ AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp op, i->Ain.SseShiftN.dst = dst; return i; } +AMD64Instr* AMD64Instr_SseMOVQ ( HReg gpr, HReg xmm, Bool toXMM ) { + AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr)); + i->tag = Ain_SseMOVQ; + i->Ain.SseMOVQ.gpr = gpr; + i->Ain.SseMOVQ.xmm = xmm; + i->Ain.SseMOVQ.toXMM = toXMM; + vassert(hregClass(gpr) == HRcInt64); + vassert(hregClass(xmm) == HRcVec128); + return i; +} //uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad, //uu HReg reg, AMD64AMode* addr ) { //uu AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr)); @@ -1377,6 +1387,18 @@ void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 ) i->Ain.SseShiftN.shiftBits); ppHRegAMD64(i->Ain.SseShiftN.dst); return; + case Ain_SseMOVQ: + vex_printf("movq "); + if (i->Ain.SseMOVQ.toXMM) { + ppHRegAMD64(i->Ain.SseMOVQ.gpr); + vex_printf(","); + ppHRegAMD64(i->Ain.SseMOVQ.xmm); + } else { + ppHRegAMD64(i->Ain.SseMOVQ.xmm); + vex_printf(","); + ppHRegAMD64(i->Ain.SseMOVQ.gpr); + }; + return; //uu case Ain_AvxLdSt: //uu vex_printf("vmovups "); //uu if (i->Ain.AvxLdSt.isLoad) { @@ -1714,6 +1736,12 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 ) case Ain_SseShiftN: addHRegUse(u, HRmModify, i->Ain.SseShiftN.dst); return; + case Ain_SseMOVQ: + addHRegUse(u, i->Ain.SseMOVQ.toXMM ? HRmRead : HRmWrite, + i->Ain.SseMOVQ.gpr); + addHRegUse(u, i->Ain.SseMOVQ.toXMM ? HRmWrite : HRmRead, + i->Ain.SseMOVQ.xmm); + return; //uu case Ain_AvxLdSt: //uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr); //uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead, @@ -1932,6 +1960,10 @@ void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 ) case Ain_SseShiftN: mapReg(m, &i->Ain.SseShiftN.dst); return; + case Ain_SseMOVQ: + mapReg(m, &i->Ain.SseMOVQ.gpr); + mapReg(m, &i->Ain.SseMOVQ.xmm); + return; //uu case Ain_AvxLdSt: //uu mapReg(m, &i->Ain.AvxLdSt.reg); //uu mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr); @@ -2301,6 +2333,11 @@ static inline UChar clearWBit ( UChar rex ) return rex & ~(1<<3); } +static inline UChar setWBit ( UChar rex ) +{ + return rex | (1<<3); +} + /* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */ inline static UChar rexAMode_M__wrk ( UInt gregEnc3210, AMD64AMode* am ) @@ -3914,6 +3951,18 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc, goto done; } + case Ain_SseMOVQ: { + Bool toXMM = i->Ain.SseMOVQ.toXMM; + HReg gpr = i->Ain.SseMOVQ.gpr; + HReg xmm = i->Ain.SseMOVQ.xmm; + *p++ = 0x66; + *p++ = setWBit( rexAMode_R_enc_enc( vregEnc3210(xmm), iregEnc3210(gpr)) ); + *p++ = 0x0F; + *p++ = toXMM ? 
0x6E : 0x7E; + p = doAMode_R_enc_enc( p, vregEnc3210(xmm), iregEnc3210(gpr) ); + goto done; + } + //uu case Ain_AvxLdSt: { //uu UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg), //uu i->Ain.AvxLdSt.addr ); diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h index c45229f..64bd810 100644 --- a/VEX/priv/host_amd64_defs.h +++ b/VEX/priv/host_amd64_defs.h @@ -404,6 +404,7 @@ typedef Ain_SseCMov, /* SSE conditional move */ Ain_SseShuf, /* SSE2 shuffle (pshufd) */ Ain_SseShiftN, /* SSE2 shift by immediate */ + Ain_SseMOVQ, /* SSE2 moves of xmm[63:0] to/from GPR */ //uu Ain_AvxLdSt, /* AVX load/store 256 bits, //uu no alignment constraints */ //uu Ain_AvxReRg, /* AVX binary general reg-reg, Re, Rg */ @@ -704,6 +705,11 @@ typedef UInt shiftBits; HReg dst; } SseShiftN; + struct { + HReg gpr; + HReg xmm; + Bool toXMM; // when moving to xmm, xmm[127:64] is zeroed out + } SseMOVQ; //uu struct { //uu Bool isLoad; //uu HReg reg; @@ -784,6 +790,7 @@ extern AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode, HReg src, HReg dst ); extern AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ); extern AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp, UInt shiftBits, HReg dst ); +extern AMD64Instr* AMD64Instr_SseMOVQ ( HReg gpr, HReg xmm, Bool toXMM ); //uu extern AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad, HReg, AMD64AMode* ); //uu extern AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp, HReg, HReg ); extern AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter, diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c index 486901c..e67edc5 100644 --- a/VEX/priv/host_amd64_isel.c +++ b/VEX/priv/host_amd64_isel.c @@ -91,7 +91,7 @@ static IRExpr* bind ( Int binder ) return IRExpr_Binder(binder); } -static Bool isZeroU8 ( IRExpr* e ) +static Bool isZeroU8 ( const IRExpr* e ) { return e->tag == Iex_Const && e->Iex.Const.con->tag == Ico_U8 @@ -291,20 +291,32 @@ static Bool fitsIn32Bits ( ULong x ) /* Is this a 64-bit zero expression? */ -static Bool isZeroU64 ( IRExpr* e ) +static Bool isZeroU64 ( const IRExpr* e ) { return e->tag == Iex_Const && e->Iex.Const.con->tag == Ico_U64 && e->Iex.Const.con->Ico.U64 == 0ULL; } -static Bool isZeroU32 ( IRExpr* e ) +static Bool isZeroU32 ( const IRExpr* e ) { return e->tag == Iex_Const && e->Iex.Const.con->tag == Ico_U32 && e->Iex.Const.con->Ico.U32 == 0; } +/* Are both args atoms and the same? This is copy of eqIRAtom + that omits the assertions that the args are indeed atoms. */ + +static Bool areAtomsAndEqual ( const IRExpr* a1, const IRExpr* a2 ) +{ + if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp) + return toBool(a1->Iex.RdTmp.tmp == a2->Iex.RdTmp.tmp); + if (a1->tag == Iex_Const && a2->tag == Iex_Const) + return eqIRConst(a1->Iex.Const.con, a2->Iex.Const.con); + return False; +} + /* Make a int reg-reg move. */ static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst ) @@ -1609,44 +1621,47 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e ) } /* V128{HI}to64 */ - case Iop_V128HIto64: case Iop_V128to64: { HReg dst = newVRegI(env); - Int off = e->Iex.Unop.op==Iop_V128HIto64 ? 
-8 : -16; - HReg rsp = hregAMD64_RSP(); HReg vec = iselVecExpr(env, e->Iex.Unop.arg); - AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); - AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp); - addInstr(env, AMD64Instr_SseLdSt(False/*store*/, - 16, vec, m16_rsp)); - addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, - AMD64RMI_Mem(off_rsp), dst )); + addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/)); + return dst; + } + case Iop_V128HIto64: { + HReg dst = newVRegI(env); + HReg vec = iselVecExpr(env, e->Iex.Unop.arg); + HReg vec2 = newVRegV(env); + addInstr(env, mk_vMOVsd_RR(vec, vec2)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2)); + addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/)); return dst; } + /* V256to64_{3,2,1,0} */ case Iop_V256to64_0: case Iop_V256to64_1: case Iop_V256to64_2: case Iop_V256to64_3: { HReg vHi, vLo, vec; iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg); /* Do the first part of the selection by deciding which of - the 128 bit registers do look at, and second part using + the 128 bit registers to look at, and second part using the same scheme as for V128{HI}to64 above. */ - Int off = 0; + Bool low64of128 = True; switch (e->Iex.Unop.op) { - case Iop_V256to64_0: vec = vLo; off = -16; break; - case Iop_V256to64_1: vec = vLo; off = -8; break; - case Iop_V256to64_2: vec = vHi; off = -16; break; - case Iop_V256to64_3: vec = vHi; off = -8; break; + case Iop_V256to64_0: vec = vLo; low64of128 = True; break; + case Iop_V256to64_1: vec = vLo; low64of128 = False; break; + case Iop_V256to64_2: vec = vHi; low64of128 = True; break; + case Iop_V256to64_3: vec = vHi; low64of128 = False; break; default: vassert(0); } - HReg dst = newVRegI(env); - HReg rsp = hregAMD64_RSP(); - AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); - AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp); - addInstr(env, AMD64Instr_SseLdSt(False/*store*/, - 16, vec, m16_rsp)); - addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, - AMD64RMI_Mem(off_rsp), dst )); + HReg dst = newVRegI(env); + if (low64of128) { + addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/)); + } else { + HReg vec2 = newVRegV(env); + addInstr(env, mk_vMOVsd_RR(vec, vec2)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2)); + addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/)); + } return dst; } @@ -3355,16 +3370,26 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e ) } case Iop_64HLtoV128: { - HReg rsp = hregAMD64_RSP(); - AMD64AMode* m8_rsp = AMD64AMode_IR(-8, rsp); - AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); - AMD64RI* qHi = iselIntExpr_RI(env, e->Iex.Binop.arg1); - AMD64RI* qLo = iselIntExpr_RI(env, e->Iex.Binop.arg2); - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qHi, m8_rsp)); - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qLo, m16_rsp)); - HReg dst = newVRegV(env); - /* One store-forwarding stall coming up, oh well :-( */ - addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, m16_rsp)); + const IRExpr* arg1 = e->Iex.Binop.arg1; + const IRExpr* arg2 = e->Iex.Binop.arg2; + HReg dst = newVRegV(env); + HReg tmp = newVRegV(env); + HReg qHi = iselIntExpr_R(env, arg1); + // If the args are trivially the same (tmp or const), use the same + // source register for both, and only one movq since those are + // (relatively) expensive. 
+ if (areAtomsAndEqual(arg1, arg2)) { + addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/)); + addInstr(env, mk_vMOVsd_RR(dst, tmp)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst)); + addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst)); + } else { + HReg qLo = iselIntExpr_R(env, arg2); + addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst)); + addInstr(env, AMD64Instr_SseMOVQ(qLo, tmp, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst)); + } return dst; } @@ -4071,6 +4096,9 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, } case Iop_V128HLtoV256: { + // Curiously, there doesn't seem to be any benefit to be had here by + // checking whether arg1 and arg2 are the same, in the style of how + // (eg) 64HLtoV128 is handled elsewhere in this file. *rHi = iselVecExpr(env, e->Iex.Binop.arg1); *rLo = iselVecExpr(env, e->Iex.Binop.arg2); return; @@ -4313,27 +4341,44 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) { - HReg rsp = hregAMD64_RSP(); - HReg vHi = newVRegV(env); - HReg vLo = newVRegV(env); - AMD64AMode* m8_rsp = AMD64AMode_IR(-8, rsp); - AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); - /* arg1 is the most significant (Q3), arg4 the least (Q0) */ - /* Get all the args into regs, before messing with the stack. */ - AMD64RI* q3 = iselIntExpr_RI(env, e->Iex.Qop.details->arg1); - AMD64RI* q2 = iselIntExpr_RI(env, e->Iex.Qop.details->arg2); - AMD64RI* q1 = iselIntExpr_RI(env, e->Iex.Qop.details->arg3); - AMD64RI* q0 = iselIntExpr_RI(env, e->Iex.Qop.details->arg4); - /* less significant lane (Q2) at the lower address (-16(rsp)) */ - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q3, m8_rsp)); - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q2, m16_rsp)); - addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, m16_rsp)); - /* and then the lower half .. */ - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q1, m8_rsp)); - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q0, m16_rsp)); - addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, m16_rsp)); - *rHi = vHi; - *rLo = vLo; + const IRExpr* arg1 = e->Iex.Qop.details->arg1; + const IRExpr* arg2 = e->Iex.Qop.details->arg2; + const IRExpr* arg3 = e->Iex.Qop.details->arg3; + const IRExpr* arg4 = e->Iex.Qop.details->arg4; + // If the args are trivially the same (tmp or const), use the same + // source register for all four, and only one movq since those are + // (relatively) expensive. 
+ if (areAtomsAndEqual(arg1, arg2) + && areAtomsAndEqual(arg1, arg3) && areAtomsAndEqual(arg1, arg4)) { + HReg q3 = iselIntExpr_R(env, e->Iex.Qop.details->arg1); + HReg tmp = newVRegV(env); + HReg dst = newVRegV(env); + addInstr(env, AMD64Instr_SseMOVQ(q3, dst, True/*toXMM*/)); + addInstr(env, mk_vMOVsd_RR(dst, tmp)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst)); + addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst)); + *rHi = dst; + *rLo = dst; + } else { + /* arg1 is the most significant (Q3), arg4 the least (Q0) */ + HReg q3 = iselIntExpr_R(env, arg1); + HReg q2 = iselIntExpr_R(env, arg2); + HReg q1 = iselIntExpr_R(env, arg3); + HReg q0 = iselIntExpr_R(env, arg4); + HReg tmp = newVRegV(env); + HReg dstHi = newVRegV(env); + HReg dstLo = newVRegV(env); + addInstr(env, AMD64Instr_SseMOVQ(q3, dstHi, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi)); + addInstr(env, AMD64Instr_SseMOVQ(q2, tmp, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstHi)); + addInstr(env, AMD64Instr_SseMOVQ(q1, dstLo, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo)); + addInstr(env, AMD64Instr_SseMOVQ(q0, tmp, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstLo)); + *rHi = dstHi; + *rLo = dstLo; + } return; } |
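
The register-only construction of a V128 from two 64-bit halves is a four-instruction sequence: movq, pslldq, movq, por. A sketch of the same sequence with SSE2 intrinsics follows; the helper name and test values are illustrative, and an x86-64 host is assumed:

    /* A sketch, not VEX code: build a 128-bit value from two 64-bit
       GPR values entirely in registers, matching the sequence the
       patch generates for Iop_64HLtoV128. */
    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    static __m128i v128_from_hi_lo(uint64_t hi, uint64_t lo)
    {
       __m128i vhi = _mm_cvtsi64_si128((int64_t)hi); /* movq: hi in bits 63:0 */
       __m128i vlo = _mm_cvtsi64_si128((int64_t)lo); /* movq: lo in bits 63:0 */
       vhi = _mm_slli_si128(vhi, 8);                 /* pslldq $8: hi to bits 127:64 */
       return _mm_or_si128(vhi, vlo);                /* por: merge the halves */
    }

    int main(void)
    {
       uint64_t out[2];
       _mm_storeu_si128((__m128i*)out,
                        v128_from_hi_lo(0x1111222233334444ULL, 0x5555666677778888ULL));
       printf("hi=%016llx lo=%016llx\n",
              (unsigned long long)out[1], (unsigned long long)out[0]);
       return 0;
    }

The sequence this replaces stored both 64-bit halves to the stack and reloaded them as one 16-byte load -- exactly the pattern that provokes a store-forwarding stall, since the load is wider than either of the stores feeding it.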
From: Julian S. <se...@so...> - 2018-12-22 15:15:31
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=dda0d80f3db1632b204b522a1dbb009490265b0e commit dda0d80f3db1632b204b522a1dbb009490265b0e Author: Julian Seward <js...@ac...> Date: Sat Dec 22 16:11:39 2018 +0100 amd64 pipeline: improve performance of cvtdq2ps and cvtps2dq (128 and 256 bit versions) .. .. by giving them their own vector IROps rather than doing each lane individually. Diff: --- VEX/priv/guest_amd64_toIR.c | 80 ++-------------------------------------- VEX/priv/host_amd64_defs.c | 14 ++++++- VEX/priv/host_amd64_defs.h | 3 ++ VEX/priv/host_amd64_isel.c | 29 +++++++++++++++ VEX/priv/ir_defs.c | 11 ++++++ VEX/pub/libvex_ir.h | 15 ++++++-- memcheck/mc_translate.c | 28 ++++++++++++-- memcheck/tests/vbit-test/irops.c | 4 ++ 8 files changed, 100 insertions(+), 84 deletions(-) diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c index 2451a29..fea0eca 100644 --- a/VEX/priv/guest_amd64_toIR.c +++ b/VEX/priv/guest_amd64_toIR.c @@ -10671,7 +10671,6 @@ static Long dis_CVTxPS2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx, IRTemp argV = newTemp(Ity_V128); IRTemp rmode = newTemp(Ity_I32); UInt rG = gregOfRexRM(pfx,modrm); - IRTemp t0, t1, t2, t3; if (epartIsReg(modrm)) { UInt rE = eregOfRexRM(pfx,modrm); @@ -10689,21 +10688,7 @@ static Long dis_CVTxPS2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx, assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO) : get_sse_roundingmode() ); - t0 = t1 = t2 = t3 = IRTemp_INVALID; - breakupV128to32s( argV, &t3, &t2, &t1, &t0 ); - /* This is less than ideal. If it turns out to be a performance - bottleneck it can be improved. */ -# define CVT(_t) \ - binop( Iop_F64toI32S, \ - mkexpr(rmode), \ - unop( Iop_F32toF64, \ - unop( Iop_ReinterpI32asF32, mkexpr(_t))) ) - - putXMMRegLane32( rG, 3, CVT(t3) ); - putXMMRegLane32( rG, 2, CVT(t2) ); - putXMMRegLane32( rG, 1, CVT(t1) ); - putXMMRegLane32( rG, 0, CVT(t0) ); -# undef CVT + putXMMReg( rG, binop(Iop_F32toI32Sx4, mkexpr(rmode), mkexpr(argV)) ); if (isAvx) putYMMRegLane128( rG, 1, mkV128(0) ); @@ -10721,7 +10706,6 @@ static Long dis_CVTxPS2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx, IRTemp argV = newTemp(Ity_V256); IRTemp rmode = newTemp(Ity_I32); UInt rG = gregOfRexRM(pfx,modrm); - IRTemp t0, t1, t2, t3, t4, t5, t6, t7; if (epartIsReg(modrm)) { UInt rE = eregOfRexRM(pfx,modrm); @@ -10739,26 +10723,7 @@ static Long dis_CVTxPS2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx, assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO) : get_sse_roundingmode() ); - t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = IRTemp_INVALID; - breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 ); - /* This is less than ideal. If it turns out to be a performance - bottleneck it can be improved. 
*/ -# define CVT(_t) \ - binop( Iop_F64toI32S, \ - mkexpr(rmode), \ - unop( Iop_F32toF64, \ - unop( Iop_ReinterpI32asF32, mkexpr(_t))) ) - - putYMMRegLane32( rG, 7, CVT(t7) ); - putYMMRegLane32( rG, 6, CVT(t6) ); - putYMMRegLane32( rG, 5, CVT(t5) ); - putYMMRegLane32( rG, 4, CVT(t4) ); - putYMMRegLane32( rG, 3, CVT(t3) ); - putYMMRegLane32( rG, 2, CVT(t2) ); - putYMMRegLane32( rG, 1, CVT(t1) ); - putYMMRegLane32( rG, 0, CVT(t0) ); -# undef CVT - + putYMMReg( rG, binop(Iop_F32toI32Sx8, mkexpr(rmode), mkexpr(argV)) ); return delta; } @@ -10882,7 +10847,6 @@ static Long dis_CVTDQ2PS_128 ( const VexAbiInfo* vbi, Prefix pfx, IRTemp argV = newTemp(Ity_V128); IRTemp rmode = newTemp(Ity_I32); UInt rG = gregOfRexRM(pfx,modrm); - IRTemp t0, t1, t2, t3; if (epartIsReg(modrm)) { UInt rE = eregOfRexRM(pfx,modrm); @@ -10899,21 +10863,8 @@ static Long dis_CVTDQ2PS_128 ( const VexAbiInfo* vbi, Prefix pfx, } assign( rmode, get_sse_roundingmode() ); - t0 = IRTemp_INVALID; - t1 = IRTemp_INVALID; - t2 = IRTemp_INVALID; - t3 = IRTemp_INVALID; - breakupV128to32s( argV, &t3, &t2, &t1, &t0 ); + putXMMReg(rG, binop(Iop_I32StoF32x4, mkexpr(rmode), mkexpr(argV))); -# define CVT(_t) binop( Iop_F64toF32, \ - mkexpr(rmode), \ - unop(Iop_I32StoF64,mkexpr(_t))) - - putXMMRegLane32F( rG, 3, CVT(t3) ); - putXMMRegLane32F( rG, 2, CVT(t2) ); - putXMMRegLane32F( rG, 1, CVT(t1) ); - putXMMRegLane32F( rG, 0, CVT(t0) ); -# undef CVT if (isAvx) putYMMRegLane128( rG, 1, mkV128(0) ); @@ -10930,7 +10881,6 @@ static Long dis_CVTDQ2PS_256 ( const VexAbiInfo* vbi, Prefix pfx, IRTemp argV = newTemp(Ity_V256); IRTemp rmode = newTemp(Ity_I32); UInt rG = gregOfRexRM(pfx,modrm); - IRTemp t0, t1, t2, t3, t4, t5, t6, t7; if (epartIsReg(modrm)) { UInt rE = eregOfRexRM(pfx,modrm); @@ -10945,29 +10895,7 @@ static Long dis_CVTDQ2PS_256 ( const VexAbiInfo* vbi, Prefix pfx, } assign( rmode, get_sse_roundingmode() ); - t0 = IRTemp_INVALID; - t1 = IRTemp_INVALID; - t2 = IRTemp_INVALID; - t3 = IRTemp_INVALID; - t4 = IRTemp_INVALID; - t5 = IRTemp_INVALID; - t6 = IRTemp_INVALID; - t7 = IRTemp_INVALID; - breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 ); - -# define CVT(_t) binop( Iop_F64toF32, \ - mkexpr(rmode), \ - unop(Iop_I32StoF64,mkexpr(_t))) - - putYMMRegLane32F( rG, 7, CVT(t7) ); - putYMMRegLane32F( rG, 6, CVT(t6) ); - putYMMRegLane32F( rG, 5, CVT(t5) ); - putYMMRegLane32F( rG, 4, CVT(t4) ); - putYMMRegLane32F( rG, 3, CVT(t3) ); - putYMMRegLane32F( rG, 2, CVT(t2) ); - putYMMRegLane32F( rG, 1, CVT(t1) ); - putYMMRegLane32F( rG, 0, CVT(t0) ); -# undef CVT + putYMMReg(rG, binop(Iop_I32StoF32x8, mkexpr(rmode), mkexpr(argV))); return delta; } diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c index 1536d81..e3a2c72 100644 --- a/VEX/priv/host_amd64_defs.c +++ b/VEX/priv/host_amd64_defs.c @@ -530,6 +530,8 @@ const HChar* showAMD64SseOp ( AMD64SseOp op ) { case Asse_RCPF: return "rcp"; case Asse_RSQRTF: return "rsqrt"; case Asse_SQRTF: return "sqrt"; + case Asse_I2F: return "cvtdq2ps."; + case Asse_F2I: return "cvtps2dq."; case Asse_AND: return "and"; case Asse_OR: return "or"; case Asse_XOR: return "xor"; @@ -568,9 +570,11 @@ const HChar* showAMD64SseOp ( AMD64SseOp op ) { case Asse_SHL16: return "psllw"; case Asse_SHL32: return "pslld"; case Asse_SHL64: return "psllq"; + case Asse_SHL128: return "pslldq"; case Asse_SHR16: return "psrlw"; case Asse_SHR32: return "psrld"; case Asse_SHR64: return "psrlq"; + case Asse_SHR128: return "psrldq"; case Asse_SAR16: return "psraw"; case Asse_SAR32: return "psrad"; case 
Asse_PACKSSD: return "packssdw"; @@ -1643,7 +1647,9 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 ) vassert(i->Ain.Sse32Fx4.op != Asse_MOV); unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF || i->Ain.Sse32Fx4.op == Asse_RSQRTF - || i->Ain.Sse32Fx4.op == Asse_SQRTF ); + || i->Ain.Sse32Fx4.op == Asse_SQRTF + || i->Ain.Sse32Fx4.op == Asse_I2F + || i->Ain.Sse32Fx4.op == Asse_F2I ); addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src); addHRegUse(u, unary ? HRmWrite : HRmModify, i->Ain.Sse32Fx4.dst); @@ -3648,6 +3654,10 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc, case Ain_Sse32Fx4: xtra = 0; + switch (i->Ain.Sse32Fx4.op) { + case Asse_F2I: *p++ = 0x66; break; + default: break; + } *p++ = clearWBit( rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32Fx4.dst), vregEnc3210(i->Ain.Sse32Fx4.src) )); @@ -3661,6 +3671,8 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc, case Asse_RCPF: *p++ = 0x53; break; case Asse_RSQRTF: *p++ = 0x52; break; case Asse_SQRTF: *p++ = 0x51; break; + case Asse_I2F: *p++ = 0x5B; break; // cvtdq2ps; no 0x66 pfx + case Asse_F2I: *p++ = 0x5B; break; // cvtps2dq; with 0x66 pfx case Asse_SUBF: *p++ = 0x5C; break; case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break; case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break; diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h index e1715a0..c45229f 100644 --- a/VEX/priv/host_amd64_defs.h +++ b/VEX/priv/host_amd64_defs.h @@ -316,6 +316,9 @@ typedef Asse_CMPEQF, Asse_CMPLTF, Asse_CMPLEF, Asse_CMPUNF, /* Floating point unary */ Asse_RCPF, Asse_RSQRTF, Asse_SQRTF, + /* Floating point conversion */ + Asse_I2F, // i32-signed to float conversion, aka cvtdq2ps in vec form + Asse_F2I, // float to i32-signed conversion, aka cvtps2dq in vec form /* Bitwise */ Asse_AND, Asse_OR, Asse_XOR, Asse_ANDN, Asse_ADD8, Asse_ADD16, Asse_ADD32, Asse_ADD64, diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c index 59fd752..486901c 100644 --- a/VEX/priv/host_amd64_isel.c +++ b/VEX/priv/host_amd64_isel.c @@ -3688,6 +3688,18 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e ) return dst; } + case Iop_I32StoF32x4: + case Iop_F32toI32Sx4: { + HReg arg = iselVecExpr(env, e->Iex.Binop.arg2); + HReg dst = newVRegV(env); + AMD64SseOp mop + = e->Iex.Binop.op == Iop_I32StoF32x4 ? Asse_I2F : Asse_F2I; + set_SSE_rounding_mode(env, e->Iex.Binop.arg1); + addInstr(env, AMD64Instr_Sse32Fx4(mop, arg, dst)); + set_SSE_rounding_default(env); + return dst; + } + default: break; } /* switch (e->Iex.Binop.op) */ @@ -4224,6 +4236,23 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, return; } + case Iop_I32StoF32x8: + case Iop_F32toI32Sx8: { + HReg argHi, argLo; + iselDVecExpr(&argHi, &argLo, env, e->Iex.Binop.arg2); + HReg dstHi = newVRegV(env); + HReg dstLo = newVRegV(env); + AMD64SseOp mop + = e->Iex.Binop.op == Iop_I32StoF32x8 ? 
Asse_I2F : Asse_F2I; + set_SSE_rounding_mode(env, e->Iex.Binop.arg1); + addInstr(env, AMD64Instr_Sse32Fx4(mop, argHi, dstHi)); + addInstr(env, AMD64Instr_Sse32Fx4(mop, argLo, dstLo)); + set_SSE_rounding_default(env); + *rHi = dstHi; + *rLo = dstLo; + return; + } + default: break; } /* switch (e->Iex.Binop.op) */ diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c index ae1c203..93de80f 100644 --- a/VEX/priv/ir_defs.c +++ b/VEX/priv/ir_defs.c @@ -440,6 +440,9 @@ void ppIROp ( IROp op ) case Iop_I32UtoFx4: vex_printf("I32UtoFx4"); return; case Iop_I32StoFx4: vex_printf("I32StoFx4"); return; + case Iop_I32StoF32x4: vex_printf("I32StoF32x4"); return; + case Iop_F32toI32Sx4: vex_printf("F32toI32Sx4"); return; + case Iop_F32toF16x4: vex_printf("F32toF16x4"); return; case Iop_F16toF32x4: vex_printf("F16toF32x4"); return; case Iop_F16toF64x2: vex_printf("F16toF64x2"); return; @@ -1237,6 +1240,8 @@ void ppIROp ( IROp op ) case Iop_Sub32Fx8: vex_printf("Sub32Fx8"); return; case Iop_Mul32Fx8: vex_printf("Mul32Fx8"); return; case Iop_Div32Fx8: vex_printf("Div32Fx8"); return; + case Iop_I32StoF32x8: vex_printf("I32StoF32x8"); return; + case Iop_F32toI32Sx8: vex_printf("F32toI32Sx8"); return; case Iop_AndV256: vex_printf("AndV256"); return; case Iop_OrV256: vex_printf("OrV256"); return; case Iop_XorV256: vex_printf("XorV256"); return; @@ -2990,6 +2995,8 @@ void typeOfPrimop ( IROp op, case Iop_Sqrt64Fx2: case Iop_Sqrt32Fx4: + case Iop_I32StoF32x4: + case Iop_F32toI32Sx4: BINARY(ity_RMode,Ity_V128, Ity_V128); case Iop_64HLtoV128: @@ -3579,6 +3586,10 @@ void typeOfPrimop ( IROp op, case Iop_Perm32x8: BINARY(Ity_V256,Ity_V256, Ity_V256); + case Iop_I32StoF32x8: + case Iop_F32toI32Sx8: + BINARY(ity_RMode,Ity_V256, Ity_V256); + case Iop_V256toV128_1: case Iop_V256toV128_0: UNARY(Ity_V256, Ity_V128); diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index 459d14b..f8ba2c7 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -1387,7 +1387,13 @@ typedef /* Unlike the standard fp conversions, these irops take no rounding mode argument. Instead the irop trailers _R{M,P,N,Z} indicate the mode: {-inf, +inf, nearest, zero} respectively. */ + + // FIXME These carry no rounding mode Iop_I32UtoFx4, Iop_I32StoFx4, /* I32x4 -> F32x4 */ + + Iop_I32StoF32x4, /* IRRoundingMode(I32) x V128 -> V128 */ + Iop_F32toI32Sx4, /* IRRoundingMode(I32) x V128 -> V128 */ + Iop_FtoI32Ux4_RZ, Iop_FtoI32Sx4_RZ, /* F32x4 -> I32x4 */ Iop_QFtoI32Ux4_RZ, Iop_QFtoI32Sx4_RZ, /* F32x4 -> I32x4 (saturating) */ Iop_RoundF32x4_RM, Iop_RoundF32x4_RP, /* round to fp integer */ @@ -1400,12 +1406,12 @@ typedef /* --- Single to/from half conversion --- */ /* FIXME: what kind of rounding in F32x4 -> F16x4 case? */ + // FIXME these carry no rounding mode Iop_F32toF16x4, Iop_F16toF32x4, /* F32x4 <-> F16x4 */ - - /* -- Double to/from half conversion -- */ - Iop_F64toF16x2, Iop_F16toF64x2, + Iop_F64toF16x2, // FIXME this carries no rounding mode (?) 
+ Iop_F16toF64x2, /* Values from two registers converted in smaller type and put in one IRRoundingMode(I32) x (F32x4 | F32x4) -> Q16x8 */ @@ -1957,6 +1963,9 @@ typedef Iop_Add64Fx4, Iop_Sub64Fx4, Iop_Mul64Fx4, Iop_Div64Fx4, Iop_Add32Fx8, Iop_Sub32Fx8, Iop_Mul32Fx8, Iop_Div32Fx8, + Iop_I32StoF32x8, /* IRRoundingMode(I32) x V256 -> V256 */ + Iop_F32toI32Sx8, /* IRRoundingMode(I32) x V256 -> V256 */ + Iop_Sqrt32Fx8, Iop_Sqrt64Fx4, Iop_RSqrtEst32Fx8, diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c index 6e449e2..c6ac3a5 100644 --- a/memcheck/mc_translate.c +++ b/memcheck/mc_translate.c @@ -2810,7 +2810,7 @@ IRAtom* unary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX ) static IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX ) { - /* Same scheme as unary32Fx4_w_rm. */ + /* Same scheme as binaryFx4_w_rm. */ IRAtom* t1 = unary32Fx4(mce, vatomX); // PCast the RM, and widen it to 128 bits IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM); @@ -2819,6 +2819,20 @@ IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX ) return t1; } +/* --- ... and ... 32Fx8 versions of the same --- */ + +static +IRAtom* unary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX ) +{ + /* Same scheme as unary32Fx8_w_rm. */ + IRAtom* t1 = unary32Fx8(mce, vatomX); + // PCast the RM, and widen it to 256 bits + IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM); + // Roll it into the result + t1 = mkUifUV256(mce, t1, t2); + return t1; +} + /* --- --- Vector saturated narrowing --- --- */ @@ -3665,6 +3679,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce, /* V128-bit SIMD */ + case Iop_I32StoF32x4: + case Iop_F32toI32Sx4: case Iop_Sqrt32Fx4: return unary32Fx4_w_rm(mce, vatom1, vatom2); case Iop_Sqrt64Fx2: @@ -4743,9 +4759,13 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce, case Iop_CmpGT64Sx4: return binary64Ix4(mce, vatom1, vatom2); - /* Perm32x8: rearrange values in left arg using steering values - from right arg. So rearrange the vbits in the same way but - pessimise wrt steering values. */ + case Iop_I32StoF32x8: + case Iop_F32toI32Sx8: + return unary32Fx8_w_rm(mce, vatom1, vatom2); + + /* Perm32x8: rearrange values in left arg using steering values + from right arg. So rearrange the vbits in the same way but + pessimise wrt steering values. */ case Iop_Perm32x8: return mkUifUV256( mce, diff --git a/memcheck/tests/vbit-test/irops.c b/memcheck/tests/vbit-test/irops.c index 66b40ef..1047fa3 100644 --- a/memcheck/tests/vbit-test/irops.c +++ b/memcheck/tests/vbit-test/irops.c @@ -640,6 +640,8 @@ static irop_t irops[] = { { DEFOP(Iop_RSqrtStep32Fx4, UNDEF_UNKNOWN), }, { DEFOP(Iop_I32UtoFx4, UNDEF_UNKNOWN), }, { DEFOP(Iop_I32StoFx4, UNDEF_UNKNOWN), }, + { DEFOP(Iop_I32StoF32x4, UNDEF_UNKNOWN), }, + { DEFOP(Iop_F32toI32Sx4, UNDEF_UNKNOWN), }, { DEFOP(Iop_FtoI32Ux4_RZ, UNDEF_UNKNOWN), }, { DEFOP(Iop_FtoI32Sx4_RZ, UNDEF_UNKNOWN), }, { DEFOP(Iop_QFtoI32Ux4_RZ, UNDEF_UNKNOWN), }, @@ -1123,6 +1125,8 @@ static irop_t irops[] = { { DEFOP(Iop_Sub32Fx8, UNDEF_UNKNOWN), }, { DEFOP(Iop_Mul32Fx8, UNDEF_UNKNOWN), }, { DEFOP(Iop_Div32Fx8, UNDEF_UNKNOWN), }, + { DEFOP(Iop_I32StoF32x8, UNDEF_UNKNOWN), }, + { DEFOP(Iop_F32toI32Sx8, UNDEF_UNKNOWN), }, { DEFOP(Iop_Sqrt32Fx8, UNDEF_UNKNOWN), }, { DEFOP(Iop_Sqrt64Fx4, UNDEF_UNKNOWN), }, { DEFOP(Iop_RSqrtEst32Fx8, UNDEF_UNKNOWN), }, |
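
At the instruction level the new IROps are single conversions: Iop_I32StoF32x4 becomes cvtdq2ps and Iop_F32toI32Sx4 becomes cvtps2dq, in place of four (or, for the 256-bit forms, eight) scalar conversions. A sketch with SSE2 intrinsics; note that cvtps2dq rounds according to the current MXCSR rounding mode (round-to-nearest by default), which is why the IROps carry an explicit IRRoundingMode operand and the isel brackets the instruction with set_SSE_rounding_mode / set_SSE_rounding_default:

    /* A sketch, not VEX code: the one-instruction conversions the new
       IROps map onto. */
    #include <emmintrin.h>
    #include <stdio.h>

    int main(void)
    {
       __m128i ints   = _mm_setr_epi32(-2, -1, 3, 7);
       __m128  floats = _mm_cvtepi32_ps(ints);    /* cvtdq2ps: I32x4 -> F32x4 */
       __m128i back   = _mm_cvtps_epi32(floats);  /* cvtps2dq: F32x4 -> I32x4 */
       int out[4];
       _mm_storeu_si128((__m128i*)out, back);
       printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* -2 -1 3 7 */
       return 0;
    }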
From: Julian S. <se...@so...> - 2018-12-22 12:35:57
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=901f3d3813c551b18a34ca5a52e3d9393524544c commit 901f3d3813c551b18a34ca5a52e3d9393524544c Author: Julian Seward <js...@ac...> Date: Sat Dec 22 13:34:11 2018 +0100 amd64 back end: generate better code for 128/256 bit vector shifts by immediate. n-i-bz. Diff: --- VEX/priv/host_amd64_defs.c | 62 +++++++++++++++++++++++++++++++++++++ VEX/priv/host_amd64_defs.h | 12 ++++++-- VEX/priv/host_amd64_isel.c | 76 ++++++++++++++++++++++++++++++++-------------- 3 files changed, 126 insertions(+), 24 deletions(-) diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c index 48ca268..1536d81 100644 --- a/VEX/priv/host_amd64_defs.c +++ b/VEX/priv/host_amd64_defs.c @@ -1007,6 +1007,15 @@ AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) { vassert(order >= 0 && order <= 0xFF); return i; } +AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp op, + UInt shiftBits, HReg dst ) { + AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr)); + i->tag = Ain_SseShiftN; + i->Ain.SseShiftN.op = op; + i->Ain.SseShiftN.shiftBits = shiftBits; + i->Ain.SseShiftN.dst = dst; + return i; +} //uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad, //uu HReg reg, AMD64AMode* addr ) { //uu AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr)); @@ -1359,6 +1368,11 @@ void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 ) vex_printf(","); ppHRegAMD64(i->Ain.SseShuf.dst); return; + case Ain_SseShiftN: + vex_printf("%s $%u, ", showAMD64SseOp(i->Ain.SseShiftN.op), + i->Ain.SseShiftN.shiftBits); + ppHRegAMD64(i->Ain.SseShiftN.dst); + return; //uu case Ain_AvxLdSt: //uu vex_printf("vmovups "); //uu if (i->Ain.AvxLdSt.isLoad) { @@ -1691,6 +1705,9 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 ) addHRegUse(u, HRmRead, i->Ain.SseShuf.src); addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst); return; + case Ain_SseShiftN: + addHRegUse(u, HRmModify, i->Ain.SseShiftN.dst); + return; //uu case Ain_AvxLdSt: //uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr); //uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead, @@ -1906,6 +1923,9 @@ void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 ) mapReg(m, &i->Ain.SseShuf.src); mapReg(m, &i->Ain.SseShuf.dst); return; + case Ain_SseShiftN: + mapReg(m, &i->Ain.SseShiftN.dst); + return; //uu case Ain_AvxLdSt: //uu mapReg(m, &i->Ain.AvxLdSt.reg); //uu mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr); @@ -3840,6 +3860,48 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc, *p++ = (UChar)(i->Ain.SseShuf.order); goto done; + case Ain_SseShiftN: { + opc = 0; // invalid + subopc_imm = 0; // invalid + UInt limit = 0; + UInt shiftImm = i->Ain.SseShiftN.shiftBits; + switch (i->Ain.SseShiftN.op) { + case Asse_SHL16: limit = 15; opc = 0x71; subopc_imm = 6; break; + case Asse_SHL32: limit = 31; opc = 0x72; subopc_imm = 6; break; + case Asse_SHL64: limit = 63; opc = 0x73; subopc_imm = 6; break; + case Asse_SAR16: limit = 15; opc = 0x71; subopc_imm = 4; break; + case Asse_SAR32: limit = 31; opc = 0x72; subopc_imm = 4; break; + case Asse_SHR16: limit = 15; opc = 0x71; subopc_imm = 2; break; + case Asse_SHR32: limit = 31; opc = 0x72; subopc_imm = 2; break; + case Asse_SHR64: limit = 63; opc = 0x73; subopc_imm = 2; break; + case Asse_SHL128: + if ((shiftImm & 7) != 0) goto bad; + shiftImm >>= 3; + limit = 15; opc = 0x73; subopc_imm = 7; + break; + case Asse_SHR128: + if ((shiftImm & 7) != 0) goto bad; + shiftImm >>= 3; + limit = 15; opc = 0x73; subopc_imm = 3; + break; + default: + // This should never happen .. 
SSE2 only offers the above 10 insns
+               // for the "shift with immediate" case
+               goto bad;
+         }
+         vassert(limit > 0 && opc > 0 && subopc_imm > 0);
+         if (shiftImm > limit) goto bad;
+         *p++ = 0x66;
+         *p++ = clearWBit(
+                rexAMode_R_enc_enc( subopc_imm,
+                                    vregEnc3210(i->Ain.SseShiftN.dst) ));
+         *p++ = 0x0F;
+         *p++ = opc;
+         p = doAMode_R_enc_enc(p, subopc_imm, vregEnc3210(i->Ain.SseShiftN.dst));
+         *p++ = shiftImm;
+         goto done;
+      }
+
       //uu case Ain_AvxLdSt: {
       //uu    UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
       //uu                           i->Ain.AvxLdSt.addr );

diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h
index 6a72943..e1715a0 100644
--- a/VEX/priv/host_amd64_defs.h
+++ b/VEX/priv/host_amd64_defs.h
@@ -334,8 +334,8 @@ typedef
       Asse_MIN8U,
       Asse_CMPEQ8,  Asse_CMPEQ16,  Asse_CMPEQ32,
       Asse_CMPGT8S, Asse_CMPGT16S, Asse_CMPGT32S,
-      Asse_SHL16, Asse_SHL32, Asse_SHL64,
-      Asse_SHR16, Asse_SHR32, Asse_SHR64,
+      Asse_SHL16, Asse_SHL32, Asse_SHL64, Asse_SHL128,
+      Asse_SHR16, Asse_SHR32, Asse_SHR64, Asse_SHR128,
       Asse_SAR16, Asse_SAR32,
       Asse_PACKSSD, Asse_PACKSSW, Asse_PACKUSW,
       Asse_UNPCKHB, Asse_UNPCKHW, Asse_UNPCKHD, Asse_UNPCKHQ,
@@ -400,6 +400,7 @@ typedef
       Ain_SseReRg,   /* SSE binary general reg-reg, Re, Rg */
       Ain_SseCMov,   /* SSE conditional move */
       Ain_SseShuf,   /* SSE2 shuffle (pshufd) */
+      Ain_SseShiftN, /* SSE2 shift by immediate */
       //uu Ain_AvxLdSt,   /* AVX load/store 256 bits,
       //uu                   no alignment constraints */
       //uu Ain_AvxReRg,   /* AVX binary general reg-reg, Re, Rg */
@@ -695,6 +696,11 @@ typedef
             HReg   src;
             HReg   dst;
          } SseShuf;
+         struct {
+            AMD64SseOp op;
+            UInt       shiftBits;
+            HReg       dst;
+         } SseShiftN;
         //uu struct {
         //uu    Bool    isLoad;
         //uu    HReg    reg;
@@ -773,6 +779,8 @@ extern AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp, HReg, HReg );
 extern AMD64Instr* AMD64Instr_SseReRg   ( AMD64SseOp, HReg, HReg );
 extern AMD64Instr* AMD64Instr_SseCMov   ( AMD64CondCode, HReg src, HReg dst );
 extern AMD64Instr* AMD64Instr_SseShuf   ( Int order, HReg src, HReg dst );
+extern AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp,
+                                          UInt shiftBits, HReg dst );
 //uu extern AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad, HReg, AMD64AMode* );
 //uu extern AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp, HReg, HReg );
 extern AMD64Instr* AMD64Instr_EvCheck   ( AMD64AMode* amCounter,

diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index 7974c80..59fd752 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -3135,9 +3135,10 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
    HWord      fn = 0; /* address of helper fn, if required */
    Bool       arg1isEReg = False;
    AMD64SseOp op = Asse_INVALID;
-   IRType     ty = typeOfIRExpr(env->type_env,e);
    vassert(e);
+   IRType ty = typeOfIRExpr(env->type_env, e);
    vassert(ty == Ity_V128);
+   UInt laneBits = 0;

    if (e->tag == Iex_RdTmp) {
       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
@@ -3521,20 +3522,33 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
       return dst;
    }

-   case Iop_ShlN16x8: op = Asse_SHL16; goto do_SseShift;
-   case Iop_ShlN32x4: op = Asse_SHL32; goto do_SseShift;
-   case Iop_ShlN64x2: op = Asse_SHL64; goto do_SseShift;
-   case Iop_SarN16x8: op = Asse_SAR16; goto do_SseShift;
-   case Iop_SarN32x4: op = Asse_SAR32; goto do_SseShift;
-   case Iop_ShrN16x8: op = Asse_SHR16; goto do_SseShift;
-   case Iop_ShrN32x4: op = Asse_SHR32; goto do_SseShift;
-   case Iop_ShrN64x2: op = Asse_SHR64; goto do_SseShift;
+   case Iop_ShlN16x8: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
+   case Iop_ShlN32x4: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
+   case Iop_ShlN64x2: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
+   case Iop_SarN16x8: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
+   case Iop_SarN32x4: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
+   case Iop_ShrN16x8: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
+   case Iop_ShrN32x4: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
+   case Iop_ShrN64x2: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
    do_SseShift: {
-      HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
+      HReg dst  = newVRegV(env);
+      HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
+      /* If it's a shift by an in-range immediate, generate a single
+         instruction. */
+      if (e->Iex.Binop.arg2->tag == Iex_Const) {
+         IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
+         vassert(c->tag == Ico_U8);
+         UInt shift = c->Ico.U8;
+         if (shift < laneBits) {
+            addInstr(env, mk_vMOVsd_RR(greg, dst));
+            addInstr(env, AMD64Instr_SseShiftN(op, shift, dst));
+            return dst;
+         }
+      }
+      /* Otherwise we have to do it the longwinded way. */
       AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
       AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
       HReg        ereg = newVRegV(env);
-      HReg        dst  = newVRegV(env);
       addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
       addInstr(env, AMD64Instr_Push(rmi));
       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
@@ -3762,8 +3776,9 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
 {
    HWord fn = 0; /* address of helper fn, if required */
    vassert(e);
-   IRType ty = typeOfIRExpr(env->type_env,e);
+   IRType ty = typeOfIRExpr(env->type_env, e);
    vassert(ty == Ity_V256);
+   UInt laneBits = 0;

    AMD64SseOp op = Asse_INVALID;
@@ -3997,22 +4012,39 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
       return;
    }

-   case Iop_ShlN16x16: op = Asse_SHL16; goto do_SseShift;
-   case Iop_ShlN32x8:  op = Asse_SHL32; goto do_SseShift;
-   case Iop_ShlN64x4:  op = Asse_SHL64; goto do_SseShift;
-   case Iop_SarN16x16: op = Asse_SAR16; goto do_SseShift;
-   case Iop_SarN32x8:  op = Asse_SAR32; goto do_SseShift;
-   case Iop_ShrN16x16: op = Asse_SHR16; goto do_SseShift;
-   case Iop_ShrN32x8:  op = Asse_SHR32; goto do_SseShift;
-   case Iop_ShrN64x4:  op = Asse_SHR64; goto do_SseShift;
+   case Iop_ShlN16x16: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
+   case Iop_ShlN32x8:  laneBits = 32; op = Asse_SHL32; goto do_SseShift;
+   case Iop_ShlN64x4:  laneBits = 64; op = Asse_SHL64; goto do_SseShift;
+   case Iop_SarN16x16: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
+   case Iop_SarN32x8:  laneBits = 32; op = Asse_SAR32; goto do_SseShift;
+   case Iop_ShrN16x16: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
+   case Iop_ShrN32x8:  laneBits = 32; op = Asse_SHR32; goto do_SseShift;
+   case Iop_ShrN64x4:  laneBits = 64; op = Asse_SHR64; goto do_SseShift;
    do_SseShift: {
+      HReg dstHi = newVRegV(env);
+      HReg dstLo = newVRegV(env);
       HReg gregHi, gregLo;
       iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
+      /* If it's a shift by an in-range immediate, generate two single
+         instructions. */
+      if (e->Iex.Binop.arg2->tag == Iex_Const) {
+         IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
+         vassert(c->tag == Ico_U8);
+         UInt shift = c->Ico.U8;
+         if (shift < laneBits) {
+            addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
+            addInstr(env, AMD64Instr_SseShiftN(op, shift, dstHi));
+            addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
+            addInstr(env, AMD64Instr_SseShiftN(op, shift, dstLo));
+            *rHi = dstHi;
+            *rLo = dstLo;
+            return;
+         }
+      }
+      /* Otherwise we have to do it the longwinded way. */
       AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
       AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
       HReg        ereg = newVRegV(env);
-      HReg        dstHi = newVRegV(env);
-      HReg        dstLo = newVRegV(env);
       addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
       addInstr(env, AMD64Instr_Push(rmi));
       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
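The effect of this change is easiest to see at the IR level: a vector shift whose amount is a constant Ico_U8 below the lane width now becomes a register move plus one SSE2 shift-by-immediate, instead of the old sequence that pushed the amount through the stack into an xmm register. A minimal scalar sketch of the guarded fast path (the function name is hypothetical, not VEX code):

#include <assert.h>
#include <stdint.h>

/* Scalar model of Iop_ShlN16x8 with an immediate shift amount.  The
   assert mirrors the 'shift < laneBits' guard above: only in-range
   immediates may take the single-instruction path, since the IROp is
   undefined for shift amounts >= the lane width. */
static void shlN16x8_model(uint16_t lanes[8], unsigned imm)
{
   assert(imm < 16);  /* same bound as 'shift < laneBits' for 16-bit lanes */
   for (int i = 0; i < 8; i++)
      lanes[i] = (uint16_t)(lanes[i] << imm);
}

Non-constant or out-of-range amounts still take the longwinded path, which loads the count into an xmm register and uses the shift-by-register instruction forms.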
From: Julian S. <se...@so...> - 2018-12-22 06:26:13
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=b078fabb56e34115e55357c319d589b9455dd189

commit b078fabb56e34115e55357c319d589b9455dd189
Author: Julian Seward <js...@ac...>
Date:   Sat Dec 22 07:23:00 2018 +0100

    amd64 pipeline: generate much better code for pshufb mm/xmm/ymm.  n-i-bz.

    pshufb mm/xmm/ymm rearranges byte lanes in vector registers.  It's
    fairly widely used, but we generated terrible code for it.  With this
    patch, we just generate, at the back end, pshufb plus a bit of
    masking, which is a great improvement.

Diff:
---
 VEX/priv/guest_amd64_toIR.c      | 107 +++++----------------------------
 VEX/priv/host_amd64_defs.c       |   3 ++
 VEX/priv/host_amd64_defs.h       |   3 +-
 VEX/priv/host_amd64_isel.c       |  15 +++++-
 VEX/priv/host_generic_simd128.c  |  14 +++++
 VEX/priv/host_generic_simd128.h  |   5 ++
 VEX/priv/host_generic_simd64.c   |  25 ++++++++-
 VEX/priv/host_generic_simd64.h   |   3 +-
 VEX/priv/ir_defs.c               |   7 ++-
 VEX/pub/libvex_ir.h              |  18 ++++++-
 memcheck/mc_translate.c          |  26 +++++++---
 memcheck/tests/vbit-test/irops.c |   2 +
 12 files changed, 120 insertions(+), 108 deletions(-)

diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c
index e753ffa..2451a29 100644
--- a/VEX/priv/guest_amd64_toIR.c
+++ b/VEX/priv/guest_amd64_toIR.c
@@ -15583,90 +15583,16 @@ Long dis_ESC_0F__SSE3 ( Bool* decode_OK,
 static IRTemp math_PSHUFB_XMM ( IRTemp dV/*data to perm*/,
                                 IRTemp sV/*perm*/ )
 {
-   IRTemp sHi        = newTemp(Ity_I64);
-   IRTemp sLo        = newTemp(Ity_I64);
-   IRTemp dHi        = newTemp(Ity_I64);
-   IRTemp dLo        = newTemp(Ity_I64);
-   IRTemp rHi        = newTemp(Ity_I64);
-   IRTemp rLo        = newTemp(Ity_I64);
-   IRTemp sevens     = newTemp(Ity_I64);
-   IRTemp mask0x80hi = newTemp(Ity_I64);
-   IRTemp mask0x80lo = newTemp(Ity_I64);
-   IRTemp maskBit3hi = newTemp(Ity_I64);
-   IRTemp maskBit3lo = newTemp(Ity_I64);
-   IRTemp sAnd7hi    = newTemp(Ity_I64);
-   IRTemp sAnd7lo    = newTemp(Ity_I64);
-   IRTemp permdHi    = newTemp(Ity_I64);
-   IRTemp permdLo    = newTemp(Ity_I64);
-   IRTemp res        = newTemp(Ity_V128);
-
-   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
-   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
-   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
-   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
-
-   assign( sevens, mkU64(0x0707070707070707ULL) );
-
-   /* mask0x80hi = Not(SarN8x8(sHi,7))
-      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
-      sAnd7hi    = And(sHi,sevens)
-      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
-                       And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
-      rHi        = And(permdHi,mask0x80hi)
-   */
-   assign(
-      mask0x80hi,
-      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
-
-   assign(
-      maskBit3hi,
-      binop(Iop_SarN8x8,
-            binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
-            mkU8(7)));
-
-   assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
-
-   assign(
-      permdHi,
-      binop(
-         Iop_Or64,
-         binop(Iop_And64,
-               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
-               mkexpr(maskBit3hi)),
-         binop(Iop_And64,
-               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
-               unop(Iop_Not64,mkexpr(maskBit3hi))) ));
-
-   assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
-
-   /* And the same for the lower half of the result.  What fun. */
-
-   assign(
-      mask0x80lo,
-      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
-
-   assign(
-      maskBit3lo,
-      binop(Iop_SarN8x8,
-            binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
-            mkU8(7)));
-
-   assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
-
-   assign(
-      permdLo,
-      binop(
-         Iop_Or64,
-         binop(Iop_And64,
-               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
-               mkexpr(maskBit3lo)),
-         binop(Iop_And64,
-               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
-               unop(Iop_Not64,mkexpr(maskBit3lo))) ));
-
-   assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
-
-   assign(res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
+   IRTemp halfMask = newTemp(Ity_I64);
+   assign(halfMask, mkU64(0x8F8F8F8F8F8F8F8FULL));
+   IRExpr* mask = binop(Iop_64HLtoV128, mkexpr(halfMask), mkexpr(halfMask));
+   IRTemp res = newTemp(Ity_V128);
+   assign(res,
+          binop(Iop_PermOrZero8x16,
+                mkexpr(dV),
+                // Mask off bits [6:4] of each source operand lane
+                binop(Iop_AndV128, mkexpr(sV), mask)
+          ));
    return res;
 }
@@ -15945,15 +15871,10 @@ Long dis_ESC_0F38__SupSSE3 ( Bool* decode_OK,
          putMMXReg(
             gregLO3ofRM(modrm),
             binop(
-               Iop_And64,
-               /* permute the lanes */
-               binop(
-                  Iop_Perm8x8,
-                  mkexpr(dV),
-                  binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
-               ),
-               /* mask off lanes which have (index & 0x80) == 0x80 */
-               unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
+               Iop_PermOrZero8x8,
+               mkexpr(dV),
+               // Mask off bits [6:3] of each source operand lane
+               binop(Iop_And64, mkexpr(sV), mkU64(0x8787878787878787ULL))
             )
          );
          goto decode_success;

diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c
index a554e28..48ca268 100644
--- a/VEX/priv/host_amd64_defs.c
+++ b/VEX/priv/host_amd64_defs.c
@@ -584,6 +584,7 @@ const HChar* showAMD64SseOp ( AMD64SseOp op ) {
       case Asse_UNPCKLW:  return "punpcklw";
       case Asse_UNPCKLD:  return "punpckld";
       case Asse_UNPCKLQ:  return "punpcklq";
+      case Asse_PSHUFB:   return "pshufb";
       default: vpanic("showAMD64SseOp");
    }
 }
@@ -3799,6 +3800,8 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
          case Asse_UNPCKLW:  XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
          case Asse_UNPCKLD:  XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
          case Asse_UNPCKLQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
+         case Asse_PSHUFB:   XX(0x66); XX(rex);
+                             XX(0x0F); XX(0x38); XX(0x00); break;
          default: goto bad;
       }
       p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseReRg.dst),

diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h
index 68e199a..6a72943 100644
--- a/VEX/priv/host_amd64_defs.h
+++ b/VEX/priv/host_amd64_defs.h
@@ -339,7 +339,8 @@ typedef
       Asse_SAR16, Asse_SAR32,
       Asse_PACKSSD, Asse_PACKSSW, Asse_PACKUSW,
       Asse_UNPCKHB, Asse_UNPCKHW, Asse_UNPCKHD, Asse_UNPCKHQ,
-      Asse_UNPCKLB, Asse_UNPCKLW, Asse_UNPCKLD, Asse_UNPCKLQ
+      Asse_UNPCKLB, Asse_UNPCKLW, Asse_UNPCKLD, Asse_UNPCKLQ,
+      Asse_PSHUFB // Only for SSSE3 capable hosts
    }
    AMD64SseOp;

diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index 05e2e72..7974c80 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -1122,8 +1122,8 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e )
             fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
          case Iop_CatEvenLanes16x4:
             fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
-         case Iop_Perm8x8:
-            fn = (HWord)h_generic_calc_Perm8x8; break;
+         case Iop_PermOrZero8x8:
+            fn = (HWord)h_generic_calc_PermOrZero8x8; break;

          case Iop_Max8Ux8:
            fn = (HWord)h_generic_calc_Max8Ux8; break;
@@ -3437,6 +3437,17 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
          return dst;
       }

+      case Iop_PermOrZero8x16:
+         if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
+            op = Asse_PSHUFB;
+            goto do_SseReRg;
+         }
+         // Otherwise we'll have to generate a call to
+         // h_generic_calc_PermOrZero8x16 (ATK).  But that would only be for a
+         // host which doesn't have SSSE3, in which case we don't expect this
+         // IROp to enter the compilation pipeline in the first place.
+         break;
+
       case Iop_QNarrowBin32Sto16Sx8:
          op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
       case Iop_QNarrowBin16Sto8Sx16:

diff --git a/VEX/priv/host_generic_simd128.c b/VEX/priv/host_generic_simd128.c
index 62cb88e..8c6f707 100644
--- a/VEX/priv/host_generic_simd128.c
+++ b/VEX/priv/host_generic_simd128.c
@@ -368,6 +368,20 @@ void VEX_REGPARM(3)
    res->w32[3] = argL->w32[ argR->w32[3] & 3 ];
 }

+//void VEX_REGPARM(3)
+//     h_generic_calc_PermOrZero8x16 ( /*OUT*/V128* res,
+//                                     V128* argL, V128* argR )
+//{
+//   for (UInt i = 0; i < 16; i++) {
+//      UChar ix = argR->w8[i];
+//      Char zeroingMask = (Char)ix;
+//      zeroingMask ^= 0x80;
+//      zeroingMask >>= 7;
+//      ix &= 15;
+//      res->w8[i] = (argL->w8[ix] & zeroingMask) & 0xFF;
+//   }
+//}
+
 UInt /*not-regparm*/
      h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
 {

diff --git a/VEX/priv/host_generic_simd128.h b/VEX/priv/host_generic_simd128.h
index 0b63ab3..18b3518 100644
--- a/VEX/priv/host_generic_simd128.h
+++ b/VEX/priv/host_generic_simd128.h
@@ -86,6 +86,11 @@ extern VEX_REGPARM(3)
 extern VEX_REGPARM(3)
        void h_generic_calc_Perm32x4 ( /*OUT*/V128*, V128*, V128* );

+// This is correct and tested, but isn't used because we just generate
+// PSHUFB on amd64 instead.
+//extern VEX_REGPARM(3)
+//       void h_generic_calc_PermOrZero8x16 ( /*OUT*/V128*, V128*, V128* );
+
 extern /*not-regparm*/
        UInt h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo );

diff --git a/VEX/priv/host_generic_simd64.c b/VEX/priv/host_generic_simd64.c
index 48a55bd..a7e80b4 100644
--- a/VEX/priv/host_generic_simd64.c
+++ b/VEX/priv/host_generic_simd64.c
@@ -137,6 +137,14 @@ static inline UChar index8x8 ( ULong w64, UChar ix ) {
    return toUChar((w64 >> (8*ix)) & 0xFF);
 }

+static inline UChar indexOrZero8x8 ( ULong w64, UChar ix ) {
+   Char zeroingMask = (Char)ix;
+   zeroingMask ^= 0x80;
+   zeroingMask >>= 7;
+   ix &= 7;
+   return toUChar( ((w64 >> (8*ix)) & zeroingMask) & 0xFF );
+}
+

 /* Scalar helpers. */
@@ -974,7 +982,8 @@ ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
    );
 }

-/* misc hack looking for a proper home */
+/* ------------ Permutation ------------ */
+
 ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
 {
    return mk8x8(
@@ -989,6 +998,20 @@ ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
    );
 }

+ULong h_generic_calc_PermOrZero8x8 ( ULong aa, ULong bb )
+{
+   return mk8x8(
+      indexOrZero8x8(aa, sel8x8_7(bb)),
+      indexOrZero8x8(aa, sel8x8_6(bb)),
+      indexOrZero8x8(aa, sel8x8_5(bb)),
+      indexOrZero8x8(aa, sel8x8_4(bb)),
+      indexOrZero8x8(aa, sel8x8_3(bb)),
+      indexOrZero8x8(aa, sel8x8_2(bb)),
+      indexOrZero8x8(aa, sel8x8_1(bb)),
+      indexOrZero8x8(aa, sel8x8_0(bb))
+   );
+}
+
 /* ------------ Shifting ------------ */
 /* Note that because these primops are undefined if the shift amount
    equals or exceeds the lane width, the shift amount is masked so

diff --git a/VEX/priv/host_generic_simd64.h b/VEX/priv/host_generic_simd64.h
index a0f1ed8..e26ef6c 100644
--- a/VEX/priv/host_generic_simd64.h
+++ b/VEX/priv/host_generic_simd64.h
@@ -102,7 +102,8 @@ extern ULong h_generic_calc_InterleaveLO32x2 ( ULong, ULong );
 extern ULong h_generic_calc_CatOddLanes16x4  ( ULong, ULong );
 extern ULong h_generic_calc_CatEvenLanes16x4 ( ULong, ULong );
-extern ULong h_generic_calc_Perm8x8          ( ULong, ULong );
+extern ULong h_generic_calc_Perm8x8          ( ULong, ULong );
+extern ULong h_generic_calc_PermOrZero8x8    ( ULong, ULong );

 extern ULong h_generic_calc_ShlN8x8  ( ULong, UInt );
 extern ULong h_generic_calc_ShlN16x4 ( ULong, UInt );

diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c
index 3221033..ae1c203 100644
--- a/VEX/priv/ir_defs.c
+++ b/VEX/priv/ir_defs.c
@@ -622,6 +622,7 @@ void ppIROp ( IROp op )
       case Iop_Sal32x2: vex_printf("Sal32x2"); return;
       case Iop_Sal64x1: vex_printf("Sal64x1"); return;
       case Iop_Perm8x8: vex_printf("Perm8x8"); return;
+      case Iop_PermOrZero8x8: vex_printf("PermOrZero8x8"); return;
       case Iop_Reverse8sIn16_x4: vex_printf("Reverse8sIn16_x4"); return;
       case Iop_Reverse8sIn32_x2: vex_printf("Reverse8sIn32_x2"); return;
       case Iop_Reverse16sIn32_x2: vex_printf("Reverse16sIn32_x2"); return;
@@ -1125,6 +1126,7 @@ void ppIROp ( IROp op )
       case Iop_SliceV128: vex_printf("SliceV128"); return;
       case Iop_Perm8x16: vex_printf("Perm8x16"); return;
+      case Iop_PermOrZero8x16: vex_printf("PermOrZero8x16"); return;
       case Iop_Perm32x4: vex_printf("Perm32x4"); return;
       case Iop_Perm8x16x2: vex_printf("Perm8x16x2"); return;
       case Iop_Reverse8sIn16_x8: vex_printf("Reverse8sIn16_x8"); return;
@@ -2661,7 +2663,7 @@ void typeOfPrimop ( IROp op,
       case Iop_CatOddLanes16x4: case Iop_CatEvenLanes16x4:
       case Iop_InterleaveOddLanes8x8: case Iop_InterleaveEvenLanes8x8:
       case Iop_InterleaveOddLanes16x4: case Iop_InterleaveEvenLanes16x4:
-      case Iop_Perm8x8:
+      case Iop_Perm8x8: case Iop_PermOrZero8x8:
       case Iop_Max8Ux8: case Iop_Max16Ux4: case Iop_Max32Ux2:
       case Iop_Max8Sx8: case Iop_Max16Sx4: case Iop_Max32Sx2:
       case Iop_Max32Fx2: case Iop_Min32Fx2:
@@ -3132,7 +3134,8 @@ void typeOfPrimop ( IROp op,
       case Iop_PackOddLanes8x16: case Iop_PackEvenLanes8x16:
       case Iop_PackOddLanes16x8: case Iop_PackEvenLanes16x8:
       case Iop_PackOddLanes32x4: case Iop_PackEvenLanes32x4:
-      case Iop_Perm8x16: case Iop_Perm32x4:
+      case Iop_Perm8x16: case Iop_PermOrZero8x16:
+      case Iop_Perm32x4:
       case Iop_RecipStep32Fx4: case Iop_RecipStep64Fx2:
       case Iop_RSqrtStep32Fx4: case Iop_RSqrtStep64Fx2:
       case Iop_CipherV128:

diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h
index 93fa5ac..459d14b 100644
--- a/VEX/pub/libvex_ir.h
+++ b/VEX/pub/libvex_ir.h
@@ -1061,9 +1061,16 @@ typedef
       as indexed by control vector bytes:
          for i in 0 .. 7 . result[i] = argL[ argR[i] ]
       argR[i] values may only be in the range 0 .. 7, else behaviour
-      is undefined. */
+      is undefined.  That is, argR[i][7:3] must be zero. */
    Iop_Perm8x8,

+   /* PERMUTING with optional zeroing:
+         for i in 0 .. 7 . result[i] = if argR[i] bit 7 is set
+                                       then zero else argL[ argR[i] ]
+      argR[i][6:3] must be zero, else behaviour is undefined.
+   */
+   Iop_PermOrZero8x8,
+
    /* MISC CONVERSION -- get high bits of each byte lane, a la
       x86/amd64 pmovmskb */
    Iop_GetMSBs8x8, /* I64 -> I8 */
@@ -1842,10 +1849,17 @@ typedef
       as indexed by control vector bytes:
         for i in 0 .. 15 . result[i] = argL[ argR[i] ]
       argR[i] values may only be in the range 0 .. 15, else behaviour
-      is undefined. */
+      is undefined.  That is, argR[i][7:4] must be zero. */
    Iop_Perm8x16,
    Iop_Perm32x4, /* ditto, except argR values are restricted to 0 .. 3 */

+   /* PERMUTING with optional zeroing:
+         for i in 0 .. 15 . result[i] = if argR[i] bit 7 is set
+                                        then zero else argL[ argR[i] ]
+      argR[i][6:4] must be zero, else behaviour is undefined.
+   */
+   Iop_PermOrZero8x16,
+
    /* same, but Triop (argL consists of two 128-bit parts) */
    /* correct range for argR values is 0..31 */
    /* (V128, V128, V128) -> V128 */

diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index 04ed864..6e449e2 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -3641,10 +3641,22 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
          complainIfUndefined(mce, atom2, NULL);
          return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));

-      /* Perm8x8: rearrange values in left arg using steering values
-         from right arg.  So rearrange the vbits in the same way but
-         pessimise wrt steering values. */
+      /* Perm8x8: rearrange values in left arg using steering values from
+         right arg.  So rearrange the vbits in the same way but pessimise wrt
+         steering values.  We assume that unused bits in the steering value
+         are defined zeros, so we can safely PCast within each lane of the
+         steering value without having to take precautions to avoid a
+         dependency on those unused bits.
+
+         This is also correct for PermOrZero8x8, but it is a bit subtle.  For
+         each lane, if bit 7 of the steering value is zero, then we'll steer
+         the shadow value exactly as per Perm8x8.  If that bit is one, then
+         the operation will set the resulting (concrete) value to zero.  That
+         means it is defined, and should have a shadow value of zero.  Hence
+         in both cases (bit 7 is 0 or 1) we can self-shadow (in the same way
+         as Perm8x8) and then pessimise against the steering values. */
       case Iop_Perm8x8:
+      case Iop_PermOrZero8x8:
         return mkUifU64(
                   mce,
                   assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
@@ -4121,10 +4133,12 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
          complainIfUndefined(mce, atom2, NULL);
          return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));

-      /* Perm8x16: rearrange values in left arg using steering values
-         from right arg.  So rearrange the vbits in the same way but
-         pessimise wrt steering values. */
+      /* Perm8x16: rearrange values in left arg using steering values
+         from right arg.  So rearrange the vbits in the same way but
+         pessimise wrt steering values.  Perm32x4 ditto. */
+      /* PermOrZero8x16: see comments above for PermOrZero8x8. */
       case Iop_Perm8x16:
+      case Iop_PermOrZero8x16:
         return mkUifUV128(
                   mce,
                   assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),

diff --git a/memcheck/tests/vbit-test/irops.c b/memcheck/tests/vbit-test/irops.c
index e8bf67d..66b40ef 100644
--- a/memcheck/tests/vbit-test/irops.c
+++ b/memcheck/tests/vbit-test/irops.c
@@ -534,6 +534,7 @@ static irop_t irops[] = {
   { DEFOP(Iop_Reverse16sIn64_x1, UNDEF_UNKNOWN), },
   { DEFOP(Iop_Reverse32sIn64_x1, UNDEF_UNKNOWN), },
   { DEFOP(Iop_Perm8x8,           UNDEF_UNKNOWN), },
+  { DEFOP(Iop_PermOrZero8x8,     UNDEF_UNKNOWN), },
   { DEFOP(Iop_GetMSBs8x8,        UNDEF_UNKNOWN), },
   { DEFOP(Iop_RecipEst32Ux2,     UNDEF_UNKNOWN), },
   { DEFOP(Iop_RSqrtEst32Ux2,     UNDEF_UNKNOWN), },
@@ -1033,6 +1034,7 @@ static irop_t irops[] = {
   { DEFOP(Iop_Reverse32sIn64_x2, UNDEF_UNKNOWN), },
   { DEFOP(Iop_Reverse1sIn8_x16,  UNDEF_UNKNOWN), },
   { DEFOP(Iop_Perm8x16,          UNDEF_UNKNOWN), },
+  { DEFOP(Iop_PermOrZero8x16,    UNDEF_UNKNOWN), },
   { DEFOP(Iop_Perm32x4,          UNDEF_UNKNOWN), },
   { DEFOP(Iop_Perm8x16x2,        UNDEF_UNKNOWN), },
   { DEFOP(Iop_GetMSBs8x16,       UNDEF_UNKNOWN), },
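The key to the simplification is that pshufb's control byte already encodes both the lane index and the zeroing condition, so Iop_PermOrZero8x16 can expose exactly that semantics to the back end. A scalar sketch of the new IROp, with a hypothetical function name (the commented-out h_generic_calc_PermOrZero8x16 in the diff computes the same thing branch-free):

#include <stdint.h>

/* Scalar model of Iop_PermOrZero8x16 (pshufb xmm semantics): bit 7 of
   each control byte selects zeroing, bits 3:0 select the source lane.
   ANDing the raw control vector with 0x8F, as the new front end does,
   keeps exactly those bits and clears bits 6:4, which the IROp requires
   to be zero.  The MMX variant keeps a 3-bit index, hence its 0x87 mask. */
static void permOrZero8x16_model(uint8_t res[16],
                                 const uint8_t argL[16],
                                 const uint8_t ctl[16])
{
   for (int i = 0; i < 16; i++) {
      uint8_t c = ctl[i] & 0x8F;   /* the front end's masking step */
      res[i] = (c & 0x80) ? 0 : argL[c & 0x0F];
   }
}

The memcheck part of the patch rests on the same observation: a lane that pshufb zeroes is fully defined, so its shadow is zero, and the self-shadowing scheme already used for Perm8x16 remains sound.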
From: Julian S. <se...@so...> - 2018-12-22 05:12:10
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=6cb6bdbd0a38e9b5f5c4f676afb72a23b6bfb1b5

commit 6cb6bdbd0a38e9b5f5c4f676afb72a23b6bfb1b5
Author: Julian Seward <js...@ac...>
Date:   Sat Dec 22 06:06:19 2018 +0100

    amd64 hosts: detect SSSE3 (not SSE3) capabilities on the host.
    As-yet unused.  n-i-bz.

Diff:
---
 VEX/priv/guest_amd64_toIR.c |  6 +++---
 VEX/priv/host_amd64_isel.c  |  1 +
 VEX/priv/main_main.c        | 22 ++++++++++++++--------
 VEX/pub/libvex.h            |  1 +
 coregrind/m_machine.c       | 13 ++++++++++---
 5 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c
index 2cabf80..e753ffa 100644
--- a/VEX/priv/guest_amd64_toIR.c
+++ b/VEX/priv/guest_amd64_toIR.c
@@ -22071,21 +22071,21 @@ Long dis_ESC_0F (
          /* This isn't entirely correct, CPUID should depend on the VEX
             capabilities, not on the underlying CPU. See bug #324882. */
-         if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
+         if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSSE3) &&
              (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16) &&
              (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX2)) {
             fName = "amd64g_dirtyhelper_CPUID_avx2";
             fAddr = &amd64g_dirtyhelper_CPUID_avx2;
             /* This is a Core-i7-4910-like machine */
          }
-         else if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
+         else if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSSE3) &&
                   (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16) &&
                   (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
             fName = "amd64g_dirtyhelper_CPUID_avx_and_cx16";
             fAddr = &amd64g_dirtyhelper_CPUID_avx_and_cx16;
             /* This is a Core-i5-2300-like machine */
          }
-         else if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
+         else if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSSE3) &&
                   (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16)) {
             fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16";
             fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16;

diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index 1787e87..05e2e72 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -4953,6 +4953,7 @@ HInstrArray* iselSB_AMD64 ( const IRSB* bb,
    vassert(arch_host == VexArchAMD64);
    vassert(0 == (hwcaps_host & ~(VEX_HWCAPS_AMD64_SSE3
+                                 | VEX_HWCAPS_AMD64_SSSE3
                                  | VEX_HWCAPS_AMD64_CX16
                                  | VEX_HWCAPS_AMD64_LZCNT
                                  | VEX_HWCAPS_AMD64_AVX

diff --git a/VEX/priv/main_main.c b/VEX/priv/main_main.c
index 41ad371..f387f16 100644
--- a/VEX/priv/main_main.c
+++ b/VEX/priv/main_main.c
@@ -1575,6 +1575,7 @@ static const HChar* show_hwcaps_amd64 ( UInt hwcaps )
         { VEX_HWCAPS_AMD64_LZCNT,  "lzcnt"  },
         { VEX_HWCAPS_AMD64_RDTSCP, "rdtscp" },
         { VEX_HWCAPS_AMD64_SSE3,   "sse3"   },
+        { VEX_HWCAPS_AMD64_SSSE3,  "ssse3"  },
         { VEX_HWCAPS_AMD64_AVX,    "avx"    },
         { VEX_HWCAPS_AMD64_AVX2,   "avx2"   },
         { VEX_HWCAPS_AMD64_BMI,    "bmi"    },
@@ -1881,15 +1882,20 @@ static void check_hwcaps ( VexArch arch, UInt hwcaps )
            orthogonal. */

         /* Throw out obviously stupid cases: */
-        Bool have_sse3 = (hwcaps & VEX_HWCAPS_AMD64_SSE3) != 0;
-        Bool have_avx  = (hwcaps & VEX_HWCAPS_AMD64_AVX)  != 0;
-        Bool have_bmi  = (hwcaps & VEX_HWCAPS_AMD64_BMI)  != 0;
-        Bool have_avx2 = (hwcaps & VEX_HWCAPS_AMD64_AVX2) != 0;
-
-        /* AVX without SSE3 */
-        if (have_avx && !have_sse3)
+        Bool have_sse3  = (hwcaps & VEX_HWCAPS_AMD64_SSE3)  != 0;
+        Bool have_ssse3 = (hwcaps & VEX_HWCAPS_AMD64_SSSE3) != 0;
+        Bool have_avx   = (hwcaps & VEX_HWCAPS_AMD64_AVX)   != 0;
+        Bool have_bmi   = (hwcaps & VEX_HWCAPS_AMD64_BMI)   != 0;
+        Bool have_avx2  = (hwcaps & VEX_HWCAPS_AMD64_AVX2)  != 0;
+
+        /* SSSE3 without SSE3 */
+        if (have_ssse3 && !have_sse3)
+           invalid_hwcaps(arch, hwcaps,
+                          "Support for SSSE3 requires SSE3 capabilities\n");
+        /* AVX without SSSE3 */
+        if (have_avx && !have_ssse3)
            invalid_hwcaps(arch, hwcaps,
-                          "Support for AVX requires SSE3 capabilities\n");
+                          "Support for AVX requires SSSE3 capabilities\n");
         /* AVX2 or BMI without AVX */
         if (have_avx2 && !have_avx)
            invalid_hwcaps(arch, hwcaps,

diff --git a/VEX/pub/libvex.h b/VEX/pub/libvex.h
index e6c1974..629a258 100644
--- a/VEX/pub/libvex.h
+++ b/VEX/pub/libvex.h
@@ -92,6 +92,7 @@ typedef
 /* amd64: baseline capability is SSE2, with cmpxchg8b but not
    cmpxchg16b. */
 #define VEX_HWCAPS_AMD64_SSE3   (1<<5)  /* SSE3 support */
+#define VEX_HWCAPS_AMD64_SSSE3  (1<<12) /* Supplemental SSE3 support */
 #define VEX_HWCAPS_AMD64_CX16   (1<<6)  /* cmpxchg16b support */
 #define VEX_HWCAPS_AMD64_LZCNT  (1<<7)  /* SSE4a LZCNT insn */
 #define VEX_HWCAPS_AMD64_AVX    (1<<8)  /* AVX instructions */

diff --git a/coregrind/m_machine.c b/coregrind/m_machine.c
index 31b0e1b..7aa051b 100644
--- a/coregrind/m_machine.c
+++ b/coregrind/m_machine.c
@@ -943,7 +943,7 @@ Bool VG_(machine_get_hwcaps)( void )
    }

 #elif defined(VGA_amd64)
-   { Bool have_sse3, have_cx8, have_cx16;
+   { Bool have_sse3, have_ssse3, have_cx8, have_cx16;
      Bool have_lzcnt, have_avx, have_bmi, have_avx2;
      Bool have_rdtscp;
      UInt eax, ebx, ecx, edx, max_basic, max_extended;
@@ -951,6 +951,12 @@ Bool VG_(machine_get_hwcaps)( void )
      HChar vstr[13];
      vstr[0] = 0;

+     have_sse3 = have_ssse3 = have_cx8 = have_cx16
+        = have_lzcnt = have_avx = have_bmi = have_avx2
+        = have_rdtscp = False;
+
+     eax = ebx = ecx = edx = max_basic = max_extended = 0;
+
      if (!VG_(has_cpuid)())
         /* we can't do cpuid at all.  Give up. */
         return False;
@@ -975,8 +981,8 @@ Bool VG_(machine_get_hwcaps)( void )
      VG_(cpuid)(1, 0, &eax, &ebx, &ecx, &edx);

      // we assume that SSE1 and SSE2 are available by default
-     have_sse3 = (ecx & (1<<0)) != 0;  /* True => have sse3 insns */
-     // ssse3   is ecx:9
+     have_sse3  = (ecx & (1<<0)) != 0;  /* True => have sse3 insns */
+     have_ssse3 = (ecx & (1<<9)) != 0;  /* True => have Sup SSE3 insns */
      // sse41   is ecx:19
      // sse42   is ecx:20
@@ -1054,6 +1060,7 @@ Bool VG_(machine_get_hwcaps)( void )
      va          = VexArchAMD64;
      vai.endness = VexEndnessLE;
      vai.hwcaps  = (have_sse3  ? VEX_HWCAPS_AMD64_SSE3  : 0)
+                 | (have_ssse3 ? VEX_HWCAPS_AMD64_SSSE3 : 0)
                  | (have_cx16  ? VEX_HWCAPS_AMD64_CX16  : 0)
                  | (have_lzcnt ? VEX_HWCAPS_AMD64_LZCNT : 0)
                  | (have_avx   ? VEX_HWCAPS_AMD64_AVX   : 0)
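For reference, the feature bits consulted here come straight from CPUID leaf 1: SSE3 is reported in ECX bit 0 and SSSE3 in ECX bit 9. A standalone sketch of the same probe, using GCC/clang's <cpuid.h> rather than valgrind's VG_(cpuid) wrapper, so this is illustrative rather than valgrind code:

#include <cpuid.h>
#include <stdio.h>

/* Report SSE3/SSSE3 support the same way m_machine.c now derives
   have_sse3 / have_ssse3: CPUID leaf 1, ECX bits 0 and 9. */
int main(void)
{
   unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
   if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
      return 1;   /* CPUID leaf 1 not available */
   printf("sse3:  %s\n", (ecx & (1u << 0)) ? "yes" : "no");
   printf("ssse3: %s\n", (ecx & (1u << 9)) ? "yes" : "no");
   return 0;
}

The ordering constraints added to check_hwcaps mirror hardware reality: every SSSE3-capable CPU also implements SSE3, and every AVX-capable CPU implements SSSE3, so any hwcaps combination violating that is rejected outright.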