|
From: <sv...@va...> - 2014-11-23 17:31:17
|
Author: sewardj
Date: Sun Nov 23 17:31:10 2014
New Revision: 3018
Log:
Merge, from trunk, r2993
2993 arm64: implement ADDP etc
Modified:
branches/VEX_3_10_BRANCH/ (props changed)
branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
branches/VEX_3_10_BRANCH/priv/host_arm64_defs.c
branches/VEX_3_10_BRANCH/priv/host_arm64_defs.h
branches/VEX_3_10_BRANCH/priv/host_arm64_isel.c
Modified: branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c (original)
+++ branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c Sun Nov 23 17:31:10 2014
@@ -29,48 +29,21 @@
The GNU General Public License is contained in the file COPYING.
*/
-//ZZ /* XXXX thumb to check:
-//ZZ that all cases where putIRegT writes r15, we generate a jump.
-//ZZ
-//ZZ All uses of newTemp assign to an IRTemp and not a UInt
-//ZZ
-//ZZ For all thumb loads and stores, including VFP ones, new-ITSTATE is
-//ZZ backed out before the memory op, and restored afterwards. This
-//ZZ needs to happen even after we go uncond. (and for sure it doesn't
-//ZZ happen for VFP loads/stores right now).
-//ZZ
-//ZZ VFP on thumb: check that we exclude all r13/r15 cases that we
-//ZZ should.
-//ZZ
-//ZZ XXXX thumb to do: improve the ITSTATE-zeroing optimisation by
-//ZZ taking into account the number of insns guarded by an IT.
-//ZZ
-//ZZ remove the nasty hack, in the spechelper, of looking for Or32(...,
-//ZZ 0xE0) in as the first arg to armg_calculate_condition, and instead
-//ZZ use Slice44 as specified in comments in the spechelper.
-//ZZ
-//ZZ add specialisations for armg_calculate_flag_c and _v, as they
-//ZZ are moderately often needed in Thumb code.
-//ZZ
-//ZZ Correctness: ITSTATE handling in Thumb SVCs is wrong.
-//ZZ
-//ZZ Correctness (obscure): in m_transtab, when invalidating code
-//ZZ address ranges, invalidate up to 18 bytes after the end of the
-//ZZ range. This is because the ITSTATE optimisation at the top of
-//ZZ _THUMB_WRK below analyses up to 18 bytes before the start of any
-//ZZ given instruction, and so might depend on the invalidated area.
-//ZZ */
-//ZZ
-//ZZ /* Limitations, etc
-//ZZ
-//ZZ - pretty dodgy exception semantics for {LD,ST}Mxx and {LD,ST}RD.
-//ZZ These instructions are non-restartable in the case where the
-//ZZ transfer(s) fault.
-//ZZ
-//ZZ - SWP: the restart jump back is Ijk_Boring; it should be
-//ZZ Ijk_NoRedir but that's expensive. See comments on casLE() in
-//ZZ guest_x86_toIR.c.
-//ZZ */
+/* KNOWN LIMITATIONS 2014-Nov-16
+
+ * Correctness: FMAXNM, FMINNM are implemented the same as FMAX/FMIN.
+
+ Also, the "unordered" FP comparison is implemented as a normal FP
+ comparison.
+
+ Both should be fixed. They behave incorrectly in the presence of
+ NaNs.
+
+ * Floating multiply-add (etc) insns are split into a multiply and
+ an add, and so suffer double rounding; hence the least significant
+ mantissa bit is sometimes incorrect. Fix: use the IR
+ multiply-add IROps instead.
+*/
/* "Special" instructions.
@@ -989,6 +962,26 @@
return ops[size];
}
+static IROp mkVecADDF ( UInt size ) { /* Select the vector FP add IROp for table index 'size'. */
+ const IROp ops[4]
+ = { Iop_INVALID, Iop_INVALID, Iop_Add32Fx4, Iop_Add64Fx2 }; /* 0,1 -> Iop_INVALID; 2 -> 32Fx4; 3 -> 64Fx2 */
+ vassert(size < 4); /* 'size' must be a valid index into ops[] */
+ return ops[size];
+}
+
+static IROp mkVecMAXF ( UInt size ) { /* Select the vector FP max IROp for table index 'size'. */
+ const IROp ops[4]
+ = { Iop_INVALID, Iop_INVALID, Iop_Max32Fx4, Iop_Max64Fx2 }; /* 0,1 -> Iop_INVALID; 2 -> 32Fx4; 3 -> 64Fx2 */
+ vassert(size < 4); /* 'size' must be a valid index into ops[] */
+ return ops[size];
+}
+
+static IROp mkVecMINF ( UInt size ) { /* Select the vector FP min IROp for table index 'size'. */
+ const IROp ops[4]
+ = { Iop_INVALID, Iop_INVALID, Iop_Min32Fx4, Iop_Min64Fx2 }; /* 0,1 -> Iop_INVALID; 2 -> 32Fx4; 3 -> 64Fx2 */
+ vassert(size < 4); /* 'size' must be a valid index into ops[] */
+ return ops[size];
+}
/* Generate IR to create 'arg rotated right by imm', for sane values
of 'ty' and 'imm'. */
@@ -8039,6 +8032,55 @@
}
+/* Generate IR to rearrange two vector values in a way which is useful
+ for doing S/D add-pair etc operations. There are 3 cases:
+
+ 2d: [m1 m0] [n1 n0] --> [m1 n1] [m0 n0]
+
+ 4s: [m3 m2 m1 m0] [n3 n2 n1 n0] --> [m3 m1 n3 n1] [m2 m0 n2 n0]
+
+ 2s: [m3 m2 m1 m0] [n3 n2 n1 n0] --> [0 0 m1 n1] [0 0 m0 n0]
+
+ The cases are distinguished as follows:
+ isD == True, bitQ == 1 => 2d
+ isD == False, bitQ == 1 => 4s
+ isD == False, bitQ == 0 => 2s
+ On entry *rearrL and *rearrR must be IRTemp_INVALID; both are set to new V128 temps. */
+static
+void math_REARRANGE_FOR_FLOATING_PAIRWISE (
+ /*OUT*/IRTemp* rearrL, /*OUT*/IRTemp* rearrR,
+ IRTemp vecM, IRTemp vecN, Bool isD, UInt bitQ
+ )
+{
+ vassert(rearrL && *rearrL == IRTemp_INVALID); /* output slot must be non-NULL and not yet set */
+ vassert(rearrR && *rearrR == IRTemp_INVALID); /* likewise for the second output */
+ *rearrL = newTempV128();
+ *rearrR = newTempV128();
+ if (isD) {
+ // 2d case
+ vassert(bitQ == 1); // isD with bitQ == 0 is not one of the 3 supported cases
+ assign(*rearrL, binop(Iop_InterleaveHI64x2, mkexpr(vecM), mkexpr(vecN)));
+ assign(*rearrR, binop(Iop_InterleaveLO64x2, mkexpr(vecM), mkexpr(vecN)));
+ }
+ else if (!isD && bitQ == 1) {
+ // 4s case
+ assign(*rearrL, binop(Iop_CatOddLanes32x4, mkexpr(vecM), mkexpr(vecN)));
+ assign(*rearrR, binop(Iop_CatEvenLanes32x4, mkexpr(vecM), mkexpr(vecN)));
+ } else {
+ // 2s case: build one interleaved temp, then a second one sliced from it by 8
+ vassert(!isD && bitQ == 0);
+ IRTemp m1n1m0n0 = newTempV128();
+ IRTemp m0n0m1n1 = newTempV128();
+ assign(m1n1m0n0, binop(Iop_InterleaveLO32x4,
+ mkexpr(vecM), mkexpr(vecN)));
+ assign(m0n0m1n1, triop(Iop_SliceV128,
+ mkexpr(m1n1m0n0), mkexpr(m1n1m0n0), mkU8(8))); // same temp passed as both slice inputs
+ assign(*rearrL, unop(Iop_ZeroHI64ofV128, mkexpr(m1n1m0n0))); // NOTE(review): judging by the temp names this is [0 0 m0 n0]
+ assign(*rearrR, unop(Iop_ZeroHI64ofV128, mkexpr(m0n0m1n1))); // and this [0 0 m1 n1], i.e. L/R swapped vs the header -- confirm
+ }
+}
+
+
/*------------------------------------------------------------*/
/*--- SIMD and FP instructions ---*/
/*------------------------------------------------------------*/
@@ -8931,6 +8973,26 @@
return True;
}
+ if (bitU == 1 && sz <= X01 && opcode == BITS5(0,1,1,0,1)) {
+ /* -------- 1,00,01101 FADDP s_2s -------- */
+ /* -------- 1,01,01101 FADDP d_2d -------- */
+ Bool isD = sz == X01;
+ IROp opZHI = mkVecZEROHIxxOFV128(isD ? 3 : 2);
+ IROp opADD = mkVecADDF(isD ? 3 : 2);
+ IRTemp src = newTempV128();
+ IRTemp argL = newTempV128();
+ IRTemp argR = newTempV128();
+ assign(src, getQReg128(nn));
+ assign(argL, unop(opZHI, mkexpr(src)));
+ assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src),
+ mkU8(isD ? 8 : 4))));
+ putQReg128(dd, unop(opZHI,
+ triop(opADD, mkexpr(mk_get_IR_rounding_mode()),
+ mkexpr(argL), mkexpr(argR))));
+ DIP(isD ? "faddp d%u, v%u.2d\n" : "faddp s%u, v%u.2s\n", dd, nn);
+ return True;
+ }
+
return False;
# undef INSN
}
@@ -11000,6 +11062,30 @@
return True;
}
+ if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,0,1,0)) {
+ /* -------- 1,0x,11010 FADDP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
+ Bool isD = size == X01;
+ if (bitQ == 0 && isD) return False; // implied 1d case
+ IRTemp srcN = newTempV128();
+ IRTemp srcM = newTempV128();
+ IRTemp preL = IRTemp_INVALID;
+ IRTemp preR = IRTemp_INVALID;
+ assign(srcN, getQReg128(nn));
+ assign(srcM, getQReg128(mm));
+ math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR,
+ srcM, srcN, isD, bitQ);
+ putQReg128(
+ dd, math_MAYBE_ZERO_HI64_fromE(
+ bitQ,
+ triop(mkVecADDF(isD ? 3 : 2),
+ mkexpr(mk_get_IR_rounding_mode()),
+ mkexpr(preL), mkexpr(preR))));
+ const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
+ DIP("%s %s.%s, %s.%s, %s.%s\n", "faddp",
+ nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
+ return True;
+ }
+
if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,1,1,1)) {
/* -------- 1,0x,11111 FDIV 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
Bool isD = (size & 1) == 1;
@@ -12047,6 +12133,7 @@
/* 31 28 23 21 20 15 11 9 4
000 11110 ty 1 m opcode 10 n d
The first 3 bits are really "M 0 S", but M and S are always zero.
+ Decode fields: ty, opcode
*/
# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
@@ -12059,27 +12146,38 @@
UInt nn = INSN(9,5);
UInt dd = INSN(4,0);
- if (ty <= X01 && opcode <= BITS4(0,0,1,1)) {
+ if (ty <= X01 && opcode <= BITS4(0,1,1,1)) {
/* ------- 0x,0000: FMUL d_d, s_s ------- */
/* ------- 0x,0001: FDIV d_d, s_s ------- */
/* ------- 0x,0010: FADD d_d, s_s ------- */
/* ------- 0x,0011: FSUB d_d, s_s ------- */
+ /* ------- 0x,0100: FMAX d_d, s_s ------- */
+ /* ------- 0x,0101: FMIN d_d, s_s ------- */
+ /* ------- 0x,0110: FMAXNM d_d, s_s ------- (FIXME KLUDGED) */
+ /* ------- 0x,0111: FMINNM d_d, s_s ------- (FIXME KLUDGED) */
IRType ity = ty == X00 ? Ity_F32 : Ity_F64;
IROp iop = Iop_INVALID;
const HChar* nm = "???";
switch (opcode) {
- case BITS4(0,0,0,0): nm = "fmul"; iop = mkMULF(ity); break;
- case BITS4(0,0,0,1): nm = "fdiv"; iop = mkDIVF(ity); break;
- case BITS4(0,0,1,0): nm = "fadd"; iop = mkADDF(ity); break;
- case BITS4(0,0,1,1): nm = "fsub"; iop = mkSUBF(ity); break;
+ case BITS4(0,0,0,0): nm = "fmul"; iop = mkMULF(ity); break;
+ case BITS4(0,0,0,1): nm = "fdiv"; iop = mkDIVF(ity); break;
+ case BITS4(0,0,1,0): nm = "fadd"; iop = mkADDF(ity); break;
+ case BITS4(0,0,1,1): nm = "fsub"; iop = mkSUBF(ity); break;
+ case BITS4(0,1,0,0): nm = "fmax"; iop = mkVecMAXF(ty+2); break;
+ case BITS4(0,1,0,1): nm = "fmin"; iop = mkVecMINF(ty+2); break;
+ case BITS4(0,1,1,0): nm = "fmaxnm"; iop = mkVecMAXF(ty+2); break; //!!
+ case BITS4(0,1,1,1): nm = "fminnm"; iop = mkVecMINF(ty+2); break; //!!
default: vassert(0);
}
- IRExpr* resE = triop(iop, mkexpr(mk_get_IR_rounding_mode()),
- getQRegLO(nn, ity), getQRegLO(mm, ity));
- IRTemp res = newTemp(ity);
- assign(res, resE);
- putQReg128(dd, mkV128(0));
- putQRegLO(dd, mkexpr(res));
+ if (opcode <= BITS4(0,0,1,1)) {
+ // This is really not good code. TODO: avoid width-changing
+ putQReg128(dd, mkV128(0));
+ putQRegLO(dd, triop(iop, mkexpr(mk_get_IR_rounding_mode()),
+ getQRegLO(nn, ity), getQRegLO(mm, ity)));
+ } else {
+ putQReg128(dd, unop(mkVecZEROHIxxOFV128(ty+2),
+ binop(iop, getQReg128(nn), getQReg128(mm))));
+ }
DIP("%s %s, %s, %s\n",
nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
return True;
@@ -12330,6 +12428,7 @@
|| (iop == Iop_F64toI32S && irrm == Irrm_ZERO) /* FCVTZS Wd,Dn */
|| (iop == Iop_F64toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Dn */
|| (iop == Iop_F64toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Dn */
+ || (iop == Iop_F64toI32S && irrm == Irrm_NEAREST)/* FCVT{A,N}S W,D */
/* F64toI32U */
|| (iop == Iop_F64toI32U && irrm == Irrm_ZERO) /* FCVTZU Wd,Dn */
|| (iop == Iop_F64toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Dn */
@@ -12338,7 +12437,7 @@
|| (iop == Iop_F64toI64S && irrm == Irrm_ZERO) /* FCVTZS Xd,Dn */
|| (iop == Iop_F64toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Dn */
|| (iop == Iop_F64toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Dn */
- || (iop == Iop_F64toI64S && irrm == Irrm_NEAREST) /* FCVT{A,N}S Xd,Dn */
+ || (iop == Iop_F64toI64S && irrm == Irrm_NEAREST)/* FCVT{A,N}S X,D */
/* F64toI64U */
|| (iop == Iop_F64toI64U && irrm == Irrm_ZERO) /* FCVTZU Xd,Dn */
|| (iop == Iop_F64toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Dn */
Modified: branches/VEX_3_10_BRANCH/priv/host_arm64_defs.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/host_arm64_defs.c (original)
+++ branches/VEX_3_10_BRANCH/priv/host_arm64_defs.c Sun Nov 23 17:31:10 2014
@@ -589,6 +589,10 @@
case ARM64vecb_FSUB32x4: *nm = "fsub "; *ar = "4s"; return;
case ARM64vecb_FMUL32x4: *nm = "fmul "; *ar = "4s"; return;
case ARM64vecb_FDIV32x4: *nm = "fdiv "; *ar = "4s"; return;
+ case ARM64vecb_FMAX64x2: *nm = "fmax "; *ar = "2d"; return;
+ case ARM64vecb_FMAX32x4: *nm = "fmax "; *ar = "4s"; return;
+ case ARM64vecb_FMIN64x2: *nm = "fmin "; *ar = "2d"; return;
+ case ARM64vecb_FMIN32x4: *nm = "fmin "; *ar = "4s"; return;
case ARM64vecb_UMAX32x4: *nm = "umax "; *ar = "4s"; return;
case ARM64vecb_UMAX16x8: *nm = "umax "; *ar = "8h"; return;
case ARM64vecb_UMAX8x16: *nm = "umax "; *ar = "16b"; return;
@@ -4054,6 +4058,11 @@
011 01110 01 1 m 111111 n d FDIV Vd.2d, Vn.2d, Vm.2d
011 01110 00 1 m 111111 n d FDIV Vd.4s, Vn.4s, Vm.4s
+ 010 01110 01 1 m 111101 n d FMAX Vd.2d, Vn.2d, Vm.2d
+ 010 01110 00 1 m 111101 n d FMAX Vd.4s, Vn.4s, Vm.4s
+ 010 01110 11 1 m 111101 n d FMIN Vd.2d, Vn.2d, Vm.2d
+ 010 01110 10 1 m 111101 n d FMIN Vd.4s, Vn.4s, Vm.4s
+
011 01110 10 1 m 011001 n d UMAX Vd.4s, Vn.4s, Vm.4s
011 01110 01 1 m 011001 n d UMAX Vd.8h, Vn.8h, Vm.8h
011 01110 00 1 m 011001 n d UMAX Vd.16b, Vn.16b, Vm.16b
@@ -4230,6 +4239,19 @@
*p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X111111, vN, vD);
break;
+ case ARM64vecb_FMAX64x2:
+ *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X111101, vN, vD);
+ break;
+ case ARM64vecb_FMAX32x4:
+ *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X111101, vN, vD);
+ break;
+ case ARM64vecb_FMIN64x2:
+ *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X111101, vN, vD);
+ break;
+ case ARM64vecb_FMIN32x4:
+ *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X111101, vN, vD);
+ break;
+
case ARM64vecb_UMAX32x4:
*p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X011001, vN, vD);
break;
Modified: branches/VEX_3_10_BRANCH/priv/host_arm64_defs.h
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/host_arm64_defs.h (original)
+++ branches/VEX_3_10_BRANCH/priv/host_arm64_defs.h Sun Nov 23 17:31:10 2014
@@ -317,6 +317,8 @@
ARM64vecb_FSUB64x2, ARM64vecb_FSUB32x4,
ARM64vecb_FMUL64x2, ARM64vecb_FMUL32x4,
ARM64vecb_FDIV64x2, ARM64vecb_FDIV32x4,
+ ARM64vecb_FMAX64x2, ARM64vecb_FMAX32x4,
+ ARM64vecb_FMIN64x2, ARM64vecb_FMIN32x4,
ARM64vecb_UMAX32x4,
ARM64vecb_UMAX16x8, ARM64vecb_UMAX8x16,
ARM64vecb_UMIN32x4,
Modified: branches/VEX_3_10_BRANCH/priv/host_arm64_isel.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/host_arm64_isel.c (original)
+++ branches/VEX_3_10_BRANCH/priv/host_arm64_isel.c Sun Nov 23 17:31:10 2014
@@ -2405,6 +2405,8 @@
case Iop_Rsh32Sx4: case Iop_Rsh64Sx2:
case Iop_Rsh8Ux16: case Iop_Rsh16Ux8:
case Iop_Rsh32Ux4: case Iop_Rsh64Ux2:
+ case Iop_Max64Fx2: case Iop_Max32Fx4:
+ case Iop_Min64Fx2: case Iop_Min32Fx4:
{
HReg res = newVRegV(env);
HReg argL = iselV128Expr(env, e->Iex.Binop.arg1);
@@ -2522,6 +2524,10 @@
case Iop_Rsh16Ux8: op = ARM64vecb_URSHL16x8; break;
case Iop_Rsh32Ux4: op = ARM64vecb_URSHL32x4; break;
case Iop_Rsh64Ux2: op = ARM64vecb_URSHL64x2; break;
+ case Iop_Max64Fx2: op = ARM64vecb_FMAX64x2; break;
+ case Iop_Max32Fx4: op = ARM64vecb_FMAX32x4; break;
+ case Iop_Min64Fx2: op = ARM64vecb_FMIN64x2; break;
+ case Iop_Min32Fx4: op = ARM64vecb_FMIN32x4; break;
default: vassert(0);
}
if (sw) {
|