|
From: <sv...@va...> - 2014-07-22 09:27:57
|
Author: sewardj
Date: Tue Jul 22 09:27:49 2014
New Revision: 2909
Log:
arm64: implement:
{sqdmlal,sqdmlsl,sqdmull}{d_s_s[],s_h_h[]}
{sqdmlal,sqdmlsl,sqdmull}{d_s_s,s_h_h}
{sqdmlal,sqdmlsl,sqdmull}{2d_(2s_2s)/(4s_4s), 4s_(4h_4h)/(8h_8h)}
sqrdmulh 4s,2s,8h,4h (vector)
Modified:
trunk/priv/guest_arm64_toIR.c
trunk/priv/host_arm64_defs.c
trunk/priv/host_arm64_defs.h
trunk/priv/host_arm64_isel.c
Modified: trunk/priv/guest_arm64_toIR.c
==============================================================================
--- trunk/priv/guest_arm64_toIR.c (original)
+++ trunk/priv/guest_arm64_toIR.c Tue Jul 22 09:27:49 2014
@@ -392,7 +392,6 @@
*t2 = newTempV128();
}
-/* Initialise V128 temporaries en masse. */
static
void newTempsV128_3(IRTemp* t1, IRTemp* t2, IRTemp* t3)
{
@@ -404,6 +403,19 @@
*t3 = newTempV128();
}
+//static
+//void newTempsV128_4(IRTemp* t1, IRTemp* t2, IRTemp* t3, IRTemp* t4)
+//{
+// vassert(t1 && *t1 == IRTemp_INVALID);
+// vassert(t2 && *t2 == IRTemp_INVALID);
+// vassert(t3 && *t3 == IRTemp_INVALID);
+// vassert(t4 && *t4 == IRTemp_INVALID);
+// *t1 = newTempV128();
+// *t2 = newTempV128();
+// *t3 = newTempV128();
+// *t4 = newTempV128();
+//}
+
static
void newTempsV128_7(IRTemp* t1, IRTemp* t2, IRTemp* t3,
IRTemp* t4, IRTemp* t5, IRTemp* t6, IRTemp* t7)
@@ -782,6 +794,20 @@
}
}
+static IROp mkVecQDMULHIS ( UInt size ) {
+ const IROp ops[4]
+ = { Iop_INVALID, Iop_QDMulHi16Sx8, Iop_QDMulHi32Sx4, Iop_INVALID };
+ vassert(size < 4);
+ return ops[size];
+}
+
+static IROp mkVecQRDMULHIS ( UInt size ) {
+ const IROp ops[4]
+ = { Iop_INVALID, Iop_QRDMulHi16Sx8, Iop_QRDMulHi32Sx4, Iop_INVALID };
+ vassert(size < 4);
+ return ops[size];
+}
+
/* Generate IR to create 'arg rotated right by imm', for sane values
of 'ty' and 'imm'. */
static IRTemp mathROR ( IRType ty, IRTemp arg, UInt imm )
@@ -6362,22 +6388,105 @@
}
+/* Generate IR for widening signed vector multiplies. The operands
+ have their lane width signedly widened, and they are then multiplied
+ at the wider width, returning results in two new IRTemps. */
+static
+void math_MULLS ( /*OUT*/IRTemp* resHI, /*OUT*/IRTemp* resLO,
+ UInt sizeNarrow, IRTemp argL, IRTemp argR )
+{
+ vassert(sizeNarrow <= 2);
+ newTempsV128_2(resHI, resLO);
+ IRTemp argLhi = newTemp(Ity_I64);
+ IRTemp argLlo = newTemp(Ity_I64);
+ IRTemp argRhi = newTemp(Ity_I64);
+ IRTemp argRlo = newTemp(Ity_I64);
+ assign(argLhi, unop(Iop_V128HIto64, mkexpr(argL)));
+ assign(argLlo, unop(Iop_V128to64, mkexpr(argL)));
+ assign(argRhi, unop(Iop_V128HIto64, mkexpr(argR)));
+ assign(argRlo, unop(Iop_V128to64, mkexpr(argR)));
+ IROp opMulls = mkVecMULLS(sizeNarrow);
+ assign(*resHI, binop(opMulls, mkexpr(argLhi), mkexpr(argRhi)));
+ assign(*resLO, binop(opMulls, mkexpr(argLlo), mkexpr(argRlo)));
+}
+
+
+static
+void math_SQDMULH ( /*OUT*/IRTemp* res,
+ /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
+ Bool isR, UInt size, IRTemp vN, IRTemp vM )
+{
+ vassert(size == X01 || size == X10); /* s or h only */
+
+ newTempsV128_3(res, sat1q, sat1n);
+
+ IRTemp mullsHI = IRTemp_INVALID, mullsLO = IRTemp_INVALID;
+ math_MULLS(&mullsHI, &mullsLO, size, vN, vM);
+
+ IRTemp addWide = mkVecADD(size+1);
+
+ if (isR) {
+ assign(*sat1q, binop(mkVecQRDMULHIS(size), mkexpr(vN), mkexpr(vM)));
+
+ Int rcShift = size == X01 ? 15 : 31;
+ IRTemp roundConst = math_VEC_DUP_IMM(size+1, 1ULL << rcShift);
+ assign(*sat1n,
+ binop(mkVecCATODDLANES(size),
+ binop(addWide,
+ binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
+ mkexpr(roundConst)),
+ binop(addWide,
+ binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO)),
+ mkexpr(roundConst))));
+ } else {
+ assign(*sat1q, binop(mkVecQDMULHIS(size), mkexpr(vN), mkexpr(vM)));
+
+ assign(*sat1n,
+ binop(mkVecCATODDLANES(size),
+ binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
+ binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO))));
+ }
+
+ assign(*res, mkexpr(*sat1q));
+}
+
+
/* QCFLAG tracks the SIMD sticky saturation status. Update the status
- thusly: if |qres| and |nres| hold the same value, leave QCFLAG
- unchanged. Otherwise, set it (implicitly) to 1. */
+ thusly: if, after application of |opZHI| to both |qres| and |nres|,
+ they have the same value, leave QCFLAG unchanged. Otherwise, set it
+ (implicitly) to 1. |opZHI| may only be one of the Iop_ZeroHIxxofV128
+ operators, or Iop_INVALID, in which case |qres| and |nres| are used
+ unmodified. The presence of |opZHI| means this function can be used to
+ generate QCFLAG update code for both scalar and vector SIMD operations.
+*/
static
-void updateQCFLAGwithDifference ( IRTemp qres, IRTemp nres )
+void updateQCFLAGwithDifferenceZHI ( IRTemp qres, IRTemp nres, IROp opZHI )
{
IRTemp diff = newTempV128();
IRTemp oldQCFLAG = newTempV128();
IRTemp newQCFLAG = newTempV128();
- assign(diff, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres)));
+ if (opZHI == Iop_INVALID) {
+ assign(diff, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres)));
+ } else {
+ vassert(opZHI == Iop_ZeroHI64ofV128 || opZHI == Iop_ZeroHI96ofV128);
+ assign(diff, unop(opZHI, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres))));
+ }
assign(oldQCFLAG, IRExpr_Get(OFFB_QCFLAG, Ity_V128));
assign(newQCFLAG, binop(Iop_OrV128, mkexpr(oldQCFLAG), mkexpr(diff)));
stmt(IRStmt_Put(OFFB_QCFLAG, mkexpr(newQCFLAG)));
}
+/* A variant of updateQCFLAGwithDifferenceZHI in which |qres| and |nres|
+ are used unmodified, hence suitable for QCFLAG updates for whole-vector
+ operations. */
+static
+void updateQCFLAGwithDifference ( IRTemp qres, IRTemp nres )
+{
+ updateQCFLAGwithDifferenceZHI(qres, nres, Iop_INVALID);
+}
+
+
/*------------------------------------------------------------*/
/*--- SIMD and FP instructions ---*/
/*------------------------------------------------------------*/
@@ -7270,7 +7379,67 @@
static
Bool dis_AdvSIMD_scalar_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
{
+ /* 31 29 28 23 21 20 15 11 9 4
+ 01 U 11110 size 1 m opcode 00 n d
+ Decode fields: u,opcode
+ */
# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
+ if (INSN(31,30) != BITS2(0,1)
+ || INSN(28,24) != BITS5(1,1,1,1,0)
+ || INSN(21,21) != 1
+ || INSN(11,10) != BITS2(0,0)) {
+ return False;
+ }
+ UInt bitU = INSN(29,29);
+ UInt size = INSN(23,22);
+ UInt mm = INSN(20,16);
+ UInt opcode = INSN(15,12);
+ UInt nn = INSN(9,5);
+ UInt dd = INSN(4,0);
+ vassert(size < 4);
+
+ if (bitU == 0
+ && (opcode == BITS4(1,1,0,1)
+ || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
+ /* -------- 0,1101 SQDMULL -------- */ // 0 (ks)
+ /* -------- 0,1001 SQDMLAL -------- */ // 1
+ /* -------- 0,1011 SQDMLSL -------- */ // 2
+ /* Widens, and size refers to the narrowed lanes. */
+ UInt ks = 3;
+ switch (opcode) {
+ case BITS4(1,1,0,1): ks = 0; break;
+ case BITS4(1,0,0,1): ks = 1; break;
+ case BITS4(1,0,1,1): ks = 2; break;
+ default: vassert(0);
+ }
+ vassert(ks >= 0 && ks <= 2);
+ if (size == X00 || size == X11) return False;
+ vassert(size <= 2);
+ IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
+ vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
+ newTempsV128_3(&vecN, &vecM, &vecD);
+ assign(vecN, getQReg128(nn));
+ assign(vecM, getQReg128(mm));
+ assign(vecD, getQReg128(dd));
+ math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
+ False/*!is2*/, size, "mas"[ks],
+ vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
+ IROp opZHI = mkVecZEROHIxxOFV128(size+1);
+ putQReg128(dd, unop(opZHI, mkexpr(res)));
+ vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
+ updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
+ if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
+ updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
+ }
+ const HChar* nm = ks == 0 ? "sqdmull"
+ : (ks == 1 ? "sqdmlal" : "sqdmlsl");
+ const HChar arrNarrow = "bhsd"[size];
+ const HChar arrWide = "bhsd"[size+1];
+ DIP("%s %c%d, %c%d, %c%d\n",
+ nm, arrWide, dd, arrNarrow, nn, arrNarrow, mm);
+ return True;
+ }
+
return False;
# undef INSN
}
@@ -7523,7 +7692,84 @@
static
Bool dis_AdvSIMD_scalar_x_indexed_element(/*MB_OUT*/DisResult* dres, UInt insn)
{
+ /* 31 28 23 21 20 19 15 11 9 4
+ 01 U 11111 size L M m opcode H 0 n d
+ Decode fields are: u,size,opcode
+ M is really part of the mm register number. Individual
+ cases need to inspect L and H though.
+ */
# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
+ if (INSN(31,30) != BITS2(0,1)
+ || INSN(28,24) != BITS5(1,1,1,1,1) || INSN(10,10) !=0) {
+ return False;
+ }
+ UInt bitU = INSN(29,29);
+ UInt size = INSN(23,22);
+ UInt bitL = INSN(21,21);
+ UInt bitM = INSN(20,20);
+ UInt mmLO4 = INSN(19,16);
+ UInt opcode = INSN(15,12);
+ UInt bitH = INSN(11,11);
+ UInt nn = INSN(9,5);
+ UInt dd = INSN(4,0);
+ vassert(size < 4);
+ vassert(bitH < 2 && bitM < 2 && bitL < 2);
+
+ if (bitU == 0
+ && (opcode == BITS4(1,0,1,1)
+ || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
+ /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
+ /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
+ /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
+ /* Widens, and size refers to the narrowed lanes. */
+ UInt ks = 3;
+ switch (opcode) {
+ case BITS4(1,0,1,1): ks = 0; break;
+ case BITS4(0,0,1,1): ks = 1; break;
+ case BITS4(0,1,1,1): ks = 2; break;
+ default: vassert(0);
+ }
+ vassert(ks >= 0 && ks <= 2);
+ UInt mm = 32; // invalid
+ UInt ix = 16; // invalid
+ switch (size) {
+ case X00:
+ return False; // h_b_b[] case is not allowed
+ case X01:
+ mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
+ case X10:
+ mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
+ case X11:
+ return False; // q_d_d[] case is not allowed
+ default:
+ vassert(0);
+ }
+ vassert(mm < 32 && ix < 16);
+ IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
+ vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
+ newTempsV128_2(&vecN, &vecD);
+ assign(vecN, getQReg128(nn));
+ IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
+ assign(vecD, getQReg128(dd));
+ math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
+ False/*!is2*/, size, "mas"[ks],
+ vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
+ IROp opZHI = mkVecZEROHIxxOFV128(size+1);
+ putQReg128(dd, unop(opZHI, mkexpr(res)));
+ vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
+ updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
+ if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
+ updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
+ }
+ const HChar* nm = ks == 0 ? "sqdmull"
+ : (ks == 1 ? "sqdmlal" : "sqdmlsl");
+ const HChar arrNarrow = "bhsd"[size];
+ const HChar arrWide = "bhsd"[size+1];
+ DIP("%s %c%d, %c%d, v%d.%c[%u]\n",
+ nm, arrWide, dd, arrNarrow, nn, mm, arrNarrow, ix);
+ return True;
+ }
+
return False;
# undef INSN
}
@@ -7814,8 +8060,6 @@
/* -------- 0,0010 SSUBL{2} -------- */
/* -------- 1,0010 USUBL{2} -------- */
/* Widens, and size refers to the narrowed lanes. */
- const IROp opADD[3] = { Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 };
- const IROp opSUB[3] = { Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 };
if (size == X11) return False;
vassert(size <= 2);
Bool isU = bitU == 1;
@@ -7823,7 +8067,7 @@
IRTemp argL = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
IRTemp res = newTempV128();
- assign(res, binop(isADD ? opADD[size] : opSUB[size],
+ assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
mkexpr(argL), mkexpr(argR)));
putQReg128(dd, mkexpr(res));
const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
@@ -7966,6 +8210,48 @@
return True;
}
+ if (bitU == 0
+ && (opcode == BITS4(1,1,0,1)
+ || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
+ /* -------- 0,1101 SQDMULL{2} -------- */ // 0 (ks)
+ /* -------- 0,1001 SQDMLAL{2} -------- */ // 1
+ /* -------- 0,1011 SQDMLSL{2} -------- */ // 2
+ /* Widens, and size refers to the narrowed lanes. */
+ UInt ks = 3;
+ switch (opcode) {
+ case BITS4(1,1,0,1): ks = 0; break;
+ case BITS4(1,0,0,1): ks = 1; break;
+ case BITS4(1,0,1,1): ks = 2; break;
+ default: vassert(0);
+ }
+ vassert(ks >= 0 && ks <= 2);
+ if (size == X00 || size == X11) return False;
+ vassert(size <= 2);
+ IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
+ vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
+ newTempsV128_3(&vecN, &vecM, &vecD);
+ assign(vecN, getQReg128(nn));
+ assign(vecM, getQReg128(mm));
+ assign(vecD, getQReg128(dd));
+ math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
+ is2, size, "mas"[ks],
+ vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
+ putQReg128(dd, mkexpr(res));
+ vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
+ updateQCFLAGwithDifference(sat1q, sat1n);
+ if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
+ updateQCFLAGwithDifference(sat2q, sat2n);
+ }
+ const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
+ const HChar* arrWide = nameArr_Q_SZ(1, size+1);
+ const HChar* nm = ks == 0 ? "sqdmull"
+ : (ks == 1 ? "sqdmlal" : "sqdmlsl");
+ DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
+ nameQReg128(dd), arrWide,
+ nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
+ return True;
+ }
+
if (bitU == 0 && opcode == BITS4(1,1,1,0)) {
/* -------- 0,1110 PMULL{2} -------- */
/* Widens, and size refers to the narrowed lanes. */
@@ -8351,6 +8637,27 @@
return True;
}
+ if (opcode == BITS5(1,0,1,1,0)) {
+ /* -------- 0,xx,10110 SQDMULH s and h variants only -------- */
+ /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
+ if (size == X00 || size == X11) return False;
+ Bool isR = bitU == 1;
+ IRTemp res, sat1q, sat1n, vN, vM;
+ res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
+ newTempsV128_2(&vN, &vM);
+ assign(vN, getQReg128(nn));
+ assign(vM, getQReg128(mm));
+ math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
+ putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
+ IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
+ updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
+ const HChar* arr = nameArr_Q_SZ(bitQ, size);
+ const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
+ DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
+ nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
+ return True;
+ }
+
if (bitU == 0 && opcode == BITS5(1,0,1,1,1)) {
/* -------- 0,xx,10111 ADDP std7_std7_std7 -------- */
if (bitQ == 0 && size == X11) return False; // implied 1d case
@@ -9080,7 +9387,7 @@
if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
updateQCFLAGwithDifference(sat2q, sat2n);
}
- const HChar* nm = ks == 0 ? "sqmull"
+ const HChar* nm = ks == 0 ? "sqdmull"
: (ks == 1 ? "sqdmlal" : "sqdmlsl");
const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
const HChar* arrWide = nameArr_Q_SZ(1, size+1);
Modified: trunk/priv/host_arm64_defs.c
==============================================================================
--- trunk/priv/host_arm64_defs.c (original)
+++ trunk/priv/host_arm64_defs.c Tue Jul 22 09:27:49 2014
@@ -947,6 +947,10 @@
case ARM64vecb_UQSUB8x16: *nm = "uqsub"; *ar = "16b"; return;
case ARM64vecb_SQDMULL2DSS: *nm = "sqdmull"; *ar = "2dss"; return;
case ARM64vecb_SQDMULL4SHH: *nm = "sqdmull"; *ar = "4shh"; return;
+ case ARM64vecb_SQDMULH32x4: *nm = "sqdmulh"; *ar = "4s"; return;
+ case ARM64vecb_SQDMULH16x8: *nm = "sqdmulh"; *ar = "8h"; return;
+ case ARM64vecb_SQRDMULH32x4: *nm = "sqrdmulh"; *ar = "4s"; return;
+ case ARM64vecb_SQRDMULH16x8: *nm = "sqrdmulh"; *ar = "8h"; return;
default: vpanic("showARM64VecBinOp");
}
}
@@ -3506,6 +3510,7 @@
#define X100101 BITS8(0,0, 1,0,0,1,0,1)
#define X100110 BITS8(0,0, 1,0,0,1,1,0)
#define X100111 BITS8(0,0, 1,0,0,1,1,1)
+#define X101101 BITS8(0,0, 1,0,1,1,0,1)
#define X101110 BITS8(0,0, 1,0,1,1,1,0)
#define X110000 BITS8(0,0, 1,1,0,0,0,0)
#define X110001 BITS8(0,0, 1,1,0,0,0,1)
@@ -5195,6 +5200,11 @@
000 01110 10 1 m 110100 n d SQDMULL Vd.2d, Vn.2s, Vm.2s
000 01110 01 1 m 110100 n d SQDMULL Vd.4s, Vn.4h, Vm.4h
+
+ 010 01110 10 1 m 101101 n d SQDMULH Vd.4s, Vn.4s, Vm.4s
+ 010 01110 01 1 m 101101 n d SQDMULH Vd.8h, Vn.8h, Vm.8h
+ 011 01110 10 1 m 101101 n d SQRDMULH Vd.4s, Vn.4s, Vm.4s
+ 011 01110 01 1 m 101101 n d SQRDMULH Vd.8h, Vn.8h, Vm.8h
*/
UInt vD = qregNo(i->ARM64in.VBinV.dst);
UInt vN = qregNo(i->ARM64in.VBinV.argL);
@@ -5505,6 +5515,19 @@
*p++ = X_3_8_5_6_5_5(X000, X01110011, vM, X110100, vN, vD);
break;
+ case ARM64vecb_SQDMULH32x4:
+ *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X101101, vN, vD);
+ break;
+ case ARM64vecb_SQDMULH16x8:
+ *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X101101, vN, vD);
+ break;
+ case ARM64vecb_SQRDMULH32x4:
+ *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X101101, vN, vD);
+ break;
+ case ARM64vecb_SQRDMULH16x8:
+ *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X101101, vN, vD);
+ break;
+
default:
goto bad;
}
Modified: trunk/priv/host_arm64_defs.h
==============================================================================
--- trunk/priv/host_arm64_defs.h (original)
+++ trunk/priv/host_arm64_defs.h Tue Jul 22 09:27:49 2014
@@ -360,7 +360,10 @@
ARM64vecb_UQSUB16x8, ARM64vecb_UQSUB8x16,
ARM64vecb_SQDMULL2DSS,
ARM64vecb_SQDMULL4SHH,
-
+ ARM64vecb_SQDMULH32x4,
+ ARM64vecb_SQDMULH16x8,
+ ARM64vecb_SQRDMULH32x4,
+ ARM64vecb_SQRDMULH16x8,
ARM64vecb_INVALID
}
ARM64VecBinOp;
Modified: trunk/priv/host_arm64_isel.c
==============================================================================
--- trunk/priv/host_arm64_isel.c (original)
+++ trunk/priv/host_arm64_isel.c Tue Jul 22 09:27:49 2014
@@ -4951,6 +4951,8 @@
case Iop_QSub16Sx8: case Iop_QSub8Sx16:
case Iop_QSub64Ux2: case Iop_QSub32Ux4:
case Iop_QSub16Ux8: case Iop_QSub8Ux16:
+ case Iop_QDMulHi32Sx4: case Iop_QDMulHi16Sx8:
+ case Iop_QRDMulHi32Sx4: case Iop_QRDMulHi16Sx8:
{
HReg res = newVRegV(env);
HReg argL = iselV128Expr(env, e->Iex.Binop.arg1);
@@ -5032,22 +5034,26 @@
case Iop_InterleaveLO8x16: op = ARM64vecb_ZIP18x16; sw = True;
break;
case Iop_PolynomialMul8x16: op = ARM64vecb_PMUL8x16; break;
- case Iop_QAdd64Sx2: op = ARM64vecb_SQADD64x2; break;
- case Iop_QAdd32Sx4: op = ARM64vecb_SQADD32x4; break;
- case Iop_QAdd16Sx8: op = ARM64vecb_SQADD16x8; break;
- case Iop_QAdd8Sx16: op = ARM64vecb_SQADD8x16; break;
- case Iop_QAdd64Ux2: op = ARM64vecb_UQADD64x2; break;
- case Iop_QAdd32Ux4: op = ARM64vecb_UQADD32x4; break;
- case Iop_QAdd16Ux8: op = ARM64vecb_UQADD16x8; break;
- case Iop_QAdd8Ux16: op = ARM64vecb_UQADD8x16; break;
- case Iop_QSub64Sx2: op = ARM64vecb_SQSUB64x2; break;
- case Iop_QSub32Sx4: op = ARM64vecb_SQSUB32x4; break;
- case Iop_QSub16Sx8: op = ARM64vecb_SQSUB16x8; break;
- case Iop_QSub8Sx16: op = ARM64vecb_SQSUB8x16; break;
- case Iop_QSub64Ux2: op = ARM64vecb_UQSUB64x2; break;
- case Iop_QSub32Ux4: op = ARM64vecb_UQSUB32x4; break;
- case Iop_QSub16Ux8: op = ARM64vecb_UQSUB16x8; break;
- case Iop_QSub8Ux16: op = ARM64vecb_UQSUB8x16; break;
+ case Iop_QAdd64Sx2: op = ARM64vecb_SQADD64x2; break;
+ case Iop_QAdd32Sx4: op = ARM64vecb_SQADD32x4; break;
+ case Iop_QAdd16Sx8: op = ARM64vecb_SQADD16x8; break;
+ case Iop_QAdd8Sx16: op = ARM64vecb_SQADD8x16; break;
+ case Iop_QAdd64Ux2: op = ARM64vecb_UQADD64x2; break;
+ case Iop_QAdd32Ux4: op = ARM64vecb_UQADD32x4; break;
+ case Iop_QAdd16Ux8: op = ARM64vecb_UQADD16x8; break;
+ case Iop_QAdd8Ux16: op = ARM64vecb_UQADD8x16; break;
+ case Iop_QSub64Sx2: op = ARM64vecb_SQSUB64x2; break;
+ case Iop_QSub32Sx4: op = ARM64vecb_SQSUB32x4; break;
+ case Iop_QSub16Sx8: op = ARM64vecb_SQSUB16x8; break;
+ case Iop_QSub8Sx16: op = ARM64vecb_SQSUB8x16; break;
+ case Iop_QSub64Ux2: op = ARM64vecb_UQSUB64x2; break;
+ case Iop_QSub32Ux4: op = ARM64vecb_UQSUB32x4; break;
+ case Iop_QSub16Ux8: op = ARM64vecb_UQSUB16x8; break;
+ case Iop_QSub8Ux16: op = ARM64vecb_UQSUB8x16; break;
+ case Iop_QDMulHi32Sx4: op = ARM64vecb_SQDMULH32x4; break;
+ case Iop_QDMulHi16Sx8: op = ARM64vecb_SQDMULH16x8; break;
+ case Iop_QRDMulHi32Sx4: op = ARM64vecb_SQRDMULH32x4; break;
+ case Iop_QRDMulHi16Sx8: op = ARM64vecb_SQRDMULH16x8; break;
default: vassert(0);
}
if (sw) {
|
|
From: Florian K. <fl...@ei...> - 2014-07-22 09:44:32
|
On 22.07.2014 11:27, sv...@va... wrote:
>
> +static IROp mkVecQDMULHIS ( UInt size ) {
> + const IROp ops[4]
> + = { Iop_INVALID, Iop_QDMulHi16Sx8, Iop_QDMulHi32Sx4, Iop_INVALID };
> + vassert(size < 4);
> + return ops[size];
> +}
If it's possible that more IROps are added here in the future, then the
following would be better as it wouldn't need adjustment of the magic
constant 4:
const IROp ops[] // let the compiler figure out # elements
= { Iop_INVALID, Iop_QDMulHi16Sx8, Iop_QDMulHi32Sx4, Iop_INVALID };
vassert(size < sizeof ops);
I also noticed that at the call site a return value of Iop_INVALID would
cause an IRExpr of that kind to be created which would cause some
indigestion downstream. Perhaps that value should not be returned here ?
> +
> +static IROp mkVecQRDMULHIS ( UInt size ) {
> + const IROp ops[4]
> + = { Iop_INVALID, Iop_QRDMulHi16Sx8, Iop_QRDMulHi32Sx4, Iop_INVALID };
> + vassert(size < 4);
> + return ops[size];
Likewise.
Florian
|
|
From: Julian S. <js...@ac...> - 2014-07-22 16:21:38
|
> const IROp ops[] // let the compiler figure out # elements
> = { Iop_INVALID, Iop_QDMulHi16Sx8, Iop_QDMulHi32Sx4, Iop_INVALID };
> vassert(size < sizeof ops);
That's neater. Although .. shouldn't that be:
size < sizeof(ops)/sizeof(ops[0]) ?
> I also noticed that at the call site a return value of Iop_INVALID would
> cause an IRExpr of that kind to be created which would cause some
> indigestion downstream. Perhaps that value should not be returned here ?
The IR sanity checker would eventually jump on it. But in fact that
wouldn't happen because the call sites are very careful to only call
with valid size values, in this case 1 or 2. So there are two levels
of protection.
J
|
|
From: Florian K. <fl...@ei...> - 2014-07-22 17:54:23
|
On 22.07.2014 18:21, Julian Seward wrote:
>
>> const IROp ops[] // let the compiler figure out # elements
>> = { Iop_INVALID, Iop_QDMulHi16Sx8, Iop_QDMulHi32Sx4, Iop_INVALID };
>> vassert(size < sizeof ops);
>
> That's neater. Although .. shouldn't that be:
> size < sizeof(ops)/sizeof(ops[0]) ?
You're absolutely right.
>
>> I also noticed that at the call site a return value of Iop_INVALID would
>> cause an IRExpr of that kind to be created which would cause some
>> indigestion downstream. Perhaps that value should not be returned here ?
>
> The IR sanity checker would eventually jump on it. But in fact that
> wouldn't happen because the call sites are very careful to only call
> with valid size values, in this case 1 or 2. So there are two levels
> of protection.
All true. What I stumbled upon was: why does the ops array contain the
Iop_INVALID value in the first place? Given that you never want to
return it.
Florian
|
|
From: Julian S. <js...@ac...> - 2014-07-22 20:55:51
|
> What I stumbled upon was: why does the ops array contain the > Iop_INVALID value in the first place? Given that you never want to > return it. The arm64 simd instructions operate on 128-bit registers. Pretty much all of the instructions contain a two-bit "size" field indicating the lane configuration, like this 00 16 lanes of 8 bits (8x16) 01 8 lanes of 16 bits (16x8) 10 4 lanes of 32 bits (32x4) 11 2 lanes of 64 bits (64x2) and it is convenient to simply take the field and pass it to one of the mkVecXXX functions, to get an IR primitive with the correct laneage. The problem is that not all instructions support all lane formats -- for example add supports all formats, but mul does not support 64x2, and that is reflected by Iop_INVALID in the last entry for mkVecMUL. Other instructions don't support the narrowest configuration (00). Removing the leading Iop_INVALID array entries in these cases (eg, mkVecQDMULLS) would require the caller subtracting 1 from the size field in the instruction, which is kind of inconvenient and inconsistent. How it is at the moment, the mkVecXXX functions have a uniform interface in that you just give them a "size" value without regard to which values are actually valid, and if you give it something invalid then you get back Iop_INVALID. In fact the callers of the mkVecXXX functions are very careful to throw out size values that are not supported by the hardware, so that the front end doesn't accidentally accept non-existent instructions. So in fact there should be no calls that really result in Iop_INVALID being returned. J |