You can subscribe to this list here.
| 2002 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
(1) |
Oct
(122) |
Nov
(152) |
Dec
(69) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2003 |
Jan
(6) |
Feb
(25) |
Mar
(73) |
Apr
(82) |
May
(24) |
Jun
(25) |
Jul
(10) |
Aug
(11) |
Sep
(10) |
Oct
(54) |
Nov
(203) |
Dec
(182) |
| 2004 |
Jan
(307) |
Feb
(305) |
Mar
(430) |
Apr
(312) |
May
(187) |
Jun
(342) |
Jul
(487) |
Aug
(637) |
Sep
(336) |
Oct
(373) |
Nov
(441) |
Dec
(210) |
| 2005 |
Jan
(385) |
Feb
(480) |
Mar
(636) |
Apr
(544) |
May
(679) |
Jun
(625) |
Jul
(810) |
Aug
(838) |
Sep
(634) |
Oct
(521) |
Nov
(965) |
Dec
(543) |
| 2006 |
Jan
(494) |
Feb
(431) |
Mar
(546) |
Apr
(411) |
May
(406) |
Jun
(322) |
Jul
(256) |
Aug
(401) |
Sep
(345) |
Oct
(542) |
Nov
(308) |
Dec
(481) |
| 2007 |
Jan
(427) |
Feb
(326) |
Mar
(367) |
Apr
(255) |
May
(244) |
Jun
(204) |
Jul
(223) |
Aug
(231) |
Sep
(354) |
Oct
(374) |
Nov
(497) |
Dec
(362) |
| 2008 |
Jan
(322) |
Feb
(482) |
Mar
(658) |
Apr
(422) |
May
(476) |
Jun
(396) |
Jul
(455) |
Aug
(267) |
Sep
(280) |
Oct
(253) |
Nov
(232) |
Dec
(304) |
| 2009 |
Jan
(486) |
Feb
(470) |
Mar
(458) |
Apr
(423) |
May
(696) |
Jun
(461) |
Jul
(551) |
Aug
(575) |
Sep
(134) |
Oct
(110) |
Nov
(157) |
Dec
(102) |
| 2010 |
Jan
(226) |
Feb
(86) |
Mar
(147) |
Apr
(117) |
May
(107) |
Jun
(203) |
Jul
(193) |
Aug
(238) |
Sep
(300) |
Oct
(246) |
Nov
(23) |
Dec
(75) |
| 2011 |
Jan
(133) |
Feb
(195) |
Mar
(315) |
Apr
(200) |
May
(267) |
Jun
(293) |
Jul
(353) |
Aug
(237) |
Sep
(278) |
Oct
(611) |
Nov
(274) |
Dec
(260) |
| 2012 |
Jan
(303) |
Feb
(391) |
Mar
(417) |
Apr
(441) |
May
(488) |
Jun
(655) |
Jul
(590) |
Aug
(610) |
Sep
(526) |
Oct
(478) |
Nov
(359) |
Dec
(372) |
| 2013 |
Jan
(467) |
Feb
(226) |
Mar
(391) |
Apr
(281) |
May
(299) |
Jun
(252) |
Jul
(311) |
Aug
(352) |
Sep
(481) |
Oct
(571) |
Nov
(222) |
Dec
(231) |
| 2014 |
Jan
(185) |
Feb
(329) |
Mar
(245) |
Apr
(238) |
May
(281) |
Jun
(399) |
Jul
(382) |
Aug
(500) |
Sep
(579) |
Oct
(435) |
Nov
(487) |
Dec
(256) |
| 2015 |
Jan
(338) |
Feb
(357) |
Mar
(330) |
Apr
(294) |
May
(191) |
Jun
(108) |
Jul
(142) |
Aug
(261) |
Sep
(190) |
Oct
(54) |
Nov
(83) |
Dec
(22) |
| 2016 |
Jan
(49) |
Feb
(89) |
Mar
(33) |
Apr
(50) |
May
(27) |
Jun
(34) |
Jul
(53) |
Aug
(53) |
Sep
(98) |
Oct
(206) |
Nov
(93) |
Dec
(53) |
| 2017 |
Jan
(65) |
Feb
(82) |
Mar
(102) |
Apr
(86) |
May
(187) |
Jun
(67) |
Jul
(23) |
Aug
(93) |
Sep
(65) |
Oct
(45) |
Nov
(35) |
Dec
(17) |
| 2018 |
Jan
(26) |
Feb
(35) |
Mar
(38) |
Apr
(32) |
May
(8) |
Jun
(43) |
Jul
(27) |
Aug
(30) |
Sep
(43) |
Oct
(42) |
Nov
(38) |
Dec
(67) |
| 2019 |
Jan
(32) |
Feb
(37) |
Mar
(53) |
Apr
(64) |
May
(49) |
Jun
(18) |
Jul
(14) |
Aug
(53) |
Sep
(25) |
Oct
(30) |
Nov
(49) |
Dec
(31) |
| 2020 |
Jan
(87) |
Feb
(45) |
Mar
(37) |
Apr
(51) |
May
(99) |
Jun
(36) |
Jul
(11) |
Aug
(14) |
Sep
(20) |
Oct
(24) |
Nov
(40) |
Dec
(23) |
| 2021 |
Jan
(14) |
Feb
(53) |
Mar
(85) |
Apr
(15) |
May
(19) |
Jun
(3) |
Jul
(14) |
Aug
(1) |
Sep
(57) |
Oct
(73) |
Nov
(56) |
Dec
(22) |
| 2022 |
Jan
(3) |
Feb
(22) |
Mar
(6) |
Apr
(55) |
May
(46) |
Jun
(39) |
Jul
(15) |
Aug
(9) |
Sep
(11) |
Oct
(34) |
Nov
(20) |
Dec
(36) |
| 2023 |
Jan
(79) |
Feb
(41) |
Mar
(99) |
Apr
(169) |
May
(48) |
Jun
(16) |
Jul
(16) |
Aug
(57) |
Sep
(32) |
Oct
|
Nov
|
Dec
|
| S | M | T | W | T | F | S |
|---|---|---|---|---|---|---|
|
|
|
|
|
|
|
1
(15) |
|
2
(11) |
3
(3) |
4
(20) |
5
(16) |
6
(17) |
7
(16) |
8
(11) |
|
9
(3) |
10
(15) |
11
(16) |
12
(14) |
13
(17) |
14
(17) |
15
(18) |
|
16
(13) |
17
(17) |
18
(17) |
19
(19) |
20
(19) |
21
(14) |
22
(15) |
|
23
(66) |
24
(18) |
25
(27) |
26
(15) |
27
(12) |
28
(1) |
29
(14) |
|
30
(11) |
|
|
|
|
|
|
|
From: <sv...@va...> - 2014-11-23 17:28:25
|
Author: sewardj
Date: Sun Nov 23 17:28:18 2014
New Revision: 3016
Log:
Merge, from trunk, r2991.
340856 disInstr(arm64): unhandled instruction 0x1E634C45 (fcsel)
2991
Modified:
branches/VEX_3_10_BRANCH/ (props changed)
branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
branches/VEX_3_10_BRANCH/priv/host_arm64_defs.c
branches/VEX_3_10_BRANCH/priv/host_arm64_defs.h
branches/VEX_3_10_BRANCH/priv/host_arm64_isel.c
Modified: branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c (original)
+++ branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c Sun Nov 23 17:28:18 2014
@@ -11844,7 +11844,40 @@
static
Bool dis_AdvSIMD_fp_conditional_select(/*MB_OUT*/DisResult* dres, UInt insn)
{
+ /* 31 23 21 20 15 11 9 5
+ 000 11110 ty 1 m cond 11 n d
+ The first 3 bits are really "M 0 S", but M and S are always zero.
+ Decode fields: ty
+ */
# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
+ if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0) || INSN(21,21) != 1
+ || INSN(11,10) != BITS2(1,1)) {
+ return False;
+ }
+ UInt ty = INSN(23,22);
+ UInt mm = INSN(20,16);
+ UInt cond = INSN(15,12);
+ UInt nn = INSN(9,5);
+ UInt dd = INSN(4,0);
+ if (ty <= X01) {
+ /* -------- 00: FCSEL s_s -------- */
+ /* -------- 00: FCSEL d_d -------- */
+ IRType ity = ty == X01 ? Ity_F64 : Ity_F32;
+ IRTemp srcT = newTemp(ity);
+ IRTemp srcF = newTemp(ity);
+ IRTemp res = newTemp(ity);
+ assign(srcT, getQRegLO(nn, ity));
+ assign(srcF, getQRegLO(mm, ity));
+ assign(res, IRExpr_ITE(
+ unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
+ mkexpr(srcT), mkexpr(srcF)));
+ putQReg128(dd, mkV128(0x0000));
+ putQRegLO(dd, mkexpr(res));
+ DIP("fcsel %s, %s, %s, %s\n",
+ nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity),
+ nameCC(cond));
+ return True;
+ }
return False;
# undef INSN
}
Modified: branches/VEX_3_10_BRANCH/priv/host_arm64_defs.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/host_arm64_defs.c (original)
+++ branches/VEX_3_10_BRANCH/priv/host_arm64_defs.c Sun Nov 23 17:28:18 2014
@@ -1112,6 +1112,17 @@
i->ARM64in.VCmpS.argR = argR;
return i;
}
+ARM64Instr* ARM64Instr_VFCSel ( HReg dst, HReg argL, HReg argR,
+ ARM64CondCode cond, Bool isD ) {
+ ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr));
+ i->tag = ARM64in_VFCSel;
+ i->ARM64in.VFCSel.dst = dst;
+ i->ARM64in.VFCSel.argL = argL;
+ i->ARM64in.VFCSel.argR = argR;
+ i->ARM64in.VFCSel.cond = cond;
+ i->ARM64in.VFCSel.isD = isD;
+ return i;
+}
ARM64Instr* ARM64Instr_FPCR ( Bool toFPCR, HReg iReg ) {
ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr));
i->tag = ARM64in_FPCR;
@@ -1646,6 +1657,18 @@
vex_printf(", ");
ppHRegARM64asSreg(i->ARM64in.VCmpS.argR);
return;
+ case ARM64in_VFCSel: {
+ void (*ppHRegARM64fp)(HReg)
+ = (i->ARM64in.VFCSel.isD ? ppHRegARM64 : ppHRegARM64asSreg);
+ vex_printf("fcsel ");
+ ppHRegARM64fp(i->ARM64in.VFCSel.dst);
+ vex_printf(", ");
+ ppHRegARM64fp(i->ARM64in.VFCSel.argL);
+ vex_printf(", ");
+ ppHRegARM64fp(i->ARM64in.VFCSel.argR);
+ vex_printf(", %s", showARM64CondCode(i->ARM64in.VFCSel.cond));
+ return;
+ }
case ARM64in_FPCR:
if (i->ARM64in.FPCR.toFPCR) {
vex_printf("msr fpcr, ");
@@ -2028,6 +2051,11 @@
addHRegUse(u, HRmRead, i->ARM64in.VCmpS.argL);
addHRegUse(u, HRmRead, i->ARM64in.VCmpS.argR);
return;
+ case ARM64in_VFCSel:
+ addHRegUse(u, HRmRead, i->ARM64in.VFCSel.argL);
+ addHRegUse(u, HRmRead, i->ARM64in.VFCSel.argR);
+ addHRegUse(u, HRmWrite, i->ARM64in.VFCSel.dst);
+ return;
case ARM64in_FPCR:
if (i->ARM64in.FPCR.toFPCR)
addHRegUse(u, HRmRead, i->ARM64in.FPCR.iReg);
@@ -2256,6 +2284,11 @@
i->ARM64in.VCmpS.argL = lookupHRegRemap(m, i->ARM64in.VCmpS.argL);
i->ARM64in.VCmpS.argR = lookupHRegRemap(m, i->ARM64in.VCmpS.argR);
return;
+ case ARM64in_VFCSel:
+ i->ARM64in.VFCSel.argL = lookupHRegRemap(m, i->ARM64in.VFCSel.argL);
+ i->ARM64in.VFCSel.argR = lookupHRegRemap(m, i->ARM64in.VFCSel.argR);
+ i->ARM64in.VFCSel.dst = lookupHRegRemap(m, i->ARM64in.VFCSel.dst);
+ return;
case ARM64in_FPCR:
i->ARM64in.FPCR.iReg = lookupHRegRemap(m, i->ARM64in.FPCR.iReg);
return;
@@ -3958,6 +3991,21 @@
*p++ = X_3_8_5_6_5_5(X000, X11110001, sM, X001000, sN, X00000);
goto done;
}
+ case ARM64in_VFCSel: {
+ /* 31 23 21 20 15 11 9 5
+ 000 11110 00 1 m cond 11 n d FCSEL Sd,Sn,Sm,cond
+ 000 11110 01 1 m cond 11 n d FCSEL Dd,Dn,Dm,cond
+ */
+ Bool isD = i->ARM64in.VFCSel.isD;
+ UInt dd = dregNo(i->ARM64in.VFCSel.dst);
+ UInt nn = dregNo(i->ARM64in.VFCSel.argL);
+ UInt mm = dregNo(i->ARM64in.VFCSel.argR);
+ UInt cond = (UInt)i->ARM64in.VFCSel.cond;
+ vassert(cond < 16);
+ *p++ = X_3_8_5_6_5_5(X000, isD ? X11110011 : X11110001,
+ mm, (cond << 2) | X000011, nn, dd);
+ goto done;
+ }
case ARM64in_FPCR: {
Bool toFPCR = i->ARM64in.FPCR.toFPCR;
UInt iReg = iregNo(i->ARM64in.FPCR.iReg);
Modified: branches/VEX_3_10_BRANCH/priv/host_arm64_defs.h
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/host_arm64_defs.h (original)
+++ branches/VEX_3_10_BRANCH/priv/host_arm64_defs.h Sun Nov 23 17:28:18 2014
@@ -491,6 +491,7 @@
ARM64in_VBinS,
ARM64in_VCmpD,
ARM64in_VCmpS,
+ ARM64in_VFCSel,
ARM64in_FPCR,
ARM64in_FPSR,
/* ARM64in_V*V: vector ops on vector registers */
@@ -743,6 +744,15 @@
HReg argL;
HReg argR;
} VCmpS;
+ /* 32- or 64-bit FP conditional select */
+ struct {
+ HReg dst;
+ HReg argL;
+ HReg argR;
+ ARM64CondCode cond;
+ Bool isD;
+ }
+ VFCSel;
/* Move a 32-bit value to/from the FPCR */
struct {
Bool toFPCR;
@@ -889,6 +899,8 @@
extern ARM64Instr* ARM64Instr_VBinS ( ARM64FpBinOp op, HReg, HReg, HReg );
extern ARM64Instr* ARM64Instr_VCmpD ( HReg argL, HReg argR );
extern ARM64Instr* ARM64Instr_VCmpS ( HReg argL, HReg argR );
+extern ARM64Instr* ARM64Instr_VFCSel ( HReg dst, HReg argL, HReg argR,
+ ARM64CondCode cond, Bool isD );
extern ARM64Instr* ARM64Instr_FPCR ( Bool toFPCR, HReg iReg );
extern ARM64Instr* ARM64Instr_FPSR ( Bool toFPSR, HReg iReg );
extern ARM64Instr* ARM64Instr_VBinV ( ARM64VecBinOp op, HReg, HReg, HReg );
Modified: branches/VEX_3_10_BRANCH/priv/host_arm64_isel.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/host_arm64_isel.c (original)
+++ branches/VEX_3_10_BRANCH/priv/host_arm64_isel.c Sun Nov 23 17:28:18 2014
@@ -3067,6 +3067,17 @@
}
}
+ if (e->tag == Iex_ITE) {
+ /* ITE(ccexpr, iftrue, iffalse) */
+ ARM64CondCode cc;
+ HReg r1 = iselDblExpr(env, e->Iex.ITE.iftrue);
+ HReg r0 = iselDblExpr(env, e->Iex.ITE.iffalse);
+ HReg dst = newVRegD(env);
+ cc = iselCondCode(env, e->Iex.ITE.cond);
+ addInstr(env, ARM64Instr_VFCSel(dst, r1, r0, cc, True/*64-bit*/));
+ return dst;
+ }
+
ppIRExpr(e);
vpanic("iselDblExpr_wrk");
}
@@ -3222,6 +3233,17 @@
}
}
+ if (e->tag == Iex_ITE) {
+ /* ITE(ccexpr, iftrue, iffalse) */
+ ARM64CondCode cc;
+ HReg r1 = iselFltExpr(env, e->Iex.ITE.iftrue);
+ HReg r0 = iselFltExpr(env, e->Iex.ITE.iffalse);
+ HReg dst = newVRegD(env);
+ cc = iselCondCode(env, e->Iex.ITE.cond);
+ addInstr(env, ARM64Instr_VFCSel(dst, r1, r0, cc, False/*!64-bit*/));
+ return dst;
+ }
+
ppIRExpr(e);
vpanic("iselFltExpr_wrk");
}
|
|
From: <sv...@va...> - 2014-11-23 17:27:18
|
Author: sewardj
Date: Sun Nov 23 17:27:11 2014
New Revision: 3015
Log:
Merge, from trunk, r2990.
2990 Add detection of old ppc32 magic instructions from bug 278808.
Modified:
branches/VEX_3_10_BRANCH/ (props changed)
branches/VEX_3_10_BRANCH/priv/guest_ppc_toIR.c
Modified: branches/VEX_3_10_BRANCH/priv/guest_ppc_toIR.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/guest_ppc_toIR.c (original)
+++ branches/VEX_3_10_BRANCH/priv/guest_ppc_toIR.c Sun Nov 23 17:27:11 2014
@@ -18782,10 +18782,26 @@
UInt word2 = mode64 ? 0x78006800 : 0x5400683E;
UInt word3 = mode64 ? 0x7800E802 : 0x5400E83E;
UInt word4 = mode64 ? 0x78009802 : 0x5400983E;
+ Bool is_special_preamble = False;
if (getUIntPPCendianly(code+ 0) == word1 &&
getUIntPPCendianly(code+ 4) == word2 &&
getUIntPPCendianly(code+ 8) == word3 &&
getUIntPPCendianly(code+12) == word4) {
+ is_special_preamble = True;
+ } else if (! mode64 &&
+ getUIntPPCendianly(code+ 0) == 0x54001800 &&
+ getUIntPPCendianly(code+ 4) == 0x54006800 &&
+ getUIntPPCendianly(code+ 8) == 0x5400E800 &&
+ getUIntPPCendianly(code+12) == 0x54009800) {
+ static Bool reported = False;
+ if (!reported) {
+ vex_printf("disInstr(ppc): old ppc32 instruction magic detected. Code might clobber r0.\n");
+ vex_printf("disInstr(ppc): source needs to be recompiled against latest valgrind.h.\n");
+ reported = True;
+ }
+ is_special_preamble = True;
+ }
+ if (is_special_preamble) {
/* Got a "Special" instruction preamble. Which one is it? */
if (getUIntPPCendianly(code+16) == 0x7C210B78 /* or 1,1,1 */) {
/* %R3 = client_request ( %R4 ) */
|
|
From: <sv...@va...> - 2014-11-23 17:26:00
|
Author: sewardj
Date: Sun Nov 23 17:25:53 2014
New Revision: 3014
Log:
Merge, from trunk, r2988
340725 AVX2: Incorrect decoding of vpbroadcast{b,w} reg,reg forms
2988
Modified:
branches/VEX_3_10_BRANCH/ (props changed)
branches/VEX_3_10_BRANCH/priv/guest_amd64_toIR.c
Modified: branches/VEX_3_10_BRANCH/priv/guest_amd64_toIR.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/guest_amd64_toIR.c (original)
+++ branches/VEX_3_10_BRANCH/priv/guest_amd64_toIR.c Sun Nov 23 17:25:53 2014
@@ -28661,6 +28661,7 @@
IRTemp t8 = newTemp(Ity_I8);
if (epartIsReg(modrm)) {
UInt rE = eregOfRexRM(pfx, modrm);
+ delta++;
DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0)));
} else {
@@ -28687,6 +28688,7 @@
IRTemp t8 = newTemp(Ity_I8);
if (epartIsReg(modrm)) {
UInt rE = eregOfRexRM(pfx, modrm);
+ delta++;
DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0)));
} else {
@@ -28717,6 +28719,7 @@
IRTemp t16 = newTemp(Ity_I16);
if (epartIsReg(modrm)) {
UInt rE = eregOfRexRM(pfx, modrm);
+ delta++;
DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0)));
} else {
@@ -28741,6 +28744,7 @@
IRTemp t16 = newTemp(Ity_I16);
if (epartIsReg(modrm)) {
UInt rE = eregOfRexRM(pfx, modrm);
+ delta++;
DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0)));
} else {
|
|
From: <sv...@va...> - 2014-11-23 17:24:58
|
Author: sewardj
Date: Sun Nov 23 17:24:51 2014
New Revision: 3013
Log:
Merge, from trunk, r2987
340632 arm64: unhandled instruction fcvtas
2987
Modified:
branches/VEX_3_10_BRANCH/ (props changed)
branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
Modified: branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c (original)
+++ branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c Sun Nov 23 17:24:51 2014
@@ -12304,6 +12304,7 @@
|| (iop == Iop_F64toI64S && irrm == Irrm_ZERO) /* FCVTZS Xd,Dn */
|| (iop == Iop_F64toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Dn */
|| (iop == Iop_F64toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Dn */
+ || (iop == Iop_F64toI64S && irrm == Irrm_NEAREST) /* FCVT{A,N}S Xd,Dn */
/* F64toI64U */
|| (iop == Iop_F64toI64U && irrm == Irrm_ZERO) /* FCVTZU Xd,Dn */
|| (iop == Iop_F64toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Dn */
|
|
From: <sv...@va...> - 2014-11-23 17:23:31
|
Author: sewardj
Date: Sun Nov 23 17:23:24 2014
New Revision: 3012
Log:
Merge, from trunk, r2986
340033 arm64: unhandled instruction for dmb ishld and some other
isb-dmb-dsb variants...
2986
Modified:
branches/VEX_3_10_BRANCH/ (props changed)
branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
Modified: branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c (original)
+++ branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c Sun Nov 23 17:23:24 2014
@@ -6763,29 +6763,25 @@
}
/* ------------------ ISB, DMB, DSB ------------------ */
- if (INSN(31,0) == 0xD5033FDF) {
- stmt(IRStmt_MBE(Imbe_Fence));
- DIP("isb\n");
- return True;
- }
- if (INSN(31,0) == 0xD5033FBF) {
- stmt(IRStmt_MBE(Imbe_Fence));
- DIP("dmb sy\n");
- return True;
- }
- if (INSN(31,0) == 0xD5033BBF) {
- stmt(IRStmt_MBE(Imbe_Fence));
- DIP("dmb ish\n");
- return True;
- }
- if (INSN(31,0) == 0xD5033ABF) {
- stmt(IRStmt_MBE(Imbe_Fence));
- DIP("dmb ishst\n");
- return True;
- }
- if (INSN(31,0) == 0xD5033B9F) {
+ /* 31 21 11 7 6 4
+ 11010 10100 0 00 011 0011 CRm 1 01 11111 DMB opt
+ 11010 10100 0 00 011 0011 CRm 1 00 11111 DSB opt
+ 11010 10100 0 00 011 0011 CRm 1 10 11111 ISB opt
+ */
+ if (INSN(31,22) == BITS10(1,1,0,1,0,1,0,1,0,0)
+ && INSN(21,12) == BITS10(0,0,0,0,1,1,0,0,1,1)
+ && INSN(7,7) == 1
+ && INSN(6,5) <= BITS2(1,0) && INSN(4,0) == BITS5(1,1,1,1,1)) {
+ UInt opc = INSN(6,5);
+ UInt CRm = INSN(11,8);
+ vassert(opc <= 2 && CRm <= 15);
stmt(IRStmt_MBE(Imbe_Fence));
- DIP("dsb ish\n");
+ const HChar* opNames[3]
+ = { "dsb", "dmb", "isb" };
+ const HChar* howNames[16]
+ = { "#0", "oshld", "oshst", "osh", "#4", "nshld", "nshst", "nsh",
+ "#8", "ishld", "ishst", "ish", "#12", "ld", "st", "sy" };
+ DIP("%s %s\n", opNames[opc], howNames[CRm]);
return True;
}
|
|
From: <sv...@va...> - 2014-11-23 17:22:23
|
Author: sewardj
Date: Sun Nov 23 17:22:16 2014
New Revision: 3011
Log:
Merge, from trunk, r2985
335713 arm64: unhandled instruction: prfm (immediate)
2985
Modified:
branches/VEX_3_10_BRANCH/ (props changed)
branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
Modified: branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c (original)
+++ branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c Sun Nov 23 17:22:16 2014
@@ -6358,6 +6358,22 @@
return True;
}
+ /* ------------------ PRFM (immediate) ------------------ */
+ /* 31 21 9 4
+ 11 111 00110 imm12 n t PRFM pfrop=Rt, [Xn|SP, #pimm]
+ */
+ if (INSN(31,22) == BITS10(1,1,1,1,1,0,0,1,1,0)) {
+ UInt imm12 = INSN(21,10);
+ UInt nn = INSN(9,5);
+ UInt tt = INSN(4,0);
+ /* Generating any IR here is pointless, except for documentation
+ purposes, as it will get optimised away later. */
+ IRTemp ea = newTemp(Ity_I64);
+ assign(ea, binop(Iop_Add64, getIReg64orSP(nn), mkU64(imm12 * 8)));
+ DIP("prfm prfop=%u, [%s, #%u]\n", tt, nameIReg64orSP(nn), imm12 * 8);
+ return True;
+ }
+
vex_printf("ARM64 front end: load_store\n");
return False;
# undef INSN
|
|
From: <sv...@va...> - 2014-11-23 17:21:19
|
Author: sewardj
Date: Sun Nov 23 17:21:12 2014
New Revision: 3010
Log:
Merge, from trunk, r2984
340509 arm64: unhandled instruction fcvtas
2984
Modified:
branches/VEX_3_10_BRANCH/ (props changed)
branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
Modified: branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c (original)
+++ branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c Sun Nov 23 17:21:12 2014
@@ -12189,6 +12189,7 @@
/* 31 30 29 28 23 21 20 18 15 9 4
sf 0 0 11110 type 1 rmode opcode 000000 n d
The first 3 bits are really "sf 0 S", but S is always zero.
+ Decode fields: sf,type,rmode,opcode
*/
# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
if (INSN(30,29) != BITS2(0,0)
@@ -12205,7 +12206,7 @@
UInt dd = INSN(4,0);
// op = 000, 001
- /* -------- FCVT{N,P,M,Z}{S,U} (scalar, integer) -------- */
+ /* -------- FCVT{N,P,M,Z,A}{S,U} (scalar, integer) -------- */
/* 30 23 20 18 15 9 4
sf 00 11110 0x 1 00 000 000000 n d FCVTNS Rd, Fn (round to
sf 00 11110 0x 1 00 001 000000 n d FCVTNU Rd, Fn nearest)
@@ -12213,23 +12214,38 @@
---------------- 10 -------------- FCVTM-------- (round to -inf)
---------------- 11 -------------- FCVTZ-------- (round to zero)
+ ---------------- 00 100 ---------- FCVTAS------- (nearest, ties away)
+ ---------------- 00 101 ---------- FCVTAU------- (nearest, ties away)
+
Rd is Xd when sf==1, Wd when sf==0
Fn is Dn when x==1, Sn when x==0
20:19 carry the rounding mode, using the same encoding as FPCR
*/
- if (ty <= X01 && (op == BITS3(0,0,0) || op == BITS3(0,0,1))) {
+ if (ty <= X01
+ && ( ((op == BITS3(0,0,0) || op == BITS3(0,0,1)) && True)
+ || ((op == BITS3(1,0,0) || op == BITS3(1,0,1)) && rm == BITS2(0,0))
+ )
+ ) {
Bool isI64 = bitSF == 1;
Bool isF64 = (ty & 1) == 1;
Bool isU = (op & 1) == 1;
/* Decide on the IR rounding mode to use. */
IRRoundingMode irrm = 8; /*impossible*/
HChar ch = '?';
- switch (rm) {
- case BITS2(0,0): ch = 'n'; irrm = Irrm_NEAREST; break;
- case BITS2(0,1): ch = 'p'; irrm = Irrm_PosINF; break;
- case BITS2(1,0): ch = 'm'; irrm = Irrm_NegINF; break;
- case BITS2(1,1): ch = 'z'; irrm = Irrm_ZERO; break;
- default: vassert(0);
+ if (op == BITS3(0,0,0) || op == BITS3(0,0,1)) {
+ switch (rm) {
+ case BITS2(0,0): ch = 'n'; irrm = Irrm_NEAREST; break;
+ case BITS2(0,1): ch = 'p'; irrm = Irrm_PosINF; break;
+ case BITS2(1,0): ch = 'm'; irrm = Irrm_NegINF; break;
+ case BITS2(1,1): ch = 'z'; irrm = Irrm_ZERO; break;
+ default: vassert(0);
+ }
+ } else {
+ vassert(op == BITS3(1,0,0) || op == BITS3(1,0,1));
+ switch (rm) {
+ case BITS2(0,0): ch = 'a'; irrm = Irrm_NEAREST; break;
+ default: vassert(0);
+ }
}
vassert(irrm != 8);
/* Decide on the conversion primop, based on the source size,
@@ -12254,9 +12270,11 @@
(iop == Iop_F32toI32S && irrm == Irrm_ZERO) /* FCVTZS Wd,Sn */
|| (iop == Iop_F32toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Sn */
|| (iop == Iop_F32toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Sn */
+ || (iop == Iop_F32toI32S && irrm == Irrm_NEAREST)/* FCVT{A,N}S W,S */
/* F32toI32U */
|| (iop == Iop_F32toI32U && irrm == Irrm_ZERO) /* FCVTZU Wd,Sn */
|| (iop == Iop_F32toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Sn */
+ || (iop == Iop_F32toI32U && irrm == Irrm_NEAREST)/* FCVT{A,N}U W,S */
/* F32toI64S */
|| (iop == Iop_F32toI64S && irrm == Irrm_ZERO) /* FCVTZS Xd,Sn */
/* F32toI64U */
|
|
From: <sv...@va...> - 2014-11-23 17:20:16
|
Author: sewardj
Date: Sun Nov 23 17:20:09 2014
New Revision: 3009
Log:
Merge, from trunk, r2983
339938 disInstr(arm64): unhandled instruction 0x4F8010A4 (fmla)
== 339950
2983
Modified:
branches/VEX_3_10_BRANCH/ (props changed)
branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
Modified: branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c (original)
+++ branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c Sun Nov 23 17:20:09 2014
@@ -11458,6 +11458,41 @@
vassert(size < 4);
vassert(bitH < 2 && bitM < 2 && bitL < 2);
+ if (bitU == 0 && size >= X10
+ && (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,1,0,1))) {
+ /* -------- 0,1x,0001 FMLA 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
+ /* -------- 0,1x,0101 FMLS 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
+ if (bitQ == 0 && size == X11) return False; // implied 1d case
+ Bool isD = (size & 1) == 1;
+ Bool isSUB = opcode == BITS4(0,1,0,1);
+ UInt index;
+ if (!isD) index = (bitH << 1) | bitL;
+ else if (isD && bitL == 0) index = bitH;
+ else return False; // sz:L == x11 => unallocated encoding
+ vassert(index < (isD ? 2 : 4));
+ IRType ity = isD ? Ity_F64 : Ity_F32;
+ IRTemp elem = newTemp(ity);
+ UInt mm = (bitM << 4) | mmLO4;
+ assign(elem, getQRegLane(mm, index, ity));
+ IRTemp dupd = math_DUP_TO_V128(elem, ity);
+ IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
+ IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
+ IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
+ IRTemp rm = mk_get_IR_rounding_mode();
+ IRTemp t1 = newTempV128();
+ IRTemp t2 = newTempV128();
+ // FIXME: double rounding; use FMA primops instead
+ assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
+ assign(t2, triop(isSUB ? opSUB : opADD,
+ mkexpr(rm), getQReg128(dd), mkexpr(t1)));
+ putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
+ const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
+ DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
+ nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm),
+ isD ? 'd' : 's', index);
+ return True;
+ }
+
if (bitU == 0 && size >= X10 && opcode == BITS4(1,0,0,1)) {
/* -------- 0,1x,1001 FMUL 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
if (bitQ == 0 && size == X11) return False; // implied 1d case
|
|
From: <sv...@va...> - 2014-11-23 17:18:53
|
Author: sewardj
Date: Sun Nov 23 17:18:46 2014
New Revision: 3008
Log:
Merge, from trunk, r2982
339927 Unhandled instruction 0x9E7100C6 (fcvtmu) on aarch64
2982
Modified:
branches/VEX_3_10_BRANCH/ (props changed)
branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
Modified: branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c (original)
+++ branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c Sun Nov 23 17:18:46 2014
@@ -12241,6 +12241,7 @@
|| (iop == Iop_F64toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Dn */
/* F64toI64U */
|| (iop == Iop_F64toI64U && irrm == Irrm_ZERO) /* FCVTZU Xd,Dn */
+ || (iop == Iop_F64toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Dn */
|| (iop == Iop_F64toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Dn */
) {
/* validated */
|
|
From: <sv...@va...> - 2014-11-23 17:17:47
|
Author: sewardj
Date: Sun Nov 23 17:17:39 2014
New Revision: 3007
Log:
Merge, from trunk, r2981
339926 Unhandled instruction 0x1E674001 (frintx) on aarch64
2981
Modified:
branches/VEX_3_10_BRANCH/ (props changed)
branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
Modified: branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c (original)
+++ branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c Sun Nov 23 17:17:39 2014
@@ -11921,7 +11921,7 @@
011 zero (FRINTZ)
000 tieeven
100 tieaway (FRINTA) -- !! FIXME KLUDGED !!
- 110 per FPCR + "exact = TRUE"
+ 110 per FPCR + "exact = TRUE" (FRINTX)
101 unallocated
*/
Bool isD = (ty & 1) == 1;
@@ -11935,6 +11935,10 @@
case BITS3(0,0,1): ch = 'p'; irrmE = mkU32(Irrm_PosINF); break;
// The following is a kludge. Should be: Irrm_NEAREST_TIE_AWAY_0
case BITS3(1,0,0): ch = 'a'; irrmE = mkU32(Irrm_NEAREST); break;
+ // I am unsure about the following, due to the "integral exact"
+ // description in the manual. What does it mean?
+ case BITS3(1,1,0):
+ ch = 'x'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
default: break;
}
if (irrmE) {
|
|
From: <sv...@va...> - 2014-11-23 13:00:09
|
Author: sewardj
Date: Sun Nov 23 12:59:56 2014
New Revision: 14757
Log:
Merge, from trunk, r14667
14667 Enable test cases for arm64 load/store insns
Modified:
branches/VALGRIND_3_10_BRANCH/ (props changed)
branches/VALGRIND_3_10_BRANCH/none/tests/arm64/memory.c
branches/VALGRIND_3_10_BRANCH/none/tests/arm64/memory.stdout.exp
Modified: branches/VALGRIND_3_10_BRANCH/none/tests/arm64/memory.c
==============================================================================
--- branches/VALGRIND_3_10_BRANCH/none/tests/arm64/memory.c (original)
+++ branches/VALGRIND_3_10_BRANCH/none/tests/arm64/memory.c Sun Nov 23 12:59:56 2014
@@ -987,212 +987,212 @@
////////////////////////////////////////////////////////////////
printf("LD1/ST1 (multiple 1-elem structs to/from 2/3/4 regs\n");
-//MEM_TEST("st1 {v19.2d, v20.2d}, [x5]", 17, 7)
-//MEM_TEST("st1 {v19.2d, v20.2d}, [x5], #32", 9, 9)
-//MEM_TEST("st1 {v19.2d, v20.2d}, [x5], x6", -13, -5)
-//
-//MEM_TEST("st1 {v17.2d, v18.2d, v19.2d}, [x5]", 17, 7)
-//MEM_TEST("st1 {v17.2d, v18.2d, v19.2d}, [x5], #48", 9, 9)
-//MEM_TEST("st1 {v17.2d, v18.2d, v19.2d}, [x5], x6", -13, -5)
-//
-//MEM_TEST("st1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5]", 17, 7)
-//MEM_TEST("st1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5], #64", 9, 9)
-//MEM_TEST("st1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5], x6", -13, -5)
-//
-//
-//MEM_TEST("st1 {v19.1d, v20.1d}, [x5]", 17, 7)
-//MEM_TEST("st1 {v19.1d, v20.1d}, [x5], #16", 9, 9)
-//MEM_TEST("st1 {v19.1d, v20.1d}, [x5], x6", -13, -5)
-//
-//MEM_TEST("st1 {v17.1d, v18.1d, v19.1d}, [x5]", 17, 7)
-//MEM_TEST("st1 {v17.1d, v18.1d, v19.1d}, [x5], #24", 9, 9)
-//MEM_TEST("st1 {v17.1d, v18.1d, v19.1d}, [x5], x6", -13, -5)
-//
-//MEM_TEST("st1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5]", 17, 7)
-//MEM_TEST("st1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5], #32", 9, 9)
-//MEM_TEST("st1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5], x6", -13, -5)
-//
-//
-//MEM_TEST("st1 {v19.4s, v20.4s}, [x5]", 17, 7)
-//MEM_TEST("st1 {v19.4s, v20.4s}, [x5], #32", 9, 9)
-//MEM_TEST("st1 {v19.4s, v20.4s}, [x5], x6", -13, -5)
-//
-//MEM_TEST("st1 {v17.4s, v18.4s, v19.4s}, [x5]", 17, 7)
-//MEM_TEST("st1 {v17.4s, v18.4s, v19.4s}, [x5], #48", 9, 9)
-//MEM_TEST("st1 {v17.4s, v18.4s, v19.4s}, [x5], x6", -13, -5)
-//
-//MEM_TEST("st1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5]", 17, 7)
-//MEM_TEST("st1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5], #64", 9, 9)
-//MEM_TEST("st1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5], x6", -13, -5)
-//
-//
-//MEM_TEST("st1 {v19.2s, v20.2s}, [x5]", 17, 7)
-//MEM_TEST("st1 {v19.2s, v20.2s}, [x5], #16", 9, 9)
-//MEM_TEST("st1 {v19.2s, v20.2s}, [x5], x6", -13, -5)
-//
-//MEM_TEST("st1 {v17.2s, v18.2s, v19.2s}, [x5]", 17, 7)
-//MEM_TEST("st1 {v17.2s, v18.2s, v19.2s}, [x5], #24", 9, 9)
-//MEM_TEST("st1 {v17.2s, v18.2s, v19.2s}, [x5], x6", -13, -5)
-//
-//MEM_TEST("st1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5]", 17, 7)
-//MEM_TEST("st1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5], #32", 9, 9)
-//MEM_TEST("st1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5], x6", -13, -5)
-//
-//
-//MEM_TEST("st1 {v19.8h, v20.8h}, [x5]", 17, 7)
-//MEM_TEST("st1 {v19.8h, v20.8h}, [x5], #32", 9, 9)
-//MEM_TEST("st1 {v19.8h, v20.8h}, [x5], x6", -13, -5)
-//
-//MEM_TEST("st1 {v17.8h, v18.8h, v19.8h}, [x5]", 17, 7)
-//MEM_TEST("st1 {v17.8h, v18.8h, v19.8h}, [x5], #48", 9, 9)
-//MEM_TEST("st1 {v17.8h, v18.8h, v19.8h}, [x5], x6", -13, -5)
-//
-//MEM_TEST("st1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5]", 17, 7)
-//MEM_TEST("st1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5], #64", 9, 9)
-//MEM_TEST("st1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5], x6", -13, -5)
-//
-//
-//MEM_TEST("st1 {v19.4h, v20.4h}, [x5]", 17, 7)
-//MEM_TEST("st1 {v19.4h, v20.4h}, [x5], #16", 9, 9)
-//MEM_TEST("st1 {v19.4h, v20.4h}, [x5], x6", -13, -5)
-//
-//MEM_TEST("st1 {v17.4h, v18.4h, v19.4h}, [x5]", 17, 7)
-//MEM_TEST("st1 {v17.4h, v18.4h, v19.4h}, [x5], #24", 9, 9)
-//MEM_TEST("st1 {v17.4h, v18.4h, v19.4h}, [x5], x6", -13, -5)
-//
-//MEM_TEST("st1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5]", 17, 7)
-//MEM_TEST("st1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5], #32", 9, 9)
-//MEM_TEST("st1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5], x6", -13, -5)
+MEM_TEST("st1 {v19.2d, v20.2d}, [x5]", 17, 7)
+MEM_TEST("st1 {v19.2d, v20.2d}, [x5], #32", 9, 9)
+MEM_TEST("st1 {v19.2d, v20.2d}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v17.2d, v18.2d, v19.2d}, [x5]", 17, 7)
+MEM_TEST("st1 {v17.2d, v18.2d, v19.2d}, [x5], #48", 9, 9)
+MEM_TEST("st1 {v17.2d, v18.2d, v19.2d}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5]", 17, 7)
+MEM_TEST("st1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5], #64", 9, 9)
+MEM_TEST("st1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5], x6", -13, -5)
+
+
+MEM_TEST("st1 {v19.1d, v20.1d}, [x5]", 17, 7)
+MEM_TEST("st1 {v19.1d, v20.1d}, [x5], #16", 9, 9)
+MEM_TEST("st1 {v19.1d, v20.1d}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v17.1d, v18.1d, v19.1d}, [x5]", 17, 7)
+MEM_TEST("st1 {v17.1d, v18.1d, v19.1d}, [x5], #24", 9, 9)
+MEM_TEST("st1 {v17.1d, v18.1d, v19.1d}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5]", 17, 7)
+MEM_TEST("st1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5], #32", 9, 9)
+MEM_TEST("st1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5], x6", -13, -5)
+
+
+MEM_TEST("st1 {v19.4s, v20.4s}, [x5]", 17, 7)
+MEM_TEST("st1 {v19.4s, v20.4s}, [x5], #32", 9, 9)
+MEM_TEST("st1 {v19.4s, v20.4s}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v17.4s, v18.4s, v19.4s}, [x5]", 17, 7)
+MEM_TEST("st1 {v17.4s, v18.4s, v19.4s}, [x5], #48", 9, 9)
+MEM_TEST("st1 {v17.4s, v18.4s, v19.4s}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5]", 17, 7)
+MEM_TEST("st1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5], #64", 9, 9)
+MEM_TEST("st1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5], x6", -13, -5)
+
+
+MEM_TEST("st1 {v19.2s, v20.2s}, [x5]", 17, 7)
+MEM_TEST("st1 {v19.2s, v20.2s}, [x5], #16", 9, 9)
+MEM_TEST("st1 {v19.2s, v20.2s}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v17.2s, v18.2s, v19.2s}, [x5]", 17, 7)
+MEM_TEST("st1 {v17.2s, v18.2s, v19.2s}, [x5], #24", 9, 9)
+MEM_TEST("st1 {v17.2s, v18.2s, v19.2s}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5]", 17, 7)
+MEM_TEST("st1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5], #32", 9, 9)
+MEM_TEST("st1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5], x6", -13, -5)
+
+
+MEM_TEST("st1 {v19.8h, v20.8h}, [x5]", 17, 7)
+MEM_TEST("st1 {v19.8h, v20.8h}, [x5], #32", 9, 9)
+MEM_TEST("st1 {v19.8h, v20.8h}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v17.8h, v18.8h, v19.8h}, [x5]", 17, 7)
+MEM_TEST("st1 {v17.8h, v18.8h, v19.8h}, [x5], #48", 9, 9)
+MEM_TEST("st1 {v17.8h, v18.8h, v19.8h}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5]", 17, 7)
+MEM_TEST("st1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5], #64", 9, 9)
+MEM_TEST("st1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5], x6", -13, -5)
+
+
+MEM_TEST("st1 {v19.4h, v20.4h}, [x5]", 17, 7)
+MEM_TEST("st1 {v19.4h, v20.4h}, [x5], #16", 9, 9)
+MEM_TEST("st1 {v19.4h, v20.4h}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v17.4h, v18.4h, v19.4h}, [x5]", 17, 7)
+MEM_TEST("st1 {v17.4h, v18.4h, v19.4h}, [x5], #24", 9, 9)
+MEM_TEST("st1 {v17.4h, v18.4h, v19.4h}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5]", 17, 7)
+MEM_TEST("st1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5], #32", 9, 9)
+MEM_TEST("st1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5], x6", -13, -5)
MEM_TEST("st1 {v19.16b, v20.16b}, [x5]", 17, 7)
MEM_TEST("st1 {v19.16b, v20.16b}, [x5], #32", 9, 9)
-//MEM_TEST("st1 {v19.16b, v20.16b}, [x5], x6", -13, -5)
+MEM_TEST("st1 {v19.16b, v20.16b}, [x5], x6", -13, -5)
MEM_TEST("st1 {v17.16b, v18.16b, v19.16b}, [x5]", 17, 7)
-//MEM_TEST("st1 {v17.16b, v18.16b, v19.16b}, [x5], #48", 9, 9)
-//MEM_TEST("st1 {v17.16b, v18.16b, v19.16b}, [x5], x6", -13, -5)
-//
-//MEM_TEST("st1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5]", 17, 7)
-//MEM_TEST("st1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], #64", 9, 9)
-//MEM_TEST("st1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], x6", -13, -5)
-//
-//
-//MEM_TEST("st1 {v19.8b, v20.8b}, [x5]", 17, 7)
-//MEM_TEST("st1 {v19.8b, v20.8b}, [x5], #16", 9, 9)
-//MEM_TEST("st1 {v19.8b, v20.8b}, [x5], x6", -13, -5)
-//
-//MEM_TEST("st1 {v17.8b, v18.8b, v19.8b}, [x5]", 17, 7)
-//MEM_TEST("st1 {v17.8b, v18.8b, v19.8b}, [x5], #24", 9, 9)
-//MEM_TEST("st1 {v17.8b, v18.8b, v19.8b}, [x5], x6", -13, -5)
-//
-//MEM_TEST("st1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5]", 17, 7)
-//MEM_TEST("st1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5], #32", 9, 9)
-//MEM_TEST("st1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5], x6", -13, -5)
-//
-//
-//MEM_TEST("ld1 {v19.2d, v20.2d}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v19.2d, v20.2d}, [x5], #32", 9, 9)
-//MEM_TEST("ld1 {v19.2d, v20.2d}, [x5], x6", -13, -5)
-//
-//MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d}, [x5], #48", 9, 9)
-//MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d}, [x5], x6", -13, -5)
-//
-//MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5], #64", 9, 9)
-//MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5], x6", -13, -5)
-//
-//
-//MEM_TEST("ld1 {v19.1d, v20.1d}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v19.1d, v20.1d}, [x5], #16", 9, 9)
-//MEM_TEST("ld1 {v19.1d, v20.1d}, [x5], x6", -13, -5)
-//
-//MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d}, [x5], #24", 9, 9)
-//MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d}, [x5], x6", -13, -5)
-//
-//MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5], #32", 9, 9)
-//MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5], x6", -13, -5)
-//
-//
-//MEM_TEST("ld1 {v19.4s, v20.4s}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v19.4s, v20.4s}, [x5], #32", 9, 9)
-//MEM_TEST("ld1 {v19.4s, v20.4s}, [x5], x6", -13, -5)
-//
-//MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s}, [x5], #48", 9, 9)
-//MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s}, [x5], x6", -13, -5)
-//
-//MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5], #64", 9, 9)
-//MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5], x6", -13, -5)
-//
-//
-//MEM_TEST("ld1 {v19.2s, v20.2s}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v19.2s, v20.2s}, [x5], #16", 9, 9)
-//MEM_TEST("ld1 {v19.2s, v20.2s}, [x5], x6", -13, -5)
-//
-//MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s}, [x5], #24", 9, 9)
-//MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s}, [x5], x6", -13, -5)
-//
-//MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5], #32", 9, 9)
-//MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5], x6", -13, -5)
-//
-//
-//MEM_TEST("ld1 {v19.8h, v20.8h}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v19.8h, v20.8h}, [x5], #32", 9, 9)
-//MEM_TEST("ld1 {v19.8h, v20.8h}, [x5], x6", -13, -5)
-//
-//MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h}, [x5], #48", 9, 9)
-//MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h}, [x5], x6", -13, -5)
-//
-//MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5], #64", 9, 9)
-//MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5], x6", -13, -5)
-//
-//
-//MEM_TEST("ld1 {v19.4h, v20.4h}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v19.4h, v20.4h}, [x5], #16", 9, 9)
-//MEM_TEST("ld1 {v19.4h, v20.4h}, [x5], x6", -13, -5)
-//
-//MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h}, [x5], #24", 9, 9)
-//MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h}, [x5], x6", -13, -5)
-//
-//MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5], #32", 9, 9)
-//MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5], x6", -13, -5)
+MEM_TEST("st1 {v17.16b, v18.16b, v19.16b}, [x5], #48", 9, 9)
+MEM_TEST("st1 {v17.16b, v18.16b, v19.16b}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5]", 17, 7)
+MEM_TEST("st1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], #64", 9, 9)
+MEM_TEST("st1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], x6", -13, -5)
+
+
+MEM_TEST("st1 {v19.8b, v20.8b}, [x5]", 17, 7)
+MEM_TEST("st1 {v19.8b, v20.8b}, [x5], #16", 9, 9)
+MEM_TEST("st1 {v19.8b, v20.8b}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v17.8b, v18.8b, v19.8b}, [x5]", 17, 7)
+MEM_TEST("st1 {v17.8b, v18.8b, v19.8b}, [x5], #24", 9, 9)
+MEM_TEST("st1 {v17.8b, v18.8b, v19.8b}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5]", 17, 7)
+MEM_TEST("st1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5], #32", 9, 9)
+MEM_TEST("st1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5], x6", -13, -5)
+
+
+MEM_TEST("ld1 {v19.2d, v20.2d}, [x5]", 17, 7)
+MEM_TEST("ld1 {v19.2d, v20.2d}, [x5], #32", 9, 9)
+MEM_TEST("ld1 {v19.2d, v20.2d}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d}, [x5]", 17, 7)
+MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d}, [x5], #48", 9, 9)
+MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5]", 17, 7)
+MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5], #64", 9, 9)
+MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5], x6", -13, -5)
+
+
+MEM_TEST("ld1 {v19.1d, v20.1d}, [x5]", 17, 7)
+MEM_TEST("ld1 {v19.1d, v20.1d}, [x5], #16", 9, 9)
+MEM_TEST("ld1 {v19.1d, v20.1d}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d}, [x5]", 17, 7)
+MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d}, [x5], #24", 9, 9)
+MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5]", 17, 7)
+MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5], #32", 9, 9)
+MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5], x6", -13, -5)
+
+
+MEM_TEST("ld1 {v19.4s, v20.4s}, [x5]", 17, 7)
+MEM_TEST("ld1 {v19.4s, v20.4s}, [x5], #32", 9, 9)
+MEM_TEST("ld1 {v19.4s, v20.4s}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s}, [x5]", 17, 7)
+MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s}, [x5], #48", 9, 9)
+MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5]", 17, 7)
+MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5], #64", 9, 9)
+MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5], x6", -13, -5)
+
+
+MEM_TEST("ld1 {v19.2s, v20.2s}, [x5]", 17, 7)
+MEM_TEST("ld1 {v19.2s, v20.2s}, [x5], #16", 9, 9)
+MEM_TEST("ld1 {v19.2s, v20.2s}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s}, [x5]", 17, 7)
+MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s}, [x5], #24", 9, 9)
+MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5]", 17, 7)
+MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5], #32", 9, 9)
+MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5], x6", -13, -5)
+
+
+MEM_TEST("ld1 {v19.8h, v20.8h}, [x5]", 17, 7)
+MEM_TEST("ld1 {v19.8h, v20.8h}, [x5], #32", 9, 9)
+MEM_TEST("ld1 {v19.8h, v20.8h}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h}, [x5]", 17, 7)
+MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h}, [x5], #48", 9, 9)
+MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5]", 17, 7)
+MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5], #64", 9, 9)
+MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5], x6", -13, -5)
+
+
+MEM_TEST("ld1 {v19.4h, v20.4h}, [x5]", 17, 7)
+MEM_TEST("ld1 {v19.4h, v20.4h}, [x5], #16", 9, 9)
+MEM_TEST("ld1 {v19.4h, v20.4h}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h}, [x5]", 17, 7)
+MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h}, [x5], #24", 9, 9)
+MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5]", 17, 7)
+MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5], #32", 9, 9)
+MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5], x6", -13, -5)
MEM_TEST("ld1 {v19.16b, v20.16b}, [x5]", 17, 7)
MEM_TEST("ld1 {v19.16b, v20.16b}, [x5], #32", 9, 9)
-//MEM_TEST("ld1 {v19.16b, v20.16b}, [x5], x6", -13, -5)
+MEM_TEST("ld1 {v19.16b, v20.16b}, [x5], x6", -13, -5)
MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b}, [x5], #48", 9, 9)
-//MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b}, [x5], x6", -13, -5)
-//
-//MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], #64", 9, 9)
-//MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], x6", -13, -5)
-//
-//
-//MEM_TEST("ld1 {v19.8b, v20.8b}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v19.8b, v20.8b}, [x5], #16", 9, 9)
-//MEM_TEST("ld1 {v19.8b, v20.8b}, [x5], x6", -13, -5)
-//
-//MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b}, [x5], #24", 9, 9)
-//MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b}, [x5], x6", -13, -5)
-//
-//MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5]", 17, 7)
-//MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5], #32", 9, 9)
-//MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5], x6", -13, -5)
+MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b}, [x5], #48", 9, 9)
+MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5]", 17, 7)
+MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], #64", 9, 9)
+MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], x6", -13, -5)
+
+
+MEM_TEST("ld1 {v19.8b, v20.8b}, [x5]", 17, 7)
+MEM_TEST("ld1 {v19.8b, v20.8b}, [x5], #16", 9, 9)
+MEM_TEST("ld1 {v19.8b, v20.8b}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b}, [x5]", 17, 7)
+MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b}, [x5], #24", 9, 9)
+MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5]", 17, 7)
+MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5], #32", 9, 9)
+MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5], x6", -13, -5)
////////////////////////////////////////////////////////////////
@@ -1229,370 +1229,370 @@
////////////////////////////////////////////////////////////////
printf("LD2R (single structure, replicate)\n");
-//MEM_TEST("ld2r {v17.2d , v18.2d }, [x5]", 3, -5)
-//MEM_TEST("ld2r {v18.1d , v19.1d }, [x5]", 3, -4)
-//MEM_TEST("ld2r {v19.4s , v20.4s }, [x5]", 3, -3)
-//MEM_TEST("ld2r {v17.2s , v18.2s }, [x5]", 3, -2)
-//MEM_TEST("ld2r {v18.8h , v19.8h }, [x5]", 3, -1)
-//MEM_TEST("ld2r {v19.4h , v20.4h }, [x5]", 3, 1)
-//MEM_TEST("ld2r {v17.16b, v18.16b}, [x5]", 3, 2)
-//MEM_TEST("ld2r {v18.8b , v19.8b }, [x5]", 3, 3)
-//
-//MEM_TEST("ld2r {v19.2d , v20.2d }, [x5], #16", 3, -5)
-//MEM_TEST("ld2r {v17.1d , v18.1d }, [x5], #16", 3, -4)
-//MEM_TEST("ld2r {v18.4s , v19.4s }, [x5], #8", 3, -3)
-//MEM_TEST("ld2r {v19.2s , v20.2s }, [x5], #8", 3, -2)
-//MEM_TEST("ld2r {v17.8h , v18.8h }, [x5], #4", 3, -1)
-//MEM_TEST("ld2r {v18.4h , v19.4h }, [x5], #4", 3, 1)
-//MEM_TEST("ld2r {v19.16b, v20.16b}, [x5], #2", 3, 2)
-//MEM_TEST("ld2r {v17.8b , v18.8b }, [x5], #2", 3, 3)
-//
-//MEM_TEST("ld2r {v18.2d , v19.2d }, [x5], x6", 3, -5)
-//MEM_TEST("ld2r {v19.1d , v20.1d }, [x5], x6", 3, -4)
-//MEM_TEST("ld2r {v17.4s , v18.4s }, [x5], x6", 3, -3)
-//MEM_TEST("ld2r {v18.2s , v19.2s }, [x5], x6", 3, -2)
-//MEM_TEST("ld2r {v19.8h , v20.8h }, [x5], x6", 3, -1)
-//MEM_TEST("ld2r {v17.4h , v18.4h }, [x5], x6", 3, 1)
-//MEM_TEST("ld2r {v18.16b, v19.16b}, [x5], x6", 3, 2)
-//MEM_TEST("ld2r {v19.8b , v20.8b }, [x5], x6", 3, 3)
+MEM_TEST("ld2r {v17.2d , v18.2d }, [x5]", 3, -5)
+MEM_TEST("ld2r {v18.1d , v19.1d }, [x5]", 3, -4)
+MEM_TEST("ld2r {v19.4s , v20.4s }, [x5]", 3, -3)
+MEM_TEST("ld2r {v17.2s , v18.2s }, [x5]", 3, -2)
+MEM_TEST("ld2r {v18.8h , v19.8h }, [x5]", 3, -1)
+MEM_TEST("ld2r {v19.4h , v20.4h }, [x5]", 3, 1)
+MEM_TEST("ld2r {v17.16b, v18.16b}, [x5]", 3, 2)
+MEM_TEST("ld2r {v18.8b , v19.8b }, [x5]", 3, 3)
+
+MEM_TEST("ld2r {v19.2d , v20.2d }, [x5], #16", 3, -5)
+MEM_TEST("ld2r {v17.1d , v18.1d }, [x5], #16", 3, -4)
+MEM_TEST("ld2r {v18.4s , v19.4s }, [x5], #8", 3, -3)
+MEM_TEST("ld2r {v19.2s , v20.2s }, [x5], #8", 3, -2)
+MEM_TEST("ld2r {v17.8h , v18.8h }, [x5], #4", 3, -1)
+MEM_TEST("ld2r {v18.4h , v19.4h }, [x5], #4", 3, 1)
+MEM_TEST("ld2r {v19.16b, v20.16b}, [x5], #2", 3, 2)
+MEM_TEST("ld2r {v17.8b , v18.8b }, [x5], #2", 3, 3)
+
+MEM_TEST("ld2r {v18.2d , v19.2d }, [x5], x6", 3, -5)
+MEM_TEST("ld2r {v19.1d , v20.1d }, [x5], x6", 3, -4)
+MEM_TEST("ld2r {v17.4s , v18.4s }, [x5], x6", 3, -3)
+MEM_TEST("ld2r {v18.2s , v19.2s }, [x5], x6", 3, -2)
+MEM_TEST("ld2r {v19.8h , v20.8h }, [x5], x6", 3, -1)
+MEM_TEST("ld2r {v17.4h , v18.4h }, [x5], x6", 3, 1)
+MEM_TEST("ld2r {v18.16b, v19.16b}, [x5], x6", 3, 2)
+MEM_TEST("ld2r {v19.8b , v20.8b }, [x5], x6", 3, 3)
//////////////////////////////////////////////////////////////////
-//printf("LD3R (single structure, replicate)\n");
+printf("LD3R (single structure, replicate)\n");
-//MEM_TEST("ld3r {v17.2d , v18.2d , v19.2d }, [x5]", 3, -5)
-//MEM_TEST("ld3r {v18.1d , v19.1d , v20.1d }, [x5]", 3, -4)
-//MEM_TEST("ld3r {v17.4s , v18.4s , v19.4s }, [x5]", 3, -3)
-//MEM_TEST("ld3r {v18.2s , v19.2s , v20.2s }, [x5]", 3, -2)
-//MEM_TEST("ld3r {v17.8h , v18.8h , v19.8h }, [x5]", 3, -5)
-//MEM_TEST("ld3r {v18.4h , v19.4h , v20.4h }, [x5]", 3, -4)
-//MEM_TEST("ld3r {v17.16b, v18.16b, v19.16b}, [x5]", 3, -3)
-//MEM_TEST("ld3r {v18.8b , v19.8b , v20.8b }, [x5]", 3, -2)
-//
-//MEM_TEST("ld3r {v17.2d , v18.2d , v19.2d }, [x5], #24", 3, -5)
-//MEM_TEST("ld3r {v18.1d , v19.1d , v20.1d }, [x5], #24", 3, -4)
-//MEM_TEST("ld3r {v17.4s , v18.4s , v19.4s }, [x5], #12", 3, -3)
-//MEM_TEST("ld3r {v18.2s , v19.2s , v20.2s }, [x5], #12", 3, -2)
-//MEM_TEST("ld3r {v17.8h , v18.8h , v19.8h }, [x5], #6", 3, -5)
-//MEM_TEST("ld3r {v18.4h , v19.4h , v20.4h }, [x5], #6", 3, -4)
-//MEM_TEST("ld3r {v17.16b, v18.16b, v19.16b}, [x5], #3", 3, -3)
-//MEM_TEST("ld3r {v18.8b , v19.8b , v20.8b }, [x5], #3", 3, -2)
-//
-//MEM_TEST("ld3r {v17.2d , v18.2d , v19.2d }, [x5], x6", 3, -5)
-//MEM_TEST("ld3r {v18.1d , v19.1d , v20.1d }, [x5], x6", 3, -4)
-//MEM_TEST("ld3r {v17.4s , v18.4s , v19.4s }, [x5], x6", 3, -3)
-//MEM_TEST("ld3r {v18.2s , v19.2s , v20.2s }, [x5], x6", 3, -2)
-//MEM_TEST("ld3r {v17.8h , v18.8h , v19.8h }, [x5], x6", 3, -5)
-//MEM_TEST("ld3r {v18.4h , v19.4h , v20.4h }, [x5], x6", 3, -4)
-//MEM_TEST("ld3r {v17.16b, v18.16b, v19.16b}, [x5], x6", 3, -3)
-//MEM_TEST("ld3r {v18.8b , v19.8b , v20.8b }, [x5], x6", 3, -2)
+MEM_TEST("ld3r {v17.2d , v18.2d , v19.2d }, [x5]", 3, -5)
+MEM_TEST("ld3r {v18.1d , v19.1d , v20.1d }, [x5]", 3, -4)
+MEM_TEST("ld3r {v17.4s , v18.4s , v19.4s }, [x5]", 3, -3)
+MEM_TEST("ld3r {v18.2s , v19.2s , v20.2s }, [x5]", 3, -2)
+MEM_TEST("ld3r {v17.8h , v18.8h , v19.8h }, [x5]", 3, -5)
+MEM_TEST("ld3r {v18.4h , v19.4h , v20.4h }, [x5]", 3, -4)
+MEM_TEST("ld3r {v17.16b, v18.16b, v19.16b}, [x5]", 3, -3)
+MEM_TEST("ld3r {v18.8b , v19.8b , v20.8b }, [x5]", 3, -2)
+
+MEM_TEST("ld3r {v17.2d , v18.2d , v19.2d }, [x5], #24", 3, -5)
+MEM_TEST("ld3r {v18.1d , v19.1d , v20.1d }, [x5], #24", 3, -4)
+MEM_TEST("ld3r {v17.4s , v18.4s , v19.4s }, [x5], #12", 3, -3)
+MEM_TEST("ld3r {v18.2s , v19.2s , v20.2s }, [x5], #12", 3, -2)
+MEM_TEST("ld3r {v17.8h , v18.8h , v19.8h }, [x5], #6", 3, -5)
+MEM_TEST("ld3r {v18.4h , v19.4h , v20.4h }, [x5], #6", 3, -4)
+MEM_TEST("ld3r {v17.16b, v18.16b, v19.16b}, [x5], #3", 3, -3)
+MEM_TEST("ld3r {v18.8b , v19.8b , v20.8b }, [x5], #3", 3, -2)
+
+MEM_TEST("ld3r {v17.2d , v18.2d , v19.2d }, [x5], x6", 3, -5)
+MEM_TEST("ld3r {v18.1d , v19.1d , v20.1d }, [x5], x6", 3, -4)
+MEM_TEST("ld3r {v17.4s , v18.4s , v19.4s }, [x5], x6", 3, -3)
+MEM_TEST("ld3r {v18.2s , v19.2s , v20.2s }, [x5], x6", 3, -2)
+MEM_TEST("ld3r {v17.8h , v18.8h , v19.8h }, [x5], x6", 3, -5)
+MEM_TEST("ld3r {v18.4h , v19.4h , v20.4h }, [x5], x6", 3, -4)
+MEM_TEST("ld3r {v17.16b, v18.16b, v19.16b}, [x5], x6", 3, -3)
+MEM_TEST("ld3r {v18.8b , v19.8b , v20.8b }, [x5], x6", 3, -2)
////////////////////////////////////////////////////////////////
printf("LD4R (single structure, replicate)\n");
-//MEM_TEST("ld4r {v17.2d , v18.2d , v19.2d , v20.2d }, [x5]", 3, -5)
-//MEM_TEST("ld4r {v17.1d , v18.1d , v19.1d , v20.1d }, [x5]", 3, -4)
-//MEM_TEST("ld4r {v17.4s , v18.4s , v19.4s , v20.4s }, [x5]", 3, -3)
-//MEM_TEST("ld4r {v17.2s , v18.2s , v19.2s , v20.2s }, [x5]", 3, -2)
-//MEM_TEST("ld4r {v17.8h , v18.8h , v19.8h , v20.8h }, [x5]", 3, -5)
-//MEM_TEST("ld4r {v17.4h , v18.4h , v19.4h , v20.4h }, [x5]", 3, -4)
-//MEM_TEST("ld4r {v17.16b, v18.16b, v19.16b, v20.16b}, [x5]", 3, -3)
-//MEM_TEST("ld4r {v17.8b , v18.8b , v19.8b , v20.8b }, [x5]", 3, -2)
-//
-//MEM_TEST("ld4r {v17.2d , v18.2d , v19.2d , v20.2d }, [x5], #32", 3, -5)
-//MEM_TEST("ld4r {v17.1d , v18.1d , v19.1d , v20.1d }, [x5], #32", 3, -4)
-//MEM_TEST("ld4r {v17.4s , v18.4s , v19.4s , v20.4s }, [x5], #16", 3, -3)
-//MEM_TEST("ld4r {v17.2s , v18.2s , v19.2s , v20.2s }, [x5], #16", 3, -2)
-//MEM_TEST("ld4r {v17.8h , v18.8h , v19.8h , v20.8h }, [x5], #8", 3, -5)
-//MEM_TEST("ld4r {v17.4h , v18.4h , v19.4h , v20.4h }, [x5], #8", 3, -4)
-//MEM_TEST("ld4r {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], #4", 3, -3)
-//MEM_TEST("ld4r {v17.8b , v18.8b , v19.8b , v20.8b }, [x5], #4", 3, -2)
-//
-//MEM_TEST("ld4r {v17.2d , v18.2d , v19.2d , v20.2d }, [x5], x6", 3, -5)
-//MEM_TEST("ld4r {v17.1d , v18.1d , v19.1d , v20.1d }, [x5], x6", 3, -4)
-//MEM_TEST("ld4r {v17.4s , v18.4s , v19.4s , v20.4s }, [x5], x6", 3, -3)
-//MEM_TEST("ld4r {v17.2s , v18.2s , v19.2s , v20.2s }, [x5], x6", 3, -2)
-//MEM_TEST("ld4r {v17.8h , v18.8h , v19.8h , v20.8h }, [x5], x6", 3, -5)
-//MEM_TEST("ld4r {v17.4h , v18.4h , v19.4h , v20.4h }, [x5], x6", 3, -4)
-//MEM_TEST("ld4r {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], x6", 3, -3)
-//MEM_TEST("ld4r {v17.8b , v18.8b , v19.8b , v20.8b }, [x5], x6", 3, -2)
+MEM_TEST("ld4r {v17.2d , v18.2d , v19.2d , v20.2d }, [x5]", 3, -5)
+MEM_TEST("ld4r {v17.1d , v18.1d , v19.1d , v20.1d }, [x5]", 3, -4)
+MEM_TEST("ld4r {v17.4s , v18.4s , v19.4s , v20.4s }, [x5]", 3, -3)
+MEM_TEST("ld4r {v17.2s , v18.2s , v19.2s , v20.2s }, [x5]", 3, -2)
+MEM_TEST("ld4r {v17.8h , v18.8h , v19.8h , v20.8h }, [x5]", 3, -5)
+MEM_TEST("ld4r {v17.4h , v18.4h , v19.4h , v20.4h }, [x5]", 3, -4)
+MEM_TEST("ld4r {v17.16b, v18.16b, v19.16b, v20.16b}, [x5]", 3, -3)
+MEM_TEST("ld4r {v17.8b , v18.8b , v19.8b , v20.8b }, [x5]", 3, -2)
+
+MEM_TEST("ld4r {v17.2d , v18.2d , v19.2d , v20.2d }, [x5], #32", 3, -5)
+MEM_TEST("ld4r {v17.1d , v18.1d , v19.1d , v20.1d }, [x5], #32", 3, -4)
+MEM_TEST("ld4r {v17.4s , v18.4s , v19.4s , v20.4s }, [x5], #16", 3, -3)
+MEM_TEST("ld4r {v17.2s , v18.2s , v19.2s , v20.2s }, [x5], #16", 3, -2)
+MEM_TEST("ld4r {v17.8h , v18.8h , v19.8h , v20.8h }, [x5], #8", 3, -5)
+MEM_TEST("ld4r {v17.4h , v18.4h , v19.4h , v20.4h }, [x5], #8", 3, -4)
+MEM_TEST("ld4r {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], #4", 3, -3)
+MEM_TEST("ld4r {v17.8b , v18.8b , v19.8b , v20.8b }, [x5], #4", 3, -2)
+
+MEM_TEST("ld4r {v17.2d , v18.2d , v19.2d , v20.2d }, [x5], x6", 3, -5)
+MEM_TEST("ld4r {v17.1d , v18.1d , v19.1d , v20.1d }, [x5], x6", 3, -4)
+MEM_TEST("ld4r {v17.4s , v18.4s , v19.4s , v20.4s }, [x5], x6", 3, -3)
+MEM_TEST("ld4r {v17.2s , v18.2s , v19.2s , v20.2s }, [x5], x6", 3, -2)
+MEM_TEST("ld4r {v17.8h , v18.8h , v19.8h , v20.8h }, [x5], x6", 3, -5)
+MEM_TEST("ld4r {v17.4h , v18.4h , v19.4h , v20.4h }, [x5], x6", 3, -4)
+MEM_TEST("ld4r {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], x6", 3, -3)
+MEM_TEST("ld4r {v17.8b , v18.8b , v19.8b , v20.8b }, [x5], x6", 3, -2)
////////////////////////////////////////////////////////////////
printf("LD1/ST1 (single 1-elem struct to/from one lane of 1 reg\n");
-//MEM_TEST("st1 {v19.d}[0], [x5]", 17, 7)
-//MEM_TEST("st1 {v19.d}[0], [x5], #8", -9, 12)
-//MEM_TEST("st1 {v19.d}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("st1 {v19.d}[1], [x5]", 17, 7)
-//MEM_TEST("st1 {v19.d}[1], [x5], #8", -9, 12)
-//MEM_TEST("st1 {v19.d}[1], [x5], x6", 9, 13)
-//
-//MEM_TEST("st1 {v19.s}[0], [x5]", 17, 7)
-//MEM_TEST("st1 {v19.s}[0], [x5], #4", -9, 12)
-//MEM_TEST("st1 {v19.s}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("st1 {v19.s}[3], [x5]", 17, 7)
-//MEM_TEST("st1 {v19.s}[3], [x5], #4", -9, 12)
-//MEM_TEST("st1 {v19.s}[3], [x5], x6", 9, 13)
-//
-//MEM_TEST("st1 {v19.h}[0], [x5]", 17, 7)
-//MEM_TEST("st1 {v19.h}[0], [x5], #2", -9, 12)
-//MEM_TEST("st1 {v19.h}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("st1 {v19.h}[6], [x5]", 17, 7)
-//MEM_TEST("st1 {v19.h}[6], [x5], #2", -9, 12)
-//MEM_TEST("st1 {v19.h}[6], [x5], x6", 9, 13)
-//
-//MEM_TEST("st1 {v19.b}[0], [x5]", 17, 7)
-//MEM_TEST("st1 {v19.b}[0], [x5], #1", -9, 12)
-//MEM_TEST("st1 {v19.b}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("st1 {v19.b}[13], [x5]", 17, 7)
-//MEM_TEST("st1 {v19.b}[13], [x5], #1", -9, 12)
-//MEM_TEST("st1 {v19.b}[13], [x5], x6", 9, 13)
-//
-//
-//MEM_TEST("ld1 {v19.d}[0], [x5]", 17, 7)
-//MEM_TEST("ld1 {v19.d}[0], [x5], #8", -9, 12)
-//MEM_TEST("ld1 {v19.d}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld1 {v19.d}[1], [x5]", 17, 7)
-//MEM_TEST("ld1 {v19.d}[1], [x5], #8", -9, 12)
-//MEM_TEST("ld1 {v19.d}[1], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld1 {v19.s}[0], [x5]", 17, 7)
-//MEM_TEST("ld1 {v19.s}[0], [x5], #4", -9, 12)
-//MEM_TEST("ld1 {v19.s}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld1 {v19.s}[3], [x5]", 17, 7)
-//MEM_TEST("ld1 {v19.s}[3], [x5], #4", -9, 12)
-//MEM_TEST("ld1 {v19.s}[3], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld1 {v19.h}[0], [x5]", 17, 7)
-//MEM_TEST("ld1 {v19.h}[0], [x5], #2", -9, 12)
-//MEM_TEST("ld1 {v19.h}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld1 {v19.h}[6], [x5]", 17, 7)
-//MEM_TEST("ld1 {v19.h}[6], [x5], #2", -9, 12)
-//MEM_TEST("ld1 {v19.h}[6], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld1 {v19.b}[0], [x5]", 17, 7)
-//MEM_TEST("ld1 {v19.b}[0], [x5], #1", -9, 12)
-//MEM_TEST("ld1 {v19.b}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld1 {v19.b}[13], [x5]", 17, 7)
-//MEM_TEST("ld1 {v19.b}[13], [x5], #1", -9, 12)
-//MEM_TEST("ld1 {v19.b}[13], [x5], x6", 9, 13)
+MEM_TEST("st1 {v19.d}[0], [x5]", 17, 7)
+MEM_TEST("st1 {v19.d}[0], [x5], #8", -9, 12)
+MEM_TEST("st1 {v19.d}[0], [x5], x6", 9, 13)
+
+MEM_TEST("st1 {v19.d}[1], [x5]", 17, 7)
+MEM_TEST("st1 {v19.d}[1], [x5], #8", -9, 12)
+MEM_TEST("st1 {v19.d}[1], [x5], x6", 9, 13)
+
+MEM_TEST("st1 {v19.s}[0], [x5]", 17, 7)
+MEM_TEST("st1 {v19.s}[0], [x5], #4", -9, 12)
+MEM_TEST("st1 {v19.s}[0], [x5], x6", 9, 13)
+
+MEM_TEST("st1 {v19.s}[3], [x5]", 17, 7)
+MEM_TEST("st1 {v19.s}[3], [x5], #4", -9, 12)
+MEM_TEST("st1 {v19.s}[3], [x5], x6", 9, 13)
+
+MEM_TEST("st1 {v19.h}[0], [x5]", 17, 7)
+MEM_TEST("st1 {v19.h}[0], [x5], #2", -9, 12)
+MEM_TEST("st1 {v19.h}[0], [x5], x6", 9, 13)
+
+MEM_TEST("st1 {v19.h}[6], [x5]", 17, 7)
+MEM_TEST("st1 {v19.h}[6], [x5], #2", -9, 12)
+MEM_TEST("st1 {v19.h}[6], [x5], x6", 9, 13)
+
+MEM_TEST("st1 {v19.b}[0], [x5]", 17, 7)
+MEM_TEST("st1 {v19.b}[0], [x5], #1", -9, 12)
+MEM_TEST("st1 {v19.b}[0], [x5], x6", 9, 13)
+
+MEM_TEST("st1 {v19.b}[13], [x5]", 17, 7)
+MEM_TEST("st1 {v19.b}[13], [x5], #1", -9, 12)
+MEM_TEST("st1 {v19.b}[13], [x5], x6", 9, 13)
+
+
+MEM_TEST("ld1 {v19.d}[0], [x5]", 17, 7)
+MEM_TEST("ld1 {v19.d}[0], [x5], #8", -9, 12)
+MEM_TEST("ld1 {v19.d}[0], [x5], x6", 9, 13)
+
+MEM_TEST("ld1 {v19.d}[1], [x5]", 17, 7)
+MEM_TEST("ld1 {v19.d}[1], [x5], #8", -9, 12)
+MEM_TEST("ld1 {v19.d}[1], [x5], x6", 9, 13)
+
+MEM_TEST("ld1 {v19.s}[0], [x5]", 17, 7)
+MEM_TEST("ld1 {v19.s}[0], [x5], #4", -9, 12)
+MEM_TEST("ld1 {v19.s}[0], [x5], x6", 9, 13)
+
+MEM_TEST("ld1 {v19.s}[3], [x5]", 17, 7)
+MEM_TEST("ld1 {v19.s}[3], [x5], #4", -9, 12)
+MEM_TEST("ld1 {v19.s}[3], [x5], x6", 9, 13)
+
+MEM_TEST("ld1 {v19.h}[0], [x5]", 17, 7)
+MEM_TEST("ld1 {v19.h}[0], [x5], #2", -9, 12)
+MEM_TEST("ld1 {v19.h}[0], [x5], x6", 9, 13)
+
+MEM_TEST("ld1 {v19.h}[6], [x5]", 17, 7)
+MEM_TEST("ld1 {v19.h}[6], [x5], #2", -9, 12)
+MEM_TEST("ld1 {v19.h}[6], [x5], x6", 9, 13)
+
+MEM_TEST("ld1 {v19.b}[0], [x5]", 17, 7)
+MEM_TEST("ld1 {v19.b}[0], [x5], #1", -9, 12)
+MEM_TEST("ld1 {v19.b}[0], [x5], x6", 9, 13)
+
+MEM_TEST("ld1 {v19.b}[13], [x5]", 17, 7)
+MEM_TEST("ld1 {v19.b}[13], [x5], #1", -9, 12)
+MEM_TEST("ld1 {v19.b}[13], [x5], x6", 9, 13)
////////////////////////////////////////////////////////////////
printf("LD2/ST2 (single 2-elem struct to/from one lane of 2 regs\n");
-//MEM_TEST("st2 {v18.d, v19.d}[0], [x5]", 17, 7)
-//MEM_TEST("st2 {v18.d, v19.d}[0], [x5], #16", -9, 12)
-//MEM_TEST("st2 {v18.d, v19.d}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("st2 {v18.d, v19.d}[1], [x5]", 17, 7)
-//MEM_TEST("st2 {v18.d, v19.d}[1], [x5], #16", -9, 12)
-//MEM_TEST("st2 {v18.d, v19.d}[1], [x5], x6", 9, 13)
-//
-//MEM_TEST("st2 {v18.s, v19.s}[0], [x5]", 17, 7)
-//MEM_TEST("st2 {v18.s, v19.s}[0], [x5], #8", -9, 12)
-//MEM_TEST("st2 {v18.s, v19.s}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("st2 {v18.s, v19.s}[3], [x5]", 17, 7)
-//MEM_TEST("st2 {v18.s, v19.s}[3], [x5], #8", -9, 12)
-//MEM_TEST("st2 {v18.s, v19.s}[3], [x5], x6", 9, 13)
-//
-//MEM_TEST("st2 {v18.h, v19.h}[0], [x5]", 17, 7)
-//MEM_TEST("st2 {v18.h, v19.h}[0], [x5], #4", -9, 12)
-//MEM_TEST("st2 {v18.h, v19.h}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("st2 {v18.h, v19.h}[6], [x5]", 17, 7)
-//MEM_TEST("st2 {v18.h, v19.h}[6], [x5], #4", -9, 12)
-//MEM_TEST("st2 {v18.h, v19.h}[6], [x5], x6", 9, 13)
-//
-//MEM_TEST("st2 {v18.b, v19.b}[0], [x5]", 17, 7)
-//MEM_TEST("st2 {v18.b, v19.b}[0], [x5], #2", -9, 12)
-//MEM_TEST("st2 {v18.b, v19.b}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("st2 {v18.b, v19.b}[13], [x5]", 17, 7)
-//MEM_TEST("st2 {v18.b, v19.b}[13], [x5], #2", -9, 12)
-//MEM_TEST("st2 {v18.b, v19.b}[13], [x5], x6", 9, 13)
-//
-//
-//MEM_TEST("ld2 {v18.d, v19.d}[0], [x5]", 17, 7)
-//MEM_TEST("ld2 {v18.d, v19.d}[0], [x5], #16", -9, 12)
-//MEM_TEST("ld2 {v18.d, v19.d}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld2 {v18.d, v19.d}[1], [x5]", 17, 7)
-//MEM_TEST("ld2 {v18.d, v19.d}[1], [x5], #16", -9, 12)
-//MEM_TEST("ld2 {v18.d, v19.d}[1], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld2 {v18.s, v19.s}[0], [x5]", 17, 7)
-//MEM_TEST("ld2 {v18.s, v19.s}[0], [x5], #8", -9, 12)
-//MEM_TEST("ld2 {v18.s, v19.s}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld2 {v18.s, v19.s}[3], [x5]", 17, 7)
-//MEM_TEST("ld2 {v18.s, v19.s}[3], [x5], #8", -9, 12)
-//MEM_TEST("ld2 {v18.s, v19.s}[3], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld2 {v18.h, v19.h}[0], [x5]", 17, 7)
-//MEM_TEST("ld2 {v18.h, v19.h}[0], [x5], #4", -9, 12)
-//MEM_TEST("ld2 {v18.h, v19.h}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld2 {v18.h, v19.h}[6], [x5]", 17, 7)
-//MEM_TEST("ld2 {v18.h, v19.h}[6], [x5], #4", -9, 12)
-//MEM_TEST("ld2 {v18.h, v19.h}[6], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld2 {v18.b, v19.b}[0], [x5]", 17, 7)
-//MEM_TEST("ld2 {v18.b, v19.b}[0], [x5], #2", -9, 12)
-//MEM_TEST("ld2 {v18.b, v19.b}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld2 {v18.b, v19.b}[13], [x5]", 17, 7)
-//MEM_TEST("ld2 {v18.b, v19.b}[13], [x5], #2", -9, 12)
-//MEM_TEST("ld2 {v18.b, v19.b}[13], [x5], x6", 9, 13)
+MEM_TEST("st2 {v18.d, v19.d}[0], [x5]", 17, 7)
+MEM_TEST("st2 {v18.d, v19.d}[0], [x5], #16", -9, 12)
+MEM_TEST("st2 {v18.d, v19.d}[0], [x5], x6", 9, 13)
+
+MEM_TEST("st2 {v18.d, v19.d}[1], [x5]", 17, 7)
+MEM_TEST("st2 {v18.d, v19.d}[1], [x5], #16", -9, 12)
+MEM_TEST("st2 {v18.d, v19.d}[1], [x5], x6", 9, 13)
+
+MEM_TEST("st2 {v18.s, v19.s}[0], [x5]", 17, 7)
+MEM_TEST("st2 {v18.s, v19.s}[0], [x5], #8", -9, 12)
+MEM_TEST("st2 {v18.s, v19.s}[0], [x5], x6", 9, 13)
+
+MEM_TEST("st2 {v18.s, v19.s}[3], [x5]", 17, 7)
+MEM_TEST("st2 {v18.s, v19.s}[3], [x5], #8", -9, 12)
+MEM_TEST("st2 {v18.s, v19.s}[3], [x5], x6", 9, 13)
+
+MEM_TEST("st2 {v18.h, v19.h}[0], [x5]", 17, 7)
+MEM_TEST("st2 {v18.h, v19.h}[0], [x5], #4", -9, 12)
+MEM_TEST("st2 {v18.h, v19.h}[0], [x5], x6", 9, 13)
+
+MEM_TEST("st2 {v18.h, v19.h}[6], [x5]", 17, 7)
+MEM_TEST("st2 {v18.h, v19.h}[6], [x5], #4", -9, 12)
+MEM_TEST("st2 {v18.h, v19.h}[6], [x5], x6", 9, 13)
+
+MEM_TEST("st2 {v18.b, v19.b}[0], [x5]", 17, 7)
+MEM_TEST("st2 {v18.b, v19.b}[0], [x5], #2", -9, 12)
+MEM_TEST("st2 {v18.b, v19.b}[0], [x5], x6", 9, 13)
+
+MEM_TEST("st2 {v18.b, v19.b}[13], [x5]", 17, 7)
+MEM_TEST("st2 {v18.b, v19.b}[13], [x5], #2", -9, 12)
+MEM_TEST("st2 {v18.b, v19.b}[13], [x5], x6", 9, 13)
+
+
+MEM_TEST("ld2 {v18.d, v19.d}[0], [x5]", 17, 7)
+MEM_TEST("ld2 {v18.d, v19.d}[0], [x5], #16", -9, 12)
+MEM_TEST("ld2 {v18.d, v19.d}[0], [x5], x6", 9, 13)
+
+MEM_TEST("ld2 {v18.d, v19.d}[1], [x5]", 17, 7)
+MEM_TEST("ld2 {v18.d, v19.d}[1], [x5], #16", -9, 12)
+MEM_TEST("ld2 {v18.d, v19.d}[1], [x5], x6", 9, 13)
+
+MEM_TEST("ld2 {v18.s, v19.s}[0], [x5]", 17, 7)
+MEM_TEST("ld2 {v18.s, v19.s}[0], [x5], #8", -9, 12)
+MEM_TEST("ld2 {v18.s, v19.s}[0], [x5], x6", 9, 13)
+
+MEM_TEST("ld2 {v18.s, v19.s}[3], [x5]", 17, 7)
+MEM_TEST("ld2 {v18.s, v19.s}[3], [x5], #8", -9, 12)
+MEM_TEST("ld2 {v18.s, v19.s}[3], [x5], x6", 9, 13)
+
+MEM_TEST("ld2 {v18.h, v19.h}[0], [x5]", 17, 7)
+MEM_TEST("ld2 {v18.h, v19.h}[0], [x5], #4", -9, 12)
+MEM_TEST("ld2 {v18.h, v19.h}[0], [x5], x6", 9, 13)
+
+MEM_TEST("ld2 {v18.h, v19.h}[6], [x5]", 17, 7)
+MEM_TEST("ld2 {v18.h, v19.h}[6], [x5], #4", -9, 12)
+MEM_TEST("ld2 {v18.h, v19.h}[6], [x5], x6", 9, 13)
+
+MEM_TEST("ld2 {v18.b, v19.b}[0], [x5]", 17, 7)
+MEM_TEST("ld2 {v18.b, v19.b}[0], [x5], #2", -9, 12)
+MEM_TEST("ld2 {v18.b, v19.b}[0], [x5], x6", 9, 13)
+
+MEM_TEST("ld2 {v18.b, v19.b}[13], [x5]", 17, 7)
+MEM_TEST("ld2 {v18.b, v19.b}[13], [x5], #2", -9, 12)
+MEM_TEST("ld2 {v18.b, v19.b}[13], [x5], x6", 9, 13)
////////////////////////////////////////////////////////////////
printf("LD3/ST3 (single 3-elem struct to/from one lane of 3 regs\n");
-//MEM_TEST("st3 {v17.d, v18.d, v19.d}[0], [x5]", 17, 7)
-//MEM_TEST("st3 {v17.d, v18.d, v19.d}[0], [x5], #24", -9, 12)
-//MEM_TEST("st3 {v17.d, v18.d, v19.d}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("st3 {v17.d, v18.d, v19.d}[1], [x5]", 17, 7)
-//MEM_TEST("st3 {v17.d, v18.d, v19.d}[1], [x5], #24", -9, 12)
-//MEM_TEST("st3 {v17.d, v18.d, v19.d}[1], [x5], x6", 9, 13)
-//
-//MEM_TEST("st3 {v17.s, v18.s, v19.s}[0], [x5]", 17, 7)
-//MEM_TEST("st3 {v17.s, v18.s, v19.s}[0], [x5], #12", -9, 12)
-//MEM_TEST("st3 {v17.s, v18.s, v19.s}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("st3 {v17.s, v18.s, v19.s}[3], [x5]", 17, 7)
-//MEM_TEST("st3 {v17.s, v18.s, v19.s}[3], [x5], #12", -9, 12)
-//MEM_TEST("st3 {v17.s, v18.s, v19.s}[3], [x5], x6", 9, 13)
-//
-//MEM_TEST("st3 {v17.h, v18.h, v19.h}[0], [x5]", 17, 7)
-//MEM_TEST("st3 {v17.h, v18.h, v19.h}[0], [x5], #6", -9, 12)
-//MEM_TEST("st3 {v17.h, v18.h, v19.h}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("st3 {v17.h, v18.h, v19.h}[6], [x5]", 17, 7)
-//MEM_TEST("st3 {v17.h, v18.h, v19.h}[6], [x5], #6", -9, 12)
-//MEM_TEST("st3 {v17.h, v18.h, v19.h}[6], [x5], x6", 9, 13)
-//
-//MEM_TEST("st3 {v17.b, v18.b, v19.b}[0], [x5]", 17, 7)
-//MEM_TEST("st3 {v17.b, v18.b, v19.b}[0], [x5], #3", -9, 12)
-//MEM_TEST("st3 {v17.b, v18.b, v19.b}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("st3 {v17.b, v18.b, v19.b}[13], [x5]", 17, 7)
-//MEM_TEST("st3 {v17.b, v18.b, v19.b}[13], [x5], #3", -9, 12)
-//MEM_TEST("st3 {v17.b, v18.b, v19.b}[13], [x5], x6", 9, 13)
-//
-//
-//MEM_TEST("ld3 {v17.d, v18.d, v19.d}[0], [x5]", 17, 7)
-//MEM_TEST("ld3 {v17.d, v18.d, v19.d}[0], [x5], #24", -9, 12)
-//MEM_TEST("ld3 {v17.d, v18.d, v19.d}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld3 {v17.d, v18.d, v19.d}[1], [x5]", 17, 7)
-//MEM_TEST("ld3 {v17.d, v18.d, v19.d}[1], [x5], #24", -9, 12)
-//MEM_TEST("ld3 {v17.d, v18.d, v19.d}[1], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld3 {v17.s, v18.s, v19.s}[0], [x5]", 17, 7)
-//MEM_TEST("ld3 {v17.s, v18.s, v19.s}[0], [x5], #12", -9, 12)
-//MEM_TEST("ld3 {v17.s, v18.s, v19.s}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld3 {v17.s, v18.s, v19.s}[3], [x5]", 17, 7)
-//MEM_TEST("ld3 {v17.s, v18.s, v19.s}[3], [x5], #12", -9, 12)
-//MEM_TEST("ld3 {v17.s, v18.s, v19.s}[3], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld3 {v17.h, v18.h, v19.h}[0], [x5]", 17, 7)
-//MEM_TEST("ld3 {v17.h, v18.h, v19.h}[0], [x5], #6", -9, 12)
-//MEM_TEST("ld3 {v17.h, v18.h, v19.h}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld3 {v17.h, v18.h, v19.h}[6], [x5]", 17, 7)
-//MEM_TEST("ld3 {v17.h, v18.h, v19.h}[6], [x5], #6", -9, 12)
-//MEM_TEST("ld3 {v17.h, v18.h, v19.h}[6], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld3 {v17.b, v18.b, v19.b}[0], [x5]", 17, 7)
-//MEM_TEST("ld3 {v17.b, v18.b, v19.b}[0], [x5], #3", -9, 12)
-//MEM_TEST("ld3 {v17.b, v18.b, v19.b}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld3 {v17.b, v18.b, v19.b}[13], [x5]", 17, 7)
-//MEM_TEST("ld3 {v17.b, v18.b, v19.b}[13], [x5], #3", -9, 12)
-//MEM_TEST("ld3 {v17.b, v18.b, v19.b}[13], [x5], x6", 9, 13)
+MEM_TEST("st3 {v17.d, v18.d, v19.d}[0], [x5]", 17, 7)
+MEM_TEST("st3 {v17.d, v18.d, v19.d}[0], [x5], #24", -9, 12)
+MEM_TEST("st3 {v17.d, v18.d, v19.d}[0], [x5], x6", 9, 13)
+
+MEM_TEST("st3 {v17.d, v18.d, v19.d}[1], [x5]", 17, 7)
+MEM_TEST("st3 {v17.d, v18.d, v19.d}[1], [x5], #24", -9, 12)
+MEM_TEST("st3 {v17.d, v18.d, v19.d}[1], [x5], x6", 9, 13)
+
+MEM_TEST("st3 {v17.s, v18.s, v19.s}[0], [x5]", 17, 7)
+MEM_TEST("st3 {v17.s, v18.s, v19.s}[0], [x5], #12", -9, 12)
+MEM_TEST("st3 {v17.s, v18.s, v19.s}[0], [x5], x6", 9, 13)
+
+MEM_TEST("st3 {v17.s, v18.s, v19.s}[3], [x5]", 17, 7)
+MEM_TEST("st3 {v17.s, v18.s, v19.s}[3], [x5], #12", -9, 12)
+MEM_TEST("st3 {v17.s, v18.s, v19.s}[3], [x5], x6", 9, 13)
+
+MEM_TEST("st3 {v17.h, v18.h, v19.h}[0], [x5]", 17, 7)
+MEM_TEST("st3 {v17.h, v18.h, v19.h}[0], [x5], #6", -9, 12)
+MEM_TEST("st3 {v17.h, v18.h, v19.h}[0], [x5], x6", 9, 13)
+
+MEM_TEST("st3 {v17.h, v18.h, v19.h}[6], [x5]", 17, 7)
+MEM_TEST("st3 {v17.h, v18.h, v19.h}[6], [x5], #6", -9, 12)
+MEM_TEST("st3 {v17.h, v18.h, v19.h}[6], [x5], x6", 9, 13)
+
+MEM_TEST("st3 {v17.b, v18.b, v19.b}[0], [x5]", 17, 7)
+MEM_TEST("st3 {v17.b, v18.b, v19.b}[0], [x5], #3", -9, 12)
+MEM_TEST("st3 {v17.b, v18.b, v19.b}[0], [x5], x6", 9, 13)
+
+MEM_TEST("st3 {v17.b, v18.b, v19.b}[13], [x5]", 17, 7)
+MEM_TEST("st3 {v17.b, v18.b, v19.b}[13], [x5], #3", -9, 12)
+MEM_TEST("st3 {v17.b, v18.b, v19.b}[13], [x5], x6", 9, 13)
+
+
+MEM_TEST("ld3 {v17.d, v18.d, v19.d}[0], [x5]", 17, 7)
+MEM_TEST("ld3 {v17.d, v18.d, v19.d}[0], [x5], #24", -9, 12)
+MEM_TEST("ld3 {v17.d, v18.d, v19.d}[0], [x5], x6", 9, 13)
+
+MEM_TEST("ld3 {v17.d, v18.d, v19.d}[1], [x5]", 17, 7)
+MEM_TEST("ld3 {v17.d, v18.d, v19.d}[1], [x5], #24", -9, 12)
+MEM_TEST("ld3 {v17.d, v18.d, v19.d}[1], [x5], x6", 9, 13)
+
+MEM_TEST("ld3 {v17.s, v18.s, v19.s}[0], [x5]", 17, 7)
+MEM_TEST("ld3 {v17.s, v18.s, v19.s}[0], [x5], #12", -9, 12)
+MEM_TEST("ld3 {v17.s, v18.s, v19.s}[0], [x5], x6", 9, 13)
+
+MEM_TEST("ld3 {v17.s, v18.s, v19.s}[3], [x5]", 17, 7)
+MEM_TEST("ld3 {v17.s, v18.s, v19.s}[3], [x5], #12", -9, 12)
+MEM_TEST("ld3 {v17.s, v18.s, v19.s}[3], [x5], x6", 9, 13)
+
+MEM_TEST("ld3 {v17.h, v18.h, v19.h}[0], [x5]", 17, 7)
+MEM_TEST("ld3 {v17.h, v18.h, v19.h}[0], [x5], #6", -9, 12)
+MEM_TEST("ld3 {v17.h, v18.h, v19.h}[0], [x5], x6", 9, 13)
+
+MEM_TEST("ld3 {v17.h, v18.h, v19.h}[6], [x5]", 17, 7)
+MEM_TEST("ld3 {v17.h, v18.h, v19.h}[6], [x5], #6", -9, 12)
+MEM_TEST("ld3 {v17.h, v18.h, v19.h}[6], [x5], x6", 9, 13)
+
+MEM_TEST("ld3 {v17.b, v18.b, v19.b}[0], [x5]", 17, 7)
+MEM_TEST("ld3 {v17.b, v18.b, v19.b}[0], [x5], #3", -9, 12)
+MEM_TEST("ld3 {v17.b, v18.b, v19.b}[0], [x5], x6", 9, 13)
+
+MEM_TEST("ld3 {v17.b, v18.b, v19.b}[13], [x5]", 17, 7)
+MEM_TEST("ld3 {v17.b, v18.b, v19.b}[13], [x5], #3", -9, 12)
+MEM_TEST("ld3 {v17.b, v18.b, v19.b}[13], [x5], x6", 9, 13)
////////////////////////////////////////////////////////////////
printf("LD4/ST4 (single 4-elem struct to/from one lane of 4 regs\n");
-//MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[0], [x5]", 17, 7)
-//MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[0], [x5], #32", -9, 12)
-//MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[1], [x5]", 17, 7)
-//MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[1], [x5], #32", -9, 12)
-//MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[1], [x5], x6", 9, 13)
-//
-//MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[0], [x5]", 17, 7)
-//MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[0], [x5], #16", -9, 12)
-//MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[3], [x5]", 17, 7)
-//MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[3], [x5], #16", -9, 12)
-//MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[3], [x5], x6", 9, 13)
-//
-//MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[0], [x5]", 17, 7)
-//MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[0], [x5], #8", -9, 12)
-//MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[6], [x5]", 17, 7)
-//MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[6], [x5], #8", -9, 12)
-//MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[6], [x5], x6", 9, 13)
-//
-//MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[0], [x5]", 17, 7)
-//MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[0], [x5], #4", -9, 12)
-//MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[13], [x5]", 17, 7)
-//MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[13], [x5], #4", -9, 12)
-//MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[13], [x5], x6", 9, 13)
-//
-//
-//MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[0], [x5]", 17, 7)
-//MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[0], [x5], #32", -9, 12)
-//MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[1], [x5]", 17, 7)
-//MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[1], [x5], #32", -9, 12)
-//MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[1], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[0], [x5]", 17, 7)
-//MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[0], [x5], #16", -9, 12)
-//MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[3], [x5]", 17, 7)
-//MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[3], [x5], #16", -9, 12)
-//MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[3], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[0], [x5]", 17, 7)
-//MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[0], [x5], #8", -9, 12)
-//MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[6], [x5]", 17, 7)
-//MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[6], [x5], #8", -9, 12)
-//MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[6], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[0], [x5]", 17, 7)
-//MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[0], [x5], #4", -9, 12)
-//MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[0], [x5], x6", 9, 13)
-//
-//MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[13], [x5]", 17, 7)
-//MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[13], [x5], #4", -9, 12)
-//MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[13], [x5], x6", 9, 13)
+MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[0], [x5]", 17, 7)
+MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[0], [x5], #32", -9, 12)
+MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[0], [x5], x6", 9, 13)
+
+MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[1], [x5]", 17, 7)
+MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[1], [x5], #32", -9, 12)
+MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[1], [x5], x6", 9, 13)
+
+MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[0], [x5]", 17, 7)
+MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[0], [x5], #16", -9, 12)
+MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[0], [x5], x6", 9, 13)
+
+MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[3], [x5]", 17, 7)
+MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[3], [x5], #16", -9, 12)
+MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[3], [x5], x6", 9, 13)
+
+MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[0], [x5]", 17, 7)
+MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[0], [x5], #8", -9, 12)
+MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[0], [x5], x6", 9, 13)
+
+MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[6], [x5]", 17, 7)
+MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[6], [x5], #8", -9, 12)
+MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[6], [x5], x6", 9, 13)
+
+MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[0], [x5]", 17, 7)
+MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[0], [x5], #4", -9, 12)
+MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[0], [x5], x6", 9, 13)
+
+MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[13], [x5]", 17, 7)
+MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[13], [x5], #4", -9, 12)
+MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[13], [x5], x6", 9, 13)
+
+
+MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[0], [x5]", 17, 7)
+MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[0], [x5], #32", -9, 12)
+MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[0], [x5], x6", 9, 13)
+
+MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[1], [x5]", 17, 7)
+MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[1], [x5], #32", -9, 12)
+MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[1], [x5], x6", 9, 13)
+
+MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[0], [x5]", 17, 7)
+MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[0], [x5], #16", -9, 12)
+MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[0], [x5], x6", 9, 13)
+
+MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[3], [x5]", 17, 7)
+MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[3], [x5], #16", -9, 12)
+MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[3], [x5], x6", 9, 13)
+
+MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[0], [x5]", 17, 7)
+MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[0], [x5], #8", -9, 12)
+MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[0], [x5], x6", 9, 13)
+
+MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[6], [x5]", 17, 7)
+MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[6], [x5], #8", -9, 12)
+MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[6], [x5], x6", 9, 13)
+
+MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[0], [x5]", 17, 7)
+MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[0], [x5], #4", -9, 12)
+MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[0], [x5], x6", 9, 13)
+
+MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[13], [x5]", 17, 7)
+MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[13], [x5], #4", -9, 12)
+MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[13], [x5], x6", 9, 13)
} /* end of test_memory2() */
Modified: branches/VALGRIND_3_10_BRANCH/none/tests/arm64/memory.stdout.exp
==============================================================================
--- branches/VALGRIND_3_10_BRANCH/none/tests/arm64/memory.stdout.exp (original)
+++ branches/VALGRIND_3_10_BRANCH/none/tests/arm64/memory.stdout.exp Sun Nov 23 12:59:56 2014
@@ -12868,6 +12868,1626 @@
0 x6 (sub, index reg)
LD1/ST1 (multiple 1-elem structs to/from 2/3/4 regs
+st1 {v19.2d, v20.2d}, [x5] with x5 = middle_of_block+17, x6=7
+ [ 0] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 16] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 32] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 48] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 64] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 80] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 96] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [112] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [128] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [144] .. ab d7 bd ea 5c d2 ef 93 ea 9e 58 ad 15 d3 6b
+ [160] 6f 92 04 a1 1e 4e 21 cf 3e 4d 6d cc c6 66 69 01
+ [176] 8f .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [192] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [208] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [224] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [240] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ 0000000000000000 x13 (xor, xfer intreg #1)
+ 0000000000000000 x23 (xor, xfer intreg #2)
+ 0000000000000000 v17.d[0] (xor, xfer vecreg #1)
+ 0000000000000000 v17.d[1] (xor, xfer vecreg #1)
+ 0000000000000000 v18.d[0] (xor, xfer vecreg #2)
+ 0000000000000000 v18.d[1] (xor, xfer vecreg #2)
+ 0000000000000000 v19.d[0] (xor, xfer vecreg #3)
+ 0000000000000000 v19.d[1] (xor, xfer vecreg #3)
+ 0000000000000000 v20.d[0] (xor, xfer vecreg #3)
+ 0000000000000000 v20.d[1] (xor, xfer vecreg #3)
+ 0 x5 (sub, base reg)
+ 0 x6 (sub, index reg)
+
+st1 {v19.2d, v20.2d}, [x5], #32 with x5 = middle_of_block+9, x6=9
+ [ 0] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 16] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 32] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 48] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 64] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 80] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 96] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [112] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [128] .. .. .. .. .. .. .. .. .. cc 79 aa 79 30 64 50
+ [144] 2a e9 a6 99 31 f0 0d 6b ac a1 02 22 80 7a 9a 59
+ [160] d8 8d 3c 3a c9 3f a1 e3 b8 .. .. .. .. .. .. ..
+ [176] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [192] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [208] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [224] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [240] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ 0000000000000000 x13 (xor, xfer intreg #1)
+ 0000000000000000 x23 (xor, xfer intreg #2)
+ 0000000000000000 v17.d[0] (xor, xfer vecreg #1)
+ 0000000000000000 v17.d[1] (xor, xfer vecreg #1)
+ 0000000000000000 v18.d[0] (xor, xfer vecreg #2)
+ 0000000000000000 v18.d[1] (xor, xfer vecreg #2)
+ 0000000000000000 v19.d[0] (xor, xfer vecreg #3)
+ 0000000000000000 v19.d[1] (xor, xfer vecreg #3)
+ 0000000000000000 v20.d[0] (xor, xfer vecreg #3)
+ 0000000000000000 v20.d[1] (xor, xfer vecreg #3)
+ 32 x5 (sub, base reg)
+ 0 x6 (sub, index reg)
+
+st1 {v19.2d, v20.2d}, [x5], x6 with x5 = middle_of_block+-13, x6=-5
+ [ 0] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 16] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 32] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 48] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 64] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 80] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 96] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [112] .. .. .. b7 10 15 41 15 b8 2c 6e db 28 f2 d7 47
+ [128] 4f bf a9 40 86 92 62 1c f1 f3 38 6d 13 a2 5a 69
+ [144] 84 87 11 .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [160] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [176] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [192] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [208] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [224] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [240] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ 0000000000000000 x13 (xor, xfer intreg #1)
+ 0000000000000000 x23 (xor, xfer intreg #2)
+ 0000000000000000 v17.d[0] (xor, xfer vecreg #1)
+ 0000000000000000 v17.d[1] (xor, xfer vecreg #1)
+ 0000000000000000 v18.d[0] (xor, xfer vecreg #2)
+ 0000000000000000 v18.d[1] (xor, xfer vecreg #2)
+ 0000000000000000 v19.d[0] (xor, xfer vecreg #3)
+ 0000000000000000 v19.d[1] (xor, xfer vecreg #3)
+ 0000000000000000 v20.d[0] (xor, xfer vecreg #3)
+ 0000000000000000 v20.d[1] (xor, xfer vecreg #3)
+ -5 x5 (sub, base reg)
+ 0 x6 (sub, index reg)
+
+st1 {v17.2d, v18.2d, v19.2d}, [x5] with x5 = middle_of_block+17, x6=7
+ [ 0] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 16] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 32] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 48] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 64] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 80] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [ 96] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [112] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [128] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+...
[truncated message content] |
|
From: <sv...@va...> - 2014-11-23 12:58:34
|
Author: sewardj
Date: Sun Nov 23 12:58:22 2014
New Revision: 14756
Log:
Merge, from trunk, r14653
14653 Add test cases for all known arm64 load/store instructions.
Modified:
branches/VALGRIND_3_10_BRANCH/ (props changed)
branches/VALGRIND_3_10_BRANCH/none/tests/arm64/memory.c
branches/VALGRIND_3_10_BRANCH/none/tests/arm64/memory.stdout.exp
Modified: branches/VALGRIND_3_10_BRANCH/none/tests/arm64/memory.c
==============================================================================
--- branches/VALGRIND_3_10_BRANCH/none/tests/arm64/memory.c (original)
+++ branches/VALGRIND_3_10_BRANCH/none/tests/arm64/memory.c Sun Nov 23 12:58:22 2014
@@ -110,11 +110,14 @@
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
+// //
+// test_memory_old //
+// //
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
-static __attribute((noinline)) void test_memory ( void )
+static __attribute((noinline)) void test_memory_old ( void )
{
printf("Integer loads\n");
@@ -286,13 +289,15 @@
////////////////////////////////////////////////////////////////
printf("STL{R,RH,RB} (entirely MISSING)\n");
-
-} /* end of test_memory() */
+} /* end of test_memory_old() */
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
+// //
+// test_memory_new //
+// //
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
@@ -393,7 +398,8 @@
free(area); \
}
-static __attribute__((noinline)) void test_memory2 ( void )
+
+static __attribute__((noinline)) void test_memory_new ( void )
{
////////////////////////////////////////////////////////////////
printf("LDR,STR (immediate, uimm12)");
@@ -432,10 +438,10 @@
MEM_TEST("stp w13, w23, [x5, #-40]", 0, 0);
////////////////////////////////////////////////////////////////
-printf("LDR (literal, int reg) (DONE ABOVE)\n");
+printf("LDR (literal, int reg) (done above by test_memory_old)\n");
////////////////////////////////////////////////////////////////
-printf("{LD,ST}R (integer register) (entirely MISSING)\n");
+printf("{LD,ST}R (integer register)\n");
MEM_TEST("str x13, [x5, x6]", 12, -4);
MEM_TEST("str x13, [x5, x6, lsl #3]", 12, -4);
MEM_TEST("str x13, [x5, w6, uxtw]", 12, 4);
@@ -497,7 +503,7 @@
MEM_TEST("ldrsb w13, [x5, #56]", -16, 4);
////////////////////////////////////////////////////////////////
-printf("LDRS{B,H,W} (simm9, upd) (upd check is MISSING)\n");
+printf("LDRS{B,H,W} (simm9, upd)\n");
MEM_TEST("ldrsw x13, [x5, #-24]!", -16, 4);
MEM_TEST("ldrsh x13, [x5, #-20]!", -16, 4);
MEM_TEST("ldrsh w13, [x5, #-44]!", -16, 4);
@@ -521,34 +527,44 @@
////////////////////////////////////////////////////////////////
printf("LDP,STP (immediate, simm7) (FP&VEC)\n");
-MEM_TEST("stp q17, q18, [x5, 32]", -16, 4);
-MEM_TEST("stp q17, q18, [x5, 32]!", -16, 4);
-MEM_TEST("stp q17, q18, [x5], 32", -16, 4);
-
-MEM_TEST("stp d17, d18, [x5, 32]", -16, 4);
-MEM_TEST("stp d17, d18, [x5, 32]!", -16, 4);
-MEM_TEST("stp d17, d18, [x5], 32", -16, 4);
-
-//MEM_TEST("stp s17, s18, [x5, 32]", -16, 4);
-//MEM_TEST("stp s17, s18, [x5, 32]!", -16, 4);
-//MEM_TEST("stp s17, s18, [x5], 32", -16, 4);
-
-MEM_TEST("ldp q17, q18, [x5, 32]", -16, 4);
-MEM_TEST("ldp q17, q18, [x5, 32]!", -16, 4);
-MEM_TEST("ldp q17, q18, [x5], 32", -16, 4);
-
-MEM_TEST("ldp d17, d18, [x5, 32]", -16, 4);
-MEM_TEST("ldp d17, d18, [x5, 32]!", -16, 4);
-MEM_TEST("ldp d17, d18, [x5], 32", -16, 4);
-
-//MEM_TEST("ldp s17, s18, [x5, 32]", -16, 4);
-//MEM_TEST("ldp s17, s18, [x5, 32]!", -16, 4);
-//MEM_TEST("ldp s17, s18, [x5], 32", -16, 4);
+MEM_TEST("stp q17, q18, [x5, 16]", -15, 4);
+MEM_TEST("stp q19, q18, [x5, 32]!", -11, 4);
+MEM_TEST("stp q20, q17, [x5], -48", -7, 4);
+
+MEM_TEST("stp d18, d17, [x5, 16]", -15, 4);
+MEM_TEST("stp d17, d19, [x5, 32]!", -11, 4);
+MEM_TEST("stp d20, d18, [x5], -48", -7, 4);
+
+MEM_TEST("stp s17, s18, [x5, 16]", -15, 4);
+MEM_TEST("stp s19, s18, [x5, 32]!", -11, 4);
+MEM_TEST("stp s20, s17, [x5], -48", -7, 4);
+
+MEM_TEST("ldp q17, q18, [x5, 16]", -15, 4);
+MEM_TEST("ldp q18, q19, [x5, 32]!", -11, 4);
+MEM_TEST("ldp q19, q20, [x5], -48", -7, 4);
+
+MEM_TEST("ldp d20, d17, [x5, 16]", -15, 4);
+MEM_TEST("ldp d17, d18, [x5, 32]!", -11, 4);
+MEM_TEST("ldp d18, d19, [x5], -48", -7, 4);
+
+MEM_TEST("ldp s19, s20, [x5, 16]", -15, 4);
+MEM_TEST("ldp s20, s17, [x5, 32]!", -11, 4);
+MEM_TEST("ldp s17, s18, [x5], -48", -7, 4);
+
+////////////////////////////////////////////////////////////////
+printf("LDNP,STNP (immediate, simm7) (FP&VEC, w/ nontemporal hint)\n");
+
+MEM_TEST("stnp q18, q17, [x5, 16]", -15, 4);
+MEM_TEST("stnp d20, d19, [x5, 40]", -15, 4);
+MEM_TEST("stnp s19, s18, [x5, 68]", -15, 4);
+
+MEM_TEST("ldnp q18, q17, [x5, 16]", -15, 4);
+MEM_TEST("ldnp d17, d20, [x5, 40]", -15, 4);
+MEM_TEST("ldnp s20, s19, [x5, 68]", -15, 4);
////////////////////////////////////////////////////////////////
printf("{LD,ST}R (vector register)\n");
-#if 0
MEM_TEST("str q17, [x5, x6]", 12, -4);
MEM_TEST("str q17, [x5, x6, lsl #4]", 12, -4);
MEM_TEST("str q17, [x5, w6, uxtw]", 12, 4);
@@ -561,7 +577,6 @@
MEM_TEST("ldr q17, [x5, w6, uxtw #4]", 12, 4);
MEM_TEST("ldr q17, [x5, w6, sxtw]", 12, 4);
MEM_TEST("ldr q17, [x5, w6, sxtw #4]", 12, -4);
-#endif
MEM_TEST("str d17, [x5, x6]", 12, -4);
MEM_TEST("str d17, [x5, x6, lsl #3]", 12, -4);
@@ -653,103 +668,536 @@
MEM_TEST("ldrsb w13, [x5,w6,sxtw #0]", 12, 4);
MEM_TEST("ldrsb w13, [x5,w6,sxtw #0]", 12, -4);
-
////////////////////////////////////////////////////////////////
printf("LDR/STR (immediate, SIMD&FP, unsigned offset)\n");
+
MEM_TEST("str q17, [x5, #-32]", 16, 0);
MEM_TEST("str d17, [x5, #-32]", 16, 0);
MEM_TEST("str s17, [x5, #-32]", 16, 0);
-//MEM_TEST("str h17, [x5, #-32]", 16, 0);
-//MEM_TEST("str b17, [x5, #-32]", 16, 0);
+MEM_TEST("str h17, [x5, #-32]", 16, 0);
+MEM_TEST("str b17, [x5, #-32]", 16, 0);
MEM_TEST("ldr q17, [x5, #-32]", 16, 0);
MEM_TEST("ldr d17, [x5, #-32]", 16, 0);
MEM_TEST("ldr s17, [x5, #-32]", 16, 0);
-//MEM_TEST("ldr h17, [x5, #-32]", 16, 0);
-//MEM_TEST("ldr b17, [x5, #-32]", 16, 0);
+MEM_TEST("ldr h17, [x5, #-32]", 16, 0);
+MEM_TEST("ldr b17, [x5, #-32]", 16, 0);
////////////////////////////////////////////////////////////////
printf("LDR/STR (immediate, SIMD&FP, pre/post index)\n");
+
MEM_TEST("str q17, [x5], #-32", 16, 0);
MEM_TEST("str d17, [x5], #-32", 16, 0);
MEM_TEST("str s17, [x5], #-32", 16, 0);
-//MEM_TEST("str h17, [x5], #-32", 16, 0);
-//MEM_TEST("str b17, [x5], #-32", 16, 0);
+MEM_TEST("str h17, [x5], #-32", 16, 0);
+MEM_TEST("str b17, [x5], #-32", 16, 0);
MEM_TEST("ldr q17, [x5], #-32", 16, 0);
MEM_TEST("ldr d17, [x5], #-32", 16, 0);
MEM_TEST("ldr s17, [x5], #-32", 16, 0);
-//MEM_TEST("ldr h17, [x5], #-32", 16, 0);
-//MEM_TEST("ldr b17, [x5], #-32", 16, 0);
+MEM_TEST("ldr h17, [x5], #-32", 16, 0);
+MEM_TEST("ldr b17, [x5], #-32", 16, 0);
MEM_TEST("str q17, [x5, #-32]!", 16, 0);
MEM_TEST("str d17, [x5, #-32]!", 16, 0);
MEM_TEST("str s17, [x5, #-32]!", 16, 0);
-//MEM_TEST("str h17, [x5, #-32]!", 16, 0);
-//MEM_TEST("str b17, [x5, #-32]!", 16, 0);
+MEM_TEST("str h17, [x5, #-32]!", 16, 0);
+MEM_TEST("str b17, [x5, #-32]!", 16, 0);
MEM_TEST("ldr q17, [x5, #-32]!", 16, 0);
MEM_TEST("ldr d17, [x5, #-32]!", 16, 0);
MEM_TEST("ldr s17, [x5, #-32]!", 16, 0);
-//MEM_TEST("ldr h17, [x5, #-32]!", 16, 0);
-//MEM_TEST("ldr b17, [x5, #-32]!", 16, 0);
-
+MEM_TEST("ldr h17, [x5, #-32]!", 16, 0);
+MEM_TEST("ldr b17, [x5, #-32]!", 16, 0);
////////////////////////////////////////////////////////////////
printf("LDUR/STUR (unscaled offset, SIMD&FP)\n");
+
MEM_TEST("str q17, [x5, #-13]", 16, 0);
MEM_TEST("str d17, [x5, #-13]", 16, 0);
MEM_TEST("str s17, [x5, #-13]", 16, 0);
-//MEM_TEST("str h17, [x5, #-13]", 16, 0);
-//MEM_TEST("str b17, [x5, #-13]", 16, 0);
+MEM_TEST("str h17, [x5, #-13]", 16, 0);
+MEM_TEST("str b17, [x5, #-13]", 16, 0);
MEM_TEST("ldr q17, [x5, #-13]", 16, 0);
MEM_TEST("ldr d17, [x5, #-13]", 16, 0);
MEM_TEST("ldr s17, [x5, #-13]", 16, 0);
-//MEM_TEST("ldr h17, [x5, #-13]", 16, 0);
-//MEM_TEST("ldr b17, [x5, #-13]", 16, 0);
+MEM_TEST("ldr h17, [x5, #-13]", 16, 0);
+MEM_TEST("ldr b17, [x5, #-13]", 16, 0);
////////////////////////////////////////////////////////////////
printf("LDR (literal, SIMD&FP) (entirely MISSING)\n");
+MEM_TEST("xyzzy10: ldr s17, xyzzy10 - 8", 0, 0)
+MEM_TEST("xyzzy11: ldr d17, xyzzy11 + 8", 0, 0)
+MEM_TEST("xyzzy12: ldr q17, xyzzy12 + 4", 0, 0)
+
+////////////////////////////////////////////////////////////////
+printf("LD1/ST1 (multiple 1-elem structs to/from 1 reg\n");
+
+MEM_TEST("st1 {v18.2d}, [x5]", 17, 7)
+MEM_TEST("st1 {v18.2d}, [x5], #16", 9, 9)
+MEM_TEST("st1 {v18.2d}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v18.1d}, [x5]", 17, 7)
+MEM_TEST("st1 {v18.1d}, [x5], #8", 9, 9)
+MEM_TEST("st1 {v18.1d}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v18.4s}, [x5]", 17, 7)
+MEM_TEST("st1 {v18.4s}, [x5], #16", 9, 9)
+MEM_TEST("st1 {v18.4s}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v18.2s}, [x5]", 17, 7)
+MEM_TEST("st1 {v18.2s}, [x5], #8", 9, 9)
+MEM_TEST("st1 {v18.2s}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v18.8h}, [x5]", 17, 7)
+MEM_TEST("st1 {v18.8h}, [x5], #16", 9, 9)
+MEM_TEST("st1 {v18.8h}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v18.4h}, [x5]", 17, 7)
+MEM_TEST("st1 {v18.4h}, [x5], #8", 9, 9)
+MEM_TEST("st1 {v18.4h}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v18.16b}, [x5]", 17, 7)
+MEM_TEST("st1 {v18.16b}, [x5], #16", 9, 9)
+MEM_TEST("st1 {v18.16b}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v18.8b}, [x5]", 17, 7)
+MEM_TEST("st1 {v18.8b}, [x5], #8", 9, 9)
+MEM_TEST("st1 {v18.8b}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v18.2d}, [x5]", 17, 7)
+MEM_TEST("ld1 {v18.2d}, [x5], #16", 9, 9)
+MEM_TEST("ld1 {v18.2d}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v18.1d}, [x5]", 17, 7)
+MEM_TEST("ld1 {v18.1d}, [x5], #8", 9, 9)
+MEM_TEST("ld1 {v18.1d}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v18.4s}, [x5]", 17, 7)
+MEM_TEST("ld1 {v18.4s}, [x5], #16", 9, 9)
+MEM_TEST("ld1 {v18.4s}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v18.2s}, [x5]", 17, 7)
+MEM_TEST("ld1 {v18.2s}, [x5], #8", 9, 9)
+MEM_TEST("ld1 {v18.2s}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v18.8h}, [x5]", 17, 7)
+MEM_TEST("ld1 {v18.8h}, [x5], #16", 9, 9)
+MEM_TEST("ld1 {v18.8h}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v18.4h}, [x5]", 17, 7)
+MEM_TEST("ld1 {v18.4h}, [x5], #8", 9, 9)
+MEM_TEST("ld1 {v18.4h}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v18.16b}, [x5]", 17, 7)
+MEM_TEST("ld1 {v18.16b}, [x5], #16", 9, 9)
+MEM_TEST("ld1 {v18.16b}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v18.8b}, [x5]", 17, 7)
+MEM_TEST("ld1 {v18.8b}, [x5], #8", 9, 9)
+MEM_TEST("ld1 {v18.8b}, [x5], x6", -13, -5)
+
////////////////////////////////////////////////////////////////
-printf("LD1/ST1 (single structure, no offset)\n");
-MEM_TEST("st1 {v17.2d}, [x5]", 3, 0)
-MEM_TEST("st1 {v17.4s}, [x5]", 5, 0)
-MEM_TEST("st1 {v17.8h}, [x5]", 7, 0)
-MEM_TEST("st1 {v17.16b}, [x5]", 13, 0)
-MEM_TEST("st1 {v17.1d}, [x5]", 3, 0)
-MEM_TEST("st1 {v17.2s}, [x5]", 5, 0)
-MEM_TEST("st1 {v17.4h}, [x5]", 7, 0)
-MEM_TEST("st1 {v17.8b}, [x5]", 13, 0)
-
-MEM_TEST("ld1 {v17.2d}, [x5]", 3, 0)
-MEM_TEST("ld1 {v17.4s}, [x5]", 5, 0)
-MEM_TEST("ld1 {v17.8h}, [x5]", 7, 0)
-MEM_TEST("ld1 {v17.16b}, [x5]", 13, 0)
-MEM_TEST("ld1 {v17.1d}, [x5]", 3, 0)
-MEM_TEST("ld1 {v17.2s}, [x5]", 5, 0)
-MEM_TEST("ld1 {v17.4h}, [x5]", 7, 0)
-MEM_TEST("ld1 {v17.8b}, [x5]", 13, 0)
-
-////////////////////////////////////////////////////////////////
-printf("LD1/ST1 (single structure, post index)\n");
-MEM_TEST("st1 {v17.2d}, [x5], #16", 3, 0)
-MEM_TEST("st1 {v17.4s}, [x5], #16", 5, 0)
-MEM_TEST("st1 {v17.8h}, [x5], #16", 7, 0)
-MEM_TEST("st1 {v17.16b}, [x5], #16", 13, 0)
-MEM_TEST("st1 {v17.1d}, [x5], #8", 3, 0)
-MEM_TEST("st1 {v17.2s}, [x5], #8", 5, 0)
-MEM_TEST("st1 {v17.4h}, [x5], #8", 7, 0)
-MEM_TEST("st1 {v17.8b}, [x5], #8", 13, 0)
-
-MEM_TEST("ld1 {v17.2d}, [x5], #16", 3, 0)
-MEM_TEST("ld1 {v17.4s}, [x5], #16", 5, 0)
-MEM_TEST("ld1 {v17.8h}, [x5], #16", 7, 0)
-MEM_TEST("ld1 {v17.16b}, [x5], #16", 13, 0)
-MEM_TEST("ld1 {v17.1d}, [x5], #8", 3, 0)
-MEM_TEST("ld1 {v17.2s}, [x5], #8", 5, 0)
-MEM_TEST("ld1 {v17.4h}, [x5], #8", 7, 0)
-MEM_TEST("ld1 {v17.8b}, [x5], #8", 13, 0)
+printf("LD2/ST2 (multiple 2-elem structs to/from 2 regs\n");
+
+MEM_TEST("st2 {v18.2d, v19.2d}, [x5]", 17, 7)
+MEM_TEST("st2 {v18.2d, v19.2d}, [x5], #32", 9, 9)
+MEM_TEST("st2 {v18.2d, v19.2d}, [x5], x6", -13, -5)
+
+/* no 1d case */
+
+MEM_TEST("st2 {v18.4s, v19.4s}, [x5]", 17, 7)
+MEM_TEST("st2 {v18.4s, v19.4s}, [x5], #32", 9, 9)
+MEM_TEST("st2 {v18.4s, v19.4s}, [x5], x6", -13, -5)
+
+MEM_TEST("st2 {v18.2s, v19.2s}, [x5]", 17, 7)
+MEM_TEST("st2 {v18.2s, v19.2s}, [x5], #16", 9, 9)
+MEM_TEST("st2 {v18.2s, v19.2s}, [x5], x6", -13, -5)
+
+MEM_TEST("st2 {v18.8h, v19.8h}, [x5]", 17, 7)
+MEM_TEST("st2 {v18.8h, v19.8h}, [x5], #32", 9, 9)
+MEM_TEST("st2 {v18.8h, v19.8h}, [x5], x6", -13, -5)
+
+MEM_TEST("st2 {v18.4h, v19.4h}, [x5]", 17, 7)
+MEM_TEST("st2 {v18.4h, v19.4h}, [x5], #16", 9, 9)
+MEM_TEST("st2 {v18.4h, v19.4h}, [x5], x6", -13, -5)
+
+MEM_TEST("st2 {v18.16b, v19.16b}, [x5]", 17, 7)
+MEM_TEST("st2 {v18.16b, v19.16b}, [x5], #32", 9, 9)
+MEM_TEST("st2 {v18.16b, v19.16b}, [x5], x6", -13, -5)
+
+MEM_TEST("st2 {v18.8b, v19.8b}, [x5]", 17, 7)
+MEM_TEST("st2 {v18.8b, v19.8b}, [x5], #16", 9, 9)
+MEM_TEST("st2 {v18.8b, v19.8b}, [x5], x6", -13, -5)
+
+MEM_TEST("ld2 {v18.2d, v19.2d}, [x5]", 17, 7)
+MEM_TEST("ld2 {v18.2d, v19.2d}, [x5], #32", 9, 9)
+MEM_TEST("ld2 {v18.2d, v19.2d}, [x5], x6", -13, -5)
+
+/* no 1d case */
+
+MEM_TEST("ld2 {v18.4s, v19.4s}, [x5]", 17, 7)
+MEM_TEST("ld2 {v18.4s, v19.4s}, [x5], #32", 9, 9)
+MEM_TEST("ld2 {v18.4s, v19.4s}, [x5], x6", -13, -5)
+
+MEM_TEST("ld2 {v18.2s, v19.2s}, [x5]", 17, 7)
+MEM_TEST("ld2 {v18.2s, v19.2s}, [x5], #16", 9, 9)
+MEM_TEST("ld2 {v18.2s, v19.2s}, [x5], x6", -13, -5)
+
+MEM_TEST("ld2 {v18.8h, v19.8h}, [x5]", 17, 7)
+MEM_TEST("ld2 {v18.8h, v19.8h}, [x5], #32", 9, 9)
+MEM_TEST("ld2 {v18.8h, v19.8h}, [x5], x6", -13, -5)
+
+MEM_TEST("ld2 {v18.4h, v19.4h}, [x5]", 17, 7)
+MEM_TEST("ld2 {v18.4h, v19.4h}, [x5], #16", 9, 9)
+MEM_TEST("ld2 {v18.4h, v19.4h}, [x5], x6", -13, -5)
+
+MEM_TEST("ld2 {v18.16b, v19.16b}, [x5]", 17, 7)
+MEM_TEST("ld2 {v18.16b, v19.16b}, [x5], #32", 9, 9)
+MEM_TEST("ld2 {v18.16b, v19.16b}, [x5], x6", -13, -5)
+
+MEM_TEST("ld2 {v18.8b, v19.8b}, [x5]", 17, 7)
+MEM_TEST("ld2 {v18.8b, v19.8b}, [x5], #16", 9, 9)
+MEM_TEST("ld2 {v18.8b, v19.8b}, [x5], x6", -13, -5)
+
+////////////////////////////////////////////////////////////////
+printf("LD3/ST3 (multiple 3-elem structs to/from 3 regs\n");
+
+MEM_TEST("st3 {v17.2d, v18.2d, v19.2d}, [x5]", 17, 7)
+MEM_TEST("st3 {v17.2d, v18.2d, v19.2d}, [x5], #48", 9, 9)
+MEM_TEST("st3 {v17.2d, v18.2d, v19.2d}, [x5], x6", -13, -5)
+
+/* no 1d case */
+
+MEM_TEST("st3 {v17.4s, v18.4s, v19.4s}, [x5]", 17, 7)
+MEM_TEST("st3 {v17.4s, v18.4s, v19.4s}, [x5], #48", 9, 9)
+MEM_TEST("st3 {v17.4s, v18.4s, v19.4s}, [x5], x6", -13, -5)
+
+MEM_TEST("st3 {v17.2s, v18.2s, v19.2s}, [x5]", 17, 7)
+MEM_TEST("st3 {v17.2s, v18.2s, v19.2s}, [x5], #24", 9, 9)
+MEM_TEST("st3 {v17.2s, v18.2s, v19.2s}, [x5], x6", -13, -5)
+
+MEM_TEST("st3 {v17.8h, v18.8h, v19.8h}, [x5]", 17, 7)
+MEM_TEST("st3 {v17.8h, v18.8h, v19.8h}, [x5], #48", 9, 9)
+MEM_TEST("st3 {v17.8h, v18.8h, v19.8h}, [x5], x6", -13, -5)
+
+MEM_TEST("st3 {v17.4h, v18.4h, v19.4h}, [x5]", 17, 7)
+MEM_TEST("st3 {v17.4h, v18.4h, v19.4h}, [x5], #24", 9, 9)
+MEM_TEST("st3 {v17.4h, v18.4h, v19.4h}, [x5], x6", -13, -5)
+
+MEM_TEST("st3 {v17.16b, v18.16b, v19.16b}, [x5]", 17, 7)
+MEM_TEST("st3 {v17.16b, v18.16b, v19.16b}, [x5], #48", 9, 9)
+MEM_TEST("st3 {v17.16b, v18.16b, v19.16b}, [x5], x6", -13, -5)
+
+MEM_TEST("st3 {v17.8b, v18.8b, v19.8b}, [x5]", 17, 7)
+MEM_TEST("st3 {v17.8b, v18.8b, v19.8b}, [x5], #24", 9, 9)
+MEM_TEST("st3 {v17.8b, v18.8b, v19.8b}, [x5], x6", -13, -5)
+
+MEM_TEST("ld3 {v17.2d, v18.2d, v19.2d}, [x5]", 17, 7)
+MEM_TEST("ld3 {v17.2d, v18.2d, v19.2d}, [x5], #48", 9, 9)
+MEM_TEST("ld3 {v17.2d, v18.2d, v19.2d}, [x5], x6", -13, -5)
+
+/* no 1d case */
+
+MEM_TEST("ld3 {v17.4s, v18.4s, v19.4s}, [x5]", 17, 7)
+MEM_TEST("ld3 {v17.4s, v18.4s, v19.4s}, [x5], #48", 9, 9)
+MEM_TEST("ld3 {v17.4s, v18.4s, v19.4s}, [x5], x6", -13, -5)
+
+MEM_TEST("ld3 {v17.2s, v18.2s, v19.2s}, [x5]", 17, 7)
+MEM_TEST("ld3 {v17.2s, v18.2s, v19.2s}, [x5], #24", 9, 9)
+MEM_TEST("ld3 {v17.2s, v18.2s, v19.2s}, [x5], x6", -13, -5)
+
+MEM_TEST("ld3 {v17.8h, v18.8h, v19.8h}, [x5]", 17, 7)
+MEM_TEST("ld3 {v17.8h, v18.8h, v19.8h}, [x5], #48", 9, 9)
+MEM_TEST("ld3 {v17.8h, v18.8h, v19.8h}, [x5], x6", -13, -5)
+
+MEM_TEST("ld3 {v17.4h, v18.4h, v19.4h}, [x5]", 17, 7)
+MEM_TEST("ld3 {v17.4h, v18.4h, v19.4h}, [x5], #24", 9, 9)
+MEM_TEST("ld3 {v17.4h, v18.4h, v19.4h}, [x5], x6", -13, -5)
+
+MEM_TEST("ld3 {v17.16b, v18.16b, v19.16b}, [x5]", 17, 7)
+MEM_TEST("ld3 {v17.16b, v18.16b, v19.16b}, [x5], #48", 9, 9)
+MEM_TEST("ld3 {v17.16b, v18.16b, v19.16b}, [x5], x6", -13, -5)
+
+MEM_TEST("ld3 {v17.8b, v18.8b, v19.8b}, [x5]", 17, 7)
+MEM_TEST("ld3 {v17.8b, v18.8b, v19.8b}, [x5], #24", 9, 9)
+MEM_TEST("ld3 {v17.8b, v18.8b, v19.8b}, [x5], x6", -13, -5)
+
+////////////////////////////////////////////////////////////////
+printf("LD4/ST4 (multiple 4-elem structs to/from 4 regs\n");
+
+MEM_TEST("st4 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5]", 17, 7)
+MEM_TEST("st4 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5], #64", 9, 9)
+MEM_TEST("st4 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5], x6", -13, -5)
+
+/* no 1d case */
+
+MEM_TEST("st4 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5]", 17, 7)
+MEM_TEST("st4 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5], #64", 9, 9)
+MEM_TEST("st4 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5], x6", -13, -5)
+
+MEM_TEST("st4 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5]", 17, 7)
+MEM_TEST("st4 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5], #32", 9, 9)
+MEM_TEST("st4 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5], x6", -13, -5)
+
+MEM_TEST("st4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5]", 17, 7)
+MEM_TEST("st4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5], #64", 9, 9)
+MEM_TEST("st4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5], x6", -13, -5)
+
+MEM_TEST("st4 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5]", 17, 7)
+MEM_TEST("st4 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5], #32", 9, 9)
+MEM_TEST("st4 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5], x6", -13, -5)
+
+MEM_TEST("st4 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5]", 17, 7)
+MEM_TEST("st4 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], #64", 9, 9)
+MEM_TEST("st4 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], x6", -13, -5)
+
+MEM_TEST("st4 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5]", 17, 7)
+MEM_TEST("st4 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5], #32", 9, 9)
+MEM_TEST("st4 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5], x6", -13, -5)
+
+MEM_TEST("ld4 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5]", 17, 7)
+MEM_TEST("ld4 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5], #64", 9, 9)
+MEM_TEST("ld4 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5], x6", -13, -5)
+
+/* no 1d case */
+
+MEM_TEST("ld4 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5]", 17, 7)
+MEM_TEST("ld4 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5], #64", 9, 9)
+MEM_TEST("ld4 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5], x6", -13, -5)
+
+MEM_TEST("ld4 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5]", 17, 7)
+MEM_TEST("ld4 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5], #32", 9, 9)
+MEM_TEST("ld4 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5], x6", -13, -5)
+
+MEM_TEST("ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5]", 17, 7)
+MEM_TEST("ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5], #64", 9, 9)
+MEM_TEST("ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5], x6", -13, -5)
+
+MEM_TEST("ld4 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5]", 17, 7)
+MEM_TEST("ld4 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5], #32", 9, 9)
+MEM_TEST("ld4 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5], x6", -13, -5)
+
+MEM_TEST("ld4 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5]", 17, 7)
+MEM_TEST("ld4 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], #64", 9, 9)
+MEM_TEST("ld4 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], x6", -13, -5)
+
+MEM_TEST("ld4 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5]", 17, 7)
+MEM_TEST("ld4 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5], #32", 9, 9)
+MEM_TEST("ld4 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5], x6", -13, -5)
+
+////////////////////////////////////////////////////////////////
+printf("LD1/ST1 (multiple 1-elem structs to/from 2/3/4 regs\n");
+
+//MEM_TEST("st1 {v19.2d, v20.2d}, [x5]", 17, 7)
+//MEM_TEST("st1 {v19.2d, v20.2d}, [x5], #32", 9, 9)
+//MEM_TEST("st1 {v19.2d, v20.2d}, [x5], x6", -13, -5)
+//
+//MEM_TEST("st1 {v17.2d, v18.2d, v19.2d}, [x5]", 17, 7)
+//MEM_TEST("st1 {v17.2d, v18.2d, v19.2d}, [x5], #48", 9, 9)
+//MEM_TEST("st1 {v17.2d, v18.2d, v19.2d}, [x5], x6", -13, -5)
+//
+//MEM_TEST("st1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5]", 17, 7)
+//MEM_TEST("st1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5], #64", 9, 9)
+//MEM_TEST("st1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5], x6", -13, -5)
+//
+//
+//MEM_TEST("st1 {v19.1d, v20.1d}, [x5]", 17, 7)
+//MEM_TEST("st1 {v19.1d, v20.1d}, [x5], #16", 9, 9)
+//MEM_TEST("st1 {v19.1d, v20.1d}, [x5], x6", -13, -5)
+//
+//MEM_TEST("st1 {v17.1d, v18.1d, v19.1d}, [x5]", 17, 7)
+//MEM_TEST("st1 {v17.1d, v18.1d, v19.1d}, [x5], #24", 9, 9)
+//MEM_TEST("st1 {v17.1d, v18.1d, v19.1d}, [x5], x6", -13, -5)
+//
+//MEM_TEST("st1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5]", 17, 7)
+//MEM_TEST("st1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5], #32", 9, 9)
+//MEM_TEST("st1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5], x6", -13, -5)
+//
+//
+//MEM_TEST("st1 {v19.4s, v20.4s}, [x5]", 17, 7)
+//MEM_TEST("st1 {v19.4s, v20.4s}, [x5], #32", 9, 9)
+//MEM_TEST("st1 {v19.4s, v20.4s}, [x5], x6", -13, -5)
+//
+//MEM_TEST("st1 {v17.4s, v18.4s, v19.4s}, [x5]", 17, 7)
+//MEM_TEST("st1 {v17.4s, v18.4s, v19.4s}, [x5], #48", 9, 9)
+//MEM_TEST("st1 {v17.4s, v18.4s, v19.4s}, [x5], x6", -13, -5)
+//
+//MEM_TEST("st1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5]", 17, 7)
+//MEM_TEST("st1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5], #64", 9, 9)
+//MEM_TEST("st1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5], x6", -13, -5)
+//
+//
+//MEM_TEST("st1 {v19.2s, v20.2s}, [x5]", 17, 7)
+//MEM_TEST("st1 {v19.2s, v20.2s}, [x5], #16", 9, 9)
+//MEM_TEST("st1 {v19.2s, v20.2s}, [x5], x6", -13, -5)
+//
+//MEM_TEST("st1 {v17.2s, v18.2s, v19.2s}, [x5]", 17, 7)
+//MEM_TEST("st1 {v17.2s, v18.2s, v19.2s}, [x5], #24", 9, 9)
+//MEM_TEST("st1 {v17.2s, v18.2s, v19.2s}, [x5], x6", -13, -5)
+//
+//MEM_TEST("st1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5]", 17, 7)
+//MEM_TEST("st1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5], #32", 9, 9)
+//MEM_TEST("st1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5], x6", -13, -5)
+//
+//
+//MEM_TEST("st1 {v19.8h, v20.8h}, [x5]", 17, 7)
+//MEM_TEST("st1 {v19.8h, v20.8h}, [x5], #32", 9, 9)
+//MEM_TEST("st1 {v19.8h, v20.8h}, [x5], x6", -13, -5)
+//
+//MEM_TEST("st1 {v17.8h, v18.8h, v19.8h}, [x5]", 17, 7)
+//MEM_TEST("st1 {v17.8h, v18.8h, v19.8h}, [x5], #48", 9, 9)
+//MEM_TEST("st1 {v17.8h, v18.8h, v19.8h}, [x5], x6", -13, -5)
+//
+//MEM_TEST("st1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5]", 17, 7)
+//MEM_TEST("st1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5], #64", 9, 9)
+//MEM_TEST("st1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5], x6", -13, -5)
+//
+//
+//MEM_TEST("st1 {v19.4h, v20.4h}, [x5]", 17, 7)
+//MEM_TEST("st1 {v19.4h, v20.4h}, [x5], #16", 9, 9)
+//MEM_TEST("st1 {v19.4h, v20.4h}, [x5], x6", -13, -5)
+//
+//MEM_TEST("st1 {v17.4h, v18.4h, v19.4h}, [x5]", 17, 7)
+//MEM_TEST("st1 {v17.4h, v18.4h, v19.4h}, [x5], #24", 9, 9)
+//MEM_TEST("st1 {v17.4h, v18.4h, v19.4h}, [x5], x6", -13, -5)
+//
+//MEM_TEST("st1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5]", 17, 7)
+//MEM_TEST("st1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5], #32", 9, 9)
+//MEM_TEST("st1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5], x6", -13, -5)
+
+
+MEM_TEST("st1 {v19.16b, v20.16b}, [x5]", 17, 7)
+MEM_TEST("st1 {v19.16b, v20.16b}, [x5], #32", 9, 9)
+//MEM_TEST("st1 {v19.16b, v20.16b}, [x5], x6", -13, -5)
+
+MEM_TEST("st1 {v17.16b, v18.16b, v19.16b}, [x5]", 17, 7)
+//MEM_TEST("st1 {v17.16b, v18.16b, v19.16b}, [x5], #48", 9, 9)
+//MEM_TEST("st1 {v17.16b, v18.16b, v19.16b}, [x5], x6", -13, -5)
+//
+//MEM_TEST("st1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5]", 17, 7)
+//MEM_TEST("st1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], #64", 9, 9)
+//MEM_TEST("st1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], x6", -13, -5)
+//
+//
+//MEM_TEST("st1 {v19.8b, v20.8b}, [x5]", 17, 7)
+//MEM_TEST("st1 {v19.8b, v20.8b}, [x5], #16", 9, 9)
+//MEM_TEST("st1 {v19.8b, v20.8b}, [x5], x6", -13, -5)
+//
+//MEM_TEST("st1 {v17.8b, v18.8b, v19.8b}, [x5]", 17, 7)
+//MEM_TEST("st1 {v17.8b, v18.8b, v19.8b}, [x5], #24", 9, 9)
+//MEM_TEST("st1 {v17.8b, v18.8b, v19.8b}, [x5], x6", -13, -5)
+//
+//MEM_TEST("st1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5]", 17, 7)
+//MEM_TEST("st1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5], #32", 9, 9)
+//MEM_TEST("st1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5], x6", -13, -5)
+//
+//
+//MEM_TEST("ld1 {v19.2d, v20.2d}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v19.2d, v20.2d}, [x5], #32", 9, 9)
+//MEM_TEST("ld1 {v19.2d, v20.2d}, [x5], x6", -13, -5)
+//
+//MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d}, [x5], #48", 9, 9)
+//MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d}, [x5], x6", -13, -5)
+//
+//MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5], #64", 9, 9)
+//MEM_TEST("ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [x5], x6", -13, -5)
+//
+//
+//MEM_TEST("ld1 {v19.1d, v20.1d}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v19.1d, v20.1d}, [x5], #16", 9, 9)
+//MEM_TEST("ld1 {v19.1d, v20.1d}, [x5], x6", -13, -5)
+//
+//MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d}, [x5], #24", 9, 9)
+//MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d}, [x5], x6", -13, -5)
+//
+//MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5], #32", 9, 9)
+//MEM_TEST("ld1 {v17.1d, v18.1d, v19.1d, v20.1d}, [x5], x6", -13, -5)
+//
+//
+//MEM_TEST("ld1 {v19.4s, v20.4s}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v19.4s, v20.4s}, [x5], #32", 9, 9)
+//MEM_TEST("ld1 {v19.4s, v20.4s}, [x5], x6", -13, -5)
+//
+//MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s}, [x5], #48", 9, 9)
+//MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s}, [x5], x6", -13, -5)
+//
+//MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5], #64", 9, 9)
+//MEM_TEST("ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x5], x6", -13, -5)
+//
+//
+//MEM_TEST("ld1 {v19.2s, v20.2s}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v19.2s, v20.2s}, [x5], #16", 9, 9)
+//MEM_TEST("ld1 {v19.2s, v20.2s}, [x5], x6", -13, -5)
+//
+//MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s}, [x5], #24", 9, 9)
+//MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s}, [x5], x6", -13, -5)
+//
+//MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5], #32", 9, 9)
+//MEM_TEST("ld1 {v17.2s, v18.2s, v19.2s, v20.2s}, [x5], x6", -13, -5)
+//
+//
+//MEM_TEST("ld1 {v19.8h, v20.8h}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v19.8h, v20.8h}, [x5], #32", 9, 9)
+//MEM_TEST("ld1 {v19.8h, v20.8h}, [x5], x6", -13, -5)
+//
+//MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h}, [x5], #48", 9, 9)
+//MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h}, [x5], x6", -13, -5)
+//
+//MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5], #64", 9, 9)
+//MEM_TEST("ld1 {v17.8h, v18.8h, v19.8h, v20.8h}, [x5], x6", -13, -5)
+//
+//
+//MEM_TEST("ld1 {v19.4h, v20.4h}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v19.4h, v20.4h}, [x5], #16", 9, 9)
+//MEM_TEST("ld1 {v19.4h, v20.4h}, [x5], x6", -13, -5)
+//
+//MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h}, [x5], #24", 9, 9)
+//MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h}, [x5], x6", -13, -5)
+//
+//MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5], #32", 9, 9)
+//MEM_TEST("ld1 {v17.4h, v18.4h, v19.4h, v20.4h}, [x5], x6", -13, -5)
+
+
+MEM_TEST("ld1 {v19.16b, v20.16b}, [x5]", 17, 7)
+MEM_TEST("ld1 {v19.16b, v20.16b}, [x5], #32", 9, 9)
+//MEM_TEST("ld1 {v19.16b, v20.16b}, [x5], x6", -13, -5)
+
+MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b}, [x5], #48", 9, 9)
+//MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b}, [x5], x6", -13, -5)
+//
+//MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], #64", 9, 9)
+//MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], x6", -13, -5)
+//
+//
+//MEM_TEST("ld1 {v19.8b, v20.8b}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v19.8b, v20.8b}, [x5], #16", 9, 9)
+//MEM_TEST("ld1 {v19.8b, v20.8b}, [x5], x6", -13, -5)
+//
+//MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b}, [x5], #24", 9, 9)
+//MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b}, [x5], x6", -13, -5)
+//
+//MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5]", 17, 7)
+//MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5], #32", 9, 9)
+//MEM_TEST("ld1 {v17.8b, v18.8b, v19.8b, v20.8b}, [x5], x6", -13, -5)
+
////////////////////////////////////////////////////////////////
printf("LD1R (single structure, replicate)\n");
+
MEM_TEST("ld1r {v17.2d}, [x5]", 3, -5)
MEM_TEST("ld1r {v17.1d}, [x5]", 3, -4)
MEM_TEST("ld1r {v17.4s}, [x5]", 3, -3)
@@ -777,49 +1225,374 @@
MEM_TEST("ld1r {v17.16b}, [x5], x6", 3, 2)
MEM_TEST("ld1r {v17.8b}, [x5], x6", 3, 3)
-////////////////////////////////////////////////////////////////
-printf("LD2/ST2 (multiple 2-elem structs to/from 2/regs, post index)"
- " (VERY INCOMPLETE)\n");
-MEM_TEST("ld2 {v17.2d, v18.2d}, [x5], #32", 3, 0)
-MEM_TEST("st2 {v17.2d, v18.2d}, [x5], #32", 7, 0)
+////////////////////////////////////////////////////////////////
+printf("LD2R (single structure, replicate)\n");
-MEM_TEST("ld2 {v17.4s, v18.4s}, [x5], #32", 13, 0)
-MEM_TEST("st2 {v17.4s, v18.4s}, [x5], #32", 17, 0)
+//MEM_TEST("ld2r {v17.2d , v18.2d }, [x5]", 3, -5)
+//MEM_TEST("ld2r {v18.1d , v19.1d }, [x5]", 3, -4)
+//MEM_TEST("ld2r {v19.4s , v20.4s }, [x5]", 3, -3)
+//MEM_TEST("ld2r {v17.2s , v18.2s }, [x5]", 3, -2)
+//MEM_TEST("ld2r {v18.8h , v19.8h }, [x5]", 3, -1)
+//MEM_TEST("ld2r {v19.4h , v20.4h }, [x5]", 3, 1)
+//MEM_TEST("ld2r {v17.16b, v18.16b}, [x5]", 3, 2)
+//MEM_TEST("ld2r {v18.8b , v19.8b }, [x5]", 3, 3)
+//
+//MEM_TEST("ld2r {v19.2d , v20.2d }, [x5], #16", 3, -5)
+//MEM_TEST("ld2r {v17.1d , v18.1d }, [x5], #16", 3, -4)
+//MEM_TEST("ld2r {v18.4s , v19.4s }, [x5], #8", 3, -3)
+//MEM_TEST("ld2r {v19.2s , v20.2s }, [x5], #8", 3, -2)
+//MEM_TEST("ld2r {v17.8h , v18.8h }, [x5], #4", 3, -1)
+//MEM_TEST("ld2r {v18.4h , v19.4h }, [x5], #4", 3, 1)
+//MEM_TEST("ld2r {v19.16b, v20.16b}, [x5], #2", 3, 2)
+//MEM_TEST("ld2r {v17.8b , v18.8b }, [x5], #2", 3, 3)
+//
+//MEM_TEST("ld2r {v18.2d , v19.2d }, [x5], x6", 3, -5)
+//MEM_TEST("ld2r {v19.1d , v20.1d }, [x5], x6", 3, -4)
+//MEM_TEST("ld2r {v17.4s , v18.4s }, [x5], x6", 3, -3)
+//MEM_TEST("ld2r {v18.2s , v19.2s }, [x5], x6", 3, -2)
+//MEM_TEST("ld2r {v19.8h , v20.8h }, [x5], x6", 3, -1)
+//MEM_TEST("ld2r {v17.4h , v18.4h }, [x5], x6", 3, 1)
+//MEM_TEST("ld2r {v18.16b, v19.16b}, [x5], x6", 3, 2)
+//MEM_TEST("ld2r {v19.8b , v20.8b }, [x5], x6", 3, 3)
+
+
+//////////////////////////////////////////////////////////////////
+//printf("LD3R (single structure, replicate)\n");
+
+//MEM_TEST("ld3r {v17.2d , v18.2d , v19.2d }, [x5]", 3, -5)
+//MEM_TEST("ld3r {v18.1d , v19.1d , v20.1d }, [x5]", 3, -4)
+//MEM_TEST("ld3r {v17.4s , v18.4s , v19.4s }, [x5]", 3, -3)
+//MEM_TEST("ld3r {v18.2s , v19.2s , v20.2s }, [x5]", 3, -2)
+//MEM_TEST("ld3r {v17.8h , v18.8h , v19.8h }, [x5]", 3, -5)
+//MEM_TEST("ld3r {v18.4h , v19.4h , v20.4h }, [x5]", 3, -4)
+//MEM_TEST("ld3r {v17.16b, v18.16b, v19.16b}, [x5]", 3, -3)
+//MEM_TEST("ld3r {v18.8b , v19.8b , v20.8b }, [x5]", 3, -2)
+//
+//MEM_TEST("ld3r {v17.2d , v18.2d , v19.2d }, [x5], #24", 3, -5)
+//MEM_TEST("ld3r {v18.1d , v19.1d , v20.1d }, [x5], #24", 3, -4)
+//MEM_TEST("ld3r {v17.4s , v18.4s , v19.4s }, [x5], #12", 3, -3)
+//MEM_TEST("ld3r {v18.2s , v19.2s , v20.2s }, [x5], #12", 3, -2)
+//MEM_TEST("ld3r {v17.8h , v18.8h , v19.8h }, [x5], #6", 3, -5)
+//MEM_TEST("ld3r {v18.4h , v19.4h , v20.4h }, [x5], #6", 3, -4)
+//MEM_TEST("ld3r {v17.16b, v18.16b, v19.16b}, [x5], #3", 3, -3)
+//MEM_TEST("ld3r {v18.8b , v19.8b , v20.8b }, [x5], #3", 3, -2)
+//
+//MEM_TEST("ld3r {v17.2d , v18.2d , v19.2d }, [x5], x6", 3, -5)
+//MEM_TEST("ld3r {v18.1d , v19.1d , v20.1d }, [x5], x6", 3, -4)
+//MEM_TEST("ld3r {v17.4s , v18.4s , v19.4s }, [x5], x6", 3, -3)
+//MEM_TEST("ld3r {v18.2s , v19.2s , v20.2s }, [x5], x6", 3, -2)
+//MEM_TEST("ld3r {v17.8h , v18.8h , v19.8h }, [x5], x6", 3, -5)
+//MEM_TEST("ld3r {v18.4h , v19.4h , v20.4h }, [x5], x6", 3, -4)
+//MEM_TEST("ld3r {v17.16b, v18.16b, v19.16b}, [x5], x6", 3, -3)
+//MEM_TEST("ld3r {v18.8b , v19.8b , v20.8b }, [x5], x6", 3, -2)
////////////////////////////////////////////////////////////////
-printf("LD1/ST1 (multiple 1-elem structs to/from 2 regs, no offset)"
- " (VERY INCOMPLETE)\n");
-
-MEM_TEST("ld1 {v17.16b, v18.16b}, [x5]", 3, 0)
-MEM_TEST("st1 {v17.16b, v18.16b}, [x5]", 7, 0)
+printf("LD4R (single structure, replicate)\n");
+
+//MEM_TEST("ld4r {v17.2d , v18.2d , v19.2d , v20.2d }, [x5]", 3, -5)
+//MEM_TEST("ld4r {v17.1d , v18.1d , v19.1d , v20.1d }, [x5]", 3, -4)
+//MEM_TEST("ld4r {v17.4s , v18.4s , v19.4s , v20.4s }, [x5]", 3, -3)
+//MEM_TEST("ld4r {v17.2s , v18.2s , v19.2s , v20.2s }, [x5]", 3, -2)
+//MEM_TEST("ld4r {v17.8h , v18.8h , v19.8h , v20.8h }, [x5]", 3, -5)
+//MEM_TEST("ld4r {v17.4h , v18.4h , v19.4h , v20.4h }, [x5]", 3, -4)
+//MEM_TEST("ld4r {v17.16b, v18.16b, v19.16b, v20.16b}, [x5]", 3, -3)
+//MEM_TEST("ld4r {v17.8b , v18.8b , v19.8b , v20.8b }, [x5]", 3, -2)
+//
+//MEM_TEST("ld4r {v17.2d , v18.2d , v19.2d , v20.2d }, [x5], #32", 3, -5)
+//MEM_TEST("ld4r {v17.1d , v18.1d , v19.1d , v20.1d }, [x5], #32", 3, -4)
+//MEM_TEST("ld4r {v17.4s , v18.4s , v19.4s , v20.4s }, [x5], #16", 3, -3)
+//MEM_TEST("ld4r {v17.2s , v18.2s , v19.2s , v20.2s }, [x5], #16", 3, -2)
+//MEM_TEST("ld4r {v17.8h , v18.8h , v19.8h , v20.8h }, [x5], #8", 3, -5)
+//MEM_TEST("ld4r {v17.4h , v18.4h , v19.4h , v20.4h }, [x5], #8", 3, -4)
+//MEM_TEST("ld4r {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], #4", 3, -3)
+//MEM_TEST("ld4r {v17.8b , v18.8b , v19.8b , v20.8b }, [x5], #4", 3, -2)
+//
+//MEM_TEST("ld4r {v17.2d , v18.2d , v19.2d , v20.2d }, [x5], x6", 3, -5)
+//MEM_TEST("ld4r {v17.1d , v18.1d , v19.1d , v20.1d }, [x5], x6", 3, -4)
+//MEM_TEST("ld4r {v17.4s , v18.4s , v19.4s , v20.4s }, [x5], x6", 3, -3)
+//MEM_TEST("ld4r {v17.2s , v18.2s , v19.2s , v20.2s }, [x5], x6", 3, -2)
+//MEM_TEST("ld4r {v17.8h , v18.8h , v19.8h , v20.8h }, [x5], x6", 3, -5)
+//MEM_TEST("ld4r {v17.4h , v18.4h , v19.4h , v20.4h }, [x5], x6", 3, -4)
+//MEM_TEST("ld4r {v17.16b, v18.16b, v19.16b, v20.16b}, [x5], x6", 3, -3)
+//MEM_TEST("ld4r {v17.8b , v18.8b , v19.8b , v20.8b }, [x5], x6", 3, -2)
////////////////////////////////////////////////////////////////
-printf("LD1/ST1 (multiple 1-elem structs to/from 2 regs, post index)"
- " (VERY INCOMPLETE)\n");
-
-MEM_TEST("ld1 {v17.16b, v18.16b}, [x5], #32", 3, 0)
-MEM_TEST("st1 {v17.16b, v18.16b}, [x5], #32", 7, 0)
+printf("LD1/ST1 (single 1-elem struct to/from one lane of 1 reg\n");
+
+//MEM_TEST("st1 {v19.d}[0], [x5]", 17, 7)
+//MEM_TEST("st1 {v19.d}[0], [x5], #8", -9, 12)
+//MEM_TEST("st1 {v19.d}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("st1 {v19.d}[1], [x5]", 17, 7)
+//MEM_TEST("st1 {v19.d}[1], [x5], #8", -9, 12)
+//MEM_TEST("st1 {v19.d}[1], [x5], x6", 9, 13)
+//
+//MEM_TEST("st1 {v19.s}[0], [x5]", 17, 7)
+//MEM_TEST("st1 {v19.s}[0], [x5], #4", -9, 12)
+//MEM_TEST("st1 {v19.s}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("st1 {v19.s}[3], [x5]", 17, 7)
+//MEM_TEST("st1 {v19.s}[3], [x5], #4", -9, 12)
+//MEM_TEST("st1 {v19.s}[3], [x5], x6", 9, 13)
+//
+//MEM_TEST("st1 {v19.h}[0], [x5]", 17, 7)
+//MEM_TEST("st1 {v19.h}[0], [x5], #2", -9, 12)
+//MEM_TEST("st1 {v19.h}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("st1 {v19.h}[6], [x5]", 17, 7)
+//MEM_TEST("st1 {v19.h}[6], [x5], #2", -9, 12)
+//MEM_TEST("st1 {v19.h}[6], [x5], x6", 9, 13)
+//
+//MEM_TEST("st1 {v19.b}[0], [x5]", 17, 7)
+//MEM_TEST("st1 {v19.b}[0], [x5], #1", -9, 12)
+//MEM_TEST("st1 {v19.b}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("st1 {v19.b}[13], [x5]", 17, 7)
+//MEM_TEST("st1 {v19.b}[13], [x5], #1", -9, 12)
+//MEM_TEST("st1 {v19.b}[13], [x5], x6", 9, 13)
+//
+//
+//MEM_TEST("ld1 {v19.d}[0], [x5]", 17, 7)
+//MEM_TEST("ld1 {v19.d}[0], [x5], #8", -9, 12)
+//MEM_TEST("ld1 {v19.d}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld1 {v19.d}[1], [x5]", 17, 7)
+//MEM_TEST("ld1 {v19.d}[1], [x5], #8", -9, 12)
+//MEM_TEST("ld1 {v19.d}[1], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld1 {v19.s}[0], [x5]", 17, 7)
+//MEM_TEST("ld1 {v19.s}[0], [x5], #4", -9, 12)
+//MEM_TEST("ld1 {v19.s}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld1 {v19.s}[3], [x5]", 17, 7)
+//MEM_TEST("ld1 {v19.s}[3], [x5], #4", -9, 12)
+//MEM_TEST("ld1 {v19.s}[3], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld1 {v19.h}[0], [x5]", 17, 7)
+//MEM_TEST("ld1 {v19.h}[0], [x5], #2", -9, 12)
+//MEM_TEST("ld1 {v19.h}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld1 {v19.h}[6], [x5]", 17, 7)
+//MEM_TEST("ld1 {v19.h}[6], [x5], #2", -9, 12)
+//MEM_TEST("ld1 {v19.h}[6], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld1 {v19.b}[0], [x5]", 17, 7)
+//MEM_TEST("ld1 {v19.b}[0], [x5], #1", -9, 12)
+//MEM_TEST("ld1 {v19.b}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld1 {v19.b}[13], [x5]", 17, 7)
+//MEM_TEST("ld1 {v19.b}[13], [x5], #1", -9, 12)
+//MEM_TEST("ld1 {v19.b}[13], [x5], x6", 9, 13)
////////////////////////////////////////////////////////////////
-printf("LD1/ST1 (multiple 1-elem structs to/from 3 regs, no offset)"
- " (VERY INCOMPLETE)\n");
+printf("LD2/ST2 (single 2-elem struct to/from one lane of 2 regs\n");
-MEM_TEST("ld1 {v17.16b, v18.16b, v19.16b}, [x5]", 3, 0)
-MEM_TEST("st1 {v17.16b, v18.16b, v19.16b}, [x5]", 7, 0)
+//MEM_TEST("st2 {v18.d, v19.d}[0], [x5]", 17, 7)
+//MEM_TEST("st2 {v18.d, v19.d}[0], [x5], #16", -9, 12)
+//MEM_TEST("st2 {v18.d, v19.d}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("st2 {v18.d, v19.d}[1], [x5]", 17, 7)
+//MEM_TEST("st2 {v18.d, v19.d}[1], [x5], #16", -9, 12)
+//MEM_TEST("st2 {v18.d, v19.d}[1], [x5], x6", 9, 13)
+//
+//MEM_TEST("st2 {v18.s, v19.s}[0], [x5]", 17, 7)
+//MEM_TEST("st2 {v18.s, v19.s}[0], [x5], #8", -9, 12)
+//MEM_TEST("st2 {v18.s, v19.s}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("st2 {v18.s, v19.s}[3], [x5]", 17, 7)
+//MEM_TEST("st2 {v18.s, v19.s}[3], [x5], #8", -9, 12)
+//MEM_TEST("st2 {v18.s, v19.s}[3], [x5], x6", 9, 13)
+//
+//MEM_TEST("st2 {v18.h, v19.h}[0], [x5]", 17, 7)
+//MEM_TEST("st2 {v18.h, v19.h}[0], [x5], #4", -9, 12)
+//MEM_TEST("st2 {v18.h, v19.h}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("st2 {v18.h, v19.h}[6], [x5]", 17, 7)
+//MEM_TEST("st2 {v18.h, v19.h}[6], [x5], #4", -9, 12)
+//MEM_TEST("st2 {v18.h, v19.h}[6], [x5], x6", 9, 13)
+//
+//MEM_TEST("st2 {v18.b, v19.b}[0], [x5]", 17, 7)
+//MEM_TEST("st2 {v18.b, v19.b}[0], [x5], #2", -9, 12)
+//MEM_TEST("st2 {v18.b, v19.b}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("st2 {v18.b, v19.b}[13], [x5]", 17, 7)
+//MEM_TEST("st2 {v18.b, v19.b}[13], [x5], #2", -9, 12)
+//MEM_TEST("st2 {v18.b, v19.b}[13], [x5], x6", 9, 13)
+//
+//
+//MEM_TEST("ld2 {v18.d, v19.d}[0], [x5]", 17, 7)
+//MEM_TEST("ld2 {v18.d, v19.d}[0], [x5], #16", -9, 12)
+//MEM_TEST("ld2 {v18.d, v19.d}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld2 {v18.d, v19.d}[1], [x5]", 17, 7)
+//MEM_TEST("ld2 {v18.d, v19.d}[1], [x5], #16", -9, 12)
+//MEM_TEST("ld2 {v18.d, v19.d}[1], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld2 {v18.s, v19.s}[0], [x5]", 17, 7)
+//MEM_TEST("ld2 {v18.s, v19.s}[0], [x5], #8", -9, 12)
+//MEM_TEST("ld2 {v18.s, v19.s}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld2 {v18.s, v19.s}[3], [x5]", 17, 7)
+//MEM_TEST("ld2 {v18.s, v19.s}[3], [x5], #8", -9, 12)
+//MEM_TEST("ld2 {v18.s, v19.s}[3], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld2 {v18.h, v19.h}[0], [x5]", 17, 7)
+//MEM_TEST("ld2 {v18.h, v19.h}[0], [x5], #4", -9, 12)
+//MEM_TEST("ld2 {v18.h, v19.h}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld2 {v18.h, v19.h}[6], [x5]", 17, 7)
+//MEM_TEST("ld2 {v18.h, v19.h}[6], [x5], #4", -9, 12)
+//MEM_TEST("ld2 {v18.h, v19.h}[6], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld2 {v18.b, v19.b}[0], [x5]", 17, 7)
+//MEM_TEST("ld2 {v18.b, v19.b}[0], [x5], #2", -9, 12)
+//MEM_TEST("ld2 {v18.b, v19.b}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld2 {v18.b, v19.b}[13], [x5]", 17, 7)
+//MEM_TEST("ld2 {v18.b, v19.b}[13], [x5], #2", -9, 12)
+//MEM_TEST("ld2 {v18.b, v19.b}[13], [x5], x6", 9, 13)
////////////////////////////////////////////////////////////////
-printf("LD3/ST3 (multiple 3-elem structs to/from 3/regs, post index)"
- " (VERY INCOMPLETE)\n");
+printf("LD3/ST3 (single 3-elem struct to/from one lane of 3 regs\n");
-MEM_TEST("ld3 {v17.2d, v18.2d, v19.2d}, [x5], #48", 13, 0)
-MEM_TEST("st3 {v17.2d, v18.2d, v19.2d}, [x5], #48", 17, 0)
+//MEM_TEST("st3 {v17.d, v18.d, v19.d}[0], [x5]", 17, 7)
+//MEM_TEST("st3 {v17.d, v18.d, v19.d}[0], [x5], #24", -9, 12)
+//MEM_TEST("st3 {v17.d, v18.d, v19.d}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("st3 {v17.d, v18.d, v19.d}[1], [x5]", 17, 7)
+//MEM_TEST("st3 {v17.d, v18.d, v19.d}[1], [x5], #24", -9, 12)
+//MEM_TEST("st3 {v17.d, v18.d, v19.d}[1], [x5], x6", 9, 13)
+//
+//MEM_TEST("st3 {v17.s, v18.s, v19.s}[0], [x5]", 17, 7)
+//MEM_TEST("st3 {v17.s, v18.s, v19.s}[0], [x5], #12", -9, 12)
+//MEM_TEST("st3 {v17.s, v18.s, v19.s}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("st3 {v17.s, v18.s, v19.s}[3], [x5]", 17, 7)
+//MEM_TEST("st3 {v17.s, v18.s, v19.s}[3], [x5], #12", -9, 12)
+//MEM_TEST("st3 {v17.s, v18.s, v19.s}[3], [x5], x6", 9, 13)
+//
+//MEM_TEST("st3 {v17.h, v18.h, v19.h}[0], [x5]", 17, 7)
+//MEM_TEST("st3 {v17.h, v18.h, v19.h}[0], [x5], #6", -9, 12)
+//MEM_TEST("st3 {v17.h, v18.h, v19.h}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("st3 {v17.h, v18.h, v19.h}[6], [x5]", 17, 7)
+//MEM_TEST("st3 {v17.h, v18.h, v19.h}[6], [x5], #6", -9, 12)
+//MEM_TEST("st3 {v17.h, v18.h, v19.h}[6], [x5], x6", 9, 13)
+//
+//MEM_TEST("st3 {v17.b, v18.b, v19.b}[0], [x5]", 17, 7)
+//MEM_TEST("st3 {v17.b, v18.b, v19.b}[0], [x5], #3", -9, 12)
+//MEM_TEST("st3 {v17.b, v18.b, v19.b}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("st3 {v17.b, v18.b, v19.b}[13], [x5]", 17, 7)
+//MEM_TEST("st3 {v17.b, v18.b, v19.b}[13], [x5], #3", -9, 12)
+//MEM_TEST("st3 {v17.b, v18.b, v19.b}[13], [x5], x6", 9, 13)
+//
+//
+//MEM_TEST("ld3 {v17.d, v18.d, v19.d}[0], [x5]", 17, 7)
+//MEM_TEST("ld3 {v17.d, v18.d, v19.d}[0], [x5], #24", -9, 12)
+//MEM_TEST("ld3 {v17.d, v18.d, v19.d}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld3 {v17.d, v18.d, v19.d}[1], [x5]", 17, 7)
+//MEM_TEST("ld3 {v17.d, v18.d, v19.d}[1], [x5], #24", -9, 12)
+//MEM_TEST("ld3 {v17.d, v18.d, v19.d}[1], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld3 {v17.s, v18.s, v19.s}[0], [x5]", 17, 7)
+//MEM_TEST("ld3 {v17.s, v18.s, v19.s}[0], [x5], #12", -9, 12)
+//MEM_TEST("ld3 {v17.s, v18.s, v19.s}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld3 {v17.s, v18.s, v19.s}[3], [x5]", 17, 7)
+//MEM_TEST("ld3 {v17.s, v18.s, v19.s}[3], [x5], #12", -9, 12)
+//MEM_TEST("ld3 {v17.s, v18.s, v19.s}[3], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld3 {v17.h, v18.h, v19.h}[0], [x5]", 17, 7)
+//MEM_TEST("ld3 {v17.h, v18.h, v19.h}[0], [x5], #6", -9, 12)
+//MEM_TEST("ld3 {v17.h, v18.h, v19.h}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld3 {v17.h, v18.h, v19.h}[6], [x5]", 17, 7)
+//MEM_TEST("ld3 {v17.h, v18.h, v19.h}[6], [x5], #6", -9, 12)
+//MEM_TEST("ld3 {v17.h, v18.h, v19.h}[6], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld3 {v17.b, v18.b, v19.b}[0], [x5]", 17, 7)
+//MEM_TEST("ld3 {v17.b, v18.b, v19.b}[0], [x5], #3", -9, 12)
+//MEM_TEST("ld3 {v17.b, v18.b, v19.b}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld3 {v17.b, v18.b, v19.b}[13], [x5]", 17, 7)
+//MEM_TEST("ld3 {v17.b, v18.b, v19.b}[13], [x5], #3", -9, 12)
+//MEM_TEST("ld3 {v17.b, v18.b, v19.b}[13], [x5], x6", 9, 13)
+////////////////////////////////////////////////////////////////
+printf("LD4/ST4 (single 4-elem struct to/from one lane of 4 regs\n");
+
+//MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[0], [x5]", 17, 7)
+//MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[0], [x5], #32", -9, 12)
+//MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[1], [x5]", 17, 7)
+//MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[1], [x5], #32", -9, 12)
+//MEM_TEST("st4 {v17.d, v18.d, v19.d, v20.d}[1], [x5], x6", 9, 13)
+//
+//MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[0], [x5]", 17, 7)
+//MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[0], [x5], #16", -9, 12)
+//MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[3], [x5]", 17, 7)
+//MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[3], [x5], #16", -9, 12)
+//MEM_TEST("st4 {v17.s, v18.s, v19.s, v20.s}[3], [x5], x6", 9, 13)
+//
+//MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[0], [x5]", 17, 7)
+//MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[0], [x5], #8", -9, 12)
+//MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[6], [x5]", 17, 7)
+//MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[6], [x5], #8", -9, 12)
+//MEM_TEST("st4 {v17.h, v18.h, v19.h, v20.h}[6], [x5], x6", 9, 13)
+//
+//MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[0], [x5]", 17, 7)
+//MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[0], [x5], #4", -9, 12)
+//MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[13], [x5]", 17, 7)
+//MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[13], [x5], #4", -9, 12)
+//MEM_TEST("st4 {v17.b, v18.b, v19.b, v20.b}[13], [x5], x6", 9, 13)
+//
+//
+//MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[0], [x5]", 17, 7)
+//MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[0], [x5], #32", -9, 12)
+//MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[1], [x5]", 17, 7)
+//MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[1], [x5], #32", -9, 12)
+//MEM_TEST("ld4 {v17.d, v18.d, v19.d, v20.d}[1], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[0], [x5]", 17, 7)
+//MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[0], [x5], #16", -9, 12)
+//MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[3], [x5]", 17, 7)
+//MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[3], [x5], #16", -9, 12)
+//MEM_TEST("ld4 {v17.s, v18.s, v19.s, v20.s}[3], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[0], [x5]", 17, 7)
+//MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[0], [x5], #8", -9, 12)
+//MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[6], [x5]", 17, 7)
+//MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[6], [x5], #8", -9, 12)
+//MEM_TEST("ld4 {v17.h, v18.h, v19.h, v20.h}[6], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[0], [x5]", 17, 7)
+//MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[0], [x5], #4", -9, 12)
+//MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[0], [x5], x6", 9, 13)
+//
+//MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[13], [x5]", 17, 7)
+//MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[13], [x5], #4", -9, 12)
+//MEM_TEST("ld4 {v17.b, v18.b, v19.b, v20.b}[13], [x5], x6", 9, 13)
} /* end of test_memory2() */
@@ -832,7 +1605,7 @@
int main ( void )
{
- if (1) test_memory();
- if (1) test_memory2();
+ if (1) test_memory_old();
+ if (1) test_memory_new();
return 0;
}
Modified: branches/VALGRIND_3_10_BRANCH/none/tests/arm64/memory.stdout.exp
==============================================================================
--- branches/VALGRIND_3_10_BRANCH/none/tests/arm64/memory.stdout.exp (original)
+++ branches/VALGRIND_3_10_BRANCH/none/tests/arm64/memory.stdout.exp Sun Nov 23 12:58:22 2014
@@ -880,8 +880,8 @@
0 x5 (sub, base reg)
0 x6 (sub, index reg)
-LDR (literal, int reg) (DONE ABOVE)
-{LD,ST}R (integer register) (entirely MISSING)
+LDR (literal, int reg) (done above by test_memory_old)
+{LD,ST}R (integer register)
str x13, [x5, x6] with x5 = middle_of_block+12, x6=-4
[ 0] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 16] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
@@ -2473,7 +2473,7 @@
0 x5 (sub, base reg)
0 x6 (sub, index reg)
-LDRS{B,H,W} (simm9, upd) (upd check is MISSING)
+LDRS{B,H,W} (simm9, upd)
ldrsw x13, [x5, #-24]! with x5 = middle_of_block+-16, x6=4
[ 0] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 16] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
@@ -2926,7 +2926,7 @@
0 x6 (sub, index reg)
LDP,STP (immediate, simm7) (FP&VEC)
-stp q17, q18, [x5, 32] with x5 = middle_of_block+-16, x6=4
+stp q17, q18, [x5, 16] with x5 = middle_of_block+-15, x6=4
[ 0] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 16] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 32] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
@@ -2935,9 +2935,9 @@
[ 80] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 96] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[112] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
- [128] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
- [144] 55 18 f1 5c aa 84 c0 38 cd 7e 31 c8 92 f4 b0 e7
- [160] 0e 6c 4b d1 1e 2a 76 4c e2 a7 c8 5a 26 59 0e 5b
+ [128] .. bd 4c a2 27 58 b6 cf 33 b0 ec 02 4e cc f7 5d
+ [144] 81 6f 2c 5d 12 32 3e 5e d7 fe 1c a2 88 01 9f 33
+ [160] 10 .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[176] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[192] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[208] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
@@ -2956,7 +2956,7 @@
0 x5 (sub, base reg)
0 x6 (sub, index reg)
-stp q17, q18, [x5, 32]! with x5 = middle_of_block+-16, x6=4
+stp q19, q18, [x5, 32]! with x5 = middle_of_block+-11, x6=4
[ 0] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 16] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 32] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
@@ -2966,9 +2966,9 @@
[ 96] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[112] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[128] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
- [144] 02 3e c1 07 ca e4 d0 ed 19 98 1e 29 25 e0 75 25
- [160] e1 0f a7 69 a1 4c 5b 2c 01 08 48 ca f8 ff dc 16
- [176] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [144] .. .. .. .. .. f6 6f 1e 81 d6 09 02 1b d1 46 55
+ [160] 8c 95 04 fe d9 a0 72 a8 70 85 36 45 34 12 90 c2
+ [176] 38 61 c9 6d 5a .. .. .. .. .. .. .. .. .. .. ..
[192] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[208] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[224] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
@@ -2986,7 +2986,7 @@
32 x5 (sub, base reg)
0 x6 (sub, index reg)
-stp q17, q18, [x5], 32 with x5 = middle_of_block+-16, x6=4
+stp q20, q17, [x5], -48 with x5 = middle_of_block+-7, x6=4
[ 0] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 16] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 32] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
@@ -2994,9 +2994,9 @@
[ 64] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 80] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 96] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
- [112] 67 98 a3 78 5f 8e f9 57 5e 90 fc 32 c8 db d6 2c
- [128] 20 68 2a 31 1b f7 e9 b2 9f 6a 21 20 db 21 17 27
- [144] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [112] .. .. .. .. .. .. .. .. .. a2 41 aa 2b 45 8f 49
+ [128] 40 cb 2f 6e 6f ad 6d dc bf 7b fc 5a 14 1b 6f d2
+ [144] e9 bf cc d2 e1 68 5b 88 c9 .. .. .. .. .. .. ..
[160] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[176] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[192] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
@@ -3013,10 +3013,10 @@
0000000000000000 v19.d[1] (xor, xfer vecreg #3)
0000000000000000 v20.d[0] (xor, xfer vecreg #3)
0000000000000000 v20.d[1] (xor, xfer vecreg #3)
- 32 x5 (sub, base reg)
+ -48 x5 (sub, base reg)
0 x6 (sub, index reg)
-stp d17, d18, [x5, 32] with x5 = middle_of_block+-16, x6=4
+stp d18, d17, [x5, 16] with x5 = middle_of_block+-15, x6=4
[ 0] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 16] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 32] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
@@ -3025,8 +3025,8 @@
[ 80] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 96] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[112] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
- [128] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
- [144] a0 6c d2 7f 89 d1 b1 b6 c5 5d 74 11 63 9d cb b9
+ [128] .. 16 31 7c 68 e5 76 f9 30 7b 9d e8 08 b7 66 71
+ [144] 7e .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[160] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[176] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[192] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
@@ -3046,7 +3046,7 @@
0 x5 (sub, base reg)
0 x6 (sub, index reg)
-stp d17, d18, [x5, 32]! with x5 = middle_of_block+-16, x6=4
+stp d17, d19, [x5, 32]! with x5 = middle_of_block+-11, x6=4
[ 0] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 16] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 32] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
@@ -3056,8 +3056,8 @@
[ 96] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[112] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[128] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
- [144] 6f 14 75 6c 06 fe e1 ea 40 30 6e 55 7c 36 4d c4
- [160] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [144] .. .. .. .. .. c3 f0 f2 4b b4 31 3e 1a 81 56 08
+ [160] d0 18 fd a8 ee .. .. .. .. .. .. .. .. .. .. ..
[176] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[192] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[208] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
@@ -3076,7 +3076,7 @@
32 x5 (sub, base reg)
0 x6 (sub, index reg)
-stp d17, d18, [x5], 32 with x5 = middle_of_block+-16, x6=4
+stp d20, d18, [x5], -48 with x5 = middle_of_block+-7, x6=4
[ 0] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 16] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 32] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
@@ -3084,8 +3084,8 @@
[ 64] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 80] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 96] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
- [112] c2 ae 80 3d 80 4f 9f 9e 93 76 25 55 85 51 97 1a
- [128] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [112] .. .. .. .. .. .. .. .. .. 0a 9c c9 d7 ad 2c 37
+ [128] 86 b9 a7 1f 66 8d 68 37 e2 .. .. .. .. .. .. ..
[144] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[160] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[176] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
@@ -3103,10 +3103,10 @@
0000000000000000 v19.d[1] (xor, xfer vecreg #3)
0000000000000000 v20.d[0] (xor, xfer vecreg #3)
0000000000000000 v20.d[1] (xor, xfer vecreg #3)
- 32 x5 (sub, base reg)
+ -48 x5 (sub, base reg)
0 x6 (sub, index reg)
-ldp q17, q18, [x5, 32] with x5 = middle_of_block+-16, x6=4
+stp s17, s18, [x5, 16] with x5 = middle_of_block+-15, x6=4
[ 0] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 16] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 32] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
@@ -3115,7 +3115,7 @@
[ 80] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[ 96] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
[112] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
- [128] .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. ..
+ [128] .. 62 fd 3c be d8 63 15...
[truncated message content] |
|
From: <sv...@va...> - 2014-11-23 12:52:13
|
Author: sewardj
Date: Sun Nov 23 12:52:05 2014
New Revision: 3006
Log:
Merge, from trunk, r2979
* add a missing extra m-reg check for some LD/ST vector cases
* implement
LD1/ST1 (multiple 1-elem structs to/from 2 regs)
LD1/ST1 (multiple 1-elem structs to/from 3 regs)
LD1/ST1 (multiple 1-elem structs to/from 4 regs)
LD1R (single structure, replicate)
LD2R (single structure, replicate)
LD3R (single structure, replicate)
LD4R (single structure, replicate)
LD1/ST1 (single structure, to/from one lane)
LD2/ST2 (single structure, to/from one lane)
LD3/ST3 (single structure, to/from one lane)
LD4/ST4 (single structure, to/from one lane)
I believe this completes the implementation of load and store
instructions for AArch64 ARMv8.
Modified:
branches/VEX_3_10_BRANCH/ (props changed)
branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
Modified: branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c (original)
+++ branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c Sun Nov 23 12:52:05 2014
@@ -5606,17 +5606,17 @@
/* ------ LD4/ST4 (multiple 4-elem structs to/from 4 regs ------ */
/* 31 29 26 22 21 20 15 11 9 4
- 0q 001 1000 L 0 00000 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP]
- 0q 001 1001 L 0 m 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP], step
+ 0q 001 1000 L 0 00000 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP]
+ 0q 001 1001 L 0 m 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP], step
- 0q 001 1000 L 0 00000 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP]
- 0q 001 1001 L 0 m 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP], step
+ 0q 001 1000 L 0 00000 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP]
+ 0q 001 1001 L 0 m 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP], step
- 0q 001 1000 L 0 00000 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP]
- 0q 001 1001 L 0 m 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP], step
+ 0q 001 1000 L 0 00000 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP]
+ 0q 001 1001 L 0 m 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP], step
- 0q 001 1000 L 0 00000 0111 sz n t xx1 {Vt.T}, [Xn|SP]
- 0q 001 1001 L 0 m 0111 sz n t xx1 {Vt.T}, [Xn|SP], step
+ 0q 001 1000 L 0 00000 0111 sz n t xx1 {Vt.T}, [Xn|SP]
+ 0q 001 1001 L 0 m 0111 sz n t xx1 {Vt.T}, [Xn|SP], step
T = defined by Q and sz in the normal way
step = if m == 11111 then transfer-size else Xm
@@ -5642,6 +5642,12 @@
case BITS4(0,1,1,1): nRegs = 1; break;
default: break;
}
+
+ /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
+ If we see it, set nRegs to 0 so as to cause the next conditional
+ to fail. */
+ if (!isPX && mm != 0)
+ nRegs = 0;
if (nRegs == 1 /* .1d is allowed */
|| (nRegs >= 2 && nRegs <= 4 && !is1d) /* .1d is not allowed */) {
@@ -5744,7 +5750,6 @@
binop(Iop_Add64, mkexpr(tTA),
mkU64(1 * step)))));
/* fallthru */
-
case 1:
assign(i0, MAYBE_WIDEN_FROM_64(
loadLE(loadTy,
@@ -5813,145 +5818,448 @@
/* else fall through */
}
- /* ---------- LD1R (single structure, replicate) ---------- */
- /* 31 29 22 20 15 11 9 4
- 0q 001 1010 10 00000 110 0 sz n t LD1R Vt.T, [Xn|SP]
- 0q 001 1011 10 m 110 0 sz n t LD1R Vt.T, [Xn|SP], #sz (m=11111)
- , Xm (m!=11111)
+ /* ------ LD1/ST1 (multiple 1-elem structs to/from 2 regs ------ */
+ /* ------ LD1/ST1 (multiple 1-elem structs to/from 3 regs ------ */
+ /* ------ LD1/ST1 (multiple 1-elem structs to/from 4 regs ------ */
+ /* 31 29 26 22 21 20 15 11 9 4
+
+ 0q 001 1000 L 0 00000 0010 sz n t xx1 {Vt..t+3.T}, [Xn|SP]
+ 0q 001 1001 L 0 m 0010 sz n t xx1 {Vt..t+3.T}, [Xn|SP], step
+
+ 0q 001 1000 L 0 00000 0110 sz n t xx1 {Vt..t+2.T}, [Xn|SP]
+ 0q 001 1001 L 0 m 0110 sz n t xx1 {Vt..t+2.T}, [Xn|SP], step
+
+ 0q 001 1000 L 0 00000 1010 sz n t xx1 {Vt..t+1.T}, [Xn|SP]
+ 0q 001 1001 L 0 m 1010 sz n t xx1 {Vt..t+1.T}, [Xn|SP], step
+
+ T = defined by Q and sz in the normal way
+ step = if m == 11111 then transfer-size else Xm
+ xx = case L of 1 -> LD ; 0 -> ST
*/
- if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)
- && INSN(22,21) == BITS2(1,0) && INSN(15,12) == BITS4(1,1,0,0)) {
- UInt bitQ = INSN(30,30);
- Bool isPX = INSN(23,23) == 1;
- UInt mm = INSN(20,16);
- UInt sz = INSN(11,10);
- UInt nn = INSN(9,5);
- UInt tt = INSN(4,0);
- IRType ty = integerIRTypeOfSize(1 << sz);
- IRTemp tEA = newTemp(Ity_I64);
- assign(tEA, getIReg64orSP(nn));
- if (nn == 31) { /* FIXME generate stack alignment check */ }
- IRTemp loaded = newTemp(ty);
- assign(loaded, loadLE(ty, mkexpr(tEA)));
- IRTemp dupd = math_DUP_TO_V128(loaded, ty);
- putQReg128(tt, math_MAYBE_ZERO_HI64(bitQ, dupd));
- const HChar* arr = nameArr_Q_SZ(bitQ, sz);
- /* Deal with the writeback, if any. */
- if (!isPX && mm == BITS5(0,0,0,0,0)) {
- /* No writeback. */
- DIP("ld1r v%u.%s, [%s]\n", tt, arr, nameIReg64orSP(nn));
- return True;
+ if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
+ && INSN(21,21) == 0) {
+ Bool bitQ = INSN(30,30);
+ Bool isPX = INSN(23,23) == 1;
+ Bool isLD = INSN(22,22) == 1;
+ UInt mm = INSN(20,16);
+ UInt opc = INSN(15,12);
+ UInt sz = INSN(11,10);
+ UInt nn = INSN(9,5);
+ UInt tt = INSN(4,0);
+ Bool isQ = bitQ == 1;
+ UInt nRegs = 0;
+ switch (opc) {
+ case BITS4(0,0,1,0): nRegs = 4; break;
+ case BITS4(0,1,1,0): nRegs = 3; break;
+ case BITS4(1,0,1,0): nRegs = 2; break;
+ default: break;
}
- if (isPX) {
- putIReg64orSP(nn, binop(Iop_Add64, mkexpr(tEA),
- mm == BITS5(1,1,1,1,1) ? mkU64(1 << sz)
- : getIReg64orZR(mm)));
- if (mm == BITS5(1,1,1,1,1)) {
- DIP("ld1r v%u.%s, [%s], %s\n", tt, arr,
- nameIReg64orSP(nn), nameIReg64orZR(mm));
- } else {
- DIP("ld1r v%u.%s, [%s], #%u\n", tt, arr,
- nameIReg64orSP(nn), 1 << sz);
+
+ /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
+ If we see it, set nRegs to 0 so as to cause the next conditional
+ to fail. */
+ if (!isPX && mm != 0)
+ nRegs = 0;
+
+ if (nRegs >= 2 && nRegs <= 4) {
+
+ UInt xferSzB = (isQ ? 16 : 8) * nRegs;
+
+ /* Generate the transfer address (TA) and if necessary the
+ writeback address (WB) */
+ IRTemp tTA = newTemp(Ity_I64);
+ assign(tTA, getIReg64orSP(nn));
+ if (nn == 31) { /* FIXME generate stack alignment check */ }
+ IRTemp tWB = IRTemp_INVALID;
+ if (isPX) {
+ tWB = newTemp(Ity_I64);
+ assign(tWB, binop(Iop_Add64,
+ mkexpr(tTA),
+ mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
+ : getIReg64orZR(mm)));
+ }
+
+ /* -- BEGIN generate the transfers -- */
+
+ IRTemp u0, u1, u2, u3;
+ u0 = u1 = u2 = u3 = IRTemp_INVALID;
+ switch (nRegs) {
+ case 4: u3 = newTempV128(); /* fallthru */
+ case 3: u2 = newTempV128(); /* fallthru */
+ case 2: u1 = newTempV128();
+ u0 = newTempV128(); break;
+ default: vassert(0);
+ }
+
+ /* -- Multiple 128 or 64 bit stores -- */
+ if (!isLD) {
+ switch (nRegs) {
+ case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */
+ case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */
+ case 2: assign(u1, getQReg128((tt+1) % 32));
+ assign(u0, getQReg128((tt+0) % 32)); break;
+ default: vassert(0);
+ }
+# define MAYBE_NARROW_TO_64(_expr) \
+ (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
+ UInt step = isQ ? 16 : 8;
+ switch (nRegs) {
+ case 4: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)),
+ MAYBE_NARROW_TO_64(mkexpr(u3)) );
+ /* fallthru */
+ case 3: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)),
+ MAYBE_NARROW_TO_64(mkexpr(u2)) );
+ /* fallthru */
+ case 2: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)),
+ MAYBE_NARROW_TO_64(mkexpr(u1)) );
+ storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)),
+ MAYBE_NARROW_TO_64(mkexpr(u0)) );
+ break;
+ default: vassert(0);
+ }
+# undef MAYBE_NARROW_TO_64
+ }
+
+ /* -- Multiple 128 or 64 bit loads -- */
+ else /* isLD */ {
+ UInt step = isQ ? 16 : 8;
+ IRType loadTy = isQ ? Ity_V128 : Ity_I64;
+# define MAYBE_WIDEN_FROM_64(_expr) \
+ (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
+ switch (nRegs) {
+ case 4:
+ assign(u3, MAYBE_WIDEN_FROM_64(
+ loadLE(loadTy,
+ binop(Iop_Add64, mkexpr(tTA),
+ mkU64(3 * step)))));
+ /* fallthru */
+ case 3:
+ assign(u2, MAYBE_WIDEN_FROM_64(
+ loadLE(loadTy,
+ binop(Iop_Add64, mkexpr(tTA),
+ mkU64(2 * step)))));
+ /* fallthru */
+ case 2:
+ assign(u1, MAYBE_WIDEN_FROM_64(
+ loadLE(loadTy,
+ binop(Iop_Add64, mkexpr(tTA),
+ mkU64(1 * step)))));
+ assign(u0, MAYBE_WIDEN_FROM_64(
+ loadLE(loadTy,
+ binop(Iop_Add64, mkexpr(tTA),
+ mkU64(0 * step)))));
+ break;
+ default:
+ vassert(0);
+ }
+# undef MAYBE_WIDEN_FROM_64
+ switch (nRegs) {
+ case 4: putQReg128( (tt+3) % 32,
+ math_MAYBE_ZERO_HI64(bitQ, u3));
+ /* fallthru */
+ case 3: putQReg128( (tt+2) % 32,
+ math_MAYBE_ZERO_HI64(bitQ, u2));
+ /* fallthru */
+ case 2: putQReg128( (tt+1) % 32,
+ math_MAYBE_ZERO_HI64(bitQ, u1));
+ putQReg128( (tt+0) % 32,
+ math_MAYBE_ZERO_HI64(bitQ, u0));
+ break;
+ default: vassert(0);
+ }
}
+
+ /* -- END generate the transfers -- */
+
+ /* Do the writeback, if necessary */
+ if (isPX) {
+ putIReg64orSP(nn, mkexpr(tWB));
+ }
+
+ HChar pxStr[20];
+ pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
+ if (isPX) {
+ if (mm == BITS5(1,1,1,1,1))
+ vex_sprintf(pxStr, ", #%u", xferSzB);
+ else
+ vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
+ }
+ const HChar* arr = nameArr_Q_SZ(bitQ, sz);
+ DIP("%s1 {v%u.%s .. v%u.%s}, [%s]%s\n",
+ isLD ? "ld" : "st",
+ (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
+ pxStr);
+
return True;
}
- return False;
+ /* else fall through */
}
- /* -------- LD1/ST1 (multi 1-elem structs, 2 regs, no offset) -------- */
- /* Only a very few cases. */
- /* 31 23
- 0100 1100 0100 0000 1010 00 n t LD1 {Vt.16b, V(t+1)%32.16b}, [Xn|SP]
- 0100 1100 0000 0000 1010 00 n t ST1 {Vt.16b, V(t+1)%32.16b}, [Xn|SP]
- */
- if ( (insn & 0xFFFFFC00) == 0x4C40A000 // LD1
- || (insn & 0xFFFFFC00) == 0x4C00A000 // ST1
- ) {
- Bool isLD = INSN(22,22) == 1;
- UInt rN = INSN(9,5);
- UInt vT = INSN(4,0);
- IRTemp tEA = newTemp(Ity_I64);
- const HChar* name = "16b";
- assign(tEA, getIReg64orSP(rN));
- if (rN == 31) { /* FIXME generate stack alignment check */ }
- IRExpr* tEA_0 = binop(Iop_Add64, mkexpr(tEA), mkU64(0));
- IRExpr* tEA_16 = binop(Iop_Add64, mkexpr(tEA), mkU64(16));
- if (isLD) {
- putQReg128((vT+0) % 32, loadLE(Ity_V128, tEA_0));
- putQReg128((vT+1) % 32, loadLE(Ity_V128, tEA_16));
- } else {
- storeLE(tEA_0, getQReg128((vT+0) % 32));
- storeLE(tEA_16, getQReg128((vT+1) % 32));
+ /* ---------- LD1R (single structure, replicate) ---------- */
+ /* ---------- LD2R (single structure, replicate) ---------- */
+ /* ---------- LD3R (single structure, replicate) ---------- */
+ /* ---------- LD4R (single structure, replicate) ---------- */
+ /* 31 29 22 20 15 11 9 4
+ 0q 001 1010 10 00000 110 0 sz n t LD1R {Vt.T}, [Xn|SP]
+ 0q 001 1011 10 m 110 0 sz n t LD1R {Vt.T}, [Xn|SP], step
+
+ 0q 001 1010 11 00000 110 0 sz n t LD2R {Vt..t+1.T}, [Xn|SP]
+ 0q 001 1011 11 m 110 0 sz n t LD2R {Vt..t+1.T}, [Xn|SP], step
+
+ 0q 001 1010 10 00000 111 0 sz n t LD3R {Vt..t+2.T}, [Xn|SP]
+ 0q 001 1011 10 m 111 0 sz n t LD3R {Vt..t+2.T}, [Xn|SP], step
+
+ 0q 001 1010 11 00000 111 0 sz n t LD4R {Vt..t+3.T}, [Xn|SP]
+ 0q 001 1011 11 m 111 0 sz n t LD4R {Vt..t+3.T}, [Xn|SP], step
+
+ step = if m == 11111 then transfer-size else Xm
+ */
+ if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)
+ && INSN(22,22) == 1 && INSN(15,14) == BITS2(1,1)
+ && INSN(12,12) == 0) {
+ UInt bitQ = INSN(30,30);
+ Bool isPX = INSN(23,23) == 1;
+ UInt nRegs = ((INSN(13,13) << 1) | INSN(21,21)) + 1;
+ UInt mm = INSN(20,16);
+ UInt sz = INSN(11,10);
+ UInt nn = INSN(9,5);
+ UInt tt = INSN(4,0);
+
+ /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
+ if (isPX || mm == 0) {
+
+ IRType ty = integerIRTypeOfSize(1 << sz);
+
+ UInt laneSzB = 1 << sz;
+ UInt xferSzB = laneSzB * nRegs;
+
+ /* Generate the transfer address (TA) and if necessary the
+ writeback address (WB) */
+ IRTemp tTA = newTemp(Ity_I64);
+ assign(tTA, getIReg64orSP(nn));
+ if (nn == 31) { /* FIXME generate stack alignment check */ }
+ IRTemp tWB = IRTemp_INVALID;
+ if (isPX) {
+ tWB = newTemp(Ity_I64);
+ assign(tWB, binop(Iop_Add64,
+ mkexpr(tTA),
+ mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
+ : getIReg64orZR(mm)));
+ }
+
+ /* Do the writeback, if necessary */
+ if (isPX) {
+ putIReg64orSP(nn, mkexpr(tWB));
+ }
+
+ IRTemp e0, e1, e2, e3, v0, v1, v2, v3;
+ e0 = e1 = e2 = e3 = v0 = v1 = v2 = v3 = IRTemp_INVALID;
+ switch (nRegs) {
+ case 4:
+ e3 = newTemp(ty);
+ assign(e3, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
+ mkU64(3 * laneSzB))));
+ v3 = math_DUP_TO_V128(e3, ty);
+ putQReg128((tt+3) % 32, math_MAYBE_ZERO_HI64(bitQ, v3));
+ /* fallthrough */
+ case 3:
+ e2 = newTemp(ty);
+ assign(e2, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
+ mkU64(2 * laneSzB))));
+ v2 = math_DUP_TO_V128(e2, ty);
+ putQReg128((tt+2) % 32, math_MAYBE_ZERO_HI64(bitQ, v2));
+ /* fallthrough */
+ case 2:
+ e1 = newTemp(ty);
+ assign(e1, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
+ mkU64(1 * laneSzB))));
+ v1 = math_DUP_TO_V128(e1, ty);
+ putQReg128((tt+1) % 32, math_MAYBE_ZERO_HI64(bitQ, v1));
+ /* fallthrough */
+ case 1:
+ e0 = newTemp(ty);
+ assign(e0, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
+ mkU64(0 * laneSzB))));
+ v0 = math_DUP_TO_V128(e0, ty);
+ putQReg128((tt+0) % 32, math_MAYBE_ZERO_HI64(bitQ, v0));
+ break;
+ default:
+ vassert(0);
+ }
+
+ HChar pxStr[20];
+ pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
+ if (isPX) {
+ if (mm == BITS5(1,1,1,1,1))
+ vex_sprintf(pxStr, ", #%u", xferSzB);
+ else
+ vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
+ }
+ const HChar* arr = nameArr_Q_SZ(bitQ, sz);
+ DIP("ld%ur {v%u.%s .. v%u.%s}, [%s]%s\n",
+ nRegs,
+ (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
+ pxStr);
+
+ return True;
}
- DIP("%s {v%u.%s, v%u.%s}, [%s]\n", isLD ? "ld1" : "st1",
- (vT+0) % 32, name, (vT+1) % 32, name, nameIReg64orSP(rN));
- return True;
+ /* else fall through */
}
- /* -------- LD1/ST1 (multi 1-elem structs, 2 regs, post index) -------- */
- /* Only a very few cases. */
- /* 31 23
- 0100 1100 1101 1111 1010 00 n t LD1 {Vt.16b, V(t+1)%32.16b}, [Xn|SP], #32
- 0100 1100 1001 1111 1010 00 n t ST1 {Vt.16b, V(t+1)%32.16b}, [Xn|SP], #32
- */
- if ( (insn & 0xFFFFFC00) == 0x4CDFA000 // LD1
- || (insn & 0xFFFFFC00) == 0x4C9FA000 // ST1
- ) {
- Bool isLD = INSN(22,22) == 1;
- UInt rN = INSN(9,5);
- UInt vT = INSN(4,0);
- IRTemp tEA = newTemp(Ity_I64);
- const HChar* name = "16b";
- assign(tEA, getIReg64orSP(rN));
- if (rN == 31) { /* FIXME generate stack alignment check */ }
- IRExpr* tEA_0 = binop(Iop_Add64, mkexpr(tEA), mkU64(0));
- IRExpr* tEA_16 = binop(Iop_Add64, mkexpr(tEA), mkU64(16));
- if (isLD) {
- putQReg128((vT+0) % 32, loadLE(Ity_V128, tEA_0));
- putQReg128((vT+1) % 32, loadLE(Ity_V128, tEA_16));
- } else {
- storeLE(tEA_0, getQReg128((vT+0) % 32));
- storeLE(tEA_16, getQReg128((vT+1) % 32));
+ /* ------ LD1/ST1 (single structure, to/from one lane) ------ */
+ /* ------ LD2/ST2 (single structure, to/from one lane) ------ */
+ /* ------ LD3/ST3 (single structure, to/from one lane) ------ */
+ /* ------ LD4/ST4 (single structure, to/from one lane) ------ */
+ /* 31 29 22 21 20 15 11 9 4
+ 0q 001 1010 L 0 00000 xx0 S sz n t op1 {Vt.T}[ix], [Xn|SP]
+ 0q 001 1011 L 0 m xx0 S sz n t op1 {Vt.T}[ix], [Xn|SP], step
+
+ 0q 001 1010 L 1 00000 xx0 S sz n t op2 {Vt..t+1.T}[ix], [Xn|SP]
+ 0q 001 1011 L 1 m xx0 S sz n t op2 {Vt..t+1.T}[ix], [Xn|SP], step
+
+ 0q 001 1010 L 0 00000 xx1 S sz n t op3 {Vt..t+2.T}[ix], [Xn|SP]
+ 0q 001 1011 L 0 m xx1 S sz n t op3 {Vt..t+2.T}[ix], [Xn|SP], step
+
+ 0q 001 1010 L 1 00000 xx1 S sz n t op4 {Vt..t+3.T}[ix], [Xn|SP]
+ 0q 001 1011 L 1 m xx1 S sz n t op4 {Vt..t+3.T}[ix], [Xn|SP], step
+
+ step = if m == 11111 then transfer-size else Xm
+ op = case L of 1 -> LD ; 0 -> ST
+
+ laneszB,ix = case xx:q:S:sz of 00:b:b:bb -> 1, bbbb
+ 01:b:b:b0 -> 2, bbb
+ 10:b:b:00 -> 4, bb
+ 10:b:0:01 -> 8, b
+ */
+ if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)) {
+ UInt bitQ = INSN(30,30);
+ Bool isPX = INSN(23,23) == 1;
+ Bool isLD = INSN(22,22) == 1;
+ UInt nRegs = ((INSN(13,13) << 1) | INSN(21,21)) + 1;
+ UInt mm = INSN(20,16);
+ UInt xx = INSN(15,14);
+ UInt bitS = INSN(12,12);
+ UInt sz = INSN(11,10);
+ UInt nn = INSN(9,5);
+ UInt tt = INSN(4,0);
+
+ Bool valid = True;
+
+ /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
+ if (!isPX && mm != 0)
+ valid = False;
+
+ UInt laneSzB = 0; /* invalid */
+ UInt ix = 16; /* invalid */
+
+ UInt xx_q_S_sz = (xx << 4) | (bitQ << 3) | (bitS << 2) | sz;
+ switch (xx_q_S_sz) {
+ case 0x00: case 0x01: case 0x02: case 0x03:
+ case 0x04: case 0x05: case 0x06: case 0x07:
+ case 0x08: case 0x09: case 0x0A: case 0x0B:
+ case 0x0C: case 0x0D: case 0x0E: case 0x0F:
+ laneSzB = 1; ix = xx_q_S_sz & 0xF;
+ break;
+ case 0x10: case 0x12: case 0x14: case 0x16:
+ case 0x18: case 0x1A: case 0x1C: case 0x1E:
+ laneSzB = 2; ix = (xx_q_S_sz >> 1) & 7;
+ break;
+ case 0x20: case 0x24: case 0x28: case 0x2C:
+ laneSzB = 4; ix = (xx_q_S_sz >> 2) & 3;
+ break;
+ case 0x21: case 0x29:
+ laneSzB = 8; ix = (xx_q_S_sz >> 3) & 1;
+ break;
+ default:
+ break;
}
- putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(32)));
- DIP("%s {v%u.%s, v%u.%s}, [%s], #32\n", isLD ? "ld1" : "st1",
- (vT+0) % 32, name, (vT+1) % 32, name, nameIReg64orSP(rN));
- return True;
- }
- /* -------- LD1/ST1 (multi 1-elem structs, 3 regs, no offset) -------- */
- /* Only a very few cases. */
- /* 31 23
- 0100 1100 0100 0000 0110 00 n t LD1 {Vt.16b .. V(t+2)%32.16b}, [Xn|SP]
- 0100 1100 0000 0000 0110 00 n t ST1 {Vt.16b .. V(t+2)%32.16b}, [Xn|SP]
- */
- if ( (insn & 0xFFFFFC00) == 0x4C406000 // LD1
- || (insn & 0xFFFFFC00) == 0x4C006000 // ST1
- ) {
- Bool isLD = INSN(22,22) == 1;
- UInt rN = INSN(9,5);
- UInt vT = INSN(4,0);
- IRTemp tEA = newTemp(Ity_I64);
- const HChar* name = "16b";
- assign(tEA, getIReg64orSP(rN));
- if (rN == 31) { /* FIXME generate stack alignment check */ }
- IRExpr* tEA_0 = binop(Iop_Add64, mkexpr(tEA), mkU64(0));
- IRExpr* tEA_16 = binop(Iop_Add64, mkexpr(tEA), mkU64(16));
- IRExpr* tEA_32 = binop(Iop_Add64, mkexpr(tEA), mkU64(32));
- if (isLD) {
- putQReg128((vT+0) % 32, loadLE(Ity_V128, tEA_0));
- putQReg128((vT+1) % 32, loadLE(Ity_V128, tEA_16));
- putQReg128((vT+2) % 32, loadLE(Ity_V128, tEA_32));
- } else {
- storeLE(tEA_0, getQReg128((vT+0) % 32));
- storeLE(tEA_16, getQReg128((vT+1) % 32));
- storeLE(tEA_32, getQReg128((vT+2) % 32));
- }
- DIP("%s {v%u.%s, v%u.%s, v%u.%s}, [%s], #32\n",
- isLD ? "ld1" : "st1",
- (vT+0) % 32, name, (vT+1) % 32, name, (vT+2) % 32, name,
- nameIReg64orSP(rN));
- return True;
+ if (valid && laneSzB != 0) {
+
+ IRType ty = integerIRTypeOfSize(laneSzB);
+ UInt xferSzB = laneSzB * nRegs;
+
+ /* Generate the transfer address (TA) and if necessary the
+ writeback address (WB) */
+ IRTemp tTA = newTemp(Ity_I64);
+ assign(tTA, getIReg64orSP(nn));
+ if (nn == 31) { /* FIXME generate stack alignment check */ }
+ IRTemp tWB = IRTemp_INVALID;
+ if (isPX) {
+ tWB = newTemp(Ity_I64);
+ assign(tWB, binop(Iop_Add64,
+ mkexpr(tTA),
+ mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
+ : getIReg64orZR(mm)));
+ }
+
+ /* Do the writeback, if necessary */
+ if (isPX) {
+ putIReg64orSP(nn, mkexpr(tWB));
+ }
+
+ switch (nRegs) {
+ case 4: {
+ IRExpr* addr
+ = binop(Iop_Add64, mkexpr(tTA), mkU64(3 * laneSzB));
+ if (isLD) {
+ putQRegLane((tt+3) % 32, ix, loadLE(ty, addr));
+ } else {
+ storeLE(addr, getQRegLane((tt+3) % 32, ix, ty));
+ }
+ /* fallthrough */
+ }
+ case 3: {
+ IRExpr* addr
+ = binop(Iop_Add64, mkexpr(tTA), mkU64(2 * laneSzB));
+ if (isLD) {
+ putQRegLane((tt+2) % 32, ix, loadLE(ty, addr));
+ } else {
+ storeLE(addr, getQRegLane((tt+2) % 32, ix, ty));
+ }
+ /* fallthrough */
+ }
+ case 2: {
+ IRExpr* addr
+ = binop(Iop_Add64, mkexpr(tTA), mkU64(1 * laneSzB));
+ if (isLD) {
+ putQRegLane((tt+1) % 32, ix, loadLE(ty, addr));
+ } else {
+ storeLE(addr, getQRegLane((tt+1) % 32, ix, ty));
+ }
+ /* fallthrough */
+ }
+ case 1: {
+ IRExpr* addr
+ = binop(Iop_Add64, mkexpr(tTA), mkU64(0 * laneSzB));
+ if (isLD) {
+ putQRegLane((tt+0) % 32, ix, loadLE(ty, addr));
+ } else {
+ storeLE(addr, getQRegLane((tt+0) % 32, ix, ty));
+ }
+ break;
+ }
+ default:
+ vassert(0);
+ }
+
+ HChar pxStr[20];
+ pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
+ if (isPX) {
+ if (mm == BITS5(1,1,1,1,1))
+ vex_sprintf(pxStr, ", #%u", xferSzB);
+ else
+ vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
+ }
+ const HChar* arr = nameArr_Q_SZ(bitQ, sz);
+ DIP("%s%u {v%u.%s .. v%u.%s}[%u], [%s]%s\n",
+ isLD ? "ld" : "st", nRegs,
+ (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr,
+ ix, nameIReg64orSP(nn), pxStr);
+
+ return True;
+ }
+ /* else fall through */
}
/* ------------------ LD{,A}X{R,RH,RB} ------------------ */
|
|
From: <sv...@va...> - 2014-11-23 12:49:24
|
Author: sewardj
Date: Sun Nov 23 12:49:14 2014
New Revision: 3005
Log:
Merge, from trunk, r2976
Implement SIMD (de)interleaving loads/stores:
LD1/ST1 (multiple 1-elem structs to/from 1 reg)
LD2/ST2 (multiple 2-elem structs to/from 2 regs)
LD3/ST3 (multiple 3-elem structs to/from 3 regs)
LD4/ST4 (multiple 4-elem structs to/from 4 regs)
Also:
LDNP, STNP (load/store vector pair, non-temporal)
Modified:
branches/VEX_3_10_BRANCH/ (props changed)
branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
branches/VEX_3_10_BRANCH/priv/host_arm64_defs.c
branches/VEX_3_10_BRANCH/priv/host_arm64_isel.c
Modified: branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c (original)
+++ branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c Sun Nov 23 12:49:14 2014
@@ -404,18 +404,18 @@
*t3 = newTempV128();
}
-//static
-//void newTempsV128_4(IRTemp* t1, IRTemp* t2, IRTemp* t3, IRTemp* t4)
-//{
-// vassert(t1 && *t1 == IRTemp_INVALID);
-// vassert(t2 && *t2 == IRTemp_INVALID);
-// vassert(t3 && *t3 == IRTemp_INVALID);
-// vassert(t4 && *t4 == IRTemp_INVALID);
-// *t1 = newTempV128();
-// *t2 = newTempV128();
-// *t3 = newTempV128();
-// *t4 = newTempV128();
-//}
+static
+void newTempsV128_4(IRTemp* t1, IRTemp* t2, IRTemp* t3, IRTemp* t4)
+{
+ vassert(t1 && *t1 == IRTemp_INVALID);
+ vassert(t2 && *t2 == IRTemp_INVALID);
+ vassert(t3 && *t3 == IRTemp_INVALID);
+ vassert(t4 && *t4 == IRTemp_INVALID);
+ *t1 = newTempV128();
+ *t2 = newTempV128();
+ *t3 = newTempV128();
+ *t4 = newTempV128();
+}
static
void newTempsV128_7(IRTemp* t1, IRTemp* t2, IRTemp* t3,
@@ -3467,6 +3467,950 @@
/*------------------------------------------------------------*/
+/*--- Math helpers for vector interleave/deinterleave ---*/
+/*------------------------------------------------------------*/
+
+#define EX(_tmp) \
+ mkexpr(_tmp)
+#define SL(_hi128,_lo128,_nbytes) \
+ ( (_nbytes) == 0 \
+ ? (_lo128) \
+ : triop(Iop_SliceV128,(_hi128),(_lo128),mkU8(_nbytes)) )
+#define ROR(_v128,_nbytes) \
+ SL((_v128),(_v128),(_nbytes))
+#define ROL(_v128,_nbytes) \
+ SL((_v128),(_v128),16-(_nbytes))
+#define SHR(_v128,_nbytes) \
+ binop(Iop_ShrV128,(_v128),mkU8(8*(_nbytes)))
+#define SHL(_v128,_nbytes) \
+ binop(Iop_ShlV128,(_v128),mkU8(8*(_nbytes)))
+#define ILO64x2(_argL,_argR) \
+ binop(Iop_InterleaveLO64x2,(_argL),(_argR))
+#define IHI64x2(_argL,_argR) \
+ binop(Iop_InterleaveHI64x2,(_argL),(_argR))
+#define ILO32x4(_argL,_argR) \
+ binop(Iop_InterleaveLO32x4,(_argL),(_argR))
+#define IHI32x4(_argL,_argR) \
+ binop(Iop_InterleaveHI32x4,(_argL),(_argR))
+#define ILO16x8(_argL,_argR) \
+ binop(Iop_InterleaveLO16x8,(_argL),(_argR))
+#define IHI16x8(_argL,_argR) \
+ binop(Iop_InterleaveHI16x8,(_argL),(_argR))
+#define ILO8x16(_argL,_argR) \
+ binop(Iop_InterleaveLO8x16,(_argL),(_argR))
+#define IHI8x16(_argL,_argR) \
+ binop(Iop_InterleaveHI8x16,(_argL),(_argR))
+#define CEV32x4(_argL,_argR) \
+ binop(Iop_CatEvenLanes32x4,(_argL),(_argR))
+#define COD32x4(_argL,_argR) \
+ binop(Iop_CatOddLanes32x4,(_argL),(_argR))
+#define COD16x8(_argL,_argR) \
+ binop(Iop_CatOddLanes16x8,(_argL),(_argR))
+#define COD8x16(_argL,_argR) \
+ binop(Iop_CatOddLanes8x16,(_argL),(_argR))
+#define CEV8x16(_argL,_argR) \
+ binop(Iop_CatEvenLanes8x16,(_argL),(_argR))
+#define AND(_arg1,_arg2) \
+ binop(Iop_AndV128,(_arg1),(_arg2))
+#define OR2(_arg1,_arg2) \
+ binop(Iop_OrV128,(_arg1),(_arg2))
+#define OR3(_arg1,_arg2,_arg3) \
+ binop(Iop_OrV128,(_arg1),binop(Iop_OrV128,(_arg2),(_arg3)))
+#define OR4(_arg1,_arg2,_arg3,_arg4) \
+ binop(Iop_OrV128, \
+ binop(Iop_OrV128,(_arg1),(_arg2)), \
+ binop(Iop_OrV128,(_arg3),(_arg4)))
+
+
+/* Do interleaving for 1 128 bit vector, for ST1 insns. */
+static
+void math_INTERLEAVE1_128( /*OUTx1*/ IRTemp* i0,
+ UInt laneSzBlg2, IRTemp u0 )
+{
+ assign(*i0, mkexpr(u0));
+}
+
+
+/* Do interleaving for 2 128 bit vectors, for ST2 insns. */
+static
+void math_INTERLEAVE2_128( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
+ UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
+{
+ /* This is pretty easy, since we have primitives directly to
+ hand. */
+ if (laneSzBlg2 == 3) {
+ // 64x2
+ // u1 == B1 B0, u0 == A1 A0
+ // i1 == B1 A1, i0 == B0 A0
+ assign(*i0, binop(Iop_InterleaveLO64x2, mkexpr(u1), mkexpr(u0)));
+ assign(*i1, binop(Iop_InterleaveHI64x2, mkexpr(u1), mkexpr(u0)));
+ return;
+ }
+ if (laneSzBlg2 == 2) {
+ // 32x4
+ // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
+ // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
+ assign(*i0, binop(Iop_InterleaveLO32x4, mkexpr(u1), mkexpr(u0)));
+ assign(*i1, binop(Iop_InterleaveHI32x4, mkexpr(u1), mkexpr(u0)));
+ return;
+ }
+ if (laneSzBlg2 == 1) {
+ // 16x8
+ // u1 == B{7..0}, u0 == A{7..0}
+ // i0 == B3 A3 B2 A2 B1 A1 B0 A0
+ // i1 == B7 A7 B6 A6 B5 A5 B4 A4
+ assign(*i0, binop(Iop_InterleaveLO16x8, mkexpr(u1), mkexpr(u0)));
+ assign(*i1, binop(Iop_InterleaveHI16x8, mkexpr(u1), mkexpr(u0)));
+ return;
+ }
+ if (laneSzBlg2 == 0) {
+ // 8x16
+ // u1 == B{f..0}, u0 == A{f..0}
+ // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
+ // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
+ assign(*i0, binop(Iop_InterleaveLO8x16, mkexpr(u1), mkexpr(u0)));
+ assign(*i1, binop(Iop_InterleaveHI8x16, mkexpr(u1), mkexpr(u0)));
+ return;
+ }
+ /*NOTREACHED*/
+ vassert(0);
+}
+
+
+/* Do interleaving for 3 128 bit vectors, for ST3 insns. */
+static
+void math_INTERLEAVE3_128(
+ /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
+ UInt laneSzBlg2,
+ IRTemp u0, IRTemp u1, IRTemp u2 )
+{
+ if (laneSzBlg2 == 3) {
+ // 64x2
+ // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
+ // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
+ assign(*i2, IHI64x2( EX(u2), EX(u1) ));
+ assign(*i1, ILO64x2( ROR(EX(u0),8), EX(u2) ));
+ assign(*i0, ILO64x2( EX(u1), EX(u0) ));
+ return;
+ }
+
+ if (laneSzBlg2 == 2) {
+ // 32x4
+ // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
+ // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
+ // i2 == C3 B3 A2 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
+ IRTemp p0 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p2 = newTempV128();
+ IRTemp c1100 = newTempV128();
+ IRTemp c0011 = newTempV128();
+ IRTemp c0110 = newTempV128();
+ assign(c1100, mkV128(0xFF00));
+ assign(c0011, mkV128(0x00FF));
+ assign(c0110, mkV128(0x0FF0));
+ // First interleave them at 64x2 granularity,
+ // generating partial ("p") values.
+ math_INTERLEAVE3_128(&p0, &p1, &p2, 3, u0, u1, u2);
+ // And more shuffling around for the final answer
+ assign(*i2, OR2( AND( IHI32x4(EX(p2), ROL(EX(p2),8)), EX(c1100) ),
+ AND( IHI32x4(ROR(EX(p1),4), EX(p2)), EX(c0011) ) ));
+ assign(*i1, OR3( SHL(EX(p2),12),
+ AND(EX(p1),EX(c0110)),
+ SHR(EX(p0),12) ));
+ assign(*i0, OR2( AND( ILO32x4(EX(p0),ROL(EX(p1),4)), EX(c1100) ),
+ AND( ILO32x4(ROR(EX(p0),8),EX(p0)), EX(c0011) ) ));
+ return;
+ }
+
+ if (laneSzBlg2 == 1) {
+ // 16x8
+ // u2 == C7 C6 C5 C4 C3 C2 C1 C0
+ // u1 == B7 B6 B5 B4 B3 B2 B1 B0
+ // u0 == A7 A6 A5 A4 A3 A2 A1 A0
+ //
+ // p2 == C7 C6 B7 B6 A7 A6 C5 C4
+ // p1 == B5 B4 A5 A4 C3 C2 B3 B2
+ // p0 == A3 A2 C1 C0 B1 B0 A1 A0
+ //
+ // i2 == C7 B7 A7 C6 B6 A6 C5 B5
+ // i1 == A5 C4 B4 A4 C4 B3 A3 C2
+ // i0 == B2 A2 C1 B1 A1 C0 B0 A0
+ IRTemp p0 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p2 = newTempV128();
+ IRTemp c1000 = newTempV128();
+ IRTemp c0100 = newTempV128();
+ IRTemp c0010 = newTempV128();
+ IRTemp c0001 = newTempV128();
+ assign(c1000, mkV128(0xF000));
+ assign(c0100, mkV128(0x0F00));
+ assign(c0010, mkV128(0x00F0));
+ assign(c0001, mkV128(0x000F));
+ // First interleave them at 32x4 granularity,
+ // generating partial ("p") values.
+ math_INTERLEAVE3_128(&p0, &p1, &p2, 2, u0, u1, u2);
+ // And more shuffling around for the final answer
+ assign(*i2,
+ OR4( AND( IHI16x8( EX(p2), ROL(EX(p2),4) ), EX(c1000) ),
+ AND( IHI16x8( ROL(EX(p2),6), EX(p2) ), EX(c0100) ),
+ AND( IHI16x8( ROL(EX(p2),2), ROL(EX(p2),6) ), EX(c0010) ),
+ AND( ILO16x8( ROR(EX(p2),2), ROL(EX(p1),2) ), EX(c0001) )
+ ));
+ assign(*i1,
+ OR4( AND( IHI16x8( ROL(EX(p1),4), ROR(EX(p2),2) ), EX(c1000) ),
+ AND( IHI16x8( EX(p1), ROL(EX(p1),4) ), EX(c0100) ),
+ AND( IHI16x8( ROL(EX(p1),4), ROL(EX(p1),8) ), EX(c0010) ),
+ AND( IHI16x8( ROR(EX(p0),6), ROL(EX(p1),4) ), EX(c0001) )
+ ));
+ assign(*i0,
+ OR4( AND( IHI16x8( ROR(EX(p1),2), ROL(EX(p0),2) ), EX(c1000) ),
+ AND( IHI16x8( ROL(EX(p0),2), ROL(EX(p0),6) ), EX(c0100) ),
+ AND( IHI16x8( ROL(EX(p0),8), ROL(EX(p0),2) ), EX(c0010) ),
+ AND( IHI16x8( ROL(EX(p0),4), ROL(EX(p0),8) ), EX(c0001) )
+ ));
+ return;
+ }
+
+ if (laneSzBlg2 == 0) {
+ // 8x16. It doesn't seem worth the hassle of first doing a
+ // 16x8 interleave, so just generate all 24 partial results
+ // directly :-(
+ // u2 == Cf .. C0, u1 == Bf .. B0, u0 == Af .. A0
+ // i2 == Cf Bf Af Ce .. Bb Ab Ca
+ // i1 == Ba Aa C9 B9 .. A6 C5 B5
+ // i0 == A5 C4 B4 A4 .. C0 B0 A0
+
+ IRTemp i2_FEDC = newTempV128(); IRTemp i2_BA98 = newTempV128();
+ IRTemp i2_7654 = newTempV128(); IRTemp i2_3210 = newTempV128();
+ IRTemp i1_FEDC = newTempV128(); IRTemp i1_BA98 = newTempV128();
+ IRTemp i1_7654 = newTempV128(); IRTemp i1_3210 = newTempV128();
+ IRTemp i0_FEDC = newTempV128(); IRTemp i0_BA98 = newTempV128();
+ IRTemp i0_7654 = newTempV128(); IRTemp i0_3210 = newTempV128();
+ IRTemp i2_hi64 = newTempV128(); IRTemp i2_lo64 = newTempV128();
+ IRTemp i1_hi64 = newTempV128(); IRTemp i1_lo64 = newTempV128();
+ IRTemp i0_hi64 = newTempV128(); IRTemp i0_lo64 = newTempV128();
+
+ // eg XXXX(qqq, CC, 0xF, BB, 0xA)) sets qqq to be a vector
+ // of the form 14 bytes junk : CC[0xF] : BB[0xA]
+ //
+# define XXXX(_tempName,_srcVec1,_srcShift1,_srcVec2,_srcShift2) \
+ IRTemp t_##_tempName = newTempV128(); \
+ assign(t_##_tempName, \
+ ILO8x16( ROR(EX(_srcVec1),(_srcShift1)), \
+ ROR(EX(_srcVec2),(_srcShift2)) ) )
+
+ // Let CC, BB, AA be (handy) aliases of u2, u1, u0 respectively
+ IRTemp CC = u2; IRTemp BB = u1; IRTemp AA = u0;
+
+ // The slicing and reassembly are done as interleavedly as possible,
+ // so as to minimise the demand for registers in the back end, which
+ // was observed to be a problem in testing.
+
+ XXXX(CfBf, CC, 0xf, BB, 0xf); // i2[15:14]
+ XXXX(AfCe, AA, 0xf, CC, 0xe);
+ assign(i2_FEDC, ILO16x8(EX(t_CfBf), EX(t_AfCe)));
+
+ XXXX(BeAe, BB, 0xe, AA, 0xe);
+ XXXX(CdBd, CC, 0xd, BB, 0xd);
+ assign(i2_BA98, ILO16x8(EX(t_BeAe), EX(t_CdBd)));
+ assign(i2_hi64, ILO32x4(EX(i2_FEDC), EX(i2_BA98)));
+
+ XXXX(AdCc, AA, 0xd, CC, 0xc);
+ XXXX(BcAc, BB, 0xc, AA, 0xc);
+ assign(i2_7654, ILO16x8(EX(t_AdCc), EX(t_BcAc)));
+
+ XXXX(CbBb, CC, 0xb, BB, 0xb);
+ XXXX(AbCa, AA, 0xb, CC, 0xa); // i2[1:0]
+ assign(i2_3210, ILO16x8(EX(t_CbBb), EX(t_AbCa)));
+ assign(i2_lo64, ILO32x4(EX(i2_7654), EX(i2_3210)));
+ assign(*i2, ILO64x2(EX(i2_hi64), EX(i2_lo64)));
+
+ XXXX(BaAa, BB, 0xa, AA, 0xa); // i1[15:14]
+ XXXX(C9B9, CC, 0x9, BB, 0x9);
+ assign(i1_FEDC, ILO16x8(EX(t_BaAa), EX(t_C9B9)));
+
+ XXXX(A9C8, AA, 0x9, CC, 0x8);
+ XXXX(B8A8, BB, 0x8, AA, 0x8);
+ assign(i1_BA98, ILO16x8(EX(t_A9C8), EX(t_B8A8)));
+ assign(i1_hi64, ILO32x4(EX(i1_FEDC), EX(i1_BA98)));
+
+ XXXX(C7B7, CC, 0x7, BB, 0x7);
+ XXXX(A7C6, AA, 0x7, CC, 0x6);
+ assign(i1_7654, ILO16x8(EX(t_C7B7), EX(t_A7C6)));
+
+ XXXX(B6A6, BB, 0x6, AA, 0x6);
+ XXXX(C5B5, CC, 0x5, BB, 0x5); // i1[1:0]
+ assign(i1_3210, ILO16x8(EX(t_B6A6), EX(t_C5B5)));
+ assign(i1_lo64, ILO32x4(EX(i1_7654), EX(i1_3210)));
+ assign(*i1, ILO64x2(EX(i1_hi64), EX(i1_lo64)));
+
+ XXXX(A5C4, AA, 0x5, CC, 0x4); // i0[15:14]
+ XXXX(B4A4, BB, 0x4, AA, 0x4);
+ assign(i0_FEDC, ILO16x8(EX(t_A5C4), EX(t_B4A4)));
+
+ XXXX(C3B3, CC, 0x3, BB, 0x3);
+ XXXX(A3C2, AA, 0x3, CC, 0x2);
+ assign(i0_BA98, ILO16x8(EX(t_C3B3), EX(t_A3C2)));
+ assign(i0_hi64, ILO32x4(EX(i0_FEDC), EX(i0_BA98)));
+
+ XXXX(B2A2, BB, 0x2, AA, 0x2);
+ XXXX(C1B1, CC, 0x1, BB, 0x1);
+ assign(i0_7654, ILO16x8(EX(t_B2A2), EX(t_C1B1)));
+
+ XXXX(A1C0, AA, 0x1, CC, 0x0);
+ XXXX(B0A0, BB, 0x0, AA, 0x0); // i0[1:0]
+ assign(i0_3210, ILO16x8(EX(t_A1C0), EX(t_B0A0)));
+ assign(i0_lo64, ILO32x4(EX(i0_7654), EX(i0_3210)));
+ assign(*i0, ILO64x2(EX(i0_hi64), EX(i0_lo64)));
+
+# undef XXXX
+ return;
+ }
+
+ /*NOTREACHED*/
+ vassert(0);
+}
+
+
+/* Do interleaving for 4 128 bit vectors, for ST4 insns. */
+static
+void math_INTERLEAVE4_128(
+ /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
+ UInt laneSzBlg2,
+ IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
+{
+ if (laneSzBlg2 == 3) {
+ // 64x2
+ assign(*i0, ILO64x2(EX(u1), EX(u0)));
+ assign(*i1, ILO64x2(EX(u3), EX(u2)));
+ assign(*i2, IHI64x2(EX(u1), EX(u0)));
+ assign(*i3, IHI64x2(EX(u3), EX(u2)));
+ return;
+ }
+ if (laneSzBlg2 == 2) {
+ // 32x4
+ // First, interleave at the 64-bit lane size.
+ IRTemp p0 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p2 = newTempV128();
+ IRTemp p3 = newTempV128();
+ math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 3, u0, u1, u2, u3);
+ // And interleave (cat) at the 32 bit size.
+ assign(*i0, CEV32x4(EX(p1), EX(p0)));
+ assign(*i1, COD32x4(EX(p1), EX(p0)));
+ assign(*i2, CEV32x4(EX(p3), EX(p2)));
+ assign(*i3, COD32x4(EX(p3), EX(p2)));
+ return;
+ }
+ if (laneSzBlg2 == 1) {
+ // 16x8
+ // First, interleave at the 32-bit lane size.
+ IRTemp p0 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p2 = newTempV128();
+ IRTemp p3 = newTempV128();
+ math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 2, u0, u1, u2, u3);
+ // And rearrange within each vector, to get the right 16 bit lanes.
+ assign(*i0, COD16x8(EX(p0), SHL(EX(p0), 2)));
+ assign(*i1, COD16x8(EX(p1), SHL(EX(p1), 2)));
+ assign(*i2, COD16x8(EX(p2), SHL(EX(p2), 2)));
+ assign(*i3, COD16x8(EX(p3), SHL(EX(p3), 2)));
+ return;
+ }
+ if (laneSzBlg2 == 0) {
+ // 8x16
+ // First, interleave at the 16-bit lane size.
+ IRTemp p0 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p2 = newTempV128();
+ IRTemp p3 = newTempV128();
+ math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 1, u0, u1, u2, u3);
+ // And rearrange within each vector, to get the right 8 bit lanes.
+ assign(*i0, IHI32x4(COD8x16(EX(p0),EX(p0)), CEV8x16(EX(p0),EX(p0))));
+ assign(*i1, IHI32x4(COD8x16(EX(p1),EX(p1)), CEV8x16(EX(p1),EX(p1))));
+ assign(*i2, IHI32x4(COD8x16(EX(p2),EX(p2)), CEV8x16(EX(p2),EX(p2))));
+ assign(*i3, IHI32x4(COD8x16(EX(p3),EX(p3)), CEV8x16(EX(p3),EX(p3))));
+ return;
+ }
+ /*NOTREACHED*/
+ vassert(0);
+}
+
+
+/* Do deinterleaving for 1 128 bit vector, for LD1 insns. */
+static
+void math_DEINTERLEAVE1_128( /*OUTx1*/ IRTemp* u0,
+ UInt laneSzBlg2, IRTemp i0 )
+{
+ assign(*u0, mkexpr(i0));
+}
+
+
+/* Do deinterleaving for 2 128 bit vectors, for LD2 insns. */
+static
+void math_DEINTERLEAVE2_128( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
+ UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
+{
+ /* This is pretty easy, since we have primitives directly to
+ hand. */
+ if (laneSzBlg2 == 3) {
+ // 64x2
+ // i1 == B1 A1, i0 == B0 A0
+ // u1 == B1 B0, u0 == A1 A0
+ assign(*u0, binop(Iop_InterleaveLO64x2, mkexpr(i1), mkexpr(i0)));
+ assign(*u1, binop(Iop_InterleaveHI64x2, mkexpr(i1), mkexpr(i0)));
+ return;
+ }
+ if (laneSzBlg2 == 2) {
+ // 32x4
+ // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
+ // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
+ assign(*u0, binop(Iop_CatEvenLanes32x4, mkexpr(i1), mkexpr(i0)));
+ assign(*u1, binop(Iop_CatOddLanes32x4, mkexpr(i1), mkexpr(i0)));
+ return;
+ }
+ if (laneSzBlg2 == 1) {
+ // 16x8
+ // i0 == B3 A3 B2 A2 B1 A1 B0 A0
+ // i1 == B7 A7 B6 A6 B5 A5 B4 A4
+ // u1 == B{7..0}, u0 == A{7..0}
+ assign(*u0, binop(Iop_CatEvenLanes16x8, mkexpr(i1), mkexpr(i0)));
+ assign(*u1, binop(Iop_CatOddLanes16x8, mkexpr(i1), mkexpr(i0)));
+ return;
+ }
+ if (laneSzBlg2 == 0) {
+ // 8x16
+ // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
+ // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
+ // u1 == B{f..0}, u0 == A{f..0}
+ assign(*u0, binop(Iop_CatEvenLanes8x16, mkexpr(i1), mkexpr(i0)));
+ assign(*u1, binop(Iop_CatOddLanes8x16, mkexpr(i1), mkexpr(i0)));
+ return;
+ }
+ /*NOTREACHED*/
+ vassert(0);
+}
+
+
+/* Do deinterleaving for 3 128 bit vectors, for LD3 insns. */
+static
+void math_DEINTERLEAVE3_128(
+ /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
+ UInt laneSzBlg2,
+ IRTemp i0, IRTemp i1, IRTemp i2 )
+{
+ if (laneSzBlg2 == 3) {
+ // 64x2
+ // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
+ // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
+ assign(*u2, ILO64x2( ROL(EX(i2),8), EX(i1) ));
+ assign(*u1, ILO64x2( EX(i2), ROL(EX(i0),8) ));
+ assign(*u0, ILO64x2( ROL(EX(i1),8), EX(i0) ));
+ return;
+ }
+
+ if (laneSzBlg2 == 2) {
+ // 32x4
+ // i2 == C3 B3 A2 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
+ // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
+ // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
+ IRTemp t_a1c0b0a0 = newTempV128();
+ IRTemp t_a2c1b1a1 = newTempV128();
+ IRTemp t_a3c2b2a2 = newTempV128();
+ IRTemp t_a0c3b3a3 = newTempV128();
+ IRTemp p0 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p2 = newTempV128();
+ // Compute some intermediate values.
+ assign(t_a1c0b0a0, EX(i0));
+ assign(t_a2c1b1a1, SL(EX(i1),EX(i0),3*4));
+ assign(t_a3c2b2a2, SL(EX(i2),EX(i1),2*4));
+ assign(t_a0c3b3a3, SL(EX(i0),EX(i2),1*4));
+ // First deinterleave into lane-pairs
+ assign(p0, ILO32x4(EX(t_a2c1b1a1),EX(t_a1c0b0a0)));
+ assign(p1, ILO64x2(ILO32x4(EX(t_a0c3b3a3), EX(t_a3c2b2a2)),
+ IHI32x4(EX(t_a2c1b1a1), EX(t_a1c0b0a0))));
+ assign(p2, ILO32x4(ROR(EX(t_a0c3b3a3),1*4), ROR(EX(t_a3c2b2a2),1*4)));
+ // Then deinterleave at 64x2 granularity.
+ math_DEINTERLEAVE3_128(u0, u1, u2, 3, p0, p1, p2);
+ return;
+ }
+
+ if (laneSzBlg2 == 1) {
+ // 16x8
+ // u2 == C7 C6 C5 C4 C3 C2 C1 C0
+ // u1 == B7 B6 B5 B4 B3 B2 B1 B0
+ // u0 == A7 A6 A5 A4 A3 A2 A1 A0
+ //
+ // i2 == C7 B7 A7 C6 B6 A6 C5 B5
+ // i1 == A5 C4 B4 A4 C4 B3 A3 C2
+ // i0 == B2 A2 C1 B1 A1 C0 B0 A0
+ //
+ // p2 == C7 C6 B7 B6 A7 A6 C5 C4
+ // p1 == B5 B4 A5 A4 C3 C2 B3 B2
+ // p0 == A3 A2 C1 C0 B1 B0 A1 A0
+
+ IRTemp s0, s1, s2, s3, t0, t1, t2, t3, p0, p1, p2, c00111111;
+ s0 = s1 = s2 = s3
+ = t0 = t1 = t2 = t3 = p0 = p1 = p2 = c00111111 = IRTemp_INVALID;
+ newTempsV128_4(&s0, &s1, &s2, &s3);
+ newTempsV128_4(&t0, &t1, &t2, &t3);
+ newTempsV128_4(&p0, &p1, &p2, &c00111111);
+
+ // s0 == b2a2 c1b1a1 c0b0a0
+ // s1 == b4a4 c3b3c3 c2b2a2
+ // s2 == b6a6 c5b5a5 c4b4a4
+ // s3 == b0a0 c7b7a7 c6b6a6
+ assign(s0, EX(i0));
+ assign(s1, SL(EX(i1),EX(i0),6*2));
+ assign(s2, SL(EX(i2),EX(i1),4*2));
+ assign(s3, SL(EX(i0),EX(i2),2*2));
+
+ // t0 == 0 0 c1c0 b1b0 a1a0
+ // t1 == 0 0 c3c2 b3b2 a3a2
+ // t2 == 0 0 c5c4 b5b4 a5a4
+ // t3 == 0 0 c7c6 b7b6 a7a6
+ assign(c00111111, mkV128(0x0FFF));
+ assign(t0, AND( ILO16x8( ROR(EX(s0),3*2), EX(s0)), EX(c00111111)));
+ assign(t1, AND( ILO16x8( ROR(EX(s1),3*2), EX(s1)), EX(c00111111)));
+ assign(t2, AND( ILO16x8( ROR(EX(s2),3*2), EX(s2)), EX(c00111111)));
+ assign(t3, AND( ILO16x8( ROR(EX(s3),3*2), EX(s3)), EX(c00111111)));
+
+ assign(p0, OR2(EX(t0), SHL(EX(t1),6*2)));
+ assign(p1, OR2(SHL(EX(t2),4*2), SHR(EX(t1),2*2)));
+ assign(p2, OR2(SHL(EX(t3),2*2), SHR(EX(t2),4*2)));
+
+ // Then deinterleave at 32x4 granularity.
+ math_DEINTERLEAVE3_128(u0, u1, u2, 2, p0, p1, p2);
+ return;
+ }
+
+ if (laneSzBlg2 == 0) {
+ // 8x16. This is the same scheme as for 16x8, with twice the
+ // number of intermediate values.
+ //
+ // u2 == C{f..0}
+ // u1 == B{f..0}
+ // u0 == A{f..0}
+ //
+ // i2 == CBA{f} CBA{e} CBA{d} CBA{c} CBA{b} C{a}
+ // i1 == BA{a} CBA{9} CBA{8} CBA{7} CBA{6} CB{5}
+ // i0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
+ //
+ // p2 == C{fe} B{fe} A{fe} C{dc} B{dc} A{dc} C{ba} B{ba}
+ // p1 == A{ba} C{98} B{98} A{98} C{76} B{76} A{76} C{54}
+ // p0 == B{54} A{54} C{32} B{32} A{32} C{10} B{10} A{10}
+ //
+ IRTemp s0, s1, s2, s3, s4, s5, s6, s7,
+ t0, t1, t2, t3, t4, t5, t6, t7, p0, p1, p2, cMASK;
+ s0 = s1 = s2 = s3 = s4 = s5 = s6 = s7
+ = t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = p0 = p1 = p2 = cMASK
+ = IRTemp_INVALID;
+ newTempsV128_4(&s0, &s1, &s2, &s3);
+ newTempsV128_4(&s4, &s5, &s6, &s7);
+ newTempsV128_4(&t0, &t1, &t2, &t3);
+ newTempsV128_4(&t4, &t5, &t6, &t7);
+ newTempsV128_4(&p0, &p1, &p2, &cMASK);
+
+ // s0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
+ // s1 == A{7} CBA{6} CBA{5} CBA{4} CBA{3} CBA{2}
+ // s2 == A{9} CBA{8} CBA{7} CBA{6} CBA{5} CBA{4}
+ // s3 == A{b} CBA{a} CBA{9} CBA{8} CBA{7} CBA{6}
+ // s4 == A{d} CBA{c} CBA{b} CBA{a} CBA{9} CBA{8}
+ // s5 == A{f} CBA{e} CBA{d} CBA{c} CBA{b} CBA{a}
+ // s6 == A{1} CBA{0} CBA{f} CBA{e} CBA{d} CBA{c}
+ // s7 == A{3} CBA{2} CBA{1} CBA{0} CBA{f} CBA{e}
+ assign(s0, SL(EX(i1),EX(i0), 0));
+ assign(s1, SL(EX(i1),EX(i0), 6));
+ assign(s2, SL(EX(i1),EX(i0),12));
+ assign(s3, SL(EX(i2),EX(i1), 2));
+ assign(s4, SL(EX(i2),EX(i1), 8));
+ assign(s5, SL(EX(i2),EX(i1),14));
+ assign(s6, SL(EX(i0),EX(i2), 4));
+ assign(s7, SL(EX(i0),EX(i2),10));
+
+ // t0 == 0--(ten)--0 C1 C0 B1 B0 A1 A0
+ // t1 == 0--(ten)--0 C3 C2 B3 B2 A3 A2
+ // t2 == 0--(ten)--0 C5 C4 B5 B4 A5 A4
+ // t3 == 0--(ten)--0 C7 C6 B7 B6 A7 A6
+ // t4 == 0--(ten)--0 C9 C8 B9 B8 A9 A8
+ // t5 == 0--(ten)--0 Cb Ca Bb Ba Ab Aa
+ // t6 == 0--(ten)--0 Cd Cc Bd Bc Ad Ac
+ // t7 == 0--(ten)--0 Cf Ce Bf Be Af Ae
+ assign(cMASK, mkV128(0x003F));
+ assign(t0, AND( ILO8x16( ROR(EX(s0),3), EX(s0)), EX(cMASK)));
+ assign(t1, AND( ILO8x16( ROR(EX(s1),3), EX(s1)), EX(cMASK)));
+ assign(t2, AND( ILO8x16( ROR(EX(s2),3), EX(s2)), EX(cMASK)));
+ assign(t3, AND( ILO8x16( ROR(EX(s3),3), EX(s3)), EX(cMASK)));
+ assign(t4, AND( ILO8x16( ROR(EX(s4),3), EX(s4)), EX(cMASK)));
+ assign(t5, AND( ILO8x16( ROR(EX(s5),3), EX(s5)), EX(cMASK)));
+ assign(t6, AND( ILO8x16( ROR(EX(s6),3), EX(s6)), EX(cMASK)));
+ assign(t7, AND( ILO8x16( ROR(EX(s7),3), EX(s7)), EX(cMASK)));
+
+ assign(p0, OR3( SHL(EX(t2),12), SHL(EX(t1),6), EX(t0) ));
+ assign(p1, OR4( SHL(EX(t5),14), SHL(EX(t4),8),
+ SHL(EX(t3),2), SHR(EX(t2),4) ));
+ assign(p2, OR3( SHL(EX(t7),10), SHL(EX(t6),4), SHR(EX(t5),2) ));
+
+ // Then deinterleave at 16x8 granularity.
+ math_DEINTERLEAVE3_128(u0, u1, u2, 1, p0, p1, p2);
+ return;
+ }
+
+ /*NOTREACHED*/
+ vassert(0);
+}
+
+
+/* Do deinterleaving for 4 128 bit vectors, for LD4 insns. */
+static
+void math_DEINTERLEAVE4_128(
+ /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
+ UInt laneSzBlg2,
+ IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
+{
+ if (laneSzBlg2 == 3) {
+ // 64x2
+ assign(*u0, ILO64x2(EX(i2), EX(i0)));
+ assign(*u1, IHI64x2(EX(i2), EX(i0)));
+ assign(*u2, ILO64x2(EX(i3), EX(i1)));
+ assign(*u3, IHI64x2(EX(i3), EX(i1)));
+ return;
+ }
+ if (laneSzBlg2 == 2) {
+ // 32x4
+ IRTemp p0 = newTempV128();
+ IRTemp p2 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p3 = newTempV128();
+ assign(p0, ILO32x4(EX(i1), EX(i0)));
+ assign(p1, IHI32x4(EX(i1), EX(i0)));
+ assign(p2, ILO32x4(EX(i3), EX(i2)));
+ assign(p3, IHI32x4(EX(i3), EX(i2)));
+ // And now do what we did for the 64-bit case.
+ math_DEINTERLEAVE4_128(u0, u1, u2, u3, 3, p0, p1, p2, p3);
+ return;
+ }
+ if (laneSzBlg2 == 1) {
+ // 16x8
+ // Deinterleave into 32-bit chunks, then do as the 32-bit case.
+ IRTemp p0 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p2 = newTempV128();
+ IRTemp p3 = newTempV128();
+ assign(p0, IHI16x8(EX(i0), SHL(EX(i0), 8)));
+ assign(p1, IHI16x8(EX(i1), SHL(EX(i1), 8)));
+ assign(p2, IHI16x8(EX(i2), SHL(EX(i2), 8)));
+ assign(p3, IHI16x8(EX(i3), SHL(EX(i3), 8)));
+ // From here on is like the 32 bit case.
+ math_DEINTERLEAVE4_128(u0, u1, u2, u3, 2, p0, p1, p2, p3);
+ return;
+ }
+ if (laneSzBlg2 == 0) {
+ // 8x16
+ // Deinterleave into 16-bit chunks, then do as the 16-bit case.
+ IRTemp p0 = newTempV128();
+ IRTemp p1 = newTempV128();
+ IRTemp p2 = newTempV128();
+ IRTemp p3 = newTempV128();
+ assign(p0, IHI64x2( IHI8x16(EX(i0),ROL(EX(i0),4)),
+ ILO8x16(EX(i0),ROL(EX(i0),4)) ));
+ assign(p1, IHI64x2( IHI8x16(EX(i1),ROL(EX(i1),4)),
+ ILO8x16(EX(i1),ROL(EX(i1),4)) ));
+ assign(p2, IHI64x2( IHI8x16(EX(i2),ROL(EX(i2),4)),
+ ILO8x16(EX(i2),ROL(EX(i2),4)) ));
+ assign(p3, IHI64x2( IHI8x16(EX(i3),ROL(EX(i3),4)),
+ ILO8x16(EX(i3),ROL(EX(i3),4)) ));
+ // From here on is like the 16 bit case.
+ math_DEINTERLEAVE4_128(u0, u1, u2, u3, 1, p0, p1, p2, p3);
+ return;
+ }
+ /*NOTREACHED*/
+ vassert(0);
+}
+
+
+/* Wrappers that use the full-width (de)interleavers to do half-width
+ (de)interleaving. The scheme is to clone each input lane in the
+ lower half of each incoming value, do a full width (de)interleave
+   at the next lane size up, and remove every other lane of the
+ result. The returned values may have any old junk in the upper
+ 64 bits -- the caller must ignore that. */
+
+/* Helper function -- get doubling and narrowing operations. */
+static
+void math_get_doubler_and_halver ( /*OUT*/IROp* doubler,
+ /*OUT*/IROp* halver,
+ UInt laneSzBlg2 )
+{
+ switch (laneSzBlg2) {
+ case 2:
+ *doubler = Iop_InterleaveLO32x4; *halver = Iop_CatEvenLanes32x4;
+ break;
+ case 1:
+ *doubler = Iop_InterleaveLO16x8; *halver = Iop_CatEvenLanes16x8;
+ break;
+ case 0:
+ *doubler = Iop_InterleaveLO8x16; *halver = Iop_CatEvenLanes8x16;
+ break;
+ default:
+ vassert(0);
+ }
+}
+
+/* Do interleaving for 1 64 bit vector, for ST1 insns. */
+static
+void math_INTERLEAVE1_64( /*OUTx1*/ IRTemp* i0,
+ UInt laneSzBlg2, IRTemp u0 )
+{
+ assign(*i0, mkexpr(u0));
+}
+
+
+/* Do interleaving for 2 64 bit vectors, for ST2 insns. */
+static
+void math_INTERLEAVE2_64( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
+ UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
+{
+ if (laneSzBlg2 == 3) {
+ // 1x64, degenerate case
+ assign(*i0, EX(u0));
+ assign(*i1, EX(u1));
+ return;
+ }
+
+ vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
+ IROp doubler = Iop_INVALID, halver = Iop_INVALID;
+ math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
+
+ IRTemp du0 = newTempV128();
+ IRTemp du1 = newTempV128();
+ assign(du0, binop(doubler, EX(u0), EX(u0)));
+ assign(du1, binop(doubler, EX(u1), EX(u1)));
+ IRTemp di0 = newTempV128();
+ IRTemp di1 = newTempV128();
+ math_INTERLEAVE2_128(&di0, &di1, laneSzBlg2 + 1, du0, du1);
+ assign(*i0, binop(halver, EX(di0), EX(di0)));
+ assign(*i1, binop(halver, EX(di1), EX(di1)));
+}
+
+
+/* Do interleaving for 3 64 bit vectors, for ST3 insns. */
+static
+void math_INTERLEAVE3_64(
+ /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
+ UInt laneSzBlg2,
+ IRTemp u0, IRTemp u1, IRTemp u2 )
+{
+ if (laneSzBlg2 == 3) {
+ // 1x64, degenerate case
+ assign(*i0, EX(u0));
+ assign(*i1, EX(u1));
+ assign(*i2, EX(u2));
+ return;
+ }
+
+ vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
+ IROp doubler = Iop_INVALID, halver = Iop_INVALID;
+ math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
+
+ IRTemp du0 = newTempV128();
+ IRTemp du1 = newTempV128();
+ IRTemp du2 = newTempV128();
+ assign(du0, binop(doubler, EX(u0), EX(u0)));
+ assign(du1, binop(doubler, EX(u1), EX(u1)));
+ assign(du2, binop(doubler, EX(u2), EX(u2)));
+ IRTemp di0 = newTempV128();
+ IRTemp di1 = newTempV128();
+ IRTemp di2 = newTempV128();
+ math_INTERLEAVE3_128(&di0, &di1, &di2, laneSzBlg2 + 1, du0, du1, du2);
+ assign(*i0, binop(halver, EX(di0), EX(di0)));
+ assign(*i1, binop(halver, EX(di1), EX(di1)));
+ assign(*i2, binop(halver, EX(di2), EX(di2)));
+}
+
+
+/* Do interleaving for 4 64 bit vectors, for ST4 insns. */
+static
+void math_INTERLEAVE4_64(
+ /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
+ UInt laneSzBlg2,
+ IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
+{
+ if (laneSzBlg2 == 3) {
+ // 1x64, degenerate case
+ assign(*i0, EX(u0));
+ assign(*i1, EX(u1));
+ assign(*i2, EX(u2));
+ assign(*i3, EX(u3));
+ return;
+ }
+
+ vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
+ IROp doubler = Iop_INVALID, halver = Iop_INVALID;
+ math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
+
+ IRTemp du0 = newTempV128();
+ IRTemp du1 = newTempV128();
+ IRTemp du2 = newTempV128();
+ IRTemp du3 = newTempV128();
+ assign(du0, binop(doubler, EX(u0), EX(u0)));
+ assign(du1, binop(doubler, EX(u1), EX(u1)));
+ assign(du2, binop(doubler, EX(u2), EX(u2)));
+ assign(du3, binop(doubler, EX(u3), EX(u3)));
+ IRTemp di0 = newTempV128();
+ IRTemp di1 = newTempV128();
+ IRTemp di2 = newTempV128();
+ IRTemp di3 = newTempV128();
+ math_INTERLEAVE4_128(&di0, &di1, &di2, &di3,
+ laneSzBlg2 + 1, du0, du1, du2, du3);
+ assign(*i0, binop(halver, EX(di0), EX(di0)));
+ assign(*i1, binop(halver, EX(di1), EX(di1)));
+ assign(*i2, binop(halver, EX(di2), EX(di2)));
+ assign(*i3, binop(halver, EX(di3), EX(di3)));
+}
+
+
+/* Do deinterleaving for 1 64 bit vector, for LD1 insns. */
+static
+void math_DEINTERLEAVE1_64( /*OUTx1*/ IRTemp* u0,
+ UInt laneSzBlg2, IRTemp i0 )
+{
+ assign(*u0, mkexpr(i0));
+}
+
+
+/* Do deinterleaving for 2 64 bit vectors, for LD2 insns. */
+static
+void math_DEINTERLEAVE2_64( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
+ UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
+{
+ if (laneSzBlg2 == 3) {
+ // 1x64, degenerate case
+ assign(*u0, EX(i0));
+ assign(*u1, EX(i1));
+ return;
+ }
+
+ vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
+ IROp doubler = Iop_INVALID, halver = Iop_INVALID;
+ math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
+
+ IRTemp di0 = newTempV128();
+ IRTemp di1 = newTempV128();
+ assign(di0, binop(doubler, EX(i0), EX(i0)));
+ assign(di1, binop(doubler, EX(i1), EX(i1)));
+
+ IRTemp du0 = newTempV128();
+ IRTemp du1 = newTempV128();
+ math_DEINTERLEAVE2_128(&du0, &du1, laneSzBlg2 + 1, di0, di1);
+ assign(*u0, binop(halver, EX(du0), EX(du0)));
+ assign(*u1, binop(halver, EX(du1), EX(du1)));
+}
+
+
+/* Do deinterleaving for 3 64 bit vectors, for LD3 insns. */
+static
+void math_DEINTERLEAVE3_64(
+ /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
+ UInt laneSzBlg2,
+ IRTemp i0, IRTemp i1, IRTemp i2 )
+{
+ if (laneSzBlg2 == 3) {
+ // 1x64, degenerate case
+ assign(*u0, EX(i0));
+ assign(*u1, EX(i1));
+ assign(*u2, EX(i2));
+ return;
+ }
+
+ vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
+ IROp doubler = Iop_INVALID, halver = Iop_INVALID;
+ math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
+
+ IRTemp di0 = newTempV128();
+ IRTemp di1 = newTempV128();
+ IRTemp di2 = newTempV128();
+ assign(di0, binop(doubler, EX(i0), EX(i0)));
+ assign(di1, binop(doubler, EX(i1), EX(i1)));
+ assign(di2, binop(doubler, EX(i2), EX(i2)));
+ IRTemp du0 = newTempV128();
+ IRTemp du1 = newTempV128();
+ IRTemp du2 = newTempV128();
+ math_DEINTERLEAVE3_128(&du0, &du1, &du2, laneSzBlg2 + 1, di0, di1, di2);
+ assign(*u0, binop(halver, EX(du0), EX(du0)));
+ assign(*u1, binop(halver, EX(du1), EX(du1)));
+ assign(*u2, binop(halver, EX(du2), EX(du2)));
+}
+
+
+/* Do deinterleaving for 4 64 bit vectors, for LD4 insns. */
+static
+void math_DEINTERLEAVE4_64(
+ /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
+ UInt laneSzBlg2,
+ IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
+{
+ if (laneSzBlg2 == 3) {
+ // 1x64, degenerate case
+ assign(*u0, EX(i0));
+ assign(*u1, EX(i1));
+ assign(*u2, EX(i2));
+ assign(*u3, EX(i3));
+ return;
+ }
+
+ vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
+ IROp doubler = Iop_INVALID, halver = Iop_INVALID;
+ math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
+
+ IRTemp di0 = newTempV128();
+ IRTemp di1 = newTempV128();
+ IRTemp di2 = newTempV128();
+ IRTemp di3 = newTempV128();
+ assign(di0, binop(doubler, EX(i0), EX(i0)));
+ assign(di1, binop(doubler, EX(i1), EX(i1)));
+ assign(di2, binop(doubler, EX(i2), EX(i2)));
+ assign(di3, binop(doubler, EX(i3), EX(i3)));
+ IRTemp du0 = newTempV128();
+ IRTemp du1 = newTempV128();
+ IRTemp du2 = newTempV128();
+ IRTemp du3 = newTempV128();
+ math_DEINTERLEAVE4_128(&du0, &du1, &du2, &du3,
+ laneSzBlg2 + 1, di0, di1, di2, di3);
+ assign(*u0, binop(halver, EX(du0), EX(du0)));
+ assign(*u1, binop(halver, EX(du1), EX(du1)));
+ assign(*u2, binop(halver, EX(du2), EX(du2)));
+ assign(*u3, binop(halver, EX(du3), EX(du3)));
+}
+
+
+#undef EX
+#undef SL
+#undef ROR
+#undef ROL
+#undef SHR
+#undef SHL
+#undef ILO64x2
+#undef IHI64x2
+#undef ILO32x4
+#undef IHI32x4
+#undef ILO16x8
+#undef IHI16x8
+#undef ILO16x8
+#undef IHI16x8
+#undef CEV32x4
+#undef COD32x4
+#undef COD16x8
+#undef COD8x16
+#undef CEV8x16
+#undef AND
+#undef OR2
+#undef OR3
+#undef OR4
+
+
+/*------------------------------------------------------------*/
/*--- Load and Store instructions ---*/
/*------------------------------------------------------------*/
@@ -3526,7 +4470,7 @@
case BITS3(0,1,0): break; // 32 bit, valid for both int and vec
case BITS3(0,1,1): break; // 64 bit, valid for both int and vec
case BITS3(1,0,0): // can only ever be valid for the vector case
- if (isInt) goto fail; else goto fail;
+ if (isInt) goto fail; else break;
case BITS3(1,0,1): // these sizes are never valid
case BITS3(1,1,0):
case BITS3(1,1,1): goto fail;
@@ -4250,21 +5194,21 @@
sz==11 isn't allowed
simm7 is scaled by the (single-register) transfer size
- 31 29 22 21 14 9 4
- sz 101 1001 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP], #imm
- (at-Rn-then-Rn=EA)
+ 31 29 26 22 21 14 9 4
- sz 101 1011 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]!
- (at-EA-then-Rn=EA)
+ sz 101 1000 L imm7 t2 n t1 mmNP SDQt1, SDQt2, [Xn|SP, #imm]
+ (at-EA, with nontemporal hint)
+
+ sz 101 1001 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP], #imm
+ (at-Rn-then-Rn=EA)
sz 101 1010 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]
- (at-EA)
- */
+ (at-EA)
- UInt insn_29_23 = INSN(29,23);
- if (insn_29_23 == BITS7(1,0,1,1,0,0,1)
- || insn_29_23 == BITS7(1,0,1,1,0,1,1)
- || insn_29_23 == BITS7(1,0,1,1,0,1,0)) {
+ sz 101 1011 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]!
+ (at-EA-then-Rn=EA)
+ */
+ if (INSN(29,25) == BITS5(1,0,1,1,0)) {
UInt szSlg2 = INSN(31,30); // log2 of the xfer size in 32-bit units
Bool isLD = INSN(22,22) == 1;
Bool wBack = INSN(23,23) == 1;
@@ -4293,6 +5237,7 @@
case BITS2(1,1):
assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
case BITS2(1,0):
+ case BITS2(0,0):
assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
default:
vassert(0); /* NOTREACHED */
@@ -4358,6 +5303,9 @@
case BITS2(1,0):
fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
break;
+ case BITS2(0,0):
+ fmt_str = "%snp %s, %s, [%s, #%lld] (at-Rn)\n";
+ break;
default:
vassert(0);
}
@@ -4389,7 +5337,7 @@
UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
Bool isLD = INSN(22,22) == 1;
UInt tt = INSN(4,0);
- if (szLg2 >= 4) goto after_LDR_STR_vector_register;
+ if (szLg2 > 4) goto after_LDR_STR_vector_register;
IRTemp ea = gen_indexed_EA(dis_buf, insn, False/*to/from vec regs*/);
if (ea == IRTemp_INVALID) goto after_LDR_STR_vector_register;
switch (szLg2) {
@@ -4433,8 +5381,17 @@
DIP("str %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf);
}
break;
- case 4: return False; //ATC
- default: vassert(0);
+ case 4:
+ if (isLD) {
+ putQReg128(tt, loadLE(Ity_V128, mkexpr(ea)));
+ DIP("ldr %s, %s\n", nameQReg128(tt), dis_buf);
+ } else {
+ storeLE(mkexpr(ea), getQReg128(tt));
+ DIP("str %s, %s\n", nameQReg128(tt), dis_buf);
+ }
+ break;
+ default:
+ vassert(0);
}
return True;
}
@@ -4643,140 +5600,217 @@
return True;
}
- /* ---------- LD1/ST1 (single structure, no offset) ---------- */
- /* 31 23
- 0100 1100 0100 0000 0111 11 N T LD1 {vT.2d}, [Xn|SP]
- 0100 1100 0000 0000 0111 11 N T ST1 {vT.2d}, [Xn|SP]
- 0100 1100 0100 0000 0111 10 N T LD1 {vT.4s}, [Xn|SP]
- 0100 1100 0000 0000 0111 10 N T ST1 {vT.4s}, [Xn|SP]
- 0100 1100 0100 0000 0111 01 N T LD1 {vT.8h}, [Xn|SP]
- 0100 1100 0000 0000 0111 01 N T ST1 {vT.8h}, [Xn|SP]
- 0100 1100 0100 0000 0111 00 N T LD1 {vT.16b}, [Xn|SP]
- 0100 1100 0000 0000 0111 00 N T ST1 {vT.16b}, [Xn|SP]
- FIXME does this assume that the host is little endian?
- */
- if ( (insn & 0xFFFFF000) == 0x4C407000 // LD1 cases
- || (insn & 0xFFFFF000) == 0x4C007000 // ST1 cases
- ) {
- Bool isLD = INSN(22,22) == 1;
- UInt rN = INSN(9,5);
- UInt vT = INSN(4,0);
- IRTemp tEA = newTemp(Ity_I64);
- const HChar* names[4] = { "2d", "4s", "8h", "16b" };
- const HChar* name = names[INSN(11,10)];
- assign(tEA, getIReg64orSP(rN));
- if (rN == 31) { /* FIXME generate stack alignment check */ }
- if (isLD) {
- putQReg128(vT, loadLE(Ity_V128, mkexpr(tEA)));
- } else {
- storeLE(mkexpr(tEA), getQReg128(vT));
+ /* ------ LD1/ST1 (multiple 1-elem structs to/from 1 reg ------ */
+ /* ------ LD2/ST2 (multiple 2-elem structs to/from 2 regs ------ */
+ /* ------ LD3/ST3 (multiple 3-elem structs to/from 3 regs ------ */
+ /* ------ LD4/ST4 (multiple 4-elem structs to/from 4 regs ------ */
+ /* 31 29 26 22 21 20 15 11 9 4
+
+ 0q 001 1000 L 0 00000 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP]
+ 0q 001 1001 L 0 m 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP], step
+
+ 0q 001 1000 L 0 00000 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP]
+ 0q 001 1001 L 0 m 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP], step
+
+ 0q 001 1000 L 0 00000 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP]
+ 0q 001 1001 L 0 m 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP], step
+
+ 0q 001 1000 L 0 00000 0111 sz n t xx1 {Vt.T}, [Xn|SP]
+ 0q 001 1001 L 0 m 0111 sz n t xx1 {Vt.T}, [Xn|SP], step
+
+ T = defined by Q and sz in the normal way
+ step = if m == 11111 then transfer-size else Xm
+ xx = case L of 1 -> LD ; 0 -> ST
+ */
+ if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
+ && INSN(21,21) == 0) {
+ Bool bitQ = INSN(30,30);
+ Bool isPX = INSN(23,23) == 1;
+ Bool isLD = INSN(22,22) == 1;
+ UInt mm = INSN(20,16);
+ UInt opc = INSN(15,12);
+ UInt sz = INSN(11,10);
+ UInt nn = INSN(9,5);
+ UInt tt = INSN(4,0);
+ Bool isQ = bitQ == 1;
+ Bool is1d = sz == BITS2(1,1) && !isQ;
+ UInt nRegs = 0;
+ switch (opc) {
+ case BITS4(0,0,0,0): nRegs = 4; break;
+ case BITS4(0,1,0,0): nRegs = 3; break;
+ case BITS4(1,0,0,0): nRegs = 2; break;
+ case BITS4(0,1,1,1): nRegs = 1; break;
+ default: break;
}
- DIP("%s {v%u.%s}, [%s]\n", isLD ? "ld1" : "st1",
- vT, name, nameIReg64orSP(rN));
- return True;
- }
+
+ if (nRegs == 1 /* .1d is allowed */
+ || (nRegs >= 2 && nRegs <= 4 && !is1d) /* .1d is not allowed */) {
- /* 31 23
- 0000 1100 0100 0000 0111 11 N T LD1 {vT.1d}, [Xn|SP]
- 0000 1100 0000 0000 0111 11 N T ST1 {vT.1d}, [Xn|SP]
- 0000 1100 0100 0000 0111 10 N T LD1 {vT.2s}, [Xn|SP]
- 0000 1100 0000 0000 0111 10 N T ST1 {vT.2s}, [Xn|SP]
- 0000 1100 0100 0000 0111 01 N T LD1 {vT.4h}, [Xn|SP]
- 0000 1100 0000 0000 0111 01 N T ST1 {vT.4h}, [Xn|SP]
- 0000 1100 0100 0000 0111 00 N T LD1 {vT.8b}, [Xn|SP]
- 0000 1100 0000 0000 0111 00 N T ST1 {vT.8b}, [Xn|SP]
- FIXME does this assume that the host is little endian?
- */
- if ( (insn & 0xFFFFF000) == 0x0C407000 // LD1 cases
- || (insn & 0xFFFFF000) == 0x0C007000 // ST1 cases
- ) {
- Bool isLD = INSN(22,22) == 1;
- UInt rN = INSN(9,5);
- UInt vT = INSN(4,0);
- IRTemp tEA = newTemp(Ity_I64);
- const HChar* names[4] = { "1d", "2s", "4h", "8b" };
- const HChar* name = names[INSN(11,10)];
- assign(tEA, getIReg64orSP(rN));
- if (rN == 31) { /* FIXME generate stack alignment check */ }
- if (isLD) {
- putQRegLane(vT, 0, loadLE(Ity_I64, mkexpr(tEA)));
- putQRegLane(vT, 1, mkU64(0));
- } else {
- storeLE(mkexpr(tEA), getQRegLane(vT, 0, Ity_I64));
- }
- DIP("%s {v%u.%s}, [%s]\n", isLD ? "ld1" : "st1",
- vT, name, nameIReg64orSP(rN));
- return True;
- }
+ UInt xferSzB = (isQ ? 16 : 8) * nRegs;
- /* ---------- LD1/ST1 (single structure, post index) ---------- */
- /* 31 23
- 0100 1100 1001 1111 0111 11 N T ST1 {vT.2d}, [xN|SP], #16
- 0100 1100 1101 1111 0111 11 N T LD1 {vT.2d}, [xN|SP], #16
- 0100 1100 1001 1111 0111 10 N T ST1 {vT.4s}, [xN|SP], #16
- 0100 1100 1101 1111 0111 10 N T LD1 {vT.4s}, [xN|SP], #16
- 0100 1100 1001 1111 0111 01 N T ST1 {vT.8h}, [xN|SP], #16
- 0100 1100 1101 1111 0111 01 N T LD1 {vT.8h}, [xN|SP], #16
- 0100 1100 1001 1111 0111 00 N T ST1 {vT.16b}, [xN|SP], #16
- 0100 1100 1101 1111 0111 00 N T LD1 {vT.16b}, [xN|SP], #16
- Note that #16 is implied and cannot be any other value.
- FIXME does this assume that the host is little endian?
- */
- if ( (insn & 0xFFFFF000) == 0x4CDF7000 // LD1 cases
- || (insn & 0xFFFFF000) == 0x4C9F7000 // ST1 cases
- ) {
- Bool isLD = INSN(22,22) == 1;
- UInt rN = INSN(9,5);
- UInt vT = INSN(4,0);
- IRTemp tEA = newTemp(Ity_I64);
- const HChar* names[4] = { "2d", "4s", "8h", "16b" };
- const HChar* name = names[INSN(11,10)];
- assign(tEA, getIReg64orSP(rN));
- if (rN == 31) { /* FIXME generate stack alignment check */ }
- if (isLD) {
- putQReg128(vT, loadLE(Ity_V128, mkexpr(tEA)));
- } else {
- storeLE(mkexpr(tEA), getQReg128(vT));
- }
- putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(16)));
- DIP("%s {v%u.%s}, [%s], #16\n", isLD ? "ld1" : "st1",
- vT, name, nameIReg64orSP(rN));
- return True;
- }
+ /* Generate the transfer address (TA) and if necessary the
+ writeback address (WB) */
+ IRTemp tTA = newTemp(Ity_I64);
+ assign(tTA, getIReg64orSP(nn));
+ if (nn == 31) { /* FIXME generate stack alignment check */ }
+ IRTemp tWB = IRTemp_INVALID;
+ if (isPX) {
+ tWB = newTemp(Ity_I64);
+ assign(tWB, binop(Iop_Add64,
+ mkexpr(tTA),
+ mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
+ : getIReg64orZR(mm)));
+ }
+
+ /* -- BEGIN generate the transfers -- */
+
+ IRTemp u0, u1, u2, u3, i0, i1, i2, i3;
+ u0 = u1 = u2 = u3 = i0 = i1 = i2 = i3 = IRTemp_INVALID;
+ switch (nRegs) {
+ case 4: u3 = newTempV128(); i3 = newTempV128(); /* fallthru */
+ case 3: u2 = newTempV128(); i2 = newTempV128(); /* fallthru */
+ case 2: u1 = newTempV128(); i1 = newTempV128(); /* fallthru */
+ case 1: u0 = newTempV128(); i0 = newTempV128(); break;
+ default: vassert(0);
+ }
- /* 31 23
- 0000 1100 1001 1111 0111 11 N T ST1 {vT.1d}, [xN|SP], #8
- 0000 1100 1101 1111 0111 11 N T LD1 {vT.1d}, [xN|SP], #8
- 0000 1100 1001 1111 0111 10 N T ST1 {vT.2s}, [xN|SP], #8
- 0000 1100 1101 1111 0111 10 N T LD1 {vT.2s}, [xN|SP], #8
- 0000 1100 1001 1111 0111 01 N T ST1 {vT.4h}, [xN|SP], #8
- 0000 1100 1101 1111 0111 01 N T LD1 {vT.4h}, [xN|SP], #8
- 0000 1100 1001 1111 0111 00 N T ST1 {vT.8b}, [xN|SP], #8
- 0000 1100 1101 1111 0111 00 N T LD1 {vT.8b}, [xN|SP], #8
- Note that #8 is implied and cannot be any other value.
- FIXME does this assume that the host is little endian?
- */
- if ( (insn & 0xFFFFF000) == 0x0CDF7000 // LD1 cases
- || (insn & 0xFFFFF000) == 0x0C9F7000 // ST1 cases
- ) {
- Bool isLD = INSN(22,22) == 1;
- UInt rN = INSN(9,5);
- UInt vT = INSN(4,0);
- IRTemp tEA = newTemp(Ity_I64);
- const HChar* names[4] = { "1d", "2s", "4h", "8b" };
- const HChar* name = names[INSN(11,10)];
- assign(tEA, getIReg64orSP(rN));
- if (rN == 31) { /* FIXME generate stack alignment check */ }
- if (isLD) {
- putQRegLane(vT, 0, loadLE(Ity_I64, mkexpr(tEA)));
- putQRegLane(vT, 1, mkU64(0));
- } else {
- storeLE(mkexpr(tEA), getQRegLane(vT, 0, Ity_I64));
+ /* -- Multiple 128 or 64 bit stores -- */
+ if (!isLD) {
+ switch (nRegs) {
+ case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */
+ case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */
+ case 2: assign(u1, getQReg128((tt+1) % 32)); /* fallthru */
+ case 1: assign(u0, getQReg128((tt+0) % 32)); break;
+ default: vassert(0);
+ }
+ switch (nRegs) {
+ case 4: (isQ ? math_INTERLEAVE4_128 : math_INTERLEAVE4_64)
+ (&i0, &i1, &i2, &i3, sz, u0, u1, u2, u3);
+ break;
+ case 3: (isQ ? math_INTERLEAVE3_128 : math_INTERLEAVE3_64)
+ (&i0, &i1, &i2, sz, u0, u1, u2);
+ break;
+ case 2: (isQ ? math_INTERLEAVE2_128 : math_INTERLEAVE2_64)
+ (&i0, &i1, sz, u0, u1);
+ break;
+ case 1: (isQ ? math_INTERLEAVE1_128 : math_INTERLEAVE1_64)
+ (&i0, sz, u0);
+ break;
+ default: vassert(0);
+ }
+# define MAYBE_NARROW_TO_64(_expr) \
+ (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
+ UInt step = isQ ? 16 : 8;
+ switch (nRegs) {
+ case 4: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)),
+ MAYBE_NARROW_TO_64(mkexpr(i3)) );
+ /* fallthru */
+ case 3: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)),
+ MAYBE_NARROW_TO_64(mkexpr(i2)) );
+ /* fallthru */
+ case 2: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)),
+ MAYBE_NARROW_TO_64(mkexpr(i1)) );
+ /* fallthru */
+ case 1: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)),
+ MAYBE_NARROW_TO_64(mkexpr(i0)) );
+ break;
+ default: vassert(0);
+ }
+# undef MAYBE_NARROW_TO_64
+ }
+
+ /* -- Multiple 128 or 64 bit loads -- */
+ else /* isLD */ {
+ UInt step = isQ ? 16 : 8;
+ IRType loadTy = isQ ? Ity_V128 : Ity_I64;
+# define MAYBE_WIDEN_FROM_64(_expr) \
+ (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
+ switch (nRegs) {
+ case 4:
+ assign(i3, MAYBE_WIDEN_FROM_64(
+ loadLE(loadTy,
+ binop(Iop_Add64, mkexpr(tTA),
+ mkU64(3 * step)))));
+ /* fallthru */
+ case 3:
+ assign(i2, MAYBE_WIDEN_FROM_64(
+ loadLE(loadTy,
+ binop(Iop_Add64, mkexpr(tTA),
+ mkU64(2 * step)))));
+ /* fallthru */
+ case 2:
+ assign(i1, MAYBE_WIDEN_FROM_64(
+ loadLE(loadTy,
+ binop(Iop_Add64, mkexpr(tTA),
+ mkU64(1 * step)))));
+ /* fallthru */
+
+ case 1:
+ assign(i0, MAYBE_WIDEN_FROM_64(
+ loadLE(loadTy,
+ binop(Iop_Add64, mkexpr(tTA),
+ mkU64(0 * step)))));
+ break;
+ default:
+ vassert(0);
+ }
+# undef MAYBE_WIDEN_FROM_64
+ switch (nRegs) {
+ case 4: (isQ ? math_DEINTERLEAVE4_128 : math_DEINTERLEAVE4_64)
+ (&u0, &u1, &u2, &u3, sz, i0,i1,i2,i3);
+ break;
+ case 3: (isQ ? math_DEINTERLEAVE3_128 : math_DEINTERLEAVE3_64)
+ (&u0, &u1, &u2, sz, i0, i1, i2);
+ break;
+ case 2: (isQ ? math_DEINTERLEAVE2_128 : math_DEINTERLEAVE2_64)
+ (&u0, &u1, sz, i0, i1);
+ break;
+ case 1: (isQ ? math_DEINTERLEAVE1_128 : math_DEINTERLEAVE1_64)
+ (&u0, sz, i0);
+ break;
+ default: vassert(0);
+ }
+ switch (nRegs) {
+ case 4: putQReg128( (tt+3) % 32,
+ math_MAYBE_ZERO_HI64(bitQ, u3));
+ /* fallthru */
+ case 3: putQReg128( (tt+2) % 32,
+ math_MAYBE_ZERO_HI64(bitQ, u2));
+ /* fallthru */
+ case 2: putQReg128( (tt+1) % 32,
+ math_MAYBE_ZERO_HI64(bitQ, u1));
+ /* fallthru */
+ case 1: putQReg128( (tt+0) % 32,
+ math_MAYBE_ZERO_HI64(bitQ, u0));
+ break;
+ default: vassert(0);
+ }
+ }
+
+ /* -- END generate the transfers -- */
+
+ /* Do the writeback, if necessary */
+ if (isPX) {
+ putIReg64orSP(nn, mkexpr(tWB));
+ }
+
+ HChar pxStr[20];
+ pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
+ if (isPX) {
+ if (mm == BITS5(1,1,1,1,1))
+ vex_sprintf(pxStr, ", #%u", xferSzB);
+ else
+ vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
+ }
+ const HChar* arr = nameArr_Q_SZ(bitQ, sz);
+ DIP("%s%u {v%u.%s .. v%u.%s}, [%s]%s\n",
+ isLD ? "ld" : "st", nRegs,
+ (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
+ pxStr);
+
+ return True;
}
- putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(8)));
- DIP("%s {v%u.%s}, [%s], #8\n", isLD ? "ld1" : "st1",
- vT, name, nameIReg64orSP(rN));
- return True;
+ /* else fall through */
}
/* ---------- LD1R (single structure, replicate) ---------- */
@@ -4824,81 +5858,6 @@
return False;
}
- /* -------- LD2/ST2 (multi 2-elem structs, 2 regs, post index) -------- */
- /* Only a very few cases. */
- /* 31 23 11 9 4
- 0100 1100 1101 1111 1000 11 n t LD2 {Vt.2d, V(t+1)%32.2d}, [Xn|SP], #32
- 0100 1100 1001 1111 1000 11 n t ST2 {Vt.2d, V(t+1)%32.2d}, [Xn|SP], #32
- 0100 1100 1101 1111 1000 10 n t LD2 {Vt.4s, V(t+1)%32.4s}, [Xn|SP], #32
- 0100 1100 1001 1111 1000 10 n t ST2 {Vt.4s, V(t+1)%32.4s}, [Xn|SP], #32
- */
- if ( (insn & 0xFFFFFC00) == 0x4CDF8C00 // LD2 .2d
- || (insn & 0xFFFFFC00) == 0x4C9F8C00 // ST2 .2d
- || (insn & 0xFFFFFC00) == 0x4CDF8800 // LD2 .4s
- || (insn & 0xFFFFFC00) == 0x4C9F8800 // ST2 .4s
- ) {
- Bool isLD = INSN(22,22) == 1;
- UInt rN = INSN(9,5);
- UInt vT = INSN(4,0);
- IRTemp tEA = newTemp(Ity_I64);
- UInt sz = INSN(11,10);
- const HChar* name = "??";
- assign(tEA, getIReg64orSP(rN));
- if (rN == 31) { /* FIXME generate stack alignment check */ }
- IRExpr* tEA_0 = binop(Iop_Add64, mkexpr(tEA), mkU64(0));
- IRExpr* tEA_8 = binop(Iop_Add64, mkexpr(tEA), mkU64(8));
- IRExpr* tEA_16 = binop(Iop_Add64, mkexpr(tEA), mkU64(16));
- IRExpr* tEA_24 = binop(Iop_Add64, mkexpr(tEA), mkU64(24));
- if (sz == BITS2(1,1)) {
- name = "2d";
- if (isLD) {
- putQRegLane((vT+0) % 32, 0, loadLE(Ity_I64, tEA_0));
- putQRegLane((vT+0) % 32, 1, loadLE(Ity_I64, tEA_16));
- putQRegLane((vT+1) % 32, 0, loadLE(Ity_I64, tEA_8));
- putQRegLane((vT+1) % 32, 1, loadLE(Ity_I64, tEA_24));
- } else {
- storeLE(tEA_0, getQRegLane((vT+0) % 32, 0, Ity_I64));
- storeLE(tEA_16, getQRegLane((vT+0) % 32, 1, Ity_I64));
- storeLE(tEA_8, getQRegLane((vT+1) % 32, 0, Ity_I64));
- storeLE(tEA_24, getQRegLane((vT+1) % 32, 1, Ity_I64));
- }
- }
- else if (sz == BITS2(1,0)) {
- /* Uh, this is ugly. TODO: better. */
- name = "4s";
- IRExpr* tEA_4 = binop(Iop_Add64, mkexpr(tEA), mkU64(4));
- IRExpr* tEA_12 = binop(Iop_Add64, mkexpr(tEA), mkU64(12));
- IRExpr* tEA_20 = binop(Iop_Add64, mkexpr(tEA), mkU64(20));
- IRExpr* tEA_28 = binop(Iop_Add64, mkexpr(tEA), mkU64(28));
- if (isLD) {
- putQRegLane((vT+0) % 32, 0, loadLE(Ity_I32, tEA_0));
- putQRegLane((vT+0) % 32, 1, loadLE(Ity_I32, tEA_8));
- putQRegLane((vT+0) % 32, 2, loadLE(Ity_I32, tEA_16));
- putQRegLane((vT+0) % 32, 3, loadLE(Ity_I32, tEA_24));
- putQRegLane((vT+1) % 32, 0, loadLE(Ity_I32, tEA_4));
- putQRegLane((vT+1) % 32, 1, loadLE(Ity_I32, tEA_12));
- putQRegLane((vT+1) % 32, 2, loadLE(Ity_I32, tEA_20));
- putQRegLane((vT+1) % 32, 3, loadLE(Ity_I32, tEA_28));
- } else {
- storeLE(tEA_0, getQRegLane((vT+0) % 32, 0, Ity_I32));
- storeLE(tEA_8, getQRegLane((vT+0) % 32, 1, Ity_I32));
- storeLE(tEA_16, getQRegLane((vT+0) % 32, 2, Ity_I32));
- storeLE(tEA_24, getQRegLane((vT+0) % 32, 3, Ity_I32));
- storeLE(tEA_4, getQRegLane((vT+1) % 32, 0, Ity_I32));
- storeLE(tEA_12, getQRegLane((vT+1) % 32, 1, Ity_I32));
- storeLE(tEA_20, getQRegLane((vT+1) % 32, 2, Ity_I32));
- storeLE(tEA_28, getQRegLane((vT+1) % 32, 3, Ity_I32));
- }
- }
- else {
- vassert(0); // Can't happen.
- }
- putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(32)));
- DIP("%s {v%u.%s, v%u.%s}, [%s], #32\n", isLD ? "ld2" : "st2",
- (vT+0) % 32, name, (vT+1) % 32, name, nameIReg64orSP(rN));
- return True;
- }
-
/* -------- LD1/ST1 (multi 1-elem structs, 2 regs, no offset) -------- */
/* Only a very few cases. */
/* 31 23
@@ -4995,58 +5954,6 @@
return True;
}
- /* -------- LD3/ST3 (multi 3-elem structs, 3 regs, post index) -------- */
- /* Only a very few cases. */
- /* 31 23 11 9 4
- 0100 1100 1101 1111 0100 11 n t LD3 {Vt.2d .. V(t+2)%32.2d}, [Xn|SP], #48
- 0100 1100 1001 1111 0100 11 n t ST3 {Vt.2d .. V(t+2)%32.2d}, [Xn|SP], #48
- */
- if ( (insn & 0xFFFFFC00) == 0x4CDF4C00 // LD3 .2d
- || (insn & 0xFFFFFC00) == 0x4C9F4C00 // ST3 .2d
- ) {
- Bool isLD = INSN(22,22) == 1;
- UInt rN = INSN(9,5);
- UInt vT = INSN(4,0);
- IRTemp tEA = newTemp(Ity_I64);
- UInt sz = INSN(11,10);
- const HChar* name = "??";
- assign(tEA, getIReg64orSP(rN));
- if (rN == 31) { /* FIXME generate stack alignment check */ }
- IRExpr* tEA_0 = binop(Iop_Add64, mkexpr(tEA), mkU64(0));
- IRExpr* tEA_8 = binop(Iop_Add64, mkexpr(tEA), mkU64(8));
- IRExpr* tEA_16 = binop(Iop_Add64, mkexpr(tEA), mkU64(16));
- IRExpr* tEA_24 = binop(Iop_Add64, mkexpr(tEA), mkU64(24));
- IRExpr* tEA_32 = binop(Iop_Add64, mkexpr(tEA), mkU64(32));
- IRExpr* tEA_40 = binop(Iop_Add64, mkexpr(tEA), mkU64(40));
- if (sz == BITS2(1,1)) {
- name = "2d";
- if (isLD) {
- putQRegLane((vT+0) % 32, 0, loadLE(Ity_I64, tEA_0));
- putQRegLane((vT+0) % 32, 1, loadLE(Ity_I64, tEA_24));
- putQRegLane((vT+1) % 32, 0, loadLE(Ity_I64, tEA_8));
- putQRegLane((vT+1) % 32, 1, loadLE(Ity_I64, tEA_32));
- putQRegLane((vT+2) % 32, 0, loadLE(Ity_I64, tEA_16));
- putQRegLane((vT+2) % 32, 1, loadLE(Ity_I64, tEA_40));
- } else {
- storeLE(tEA_0, getQRegLane((vT+0) % 32, 0, Ity_I64));
- s...
[truncated message content] |
|
From: <sv...@va...> - 2014-11-23 12:38:29
|
Author: sewardj
Date: Sun Nov 23 12:38:22 2014
New Revision: 14755
Log:
Merge, from trunk, r14646 (just the fix, not the test)
339706 Fix false positive for ioctl(TIOCSIG) on linux
14646 (just the fix)
Modified:
branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-linux.c (contents, props changed)
branches/VALGRIND_3_10_BRANCH/include/vki/vki-linux.h (contents, props changed)
Modified: branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-linux.c
==============================================================================
--- branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-linux.c (original)
+++ branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-linux.c Sun Nov 23 12:38:22 2014
@@ -5481,6 +5481,7 @@
case VKI_TCXONC:
case VKI_TCSBRKP:
case VKI_TCFLSH:
+ case VKI_TIOCSIG:
/* These just take an int by value */
break;
case VKI_TIOCGWINSZ:
@@ -8269,6 +8270,7 @@
case VKI_TCXONC:
case VKI_TCSBRKP:
case VKI_TCFLSH:
+ case VKI_TIOCSIG:
break;
case VKI_TIOCGWINSZ:
POST_MEM_WRITE( ARG3, sizeof(struct vki_winsize) );
Modified: branches/VALGRIND_3_10_BRANCH/include/vki/vki-linux.h
==============================================================================
--- branches/VALGRIND_3_10_BRANCH/include/vki/vki-linux.h (original)
+++ branches/VALGRIND_3_10_BRANCH/include/vki/vki-linux.h Sun Nov 23 12:38:22 2014
@@ -2955,6 +2955,10 @@
#define VKI_FIOQSIZE 0x5460 /* Value differs on some platforms */
#endif
+#ifndef VKI_TIOCSIG
+#define VKI_TIOCSIG _VKI_IOW('T', 0x36, int) /* Value differs on some platforms */
+#endif
+
//----------------------------------------------------------------------
// From kernel/common/include/linux/ashmem.h
//----------------------------------------------------------------------
|
|
From: <sv...@va...> - 2014-11-23 12:30:30
|
Author: sewardj
Date: Sun Nov 23 12:30:23 2014
New Revision: 14754
Log:
Merge, from trunk, r14631
14631 Enable sys_fadvise64_64 on arm32.
Modified:
branches/VALGRIND_3_10_BRANCH/ (props changed)
branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-arm-linux.c
Modified: branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-arm-linux.c
==============================================================================
--- branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-arm-linux.c (original)
+++ branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-arm-linux.c Sun Nov 23 12:30:23 2014
@@ -1088,7 +1088,6 @@
LINX_(__NR_tgkill, sys_tgkill), // 270 */Linux
GENX_(__NR_utimes, sys_utimes), // 271
-// LINX_(__NR_fadvise64_64, sys_fadvise64_64), // 272 */(Linux?)
GENX_(__NR_vserver, sys_ni_syscall), // 273
LINX_(__NR_mbind, sys_mbind), // 274 ?/?
@@ -1189,6 +1188,8 @@
// correspond to what's in include/vki/vki-scnums-arm-linux.h.
// From here onwards, please ensure the numbers are correct.
+ LINX_(__NR_arm_fadvise64_64, sys_fadvise64_64), // 270 */(Linux?)
+
LINX_(__NR_pselect6, sys_pselect6), // 335
LINXY(__NR_ppoll, sys_ppoll), // 336
|
|
From: <sv...@va...> - 2014-11-23 12:28:51
|
Author: sewardj
Date: Sun Nov 23 12:28:45 2014
New Revision: 14753
Log:
Merge, from trunk, r14617
339855 arm64 unhandled getsid/setsid syscalls
14617
Modified:
branches/VALGRIND_3_10_BRANCH/ (props changed)
branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-arm64-linux.c
Modified: branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-arm64-linux.c
==============================================================================
--- branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-arm64-linux.c (original)
+++ branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-arm64-linux.c Sun Nov 23 12:28:45 2014
@@ -956,6 +956,8 @@
GENXY(__NR_times, sys_times), // 153
GENX_(__NR_setpgid, sys_setpgid), // 154
GENX_(__NR_getpgid, sys_getpgid), // 155
+ GENX_(__NR_getsid, sys_getsid), // 156
+ GENX_(__NR_setsid, sys_setsid), // 157
GENXY(__NR_uname, sys_newuname), // 160
GENXY(__NR_getrlimit, sys_old_getrlimit), // 163
GENX_(__NR_setrlimit, sys_setrlimit), // 164
@@ -1092,7 +1094,6 @@
//ZZ GENX_(__NR_getppid, sys_getppid), // 64
//ZZ
//ZZ GENX_(__NR_getpgrp, sys_getpgrp), // 65
-//ZZ GENX_(__NR_setsid, sys_setsid), // 66
//ZZ LINXY(__NR_sigaction, sys_sigaction), // 67
//ZZ //zz // (__NR_sgetmask, sys_sgetmask), // 68 */* (ANSI C)
//ZZ //zz // (__NR_ssetmask, sys_ssetmask), // 69 */* (ANSI C)
@@ -1176,7 +1177,6 @@
//ZZ GENX_(__NR_flock, sys_flock), // 143
//ZZ GENX_(__NR_msync, sys_msync), // 144
//ZZ
-//ZZ GENX_(__NR_getsid, sys_getsid), // 147
//ZZ GENX_(__NR_fdatasync, sys_fdatasync), // 148
//ZZ LINXY(__NR__sysctl, sys_sysctl), // 149
//ZZ
|
|
From: <sv...@va...> - 2014-11-23 12:27:28
|
Author: sewardj
Date: Sun Nov 23 12:27:22 2014
New Revision: 14752
Log:
Merge, from trunk, 14618
14618 Handle (by ignoring) Imbe_CancelReservation. (hg)
Modified:
branches/VALGRIND_3_10_BRANCH/ (props changed)
branches/VALGRIND_3_10_BRANCH/helgrind/hg_main.c
Modified: branches/VALGRIND_3_10_BRANCH/helgrind/hg_main.c
==============================================================================
--- branches/VALGRIND_3_10_BRANCH/helgrind/hg_main.c (original)
+++ branches/VALGRIND_3_10_BRANCH/helgrind/hg_main.c Sun Nov 23 12:27:22 2014
@@ -4568,6 +4568,7 @@
case Ist_MBE:
switch (st->Ist.MBE.event) {
case Imbe_Fence:
+ case Imbe_CancelReservation:
break; /* not interesting */
default:
goto unhandled;
|
|
From: <sv...@va...> - 2014-11-23 12:25:34
|
Author: sewardj
Date: Sun Nov 23 12:25:27 2014
New Revision: 14751
Log:
Merge, from trunk, r14616
339853 arm64 times syscall unknown
14616
Modified:
branches/VALGRIND_3_10_BRANCH/ (props changed)
branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-arm64-linux.c
Modified: branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-arm64-linux.c
==============================================================================
--- branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-arm64-linux.c (original)
+++ branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-arm64-linux.c Sun Nov 23 12:25:27 2014
@@ -953,6 +953,7 @@
LINX_(__NR_setresuid, sys_setresuid), // 147
LINXY(__NR_getresuid, sys_getresuid), // 148
LINXY(__NR_getresgid, sys_getresgid), // 150
+ GENXY(__NR_times, sys_times), // 153
GENX_(__NR_setpgid, sys_setpgid), // 154
GENX_(__NR_getpgid, sys_getpgid), // 155
GENXY(__NR_uname, sys_newuname), // 160
@@ -1068,7 +1069,6 @@
//ZZ
//ZZ GENX_(__NR_rmdir, sys_rmdir), // 40
//ZZ LINXY(__NR_pipe, sys_pipe), // 42
-//ZZ GENXY(__NR_times, sys_times), // 43
//ZZ // GENX_(__NR_prof, sys_ni_syscall), // 44
//ZZ LINX_(__NR_setgid, sys_setgid16), // 46
|
|
From: <sv...@va...> - 2014-11-23 12:23:53
|
Author: sewardj
Date: Sun Nov 23 12:23:46 2014
New Revision: 14750
Log:
Merge, from trunk, 14603,14610
339721 assertion 'check_sibling == sibling' failed in readdwarf3.c ...
14603,14610
Modified:
branches/VALGRIND_3_10_BRANCH/ (props changed)
branches/VALGRIND_3_10_BRANCH/coregrind/m_debuginfo/readdwarf3.c
Modified: branches/VALGRIND_3_10_BRANCH/coregrind/m_debuginfo/readdwarf3.c
==============================================================================
--- branches/VALGRIND_3_10_BRANCH/coregrind/m_debuginfo/readdwarf3.c (original)
+++ branches/VALGRIND_3_10_BRANCH/coregrind/m_debuginfo/readdwarf3.c Sun Nov 23 12:23:46 2014
@@ -1373,11 +1373,29 @@
TRACE_D3("%x ", (UInt)u8);
work >>= 8;
}
- /* Due to the way that the hash table is constructed, the
- resulting DIE offset here is already "cooked". See
- cook_die_using_form. */
- cts->u.val = lookup_signatured_type (cc->signature_types, signature,
- c->barf);
+
+ /* cc->signature_types is only built/initialised when
+ VG_(clo_read_var_info) is set. In this case,
+ the DW_FORM_ref_sig8 can be looked up.
+ But we can also arrive here when only reading inline info
+ and VG_(clo_trace_symtab) is set. In such a case,
+ we cannot lookup the DW_FORM_ref_sig8, we rather assign
+ a dummy value. This is a kludge, but otherwise,
+ the 'dwarf inline info reader' tracing would have to
+ do type processing/reading. It is better to avoid
+ adding significant 'real' processing only due to tracing. */
+ if (VG_(clo_read_var_info)) {
+ /* Due to the way that the hash table is constructed, the
+ resulting DIE offset here is already "cooked". See
+ cook_die_using_form. */
+ cts->u.val = lookup_signatured_type (cc->signature_types, signature,
+ c->barf);
+ } else {
+ vg_assert (td3);
+ vg_assert (VG_(clo_read_inline_info));
+ TRACE_D3("<not dereferencing signature type>");
+ cts->u.val = 0; /* Assign a dummy/rubbish value */
+ }
cts->szB = sizeof(UWord);
break;
}
@@ -1500,7 +1518,7 @@
case DW_FORM_block:
return VARSZ_FORM;
case DW_FORM_ref_sig8:
- return 8 + 8;
+ return 8;
case DW_FORM_indirect:
return VARSZ_FORM;
case DW_FORM_GNU_ref_alt:
|
|
From: <sv...@va...> - 2014-11-23 12:21:40
|
Author: sewardj
Date: Sun Nov 23 12:21:33 2014
New Revision: 14749
Log:
Merge, from trunk, r14599
339645 Use correct tag names in sys_getdents/64 wrappers
Modified:
branches/VALGRIND_3_10_BRANCH/ (props changed)
branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-generic.c
Modified: branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-generic.c
==============================================================================
--- branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-generic.c (original)
+++ branches/VALGRIND_3_10_BRANCH/coregrind/m_syswrap/syswrap-generic.c Sun Nov 23 12:21:33 2014
@@ -3242,7 +3242,7 @@
*flags |= SfMayBlock;
PRINT("sys_getdents ( %ld, %#lx, %ld )", ARG1,ARG2,ARG3);
PRE_REG_READ3(long, "getdents",
- unsigned int, fd, struct linux_dirent *, dirp,
+ unsigned int, fd, struct vki_dirent *, dirp,
unsigned int, count);
PRE_MEM_WRITE( "getdents(dirp)", ARG2, ARG3 );
}
@@ -3259,7 +3259,7 @@
*flags |= SfMayBlock;
PRINT("sys_getdents64 ( %ld, %#lx, %ld )",ARG1,ARG2,ARG3);
PRE_REG_READ3(long, "getdents64",
- unsigned int, fd, struct linux_dirent64 *, dirp,
+ unsigned int, fd, struct vki_dirent64 *, dirp,
unsigned int, count);
PRE_MEM_WRITE( "getdents64(dirp)", ARG2, ARG3 );
}
|
|
From: <sv...@va...> - 2014-11-23 12:16:17
|
Author: sewardj
Date: Sun Nov 23 12:16:11 2014
New Revision: 3004
Log:
Merge, from trunk, r2975
339858 arm64 dmb sy not implemented
2975 (subsequently overwritten by 2986)
Modified:
branches/VEX_3_10_BRANCH/ (props changed)
branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
Modified: branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c (original)
+++ branches/VEX_3_10_BRANCH/priv/guest_arm64_toIR.c Sun Nov 23 12:16:11 2014
@@ -5537,6 +5537,11 @@
DIP("isb\n");
return True;
}
+ if (INSN(31,0) == 0xD5033FBF) {
+ stmt(IRStmt_MBE(Imbe_Fence));
+ DIP("dmb sy\n");
+ return True;
+ }
if (INSN(31,0) == 0xD5033BBF) {
stmt(IRStmt_MBE(Imbe_Fence));
DIP("dmb ish\n");
|
|
From: <sv...@va...> - 2014-11-23 12:14:48
|
Author: sewardj
Date: Sun Nov 23 12:14:41 2014
New Revision: 3003
Log:
Merge, from trunk, 2962, 2966, 2967, 2973
339433 ppc64 lxvw4x instruction uses four 32-byte loads
Modified:
branches/VEX_3_10_BRANCH/ (props changed)
branches/VEX_3_10_BRANCH/priv/guest_ppc_toIR.c
branches/VEX_3_10_BRANCH/priv/host_ppc_defs.c
branches/VEX_3_10_BRANCH/priv/host_ppc_defs.h
branches/VEX_3_10_BRANCH/priv/host_ppc_isel.c
Modified: branches/VEX_3_10_BRANCH/priv/guest_ppc_toIR.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/guest_ppc_toIR.c (original)
+++ branches/VEX_3_10_BRANCH/priv/guest_ppc_toIR.c Sun Nov 23 12:14:41 2014
@@ -15319,26 +15319,27 @@
}
case 0x30C:
{
- IRExpr * t3, *t2, *t1, *t0;
- UInt ea_off = 0;
- IRExpr* irx_addr;
+ IRExpr *t0;
DIP("lxvw4x %d,r%u,r%u\n", (UInt)XT, rA_addr, rB_addr);
- t3 = load( Ity_I32, mkexpr( EA ) );
- ea_off += 4;
- irx_addr = binop( mkSzOp( ty, Iop_Add8 ), mkexpr( EA ),
- ty == Ity_I64 ? mkU64( ea_off ) : mkU32( ea_off ) );
- t2 = load( Ity_I32, irx_addr );
- ea_off += 4;
- irx_addr = binop( mkSzOp( ty, Iop_Add8 ), mkexpr( EA ),
- ty == Ity_I64 ? mkU64( ea_off ) : mkU32( ea_off ) );
- t1 = load( Ity_I32, irx_addr );
- ea_off += 4;
- irx_addr = binop( mkSzOp( ty, Iop_Add8 ), mkexpr( EA ),
- ty == Ity_I64 ? mkU64( ea_off ) : mkU32( ea_off ) );
- t0 = load( Ity_I32, irx_addr );
- putVSReg( XT, binop( Iop_64HLtoV128, binop( Iop_32HLto64, t3, t2 ),
- binop( Iop_32HLto64, t1, t0 ) ) );
+
+ /* The load will result in the data being in BE order. */
+ if (host_endness == VexEndnessLE) {
+ IRExpr *t0_BE;
+ IRTemp perm_LE = newTemp(Ity_V128);
+
+ t0_BE = load( Ity_V128, mkexpr( EA ) );
+
+ /* Permute the data to LE format */
+ assign( perm_LE, binop( Iop_64HLtoV128, mkU64(0x0c0d0e0f08090a0b),
+ mkU64(0x0405060700010203)));
+
+ t0 = binop( Iop_Perm8x16, t0_BE, mkexpr(perm_LE) );
+ } else {
+ t0 = load( Ity_V128, mkexpr( EA ) );
+ }
+
+ putVSReg( XT, t0 );
break;
}
default:
Modified: branches/VEX_3_10_BRANCH/priv/host_ppc_defs.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/host_ppc_defs.c (original)
+++ branches/VEX_3_10_BRANCH/priv/host_ppc_defs.c Sun Nov 23 12:14:41 2014
@@ -1426,6 +1426,14 @@
i->Pin.AvSel.srcR = srcR;
return i;
}
+PPCInstr* PPCInstr_AvSh ( Bool shLeft, HReg dst, PPCAMode* addr ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_AvSh;
+ i->Pin.AvSh.shLeft = shLeft;
+ i->Pin.AvSh.dst = dst;
+ i->Pin.AvSh.addr = addr;
+ return i;
+}
PPCInstr* PPCInstr_AvShlDbl ( UChar shift, HReg dst,
HReg srcL, HReg srcR ) {
PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
@@ -2008,6 +2016,30 @@
ppHRegPPC(i->Pin.AvSel.ctl);
return;
+ case Pin_AvSh:
+ /* This only generates the following instructions with RA
+ * register number set to 0.
+ */
+ if (i->Pin.AvSh.addr->tag == Pam_IR) {
+ ppLoadImm(hregPPC_GPR30(mode64),
+ i->Pin.AvSh.addr->Pam.IR.index, mode64);
+ vex_printf(" ; ");
+ }
+
+ if (i->Pin.AvSh.shLeft)
+ vex_printf("lvsl ");
+ else
+ vex_printf("lvsr ");
+
+ ppHRegPPC(i->Pin.AvSh.dst);
+ if (i->Pin.AvSh.addr->tag == Pam_IR)
+ vex_printf("%%r30");
+ else
+ ppHRegPPC(i->Pin.AvSh.addr->Pam.RR.index);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvSh.addr->Pam.RR.base);
+ return;
+
case Pin_AvShlDbl:
vex_printf("vsldoi ");
ppHRegPPC(i->Pin.AvShlDbl.dst);
@@ -2517,6 +2549,12 @@
addHRegUse(u, HRmRead, i->Pin.AvSel.srcL);
addHRegUse(u, HRmRead, i->Pin.AvSel.srcR);
return;
+ case Pin_AvSh:
+ addHRegUse(u, HRmWrite, i->Pin.AvSh.dst);
+ if (i->Pin.AvSh.addr->tag == Pam_IR)
+ addHRegUse(u, HRmWrite, hregPPC_GPR30(mode64));
+ addRegUsage_PPCAMode(u, i->Pin.AvSh.addr);
+ return;
case Pin_AvShlDbl:
addHRegUse(u, HRmWrite, i->Pin.AvShlDbl.dst);
addHRegUse(u, HRmRead, i->Pin.AvShlDbl.srcL);
@@ -2846,6 +2884,10 @@
mapReg(m, &i->Pin.AvSel.srcR);
mapReg(m, &i->Pin.AvSel.ctl);
return;
+ case Pin_AvSh:
+ mapReg(m, &i->Pin.AvSh.dst);
+ mapRegs_PPCAMode(m, i->Pin.AvSh.addr);
+ return;
case Pin_AvShlDbl:
mapReg(m, &i->Pin.AvShlDbl.dst);
mapReg(m, &i->Pin.AvShlDbl.srcL);
@@ -3709,6 +3751,19 @@
return emit32(p, theInstr, endness_host);
}
+static UChar* mkFormVXI ( UChar* p, UInt opc1, UInt r1, UInt r2,
+ UInt r3, UInt opc2, VexEndness endness_host )
+{
+ UInt theInstr;
+ vassert(opc1 < 0x40);
+ vassert(r1 < 0x20);
+ vassert(r2 < 0x20);
+ vassert(r3 < 0x20);
+ vassert(opc2 < 0x27);
+ theInstr = ((opc1<<26) | (r1<<21) | (r2<<16) | (r3<<11) | opc2<<1);
+ return emit32(p, theInstr, endness_host);
+}
+
static UChar* mkFormVXR ( UChar* p, UInt opc1, UInt r1, UInt r2,
UInt r3, UInt Rc, UInt opc2,
VexEndness endness_host )
@@ -5214,6 +5269,30 @@
goto done;
}
+ case Pin_AvSh: { // vsl or vsr
+ UInt v_dst = vregNo(i->Pin.AvSh.dst);
+ Bool idxd = toBool(i->Pin.AvSh.addr->tag == Pam_RR);
+ UInt r_idx, r_base;
+
+ r_base = iregNo(i->Pin.AvSh.addr->Pam.RR.base, mode64);
+
+ if (!idxd) {
+ r_idx = 30; // XXX: Using r30 as temp
+ p = mkLoadImm(p, r_idx,
+ i->Pin.AvSh.addr->Pam.IR.index, mode64, endness_host);
+ } else {
+ r_idx = iregNo(i->Pin.AvSh.addr->Pam.RR.index, mode64);
+ }
+
+ if (i->Pin.AvSh.shLeft)
+ //vsl VRT,RA,RB
+ p = mkFormVXI( p, 31, v_dst, r_idx, r_base, 6, endness_host );
+ else
+ //vsr VRT,RA,RB
+ p = mkFormVXI( p, 31, v_dst, r_idx, r_base, 38, endness_host );
+ goto done;
+ }
+
case Pin_AvShlDbl: { // vsldoi
UInt shift = i->Pin.AvShlDbl.shift;
UInt v_dst = vregNo(i->Pin.AvShlDbl.dst);
Modified: branches/VEX_3_10_BRANCH/priv/host_ppc_defs.h
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/host_ppc_defs.h (original)
+++ branches/VEX_3_10_BRANCH/priv/host_ppc_defs.h Sun Nov 23 12:14:41 2014
@@ -522,6 +522,7 @@
Pin_AvPerm, /* AV permute (shuffle) */
Pin_AvSel, /* AV select */
+ Pin_AvSh, /* AV shift left or right */
Pin_AvShlDbl, /* AV shift-left double by imm */
Pin_AvSplat, /* One elem repeated throughout dst */
Pin_AvLdVSCR, /* mtvscr */
@@ -855,6 +856,11 @@
HReg ctl;
} AvSel;
struct {
+ Bool shLeft;
+ HReg dst;
+ PPCAMode* addr;
+ } AvSh;
+ struct {
UChar shift;
HReg dst;
HReg srcL;
@@ -1077,6 +1083,7 @@
extern PPCInstr* PPCInstr_AvUn32Fx4 ( PPCAvFpOp op, HReg dst, HReg src );
extern PPCInstr* PPCInstr_AvPerm ( HReg dst, HReg srcL, HReg srcR, HReg ctl );
extern PPCInstr* PPCInstr_AvSel ( HReg ctl, HReg dst, HReg srcL, HReg srcR );
+extern PPCInstr* PPCInstr_AvSh ( Bool shLeft, HReg dst, PPCAMode* am_addr );
extern PPCInstr* PPCInstr_AvShlDbl ( UChar shift, HReg dst, HReg srcL, HReg srcR );
extern PPCInstr* PPCInstr_AvSplat ( UChar sz, HReg dst, PPCVI5s* src );
extern PPCInstr* PPCInstr_AvCMov ( PPCCondCode, HReg dst, HReg src );
Modified: branches/VEX_3_10_BRANCH/priv/host_ppc_isel.c
==============================================================================
--- branches/VEX_3_10_BRANCH/priv/host_ppc_isel.c (original)
+++ branches/VEX_3_10_BRANCH/priv/host_ppc_isel.c Sun Nov 23 12:14:41 2014
@@ -4871,12 +4871,57 @@
}
if (e->tag == Iex_Load && e->Iex.Load.end == IEndianess) {
- PPCAMode* am_addr;
+ /* Need to be able to do V128 unaligned loads. The BE unaligned load
+ * can be accomplised using the following code sequece from the ISA.
+ * It uses the lvx instruction that does two aligned loads and then
+ * permute the data to store the required data as if it had been an
+ * unaligned load.
+ *
+ * lvx Vhi,0,Rb # load MSQ, using the unaligned address in Rb
+ * lvsl Vp, 0,Rb # Set permute control vector
+ * addi Rb,Rb,15 # Address of LSQ
+ * lvx Vlo,0,Rb # load LSQ
+ * vperm Vt,Vhi,Vlo,Vp # align the data as requested
+ */
+
+ HReg Vhi = newVRegV(env);
+ HReg Vlo = newVRegV(env);
+ HReg Vp = newVRegV(env);
HReg v_dst = newVRegV(env);
+ HReg rB;
+ HReg rB_plus_15 = newVRegI(env);
+
vassert(e->Iex.Load.ty == Ity_V128);
- am_addr = iselWordExpr_AMode(env, e->Iex.Load.addr, Ity_V128/*xfer*/,
- IEndianess);
- addInstr(env, PPCInstr_AvLdSt( True/*load*/, 16, v_dst, am_addr));
+ rB = iselWordExpr_R( env, e->Iex.Load.addr, IEndianess );
+
+ // lvx Vhi, 0, Rb
+ addInstr(env, PPCInstr_AvLdSt( True/*load*/, 16, Vhi,
+ PPCAMode_IR(0, rB)) );
+
+ if (IEndianess == Iend_LE)
+ // lvsr Vp, 0, Rb
+ addInstr(env, PPCInstr_AvSh( False/*right shift*/, Vp,
+ PPCAMode_IR(0, rB)) );
+ else
+ // lvsl Vp, 0, Rb
+ addInstr(env, PPCInstr_AvSh( True/*left shift*/, Vp,
+ PPCAMode_IR(0, rB)) );
+
+ // addi Rb_plus_15, Rb, 15
+ addInstr(env, PPCInstr_Alu( Palu_ADD, rB_plus_15,
+ rB, PPCRH_Imm(True, toUShort(15))) );
+
+ // lvx Vlo, 0, Rb_plus_15
+ addInstr(env, PPCInstr_AvLdSt( True/*load*/, 16, Vlo,
+ PPCAMode_IR(0, rB_plus_15)) );
+
+ if (IEndianess == Iend_LE)
+ // vperm Vt, Vhi, Vlo, Vp
+ addInstr(env, PPCInstr_AvPerm( v_dst, Vlo, Vhi, Vp ));
+ else
+ // vperm Vt, Vhi, Vlo, Vp
+ addInstr(env, PPCInstr_AvPerm( v_dst, Vhi, Vlo, Vp ));
+
return v_dst;
}
|
|
From: <sv...@va...> - 2014-11-23 12:10:31
|
Author: sewardj
Date: Sun Nov 23 12:10:22 2014
New Revision: 14748
Log:
Merge from trunk, r14565
14565 Glibc versions prior to 2.5 do not define PTRACE_GETSIGINFO
Modified:
branches/VALGRIND_3_10_BRANCH/ (props changed)
branches/VALGRIND_3_10_BRANCH/coregrind/vgdb-invoker-ptrace.c
Modified: branches/VALGRIND_3_10_BRANCH/coregrind/vgdb-invoker-ptrace.c
==============================================================================
--- branches/VALGRIND_3_10_BRANCH/coregrind/vgdb-invoker-ptrace.c (original)
+++ branches/VALGRIND_3_10_BRANCH/coregrind/vgdb-invoker-ptrace.c Sun Nov 23 12:10:22 2014
@@ -64,8 +64,9 @@
#include <sys/procfs.h>
-#if defined(VGA_s390x)
-/* RHEL 5 uses glibc 2.3.4 which does not define PTRACE_GETSIGINFO */
+// glibc versions prior to 2.5 do not define PTRACE_GETSIGINFO on
+// the platforms we support.
+#if !((__GLIBC__ > 2) || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 5))
# ifndef PTRACE_GETSIGINFO
# define PTRACE_GETSIGINFO 0x4202
# endif
|
|
From: <sv...@va...> - 2014-11-23 12:05:27
|
Author: sewardj
Date: Sun Nov 23 12:05:04 2014
New Revision: 14747
Log:
Merge, from trunk, r14561
14561 Add missing ]] to terminate CDATA.
Modified:
branches/VALGRIND_3_10_BRANCH/ (props changed)
branches/VALGRIND_3_10_BRANCH/docs/xml/manual-core.xml
Modified: branches/VALGRIND_3_10_BRANCH/docs/xml/manual-core.xml
==============================================================================
--- branches/VALGRIND_3_10_BRANCH/docs/xml/manual-core.xml (original)
+++ branches/VALGRIND_3_10_BRANCH/docs/xml/manual-core.xml Sun Nov 23 12:05:04 2014
@@ -1852,7 +1852,7 @@
==15363== by 0x8048550: main (varinfo1.c:56)
==15363== Address 0xbea0d0cc is on thread 1's stack
==15363== in frame #1, created by main (varinfo1.c:45)
-></programlisting>
+]]></programlisting>
<para>And here are the same errors with
<option>--read-var-info=yes</option>:</para>
|