|
From: Julian S. <se...@so...> - 2021-01-07 07:36:01
|
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=3b1710d38cf19619242c9113a2dbe291e914a8c2 commit 3b1710d38cf19619242c9113a2dbe291e914a8c2 Author: Julian Seward <js...@ac...> Date: Thu Jan 7 08:34:14 2021 +0100 Bug 413547 - regression test does not check for Arm 64 features. Patches from/by Assad Hashmi (ass...@li...). Diff: --- .gitignore | 1 + VEX/priv/guest_arm64_toIR.c | 33 +- VEX/priv/host_arm64_defs.c | 5 + VEX/priv/host_arm64_defs.h | 1 + VEX/priv/host_arm64_isel.c | 1 + VEX/priv/ir_defs.c | 3 + VEX/pub/libvex_ir.h | 5 + configure.ac | 25 + memcheck/mc_translate.c | 41 + memcheck/tests/vbit-test/irops.c | 1 + none/tests/arm64/Makefile.am | 9 +- none/tests/arm64/fp_and_simd_v82.c | 2285 +++++++++++++++++++++++++++ none/tests/arm64/fp_and_simd_v82.stderr.exp | 0 none/tests/arm64/fp_and_simd_v82.stdout.exp | 63 + none/tests/arm64/fp_and_simd_v82.vgtest | 3 + tests/Makefile.am | 4 +- tests/arm64_features.c | 178 +++ 17 files changed, 2651 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index edb8edd22b..dff20848e6 100644 --- a/.gitignore +++ b/.gitignore @@ -2078,6 +2078,7 @@ /tests/true /tests/vg_regtest /tests/x86_amd64_features +/tests/arm64_features # /VEX/ /VEX/libvex*.a diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c index 1121ce88af..a296565591 100644 --- a/VEX/priv/guest_arm64_toIR.c +++ b/VEX/priv/guest_arm64_toIR.c @@ -993,7 +993,7 @@ static IROp mkVecQSHLNSATSU ( UInt size ) { static IROp mkVecADDF ( UInt size ) { const IROp ops[4] - = { Iop_INVALID, Iop_INVALID, Iop_Add32Fx4, Iop_Add64Fx2 }; + = { Iop_INVALID, Iop_Add16Fx8, Iop_Add32Fx4, Iop_Add64Fx2 }; vassert(size < 4); return ops[size]; } @@ -9806,7 +9806,8 @@ Bool dis_AdvSIMD_scalar_copy(/*MB_OUT*/DisResult* dres, UInt insn) static -Bool dis_AdvSIMD_scalar_pairwise(/*MB_OUT*/DisResult* dres, UInt insn) +Bool dis_AdvSIMD_scalar_pairwise(/*MB_OUT*/DisResult* dres, UInt insn, + const VexArchInfo* archinfo) { /* 31 28 23 21 16 11 9 4 01 u 11110 sz 11000 
opcode 10 n d @@ -9857,6 +9858,27 @@ Bool dis_AdvSIMD_scalar_pairwise(/*MB_OUT*/DisResult* dres, UInt insn) return True; } + /* Half-precision floating point ADDP (v8.2). */ + if (bitU == 0 && sz <= X00 && opcode == BITS5(0,1,1,0,1)) { + /* -------- 0,00,01101 ADDP h_2h -------- */ + if ((archinfo->hwcaps & VEX_HWCAPS_ARM64_FP16) == 0) + return False; + IROp opZHI = mkVecZEROHIxxOFV128(1); + IROp opADD = mkVecADDF(1); + IRTemp src = newTempV128(); + IRTemp argL = newTempV128(); + IRTemp argR = newTempV128(); + assign(src, getQReg128(nn)); + assign(argL, unop(opZHI, mkexpr(src))); + assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src), + mkU8(2)))); + putQReg128(dd, unop(opZHI, + triop(opADD, mkexpr(mk_get_IR_rounding_mode()), + mkexpr(argL), mkexpr(argR)))); + DIP("faddp h%u, v%u.2h\n", dd, nn); + return True; + } + if (bitU == 1 && (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,1,1))) { /* -------- 1,0x,01100 FMAXNMP d_2d, s_2s -------- */ @@ -14946,7 +14968,8 @@ Bool dis_AdvSIMD_fp_to_from_int_conv(/*MB_OUT*/DisResult* dres, UInt insn) static -Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) +Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn, + const VexArchInfo* archinfo) { Bool ok; ok = dis_AdvSIMD_EXT(dres, insn); @@ -14963,7 +14986,7 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) if (UNLIKELY(ok)) return True; ok = dis_AdvSIMD_scalar_copy(dres, insn); if (UNLIKELY(ok)) return True; - ok = dis_AdvSIMD_scalar_pairwise(dres, insn); + ok = dis_AdvSIMD_scalar_pairwise(dres, insn, archinfo); if (UNLIKELY(ok)) return True; ok = dis_AdvSIMD_scalar_shift_by_imm(dres, insn); if (UNLIKELY(ok)) return True; @@ -15175,7 +15198,7 @@ Bool disInstr_ARM64_WRK ( break; case BITS4(0,1,1,1): case BITS4(1,1,1,1): // Data processing - SIMD and floating point - ok = dis_ARM64_simd_and_fp(dres, insn); + ok = dis_ARM64_simd_and_fp(dres, insn, archinfo); break; case BITS4(0,0,0,0): case BITS4(0,0,0,1): case 
BITS4(0,0,1,0): case BITS4(0,0,1,1): diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c index 526da570a5..e6b06e5fbe 100644 --- a/VEX/priv/host_arm64_defs.c +++ b/VEX/priv/host_arm64_defs.c @@ -600,6 +600,7 @@ static void showARM64VecBinOp(/*OUT*/const HChar** nm, case ARM64vecb_FMUL64x2: *nm = "fmul "; *ar = "2d"; return; case ARM64vecb_FDIV64x2: *nm = "fdiv "; *ar = "2d"; return; case ARM64vecb_FADD32x4: *nm = "fadd "; *ar = "4s"; return; + case ARM64vecb_FADD16x8: *nm = "fadd "; *ar = "8h"; return; case ARM64vecb_FSUB32x4: *nm = "fsub "; *ar = "4s"; return; case ARM64vecb_FMUL32x4: *nm = "fmul "; *ar = "4s"; return; case ARM64vecb_FDIV32x4: *nm = "fdiv "; *ar = "4s"; return; @@ -2869,6 +2870,7 @@ static inline UInt qregEnc ( HReg r ) #define X000010 BITS8(0,0, 0,0,0,0,1,0) #define X000011 BITS8(0,0, 0,0,0,0,1,1) #define X000100 BITS8(0,0, 0,0,0,1,0,0) +#define X000101 BITS8(0,0, 0,0,0,1,0,1) #define X000110 BITS8(0,0, 0,0,0,1,1,0) #define X000111 BITS8(0,0, 0,0,0,1,1,1) #define X001000 BITS8(0,0, 0,0,1,0,0,0) @@ -4831,6 +4833,9 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, case ARM64vecb_FADD32x4: *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X110101, vN, vD); break; + case ARM64vecb_FADD16x8: + *p++ = X_3_8_5_6_5_5(X010, X01110010, vM, X000101, vN, vD); + break; case ARM64vecb_FSUB64x2: *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X110101, vN, vD); break; diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h index 105d7ce843..8cece7b9c5 100644 --- a/VEX/priv/host_arm64_defs.h +++ b/VEX/priv/host_arm64_defs.h @@ -335,6 +335,7 @@ typedef ARM64vecb_MUL32x4, ARM64vecb_MUL16x8, ARM64vecb_MUL8x16, ARM64vecb_FADD64x2, ARM64vecb_FADD32x4, + ARM64vecb_FADD16x8, ARM64vecb_FSUB64x2, ARM64vecb_FSUB32x4, ARM64vecb_FMUL64x2, ARM64vecb_FMUL32x4, ARM64vecb_FDIV64x2, ARM64vecb_FDIV32x4, diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c index 1b8ad20a5a..c0464abf33 100644 --- a/VEX/priv/host_arm64_isel.c +++ 
b/VEX/priv/host_arm64_isel.c @@ -3157,6 +3157,7 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) case Iop_Sub32Fx4: vecbop = ARM64vecb_FSUB32x4; break; case Iop_Mul32Fx4: vecbop = ARM64vecb_FMUL32x4; break; case Iop_Div32Fx4: vecbop = ARM64vecb_FDIV32x4; break; + case Iop_Add16Fx8: vecbop = ARM64vecb_FADD16x8; break; default: break; } if (vecbop != ARM64vecb_INVALID) { diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c index 1359abb3f2..2734776f5b 100644 --- a/VEX/priv/ir_defs.c +++ b/VEX/priv/ir_defs.c @@ -640,6 +640,7 @@ void ppIROp ( IROp op ) case Iop_CmpNEZ16x4: vex_printf("CmpNEZ16x4"); return; case Iop_CmpNEZ8x8: vex_printf("CmpNEZ8x8"); return; + case Iop_Add16Fx8: vex_printf("Add16Fx8"); return; case Iop_Add32Fx4: vex_printf("Add32Fx4"); return; case Iop_Add32Fx2: vex_printf("Add32Fx2"); return; case Iop_Add32F0x4: vex_printf("Add32F0x4"); return; @@ -1546,6 +1547,7 @@ Bool primopMightTrap ( IROp op ) case Iop_DPBtoBCD: case Iop_BCDtoDPB: case Iop_BCDAdd: case Iop_BCDSub: case Iop_I128StoBCD128: case Iop_BCD128toI128S: case Iop_ReinterpI64asD64: case Iop_ReinterpD64asI64: + case Iop_Add16Fx8: case Iop_Add32Fx4: case Iop_Sub32Fx4: case Iop_Mul32Fx4: case Iop_Div32Fx4: case Iop_Max32Fx4: case Iop_Min32Fx4: case Iop_Add32Fx2: case Iop_Sub32Fx2: @@ -3760,6 +3762,7 @@ void typeOfPrimop ( IROp op, case Iop_Mul64Fx2: case Iop_Div64Fx2: case Iop_Add32Fx4: case Iop_Sub32Fx4: case Iop_Mul32Fx4: case Iop_Div32Fx4: + case Iop_Add16Fx8: case Iop_F64x2_2toQ32x4: case Iop_F32x4_2toQ16x8: TERNARY(ity_RMode,Ity_V128,Ity_V128, Ity_V128); diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index 6a854e43f1..00899e5335 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -1344,6 +1344,11 @@ typedef /* ------------------ 128-bit SIMD FP. 
------------------ */ + /* --- 16x8 vector FP --- */ + + /* ternary :: IRRoundingMode(I32) x V128 x V128 -> V128 */ + Iop_Add16Fx8, + /* --- 32x4 vector FP --- */ /* ternary :: IRRoundingMode(I32) x V128 x V128 -> V128 */ diff --git a/configure.ac b/configure.ac index 2b949ed84e..41ae942429 100755 --- a/configure.ac +++ b/configure.ac @@ -3240,6 +3240,31 @@ CFLAGS="$save_CFLAGS" AM_CONDITIONAL(BUILD_ARMV81_TESTS, test x$ac_have_armv81_feature = xyes) +# Does the C compiler support the armv82 flag and the assembler v8.2 instructions? +# Note, this doesn't generate a C-level symbol. It generates an +# automake-level symbol (BUILD_ARMV82_TESTS), used in test Makefile.am's +AC_MSG_CHECKING([if gcc supports the armv82 feature flag and assembler supports v8.2 instructions]) + +save_CFLAGS="$CFLAGS" +CFLAGS="$CFLAGS -march=armv8.2-a+fp16 -Werror" +AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ +int main() +{ + __asm__ __volatile__("faddp h0, v1.2h"); + return 0; +} +]])], [ +ac_have_armv82_feature=yes +AC_MSG_RESULT([yes]) +], [ +ac_have_armv82_feature=no +AC_MSG_RESULT([no]) +]) +CFLAGS="$save_CFLAGS" + +AM_CONDITIONAL(BUILD_ARMV82_TESTS, test x$ac_have_armv82_feature = xyes) + + # XXX JRS 2010 Oct 13: what is this for? For sure, we don't need this # when building the tool executables. I think we should get rid of it. # diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c index e91d510946..91f23ed60e 100644 --- a/memcheck/mc_translate.c +++ b/memcheck/mc_translate.c @@ -2692,6 +2692,23 @@ IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX ) return at; } +/* --- --- ... and ... 
16Fx8 versions of the same --- --- */ + +static +IRAtom* binary16Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY ) +{ + IRAtom* at; + tl_assert(isShadowAtom(mce, vatomX)); + tl_assert(isShadowAtom(mce, vatomY)); + at = mkUifUV128(mce, vatomX, vatomY); + at = assignNew('V', mce, Ity_V128, mkPCast16x8(mce, at)); + return at; +} + +/* TODO: remaining versions of 16x4 FP ops when more of the half-precision IR is + implemented. +*/ + /* --- --- ... and ... 32Fx2 versions of the same --- --- */ static @@ -2806,6 +2823,24 @@ IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, return t1; } +/* --- ... and ... 16Fx8 versions of the same --- */ + +static +IRAtom* binary16Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, + IRAtom* vatomX, IRAtom* vatomY ) +{ + IRAtom* t1 = binary16Fx8(mce, vatomX, vatomY); + // PCast the RM, and widen it to 128 bits + IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM); + // Roll it into the result + t1 = mkUifUV128(mce, t1, t2); + return t1; +} + +/* TODO: remaining versions of 16x4 FP ops when more of the half-precision IR is + implemented. +*/ + /* --- ... and ... 32Fx8 versions of the same --- */ static @@ -3393,6 +3428,12 @@ IRAtom* expr2vbits_Triop ( MCEnv* mce, case Iop_Div64Fx4: return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3); + /* TODO: remaining versions of 16x4 FP ops when more of the half-precision + IR is implemented. + */ + case Iop_Add16Fx8: + return binary16Fx8_w_rm(mce, vatom1, vatom2, vatom3); + case Iop_Add32Fx8: case Iop_Sub32Fx8: case Iop_Mul32Fx8: diff --git a/memcheck/tests/vbit-test/irops.c b/memcheck/tests/vbit-test/irops.c index 1f1ba909ac..39939b711d 100644 --- a/memcheck/tests/vbit-test/irops.c +++ b/memcheck/tests/vbit-test/irops.c @@ -613,6 +613,7 @@ static irop_t irops[] = { { DEFOP(Iop_ReinterpI64asD64, UNDEF_SAME), .s390x = 1, .ppc64 = 1, .ppc32 = 1 }, { DEFOP(Iop_ReinterpD64asI64, UNDEF_SAME), .s390x = 1, .ppc64 = 1, .ppc32 = 1 }, /* ------------------ 128-bit SIMD FP. 
------------------ */ + { DEFOP(Iop_Add16Fx8, UNDEF_UNKNOWN), }, { DEFOP(Iop_Add32Fx4, UNDEF_UNKNOWN), }, { DEFOP(Iop_Sub32Fx4, UNDEF_UNKNOWN), }, { DEFOP(Iop_Mul32Fx4, UNDEF_UNKNOWN), }, diff --git a/none/tests/arm64/Makefile.am b/none/tests/arm64/Makefile.am index 4ecab36add..00cbfa52c1 100644 --- a/none/tests/arm64/Makefile.am +++ b/none/tests/arm64/Makefile.am @@ -11,7 +11,8 @@ EXTRA_DIST = \ memory.stdout.exp memory.stderr.exp memory.vgtest \ atomics_v81.stdout.exp atomics_v81.stderr.exp atomics_v81.vgtest \ simd_v81.stdout.exp simd_v81.stderr.exp simd_v81.vgtest \ - fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest + fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest \ + fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp fp_and_simd_v82.vgtest check_PROGRAMS = \ allexec \ @@ -29,6 +30,10 @@ if BUILD_ARMV81_TESTS check_PROGRAMS += atomics_v81 simd_v81 endif +if BUILD_ARMV82_TESTS + check_PROGRAMS += fp_and_simd_v82 +endif + AM_CFLAGS += @FLAG_M64@ AM_CXXFLAGS += @FLAG_M64@ AM_CCASFLAGS += @FLAG_M64@ @@ -39,7 +44,9 @@ crc32_CFLAGS = $(AM_CFLAGS) -march=armv8-a+crc atomics_v81_CFLAGS = $(AM_CFLAGS) -march=armv8.1-a simd_v81_CFLAGS = $(AM_CFLAGS) -march=armv8.1-a+crypto fp_and_simd_CFLAGS = $(AM_CFLAGS) -march=armv8-a+crypto +fp_and_simd_v82_CFLAGS = $(AM_CFLAGS) -march=armv8.2-a+fp16+crypto integer_CFLAGS = $(AM_CFLAGS) -g -O0 -DTEST_BFM=0 fp_and_simd_LDADD = -lm simd_v81_LDADD = -lm +fp_and_simd_v82_LDADD = -lm diff --git a/none/tests/arm64/fp_and_simd_v82.c b/none/tests/arm64/fp_and_simd_v82.c new file mode 100644 index 0000000000..8c66ff27bd --- /dev/null +++ b/none/tests/arm64/fp_and_simd_v82.c @@ -0,0 +1,2285 @@ + +#include <stdio.h> +#include <assert.h> +#include <malloc.h> // memalign +#include <string.h> // memset +#include "tests/malloc.h" +#include <math.h> // isnormal + +typedef unsigned char UChar; +typedef unsigned short int UShort; +typedef unsigned int UInt; +typedef signed int Int; +typedef unsigned char UChar; +typedef 
unsigned long long int ULong; +typedef signed long long int Long; +typedef double Double; +typedef float Float; +/* To test half-precision floating point instructions a synthesized 16 bit type + is used rather than native __fp16. This allows gradual support of v8.2 + instructions without test binaries like this failing to run with Valgrind + because a half-precision instruction which is not supported appears in the + test binary. The functions halfToSingleFPAsInt() and shortToSingle() below + are used to create a Float16 type for testing purposes. Float16 should be + typedefed to __fp16 when all v8.2 instructions are supported. +*/ +typedef unsigned short int Float16; + +typedef unsigned char Bool; +#define False ((Bool)0) +#define True ((Bool)1) + + +#define ITERS 1 + +typedef + enum { TyHF=1234, TySF, TyDF, TyB, TyH, TyS, TyD, TyNONE } + LaneTy; + +union _V128 { + UChar u8[16]; + UShort u16[8]; + UInt u32[4]; + ULong u64[2]; + Float16 f16[8]; + Float f32[4]; + Double f64[2]; +}; +typedef union _V128 V128; + +/* Conversion based on IEEE half-precision, as described in the IEEE 754-2008 + standard and Arm Reference Manual 'A1.4.2 Half-precision floating-point + formats' where hardware capability supports __fp16 (VEX_HWCAPS_ARM64_FP16 + and VEX_HWCAPS_ARM64_VFP16 set). 
+*/ +static UInt halfToSingleFPAsInt(UShort y) +{ + int s = (y >> 15) & 0x00000001; // Sign bit + int e = (y >> 10) & 0x0000001f; // Exponent + int f = y & 0x000003ff; // Fraction + + // Handle +/-0 and subnormals (e == 0), then +/-INF (7c00, fc00) and NaN (e == 31) + if (e == 0) { + if (f == 0) + return s << 31; + else { // Normalize + while (!(f & 0x00000400)) { + f <<= 1; + e -= 1; + } + e += 1; + f &= ~0x00000400; + } + } else if (e == 31) { + if (f == 0) // INF + return (s << 31) | 0x7f800000; + else // NaN + return (s << 31) | 0x7f800000 | (f << 13); + } + + e = e + (127 - 15); + f = f << 13; + + return ((s << 31) | (e << 23) | f); +} + +static float shortToSingle(UShort imm) +{ + union { float f; UInt i; } v; + v.i = halfToSingleFPAsInt(imm); + return v.f; +} + +static inline UChar randUChar ( void ) +{ + static UInt seed = 80021; + seed = 1103515245 * seed + 12345; + return (seed >> 17) & 0xFF; +} + +static ULong randULong ( LaneTy ty ) +{ + Int i; + ULong r = 0; + for (i = 0; i < 8; i++) { + r = (r << 8) | (ULong)(0xFF & randUChar()); + } + return r; +} + +/* Generates a random V128. Ensures that it contains normalised FP numbers + when viewed as either F16x8, F32x4 or F64x2, so that it is reasonable to use + in FP test cases. 
*/ +static void randV128 ( /*OUT*/V128* v, LaneTy ty ) +{ + static UInt nCalls = 0, nIters = 0; + Int i; + nCalls++; + while (1) { + nIters++; + for (i = 0; i < 16; i++) { + v->u8[i] = randUChar(); + } + if (isnormal(v->f32[0]) && isnormal(v->f32[1]) && isnormal(v->f32[2]) + && isnormal(v->f32[3]) && isnormal(v->f64[0]) && isnormal(v->f64[1]) + && isnormal(shortToSingle(v->f16[0])) && isnormal(shortToSingle(v->f16[1])) + && isnormal(shortToSingle(v->f16[2])) && isnormal(shortToSingle(v->f16[3])) + && isnormal(shortToSingle(v->f16[4])) && isnormal(shortToSingle(v->f16[5])) + && isnormal(shortToSingle(v->f16[6])) && isnormal(shortToSingle(v->f16[7]))) { + break; + } + } + if (0 == (nCalls & 0xFF)) + printf("randV128: %u calls, %u iters\n", nCalls, nIters); +} + +static void showV128 ( V128* v ) +{ + Int i; + for (i = 15; i >= 0; i--) + printf("%02x", (Int)v->u8[i]); +} + +static void showBlock ( const char* msg, V128* block, Int nBlock ) +{ + Int i; + printf("%s\n", msg); + for (i = 0; i < nBlock; i++) { + printf(" "); + showV128(&block[i]); + printf("\n"); + } +} + +static ULong dup4x16 ( UInt x ) +{ + ULong r = x & 0xF; + r |= (r << 4); + r |= (r << 8); + r |= (r << 16); + r |= (r << 32); + return r; +} + +// Generate a random double-precision number. About 1 time in 2, +// instead return a special value (+/- Inf, +/-Nan, denorm). +// This ensures that many of the groups of 4 calls here will +// return a special value. 
+ +static Double special_values[10]; +static Bool special_values_initted = False; + +static __attribute__((noinline)) +Double negate ( Double d ) { return -d; } +static __attribute__((noinline)) +Double divf64 ( Double x, Double y ) { return x/y; } + +static __attribute__((noinline)) +Double plusZero ( void ) { return 0.0; } +static __attribute__((noinline)) +Double minusZero ( void ) { return negate(plusZero()); } + +static __attribute__((noinline)) +Double plusOne ( void ) { return 1.0; } +static __attribute__((noinline)) +Double minusOne ( void ) { return negate(plusOne()); } + +static __attribute__((noinline)) +Double plusInf ( void ) { return 1.0 / 0.0; } +static __attribute__((noinline)) +Double minusInf ( void ) { return negate(plusInf()); } + +static __attribute__((noinline)) +Double plusNaN ( void ) { return divf64(plusInf(),plusInf()); } +static __attribute__((noinline)) +Double minusNaN ( void ) { return negate(plusNaN()); } + +static __attribute__((noinline)) +Double plusDenorm ( void ) { return 1.23e-315 / 1e3; } +static __attribute__((noinline)) +Double minusDenorm ( void ) { return negate(plusDenorm()); } + + +static void ensure_special_values_initted ( void ) +{ + if (special_values_initted) return; + special_values[0] = plusZero(); + special_values[1] = minusZero(); + special_values[2] = plusOne(); + special_values[3] = minusOne(); + special_values[4] = plusInf(); + special_values[5] = minusInf(); + special_values[6] = plusNaN(); + special_values[7] = minusNaN(); + special_values[8] = plusDenorm(); + special_values[9] = minusDenorm(); + special_values_initted = True; + int i; + printf("\n"); + for (i = 0; i < 10; i++) { + printf("special value %d = %e\n", i, special_values[i]); + } + printf("\n"); +} + +static Double randDouble ( void ) +{ + ensure_special_values_initted(); + UChar c = randUChar(); + if (c >= 128) { + // return a normal number about half of the time. + // 0 .. 2^63-1 + ULong u64 = randULong(TyDF); + // -2^62 .. 
2^62-1 + Long s64 = (Long)u64; + // -2^55 .. 2^55-1 + s64 >>= (62-55); + // and now as a float + return (Double)s64; + } + c = randUChar() % 10; + return special_values[c]; +} + +static Float randFloat ( void ) +{ + ensure_special_values_initted(); + UChar c = randUChar(); + if (c >= 128) { + // return a normal number about half of the time. + // 0 .. 2^63-1 + ULong u64 = randULong(TyDF); + // -2^62 .. 2^62-1 + Long s64 = (Long)u64; + // -2^25 .. 2^25-1 + s64 >>= (62-25); + // and now as a float + return (Float)s64; + } + c = randUChar() % 10; + return special_values[c]; +} + +void randBlock_Doubles ( V128* block, Int nBlock ) +{ + Int i; + for (i = 0; i < nBlock; i++) { + block[i].f64[0] = randDouble(); + block[i].f64[1] = randDouble(); + } +} + +void randBlock_Floats ( V128* block, Int nBlock ) +{ + Int i; + for (i = 0; i < nBlock; i++) { + block[i].f32[0] = randFloat(); + block[i].f32[1] = randFloat(); + block[i].f32[2] = randFloat(); + block[i].f32[3] = randFloat(); + } +} + + +/* ---------------------------------------------------------------- */ +/* -- Parameterisable test macros -- */ +/* ---------------------------------------------------------------- */ + +#define DO50(_action) \ + do { \ + Int _qq; for (_qq = 0; _qq < 50; _qq++) { _action ; } \ + } while (0) + + +/* Note this also sets the destination register to a known value (0x55..55) + since it can sometimes be an input to the instruction too. */ +#define GEN_UNARY_TEST(INSN,SUFFIXD,SUFFIXN) \ + __attribute__((noinline)) \ + static void test_##INSN##_##SUFFIXD##_##SUFFIXN ( LaneTy ty ) { \ + Int i; \ + for (i = 0; i < ITERS; i++) { \ + V128 block[2+1]; \ + memset(block, 0x55, sizeof(block)); \ + randV128(&block[0], ty); \ + randV128(&block[1], ty); \ + __asm__ __volatile__( \ + "mov x30, #0 ; msr fpsr, x30 ; " \ + "ldr q7, [%0, #0] ; " \ + "ldr q8, [%0, #16] ; " \ + #INSN " v8." #SUFFIXD ", v7." 
#SUFFIXN " ; " \ + "str q8, [%0, #16] ; " \ + "mrs x30, fpsr ; str x30, [%0, #32] " \ + : : "r"(&block[0]) : "memory", "v7", "v8", "x30" \ + ); \ + printf(#INSN " v8." #SUFFIXD ", v7." #SUFFIXN); \ + UInt fpsr = 0xFFFFFF60 & block[2].u32[0]; \ + showV128(&block[0]); printf(" "); \ + showV128(&block[1]); printf(" fpsr=%08x\n", fpsr); \ + } \ + } + + +/* Note this also sets the destination register to a known value (0x55..55) + since it can sometimes be an input to the instruction too. */ +#define GEN_BINARY_TEST(INSN,SUFFIXD,SUFFIXN,SUFFIXM) \ + __attribute__((noinline)) \ + static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_##SUFFIXM ( LaneTy ty ) { \ + Int i; \ + for (i = 0; i < ITERS; i++) { \ + V128 block[3+1]; \ + memset(block, 0x55, sizeof(block)); \ + randV128(&block[0], ty); \ + randV128(&block[1], ty); \ + randV128(&block[2], ty); \ + __asm__ __volatile__( \ + "mov x30, #0 ; msr fpsr, x30 ; " \ + "ldr q7, [%0, #0] ; " \ + "ldr q8, [%0, #16] ; " \ + "ldr q9, [%0, #32] ; " \ + #INSN " v9." #SUFFIXD ", v7." #SUFFIXN ", v8." #SUFFIXM " ; " \ + "str q9, [%0, #32] ; " \ + "mrs x30, fpsr ; str x30, [%0, #48] " \ + : : "r"(&block[0]) : "memory", "v7", "v8", "v9", "x30" \ + ); \ + printf(#INSN " v9." #SUFFIXD \ + ", v7." #SUFFIXN ", v8." #SUFFIXM " "); \ + UInt fpsr = 0xFFFFFF60 & block[3].u32[0]; \ + showV128(&block[0]); printf(" "); \ + showV128(&block[1]); printf(" "); \ + showV128(&block[2]); printf(" fpsr=%08x\n", fpsr); \ + } \ + } + + +/* Note this also sets the destination register to a known value (0x55..55) + since it can sometimes be an input to the instruction too. 
*/ +#define GEN_SHIFT_TEST(INSN,SUFFIXD,SUFFIXN,AMOUNT) \ + __attribute__((noinline)) \ + static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_##AMOUNT ( LaneTy ty ) { \ + Int i; \ + for (i = 0; i < ITERS; i++) { \ + V128 block[2+1]; \ + memset(block, 0x55, sizeof(block)); \ + randV128(&block[0], ty); \ + randV128(&block[1], ty); \ + __asm__ __volatile__( \ + "mov x30, #0 ; msr fpsr, x30 ; " \ + "ldr q7, [%0, #0] ; " \ + "ldr q8, [%0, #16] ; " \ + #INSN " v8." #SUFFIXD ", v7." #SUFFIXN ", #" #AMOUNT " ; " \ + "str q8, [%0, #16] ; " \ + "mrs x30, fpsr ; str x30, [%0, #32] " \ + : : "r"(&block[0]) : "memory", "v7", "v8", "x30" \ + ); \ + printf(#INSN " v8." #SUFFIXD ", v7." #SUFFIXN ", #" #AMOUNT " "); \ + UInt fpsr = 0xFFFFFF60 & block[2].u32[0]; \ + showV128(&block[0]); printf(" "); \ + showV128(&block[1]); printf(" fpsr=%08x\n", fpsr); \ + } \ + } + + +/* Generate a test that involves one integer reg and one vector reg, + with no bias as towards which is input or output. */ +#define GEN_ONEINT_ONEVEC_TEST(TESTNAME,INSN,INTREGNO,VECREGNO) \ + __attribute__((noinline)) \ + static void test_##TESTNAME ( LaneTy ty ) { \ + Int i; \ + assert(INTREGNO != 30); \ + for (i = 0; i < ITERS; i++) { \ + V128 block[4+1]; \ + memset(block, 0x55, sizeof(block)); \ + randV128(&block[0], ty); \ + randV128(&block[1], ty); \ + randV128(&block[2], ty); \ + randV128(&block[3], ty); \ + __asm__ __volatile__( \ + "mov x30, #0 ; msr fpsr, x30 ; " \ + "ldr q"#VECREGNO", [%0, #0] ; " \ + "ldr x"#INTREGNO", [%0, #16] ; " \ + INSN " ; " \ + "str q"#VECREGNO", [%0, #32] ; " \ + "str x"#INTREGNO", [%0, #48] ; " \ + "mrs x30, fpsr ; str x30, [%0, #64] " \ + : : "r"(&block[0]) : "memory", "v"#VECREGNO, "x"#INTREGNO, "x30" \ + ); \ + printf(INSN " "); \ + UInt fpsr = 0xFFFFFF60 & block[4].u32[0]; \ + showV128(&block[0]); printf(" "); \ + showV128(&block[1]); printf(" "); \ + showV128(&block[2]); printf(" "); \ + showV128(&block[3]); printf(" fpsr=%08x\n", fpsr); \ + } \ + } + + +/* Generate a test 
that involves two vector regs, + with no bias as towards which is input or output. + It's OK to use x10 as scratch.*/ +#define GEN_TWOVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO) \ + __attribute__((noinline)) \ + static void test_##TESTNAME ( LaneTy ty ) { \ + Int i; \ + for (i = 0; i < ITERS; i++) { \ + V128 block[4+1]; \ + memset(block, 0x55, sizeof(block)); \ + randV128(&block[0], ty); \ + randV128(&block[1], ty); \ + randV128(&block[2], ty); \ + randV128(&block[3], ty); \ + __asm__ __volatile__( \ + "mov x30, #0 ; msr fpsr, x30 ; " \ + "ldr q"#VECREG1NO", [%0, #0] ; " \ + "ldr q"#VECREG2NO", [%0, #16] ; " \ + INSN " ; " \ + "str q"#VECREG1NO", [%0, #32] ; " \ + "str q"#VECREG2NO", [%0, #48] ; " \ + "mrs x30, fpsr ; str x30, [%0, #64] " \ + : : "r"(&block[0]) \ + : "memory", "v"#VECREG1NO, "v"#VECREG2NO, "x10", "x30" \ + ); \ + printf(INSN " "); \ + UInt fpsr = 0xFFFFFF60 & block[4].u32[0]; \ + showV128(&block[0]); printf(" "); \ + showV128(&block[1]); printf(" "); \ + showV128(&block[2]); printf(" "); \ + showV128(&block[3]); printf(" fpsr=%08x\n", fpsr); \ + } \ + } + + +/* Generate a test that involves three vector regs, + with no bias as towards which is input or output. It's also OK + to use v16, v17, v18 as scratch. 
*/ +#define GEN_THREEVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO,VECREG3NO) \ + __attribute__((noinline)) \ + static void test_##TESTNAME ( LaneTy ty ) { \ + Int i; \ + for (i = 0; i < ITERS; i++) { \ + V128 block[6+1]; \ + memset(block, 0x55, sizeof(block)); \ + randV128(&block[0], ty); \ + randV128(&block[1], ty); \ + randV128(&block[2], ty); \ + randV128(&block[3], ty); \ + randV128(&block[4], ty); \ + randV128(&block[5], ty); \ + __asm__ __volatile__( \ + "mov x30, #0 ; msr fpsr, x30 ; " \ + "ldr q"#VECREG1NO", [%0, #0] ; " \ + "ldr q"#VECREG2NO", [%0, #16] ; " \ + "ldr q"#VECREG3NO", [%0, #32] ; " \ + INSN " ; " \ + "str q"#VECREG1NO", [%0, #48] ; " \ + "str q"#VECREG2NO", [%0, #64] ; " \ + "str q"#VECREG3NO", [%0, #80] ; " \ + "mrs x30, fpsr ; str x30, [%0, #96] " \ + : : "r"(&block[0]) \ + : "memory", "v"#VECREG1NO, "v"#VECREG2NO, "v"#VECREG3NO, \ + "v16", "v17", "v18", "x30" \ + ); \ + printf(INSN " "); \ + UInt fpsr = 0xFFFFFF60 & block[6].u32[0]; \ + showV128(&block[0]); printf(" "); \ + showV128(&block[1]); printf(" "); \ + showV128(&block[2]); printf(" "); \ + showV128(&block[3]); printf(" "); \ + showV128(&block[4]); printf(" "); \ + showV128(&block[5]); printf(" fpsr=%08x\n", fpsr); \ + } \ + } + + +/* Generate a test that involves four vector regs, + with no bias as towards which is input or output. It's also OK + to use v16, v17, v18 as scratch. 
*/ +#define GEN_FOURVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO, \ + VECREG3NO,VECREG4NO) \ + __attribute__((noinline)) \ + static void test_##TESTNAME ( LaneTy ty ) { \ + Int i; \ + for (i = 0; i < ITERS; i++) { \ + V128 block[8+1]; \ + memset(block, 0x55, sizeof(block)); \ + randV128(&block[0], ty); \ + randV128(&block[1], ty); \ + randV128(&block[2], ty); \ + randV128(&block[3], ty); \ + randV128(&block[4], ty); \ + randV128(&block[5], ty); \ + randV128(&block[6], ty); \ + randV128(&block[7], ty); \ + __asm__ __volatile__( \ + "mov x30, #0 ; msr fpsr, x30 ; " \ + "ldr q"#VECREG1NO", [%0, #0] ; " \ + "ldr q"#VECREG2NO", [%0, #16] ; " \ + "ldr q"#VECREG3NO", [%0, #32] ; " \ + "ldr q"#VECREG4NO", [%0, #48] ; " \ + INSN " ; " \ + "str q"#VECREG1NO", [%0, #64] ; " \ + "str q"#VECREG2NO", [%0, #80] ; " \ + "str q"#VECREG3NO", [%0, #96] ; " \ + "str q"#VECREG4NO", [%0, #112] ; " \ + "mrs x30, fpsr ; str x30, [%0, #128] " \ + : : "r"(&block[0]) \ + : "memory", "v"#VECREG1NO, "v"#VECREG2NO, \ + "v"#VECREG3NO, "v"#VECREG4NO, \ + "v16", "v17", "v18", "x30" \ + ); \ + printf(INSN " "); \ + UInt fpsr = 0xFFFFFF60 & block[8].u32[0]; \ + showV128(&block[0]); printf(" "); \ + showV128(&block[1]); printf(" "); \ + showV128(&block[2]); printf(" "); \ + showV128(&block[3]); printf(" "); \ + showV128(&block[4]); printf(" "); \ + showV128(&block[5]); printf(" "); \ + showV128(&block[6]); printf(" "); \ + showV128(&block[7]); printf(" fpsr=%08x\n", fpsr); \ + } \ + } + + +/* ---------------------------------------------------------------- */ +/* -- Test functions and non-parameterisable test macros -- */ +/* ---------------------------------------------------------------- */ + +void test_UMINV ( void ) +{ + int i; + V128 block[2]; + + /* -- 4s -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyS); + randV128(&block[1], TyS); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "uminv s8, v7.4s ; " + "str q8, [%0, #16] " + : : 
"r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMINV v8, v7.4s "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 8h -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyH); + randV128(&block[1], TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "uminv h8, v7.8h ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMINV h8, v7.8h "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 4h -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyH); + randV128(&block[1], TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "uminv h8, v7.4h ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMINV h8, v7.4h "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 16b -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyB); + randV128(&block[1], TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "uminv b8, v7.16b ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMINV b8, v7.16b "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 8b -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyB); + randV128(&block[1], TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "uminv b8, v7.8b ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMINV b8, v7.8b "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + +} + + +void test_UMAXV ( void ) +{ + int i; + V128 block[2]; + + /* -- 4s -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyS); + randV128(&block[1], TyS); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "umaxv 
s8, v7.4s ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMAXV v8, v7.4s "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 8h -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyH); + randV128(&block[1], TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "umaxv h8, v7.8h ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMAXV h8, v7.8h "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 4h -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyH); + randV128(&block[1], TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "umaxv h8, v7.4h ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMAXV h8, v7.4h "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 16b -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyB); + randV128(&block[1], TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "umaxv b8, v7.16b ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMAXV b8, v7.16b "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 8b -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyB); + randV128(&block[1], TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "umaxv b8, v7.8b ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("UMAXV b8, v7.8b "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + +} + + +void test_INS_general ( void ) +{ + V128 block[3]; + + /* -- D[0..1] -- */ + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyD); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, 
[%0, #16] ; " + "ins v7.d[0], x19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.u64[0],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyD); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.d[1], x19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.d[1],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + /* -- S[0..3] -- */ + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyS); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.s[0], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.s[0],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyS); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.s[1], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.s[1],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyS); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.s[2], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.s[2],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyS); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.s[3], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", 
"x19", "v7" + ); + printf("INS v7.s[3],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + /* -- H[0..7] -- */ + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.h[0], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.h[0],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.h[1], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.h[1],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.h[2], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.h[2],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.h[3], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.h[3],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.h[4], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.h[4],x19 "); + showV128(&block[0]); printf(" %016llx ", 
block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.h[5], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.h[5],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.h[6], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.h[6],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.h[7], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.h[7],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + /* -- B[0,15] -- */ + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.b[0], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.b[0],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); + + memset(&block, 0x55, sizeof(block)); + block[1].u64[0] = randULong(TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "ldr x19, [%0, #16] ; " + "ins v7.b[15], w19 ; " + "str q7, [%0, #32] " + : : "r"(&block[0]) : "memory", "x19", "v7" + ); + printf("INS v7.b[15],x19 "); + showV128(&block[0]); printf(" %016llx ", block[1].u64[0]); + showV128(&block[2]); printf("\n"); +} + + + +void test_SMINV ( void ) +{ 
+ int i; + V128 block[2]; + + /* -- 4s -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyS); + randV128(&block[1], TyS); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "sminv s8, v7.4s ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMINV v8, v7.4s "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 8h -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyH); + randV128(&block[1], TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "sminv h8, v7.8h ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMINV h8, v7.8h "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 4h -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyH); + randV128(&block[1], TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "sminv h8, v7.4h ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMINV h8, v7.4h "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 16b -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyB); + randV128(&block[1], TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "sminv b8, v7.16b ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMINV b8, v7.16b "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 8b -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyB); + randV128(&block[1], TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "sminv b8, v7.8b ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMINV b8, v7.8b "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); 
+ } + +} + + +void test_SMAXV ( void ) +{ + int i; + V128 block[2]; + + /* -- 4s -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyS); + randV128(&block[1], TyS); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "smaxv s8, v7.4s ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMAXV v8, v7.4s "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 8h -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyH); + randV128(&block[1], TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "smaxv h8, v7.8h ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMAXV h8, v7.8h "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 4h -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyH); + randV128(&block[1], TyH); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "smaxv h8, v7.4h ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMAXV h8, v7.4h "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 16b -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyB); + randV128(&block[1], TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "smaxv b8, v7.16b ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMAXV b8, v7.16b "); + showV128(&block[0]); printf(" "); + showV128(&block[1]); printf("\n"); + } + + /* -- 8b -- */ + + for (i = 0; i < 10; i++) { + memset(&block, 0x55, sizeof(block)); + randV128(&block[0], TyB); + randV128(&block[1], TyB); + __asm__ __volatile__( + "ldr q7, [%0, #0] ; " + "smaxv b8, v7.8b ; " + "str q8, [%0, #16] " + : : "r"(&block[0]) : "memory", "v7", "v8" + ); + printf("SMAXV b8, v7.8b "); + showV128(&block[0]); 
printf(" "); + showV128(&block[1]); printf("\n"); + } + +} + + +//======== FCCMP_D ========// + +#define GEN_test_FCCMP_D_D_0xF_EQ \ + __attribute__((noinline)) static void test_FCCMP_D_D_0xF_EQ ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_D_D_0xF_EQ before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmp d29, d11, #0xf, eq; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMP_D_D_0xF_EQ after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMP_D_D_0xF_NE \ + __attribute__((noinline)) static void test_FCCMP_D_D_0xF_NE ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_D_D_0xF_NE before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmp d29, d11, #0xf, ne; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMP_D_D_0xF_NE after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMP_D_D_0x0_EQ \ + __attribute__((noinline)) static void test_FCCMP_D_D_0x0_EQ ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_D_D_0x0_EQ before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmp d29, d11, #0x0, eq; " \ + "mrs x9, nzcv; str x9, 
[%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMP_D_D_0x0_EQ after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMP_D_D_0x0_NE \ + __attribute__((noinline)) static void test_FCCMP_D_D_0x0_NE ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_D_D_0x0_NE before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmp d29, d11, #0x0, ne; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMP_D_D_0x0_NE after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCCMP_S ========// + +#define GEN_test_FCCMP_S_S_0xF_EQ \ + __attribute__((noinline)) static void test_FCCMP_S_S_0xF_EQ ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_S_S_0xF_EQ before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmp s29, s11, #0xf, eq; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMP_S_S_0xF_EQ after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMP_S_S_0xF_NE \ + __attribute__((noinline)) static void test_FCCMP_S_S_0xF_NE ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_S_S_0xF_NE before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr 
x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmp s29, s11, #0xf, ne; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMP_S_S_0xF_NE after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMP_S_S_0x0_EQ \ + __attribute__((noinline)) static void test_FCCMP_S_S_0x0_EQ ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_S_S_0x0_EQ before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmp s29, s11, #0x0, eq; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMP_S_S_0x0_EQ after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMP_S_S_0x0_NE \ + __attribute__((noinline)) static void test_FCCMP_S_S_0x0_NE ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_S_S_0x0_NE before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmp s29, s11, #0x0, ne; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMP_S_S_0x0_NE after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCCMPE_D ========// + +#define GEN_test_FCCMPE_D_D_0xF_EQ \ + __attribute__((noinline)) static void test_FCCMPE_D_D_0xF_EQ ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); 
\ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMPE_D_D_0xF_EQ before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmpe d29, d11, #0xf, eq; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMPE_D_D_0xF_EQ after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMPE_D_D_0xF_NE \ + __attribute__((noinline)) static void test_FCCMPE_D_D_0xF_NE ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMPE_D_D_0xF_NE before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmpe d29, d11, #0xf, ne; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMPE_D_D_0xF_NE after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMPE_D_D_0x0_EQ \ + __attribute__((noinline)) static void test_FCCMPE_D_D_0x0_EQ ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMPE_D_D_0x0_EQ before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmpe d29, d11, #0x0, eq; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMPE_D_D_0x0_EQ after", &block[0], 4); \ + printf("\n"); \ + } + +#define 
GEN_test_FCCMPE_D_D_0x0_NE \ + __attribute__((noinline)) static void test_FCCMPE_D_D_0x0_NE ( void ) \ + { \ + V128 block[4]; \ + randBlock_Doubles(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMPE_D_D_0x0_NE before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmpe d29, d11, #0x0, ne; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMPE_D_D_0x0_NE after", &block[0], 4); \ + printf("\n"); \ + } + +//======== FCCMPE_S ========// + +#define GEN_test_FCCMPE_S_S_0xF_EQ \ + __attribute__((noinline)) static void test_FCCMPE_S_S_0xF_EQ ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMP_S_S_0xF_EQ before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmpe s29, s11, #0xf, eq; " \ + "mrs x9, nzcv; str x9, [%0, 48]; " \ + "str q29, [%0, #0]; str q11, [%0, #16]; str q9, [%0, #32]; " \ + ::"r"(&block[0]) : "x9","cc","memory","v9","v11","v29" \ + ); \ + showBlock("FCCMPE_S_S_0xF_EQ after", &block[0], 4); \ + printf("\n"); \ + } + +#define GEN_test_FCCMPE_S_S_0xF_NE \ + __attribute__((noinline)) static void test_FCCMPE_S_S_0xF_NE ( void ) \ + { \ + V128 block[4]; \ + randBlock_Floats(&block[0], 3); \ + block[3].u64[0] = dup4x16(0x5); block[3].u64[1] = dup4x16(0xA); \ + showBlock("FCCMPE_S_S_0xF_NE before", &block[0], 4); \ + __asm__ __volatile__( \ + "ldr x9, [%0, 48]; msr nzcv, x9; " \ + "ldr q29, [%0, #0]; ldr q11, [%0, #16]; ldr q9, [%0, #32]; " \ + "fccmpe s29, ... [truncated message content] |