You can subscribe to this list here.
| 2002 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
(1) |
Oct
(122) |
Nov
(152) |
Dec
(69) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2003 |
Jan
(6) |
Feb
(25) |
Mar
(73) |
Apr
(82) |
May
(24) |
Jun
(25) |
Jul
(10) |
Aug
(11) |
Sep
(10) |
Oct
(54) |
Nov
(203) |
Dec
(182) |
| 2004 |
Jan
(307) |
Feb
(305) |
Mar
(430) |
Apr
(312) |
May
(187) |
Jun
(342) |
Jul
(487) |
Aug
(637) |
Sep
(336) |
Oct
(373) |
Nov
(441) |
Dec
(210) |
| 2005 |
Jan
(385) |
Feb
(480) |
Mar
(636) |
Apr
(544) |
May
(679) |
Jun
(625) |
Jul
(810) |
Aug
(838) |
Sep
(634) |
Oct
(521) |
Nov
(965) |
Dec
(543) |
| 2006 |
Jan
(494) |
Feb
(431) |
Mar
(546) |
Apr
(411) |
May
(406) |
Jun
(322) |
Jul
(256) |
Aug
(401) |
Sep
(345) |
Oct
(542) |
Nov
(308) |
Dec
(481) |
| 2007 |
Jan
(427) |
Feb
(326) |
Mar
(367) |
Apr
(255) |
May
(244) |
Jun
(204) |
Jul
(223) |
Aug
(231) |
Sep
(354) |
Oct
(374) |
Nov
(497) |
Dec
(362) |
| 2008 |
Jan
(322) |
Feb
(482) |
Mar
(658) |
Apr
(422) |
May
(476) |
Jun
(396) |
Jul
(455) |
Aug
(267) |
Sep
(280) |
Oct
(253) |
Nov
(232) |
Dec
(304) |
| 2009 |
Jan
(486) |
Feb
(470) |
Mar
(458) |
Apr
(423) |
May
(696) |
Jun
(461) |
Jul
(551) |
Aug
(575) |
Sep
(134) |
Oct
(110) |
Nov
(157) |
Dec
(102) |
| 2010 |
Jan
(226) |
Feb
(86) |
Mar
(147) |
Apr
(117) |
May
(107) |
Jun
(203) |
Jul
(193) |
Aug
(238) |
Sep
(300) |
Oct
(246) |
Nov
(23) |
Dec
(75) |
| 2011 |
Jan
(133) |
Feb
(195) |
Mar
(315) |
Apr
(200) |
May
(267) |
Jun
(293) |
Jul
(353) |
Aug
(237) |
Sep
(278) |
Oct
(611) |
Nov
(274) |
Dec
(260) |
| 2012 |
Jan
(303) |
Feb
(391) |
Mar
(417) |
Apr
(441) |
May
(488) |
Jun
(655) |
Jul
(590) |
Aug
(610) |
Sep
(526) |
Oct
(478) |
Nov
(359) |
Dec
(372) |
| 2013 |
Jan
(467) |
Feb
(226) |
Mar
(391) |
Apr
(281) |
May
(299) |
Jun
(252) |
Jul
(311) |
Aug
(352) |
Sep
(481) |
Oct
(571) |
Nov
(222) |
Dec
(231) |
| 2014 |
Jan
(185) |
Feb
(329) |
Mar
(245) |
Apr
(238) |
May
(281) |
Jun
(399) |
Jul
(382) |
Aug
(500) |
Sep
(579) |
Oct
(435) |
Nov
(487) |
Dec
(256) |
| 2015 |
Jan
(338) |
Feb
(357) |
Mar
(330) |
Apr
(294) |
May
(191) |
Jun
(108) |
Jul
(142) |
Aug
(261) |
Sep
(190) |
Oct
(54) |
Nov
(83) |
Dec
(22) |
| 2016 |
Jan
(49) |
Feb
(89) |
Mar
(33) |
Apr
(50) |
May
(27) |
Jun
(34) |
Jul
(53) |
Aug
(53) |
Sep
(98) |
Oct
(206) |
Nov
(93) |
Dec
(53) |
| 2017 |
Jan
(65) |
Feb
(82) |
Mar
(102) |
Apr
(86) |
May
(187) |
Jun
(67) |
Jul
(23) |
Aug
(93) |
Sep
(65) |
Oct
(45) |
Nov
(35) |
Dec
(17) |
| 2018 |
Jan
(26) |
Feb
(35) |
Mar
(38) |
Apr
(32) |
May
(8) |
Jun
(43) |
Jul
(27) |
Aug
(30) |
Sep
(43) |
Oct
(42) |
Nov
(38) |
Dec
(67) |
| 2019 |
Jan
(32) |
Feb
(37) |
Mar
(53) |
Apr
(64) |
May
(49) |
Jun
(18) |
Jul
(14) |
Aug
(53) |
Sep
(25) |
Oct
(30) |
Nov
(49) |
Dec
(31) |
| 2020 |
Jan
(87) |
Feb
(45) |
Mar
(37) |
Apr
(51) |
May
(99) |
Jun
(36) |
Jul
(11) |
Aug
(14) |
Sep
(20) |
Oct
(24) |
Nov
(40) |
Dec
(23) |
| 2021 |
Jan
(14) |
Feb
(53) |
Mar
(85) |
Apr
(15) |
May
(19) |
Jun
(3) |
Jul
(14) |
Aug
(1) |
Sep
(57) |
Oct
(73) |
Nov
(56) |
Dec
(22) |
| 2022 |
Jan
(3) |
Feb
(22) |
Mar
(6) |
Apr
(55) |
May
(46) |
Jun
(39) |
Jul
(15) |
Aug
(9) |
Sep
(11) |
Oct
(34) |
Nov
(20) |
Dec
(36) |
| 2023 |
Jan
(79) |
Feb
(41) |
Mar
(99) |
Apr
(169) |
May
(48) |
Jun
(16) |
Jul
(16) |
Aug
(57) |
Sep
(19) |
Oct
|
Nov
|
Dec
|
| S | M | T | W | T | F | S |
|---|---|---|---|---|---|---|
|
|
|
|
|
|
|
1
|
|
2
(1) |
3
|
4
(1) |
5
|
6
(3) |
7
|
8
|
|
9
|
10
(1) |
11
(1) |
12
|
13
|
14
|
15
|
|
16
|
17
(1) |
18
(2) |
19
(1) |
20
|
21
|
22
|
|
23
(1) |
24
(2) |
25
(1) |
26
|
27
|
28
(1) |
29
|
|
30
|
31
|
|
|
|
|
|
|
From: Ashley, W. <wa...@am...> - 2023-07-28 17:04:29
|
https://bugs.kde.org/show_bug.cgi?id=460616 This adds support for the FEAT_DotProd instructions that are optional in arm v8.2 and 8.3 -a profiles, then mandatory in v8.4+ (at least that's my understanding from ARM's docs). I hope this is the correct mechanism to submit a change. I don't have a sourceware account so I can't push it to a user branch there and reference that. commit fd75f20fa461b326c4f5734b8dd001ad0661e58f Author: William Ashley <wa...@am...> Date: Thu Jul 27 14:49:17 2023 +0000 Bug 460616 - Add support for aarch64 dotprod instructions This change adds support for the FEAT_DotProd instructions SDOT <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.4B[<index>] SDOT <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.<Tb> UDOT <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.4B[<index>] UDOT <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.<Tb> diff --git a/.gitignore b/.gitignore index 6538eb718..c39f03b9a 100644 --- a/.gitignore +++ b/.gitignore @@ -1710,6 +1710,7 @@ /none/tests/arm64/fp_and_simd_v82 /none/tests/arm64/integer /none/tests/arm64/memory +/none/tests/arm64/simd_dotprod /none/tests/arm64/simd_v81 # /none/tests/darwin/ diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c index 16a7e075f..9fe164483 100644 --- a/VEX/priv/guest_arm64_toIR.c +++ b/VEX/priv/guest_arm64_toIR.c @@ -9113,6 +9113,21 @@ IRTemp math_RHADD ( UInt size, Bool isU, IRTemp aa, IRTemp bb ) } +/* Generate IR to do {U,S}ADDLP */ +static +IRTemp math_ADDLP ( UInt sizeNarrow, Bool isU, IRTemp src ) +{ + IRTemp res = newTempV128(); + assign(res, + binop(mkVecADD(sizeNarrow+1), + mkexpr(math_WIDEN_EVEN_OR_ODD_LANES( + isU, True/*fromOdd*/, sizeNarrow, mkexpr(src))), + mkexpr(math_WIDEN_EVEN_OR_ODD_LANES( + isU, False/*!fromOdd*/, sizeNarrow, mkexpr(src))))); + return res; +} + + /* QCFLAG tracks the SIMD sticky saturation status. Update the status thusly: if, after application of |opZHI| to both |qres| and |nres|, they have the same value, leave QCFLAG unchanged. 
Otherwise, set it @@ -13406,12 +13421,7 @@ Bool dis_AdvSIMD_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn) IRTemp sum = newTempV128(); IRTemp res = newTempV128(); assign(src, getQReg128(nn)); - assign(sum, - binop(mkVecADD(size+1), - mkexpr(math_WIDEN_EVEN_OR_ODD_LANES( - isU, True/*fromOdd*/, size, mkexpr(src))), - mkexpr(math_WIDEN_EVEN_OR_ODD_LANES( - isU, False/*!fromOdd*/, size, mkexpr(src))))); + sum = math_ADDLP(size, isU, src); assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(sum), getQReg128(dd)) : mkexpr(sum)); putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res)); @@ -15692,6 +15702,91 @@ Bool dis_AdvSIMD_fp_to_from_int_conv(/*MB_OUT*/DisResult* dres, UInt insn) } +static +Bool dis_AdvSIMD_dot_product(/*MB_OUT*/DisResult* dres, UInt insn) +{ + /* by element + 31 30 29 28 23 21 20 15 11 10 9 4 + 0 Q U 01111 size L m 1110 H 0 n d + vector + 31 30 29 28 23 21 20 15 11 10 9 4 + 0 Q U 01110 size 0 m 1001 0 1 n d + */ +# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin)) + if (INSN(31,31) != 0) { + return False; + } + UInt bitQ = INSN(30,30); + UInt bitU = INSN(29,29); + UInt opcode1 = INSN(28,24); + UInt size = INSN(23,22); + UInt bitL = INSN(21,21); + UInt mm = INSN(20,16); + UInt opcode2 = INSN(15,12); + UInt bitH = INSN(11,11); + UInt opcode3 = INSN(10,10); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + UInt index = (bitH << 1) + bitL; + vassert(index <= 3); + + Bool byElement; + if (opcode1 == BITS5(0,1,1,1,1) + && opcode2 == BITS4(1,1,1,0) + && opcode3 == 0) { + byElement = True; + } else if (opcode1 == BITS5(0,1,1,1,0) + && opcode2 == BITS4(1,0,0,1) + && opcode3 == 1 + && bitL == 0 && bitH == 0) { + byElement = False; + } else { + return False; + } + + // '10' is the only valid size + if (size != X10) return False; + + IRExpr* src1 = math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(nn)); + IRExpr* src2 = getQReg128(mm); + if (byElement) { + src2 = mkexpr(math_DUP_VEC_ELEM(src2, X10, index)); + } + + IROp mulOp = bitU ? 
Iop_Mull8Ux8 : Iop_Mull8Sx8; + IRTemp loProductSums = math_ADDLP( + X01, bitU, math_BINARY_WIDENING_V128(False, mulOp, src1, src2)); + IRTemp hiProductSums = math_ADDLP( + X01, bitU, math_BINARY_WIDENING_V128(True, mulOp, src1, src2)); + + IRTemp res = newTempV128(); + assign(res, binop(Iop_Add32x4, + mk_CatEvenLanes32x4(hiProductSums, loProductSums), + mk_CatOddLanes32x4(hiProductSums, loProductSums))); + + // These instructions accumulate into the destination, but in non-q + // form the upper 64 bits get forced to 0 + IRExpr* accVal = math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(dd)); + putQReg128(dd, binop(mkVecADD(size), mkexpr(res), accVal)); + + const HChar* nm = bitU ? "udot" : "sdot"; + const HChar* destWidth = nameArr_Q_SZ(bitQ, size); + const HChar* srcWidth = nameArr_Q_SZ(bitQ, X00); + if (byElement) { + DIP("%s v%u.%s, v%u.%s, v%u.4b[%u]\n", nm, + dd, destWidth, + nn, srcWidth, mm, index); + } else { + DIP("%s v%u.%s, v%u.%s, v%u.%s\n", nm, + dd, destWidth, + nn, srcWidth, mm, srcWidth); + } + + return True; +# undef INSN +} + + static Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn, const VexArchInfo* archinfo, Bool sigill_diag) @@ -15767,6 +15862,8 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn, if (UNLIKELY(ok)) return True; ok = dis_AdvSIMD_fp_to_from_int_conv(dres, insn); if (UNLIKELY(ok)) return True; + ok = dis_AdvSIMD_dot_product(dres, insn); + if (UNLIKELY(ok)) return True; return False; } diff --git a/configure.ac b/configure.ac index b4e9c1142..7a2da9d7c 100755 --- a/configure.ac +++ b/configure.ac @@ -3730,6 +3730,31 @@ CFLAGS="$save_CFLAGS" AM_CONDITIONAL(BUILD_ARMV82_TESTS, test x$ac_have_armv82_feature = xyes) +# Does the C compiler support the armv8.2-a+dotprod flag and assembler dotprod instructions +# Note, this doesn't generate a C-level symbol.
It generates an +# automake-level symbol (BUILD_ARMV82_DOTPROD_TESTS), used in test Makefile.am's +AC_MSG_CHECKING([if gcc supports the armv8.2-a+dotprod feature flag and assembler supports dotprod instructions]) + +save_CFLAGS="$CFLAGS" +CFLAGS="$CFLAGS -march=armv8.2-a+dotprod -Werror" +AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ +int main() +{ + __asm__ __volatile__("sdot v1.4s, v2.16b, v3.16b"); + return 0; +} +]])], [ +ac_have_armv82_dotprod_feature=yes +AC_MSG_RESULT([yes]) +], [ +ac_have_armv82_dotprod_feature=no +AC_MSG_RESULT([no]) +]) +CFLAGS="$save_CFLAGS" + +AM_CONDITIONAL(BUILD_ARMV82_DOTPROD_TESTS, test x$ac_have_armv82_dotprod_feature = xyes) + + # XXX JRS 2010 Oct 13: what is this for? For sure, we don't need this # when building the tool executables. I think we should get rid of it. # diff --git a/none/tests/arm64/Makefile.am b/none/tests/arm64/Makefile.am index 4a06f0996..cc0ed1481 100644 --- a/none/tests/arm64/Makefile.am +++ b/none/tests/arm64/Makefile.am @@ -11,6 +11,7 @@ EXTRA_DIST = \ memory.stdout.exp memory.stderr.exp memory.vgtest \ atomics_v81.stdout.exp atomics_v81.stderr.exp atomics_v81.vgtest \ simd_v81.stdout.exp simd_v81.stderr.exp simd_v81.vgtest \ + simd_dotprod.stdout.exp simd_dotprod.stderr.exp simd_dotprod.vgtest \ fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest \ fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp \ fp_and_simd_v82.vgtest \ @@ -40,6 +41,10 @@ if BUILD_ARMV82_TESTS check_PROGRAMS += fp_and_simd_v82 endif +if BUILD_ARMV82_DOTPROD_TESTS + check_PROGRAMS += simd_dotprod +endif + AM_CFLAGS += @FLAG_M64@ AM_CXXFLAGS += @FLAG_M64@ AM_CCASFLAGS += @FLAG_M64@ @@ -49,6 +54,7 @@ allexec_CFLAGS = $(AM_CFLAGS) @FLAG_W_NO_NONNULL@ crc32_CFLAGS = $(AM_CFLAGS) -march=armv8-a+crc atomics_v81_CFLAGS = $(AM_CFLAGS) -march=armv8.1-a simd_v81_CFLAGS = $(AM_CFLAGS) -march=armv8.1-a+crypto +simd_dotprod_CFLAGS = $(AM_CFLAGS) -march=armv8.2-a+dotprod fp_and_simd_CFLAGS = $(AM_CFLAGS) -march=armv8-a+crypto fp_and_simd_v82_CFLAGS = 
$(AM_CFLAGS) -march=armv8.2-a+fp16+crypto integer_CFLAGS = $(AM_CFLAGS) -g -O0 -DTEST_BFM=0 diff --git a/none/tests/arm64/simd_dotprod.c b/none/tests/arm64/simd_dotprod.c new file mode 100644 index 000000000..ca67da551 --- /dev/null +++ b/none/tests/arm64/simd_dotprod.c @@ -0,0 +1,110 @@ +#include <stdio.h> +#include <assert.h> + +typedef unsigned char UChar; +typedef unsigned int UInt; +typedef signed int Int; + +#define ITERS 1 + +union _V128 { + UChar u8[16]; +}; +typedef union _V128 V128; + +static inline UChar randUChar ( void ) +{ + static UInt seed = 80021; + seed = 1103515245 * seed + 12345; + return (seed >> 17) & 0xFF; +} + +/* Generates a random V128. */ +static void randV128 ( /*OUT*/V128* v) +{ + static UInt nCalls = 0; + Int i; + nCalls++; + for (i = 0; i < 16; i++) { + v->u8[i] = randUChar(); + } + if (0 == (nCalls & 0xFF)) + printf("randV128: %u calls\n", nCalls); +} + +static void showV128 ( V128* v ) +{ + Int i; + for (i = 15; i >= 0; i--) + printf("%02x", (Int)v->u8[i]); +} + +#define GEN_BINARY_TEST_BODY(INSN,SUFFIXD,SUFFIXN,SUFFIXM) \ + Int i; \ + for (i = 0; i < ITERS; i++) { \ + V128 block[3]; \ + randV128(&block[0]); \ + randV128(&block[1]); \ + randV128(&block[2]); \ + __asm__ __volatile__( \ + "ldr q7, [%0, #0];" \ + "ldr q8, [%0, #16];" \ + "ldr q9, [%0, #32];" \ + #INSN " v9." #SUFFIXD ", v7." #SUFFIXN ", v8." SUFFIXM " ; " \ + "str q9, [%0, #32];" \ + : : "r"(&block[0]) : "memory", "v7", "v8", "v9" \ + ); \ + printf(#INSN " v9." #SUFFIXD \ + ", v7." #SUFFIXN ", v8." 
SUFFIXM " "); \ + showV128(&block[0]); printf(" "); \ + showV128(&block[1]); printf(" "); \ + showV128(&block[2]); printf("\n"); \ + } \ + +#define GEN_BINARY_TEST_BY_ELEM(INSN,SUFFIXD,SUFFIXN,MELEM) \ + __attribute__((noinline)) \ + static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_elem_##MELEM () { \ + GEN_BINARY_TEST_BODY(INSN,SUFFIXD,SUFFIXN,"4b[" #MELEM "]") \ + } + +#define GEN_BINARY_TEST(INSN,SUFFIXD,SUFFIXN,SUFFIXM) \ + __attribute__((noinline)) \ + static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_##SUFFIXM () { \ + GEN_BINARY_TEST_BODY(INSN,SUFFIXD,SUFFIXN,#SUFFIXM) \ + } + +GEN_BINARY_TEST(sdot, 2s, 8b, 8b) +GEN_BINARY_TEST(udot, 2s, 8b, 8b) +GEN_BINARY_TEST(sdot, 4s, 16b, 16b) +GEN_BINARY_TEST(udot, 4s, 16b, 16b) +GEN_BINARY_TEST_BY_ELEM(sdot, 2s, 8b, 0) +GEN_BINARY_TEST_BY_ELEM(udot, 2s, 8b, 1) +GEN_BINARY_TEST_BY_ELEM(sdot, 4s, 16b, 2) +GEN_BINARY_TEST_BY_ELEM(udot, 4s, 16b, 3) + +int main ( void ) +{ + assert(sizeof(V128) == 16); + + // ======================== {S,U}DOT by element ==================== + // sdot 2s,8b,4b[0] + // udot 2s,8b,4b[1] + // sdot 4s,16b,4b[2] + // udot 4s,16b,4b[3] + test_sdot_2s_8b_elem_0(); + test_udot_2s_8b_elem_1(); + test_sdot_4s_16b_elem_2(); + test_udot_4s_16b_elem_3(); + + // ======================== {S,U}DOT vector ======================== + // sdot 2s,8b,8b + // udot 2s,8b,8b + // sdot 4s,16b,16b + // udot 4s,16b,16b + test_sdot_2s_8b_8b(); + test_udot_2s_8b_8b(); + test_sdot_4s_16b_16b(); + test_udot_4s_16b_16b(); + + return 0; +} diff --git a/none/tests/arm64/simd_dotprod.stderr.exp b/none/tests/arm64/simd_dotprod.stderr.exp new file mode 100644 index 000000000..e69de29bb diff --git a/none/tests/arm64/simd_dotprod.stdout.exp b/none/tests/arm64/simd_dotprod.stdout.exp new file mode 100644 index 000000000..88724550d --- /dev/null +++ b/none/tests/arm64/simd_dotprod.stdout.exp @@ -0,0 +1,8 @@ +sdot v9.2s, v7.8b, v8.4b[0] 5175e39d19c9ca1e98f24a4984175700 7d6528c5fa956a0d69c3e9a6af27d13b 
000000000000000047b8fac3eeef3914 +udot v9.2s, v7.8b, v8.4b[1] b6d2fb5aa7bc5127fe9915e556a044b2 19a348215c3a67fd399182c2dbcc2d38 0000000000000000842c23cf5066b549 +sdot v9.4s, v7.16b, v8.4b[2] d89998df5035ed364a4bc43968bc40e5 cb509970b8136c85d740b80eb7839b97 f9dd31bff8c05f5456afd620b0ca1b30 +udot v9.4s, v7.16b, v8.4b[3] 5ff85bc9535c191fd3a727d1a705f65d d8bc5c6dee699597398e0039cf03663d 20a33823cbca1faf542f38453df87d2b +sdot v9.2s, v7.8b, v8.8b d182c916cebc2e17cfaff39be272ef40 6897b536bbe4da8a369dab4f9465b86e 0000000000000000f4e068450523c8a1 +udot v9.2s, v7.8b, v8.8b 95264321bf3b68b255c2b9e2c95c9810 81f2a547be8d181184ededbc53239dcf 00000000000000008d6b78e8f7e97e90 +sdot v9.4s, v7.16b, v8.16b f0350ca70523e0e45ba1ec54e87d39b3 0a3e0f7c75cb0842b95ed64d3b13ff64 e98e9eeaa89323fc54cac842e13de403 +udot v9.4s, v7.16b, v8.16b 0a5f45c55f1c9202b76ddefcb0ebfe6e c84ab713406845904d325b2d5a70a792 5f49643cced88b926263a4c2727e0a11 diff --git a/none/tests/arm64/simd_dotprod.vgtest b/none/tests/arm64/simd_dotprod.vgtest new file mode 100644 index 000000000..1997e64fa --- /dev/null +++ b/none/tests/arm64/simd_dotprod.vgtest @@ -0,0 +1,3 @@ +prog: simd_dotprod +prereq: test -x simd_dotprod && ../../../tests/arm64_features asimddp +vgopts: -q |