You can subscribe to this list here.
| 2002 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
(1) |
Oct
(122) |
Nov
(152) |
Dec
(69) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2003 |
Jan
(6) |
Feb
(25) |
Mar
(73) |
Apr
(82) |
May
(24) |
Jun
(25) |
Jul
(10) |
Aug
(11) |
Sep
(10) |
Oct
(54) |
Nov
(203) |
Dec
(182) |
| 2004 |
Jan
(307) |
Feb
(305) |
Mar
(430) |
Apr
(312) |
May
(187) |
Jun
(342) |
Jul
(487) |
Aug
(637) |
Sep
(336) |
Oct
(373) |
Nov
(441) |
Dec
(210) |
| 2005 |
Jan
(385) |
Feb
(480) |
Mar
(636) |
Apr
(544) |
May
(679) |
Jun
(625) |
Jul
(810) |
Aug
(838) |
Sep
(634) |
Oct
(521) |
Nov
(965) |
Dec
(543) |
| 2006 |
Jan
(494) |
Feb
(431) |
Mar
(546) |
Apr
(411) |
May
(406) |
Jun
(322) |
Jul
(256) |
Aug
(401) |
Sep
(345) |
Oct
(542) |
Nov
(308) |
Dec
(481) |
| 2007 |
Jan
(427) |
Feb
(326) |
Mar
(367) |
Apr
(255) |
May
(244) |
Jun
(204) |
Jul
(223) |
Aug
(231) |
Sep
(354) |
Oct
(374) |
Nov
(497) |
Dec
(362) |
| 2008 |
Jan
(322) |
Feb
(482) |
Mar
(658) |
Apr
(422) |
May
(476) |
Jun
(396) |
Jul
(455) |
Aug
(267) |
Sep
(280) |
Oct
(253) |
Nov
(232) |
Dec
(304) |
| 2009 |
Jan
(486) |
Feb
(470) |
Mar
(458) |
Apr
(423) |
May
(696) |
Jun
(461) |
Jul
(551) |
Aug
(575) |
Sep
(134) |
Oct
(110) |
Nov
(157) |
Dec
(102) |
| 2010 |
Jan
(226) |
Feb
(86) |
Mar
(147) |
Apr
(117) |
May
(107) |
Jun
(203) |
Jul
(193) |
Aug
(238) |
Sep
(300) |
Oct
(246) |
Nov
(23) |
Dec
(75) |
| 2011 |
Jan
(133) |
Feb
(195) |
Mar
(315) |
Apr
(200) |
May
(267) |
Jun
(293) |
Jul
(353) |
Aug
(237) |
Sep
(278) |
Oct
(611) |
Nov
(274) |
Dec
(260) |
| 2012 |
Jan
(303) |
Feb
(391) |
Mar
(417) |
Apr
(441) |
May
(488) |
Jun
(655) |
Jul
(590) |
Aug
(610) |
Sep
(526) |
Oct
(478) |
Nov
(359) |
Dec
(372) |
| 2013 |
Jan
(467) |
Feb
(226) |
Mar
(391) |
Apr
(281) |
May
(299) |
Jun
(252) |
Jul
(311) |
Aug
(352) |
Sep
(481) |
Oct
(571) |
Nov
(222) |
Dec
(231) |
| 2014 |
Jan
(185) |
Feb
(329) |
Mar
(245) |
Apr
(238) |
May
(281) |
Jun
(399) |
Jul
(382) |
Aug
(500) |
Sep
(579) |
Oct
(435) |
Nov
(487) |
Dec
(256) |
| 2015 |
Jan
(338) |
Feb
(357) |
Mar
(330) |
Apr
(294) |
May
(191) |
Jun
(108) |
Jul
(142) |
Aug
(261) |
Sep
(190) |
Oct
(54) |
Nov
(83) |
Dec
(22) |
| 2016 |
Jan
(49) |
Feb
(89) |
Mar
(33) |
Apr
(50) |
May
(27) |
Jun
(34) |
Jul
(53) |
Aug
(53) |
Sep
(98) |
Oct
(206) |
Nov
(93) |
Dec
(53) |
| 2017 |
Jan
(65) |
Feb
(82) |
Mar
(102) |
Apr
(86) |
May
(187) |
Jun
(67) |
Jul
(23) |
Aug
(93) |
Sep
(65) |
Oct
(45) |
Nov
(35) |
Dec
(17) |
| 2018 |
Jan
(26) |
Feb
(35) |
Mar
(38) |
Apr
(32) |
May
(8) |
Jun
(43) |
Jul
(27) |
Aug
(30) |
Sep
(43) |
Oct
(42) |
Nov
(38) |
Dec
(67) |
| 2019 |
Jan
(32) |
Feb
(37) |
Mar
(53) |
Apr
(64) |
May
(49) |
Jun
(18) |
Jul
(14) |
Aug
(53) |
Sep
(25) |
Oct
(30) |
Nov
(49) |
Dec
(31) |
| 2020 |
Jan
(87) |
Feb
(45) |
Mar
(37) |
Apr
(51) |
May
(99) |
Jun
(36) |
Jul
(11) |
Aug
(14) |
Sep
(20) |
Oct
(24) |
Nov
(40) |
Dec
(23) |
| 2021 |
Jan
(14) |
Feb
(53) |
Mar
(85) |
Apr
(15) |
May
(19) |
Jun
(3) |
Jul
(14) |
Aug
(1) |
Sep
(57) |
Oct
(73) |
Nov
(56) |
Dec
(22) |
| 2022 |
Jan
(3) |
Feb
(22) |
Mar
(6) |
Apr
(55) |
May
(46) |
Jun
(39) |
Jul
(15) |
Aug
(9) |
Sep
(11) |
Oct
(34) |
Nov
(20) |
Dec
(36) |
| 2023 |
Jan
(79) |
Feb
(41) |
Mar
(99) |
Apr
(169) |
May
(48) |
Jun
(16) |
Jul
(16) |
Aug
(57) |
Sep
(19) |
Oct
|
Nov
|
Dec
|
|
From: Ashley, W. <wa...@am...> - 2023-08-01 18:50:43
|
Sorry, I missed one file
diff --git a/coregrind/m_initimg/initimg-linux.c b/coregrind/m_initimg/initimg-linux.c
index 7a7d45335..7680baa8e 100644
--- a/coregrind/m_initimg/initimg-linux.c
+++ b/coregrind/m_initimg/initimg-linux.c
@@ -734,7 +734,8 @@ Addr setup_client_stack( void* init_sp,
| VKI_HWCAP_SHA2 \
| VKI_HWCAP_CRC32 \
| VKI_HWCAP_FP \
- | VKI_HWCAP_ASIMD)
+ | VKI_HWCAP_ASIMD \
+ | VKI_HWCAP_ASIMDDP)
auxv->u.a_val &= ARM64_SUPPORTED_HWCAP;
}
# endif
|
|
From: Ashley, W. <wa...@am...> - 2023-07-28 17:04:29
|
https://bugs.kde.org/show_bug.cgi?id=460616 This adds support for the FEAT_DotProd instructions that are optional in arm v8.2 and 8.3 -a profiles, then mandatory in v8.4+ (at least that's my understanding from ARM's docs). I hope this is the correct mechanism to submit a change. I don't have a sourceware account so I can't push it to a user branch there and reference that. commit fd75f20fa461b326c4f5734b8dd001ad0661e58f Author: William Ashley <wa...@am...> Date: Thu Jul 27 14:49:17 2023 +0000 Bug 460616 - Add support for aarch64 dotprod instructions This change adds support for the FEAT_DotProd instructions SDOT <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.4B[<index>] SDOT <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.<Tb> UDOT <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.4B[<index>] UDOT <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.<Tb> diff --git a/.gitignore b/.gitignore index 6538eb718..c39f03b9a 100644 --- a/.gitignore +++ b/.gitignore @@ -1710,6 +1710,7 @@ /none/tests/arm64/fp_and_simd_v82 /none/tests/arm64/integer /none/tests/arm64/memory +/none/tests/arm64/simd_dotprod /none/tests/arm64/simd_v81 # /none/tests/darwin/ diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c index 16a7e075f..9fe164483 100644 --- a/VEX/priv/guest_arm64_toIR.c +++ b/VEX/priv/guest_arm64_toIR.c @@ -9113,6 +9113,21 @@ IRTemp math_RHADD ( UInt size, Bool isU, IRTemp aa, IRTemp bb ) } +/* Generate IR to do {U,S}ADDLP */ +static +IRTemp math_ADDLP ( UInt sizeNarrow, Bool isU, IRTemp src ) +{ + IRTemp res = newTempV128(); + assign(res, + binop(mkVecADD(sizeNarrow+1), + mkexpr(math_WIDEN_EVEN_OR_ODD_LANES( + isU, True/*fromOdd*/, sizeNarrow, mkexpr(src))), + mkexpr(math_WIDEN_EVEN_OR_ODD_LANES( + isU, False/*!fromOdd*/, sizeNarrow, mkexpr(src))))); + return res; +} + + /* QCFLAG tracks the SIMD sticky saturation status. Update the status thusly: if, after application of |opZHI| to both |qres| and |nres|, they have the same value, leave QCFLAG unchanged. 
Otherwise, set it @@ -13406,12 +13421,7 @@ Bool dis_AdvSIMD_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn) IRTemp sum = newTempV128(); IRTemp res = newTempV128(); assign(src, getQReg128(nn)); - assign(sum, - binop(mkVecADD(size+1), - mkexpr(math_WIDEN_EVEN_OR_ODD_LANES( - isU, True/*fromOdd*/, size, mkexpr(src))), - mkexpr(math_WIDEN_EVEN_OR_ODD_LANES( - isU, False/*!fromOdd*/, size, mkexpr(src))))); + sum = math_ADDLP(size, isU, src); assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(sum), getQReg128(dd)) : mkexpr(sum)); putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res)); @@ -15692,6 +15702,91 @@ Bool dis_AdvSIMD_fp_to_from_int_conv(/*MB_OUT*/DisResult* dres, UInt insn) } +static +Bool dis_AdvSIMD_dot_product(/*MB_OUT*/DisResult* dres, UInt insn) +{ + /* by element + 31 30 29 28 23 21 20 15 11 10 9 4 + 0 Q U 01111 size L m 1110 H 0 n d + vector + 31 30 29 28 23 21 20 15 11 10 9 4 + 0 Q U 01110 size 0 m 1001 0 1 n d + */ +# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin)) + if (INSN(31,31) != 0) { + return False; + } + UInt bitQ = INSN(30,30); + UInt bitU = INSN(29,29); + UInt opcode1 = INSN(28,24); + UInt size = INSN(23,22); + UInt bitL = INSN(21,21); + UInt mm = INSN(20,16); + UInt opcode2 = INSN(15,12); + UInt bitH = INSN(11,11); + UInt opcode3 = INSN(10,10); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + UInt index = (bitH << 1) + bitL; + vassert(index <= 3); + + Bool byElement; + if (opcode1 == BITS5(0,1,1,1,1) + && opcode2 == BITS4(1,1,1,0) + && opcode3 == 0) { + byElement = True; + } else if (opcode1 == BITS5(0,1,1,1,0) + && opcode2 == BITS4(1,0,0,1) + && opcode3 == 1 + && bitL == 0 && bitH == 0) { + byElement = False; + } else { + return False; + } + + // '10' is the only valid size + if (size != X10) return False; + + IRExpr* src1 = math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(nn)); + IRExpr* src2 = getQReg128(mm); + if (byElement) { + src2 = mkexpr(math_DUP_VEC_ELEM(src2, X10, index)); + } + + IROp mulOp = bitU ? 
Iop_Mull8Ux8 : Iop_Mull8Sx8; + IRTemp loProductSums = math_ADDLP( + X01, bitU, math_BINARY_WIDENING_V128(False, mulOp, src1, src2)); + IRTemp hiProductSums = math_ADDLP( + X01, bitU, math_BINARY_WIDENING_V128(True, mulOp, src1, src2)); + + IRTemp res = newTempV128(); + assign(res, binop(Iop_Add32x4, + mk_CatEvenLanes32x4(hiProductSums, loProductSums), + mk_CatOddLanes32x4(hiProductSums, loProductSums))); + + // These instructions accumulate into the destination, but in non-q + // form the upper 64 bits get forced to 0 + IRExpr* accVal = math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(dd)); + putQReg128(dd, binop(mkVecADD(size), mkexpr(res), accVal)); + + const HChar* nm = bitU ? "udot" : "sdot"; + const HChar* destWidth = nameArr_Q_SZ(bitQ, size); + const HChar* srcWidth = nameArr_Q_SZ(bitQ, X00); + if (byElement) { + DIP("%s v%u.%s, v%u.%s, v%u.4b[%u]\n", nm, + dd, destWidth, + nn, srcWidth, mm, index); + } else { + DIP("%s v%u.%s, v%u.%s, v%u.%s\n", nm, + dd, destWidth, + nn, srcWidth, mm, srcWidth); + } + + return True; +# undef INSN +} + + static Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn, const VexArchInfo* archinfo, Bool sigill_diag) @@ -15767,6 +15862,8 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn, if (UNLIKELY(ok)) return True; ok = dis_AdvSIMD_fp_to_from_int_conv(dres, insn); if (UNLIKELY(ok)) return True; + ok = dis_AdvSIMD_dot_product(dres, insn); + if (UNLIKELY(ok)) return True; return False; } diff --git a/configure.ac b/configure.ac index b4e9c1142..7a2da9d7c 100755 --- a/configure.ac +++ b/configure.ac @@ -3730,6 +3730,31 @@ CFLAGS="$save_CFLAGS" AM_CONDITIONAL(BUILD_ARMV82_TESTS, test x$ac_have_armv82_feature = xyes) +# Does the C compiler support the armv82-a+dotprod flag and assembler dotprod instructions +# Note, this doesn't generate a C-level symbol. 
It generates a +# automake-level symbol (BUILD_ARMV82_DOTPROD_TESTS), used in test Makefile.am's +AC_MSG_CHECKING([if gcc supports the armv82-a+dotprod feature flag and assembler supports dotprod instructions]) + +save_CFLAGS="$CFLAGS" +CFLAGS="$CFLAGS -march=armv8.2-a+dotprod -Werror" +AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ +int main() +{ + __asm__ __volatile__("sdot v1.4s, v2.16b, v3.16b"); + return 0; +} +]])], [ +ac_have_armv82_dotprod_feature=yes +AC_MSG_RESULT([yes]) +], [ +ac_have_armv82_dotprod_feature=no +AC_MSG_RESULT([no]) +]) +CFLAGS="$save_CFLAGS" + +AM_CONDITIONAL(BUILD_ARMV82_DOTPROD_TESTS, test x$ac_have_armv82_dotprod_feature = xyes) + + # XXX JRS 2010 Oct 13: what is this for? For sure, we don't need this # when building the tool executables. I think we should get rid of it. # diff --git a/none/tests/arm64/Makefile.am b/none/tests/arm64/Makefile.am index 4a06f0996..cc0ed1481 100644 --- a/none/tests/arm64/Makefile.am +++ b/none/tests/arm64/Makefile.am @@ -11,6 +11,7 @@ EXTRA_DIST = \ memory.stdout.exp memory.stderr.exp memory.vgtest \ atomics_v81.stdout.exp atomics_v81.stderr.exp atomics_v81.vgtest \ simd_v81.stdout.exp simd_v81.stderr.exp simd_v81.vgtest \ + simd_dotprod.stdout.exp simd_dotprod.stderr.exp simd_dotprod.vgtest \ fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest \ fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp \ fp_and_simd_v82.vgtest \ @@ -40,6 +41,10 @@ if BUILD_ARMV82_TESTS check_PROGRAMS += fp_and_simd_v82 endif +if BUILD_ARMV82_DOTPROD_TESTS + check_PROGRAMS += simd_dotprod +endif + AM_CFLAGS += @FLAG_M64@ AM_CXXFLAGS += @FLAG_M64@ AM_CCASFLAGS += @FLAG_M64@ @@ -49,6 +54,7 @@ allexec_CFLAGS = $(AM_CFLAGS) @FLAG_W_NO_NONNULL@ crc32_CFLAGS = $(AM_CFLAGS) -march=armv8-a+crc atomics_v81_CFLAGS = $(AM_CFLAGS) -march=armv8.1-a simd_v81_CFLAGS = $(AM_CFLAGS) -march=armv8.1-a+crypto +simd_dotprod_CFLAGS = $(AM_CFLAGS) -march=armv8.2-a+dotprod fp_and_simd_CFLAGS = $(AM_CFLAGS) -march=armv8-a+crypto fp_and_simd_v82_CFLAGS = 
$(AM_CFLAGS) -march=armv8.2-a+fp16+crypto integer_CFLAGS = $(AM_CFLAGS) -g -O0 -DTEST_BFM=0 diff --git a/none/tests/arm64/simd_dotprod.c b/none/tests/arm64/simd_dotprod.c new file mode 100644 index 000000000..ca67da551 --- /dev/null +++ b/none/tests/arm64/simd_dotprod.c @@ -0,0 +1,110 @@ +#include <stdio.h> +#include <assert.h> + +typedef unsigned char UChar; +typedef unsigned int UInt; +typedef signed int Int; + +#define ITERS 1 + +union _V128 { + UChar u8[16]; +}; +typedef union _V128 V128; + +static inline UChar randUChar ( void ) +{ + static UInt seed = 80021; + seed = 1103515245 * seed + 12345; + return (seed >> 17) & 0xFF; +} + +/* Generates a random V128. */ +static void randV128 ( /*OUT*/V128* v) +{ + static UInt nCalls = 0; + Int i; + nCalls++; + for (i = 0; i < 16; i++) { + v->u8[i] = randUChar(); + } + if (0 == (nCalls & 0xFF)) + printf("randV128: %u calls\n", nCalls); +} + +static void showV128 ( V128* v ) +{ + Int i; + for (i = 15; i >= 0; i--) + printf("%02x", (Int)v->u8[i]); +} + +#define GEN_BINARY_TEST_BODY(INSN,SUFFIXD,SUFFIXN,SUFFIXM) \ + Int i; \ + for (i = 0; i < ITERS; i++) { \ + V128 block[3]; \ + randV128(&block[0]); \ + randV128(&block[1]); \ + randV128(&block[2]); \ + __asm__ __volatile__( \ + "ldr q7, [%0, #0];" \ + "ldr q8, [%0, #16];" \ + "ldr q9, [%0, #32];" \ + #INSN " v9." #SUFFIXD ", v7." #SUFFIXN ", v8." SUFFIXM " ; " \ + "str q9, [%0, #32];" \ + : : "r"(&block[0]) : "memory", "v7", "v8", "v9" \ + ); \ + printf(#INSN " v9." #SUFFIXD \ + ", v7." #SUFFIXN ", v8." 
SUFFIXM " "); \ + showV128(&block[0]); printf(" "); \ + showV128(&block[1]); printf(" "); \ + showV128(&block[2]); printf("\n"); \ + } \ + +#define GEN_BINARY_TEST_BY_ELEM(INSN,SUFFIXD,SUFFIXN,MELEM) \ + __attribute__((noinline)) \ + static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_elem_##MELEM () { \ + GEN_BINARY_TEST_BODY(INSN,SUFFIXD,SUFFIXN,"4b[" #MELEM "]") \ + } + +#define GEN_BINARY_TEST(INSN,SUFFIXD,SUFFIXN,SUFFIXM) \ + __attribute__((noinline)) \ + static void test_##INSN##_##SUFFIXD##_##SUFFIXN##_##SUFFIXM () { \ + GEN_BINARY_TEST_BODY(INSN,SUFFIXD,SUFFIXN,#SUFFIXM) \ + } + +GEN_BINARY_TEST(sdot, 2s, 8b, 8b) +GEN_BINARY_TEST(udot, 2s, 8b, 8b) +GEN_BINARY_TEST(sdot, 4s, 16b, 16b) +GEN_BINARY_TEST(udot, 4s, 16b, 16b) +GEN_BINARY_TEST_BY_ELEM(sdot, 2s, 8b, 0) +GEN_BINARY_TEST_BY_ELEM(udot, 2s, 8b, 1) +GEN_BINARY_TEST_BY_ELEM(sdot, 4s, 16b, 2) +GEN_BINARY_TEST_BY_ELEM(udot, 4s, 16b, 3) + +int main ( void ) +{ + assert(sizeof(V128) == 16); + + // ======================== {S,U}DOT by element ==================== + // sdot 2s,8b,4b[0] + // udot 2s,8b,4b[1] + // sdot 4s,16b,4b[2] + // udot 4s,16b,4b[3] + test_sdot_2s_8b_elem_0(); + test_udot_2s_8b_elem_1(); + test_sdot_4s_16b_elem_2(); + test_udot_4s_16b_elem_3(); + + // ======================== {S,U}DOT vector ======================== + // sdot 2s,8b,8b + // udot 2s,8b,8b + // sdot 4s,16b,16b + // udot 4s,16b,16b + test_sdot_2s_8b_8b(); + test_udot_2s_8b_8b(); + test_sdot_4s_16b_16b(); + test_udot_4s_16b_16b(); + + return 0; +} diff --git a/none/tests/arm64/simd_dotprod.stderr.exp b/none/tests/arm64/simd_dotprod.stderr.exp new file mode 100644 index 000000000..e69de29bb diff --git a/none/tests/arm64/simd_dotprod.stdout.exp b/none/tests/arm64/simd_dotprod.stdout.exp new file mode 100644 index 000000000..88724550d --- /dev/null +++ b/none/tests/arm64/simd_dotprod.stdout.exp @@ -0,0 +1,8 @@ +sdot v9.2s, v7.8b, v8.4b[0] 5175e39d19c9ca1e98f24a4984175700 7d6528c5fa956a0d69c3e9a6af27d13b 
000000000000000047b8fac3eeef3914 +udot v9.2s, v7.8b, v8.4b[1] b6d2fb5aa7bc5127fe9915e556a044b2 19a348215c3a67fd399182c2dbcc2d38 0000000000000000842c23cf5066b549 +sdot v9.4s, v7.16b, v8.4b[2] d89998df5035ed364a4bc43968bc40e5 cb509970b8136c85d740b80eb7839b97 f9dd31bff8c05f5456afd620b0ca1b30 +udot v9.4s, v7.16b, v8.4b[3] 5ff85bc9535c191fd3a727d1a705f65d d8bc5c6dee699597398e0039cf03663d 20a33823cbca1faf542f38453df87d2b +sdot v9.2s, v7.8b, v8.8b d182c916cebc2e17cfaff39be272ef40 6897b536bbe4da8a369dab4f9465b86e 0000000000000000f4e068450523c8a1 +udot v9.2s, v7.8b, v8.8b 95264321bf3b68b255c2b9e2c95c9810 81f2a547be8d181184ededbc53239dcf 00000000000000008d6b78e8f7e97e90 +sdot v9.4s, v7.16b, v8.16b f0350ca70523e0e45ba1ec54e87d39b3 0a3e0f7c75cb0842b95ed64d3b13ff64 e98e9eeaa89323fc54cac842e13de403 +udot v9.4s, v7.16b, v8.16b 0a5f45c55f1c9202b76ddefcb0ebfe6e c84ab713406845904d325b2d5a70a792 5f49643cced88b926263a4c2727e0a11 diff --git a/none/tests/arm64/simd_dotprod.vgtest b/none/tests/arm64/simd_dotprod.vgtest new file mode 100644 index 000000000..1997e64fa --- /dev/null +++ b/none/tests/arm64/simd_dotprod.vgtest @@ -0,0 +1,3 @@ +prog: simd_dotprod +prereq: test -x simd_dotprod && ../../../tests/arm64_features asimddp +vgopts: -q |
|
From: Petr P. <pet...@da...> - 2023-07-25 19:55:29
|
On 17. Jul 23 15:05, Jojo R wrote: > Hi, > > Sorry for the late reply, > > i have been pushing the progress of valgrind RVV implementation 😄 > We finished the first version and tested with full RVV intrinsics spec. > > For real project and developers, we implement the first useable/ full > functionality's RVV valgrind with dirtycall method, > and we will make experiment or optimize RVV implementation on ideal RVV > design. > > Back to the RVV RFC, we are happy to share our thinking of design, see > attachment for more details :) This is a good summary. As mentioned in another part of the thread, I think that in long run it will be indeed needed to implement the approach described as "RVV to variable-length IR". I hope to help with making sure it can work for Arm SVE too. I guess if initial experiments show that this option is hard and will take time to implement then it could make sense in short term for the RISC-V port to go with the "RVV to dirty helper" implementation. Thanks, Petr |
|
From: Paul F. <pa...@so...> - 2023-07-24 20:10:41
|
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=6ce0979884a8f246c80a098333ceef1a7b7f694d commit 6ce0979884a8f246c80a098333ceef1a7b7f694d Author: Paul Floyd <pj...@wa...> Date: Mon Jul 24 22:06:00 2023 +0200 Bug 472219 - Syscall param ppoll(ufds.events) points to uninitialised byte(s) Add checks that (p)poll fd is not negative. If it is negative, don't check the events field. Diff: --- .gitignore | 1 + NEWS | 1 + coregrind/m_syswrap/syswrap-freebsd.c | 12 ++++++------ coregrind/m_syswrap/syswrap-generic.c | 6 ++++-- coregrind/m_syswrap/syswrap-linux.c | 6 ++++-- coregrind/m_syswrap/syswrap-solaris.c | 5 +++-- memcheck/tests/Makefile.am | 4 ++++ memcheck/tests/bug472219.c | 16 ++++++++++++++++ memcheck/tests/bug472219.stderr.exp | 0 memcheck/tests/bug472219.vgtest | 2 ++ memcheck/tests/freebsd/scalar.c | 12 +++++++++--- memcheck/tests/freebsd/scalar.stderr.exp | 23 +++++++++++++++++++---- memcheck/tests/freebsd/scalar.stderr.exp-x86 | 23 +++++++++++++++++++---- memcheck/tests/solaris/scalar.stderr.exp | 4 ---- memcheck/tests/x86-linux/scalar.stderr.exp | 5 ----- 15 files changed, 88 insertions(+), 32 deletions(-) diff --git a/.gitignore b/.gitignore index 9e16ac126d..6538eb718b 100644 --- a/.gitignore +++ b/.gitignore @@ -845,6 +845,7 @@ /memcheck/tests/bug287260 /memcheck/tests/bug340392 /memcheck/tests/bug464969_d_demangle +/memcheck/tests/bug472219 /memcheck/tests/calloc-overflow /memcheck/tests/cdebug_zlib /memcheck/tests/cdebug_zlib_gnu diff --git a/NEWS b/NEWS index 783612fbb9..867d2f0f43 100644 --- a/NEWS +++ b/NEWS @@ -45,6 +45,7 @@ are not entered into bugzilla tend to get forgotten about or ignored. Assertion 'resolved' failed 470830 Don't print actions vgdb me ... 
continue for vgdb --multi mode 470978 s390x: Valgrind cannot start qemu-kvm when "sysctl vm.allocate_pgste=0" +472219 Syscall param ppoll(ufds.events) points to uninitialised byte(s) To see details of a given bug, visit https://bugs.kde.org/show_bug.cgi?id=XXXXXX diff --git a/coregrind/m_syswrap/syswrap-freebsd.c b/coregrind/m_syswrap/syswrap-freebsd.c index 6b9f3d2109..9af37cfb83 100644 --- a/coregrind/m_syswrap/syswrap-freebsd.c +++ b/coregrind/m_syswrap/syswrap-freebsd.c @@ -6124,15 +6124,15 @@ PRE(sys_ppoll) struct vki_pollfd *, fds, unsigned int, nfds, struct vki_timespec *, timeout, vki_sigset_t *, newsigmask); - if (ML_(safe_to_deref)(fds, ARG2*sizeof(struct vki_pollfd))) { - for (i = 0; i < ARG2; i++) { - PRE_MEM_READ( "ppoll(fds.fd)", - (Addr)(&fds[i].fd), sizeof(fds[i].fd) ); + for (i = 0; i < ARG2; i++) { + PRE_MEM_READ( "ppoll(fds.fd)", + (Addr)(&fds[i].fd), sizeof(fds[i].fd) ); + if (ML_(safe_to_deref)(&fds[i].fd, sizeof(fds[i].fd)) && fds[i].fd >= 0) { PRE_MEM_READ( "ppoll(fds.events)", (Addr)(&fds[i].events), sizeof(fds[i].events) ); - PRE_MEM_WRITE( "ppoll(fds.revents)", - (Addr)(&fds[i].revents), sizeof(fds[i].revents) ); } + PRE_MEM_WRITE( "ppoll(fds.revents)", + (Addr)(&fds[i].revents), sizeof(fds[i].revents) ); } if (ARG3) { diff --git a/coregrind/m_syswrap/syswrap-generic.c b/coregrind/m_syswrap/syswrap-generic.c index efdae60e10..ed9d14685f 100644 --- a/coregrind/m_syswrap/syswrap-generic.c +++ b/coregrind/m_syswrap/syswrap-generic.c @@ -4339,8 +4339,10 @@ PRE(sys_poll) for (i = 0; i < ARG2; i++) { PRE_MEM_READ( "poll(ufds.fd)", (Addr)(&ufds[i].fd), sizeof(ufds[i].fd) ); - PRE_MEM_READ( "poll(ufds.events)", - (Addr)(&ufds[i].events), sizeof(ufds[i].events) ); + if (ML_(safe_to_deref)(&ufds[i].fd, sizeof(ufds[i].fd)) && ufds[i].fd >= 0) { + PRE_MEM_READ( "poll(ufds.events)", + (Addr)(&ufds[i].events), sizeof(ufds[i].events) ); + } PRE_MEM_WRITE( "poll(ufds.revents)", (Addr)(&ufds[i].revents), sizeof(ufds[i].revents) ); } diff --git 
a/coregrind/m_syswrap/syswrap-linux.c b/coregrind/m_syswrap/syswrap-linux.c index f8621f8f0d..20c68c877c 100644 --- a/coregrind/m_syswrap/syswrap-linux.c +++ b/coregrind/m_syswrap/syswrap-linux.c @@ -1984,8 +1984,10 @@ static void ppoll_pre_helper ( ThreadId tid, SyscallArgLayout* layout, for (i = 0; i < ARG2; i++) { PRE_MEM_READ( "ppoll(ufds.fd)", (Addr)(&ufds[i].fd), sizeof(ufds[i].fd) ); - PRE_MEM_READ( "ppoll(ufds.events)", - (Addr)(&ufds[i].events), sizeof(ufds[i].events) ); + if (ufds[i].fd >= 0) { + PRE_MEM_READ( "ppoll(ufds.events)", + (Addr)(&ufds[i].events), sizeof(ufds[i].events) ); + } PRE_MEM_WRITE( "ppoll(ufds.revents)", (Addr)(&ufds[i].revents), sizeof(ufds[i].revents) ); } diff --git a/coregrind/m_syswrap/syswrap-solaris.c b/coregrind/m_syswrap/syswrap-solaris.c index 8a2a140f95..ed3cb4a551 100644 --- a/coregrind/m_syswrap/syswrap-solaris.c +++ b/coregrind/m_syswrap/syswrap-solaris.c @@ -7831,8 +7831,9 @@ PRE(sys_pollsys) for (i = 0; i < ARG2; i++) { vki_pollfd_t *u = &ufds[i]; PRE_FIELD_READ("poll(ufds.fd)", u->fd); - /* XXX Check if it's valid? 
*/ - PRE_FIELD_READ("poll(ufds.events)", u->events); + if (ML_(safe_to_deref)(&ufds[i].fd, sizeof(ufds[i].fd)) && ufds[i].fd >= 0) { + PRE_FIELD_READ("poll(ufds.events)", u->events); + } PRE_FIELD_WRITE("poll(ufds.revents)", u->revents); } diff --git a/memcheck/tests/Makefile.am b/memcheck/tests/Makefile.am index 5a17fd35d4..307f47bd8e 100644 --- a/memcheck/tests/Makefile.am +++ b/memcheck/tests/Makefile.am @@ -118,6 +118,7 @@ EXTRA_DIST = \ bug340392.stderr.exp bug340392.vgtest \ bug464969_d_demangle.stderr.exp bug464969_d_demangle.vgtest \ bug464969_d_demangle.stdout.exp \ + bug472219.stderr.exp bug472219.vgtest \ calloc-overflow.stderr.exp calloc-overflow.vgtest\ cdebug_zlib.stderr.exp cdebug_zlib.vgtest \ cdebug_zlib_gnu.stderr.exp cdebug_zlib_gnu.vgtest \ @@ -415,6 +416,7 @@ check_PROGRAMS = \ bug287260 \ bug340392 \ bug464969_d_demangle \ + bug472219 \ calloc-overflow \ client-msg \ clientperm \ @@ -566,6 +568,7 @@ leak_cpp_interior_SOURCES = leak_cpp_interior.cpp accounting_CFLAGS = $(AM_CFLAGS) @FLAG_W_NO_ALLOC_SIZE_LARGER_THAN@ badfree_CFLAGS = $(AM_CFLAGS) @FLAG_W_NO_FREE_NONHEAP_OBJECT@ bug155125_CFLAGS = $(AM_CFLAGS) -Wno-unused-result @FLAG_W_NO_ALLOC_SIZE_LARGER_THAN@ +bug472219_CFLAGS = $(AM_CFLAGS) @FLAG_W_NO_UNINITIALIZED@ mallinfo_CFLAGS = $(AM_CFLAGS) -Wno-deprecated-declarations malloc3_CFLAGS = $(AM_CFLAGS) @FLAG_W_NO_ALLOC_SIZE_LARGER_THAN@ sbfragment_CFLAGS = $(AM_CFLAGS) -Wno-deprecated-declarations @@ -663,6 +666,7 @@ reach_thread_register_LDADD = -lpthread realloc_size_zero_CFLAGS = $(AM_CFLAGS) @FLAG_W_NO_INCOMPATIBLE_POINTER_TYPES_DISCARDS_QUALIFIERS@ realloc_size_zero_mismatch_SOURCES = realloc_size_zero_mismatch.cpp +realloc_size_zero_mismatch_CXXFLAGS = $(AM_CXXFLAGS) @FLAG_W_NO_MISMATCHED_NEW_DELETE@ resvn_stack_CFLAGS = $(AM_CFLAGS) @FLAG_W_NO_UNINITIALIZED@ diff --git a/memcheck/tests/bug472219.c b/memcheck/tests/bug472219.c new file mode 100644 index 0000000000..88567caa2c --- /dev/null +++ b/memcheck/tests/bug472219.c @@ -0,0 
+1,16 @@ +#include <poll.h> +#include <stdlib.h> +#include "../../config.h" + +int main() +{ + int uninit; + struct pollfd fds[] = {{-1, uninit, 0}, {2, POLLIN, 0}}; + + poll(fds, 2, 100); + +#if defined(HAVE_PPOLL) + struct timespec timeout = {0, 1e8}; + ppoll(fds, 2, &timeout, NULL); +#endif +} diff --git a/memcheck/tests/bug472219.stderr.exp b/memcheck/tests/bug472219.stderr.exp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/memcheck/tests/bug472219.vgtest b/memcheck/tests/bug472219.vgtest new file mode 100644 index 0000000000..8cd48c785d --- /dev/null +++ b/memcheck/tests/bug472219.vgtest @@ -0,0 +1,2 @@ +prog: bug472219 +vgopts: -q diff --git a/memcheck/tests/freebsd/scalar.c b/memcheck/tests/freebsd/scalar.c index c6a7ff2d5c..6c8d81aa6e 100644 --- a/memcheck/tests/freebsd/scalar.c +++ b/memcheck/tests/freebsd/scalar.c @@ -781,9 +781,15 @@ int main(void) /* netbsd newreboot 208 */ /* SYS_poll 209 */ - GO(SYS_poll, "3s 3m"); + GO(SYS_poll, "2s 2m"); SY(SYS_poll, x0, x0+1, x0); FAIL; + { + struct pollfd fds = { x0, x0, x0 }; + GO(SYS_poll, "0s 2m"); + SY(SYS_poll, &fds, 1, 1); SUCC; + } + /* SYS_freebsd7___semctl 220 */ GO(SYS_freebsd7___semctl, "(IPC_INFO) 4s 1m"); SY(SYS_freebsd7___semctl, x0, x0, x0+IPC_INFO, x0+1); FAIL; @@ -1948,8 +1954,8 @@ int main(void) { struct pollfd arg1; arg1.fd = arg1.events = arg1.revents = x0; - GO(SYS_ppoll, "2s 2+2m"); - SY(SYS_ppoll, &arg1, 1, x0+1, x0+1); FAIL; + GO(SYS_ppoll, "2s 2+2m"); + SY(SYS_ppoll, &arg1, 1, x0+1, x0+1); FAIL; } /* SYS_futimens 546 */ diff --git a/memcheck/tests/freebsd/scalar.stderr.exp b/memcheck/tests/freebsd/scalar.stderr.exp index 2595bd38c5..5a4f3230f1 100644 --- a/memcheck/tests/freebsd/scalar.stderr.exp +++ b/memcheck/tests/freebsd/scalar.stderr.exp @@ -1529,7 +1529,7 @@ Syscall param getpgid(pid) contains uninitialised byte(s) ... 
--------------------------------------------------------- -209: SYS_poll 3s 3m +209: SYS_poll 2s 2m --------------------------------------------------------- Syscall param poll(ufds) contains uninitialised byte(s) ... @@ -1544,13 +1544,20 @@ Syscall param poll(ufds.fd) points to unaddressable byte(s) ... Address 0x........ is not stack'd, malloc'd or (recently) free'd -Syscall param poll(ufds.events) points to unaddressable byte(s) +Syscall param poll(ufds.revents) points to unaddressable byte(s) ... Address 0x........ is not stack'd, malloc'd or (recently) free'd -Syscall param poll(ufds.revents) points to unaddressable byte(s) +--------------------------------------------------------- +209: SYS_poll 0s 2m +--------------------------------------------------------- +Syscall param poll(ufds.fd) points to uninitialised byte(s) ... - Address 0x........ is not stack'd, malloc'd or (recently) free'd + Address 0x........ is on thread 1's stack + +Syscall param poll(ufds.events) points to uninitialised byte(s) + ... + Address 0x........ is on thread 1's stack --------------------------------------------------------- 220: SYS_freebsd7___semctl (IPC_INFO) 4s 1m @@ -4968,6 +4975,14 @@ Syscall param ppoll(timeout) contains uninitialised byte(s) Syscall param ppoll(newsigmask) contains uninitialised byte(s) ... +Syscall param ppoll(fds.fd) points to unaddressable byte(s) + ... + Address 0x........ is not stack'd, malloc'd or (recently) free'd + +Syscall param ppoll(fds.revents) points to unaddressable byte(s) + ... + Address 0x........ is not stack'd, malloc'd or (recently) free'd + Syscall param ppoll(timeout) points to unaddressable byte(s) ... Address 0x........ 
is not stack'd, malloc'd or (recently) free'd diff --git a/memcheck/tests/freebsd/scalar.stderr.exp-x86 b/memcheck/tests/freebsd/scalar.stderr.exp-x86 index e995fc28d6..a45d0601c3 100644 --- a/memcheck/tests/freebsd/scalar.stderr.exp-x86 +++ b/memcheck/tests/freebsd/scalar.stderr.exp-x86 @@ -1529,7 +1529,7 @@ Syscall param getpgid(pid) contains uninitialised byte(s) ... --------------------------------------------------------- -209: SYS_poll 3s 3m +209: SYS_poll 2s 2m --------------------------------------------------------- Syscall param poll(ufds) contains uninitialised byte(s) ... @@ -1544,13 +1544,20 @@ Syscall param poll(ufds.fd) points to unaddressable byte(s) ... Address 0x........ is not stack'd, malloc'd or (recently) free'd -Syscall param poll(ufds.events) points to unaddressable byte(s) +Syscall param poll(ufds.revents) points to unaddressable byte(s) ... Address 0x........ is not stack'd, malloc'd or (recently) free'd -Syscall param poll(ufds.revents) points to unaddressable byte(s) +--------------------------------------------------------- +209: SYS_poll 0s 2m +--------------------------------------------------------- +Syscall param poll(ufds.fd) points to uninitialised byte(s) ... - Address 0x........ is not stack'd, malloc'd or (recently) free'd + Address 0x........ is on thread 1's stack + +Syscall param poll(ufds.events) points to uninitialised byte(s) + ... + Address 0x........ is on thread 1's stack --------------------------------------------------------- 220: SYS_freebsd7___semctl (IPC_INFO) 4s 1m @@ -5023,6 +5030,14 @@ Syscall param ppoll(timeout) contains uninitialised byte(s) Syscall param ppoll(newsigmask) contains uninitialised byte(s) ... +Syscall param ppoll(fds.fd) points to unaddressable byte(s) + ... + Address 0x........ is not stack'd, malloc'd or (recently) free'd + +Syscall param ppoll(fds.revents) points to unaddressable byte(s) + ... + Address 0x........ 
is not stack'd, malloc'd or (recently) free'd + Syscall param ppoll(timeout) points to unaddressable byte(s) ... Address 0x........ is not stack'd, malloc'd or (recently) free'd diff --git a/memcheck/tests/solaris/scalar.stderr.exp b/memcheck/tests/solaris/scalar.stderr.exp index 1a04979d19..a1b5d97d7a 100644 --- a/memcheck/tests/solaris/scalar.stderr.exp +++ b/memcheck/tests/solaris/scalar.stderr.exp @@ -3244,10 +3244,6 @@ Syscall param poll(ufds.fd) points to unaddressable byte(s) ... Address 0x........ is not stack'd, malloc'd or (recently) free'd -Syscall param poll(ufds.events) points to unaddressable byte(s) - ... - Address 0x........ is not stack'd, malloc'd or (recently) free'd - Syscall param poll(ufds.revents) points to unaddressable byte(s) ... Address 0x........ is not stack'd, malloc'd or (recently) free'd diff --git a/memcheck/tests/x86-linux/scalar.stderr.exp b/memcheck/tests/x86-linux/scalar.stderr.exp index b9202a8c2f..6b8c7677f5 100644 --- a/memcheck/tests/x86-linux/scalar.stderr.exp +++ b/memcheck/tests/x86-linux/scalar.stderr.exp @@ -2122,11 +2122,6 @@ Syscall param poll(ufds.fd) points to unaddressable byte(s) by 0x........: main (scalar.c:761) Address 0x........ is not stack'd, malloc'd or (recently) free'd -Syscall param poll(ufds.events) points to unaddressable byte(s) - ... - by 0x........: main (scalar.c:761) - Address 0x........ is not stack'd, malloc'd or (recently) free'd - Syscall param poll(ufds.revents) points to unaddressable byte(s) ... by 0x........: main (scalar.c:761) |
|
From: Paul F. <pa...@so...> - 2023-07-24 19:33:46
|
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=b368b44c552d0deb4d0ee77968cb0e8e02a07812 commit b368b44c552d0deb4d0ee77968cb0e8e02a07812 Author: Paul Floyd <pj...@wa...> Date: Mon Jul 24 21:32:45 2023 +0200 Solaris: add a configure test for getaddrinfo Not available on Solaris 11.3 Diff: --- configure.ac | 3 +++ drd/tests/getaddrinfo.vgtest | 2 +- helgrind/tests/Makefile.am | 5 ++++- helgrind/tests/getaddrinfo.vgtest | 1 + 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index 4dbb1753c7..b4e9c11428 100755 --- a/configure.ac +++ b/configure.ac @@ -4849,6 +4849,7 @@ AC_CHECK_FUNCS([ \ copy_file_range \ epoll_create \ epoll_pwait \ + getaddrinfo \ klogctl \ mallinfo \ memchr \ @@ -4916,6 +4917,8 @@ AM_CONDITIONAL([HAVE_SETCONTEXT], [test x$ac_cv_func_setcontext = xyes]) AM_CONDITIONAL([HAVE_SWAPCONTEXT], [test x$ac_cv_func_swapcontext = xyes]) AM_CONDITIONAL([HAVE_MEMFD_CREATE], [test x$ac_cv_func_memfd_create = xyes]) +AM_CONDITIONAL([HAVE_GETADDRINFO], + [test x$ac_cv_func_getaddrinfo = xyes]) if test x$VGCONF_PLATFORM_PRI_CAPS = xMIPS32_LINUX \ -o x$VGCONF_PLATFORM_PRI_CAPS = xMIPS64_LINUX \ diff --git a/drd/tests/getaddrinfo.vgtest b/drd/tests/getaddrinfo.vgtest index 6faa2b6bde..a62baadb92 100644 --- a/drd/tests/getaddrinfo.vgtest +++ b/drd/tests/getaddrinfo.vgtest @@ -1,3 +1,3 @@ -prereq: ./supported_libpthread +prereq: ./supported_libpthread && test -e ../../helgrind/tests/getaddrinfo vgopts: -q prog: ../../helgrind/tests/getaddrinfo diff --git a/helgrind/tests/Makefile.am b/helgrind/tests/Makefile.am index 13e2d4db66..3e2efad0be 100755 --- a/helgrind/tests/Makefile.am +++ b/helgrind/tests/Makefile.am @@ -154,7 +154,6 @@ check_PROGRAMS = \ cond_timedwait_invalid \ cond_timedwait_test \ free_is_write \ - getaddrinfo \ hg01_all_ok \ hg02_deadlock \ hg03_inherit \ @@ -239,6 +238,10 @@ check_PROGRAMS += annotate_rwlock annotate_rwlock_CFLAGS = $(AM_CFLAGS) @FLAG_W_NO_UNUSED_BUT_SET_VARIABLE@ endif +if 
HAVE_GETADDRINFO +check_PROGRAMS += getaddrinfo +endif + AM_CFLAGS += $(AM_FLAG_M3264_PRI) AM_CXXFLAGS += $(AM_FLAG_M3264_PRI) diff --git a/helgrind/tests/getaddrinfo.vgtest b/helgrind/tests/getaddrinfo.vgtest index b58c618887..9543cbd046 100644 --- a/helgrind/tests/getaddrinfo.vgtest +++ b/helgrind/tests/getaddrinfo.vgtest @@ -1,2 +1,3 @@ +prereq: test -e getaddrinfo prog: getaddrinfo vgopts: -q |
|
From: Paul F. <pa...@so...> - 2023-07-23 17:24:01
|
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=37dd8263942708d20af9c04b9ac6f601cf3aa020 commit 37dd8263942708d20af9c04b9ac6f601cf3aa020 Author: Paul Floyd <pj...@wa...> Date: Sun Jul 23 19:22:51 2023 +0200 FreeBSD: Add a DRD supppression for getaddrinfo On FreeBSD 13.2 x86 Diff: --- freebsd-drd.supp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/freebsd-drd.supp b/freebsd-drd.supp index 93ad79f4bd..33f7e8ede2 100644 --- a/freebsd-drd.supp +++ b/freebsd-drd.supp @@ -240,3 +240,12 @@ obj:*/lib*/libc.so.7 fun:vsprintf } +{ + DRD-FREEBSD132-GETADDRINFO + drd:ConflictingAccess + ... + obj:*/lib*/libc.so.7 + fun:nsdispatch + obj:*/lib*/libc.so.7 + fun:getaddrinfo +} |
|
From: Wu, F. <fe...@in...> - 2023-07-19 01:25:15
|
On 7/19/2023 3:08 AM, Petr Pavlu wrote:
> On 11. Jul 23 19:28, Wu, Fei wrote:
>> On 7/11/2023 4:50 AM, Petr Pavlu wrote:
>>> On 6. Jul 23 20:39, Wu, Fei wrote:
>>>> [...]
>>>>
>>>> This approach will introduce a bunch of new vlen Vector IRs, especially
>>>> the arithmetic IRs such as vadd, my goal is for a good solution which
>>>> takes reasonable time to reach usable status, yet still be able to
>>>> evolve and generic enough for other vector ISA. Any comments?
>
> This personally looks to me as a right direction. Supporting scalable
> vector extensions in Valgrind as a first-class citizen would be my
> preferred choice. I think it is something that will be needed to handle
> Arm SVE and RISC-V RVV well. On the other hand, it is likely the most
> complex approach and could take time to iron out.
>
>>> Could you please share a repository with your changes or send them to me
>>> as patches? I have a few questions but I think it might be easier for me
>>> first to see the actual code.
>>>
>> Please see attachment. It's a very raw version to just verify the idea,
>> mask is not added but expected to be done as mentioned above, it's based
>> on commit 71272b2529 on your branch, patch 0013 is the key.
>
> Thanks for sharing this code. The previous discussions and this series
> introduces a new concept of translating client code per some CPU state.
> That is something I spent most time thinking about.
>
> I can see it is indeed necessary for RVV. In particular, this
> "versioning" of translations allows that Valgrind IR can statically
> express an element type of each vector operation, i.e. that it is an
> operation on I32, F64, ... An alternative would be to try to express the
> type dynamically in IR. That should be still somewhat manageable in the
> toIR frontend but I have a hard time seeing how it would work for the
> instrumentation and codegen.
>
> The versioning should work well for RVV translations because my
> expectation is that most RVV loops will consist of a call to vsetvli
> (with a static vtype), followed by some actual vector operations. Such
> a block then requires only one translation.
>
> This is however true only if translations are versioned just per vtype,
> without vl. If I understood correctly, the patches version them per vl
> too but it isn't clear to me conceptually if this is really necessary.
>
Yes, this series does version vl, it helps the situation such as in the
last patch, it can break the large vl to multiple small vl operations,
in case the backend doesn't have a register allocation algorithm for LMUL>1.
> For instance, I think VAdd8 could look as follows:
> VAdd8(<len>, <in1>, <in2>, <flags?>) where <len> is something as
> IRExpr_Get(OFFB_VL, Ity_I64).
>
> Another problem which I noticed is that blocks containing no RVV
> instructions are also versioned. Consider the following:
> while (true) {
> // (1) some RVV code which can set vtype to different values
> // (2) a large chunk of non-RVV code
> }
>
> The code in (2) will currently have multiple same translations for each
> residue left in vtype by (1).
>
Yes, indeed. This is one place to optimize.
> In general, I think the concept of allowing translations per some CPU
> state could be useful in other cases and for other architectures too.
> For RISC-V, it could be beneficial for floating-point operations. My
> expectation is that regular RISC-V FP code will have instructions with
> encoded rm=DYN and always executed with frm=RNE. The current approach is
> that the toIR frontend generates an IR which reads the rounding mode
> from frm and remaps it to the Valgrind's representation. The codegen
> then does the opposite. The idea here is that the frontend would know
> the actual rounding mode and could create IR which has directly this
> mode, for instance, AddF64(Irrm_NEAREST, <in1>, <in2>). The codegen then
> doesn't need to know how to handle any dynamic rounding modes as they
> become static.
>
> I plan to look further into this series. Specifically, I'd like to have
> a stab at adding some basic support for Arm SVE to get a better
> understanding if this is generic enough.
>
Great, I will add more RVV support if it's proved to be the right
direction, and thank you for the review.
Thanks,
Fei.
> Thanks,
> Petr
|
|
From: Petr P. <pet...@da...> - 2023-07-18 19:26:03
|
On 11. Jul 23 19:28, Wu, Fei wrote:
> On 7/11/2023 4:50 AM, Petr Pavlu wrote:
> > On 6. Jul 23 20:39, Wu, Fei wrote:
> >> [...]
> >>
> >> This approach will introduce a bunch of new vlen Vector IRs, especially
> >> the arithmetic IRs such as vadd, my goal is for a good solution which
> >> takes reasonable time to reach usable status, yet still be able to
> >> evolve and generic enough for other vector ISA. Any comments?
This personally looks to me as a right direction. Supporting scalable
vector extensions in Valgrind as a first-class citizen would be my
preferred choice. I think it is something that will be needed to handle
Arm SVE and RISC-V RVV well. On the other hand, it is likely the most
complex approach and could take time to iron out.
> > Could you please share a repository with your changes or send them to me
> > as patches? I have a few questions but I think it might be easier for me
> > first to see the actual code.
> >
> Please see attachment. It's a very raw version to just verify the idea,
> mask is not added but expected to be done as mentioned above, it's based
> on commit 71272b2529 on your branch, patch 0013 is the key.
Thanks for sharing this code. The previous discussions and this series
introduces a new concept of translating client code per some CPU state.
That is something I spent most time thinking about.
I can see it is indeed necessary for RVV. In particular, this
"versioning" of translations allows that Valgrind IR can statically
express an element type of each vector operation, i.e. that it is an
operation on I32, F64, ... An alternative would be to try to express the
type dynamically in IR. That should be still somewhat manageable in the
toIR frontend but I have a hard time seeing how it would work for the
instrumentation and codegen.
The versioning should work well for RVV translations because my
expectation is that most RVV loops will consist of a call to vsetvli
(with a static vtype), followed by some actual vector operations. Such
a block then requires only one translation.
This is however true only if translations are versioned just per vtype,
without vl. If I understood correctly, the patches version them per vl
too but it isn't clear to me conceptually if this is really necessary.
For instance, I think VAdd8 could look as follows:
VAdd8(<len>, <in1>, <in2>, <flags?>) where <len> is something as
IRExpr_Get(OFFB_VL, Ity_I64).
Another problem which I noticed is that blocks containing no RVV
instructions are also versioned. Consider the following:
while (true) {
// (1) some RVV code which can set vtype to different values
// (2) a large chunk of non-RVV code
}
The code in (2) will currently have multiple same translations for each
residue left in vtype by (1).
In general, I think the concept of allowing translations per some CPU
state could be useful in other cases and for other architectures too.
For RISC-V, it could be beneficial for floating-point operations. My
expectation is that regular RISC-V FP code will have instructions with
encoded rm=DYN and always executed with frm=RNE. The current approach is
that the toIR frontend generates an IR which reads the rounding mode
from frm and remaps it to the Valgrind's representation. The codegen
then does the opposite. The idea here is that the frontend would know
the actual rounding mode and could create IR which has directly this
mode, for instance, AddF64(Irrm_NEAREST, <in1>, <in2>). The codegen then
doesn't need to know how to handle any dynamic rounding modes as they
become static.
I plan to look further into this series. Specifically, I'd like to have
a stab at adding some basic support for Arm SVE to get a better
understanding if this is generic enough.
Thanks,
Petr
|
|
From: Wu, F. <fe...@in...> - 2023-07-18 01:44:56
|
On 7/11/2023 7:28 PM, Wu, Fei wrote: > On 7/11/2023 4:50 AM, Petr Pavlu wrote: >> On 6. Jul 23 20:39, Wu, Fei wrote: >>> On 5/29/2023 11:29 AM, Wu, Fei wrote: >>>> On 5/28/2023 1:06 AM, Petr Pavlu wrote: >>>>> On 21. Apr 23 17:25, Jojo R wrote: >>>>>> We consider to add RVV/Vector [1] feature in valgrind, there are some >>>>>> challenges. >>>>>> RVV like ARM's SVE [2] programming model, it's scalable/VLA, that means the >>>>>> vector length is agnostic. >>>>>> ARM's SVE is not supported in valgrind :( >>>>>> >>>>>> There are three major issues in implementing RVV instruction set in Valgrind >>>>>> as following: >>>>>> >>>>>> 1. Scalable vector register width VLENB >>>>>> 2. Runtime changing property of LMUL and SEW >>>>>> 3. Lack of proper VEX IR to represent all vector operations >>>>>> >>>>>> We propose applicable methods to solve 1 and 2. As for 3, we explore several >>>>>> possible but maybe imperfect approaches to handle different cases. >>>>>> >>> I did a very basic prototype for vlen Vector-IR, particularly on RISC-V >>> Vector (RVV): >>> >>> * Define new iops such as Iop_VAdd8/16/32/64, the difference from >>> existing SIMD version is that no element number is specified like >>> Iop_Add8x32 >>> >>> * Define new IR type Ity_VLen along side existing types such as Ity_I64, >>> Ity_V256 >>> >>> * Define new class HRcVecVLen in HRegClass for vlen vector registers >>> The real length is embedded in both IROp and IRType for vlen ops/types, >>> it's runtime-decided and already known when handling insn such as vadd, >>> this leads to more flexibility, e.g. backend can issue extra vsetvl if >>> necessary. >>> >>> With the above, RVV instruction in the guest can be passed from >>> frontend, to memcheck, to the backend, and generate the final RVV insn >>> during host isel, a very basic testcase has been tested. >>> >>> Now here comes to the complexities: >>> >>> 1. RVV has the concept of LMUL, which groups multiple (or partial) >>> vector registers, e.g. 
when LMUL==2, v2 means the real v2+v3. This >>> complicates the register allocation. >>> >>> 2. RVV uses the "implicit" v0 for mask, its content must be loaded to >>> the exact "v0" register instead of any other ones if host isel wants to >>> leverage RVV insn, this implicitness in ISA requires more explicitness >>> in Valgrind implementation. >>> >>> For #1 LMUL, a new register allocation algorithm for it can be added, >>> and it will be great if someone is willing to try it, I'm not sure how >>> much effort it will take. The other way is splitting it into multiple >>> ops which only takes one vector register, taking vadd for example, 2 >>> vadd will run with LMUL=1 for one vadd with LMUL=2, this is still okay >>> for the widening insn, most of the arithmetic insns can be covered in >>> this way. The exception could be register gather insn vrgather, which we >>> can consult other ways for it, e.g. scalar or helper. >>> >>> For #2 v0 mask, one way is to handle the mask in the very beginning at >>> guest_riscv64_toIR.c, similar to what AVX port does: >>> >>> a) Read the whole dest register without mask >>> b) Generate unmasked result by running op without mask >>> c) Applying mask to a,b and generate the final dest >>> >>> by doing this, insn with mask is converted to non-mask ones, although >>> more insns are generated but the performance should be acceptable. There >>> are still exceptions, e.g. vadc (Add-with-Carry), v0 is not used as mask >>> but as carry, but just as mentioned above, it's okay to use other ways >>> for a few insns. Eventually, we can pass v0 mask down to the backend if >>> it's proved a better solution. >>> >>> This approach will introduce a bunch of new vlen Vector IRs, especially >>> the arithmetic IRs such as vadd, my goal is for a good solution which >>> takes reasonable time to reach usable status, yet still be able to >>> evolve and generic enough for other vector ISA. Any comments? 
>> >> Could you please share a repository with your changes or send them to me >> as patches? I have a few questions but I think it might be easier for me >> first to see the actual code. >> > Please see attachment. It's a very raw version to just verify the idea, > mask is not added but expected to be done as mentioned above, it's based > on commit 71272b2529 on your branch, patch 0013 is the key. > Hi Petr, Have you taken a look? Any comments? Thanks, Fei. > btw, I will setup a repository but it takes a few days to pass the > internal process. > > Thanks, > Fei. > >> Thanks, >> Petr |
|
From: Jojo R <rj...@li...> - 2023-07-17 07:06:22
|
Hi,
Sorry for the late reply,
i have been pushing the progress of valgrind RVV implementation 😄
We finished the first version and tested with full RVV intrinsics spec.
For real project and developers, we implement the first useable/ full
functionality's RVV valgrind with dirtycall method,
and we will make experiment or optimize RVV implementation on ideal RVV
design.
Back to the RVV RFC, we are happy to share our thinking of design, see
attachment for more details :)
Regards
--Jojo
在 2023/4/21 17:25, Jojo R 写道:
>
> Hi,
>
> We consider to add RVV/Vector [1] feature in valgrind, there are some
> challenges.
> RVV like ARM's SVE [2] programming model, it's scalable/VLA, that
> means the vector length is agnostic.
> ARM's SVE is not supported in valgrind :(
>
> There are three major issues in implementing RVV instruction set in
> Valgrind as following:
>
> 1. Scalable vector register width VLENB
> 2. Runtime changing property of LMUL and SEW
> 3. Lack of proper VEX IR to represent all vector operations
>
> We propose applicable methods to solve 1 and 2. As for 3, we explore
> several possible but maybe imperfect approaches to handle different cases.
>
> We start from 1. As each guest register should be described in
> VEXGuestState struct, the vector registers with scalable width of
> VLENB can be added into VEXGuestState as arrays using an allowable
> maximum length like 2048/4096.
>
> The actual available access range can be determined at Valgrind
> startup time by querying the CPU for its vector capability or some
> suitable setup steps.
>
>
> To solve problem 2, we are inspired by already-proven techniques in
> QEMU, where translation blocks are broken up when certain critical
> CSRs are set. Because the guest code to IR translation relies on the
> precise value of LMUL/SEW and they may change within a basic block, we
> can break up the basic block each time encountering a vsetvl{i}
> instruction and return to the scheduler to execute the translated code
> and update LMUL/SEW. Accordingly, translation cache management should
> be refactored to detect the changing of LMUL/SEW to invalidate
> outdated code cache. Without losing the generality, the LMUL/SEW
> should be encoded into an ULong flag such that other architectures can
> leverage this flag to store their arch-dependent information. The
> TTentry struct should also take the flag into account no matter
> insertion or deletion. By doing this, the flag carries the newest
> LMUL/SEW throughout the simulation and can be passed to disassemble
> functions using the VEXArchInfo struct such that we can get the real
> and newest value of LMUL and SEW to facilitate our translation.
>
> Also, some architecture-related code should be taken care of. Like
> m_dispatch part, disp_cp_xindir function looks up code cache using
> hardcoded assembly by checking the requested guest state IP and
> translation cache entry address with no more constraints. Many other
> modules should be checked to ensure the in-time update of LMUL/SEW is
> instantly visible to essential parts in Valgrind.
>
>
> The last remaining big issue is 3, which we introduce some ad-hoc
> approaches to deal with. We summarize these approaches into three
> types as following:
>
> 1. Break down a vector instruction to scalar VEX IR ops.
> 2. Break down a vector instruction to fixed-length VEX IR ops.
> 3. Use dirty helpers to realize vector instructions.
>
> The very first method theoretically exists but is probably not
> applicable as the number of IR ops explodes when a large VLENB is
> adopted. Imagine a configuration of VLENB=512, SEW=8, LMUL=8, the VL
> is 512 * 8 / 8 = 512, meaning that a single vector instruction turns
> into 512 scalar instructions and each scalar instruction would be
> expanded to multiple IRs. To make things worse, the tool
> instrumentation will insert more IRs between adjacent scalar IR ops.
> As a result, the performance is likely to be slowed down thousand
> times during running a real-world application with lots of vector
> instructions. Therefore, the other two methods are more promising and
> we will discuss them below.
>
> 2 and 3 are not mutually exclusive as we may choose a suitable method
> from them to implement a vector instruction regarding its concrete
> behavior. To explain these methods in detail, we present some
> instances to illustrate their pros and cons.
>
> In terms of method 2, we have real values of VLENB/LMUL/SEW. The
> simple case is VLENB <= 256 and LMUL=1, where many SIMD IR ops are
> available and can be directly applied to represent vector operations.
> However, even when VLENB is restricted to 128, it still exceeds the
> maximum SIMD width of 256 supported by VEX IR if LMUL>2. Hence, here
> are two variants of method 2 to deal with long vectors:
>
>
> *2.1* Add more SIMD IR ops such as 1024/2048/4096, and translate vector
> instructions in the granularity of VLENB. Accordingly, VLENB=4096 with
> LMUL=2 is fulfilled by two 4096 SIMD VEX IR ops.
>
> * *pros*: it encourages VEX backend to generate more compact and
> efficient SIMD code (maybe). Particularly,it accommodatesmask and
> gather/scatter (indexed) instructions by delivering more
> information in IR itself.
> * *cons*: too many new IR ops need to be introduced in VEX as each
> op of different length should implement its add/sub/mul variants.
> New data types to denote long vectors are necessary too, causing
> difficulties in both VEX backend register allocation and tool
> instrumentation.
>
> *2.2* Break down long vectors to multiple repeated SIMD ops. For
> instance, a vadd.vv vector instruction with VLENB=256/LMUL=2/SEW=8 is
> composed of four operators of Iop_Add8x16 type.
>
> * *pros:* less effort is required in register allocation and tool
> instrumentation. The VEX frontend is able to notify the backend to
> generate efficient vector instructions by existing Iops. It better
> trades off the complexity of adding many long vector IR ops and
> the benefit of generating high-efficiency host code.
> * *cons:* it is hard to describe a mask operation given that the mask
> is pretty flexible (the least significant bit of each segment of
> v0). Additionally, gather/scatter instructions may have similar
> problems in appropriately dividing index registers. There are
> various corner cases left here such as widening arithmetic
> operations (widening SIMD IR ops are currently not compatible) and
> vstart CSR register. When using fixed-length IR ops to comprise a
> vector instruction, we will inevitably tell each IR op which
> position encoded in vstart you can start to process the data. We
> can use vstart as a normal guest state virtual register to
> calculate each op's start position as a guard IRExpr or obtain the
> value of vstart like what we do in LMUL/SEW. Nevertheless, it is
> non-trivial to decompose a vector instruction concisely.
>
> In short, both 2.1 and 2.2 confront a dilemma in reducing engineering
> efforts of refactoring Valgrind elegantly as well as implementing the
> vector instruction set efficiently. Same obstacles exist in ARM SVE as
> they are scalable vector instructions and flexible in many ways.
>
> The final solution is the dirty helper. It is undoubtedly practical
> and requires possibly the least engineering efforts in dealing with so
> many details in Valgrind. In this design, each instruction is
> completed using an inline assembly running the same instruction on the
> host. Moreover, tool instrumentation already handles IRDirty except
> that new fields should be added in _IRDirty struct to indicate
> strided/indexed/masked memory accesses and arithmetic operations.
>
> * *pros:* it supports all instructions without bothering to build
> complicated IR expressions and statements. It executes vector
> instructions using host CPU to get acceleration to some extent.
> Besides, we do not need to add VEX backend to translate new IRs to
> vector instructions.
> * *cons:* the dirty helper always keeps its operations in a black box
> such that tools can never see what happens in a dirty helper. Like
> memcheck, the bit precision merit is missing once it meets a dirty
> helper as the V-bit propagation chain adopts a pretty coarse
> determination strategy. On the other hand, it is also not an
> elegant way to implement the entire ISA extension in dirty helpers.
>
> In summary, we are still far from a truly applicable solution for adding
> vector extensions in Valgrind. We need to do detailed and
> comprehensive estimations on different vector instruction categories.
>
> Any feedback is welcome in github [3] also.
>
>
> [1] https://github.com/riscv/riscv-v-spec
>
> [2]
> https://community.arm.com/arm-research/b/articles/posts/the-arm-scalable-vector-extension-sve
>
> [3] https://github.com/petrpavlu/valgrind-riscv64/issues/17
>
>
> Thanks.
>
> Jojo
>
>
>
> _______________________________________________
> Valgrind-developers mailing list
> Val...@li...
> https://lists.sourceforge.net/lists/listinfo/valgrind-developers |
|
From: Wu, F. <fe...@in...> - 2023-07-11 11:29:25
|
On 7/11/2023 4:50 AM, Petr Pavlu wrote: > On 6. Jul 23 20:39, Wu, Fei wrote: >> On 5/29/2023 11:29 AM, Wu, Fei wrote: >>> On 5/28/2023 1:06 AM, Petr Pavlu wrote: >>>> On 21. Apr 23 17:25, Jojo R wrote: >>>>> We consider to add RVV/Vector [1] feature in valgrind, there are some >>>>> challenges. >>>>> RVV like ARM's SVE [2] programming model, it's scalable/VLA, that means the >>>>> vector length is agnostic. >>>>> ARM's SVE is not supported in valgrind :( >>>>> >>>>> There are three major issues in implementing RVV instruction set in Valgrind >>>>> as following: >>>>> >>>>> 1. Scalable vector register width VLENB >>>>> 2. Runtime changing property of LMUL and SEW >>>>> 3. Lack of proper VEX IR to represent all vector operations >>>>> >>>>> We propose applicable methods to solve 1 and 2. As for 3, we explore several >>>>> possible but maybe imperfect approaches to handle different cases. >>>>> >> I did a very basic prototype for vlen Vector-IR, particularly on RISC-V >> Vector (RVV): >> >> * Define new iops such as Iop_VAdd8/16/32/64, the difference from >> existing SIMD version is that no element number is specified like >> Iop_Add8x32 >> >> * Define new IR type Ity_VLen along side existing types such as Ity_I64, >> Ity_V256 >> >> * Define new class HRcVecVLen in HRegClass for vlen vector registers >> The real length is embedded in both IROp and IRType for vlen ops/types, >> it's runtime-decided and already known when handling insn such as vadd, >> this leads to more flexibility, e.g. backend can issue extra vsetvl if >> necessary. >> >> With the above, RVV instruction in the guest can be passed from >> frontend, to memcheck, to the backend, and generate the final RVV insn >> during host isel, a very basic testcase has been tested. >> >> Now here comes to the complexities: >> >> 1. RVV has the concept of LMUL, which groups multiple (or partial) >> vector registers, e.g. when LMUL==2, v2 means the real v2+v3. This >> complicates the register allocation. >> >> 2. 
RVV uses the "implicit" v0 for mask, its content must be loaded to >> the exact "v0" register instead of any other ones if host isel wants to >> leverage RVV insn, this implicitness in ISA requires more explicitness >> in Valgrind implementation. >> >> For #1 LMUL, a new register allocation algorithm for it can be added, >> and it will be great if someone is willing to try it, I'm not sure how >> much effort it will take. The other way is splitting it into multiple >> ops which only takes one vector register, taking vadd for example, 2 >> vadd will run with LMUL=1 for one vadd with LMUL=2, this is still okay >> for the widening insn, most of the arithmetic insns can be covered in >> this way. The exception could be register gather insn vrgather, which we >> can consult other ways for it, e.g. scalar or helper. >> >> For #2 v0 mask, one way is to handle the mask in the very beginning at >> guest_riscv64_toIR.c, similar to what AVX port does: >> >> a) Read the whole dest register without mask >> b) Generate unmasked result by running op without mask >> c) Applying mask to a,b and generate the final dest >> >> by doing this, insn with mask is converted to non-mask ones, although >> more insns are generated but the performance should be acceptable. There >> are still exceptions, e.g. vadc (Add-with-Carry), v0 is not used as mask >> but as carry, but just as mentioned above, it's okay to use other ways >> for a few insns. Eventually, we can pass v0 mask down to the backend if >> it's proved a better solution. >> >> This approach will introduce a bunch of new vlen Vector IRs, especially >> the arithmetic IRs such as vadd, my goal is for a good solution which >> takes reasonable time to reach usable status, yet still be able to >> evolve and generic enough for other vector ISA. Any comments? > > Could you please share a repository with your changes or send them to me > as patches? I have a few questions but I think it might be easier for me > first to see the actual code. 
> Please see attachment. It's a very raw version to just verify the idea, mask is not added but expected to be done as mentioned above, it's based on commit 71272b2529 on your branch, patch 0013 is the key. btw, I will setup a repository but it takes a few days to pass the internal process. Thanks, Fei. > Thanks, > Petr |
|
From: Petr P. <pet...@da...> - 2023-07-10 21:06:01
|
On 6. Jul 23 20:39, Wu, Fei wrote: > On 5/29/2023 11:29 AM, Wu, Fei wrote: > > On 5/28/2023 1:06 AM, Petr Pavlu wrote: > >> On 21. Apr 23 17:25, Jojo R wrote: > >>> We consider to add RVV/Vector [1] feature in valgrind, there are some > >>> challenges. > >>> RVV like ARM's SVE [2] programming model, it's scalable/VLA, that means the > >>> vector length is agnostic. > >>> ARM's SVE is not supported in valgrind :( > >>> > >>> There are three major issues in implementing RVV instruction set in Valgrind > >>> as following: > >>> > >>> 1. Scalable vector register width VLENB > >>> 2. Runtime changing property of LMUL and SEW > >>> 3. Lack of proper VEX IR to represent all vector operations > >>> > >>> We propose applicable methods to solve 1 and 2. As for 3, we explore several > >>> possible but maybe imperfect approaches to handle different cases. > >>> > I did a very basic prototype for vlen Vector-IR, particularly on RISC-V > Vector (RVV): > > * Define new iops such as Iop_VAdd8/16/32/64, the difference from > existing SIMD version is that no element number is specified like > Iop_Add8x32 > > * Define new IR type Ity_VLen along side existing types such as Ity_I64, > Ity_V256 > > * Define new class HRcVecVLen in HRegClass for vlen vector registers > The real length is embedded in both IROp and IRType for vlen ops/types, > it's runtime-decided and already known when handling insn such as vadd, > this leads to more flexibility, e.g. backend can issue extra vsetvl if > necessary. > > With the above, RVV instruction in the guest can be passed from > frontend, to memcheck, to the backend, and generate the final RVV insn > during host isel, a very basic testcase has been tested. > > Now here comes to the complexities: > > 1. RVV has the concept of LMUL, which groups multiple (or partial) > vector registers, e.g. when LMUL==2, v2 means the real v2+v3. This > complicates the register allocation. > > 2. 
RVV uses the "implicit" v0 for mask, its content must be loaded to > the exact "v0" register instead of any other ones if host isel wants to > leverage RVV insn, this implicitness in ISA requires more explicitness > in Valgrind implementation. > > For #1 LMUL, a new register allocation algorithm for it can be added, > and it will be great if someone is willing to try it, I'm not sure how > much effort it will take. The other way is splitting it into multiple > ops which only takes one vector register, taking vadd for example, 2 > vadd will run with LMUL=1 for one vadd with LMUL=2, this is still okay > for the widening insn, most of the arithmetic insns can be covered in > this way. The exception could be register gather insn vrgather, which we > can consult other ways for it, e.g. scalar or helper. > > For #2 v0 mask, one way is to handle the mask in the very beginning at > guest_riscv64_toIR.c, similar to what AVX port does: > > a) Read the whole dest register without mask > b) Generate unmasked result by running op without mask > c) Applying mask to a,b and generate the final dest > > by doing this, insn with mask is converted to non-mask ones, although > more insns are generated but the performance should be acceptable. There > are still exceptions, e.g. vadc (Add-with-Carry), v0 is not used as mask > but as carry, but just as mentioned above, it's okay to use other ways > for a few insns. Eventually, we can pass v0 mask down to the backend if > it's proved a better solution. > > This approach will introduce a bunch of new vlen Vector IRs, especially > the arithmetic IRs such as vadd, my goal is for a good solution which > takes reasonable time to reach usable status, yet still be able to > evolve and generic enough for other vector ISA. Any comments? Could you please share a repository with your changes or send them to me as patches? I have a few questions but I think it might be easier for me first to see the actual code. Thanks, Petr |
|
From: Andreas A. <ar...@so...> - 2023-07-06 15:19:08
|
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=cb684b50e7d4d845b56abea72fd9b9925fed644e commit cb684b50e7d4d845b56abea72fd9b9925fed644e Author: Andreas Arnez <ar...@li...> Date: Mon May 22 19:49:08 2023 +0200 Bug 470132 - s390x: Increase test coverage for VGM Add more tests for the VGM instruction, to verify the fix for the VGM wrap-around case. Also test setting unused bits in the I2 and I3 fields, to check that Valgrind ignores them as it should. Diff: --- none/tests/s390x/vec2.c | 26 ++++++++++++++++++++++++++ none/tests/s390x/vec2.stdout.exp | 20 ++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/none/tests/s390x/vec2.c b/none/tests/s390x/vec2.c index 73b04dee49..0d549cb235 100644 --- a/none/tests/s390x/vec2.c +++ b/none/tests/s390x/vec2.c @@ -301,6 +301,31 @@ static void test_all_fp_int_conversions() #undef TEST_EXEC #undef TEST_GENERATE +/* -- Vector generate mask -- */ + +#define XTEST(insn, i2, i3) \ + do { \ + ulong_v out = vec_ini; \ + puts(#insn " " #i2 "," #i3); \ + __asm__(#insn " %[out]," #i2 "," #i3 : [out] "+v"(out) : :); \ + printf("\t%016lx %016lx\n", out[0], out[1]); \ + } while (0) + +static void test_all_generate_mask() +{ + XTEST(vgmb, 2, 1); + XTEST(vgmb, 0xf7, 0x30); + XTEST(vgmb, 0, 0); + XTEST(vgmh, 3, 2); + XTEST(vgmh, 15, 15); + XTEST(vgmf, 4, 3); + XTEST(vgmf, 16, 17); + XTEST(vgmg, 55, 63); + XTEST(vgmg, 43, 55); + XTEST(vgmg, 63, 2); +} + +#undef XTEST int main() { @@ -310,5 +335,6 @@ int main() test_all_double_bitshifts(); test_all_int_fp_conversions(); test_all_fp_int_conversions(); + test_all_generate_mask(); return 0; } diff --git a/none/tests/s390x/vec2.stdout.exp b/none/tests/s390x/vec2.stdout.exp index b32cbe1bc0..7b894b9519 100644 --- a/none/tests/s390x/vec2.stdout.exp +++ b/none/tests/s390x/vec2.stdout.exp @@ -166,3 +166,23 @@ vcsfp 0 vcsfp 8 00ffffff - - - 00000004 - - - +vgmb 2,1 + ffffffffffffffff ffffffffffffffff +vgmb 0xf7,0x30 + 8181818181818181 8181818181818181 +vgmb 0,0 + 8080808080808080 
8080808080808080 +vgmh 3,2 + ffffffffffffffff ffffffffffffffff +vgmh 15,15 + 0001000100010001 0001000100010001 +vgmf 4,3 + ffffffffffffffff ffffffffffffffff +vgmf 16,17 + 0000c0000000c000 0000c0000000c000 +vgmg 55,63 + 00000000000001ff 00000000000001ff +vgmg 43,55 + 00000000001fff00 00000000001fff00 +vgmg 63,2 + e000000000000001 e000000000000001 |
|
From: Andreas A. <ar...@so...> - 2023-07-06 15:19:06
|
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=6635fc58345ba2c36589f0bef4d326166e947023 commit 6635fc58345ba2c36589f0bef4d326166e947023 Author: Andreas Arnez <ar...@li...> Date: Mon May 22 18:57:35 2023 +0200 Bug 470132 - s390x: Fix the wrap-around case in VGM Valgrind's implementation of VGM is incomplete: * It doesn't support generating a wrap-around bit mask. Such a mask should result when the ending bit position is smaller than the starting bit position. Valgrind runs into an assertion failure instead. * It doesn't ignore unused bits in the I2 and I3 fields of the instruction, as it should. Fix this by re-implementing the main logic in s390_irgen_VGM(). Diff: --- NEWS | 1 + VEX/priv/guest_s390_toIR.c | 57 ++++++++++++++++++---------------------------- 2 files changed, 23 insertions(+), 35 deletions(-) diff --git a/NEWS b/NEWS index a4e7533115..783612fbb9 100644 --- a/NEWS +++ b/NEWS @@ -38,6 +38,7 @@ are not entered into bugzilla tend to get forgotten about or ignored. 469146 massif --ignore-fn does not ignore inlined functions 469768 Make it possible to install gdb scripts in a different location 470121 Can't run callgrind_control with valgrind 3.21.0 because of perl errors +470132 s390x: Assertion failure on VGM instruction 470520 Multiple realloc zero errors crash in MC_(eq_Error) 470713 Failure on the Yosys project: valgrind: m_libcfile.c:1802 (Bool vgPlain_realpath(const HChar *, HChar *)): diff --git a/VEX/priv/guest_s390_toIR.c b/VEX/priv/guest_s390_toIR.c index 11dda41ef5..d9d746c38a 100644 --- a/VEX/priv/guest_s390_toIR.c +++ b/VEX/priv/guest_s390_toIR.c @@ -16388,50 +16388,37 @@ s390_irgen_VGBM(UChar v1, UShort i2, UChar m3 __attribute__((unused))) static const HChar * s390_irgen_VGM(UChar v1, UShort i2, UChar m3) { - UChar from = (i2 & 0xff00) >> 8; - UChar to = (i2 & 0x00ff); - ULong value = 0UL; - IRType type = s390_vr_get_type(m3); - vassert(from <= to); - - UChar maxIndex = 0; - switch (type) { - case Ity_I8: - maxIndex = 7; - break; - 
case Ity_I16: - maxIndex = 15; - break; - case Ity_I32: - maxIndex = 31; - break; - case Ity_I64: - maxIndex = 63; - break; - default: - vpanic("s390_irgen_VGM: unknown type"); - } - - for(UChar index = from; index <= to; index++) { - value |= (1ULL << (maxIndex - index)); - } - - IRExpr *fillValue; - switch (type) { - case Ity_I8: + s390_insn_assert("vgm", m3 <= 3); + + UChar max_idx = (8 << m3) - 1; + UChar from = max_idx & (i2 >> 8); + UChar to = max_idx & i2; + ULong all_one = (1ULL << max_idx << 1) - 1; + ULong value = (all_one >> from) ^ (all_one >> to >> 1); + + /* In case of wrap-around we now have a value that needs inverting: + to from + V V + 00000111111111110000000000000000 */ + if (to < from) + value ^= all_one; + + IRExpr* fillValue; + switch (m3) { + case 0: fillValue = mkU8(value); break; - case Ity_I16: + case 1: fillValue = mkU16(value); break; - case Ity_I32: + case 2: fillValue = mkU32(value); break; - case Ity_I64: + case 3: fillValue = mkU64(value); break; default: - vpanic("s390_irgen_VGM: unknown type"); + vpanic("s390_irgen_VGM: unknown element size"); } s390_vr_fill(v1, fillValue); |
|
From: Wu, F. <fe...@in...> - 2023-07-06 12:40:15
|
On 5/29/2023 11:29 AM, Wu, Fei wrote:
> On 5/28/2023 1:06 AM, Petr Pavlu wrote:
>> On 21. Apr 23 17:25, Jojo R wrote:
>>> We consider to add RVV/Vector [1] feature in valgrind, there are some
>>> challenges.
>>> RVV like ARM's SVE [2] programming model, it's scalable/VLA, that means the
>>> vector length is agnostic.
>>> ARM's SVE is not supported in valgrind :(
>>>
>>> There are three major issues in implementing the RVV instruction set in Valgrind,
>>> as follows:
>>>
>>> 1. Scalable vector register width VLENB
>>> 2. Runtime changing property of LMUL and SEW
>>> 3. Lack of proper VEX IR to represent all vector operations
>>>
>>> We propose applicable methods to solve 1 and 2. As for 3, we explore several
>>> possible but maybe imperfect approaches to handle different cases.
>>>
I did a very basic prototype for vlen Vector-IR, particularly on RISC-V
Vector (RVV):
* Define new iops such as Iop_VAdd8/16/32/64, the difference from
existing SIMD version is that no element number is specified like
Iop_Add8x32
* Define new IR type Ity_VLen along side existing types such as Ity_I64,
Ity_V256
* Define new class HRcVecVLen in HRegClass for vlen vector registers
The real length is embedded in both IROp and IRType for vlen ops/types,
it's runtime-decided and already known when handling insn such as vadd,
this leads to more flexibility, e.g. backend can issue extra vsetvl if
necessary.
With the above, RVV instruction in the guest can be passed from
frontend, to memcheck, to the backend, and generate the final RVV insn
during host isel, a very basic testcase has been tested.
Now here comes to the complexities:
1. RVV has the concept of LMUL, which groups multiple (or partial)
vector registers, e.g. when LMUL==2, v2 means the real v2+v3. This
complicates the register allocation.
2. RVV uses the "implicit" v0 for mask, its content must be loaded to
the exact "v0" register instead of any other ones if host isel wants to
leverage RVV insn, this implicitness in ISA requires more explicitness
in Valgrind implementation.
For #1 LMUL, a new register allocation algorithm for it can be added,
and it will be great if someone is willing to try it, I'm not sure how
much effort it will take. The other way is splitting it into multiple
ops which only takes one vector register, taking vadd for example, 2
vadd will run with LMUL=1 for one vadd with LMUL=2, this is still okay
for the widening insn, most of the arithmetic insns can be covered in
this way. The exception could be register gather insn vrgather, which we
can consult other ways for it, e.g. scalar or helper.
For #2 v0 mask, one way is to handle the mask in the very beginning at
guest_riscv64_toIR.c, similar to what AVX port does:
a) Read the whole dest register without mask
b) Generate unmasked result by running op without mask
c) Applying mask to a,b and generate the final dest
by doing this, insns with mask are converted to non-mask ones; although
more insns are generated, the performance should be acceptable. There
are still exceptions, e.g. vadc (Add-with-Carry), v0 is not used as mask
but as carry, but just as mentioned above, it's okay to use other ways
for a few insns. Eventually, we can pass v0 mask down to the backend if
it's proved a better solution.
This approach will introduce a bunch of new vlen Vector IRs, especially
the arithmetic IRs such as vadd, my goal is for a good solution which
takes reasonable time to reach usable status, yet still be able to
evolve and generic enough for other vector ISA. Any comments?
Best Regards,
Fei.
>>> We start from 1. As each guest register should be described in VEXGuestState
>>> struct, the vector registers with scalable width of VLENB can be added into
>>> VEXGuestState as arrays using an allowable maximum length like 2048/4096.
>>
>> Size of VexGuestRISCV64State is currently 592 bytes. Adding these large
>> vector registers will bump it by 32*2048/8=8192 bytes.
>>
> Yes, that's the reason in my RFC patches the vlen is set to 128, that's
> the largest room for vector in current design.
>
>> The baseblock layout in VEX is: the guest state, two equal sized areas
>> for shadow state and then a spill area. The RISC-V port accesses the
>> baseblock in generated code via x8/s0. The register is set to the
>> address of the baseblock+2048 (file
>> coregrind/m_dispatch/dispatch-riscv64-linux.S). The extra offset is
>> a small optimization to utilize the fact that load/store instructions in
>> RVI have a signed offset in range [-2048,2047]. The end result is that
>> it is possible to access the baseblock data using only a single
>> instruction.
>>
> Nice design.
>
>> Adding the new vector registers will cause that more instructions will
>> be necessary. For instance, accessing any shadow guest state would
>> naively require a sequence of LUI+ADDI+LOAD/STORE.
>>
>> I suspect this could affect performance quite a bit and might need some
>> optimizing.
>>
> Yes, can we separate the vector registers from the other ones, is it
> able to use two baseblocks? Or we can do some experiments to measure the
> overhead.
>
>>>
>>> The actual available access range can be determined at Valgrind startup time
>>> by querying the CPU for its vector capability or some suitable setup steps.
>>
>> Something to consider is that the virtual CPU provided by Valgrind does
>> not necessarily need to match the host CPU. For instance, VEX could
>> hardcode that its vector registers are only 128 bits in size.
>>
>> I was originally hoping that this is how support for the V extension
>> could be added, but the LMUL grouping looks to break this model.
>>
> Originally I had the same idea, but 128 vlen hardware cannot run the
> software built for larger vlen, e.g. clang has option
> -riscv-v-vector-bits-min, if it's set to 256, then it assumes the
> underlying hardware has at least 256 vlen.
>
>>>
>>>
>>> To solve problem 2, we are inspired by already-proven techniques in QEMU,
>>> where translation blocks are broken up when certain critical CSRs are set.
>>> Because the guest code to IR translation relies on the precise value of
>>> LMUL/SEW and they may change within a basic block, we can break up the basic
>>> block each time encountering a vsetvl{i} instruction and return to the
>>> scheduler to execute the translated code and update LMUL/SEW. Accordingly,
>>> translation cache management should be refactored to detect the changing of
>>> LMUL/SEW to invalidate outdated code cache. Without losing the generality,
>>> the LMUL/SEW should be encoded into an ULong flag such that other
>>> architectures can leverage this flag to store their arch-dependent
>>> information. The TTentry struct should also take the flag into account no
>>> matter insertion or deletion. By doing this, the flag carries the newest
>>> LMUL/SEW throughout the simulation and can be passed to disassemble
>>> functions using the VEXArchInfo struct such that we can get the real and
>>> newest value of LMUL and SEW to facilitate our translation.
>>>
>>> Also, some architecture-related code should be taken care of. Like
>>> m_dispatch part, disp_cp_xindir function looks up code cache using hardcoded
>>> assembly by checking the requested guest state IP and translation cache
>>> entry address with no more constraints. Many other modules should be checked
>>> to ensure the in-time update of LMUL/SEW is instantly visible to essential
>>> parts in Valgrind.
>>>
>>>
>>> The last remaining big issue is 3, which we introduce some ad-hoc approaches
>>> to deal with. We summarize these approaches into three types as follows:
>>>
>>> 1. Break down a vector instruction to scalar VEX IR ops.
>>> 2. Break down a vector instruction to fixed-length VEX IR ops.
>>> 3. Use dirty helpers to realize vector instructions.
>>
>> I would also look at adding new VEX IR ops for scalable vector
>> instructions. In particular, if it could be shown that RVV and SVE can
>> use same new ops then it could make a good argument for adding them.
>>
>> Perhaps interesting is if such new scalable vector ops could also
>> represent fixed operations on other architectures, but that is just me
>> thinking out loud.
>>
> It's a good idea to consolidate all vector/simd together, the challenge
> is to verify its feasibility and to speed up the adaptation process, as
> it's supposed to take more effort and a longer time. Is there anyone with
> knowledge or experience of other ISA such as avx/sve on valgrind can
> share the pain and gain, or we can do some quick prototype?
>
> Thanks,
> Fei.
>
>>> [...]
>>> In summary, we are still far from a truly applicable solution for adding vector
>>> extensions in Valgrind. We need to do detailed and comprehensive estimations
>>> on different vector instruction categories.
>>>
>>> Any feedback is welcome in github [3] also.
>>>
>>>
>>> [1] https://github.com/riscv/riscv-v-spec
>>>
>>> [2] https://community.arm.com/arm-research/b/articles/posts/the-arm-scalable-vector-extension-sve
>>>
>>> [3] https://github.com/petrpavlu/valgrind-riscv64/issues/17
>>
>> Sorry for not being more helpful at this point. As mentioned in the
>> GitHub issue, I still need to get myself more familiar with RVV and how
>> Valgrind handles vector instructions.
>>
>> Thanks,
>> Petr
>>
>>
>>
>> _______________________________________________
>> Valgrind-developers mailing list
>> Val...@li...
>> https://lists.sourceforge.net/lists/listinfo/valgrind-developers
>
>
>
> _______________________________________________
> Valgrind-developers mailing list
> Val...@li...
> https://lists.sourceforge.net/lists/listinfo/valgrind-developers
|
|
From: Paul F. <pj...@wa...> - 2023-07-04 05:29:46
|
Hi I just pushed a change to the web pages that adds this info. A+ Paul |
|
From: Paul F. <pa...@so...> - 2023-07-02 11:02:04
|
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=73ec73ed7fe20ec6427dba63e52534136f3c19bd commit 73ec73ed7fe20ec6427dba63e52534136f3c19bd Author: Paul Floyd <pj...@wa...> Date: Sun Jul 2 12:59:40 2023 +0200 FreeBSD: add default to configure.ac FreeBSD 13 versions Also add comment to README.freebsd about ensuring that jails set "uname -r" to be something compatible with the normal RELEASE/STABLE/CURRENT releases. Diff: --- README.freebsd | 4 ++++ configure.ac | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/README.freebsd b/README.freebsd index 90eefc89b9..d197efcaf3 100644 --- a/README.freebsd +++ b/README.freebsd @@ -21,6 +21,10 @@ $ ./configure --prefix=/where/ever $ gmake $ gmake install +If you are using a jail for building, make sure that it is configured so that +"uname -r" returns a string that matches the pattern "XX.Y-*" where XX is the +major version (12, 13, 14 ...) and Y is the minor version (0, 1, 2, 3). + Known Limitations (June 2022) 0. Be aware that if you use a wrapper script and run Valgrind on the wrapper diff --git a/configure.ac b/configure.ac index 1d4164a7d8..4dbb1753c7 100755 --- a/configure.ac +++ b/configure.ac @@ -444,6 +444,10 @@ case "${host_os}" in AC_DEFINE([FREEBSD_VERS], FREEBSD_13_2, [FreeBSD version]) freebsd_vers=$freebsd_13_2 ;; + *) + AC_MSG_RESULT([unsupported (${kernel})]) + AC_MSG_ERROR([Valgrind works on FreeBSD 10.x to 14.x]) + ;; esac ;; 14.*) |
|
From: Feiyang C. <chr...@gm...> - 2023-06-30 09:57:47
|
Hi, I sent patches v5, which were rebased on master and squashed into 40 commits. I am working on supporting vector on LoongArch64 now. https://bugs.kde.org/show_bug.cgi?id=457504 Thanks, Feiyang |
|
From: Andreas A. <ar...@so...> - 2023-06-28 14:20:55
|
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=b4cc7815ba722426c5456831f858a2aeceb3761f commit b4cc7815ba722426c5456831f858a2aeceb3761f Author: Andreas Arnez <ar...@li...> Date: Thu Jun 15 17:24:53 2023 +0200 Bug 470978 - s390x: Link the tools with -Wl,--s390-pgste Programs that require the PGSTE mode to be enabled may currently fail under Valgrind. In particular this affects qemu-kvm. While it is also possible to enable the PGSTE mode globally with sysctl vm.allocate_pgste=1 the problem can more easily be prevented by linking the Valgrind tools with -Wl,--s390-pgste. Add a configure check if the linker supports this, and activate the flag if it does. To verify the intended result, the following shell command can be used to list the executables having this flag set: find . -type f -perm -u+x -execdir \ /bin/sh -c 'readelf -lW $0 2>/dev/null | grep PGSTE' {} \; -print Diff: --- Makefile.tool.am | 2 +- NEWS | 1 + configure.ac | 20 ++++++++++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/Makefile.tool.am b/Makefile.tool.am index df95029138..4ce6d5ab0d 100644 --- a/Makefile.tool.am +++ b/Makefile.tool.am @@ -78,7 +78,7 @@ TOOL_LDFLAGS_ARM64_LINUX = \ $(TOOL_LDFLAGS_COMMON_LINUX) @FLAG_M64@ TOOL_LDFLAGS_S390X_LINUX = \ - $(TOOL_LDFLAGS_COMMON_LINUX) @FLAG_M64@ + $(TOOL_LDFLAGS_COMMON_LINUX) @FLAG_M64@ @FLAG_S390_PGSTE@ TOOL_LDFLAGS_X86_DARWIN = \ $(TOOL_LDFLAGS_COMMON_DARWIN) -arch i386 diff --git a/NEWS b/NEWS index c22c82131d..a4e7533115 100644 --- a/NEWS +++ b/NEWS @@ -43,6 +43,7 @@ are not entered into bugzilla tend to get forgotten about or ignored. (Bool vgPlain_realpath(const HChar *, HChar *)): Assertion 'resolved' failed 470830 Don't print actions vgdb me ... 
continue for vgdb --multi mode +470978 s390x: Valgrind cannot start qemu-kvm when "sysctl vm.allocate_pgste=0" To see details of a given bug, visit https://bugs.kde.org/show_bug.cgi?id=XXXXXX diff --git a/configure.ac b/configure.ac index 0cf84a1c00..1d4164a7d8 100755 --- a/configure.ac +++ b/configure.ac @@ -3096,6 +3096,26 @@ AC_SUBST([FLAG_NO_BUILD_ID], [""]) fi CFLAGS=$safe_CFLAGS +# On s390x, if the linker supports -Wl,--s390-pgste, then we build the +# tools with that flag. This enables running programs that need it, such +# as qemu-kvm. +if test x$VGCONF_PLATFORM_PRI_CAPS = xS390X_LINUX; then +AC_MSG_CHECKING([if the linker accepts -Wl,--s390-pgste]) +safe_CFLAGS=$CFLAGS +CFLAGS="-Wl,--s390-pgste" + +AC_LINK_IFELSE( +[AC_LANG_PROGRAM([ ], [return 0;])], +[ + AC_SUBST([FLAG_S390_PGSTE], ["-Wl,--s390-pgste"]) + AC_MSG_RESULT([yes]) +], [ + AC_SUBST([FLAG_S390_PGSTE], [""]) + AC_MSG_RESULT([no]) +]) +CFLAGS=$safe_CFLAGS +fi + # does the ppc assembler support "mtocrf" et al? AC_MSG_CHECKING([if ppc32/64 as supports mtocrf/mfocrf]) |
|
From: Mark W. <ma...@so...> - 2023-06-15 15:50:01
|
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=bf0c73231b76e293a103ed8b2178975c7032f669 commit bf0c73231b76e293a103ed8b2178975c7032f669 Author: Mark Wielaard <ma...@kl...> Date: Fri Jun 9 15:21:57 2023 +0200 Don't print action vgdb me ... and continuing ... in vgdb --multi mode Guard each (action) vgdb me ... VG_(umsg) printing with !(VG_(clo_launched_with_multi)) https://bugs.kde.org/show_bug.cgi?id=470830 Diff: --- NEWS | 4 +++- coregrind/m_errormgr.c | 6 ++++-- coregrind/m_gdbserver/m_gdbserver.c | 9 +++++---- coregrind/m_libcassert.c | 3 ++- coregrind/m_main.c | 10 ++++++---- 5 files changed, 20 insertions(+), 12 deletions(-) diff --git a/NEWS b/NEWS index 52ee38ab8b..c22c82131d 100644 --- a/NEWS +++ b/NEWS @@ -40,7 +40,9 @@ are not entered into bugzilla tend to get forgotten about or ignored. 470121 Can't run callgrind_control with valgrind 3.21.0 because of perl errors 470520 Multiple realloc zero errors crash in MC_(eq_Error) 470713 Failure on the Yosys project: valgrind: m_libcfile.c:1802 - (Bool vgPlain_realpath(const HChar *, HChar *)): Assertion 'resolved' failed + (Bool vgPlain_realpath(const HChar *, HChar *)): + Assertion 'resolved' failed +470830 Don't print actions vgdb me ... continue for vgdb --multi mode To see details of a given bug, visit https://bugs.kde.org/show_bug.cgi?id=XXXXXX diff --git a/coregrind/m_errormgr.c b/coregrind/m_errormgr.c index 6be637190a..63c0e4eaa7 100644 --- a/coregrind/m_errormgr.c +++ b/coregrind/m_errormgr.c @@ -526,9 +526,11 @@ void do_actions_on_error(const Error* err, Bool allow_db_attach) if (VG_(clo_vgdb) != Vg_VgdbNo && allow_db_attach && VG_(clo_vgdb_error) <= n_errs_shown) { - VG_(umsg)("(action on error) vgdb me ... \n"); + if (!(VG_(clo_launched_with_multi))) + VG_(umsg)("(action on error) vgdb me ... \n"); VG_(gdbserver)( err->tid ); - VG_(umsg)("Continuing ...\n"); + if (!(VG_(clo_launched_with_multi))) + VG_(umsg)("Continuing ...\n"); } /* Or maybe we want to generate the error's suppression? 
*/ diff --git a/coregrind/m_gdbserver/m_gdbserver.c b/coregrind/m_gdbserver/m_gdbserver.c index f8fbc5af23..5d0973e9ed 100644 --- a/coregrind/m_gdbserver/m_gdbserver.c +++ b/coregrind/m_gdbserver/m_gdbserver.c @@ -602,11 +602,11 @@ void VG_(gdbserver_prerun_action) (ThreadId tid) // Using VG_(clo_vgdb_error) allows the user to control if gdbserver // stops after a fork. if ((VG_(clo_vgdb_error) == 0 - || (VgdbStopAtiS(VgdbStopAt_Startup, VG_(clo_vgdb_stop_at)))) - && !(VG_(clo_launched_with_multi))) { + || (VgdbStopAtiS(VgdbStopAt_Startup, VG_(clo_vgdb_stop_at))))) { /* The below call allows gdb to attach at startup before the first guest instruction is executed. */ - VG_(umsg)("(action at startup) vgdb me ... \n"); + if (!(VG_(clo_launched_with_multi))) + VG_(umsg)("(action at startup) vgdb me ... \n"); VG_(gdbserver)(tid); } else { /* User has activated gdbserver => initialize now the FIFOs @@ -975,7 +975,8 @@ void VG_(gdbserver_report_fatal_signal) (const vki_siginfo_t *info, return; } - VG_(umsg)("(action on fatal signal) vgdb me ... \n"); + if (!(VG_(clo_launched_with_multi))) + VG_(umsg)("(action on fatal signal) vgdb me ... \n"); /* indicate to gdbserver that there is a signal */ gdbserver_signal_encountered (info); diff --git a/coregrind/m_libcassert.c b/coregrind/m_libcassert.c index 35f37f88df..0b04bfcc1d 100644 --- a/coregrind/m_libcassert.c +++ b/coregrind/m_libcassert.c @@ -282,7 +282,8 @@ static void exit_wrk( Int status, Bool gdbserver_call_allowed) if (status != 0 && VgdbStopAtiS(VgdbStopAt_ValgrindAbExit, VG_(clo_vgdb_stop_at))) { if (VG_(gdbserver_init_done)()) { - VG_(umsg)("(action at valgrind abnormal exit) vgdb me ... \n"); + if (!(VG_(clo_launched_with_multi))) + VG_(umsg)("(action at valgrind abnormal exit) vgdb me ... 
\n"); VG_(gdbserver) (atid); } else { VG_(umsg)("(action at valgrind abnormal exit)\n" diff --git a/coregrind/m_main.c b/coregrind/m_main.c index a857e5afeb..b8751341a0 100644 --- a/coregrind/m_main.c +++ b/coregrind/m_main.c @@ -2258,12 +2258,14 @@ void shutdown_actions_NORETURN( ThreadId tid, /* Final call to gdbserver, if requested. */ if (VG_(gdbserver_stop_at) (VgdbStopAt_Abexit) && tid_exit_code (tid) != 0) { - VG_(umsg)("(action at abexit, exit code %d) vgdb me ... \n", - tid_exit_code (tid)); + if (!(VG_(clo_launched_with_multi))) + VG_(umsg)("(action at abexit, exit code %d) vgdb me ... \n", + tid_exit_code (tid)); VG_(gdbserver) (tid); } else if (VG_(gdbserver_stop_at) (VgdbStopAt_Exit)) { - VG_(umsg)("(action at exit, exit code %d) vgdb me ... \n", - tid_exit_code (tid)); + if (!(VG_(clo_launched_with_multi))) + VG_(umsg)("(action at exit, exit code %d) vgdb me ... \n", + tid_exit_code (tid)); VG_(gdbserver) (tid); } VG_(threads)[tid].status = VgTs_Empty; |
|
From: Mark W. <ma...@so...> - 2023-06-15 15:02:05
|
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=5a97c06080078aab8adfcc8985aecce7bfa5a738 commit 5a97c06080078aab8adfcc8985aecce7bfa5a738 Author: Tulio Magno Quites Machado Filho <tu...@re...> Date: Wed Jun 14 11:28:38 2023 -0300 s390x: Replace absolute jump for a relative one The bne instruction expects an absolute target address and it isn't best-suited for implementing a short range jump, such as the one in XCHG_M_R(). Replace it with jne which expects a relative address that can be correctly computed at link time. Interestingly, the jump is almost never taken. If it would, this would crash the test. However, linkers may complain when relocating the target address used in bne. Diff: --- helgrind/tests/tc11_XCHG.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helgrind/tests/tc11_XCHG.c b/helgrind/tests/tc11_XCHG.c index f6ff1c9846..08e34a0b57 100644 --- a/helgrind/tests/tc11_XCHG.c +++ b/helgrind/tests/tc11_XCHG.c @@ -81,7 +81,7 @@ __asm__ __volatile__( \ "0: l 0,%[global]\n\t" \ " cs 0,%[local],%[global]\n\t" \ - " bne 0b\n\t" \ + " jne 0b\n\t" \ " lr %[local],0\n\t" \ : /*out*/ [global]"+m"(_addr), [local]"+d"(_lval) \ : /*in*/ \ |
|
From: Mark W. <ma...@kl...> - 2023-06-14 21:26:37
|
Hi, On Thu, Jun 08, 2023 at 04:55:38PM +0200, Floyd, Paul wrote: > On 16/05/2023 06:43, Nicholas Nethercote wrote: > > > >Are there any consequences of note for Valgrind? Judging by this > >paragraph, not particularly: > > > >> Sourceware will continue its long standing mission of providing free > > software infrastructure to the projects it supports, and this will not > > change moving forward. The affiliation with SFC will be transparent to > > the projects hosted on Sourceware. Project admins will keep being in > > charge of how they utilize the services Sourceware provides. > > > >Is that right? Yeah, it really is about the infrastructure, not about how projects use the infrastructure. But if we want any changes to the services provided we can always ask. Maybe one concrete thing might be for Sourceware/SFC to hold the valgrind.org domain for the project so no one individual is responsible for keeping it valid (although that isn't a big burden, just a convenient way to reduce the "bus factor"). > >I have been thinking a bit recently about the fact that Valgrind > >doesn't have any explicit governance structure or decision-making > >processes, and how it would be good to have some. > >developers > > I've been listening to afew 'Oxide and Friends' podcasts recently > (which has a heavy Rust slant), and yes, it would be good to have > some more in the way of governance. > > But first it would be even better to have more developers. Yes, but having a bit more visible "governance" might help with that. So people who join have a better view of what to expect. I admit I am still acting as if Julian is the BDFL. If we do something really bad he will certainly step in :) But that is cheating a little I guess. In practice our governance is having consensus around the DEVELOPER and processes READMEs. Doing releases twice a year at fixed dates/months (and the video chats planning those) do seem to work well. 
What we don't really have is a process for when there isn't clear consensus. Which means we never really make radical changes. Also we don't have enough reviewers for bigger changes (there are still two ports pending). Cheers, Mark |
|
From: LATHUILIERE B. <bru...@ed...> - 2023-06-12 12:52:34
|
Hi, I like the idea to add verrou in the variant list. You can get the source and documentation from github : https://github.com/edf-hpc/verrou/ The direct link to the documentation of the last version : http://edf-hpc.github.io/verrou/vr-manual.html (Soon or later I will change the link, to keep the documentation of old versions) The main references about verrou are : - François Févotte and Bruno Lathuilière. Debugging and optimization of HPC programs with the Verrou tool. In International Workshop on Software Correctness for HPC Applications (Correctness), Denver, CO, USA, Nov. 2019. DOI: 10.1109/Correctness49594.2019.00006 https://hal.science/hal-02044101/ - François Févotte and Bruno Lathuilière. Studying the numerical quality of an industrial computing code: A case study on code_aster. In 10th International Workshop on Numerical Software Verification (NSV), pages 61--80, Heidelberg, Germany, July 2017. DOI: 10.1007/978-3-319-63501-9_5 https://www.fevotte.net/publications/fevotte2017a.pdf - François Févotte and Bruno Lathuilière. VERROU: a CESTAC evaluation without recompilation. In International Symposium on Scientific Computing, Computer Arithmetics and Verified Numerics (SCAN), Uppsala, Sweden, September 2016. https://www.fevotte.net/publications/fevotte2016.pdf And if you are interested by the required number of samples, you should read the following paper (not specific to verrou) : - Devan Sohier, Pablo De Oliveira Castro, François Févotte, Bruno Lathuilière, Eric Petit, and Olivier Jamond. Confidence intervals for stochastic arithmetic. ACM Transactions on Mathematical Software, 47(2), 2021. https://hal.science/hal-01827319 ++ Bruno Lathuilière -----Message d'origine----- De : pj...@wa... <pj...@wa...> Envoyé : lundi 12 juin 2023 11:26 À : val...@li... 
Objet : Re: [Valgrind-developers] RFC: support scalable vector model / riscv vector On 01/06/2023 13:13, LATHUILIERE Bruno via Valgrind-developers wrote: > I don't know if my experience is the one you expect, nevertheless I will try to share it. > I'm the main developer of a valgrind tool called verrou (url: https://github.com/edf-hpc/verrou ) which currently only works with x86_64 architecture. > From user's point of view, verrou enables to estimate the effect of the floating-point rounding error propagation (If you are interested by the subject, there are documentation and publication). [snip] Interesting, I don't remember having seen anything on verrou. I need to look more at the doc and publications. I'll add a link to https://valgrind.org/downloads/variants.html (which is a bit out of date) A+ Paul _______________________________________________ Valgrind-developers mailing list Val...@li... https://lists.sourceforge.net/lists/listinfo/valgrind-developers Ce message et toutes les pièces jointes (ci-après le 'Message') sont établis à l'intention exclusive des destinataires et les informations qui y figurent sont strictement confidentielles. Toute utilisation de ce Message non conforme à sa destination, toute diffusion ou toute publication totale ou partielle, est interdite sauf autorisation expresse. Si vous n'êtes pas le destinataire de ce Message, il vous est interdit de le copier, de le faire suivre, de le divulguer ou d'en utiliser tout ou partie. Si vous avez reçu ce Message par erreur, merci de le supprimer de votre système, ainsi que toutes ses copies, et de n'en garder aucune trace sur quelque support que ce soit. Nous vous remercions également d'en avertir immédiatement l'expéditeur par retour du message. Il est impossible de garantir que les communications par messagerie électronique arrivent en temps utile, sont sécurisées ou dénuées de toute erreur ou virus. 
____________________________________________________ This message and any attachments (the 'Message') are intended solely for the addressees. The information contained in this Message is confidential. Any use of information contained in this Message not in accord with its purpose, any dissemination or disclosure, either whole or partial, is prohibited except formal approval. If you are not the addressee, you may not copy, forward, disclose or use any part of it. If you have received this message in error, please delete it and all copies from your system and notify the sender immediately by return message. E-mail communication cannot be guaranteed to be timely secure, error or virus-free. |
|
From: Floyd, P. <pj...@wa...> - 2023-06-12 09:26:16
|
On 01/06/2023 13:13, LATHUILIERE Bruno via Valgrind-developers wrote: > I don't know if my experience is the one you expect, nevertheless I will try to share it. > I'm the main developer of a valgrind tool called verrou (url: https://github.com/edf-hpc/verrou ) which currently only works with x86_64 architecture. > From user's point of view, verrou enables to estimate the effect of the floating-point rounding error propagation (If you are interested by the subject, there are documentation and publication). [snip] Interesting, I don't remember having seen anything on verrou. I need to look more at the doc and publications. I'll add a link to https://valgrind.org/downloads/variants.html (which is a bit out of date) A+ Paul |
|
From: Paul F. <pa...@so...> - 2023-06-09 11:20:26
|
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=3df8a00a4ed7dbe436f28d8b3db72e679eb1b427 commit 3df8a00a4ed7dbe436f28d8b3db72e679eb1b427 Author: Paul Floyd <pj...@wa...> Date: Fri Jun 9 13:17:58 2023 +0200 470121 - Can't run callgrind_control with valgrind 3.21.0 because of perl errors Diff: --- NEWS | 1 + callgrind/callgrind_control.in | 94 +++++++++++++++++++++++++----------------- 2 files changed, 57 insertions(+), 38 deletions(-) diff --git a/NEWS b/NEWS index 4c5635dde1..52ee38ab8b 100644 --- a/NEWS +++ b/NEWS @@ -37,6 +37,7 @@ are not entered into bugzilla tend to get forgotten about or ignored. 469049 link failure on ppc64 (big endian) valgrind 3.20 469146 massif --ignore-fn does not ignore inlined functions 469768 Make it possible to install gdb scripts in a different location +470121 Can't run callgrind_control with valgrind 3.21.0 because of perl errors 470520 Multiple realloc zero errors crash in MC_(eq_Error) 470713 Failure on the Yosys project: valgrind: m_libcfile.c:1802 (Bool vgPlain_realpath(const HChar *, HChar *)): Assertion 'resolved' failed diff --git a/callgrind/callgrind_control.in b/callgrind/callgrind_control.in index 083ffa29fc..bee6661efb 100644 --- a/callgrind/callgrind_control.in +++ b/callgrind/callgrind_control.in @@ -29,6 +29,12 @@ use File::Basename; # vgdb_exe will be set to a vgdb found 'near' the callgrind_control file my $vgdb_exe = ""; +my $vgdbPrefixOption = ""; +my $cmd = ""; +my %cmd; +my %cmdline; +my $pid = -1; +my @pids = (); sub getCallgrindPids { @@ -50,6 +56,8 @@ sub getCallgrindPids { close LIST; } +my $headerPrinted = 0; + sub printHeader { if ($headerPrinted) { return; } $headerPrinted = 1; @@ -95,11 +103,17 @@ sub printHelp { # Parts more or less copied from cg_annotate (author: Nicholas Nethercote) # +my $event = ""; +my $events = ""; +my %events = (); +my @events = (); +my @show_events = (); +my @show_order = (); + sub prepareEvents { @events = split(/\s+/, $events); - %events = (); - $n = 0; + my $n = 0; 
foreach $event (@events) { $events{$event} = $n; $n++; @@ -178,7 +192,7 @@ sub print_events ($) { my ($CC_col_widths) = @_; - foreach my $i (@show_order) { + foreach my $i (@show_order) { my $event = $events[$i]; my $event_width = length($event); my $col_width = $CC_col_widths->[$i]; @@ -209,7 +223,7 @@ if (-x $controldir . "/vgdb") { # To find the list of active pids, we need to have # the --vgdb-prefix option if given. -$vgdbPrefixOption = ""; +my $arg = ""; foreach $arg (@ARGV) { if ($arg =~ /^--vgdb-prefix=.*$/) { $vgdbPrefixOption=$arg; @@ -219,15 +233,19 @@ foreach $arg (@ARGV) { getCallgrindPids; -$requestEvents = 0; -$requestDump = 0; -$switchInstr = 0; -$headerPrinted = 0; -$dumpHint = ""; +my $requestEvents = 0; +my $requestDump = 0; +my $switchInstr = 0; +my $dumpHint = ""; +my $printBacktrace = 0; +my $printStatus = 0; +my $switchInstrMode = ""; +my $requestKill = ""; +my $requestZero = ""; -$verbose = 0; +my $verbose = 0; -%spids = (); +my %spids = (); foreach $arg (@ARGV) { if ($arg =~ /^-/) { if ($requestDump == 1) { $requestDump = 2; } @@ -329,8 +347,8 @@ foreach $arg (@ARGV) { } if (defined $cmd{$arg}) { $spids{$arg} = 1; next; } - $nameFound = 0; - foreach $p (@pids) { + my $nameFound = 0; + foreach my $p (@pids) { if ($cmd{$p} =~ /$arg$/) { $nameFound = 1; $spids{$p} = 1; @@ -353,11 +371,11 @@ if (scalar @pids == 0) { exit; } -@spids = keys %spids; +my @spids = keys %spids; if (scalar @spids >0) { @pids = @spids; } -$vgdbCommand = ""; -$waitForAnswer = 0; +my $vgdbCommand = ""; +my $waitForAnswer = 0; if ($requestDump) { $vgdbCommand = "dump"; if ($dumpHint ne "") { $vgdbCommand .= " ".$dumpHint; } @@ -371,7 +389,7 @@ if ($printStatus || $printBacktrace || $requestEvents) { } foreach $pid (@pids) { - $pidstr = "PID $pid: "; + my $pidstr = "PID $pid: "; if ($pid >0) { print $pidstr.$cmdline{$pid}; } if ($vgdbCommand eq "") { @@ -385,24 +403,24 @@ foreach $pid (@pids) { } open RESULT, $vgdb_exe . 
" $vgdbPrefixOption --pid=$pid $vgdbCommand|"; - @tids = (); - $ctid = 0; - %fcount = (); - %func = (); - %calls = (); - %events = (); - @events = (); - @threads = (); - %totals = (); - - $exec_bbs = 0; - $dist_bbs = 0; - $exec_calls = 0; - $dist_calls = 0; - $dist_ctxs = 0; - $dist_funcs = 0; - $threads = ""; - $events = ""; + my @tids = (); + my $tid; + my $ctid = 0; + my %fcount = (); + my %func = (); + my %calls = (); + my @threads = (); + my %totals = (); + my $totals_width = []; + + my $exec_bbs = 0; + my $dist_bbs = 0; + my $exec_calls = 0; + my $dist_calls = 0; + my $dist_ctxs = 0; + my $dist_funcs = 0; + my $threads = ""; + my $instrumentation = ""; while(<RESULT>) { if (/function-(\d+)-(\d+): (.+)$/) { @@ -485,10 +503,10 @@ foreach $pid (@pids) { } print "Backtrace for Thread $tid\n"; - $i = $fcount{$tid}; - $c = 0; + my $i = $fcount{$tid}; + my $c = 0; while($i>0 && $c<100) { - $fc = substr(" $c",-2); + my $fc = substr(" $c",-2); print " [$fc] "; if ($requestEvents >0) { print_CC($events{$tid,$i-1}, $totals_width); |