From: Julian S. <se...@so...> - 2021-07-13 07:11:15
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=e5f66a2aa00fa88ba3e0fb004510f0a630881ef1 commit e5f66a2aa00fa88ba3e0fb004510f0a630881ef1 Author: Julian Seward <js...@ac...> Date: Tue Jul 13 09:07:45 2021 +0200 Reimplement h_generic_calc_GetMSBs8x16 to be more efficient. h_generic_calc_GetMSBs8x16 concatenates the top bit of each 8-bit lane in a 128-bit value, producing a 16-bit scalar value. (It is PMOVMSKB, really). The existing implementation is excessively inefficient and shows up sometimes in 'perf' profiles of V. This commit replaces it with a logarithmic (4-stage) algorithm which is hopefully much faster. Diff: --- VEX/priv/host_generic_simd128.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/VEX/priv/host_generic_simd128.c b/VEX/priv/host_generic_simd128.c index 1c0f7cfbaf..f895de46f4 100644 --- a/VEX/priv/host_generic_simd128.c +++ b/VEX/priv/host_generic_simd128.c @@ -383,23 +383,20 @@ void VEX_REGPARM(3) UInt /*not-regparm*/ h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo ) { - UInt r = 0; - if (w64hi & (1ULL << (64-1))) r |= (1<<15); - if (w64hi & (1ULL << (56-1))) r |= (1<<14); - if (w64hi & (1ULL << (48-1))) r |= (1<<13); - if (w64hi & (1ULL << (40-1))) r |= (1<<12); - if (w64hi & (1ULL << (32-1))) r |= (1<<11); - if (w64hi & (1ULL << (24-1))) r |= (1<<10); - if (w64hi & (1ULL << (16-1))) r |= (1<<9); - if (w64hi & (1ULL << ( 8-1))) r |= (1<<8); - if (w64lo & (1ULL << (64-1))) r |= (1<<7); - if (w64lo & (1ULL << (56-1))) r |= (1<<6); - if (w64lo & (1ULL << (48-1))) r |= (1<<5); - if (w64lo & (1ULL << (40-1))) r |= (1<<4); - if (w64lo & (1ULL << (32-1))) r |= (1<<3); - if (w64lo & (1ULL << (24-1))) r |= (1<<2); - if (w64lo & (1ULL << (16-1))) r |= (1<<1); - if (w64lo & (1ULL << ( 8-1))) r |= (1<<0); + /* Some serious bit twiddling going on here. Mostly we can do it in + parallel for the upper and lower 64 bits, assuming the processor offers + a suitably high level of ILP. 
*/ + w64hi &= 0x8080808080808080ULL; + w64lo &= 0x8080808080808080ULL; + w64hi >>= 7; + w64lo >>= 7; + w64hi |= (w64hi >> 7); + w64lo |= (w64lo >> 7); + w64hi |= (w64hi >> 14); + w64lo |= (w64lo >> 14); + w64hi |= (w64hi >> 28); + w64lo |= (w64lo >> 28); + UInt r = ((w64hi & 0xFF) << 8) | (w64lo & 0xFF); return r; } |