|
From: Matthias S. <zz...@ge...> - 2015-07-04 00:34:41
|
Bug 349879
Measurements:
$ perl perf/vg_perf --reps=10 --tools=memcheck --vg=../orig/ --vg=../x86-load-asm/ perf/
-- Running tests in perf ----------------------------------------------
-- bigcode1 --
bigcode1 orig :0.06s me: 2.5s (41.2x, -----)
bigcode1 x86-load-asm:0.06s me: 2.5s (41.2x, 0.0%)
-- bigcode2 --
bigcode2 orig :0.06s me: 5.8s (97.0x, -----)
bigcode2 x86-load-asm:0.06s me: 5.8s (96.5x, 0.5%)
-- bz2 --
bz2 orig :0.45s me: 5.1s (11.3x, -----)
bz2 x86-load-asm:0.45s me: 5.0s (11.0x, 2.5%)
-- fbench --
fbench orig :0.27s me: 3.2s (11.9x, -----)
fbench x86-load-asm:0.27s me: 3.1s (11.5x, 3.1%)
-- ffbench --
ffbench orig :0.18s me: 1.9s (10.8x, -----)
ffbench x86-load-asm:0.18s me: 1.9s (10.3x, 4.6%)
-- heap --
heap orig :0.07s me: 4.6s (65.6x, -----)
heap x86-load-asm:0.07s me: 4.6s (65.1x, 0.7%)
-- heap_pdb4 --
heap_pdb4 orig :0.08s me: 7.3s (91.5x, -----)
heap_pdb4 x86-load-asm:0.08s me: 7.2s (90.6x, 1.0%)
-- many-loss-records --
many-loss-records orig :0.01s me: 1.2s (122.0x, -----)
many-loss-records x86-load-asm:0.01s me: 1.2s (121.0x, 0.8%)
-- many-xpts --
many-xpts orig :0.04s me: 1.4s (34.0x, -----)
many-xpts x86-load-asm:0.04s me: 1.4s (33.8x, 0.7%)
-- memrw --
memrw orig :0.04s me: 1.1s (28.2x, -----)
memrw x86-load-asm:0.04s me: 1.1s (28.2x, 0.0%)
-- sarp --
sarp orig :0.01s me: 2.1s (215.0x, -----)
sarp x86-load-asm:0.01s me: 2.1s (208.0x, 3.3%)
-- tinycc --
tinycc orig :0.13s me: 5.5s (42.1x, -----)
tinycc x86-load-asm:0.13s me: 5.4s (41.7x, 0.9%)
-- Finished tests in perf ----------------------------------------------
== 12 programs, 24 timings =================
---
memcheck/mc_main.c | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 170 insertions(+), 3 deletions(-)
diff --git a/memcheck/mc_main.c b/memcheck/mc_main.c
index 73778fa..56e5618 100644
--- a/memcheck/mc_main.c
+++ b/memcheck/mc_main.c
@@ -1311,6 +1311,7 @@ void mc_LOADV_128_or_256_slow ( /*OUT*/ULong* res,
static
__attribute__((noinline))
+__attribute__((used))
VG_REGPARM(3) /* make sure we're using a fixed calling convention, since
this function may get called from hand written assembly. */
ULong mc_LOADVn_slow ( Addr a, SizeT nBits, Bool bigendian )
@@ -4430,10 +4431,49 @@ VG_REGPARM(1) ULong MC_(helperc_LOADV64be) ( Addr a )
{
return mc_LOADV64(a, True);
}
+
+#if ENABLE_ASSEMBLY_HELPERS && defined(PERF_FAST_LOADV) \
+ && defined(VGP_x86_linux)
+__asm__(
+".text\n"
+".align 16\n"
+".global vgMemCheck_helperc_LOADV64le\n"
+".type vgMemCheck_helperc_LOADV64le, @function\n"
+"vgMemCheck_helperc_LOADV64le:\n"
+" test $0x7,%al\n"
+" jnz 2f\n" /* jump if not aligned */
+" mov %eax,%ecx\n"
+" movzwl %ax,%edx\n"
+" shr $0x10,%ecx\n"
+" mov primary_map(,%ecx,4),%ecx\n"
+" shr $0x3,%edx\n"
+" movzwl (%ecx,%edx,2),%edx\n"
+" cmp $0xaaaa,%dx\n"
+" jne 1f\n" /* jump if not all defined */
+" xor %eax,%eax\n" /* return 0 in edx:eax */
+" xor %edx,%edx\n"
+" ret\n"
+".align 16\n"
+"1:\n"
+" cmp $0x5555,%dx\n"
+" jne 2f\n" /* jump if not all undefined */
+" or $0xffffffff,%eax\n" /* else return all bits set in edx:eax */
+" mov %eax,%edx\n"
+" ret\n"
+".align 16\n"
+"2:\n"
+" xor %ecx,%ecx\n" /* tail call to mc_LOADVn_slow(a, 64, 0) */
+" mov $64,%edx\n"
+" jmp mc_LOADVn_slow\n"
+".size vgMemCheck_helperc_LOADV64le, .-vgMemCheck_helperc_LOADV64le\n"
+".previous\n"
+);
+#else
VG_REGPARM(1) ULong MC_(helperc_LOADV64le) ( Addr a )
{
return mc_LOADV64(a, False);
}
+#endif
static INLINE
@@ -4586,8 +4626,43 @@ __asm__( /* Derived from NCode template */
".previous\n"
);
+#elif ENABLE_ASSEMBLY_HELPERS && defined(PERF_FAST_LOADV) \
+ && defined(VGP_x86_linux)
+__asm__(
+".text\n"
+".align 16\n"
+".global vgMemCheck_helperc_LOADV32le\n"
+".type vgMemCheck_helperc_LOADV32le, @function\n"
+"vgMemCheck_helperc_LOADV32le:\n"
+" test $0x3, %al\n"
+" jnz 2f\n" /* jump if misaligned */
+" mov %eax, %edx\n"
+" shr $16, %edx\n"
+" mov primary_map(,%edx,4), %ecx\n"
+" movzwl %ax, %edx\n"
+" shr $2,%edx\n"
+" movzbl (%ecx,%edx,1),%edx\n"
+" cmp $0xaa, %dl\n" /* compare to VA_BITS8_DEFINED */
+" jne 1f\n" /* jump if not completely defined */
+" xor %eax, %eax\n" /* else return V_BITS32_DEFINED */
+" ret\n"
+".align 16\n"
+"1:\n"
+" cmp $0x55, %dl\n" /* compare to VA_BITS8_UNDEFINED */
+" jne 2f\n" /* jump if not completely undefined */
+" or $0xffffffff, %eax\n" /* else return 0xffffffff == V_BITS32_UNDEFINED */
+" ret\n"
+".align 16\n"
+"2:\n"
+" xor %ecx, %ecx\n" /* tail call mc_LOADVn_slow(a, 32, 0) */
+" mov $32, %edx\n"
+" jmp mc_LOADVn_slow\n"
+".size vgMemCheck_helperc_LOADV32le, .-vgMemCheck_helperc_LOADV32le\n"
+".previous\n"
+);
+
#else
-// Generic for all platforms except arm32-linux
+// Generic for all platforms except x86-linux and arm32-linux
VG_REGPARM(1) UWord MC_(helperc_LOADV32le) ( Addr a )
{
return mc_LOADV32(a, False);
@@ -4762,8 +4837,55 @@ __asm__( /* Derived from NCode template */
".previous\n"
);
+#elif ENABLE_ASSEMBLY_HELPERS && defined(PERF_FAST_LOADV) \
+ && defined(VGP_x86_linux)
+__asm__(
+".text\n"
+".align 16\n"
+".global vgMemCheck_helperc_LOADV16le\n"
+".type vgMemCheck_helperc_LOADV16le, @function\n"
+"vgMemCheck_helperc_LOADV16le:\n"
+" test $0x1,%al\n"
+" jnz 5f\n" /* jump if not aligned */
+" mov %eax,%edx\n"
+" shr $0x10,%edx\n"
+" mov primary_map(,%edx,4),%ecx\n"
+" movzwl %ax,%edx\n"
+" shr $0x2,%edx\n"
+" movzbl (%ecx,%edx,1),%edx\n" /* dl = VA bits for 32bit */
+" cmp $0xaa,%dl\n" /* compare to VA_BITS8_DEFINED */
+" jne 2f\n" /* jump if not all 32bits defined */
+"1:\n"
+" mov $0xffff0000,%eax\n" /* V_BITS16_DEFINED | top16safe */
+" ret\n"
+".align 16\n"
+"2:\n"
+" cmp $0x55,%dl\n" /* compare to VA_BITS8_UNDEFINED */
+" jne 4f\n" /* jump if not all 32bits undefined */
+"3:\n"
+" or $0xffffffff,%eax\n" /* V_BITS16_UNDEFINED | top16safe */
+" ret\n"
+".align 16\n"
+"4:\n"
+" mov %eax,%ecx\n"
+" and $0x2,%ecx\n"
+" shl $1,%ecx\n"
+" sar %cl,%dl\n"
+" and $0xf,%dl\n"
+" cmp $0xa,%dl\n"
+" je 1b\n" /* jump if all 16bits are defined */
+" cmp $0x5,%dl\n"
+" je 3b\n" /* jump if all 16bits are undefined */
+"5:\n"
+" xor %ecx,%ecx\n" /* tail call mc_LOADVn_slow(a, 16, 0) */
+" mov $16,%edx\n"
+" jmp mc_LOADVn_slow\n"
+".size vgMemCheck_helperc_LOADV16le, .-vgMemCheck_helperc_LOADV16le \n"
+".previous\n"
+);
+
#else
-// Generic for all platforms except arm32-linux
+// Generic for all platforms except x86 and arm32-linux
VG_REGPARM(1) UWord MC_(helperc_LOADV16le) ( Addr a )
{
return mc_LOADV16(a, False);
@@ -4904,8 +5026,53 @@ __asm__( /* Derived from NCode template */
".previous\n"
);
+/* Non-generic assembly for x86-linux */
+#elif ENABLE_ASSEMBLY_HELPERS && defined(PERF_FAST_LOADV) \
+ && defined(VGP_x86_linux)
+__asm__(
+".text\n"
+".align 16\n"
+".global vgMemCheck_helperc_LOADV8\n"
+".type vgMemCheck_helperc_LOADV8, @function\n"
+"vgMemCheck_helperc_LOADV8:\n"
+" mov %eax,%edx\n"
+" shr $0x10,%edx\n"
+" mov primary_map(,%edx,4),%ecx\n"
+" movzwl %ax,%edx\n"
+" shr $0x2,%edx\n"
+" movzbl (%ecx,%edx,1),%edx\n" /* dl = VA bits for 32bit */
+" cmp $0xaa,%dl\n" /* compara to VA_BITS8_DEFINED? */
+" jne 2f\n" /* jump if not defined */
+"1:\n"
+" mov $0xffffff00,%eax\n" /* V_BITS8_DEFINED | top24safe */
+" ret\n"
+".align 16\n"
+"2:\n"
+" cmp $0x55,%dl\n" /* compare to VA_BITS8_UNDEFINED */
+" jne 4f\n" /* jump if not all 32bits are undefined */
+"3:\n"
+" or $0xffffffff,%eax\n" /* V_BITS8_UNDEFINED | top24safe */
+" ret\n"
+".align 16\n"
+"4:\n"
+" mov %eax,%ecx\n"
+" and $0x3,%ecx\n"
+" shl $1,%ecx\n"
+" sar %cl,%dl\n"
+" and $0x3,%dl\n"
+" cmp $0x2,%dl\n"
+" je 1b\n" /* jump if all 8bits are defined */
+" cmp $0x1,%dl\n"
+" je 3b\n" /* jump if all 8bits are undefined */
+" xor %ecx,%ecx\n" /* tail call to mc_LOADVn_slow(a, 8, 0) */
+" mov $0x8,%edx\n"
+" jmp mc_LOADVn_slow\n"
+".size vgMemCheck_helperc_LOADV8, .-vgMemCheck_helperc_LOADV8\n"
+".previous\n"
+);
+
#else
-// Generic for all platforms except arm32-linux
+// Generic for all platforms except x86-linux and arm32-linux
VG_REGPARM(1)
UWord MC_(helperc_LOADV8) ( Addr a )
{
--
2.4.3
|
|
From: Julian S. <js...@ac...> - 2015-07-10 10:23:45
|
Yes, sounds like it's worth committing. One comment: For the places where we do a zero-extend load into a 32 bit register, followed by a narrower-than-32-bit comparison on the loaded data, eg > +" movzwl (%ecx,%edx,2),%edx\n" > +" cmp $0xaaaa,%dx\n" and > +" movzbl (%ecx,%edx,1),%edx\n" /* dl = VA bits for 32bit */ > +" cmp $0xaa,%dl\n" /* compara to VA_BITS8_DEFINED? */ .. is there any performance gain or change compared to doing a 32 bit comparison at the full width? cmpl $0xaaaa, %edx cmpl $0xaa, %edx I would prefer to use 32 bits throughout, since it avoids any possible microarchitectural bad effects -- due to sub-register reads, or length- changing prefixes (0x66) -- that might happen. J |
|
From: John R. <jr...@bi...> - 2015-07-15 12:19:45
|
>> .. is there any performance gain or change compared to doing a 32 bit
>> comparison at the full width?
>>
>> cmpl $0xaaaa, %edx
>> cmpl $0xaa, %edx
>>
>> I would prefer to use 32 bits throughout, since it avoids any possible
>> microarchitectural bad effects -- due to sub-register reads, or length-
>> changing prefixes (0x66) -- that might happen.
Sub-register writes usually take extra cycles (writing a narrow result
requires an extra Read and perhaps an extra cycle for the Merge), but
sub-register reads usually have no penalty at _execution_. The penalty
on many chips is for decoding the 0x66 prefix (16 bit length when 32-bit default.)
>
> I did a first try today.
> Observations:
> cmpl instruction has no 0x66 prefix, but is encoded with 32bit immediate
> value. But I guess instruction length is not really relevant for
> performance.
It might affect the instruction prefetcher: usually 8 bytes (aligned) per 1 cycle;
perhaps 16 bytes on recent high-end chips.
>
> 0x28008870 <+48>: 66 81 fa 55 55 cmp $0x5555,%dx
> vs.
> 0x28008870 <+48>: 81 fa 55 55 00 00 cmp $0x5555,%edx
>
> I think it does not make sense to replace the cmp in the parts of
> LOADV16le and LOADV8 that care about 16bit or 8bit chunks.
Most instruction prefixes (except perhaps 0x0f, and REX on amd64) add 1 cycle
to instruction decode on low-end chips (Celeron, perhaps Pentium) but often not
on high-end chips (Core, Core i3/5/7, Xeon). Sometimes the extra cycle(s)
can be hidden by execution of preceding slow opcodes, but instruction decode
often is a bottleneck for memcheck.
>
> Does it make sense to align jump targets to 16 bytes?
The purpose of alignment is to increase efficiency of the prefetcher
so that the decoder has as many complete instructions as it can process.
Always align the start of a loop. Prefer fall-through (no branch) over branch.
Do not align if the prefetcher already has the complete target; else align,
but beware fragmentation. Aligning to 8 bytes often is enough
if the first prefetch at the target holds two complete instructions.
Decoding and usage of execution units can matter, especially on low-end chips.
High-end chips have "too much hardware ;-)" that often compensates
for less-than-ideal compiling. Example: for vgMemCheck_helperc_LOADV64le
on amd64, then gcc-4.6.3 generated
movabs $0xfffffff000000007,%rax
test %rax,%rdi
jne slow
mov %rdi,%rdx
movzwl %di,%eax
shr $0x10,%rdx
shr $0x3,%rax
mov table(,%rdx,8),%rdx
movzwl (%rdx,%rax,2),%eax
cmp $0xaaaa,%rax
jne not_allV
xor %eax,%eax
which is ugly because most CPU have only one shifter (and sometimes
must be decoded first in a cycle) [and for other reasons, too.]
Hand code of
mov %rdi,%rdx
movzwl %di,%ecx
movabs $0xfffffff000000007,%rax
shr $0x10,%rdx
test %rax,%rdi; jne slow
shr $0x3,%ecx
mov table(,%rdx,8),%rdx
xor %eax,%eax
movzwl (%rdx,%rcx,2),%ecx
cmp $0xaaaa,%ecx; jne not_allV
is beautiful but no faster in Core i3/5/7 because dynamic scheduling
and a plethora of internal hardware compensate for the ugly code.
The hand code is 6% faster on an old AMD Phenom(tm) II X2 555.
>
> First performance measurements show no clear indication faster/slower here.
> About the perf test suite: It measures cpu time used in userspace. Why
> is it not deterministic?
Linux allocates physical page frames randomly, so pages might not map evenly
into the data cache. For instance, if the physical frame numbers of your
data pages all have the same remainder modulo 64, then the dcache effectively
is only 1/64 as big (or 1/32, or 1/16, etc., depending on associativity), so there
will be many more cache misses, and execution will be slower. I saw one case with
variance of 15% in otherwise-controlled execution. Matching within 2%
can be difficult.
|
|
From: Josef W. <Jos...@gm...> - 2015-07-15 13:55:43
|
Am 15.07.2015 um 14:19 schrieb John Reiser: > It might affect the instruction prefetcher: usually 8 bytes (aligned) per 1 cycle; > perhaps 16 bytes on recent high-end chips. Recent Intel x86 (Sandy-Bridge+) have a trace cache (L0) for decoded micro-ops, which allows the x86 decoder to go idle. Then performance much more depends on whether there is a associativity conflict in L0 such that the decoder cannot go idle... difficult to predict. > Always align the start of a loop. Prefer fall-through (no branch) over branch. Usually if there is no branch prediction available, static prediction expects fall-through for forward branches, but branch for backwards (predicting a loop). >> First performance measurements show no clear indication faster/slower here. >> About the perf test suite: It measures cpu time used in userspace. Why >> is it not deterministic? > > Linux allocates physical page frames randomly, so pages might not map evenly Another reason: Changing frequency (turbo boost) depending on other stuff running and current CPU temperature, and whether the system was in some sleep mode before. Josef |
|
From: Matthias S. <zz...@ge...> - 2015-07-14 21:59:41
|
Am 10.07.2015 um 12:23 schrieb Julian Seward: > > Yes, sounds like it's worth committing. One comment: > > For the places where we do a zero-extend load into a 32 bit > register, followed by a narrower-than-32-bit comparison on the loaded > data, eg > >> +" movzwl (%ecx,%edx,2),%edx\n" >> +" cmp $0xaaaa,%dx\n" > > and > >> +" movzbl (%ecx,%edx,1),%edx\n" /* dl = VA bits for 32bit */ >> +" cmp $0xaa,%dl\n" /* compara to VA_BITS8_DEFINED? */ > > .. is there any performance gain or change compared to doing a 32 bit > comparison at the full width? > > cmpl $0xaaaa, %edx > cmpl $0xaa, %edx > > I would prefer to use 32 bits throughout, since it avoids any possible > microarchitectural bad effects -- due to sub-register reads, or length- > changing prefixes (0x66) -- that might happen. I did a first try today. Observations: cmpl instruction has no 0x66 prefix, but is encoded with 32bit immediate value. But I guess instruction length is not really relevant for performance. 0x28008870 <+48>: 66 81 fa 55 55 cmp $0x5555,%dx vs. 0x28008870 <+48>: 81 fa 55 55 00 00 cmp $0x5555,%edx I think it does not make sense to replace the cmp in the parts of LOADV16le and LOADV8 that care about 16bit or 8bit chunks. Does it make sense to align jump targets to 16 bytes? First performance measurements show no clear indication faster/slower here. About the perf test suite: It measures cpu time used in userspace. Why is it not deterministic? Regards Matthias |