|
From: Julian S. <js...@ac...> - 2015-01-14 10:43:55
|
An initial implementation of partial inlining of instrumentation fast paths that would have been, until now, in C helper functions, is now available at

  svn://svn.valgrind.org/valgrind/branches/NCODE

This is the first runnable code. What it does (at the moment) is: instead of generating calls to MC_(helperc_LOADV64le) -- the most common helper for 64-bit Memcheck -- it generates in-line the abstract machine code template shown below, which is defined at mk_tmpl__LOADV64le_on_64. None of the other Memcheck helper functions have been template-ised yet.

Each template is divided into two sections: the hot part, which is really in-line, and the cold part, which is placed at the end of the translation in the hope of not trashing the instruction cache too much.

The template is translated into amd64 machine code by emit_AMD64NInstr, which will have to be reimplemented for each target. But it's pretty short, about 340 lines of code.

I believe the implementation is correct. Unfortunately it doesn't seem to give much of a speedup :-/. This is under investigation.

J

# a0 is an arg register, r0 is a result register, s0 is a scratch register

NCode [r0] = "LOADV64le_on_64" [a0] s0 {
  hot:
     0  tst.w  a0, #0xFFFFFFF000000007
     1  bnz    cold.4
     2  shr.w  s0, a0, #16
     3  ld.64  s0, [0x3847D880 + s0 << #3]
     4  and.w  r0, a0, #0xFFFF
     5  shr.w  r0, r0, #3
     6  ld.16  r0, [s0 + r0 << #1]
     7  cmp.w  r0, #0xAAAA
     8  bnz    cold.0
     9  imm.w  r0, #0x0
    10  nop
  cold:
     0  mov.w  s0, r0
     1  imm.w  r0, #0xFFFFFFFFFFFFFFFF
     2  cmp.w  s0, #0x5555
     3  bz     hot.10
     4  call   r0 = mc_LOADV64le_slow[3800AE40](a0)
     5  b      hot.10
}

An example instantiation on amd64 is as shown below. One of the first things to do is reduce the number of registers saved around the slow-path function call.
NCode-AMD64:LOADV64le_on_64 [%r10] <= [%r13] scratch [%r9]
  hot.0:  tst.w a0, #0xFFFFFFF000000007
      movabsq $0xFFFFFFF000000007,%r11
      andq %r13,%r11
  hot.1:  bnz cold.4
      jnz rel32
  hot.2:  shr.w s0, a0, #16
      movq %r13,%r9
      shrq $16,%r9
  hot.3:  ld.64 s0, [0x3847D880 + s0 << #3]
      movabsq $0x0,%r11
      movq 0x3847D880(%r11,%r9,8),%r9
  hot.4:  and.w r0, a0, #0xFFFF
      movq %r13,%r10
      andq $0xFFFF,%r10
  hot.5:  shr.w r0, r0, #3
      shrq $3,%r10
  hot.6:  ld.16 r0, [s0 + r0 << #1]
      movzwq 0x0(%r9,%r10,2),%r10
  hot.7:  cmp.w r0, #0xAAAA
      cmpq $0xAAAA,%r10
  hot.8:  bnz cold.0
      jnz rel32
  hot.9:  imm.w r0, #0x0
      movabsq $0x0,%r10
  hot.10: nop
  cold.0: mov.w s0, r0
      movq %r10,%r9
  cold.1: imm.w r0, #0xFFFFFFFFFFFFFFFF
      movabsq $0xFFFFFFFFFFFFFFFF,%r10
  cold.2: cmp.w s0, #0x5555
      cmpq $0x5555,%r9
  cold.3: bz hot.10
      jz rel32
  cold.4: call r0 = mc_LOADV64le_slow[3800AE40](a0)
      # set1: {%rbx,%rsi,%rdi,%r8,%r9,%r10,%r12,%r13,%r14,%r15,%xmm3,%xmm4,%xmm5,%xmm6,%xmm7,%xmm8,%xmm9,%xmm10,%xmm11,%xmm12}
      # set2: {%r9,%r10,%r13}
      # set3: {%rbx,%rbp,%r12,%r13,%r14,%r15}
      # set4: {%r10}
      # pres: {%rsi,%rdi,%r8,%r9,%xmm3,%xmm4,%xmm5,%xmm6,%xmm7,%xmm8,%xmm9,%xmm10,%xmm11,%xmm12}
      subq $0xE0,%rsp
      movq %rsi,(%rsp)
      movq %rdi,0x10(%rsp)
      movq %r8,0x20(%rsp)
      movq %r9,0x30(%rsp)
      movups %xmm3,0x40(%rsp)
      movups %xmm4,0x50(%rsp)
      movups %xmm5,0x60(%rsp)
      movups %xmm6,0x70(%rsp)
      movups %xmm7,0x80(%rsp)
      movups %xmm8,0x90(%rsp)
      movups %xmm9,0xA0(%rsp)
      movups %xmm10,0xB0(%rsp)
      movups %xmm11,0xC0(%rsp)
      movups %xmm12,0xD0(%rsp)
      movq %r13,%rdi
      movabsq $0x3800AE40,%r11
      call* %r11
      movq %rax,%r10
      movq (%rsp),%rsi
      movq 0x10(%rsp),%rdi
      movq 0x20(%rsp),%r8
      movq 0x30(%rsp),%r9
      movups 0x40(%rsp),%xmm3
      movups 0x50(%rsp),%xmm4
      movups 0x60(%rsp),%xmm5
      movups 0x70(%rsp),%xmm6
      movups 0x80(%rsp),%xmm7
      movups 0x90(%rsp),%xmm8
      movups 0xA0(%rsp),%xmm9
      movups 0xB0(%rsp),%xmm10
      movups 0xC0(%rsp),%xmm11
      movups 0xD0(%rsp),%xmm12
      addq $0xE0,%rsp
  cold.5: b hot.10
      jmp rel32
  reloc: (hot[273] bits[31..0]) refers-to (cold.4) bias -4 rshift 0
  reloc: (hot[273] bits[31..0]) refers-to (cold[302]) bias -4 rshift 0
  reloc: (hot[327] bits[31..0]) refers-to (cold.0) bias -4 rshift 0
  reloc: (hot[327] bits[31..0]) refers-to (cold[276]) bias -4 rshift 0
  reloc: (cold[298] bits[31..0]) refers-to (hot.10) bias -4 rshift 0
  reloc: (cold[298] bits[31..0]) refers-to (hot[337]) bias -4 rshift 0
  reloc: (cold[532] bits[31..0]) refers-to (hot.10) bias -4 rshift 0
  reloc: (cold[532] bits[31..0]) refers-to (hot[337]) bias -4 rshift 0
|