From: Julian S. <se...@so...> - 2019-01-25 08:20:02
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=50bb127b1df8d31812141aafa567d325d1fbc1b3

commit 50bb127b1df8d31812141aafa567d325d1fbc1b3
Author: Julian Seward <js...@ac...>
Date:   Fri Jan 25 09:14:56 2019 +0100

Bug 402781 - Redo the cache used to process indirect branch targets.

[This commit contains an implementation for all targets except amd64-solaris
and x86-solaris, which will be completed shortly.]

In the baseline simulator, jumps to guest code addresses that are not known
at JIT time have to be looked up in a guest->host mapping table.  That means:
indirect branches, indirect calls, and, most commonly, returns.  Since there
are huge numbers of these (often 10+ million/second), the mapping mechanism
needs to be extremely cheap.

Currently, this is implemented using a direct-mapped cache, VG_(tt_fast),
with 2^15 (guest_addr, host_addr) pairs.  This is queried in handwritten
assembly in VG_(disp_cp_xindir) in dispatch-<arch>-<os>.S.  If there is a
miss in the cache then we fall back out to C land and do a slow lookup using
VG_(search_transtab).

Given that the translation table(s) have expanded significantly in recent
years in order to keep pace with increasing application sizes, two bad
things have happened: (1) the cost of a miss in the fast cache has risen
significantly, and (2) the miss rate on the fast cache has also increased
significantly.  This means that large (~ one-million-basic-blocks-JITted)
applications that run for a long time end up spending a lot of time in
VG_(search_transtab).

The proposed fix is to increase the associativity of the fast cache from 1
(direct-mapped) to 4.  Simulations of various cache configurations, using
indirect-branch traces from a large application, show that this is the best
of the configurations tried.  In an extreme case with 5.7 billion indirect
branches:

* Increasing the associativity from 1 way to 4 ways, whilst keeping the
  overall cache size the same (32k guest/host pairs), reduces the miss rate
  by around a factor of 3, from 4.02% to 1.30%.

* Using a slightly better hash function than merely slicing off the bottom
  15 bits of the address reduces the miss rate further, from 1.30% to 0.53%.

Overall, the VG_(tt_fast) miss rate is almost unchanged on small workloads
but is reduced by a factor of up to almost 8 on large workloads.

By implementing each (4-entry) cache set using a move-to-front scheme in the
case of hits in ways 1, 2 or 3, the vast majority of hits can be made to
happen in way 0.  Hence the cost of having this extra associativity is
almost zero in the case of a hit.  The improved hash function costs an extra
2 ALU ops (a shift and an xor), but overall this seems performance neutral
to a win.
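For readers who prefer not to trace the per-architecture assembly in the
patch below, here is a rough C sketch of the scheme the message describes.
It is a sketch only: the set count (2^13, from 32k pairs divided across 4
ways), the interleaved guest/host field layout (suggested by the
FCS_g0..FCS_h3 offsets used in the patch), and the names tt_fast_sketch and
fast_lookup are assumptions standing in for the real VG_(tt_fast) and
VG_(lookupInFastCache).

#include <stdint.h>
#include <stddef.h>

typedef uintptr_t Addr;              /* a guest or host code address */

#define VG_TT_FAST_BITS 13           /* assumed: 2^13 sets * 4 ways = 32k pairs */
#define VG_TT_FAST_SETS (1u << VG_TT_FAST_BITS)
#define VG_TT_FAST_MASK (VG_TT_FAST_SETS - 1)

typedef struct {
   Addr guest0, host0;               /* way 0: most recently hit */
   Addr guest1, host1;
   Addr guest2, host2;
   Addr guest3, host3;               /* way 3: least recently hit */
} FastCacheSet;

static FastCacheSet tt_fast_sketch[VG_TT_FAST_SETS];

/* The improved hash: fold the bits above the index field back into it,
   rather than just slicing off the bottom bits.  (Targets with aligned
   instructions pre-shift the address right by 1 or 2 first, as the
   arm/arm64/mips/ppc assembly below does.) */
static inline size_t fast_hash(Addr guest)
{
   return ((guest >> VG_TT_FAST_BITS) ^ guest) & VG_TT_FAST_MASK;
}

/* Return the host address for 'guest', or 0 on a miss.  A hit in way 1,
   2 or 3 swaps the hit entry with the way above it, so repeatedly-hit
   entries migrate towards way 0. */
static inline Addr fast_lookup(Addr guest)
{
   FastCacheSet* s = &tt_fast_sketch[fast_hash(guest)];
   if (s->guest0 == guest)           /* way 0: no reordering needed */
      return s->host0;
   if (s->guest1 == guest) {         /* way 1: swap with way 0 */
      Addr g = s->guest0, h = s->host0;
      s->guest0 = guest; s->host0 = s->host1;
      s->guest1 = g;     s->host1 = h;
      return s->host0;
   }
   if (s->guest2 == guest) {         /* way 2: swap with way 1 */
      Addr g = s->guest1, h = s->host1;
      s->guest1 = guest; s->host1 = s->host2;
      s->guest2 = g;     s->host2 = h;
      return s->host1;
   }
   if (s->guest3 == guest) {         /* way 3: swap with way 2 */
      Addr g = s->guest2, h = s->host2;
      s->guest2 = guest; s->host2 = s->host3;
      s->guest3 = g;     s->host3 = h;
      return s->host2;
   }
   return 0;                         /* miss: caller falls back to the
                                        slow VG_(search_transtab) path */
}

Two things fall out of this structure.  A hit in way 0, which the swapping
scheme makes the overwhelmingly common case, costs one compare and one
indirect jump, the same as a hit in the old direct-mapped cache.  And the
xor-fold in fast_hash spreads out addresses that agree in their low bits,
which is the source of the further miss-rate reduction from 1.30% to 0.53%
reported above.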
Diff: --- coregrind/m_dispatch/dispatch-amd64-darwin.S | 111 +++++++++++++---- coregrind/m_dispatch/dispatch-amd64-linux.S | 99 ++++++++++++--- coregrind/m_dispatch/dispatch-arm-linux.S | 134 +++++++++++++++----- coregrind/m_dispatch/dispatch-arm64-linux.S | 132 +++++++++++++++----- coregrind/m_dispatch/dispatch-mips32-linux.S | 151 ++++++++++++++++------ coregrind/m_dispatch/dispatch-mips64-linux.S | 151 ++++++++++++++++------ coregrind/m_dispatch/dispatch-ppc32-linux.S | 148 +++++++++++++++++----- coregrind/m_dispatch/dispatch-ppc64be-linux.S | 165 ++++++++++++++++++------ coregrind/m_dispatch/dispatch-ppc64le-linux.S | 173 ++++++++++++++++++++------ coregrind/m_dispatch/dispatch-s390x-linux.S | 161 +++++++++++++++++------- coregrind/m_dispatch/dispatch-x86-darwin.S | 98 ++++++++++++--- coregrind/m_dispatch/dispatch-x86-linux.S | 96 +++++++++++--- coregrind/m_scheduler/scheduler.c | 75 +++++++---- coregrind/m_transtab.c | 104 ++++++++++------ coregrind/pub_core_transtab.h | 101 +++++++++++++-- coregrind/pub_core_transtab_asm.h | 111 ++++++++++++----- include/pub_tool_libcbase.h | 1 + 17 files changed, 1544 insertions(+), 467 deletions(-) diff --git a/coregrind/m_dispatch/dispatch-amd64-darwin.S b/coregrind/m_dispatch/dispatch-amd64-darwin.S index d560306..ccf2b91 100644 --- a/coregrind/m_dispatch/dispatch-amd64-darwin.S +++ b/coregrind/m_dispatch/dispatch-amd64-darwin.S @@ -201,33 +201,98 @@ VG_(disp_cp_chain_me_to_fastEP): jmp postamble /* ------ Indirect but boring jump ------ */ -.globl VG_(disp_cp_xindir) +.global VG_(disp_cp_xindir) VG_(disp_cp_xindir): - /* Where are we going? */ - movq OFFSET_amd64_RIP(%rbp), %rax + /* Where are we going? */ + movq OFFSET_amd64_RIP(%rbp), %rax // "guest" /* stats only */ - movabsq $VG_(stats__n_xindirs_32), %r10 - addl $1, (%r10) - - /* try a fast lookup in the translation cache */ - movabsq $VG_(tt_fast), %rcx - movq %rax, %rbx /* next guest addr */ - andq $VG_TT_FAST_MASK, %rbx /* entry# */ - shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */ - movq 0(%rcx,%rbx,1), %r10 /* .guest */ - movq 8(%rcx,%rbx,1), %r11 /* .host */ - cmpq %rax, %r10 - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%r11 - ud2 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: + movabsq $VG_(stats__n_xIndirs_32), %r8 + addl $1, (%r8) + + // LIVE: %rbp (guest state ptr), %rax (guest address to go to). + // We use 5 temporaries: + // %r9 (to point at the relevant FastCacheSet), + // %r10, %r11 and %r12 (scratch). + // %r8 (scratch address) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). 
*/ + + // Compute %r9 = VG_TT_FAST_HASH(guest) + movq %rax, %r9 // guest + shrq $VG_TT_FAST_BITS, %r9 // (guest >> VG_TT_FAST_BITS) + xorq %rax, %r9 // (guest >> VG_TT_FAST_BITS) ^ guest + andq $VG_TT_FAST_MASK, %r9 // setNo + + // Compute %r9 = &VG_(tt_fast)[%r9] + shlq $VG_FAST_CACHE_SET_BITS, %r9 // setNo * sizeof(FastCacheSet) + movabsq $VG_(tt_fast), %r10 // &VG_(tt_fast)[0] + leaq (%r10, %r9), %r9 // &VG_(tt_fast)[setNo] + + // LIVE: %rbp (guest state ptr), %rax (guest addr), %r9 (cache set) + // try way 0 + cmpq %rax, FCS_g0(%r9) // cmp against .guest0 + jnz 1f + // hit at way 0 + jmp *FCS_h0(%r9) // goto .host0 + ud2 + +1: // try way 1 + cmpq %rax, FCS_g1(%r9) // cmp against .guest1 + jnz 2f + // hit at way 1; swap upwards + /* stats only */ + movabsq $VG_(stats__n_xIndir_hits1_32), %r8 + addl $1, (%r8) + movq FCS_g0(%r9), %r10 // r10 = old .guest0 + movq FCS_h0(%r9), %r11 // r11 = old .host0 + movq FCS_h1(%r9), %r12 // r12 = old .host1 + movq %rax, FCS_g0(%r9) // new .guest0 = guest + movq %r12, FCS_h0(%r9) // new .host0 = old .host1 + movq %r10, FCS_g1(%r9) // new .guest1 = old .guest0 + movq %r11, FCS_h1(%r9) // new .host1 = old .host0 + jmp *%r12 // goto old .host1 a.k.a. new .host0 + ud2 + +2: // try way 2 + cmpq %rax, FCS_g2(%r9) // cmp against .guest2 + jnz 3f + // hit at way 2; swap upwards + /* stats only */ + movabsq $VG_(stats__n_xIndir_hits2_32), %r8 + addl $1, (%r8) + movq FCS_g1(%r9), %r10 + movq FCS_h1(%r9), %r11 + movq FCS_h2(%r9), %r12 + movq %rax, FCS_g1(%r9) + movq %r12, FCS_h1(%r9) + movq %r10, FCS_g2(%r9) + movq %r11, FCS_h2(%r9) + jmp *%r12 + ud2 + +3: // try way 3 + cmpq %rax, FCS_g3(%r9) // cmp against .guest3 + jnz 4f + // hit at way 3; swap upwards + /* stats only */ + movabsq $VG_(stats__n_xIndir_hits3_32), %r8 + addl $1, (%r8) + movq FCS_g2(%r9), %r10 + movq FCS_h2(%r9), %r11 + movq FCS_h3(%r9), %r12 + movq %rax, FCS_g2(%r9) + movq %r12, FCS_h2(%r9) + movq %r10, FCS_g3(%r9) + movq %r11, FCS_h3(%r9) + jmp *%r12 + ud2 + +4: // fast lookup failed /* stats only */ - movabsq $VG_(stats__n_xindir_misses_32), %r10 - addl $1, (%r10) + movabsq $VG_(stats__n_xIndir_misses_32), %r8 + addl $1, (%r8) movq $VG_TRC_INNER_FASTMISS, %rax movq $0, %rdx diff --git a/coregrind/m_dispatch/dispatch-amd64-linux.S b/coregrind/m_dispatch/dispatch-amd64-linux.S index 62717d3..007c495 100644 --- a/coregrind/m_dispatch/dispatch-amd64-linux.S +++ b/coregrind/m_dispatch/dispatch-amd64-linux.S @@ -205,28 +205,89 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - movq OFFSET_amd64_RIP(%rbp), %rax + movq OFFSET_amd64_RIP(%rbp), %rax // "guest" /* stats only */ - addl $1, VG_(stats__n_xindirs_32) - - /* try a fast lookup in the translation cache */ - movabsq $VG_(tt_fast), %rcx - movq %rax, %rbx /* next guest addr */ - andq $VG_TT_FAST_MASK, %rbx /* entry# */ - shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */ - movq 0(%rcx,%rbx,1), %r10 /* .guest */ - movq 8(%rcx,%rbx,1), %r11 /* .host */ - cmpq %rax, %r10 - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%r11 - ud2 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: + addl $1, VG_(stats__n_xIndirs_32) + + // LIVE: %rbp (guest state ptr), %rax (guest address to go to). + // We use 4 temporaries: + // %r9 (to point at the relevant FastCacheSet), + // %r10, %r11 and %r12 (scratch). + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). 
*/ + + // Compute %r9 = VG_TT_FAST_HASH(guest) + movq %rax, %r9 // guest + shrq $VG_TT_FAST_BITS, %r9 // (guest >> VG_TT_FAST_BITS) + xorq %rax, %r9 // (guest >> VG_TT_FAST_BITS) ^ guest + andq $VG_TT_FAST_MASK, %r9 // setNo + + // Compute %r9 = &VG_(tt_fast)[%r9] + shlq $VG_FAST_CACHE_SET_BITS, %r9 // setNo * sizeof(FastCacheSet) + movabsq $VG_(tt_fast), %r10 // &VG_(tt_fast)[0] + leaq (%r10, %r9), %r9 // &VG_(tt_fast)[setNo] + + // LIVE: %rbp (guest state ptr), %rax (guest addr), %r9 (cache set) + // try way 0 + cmpq %rax, FCS_g0(%r9) // cmp against .guest0 + jnz 1f + // hit at way 0 + jmp *FCS_h0(%r9) // goto .host0 + ud2 + +1: // try way 1 + cmpq %rax, FCS_g1(%r9) // cmp against .guest1 + jnz 2f + // hit at way 1; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits1_32) + movq FCS_g0(%r9), %r10 // r10 = old .guest0 + movq FCS_h0(%r9), %r11 // r11 = old .host0 + movq FCS_h1(%r9), %r12 // r12 = old .host1 + movq %rax, FCS_g0(%r9) // new .guest0 = guest + movq %r12, FCS_h0(%r9) // new .host0 = old .host1 + movq %r10, FCS_g1(%r9) // new .guest1 = old .guest0 + movq %r11, FCS_h1(%r9) // new .host1 = old .host0 + jmp *%r12 // goto old .host1 a.k.a. new .host0 + ud2 + +2: // try way 2 + cmpq %rax, FCS_g2(%r9) // cmp against .guest2 + jnz 3f + // hit at way 2; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits2_32) + movq FCS_g1(%r9), %r10 + movq FCS_h1(%r9), %r11 + movq FCS_h2(%r9), %r12 + movq %rax, FCS_g1(%r9) + movq %r12, FCS_h1(%r9) + movq %r10, FCS_g2(%r9) + movq %r11, FCS_h2(%r9) + jmp *%r12 + ud2 + +3: // try way 3 + cmpq %rax, FCS_g3(%r9) // cmp against .guest3 + jnz 4f + // hit at way 3; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits3_32) + movq FCS_g2(%r9), %r10 + movq FCS_h2(%r9), %r11 + movq FCS_h3(%r9), %r12 + movq %rax, FCS_g2(%r9) + movq %r12, FCS_h2(%r9) + movq %r10, FCS_g3(%r9) + movq %r11, FCS_h3(%r9) + jmp *%r12 + ud2 + +4: // fast lookup failed /* stats only */ - addl $1, VG_(stats__n_xindir_misses_32) + addl $1, VG_(stats__n_xIndir_misses_32) movq $VG_TRC_INNER_FASTMISS, %rax movq $0, %rdx diff --git a/coregrind/m_dispatch/dispatch-arm-linux.S b/coregrind/m_dispatch/dispatch-arm-linux.S index 3731c2e..b61818c 100644 --- a/coregrind/m_dispatch/dispatch-arm-linux.S +++ b/coregrind/m_dispatch/dispatch-arm-linux.S @@ -154,36 +154,114 @@ VG_(disp_cp_xindir): ldr r0, [r8, #OFFSET_arm_R15T] /* stats only */ - movw r1, #:lower16:vgPlain_stats__n_xindirs_32 - movt r1, #:upper16:vgPlain_stats__n_xindirs_32 - ldr r2, [r1, #0] - add r2, r2, #1 - str r2, [r1, #0] + movw r4, #:lower16:VG_(stats__n_xIndirs_32) + movt r4, #:upper16:VG_(stats__n_xIndirs_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] + + // LIVE: r8 (guest state ptr), r0 (guest address to go to). + // We use 6 temporaries: + // r6 (to point at the relevant FastCacheSet), + // r1, r2, r3 (scratch, for swapping entries within a set) + // r4, r5 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). 
*/ + + // Compute r6 = VG_TT_FAST_HASH(guest) + lsr r6, r0, #1 // g1 = guest >> 1 + eor r6, r6, r6, LSR #VG_TT_FAST_BITS // (g1 >> VG_TT_FAST_BITS) ^ g1 + ubfx r6, r6, #0, #VG_TT_FAST_BITS // setNo - /* try a fast lookup in the translation cache */ - // r0 = next guest, r1,r2,r3,r4 scratch - movw r1, #VG_TT_FAST_MASK // r1 = VG_TT_FAST_MASK + // Compute r6 = &VG_(tt_fast)[r6] movw r4, #:lower16:VG_(tt_fast) - - and r2, r1, r0, LSR #1 // r2 = entry # - movt r4, #:upper16:VG_(tt_fast) // r4 = &VG_(tt_fast) - - add r1, r4, r2, LSL #3 // r1 = &tt_fast[entry#] - - ldrd r4, r5, [r1, #0] // r4 = .guest, r5 = .host - - cmp r4, r0 - - // jump to host if lookup succeeded - bxeq r5 - - /* otherwise the fast lookup failed */ - /* RM ME -- stats only */ - movw r1, #:lower16:vgPlain_stats__n_xindir_misses_32 - movt r1, #:upper16:vgPlain_stats__n_xindir_misses_32 - ldr r2, [r1, #0] - add r2, r2, #1 - str r2, [r1, #0] + movt r4, #:upper16:VG_(tt_fast) + add r6, r4, r6, LSL #VG_FAST_CACHE_SET_BITS // &VG_(tt_fast)[setNo] + + // LIVE: r8 (guest state ptr), r0 (guest addr), r6 (cache set) + // try way 0 + ldr r4, [r6, #FCS_g0] // .guest0 + ldr r5, [r6, #FCS_h0] // .host0 + cmp r4, r0 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + bx r5 + /*NOTREACHED*/ + +1: // try way 1 + ldr r4, [r6, #FCS_g1] + cmp r4, r0 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + ldr r1, [r6, #FCS_g0] // r1 = old .guest0 + ldr r2, [r6, #FCS_h0] // r2 = old .host0 + ldr r3, [r6, #FCS_h1] // r3 = old .host1 + str r0, [r6, #FCS_g0] // new .guest0 = guest + str r3, [r6, #FCS_h0] // new .host0 = old .host1 + str r1, [r6, #FCS_g1] // new .guest1 = old .guest0 + str r2, [r6, #FCS_h1] // new .host1 = old .host0 + // stats only + movw r4, #:lower16:VG_(stats__n_xIndir_hits1_32) + movt r4, #:upper16:VG_(stats__n_xIndir_hits1_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] + // goto old .host1 a.k.a. new .host0 + bx r3 + /*NOTREACHED*/ + +2: // try way 2 + ldr r4, [r6, #FCS_g2] + cmp r4, r0 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + ldr r1, [r6, #FCS_g1] + ldr r2, [r6, #FCS_h1] + ldr r3, [r6, #FCS_h2] + str r0, [r6, #FCS_g1] + str r3, [r6, #FCS_h1] + str r1, [r6, #FCS_g2] + str r2, [r6, #FCS_h2] + // stats only + movw r4, #:lower16:VG_(stats__n_xIndir_hits2_32) + movt r4, #:upper16:VG_(stats__n_xIndir_hits2_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] + // goto old .host2 a.k.a. new .host1 + bx r3 + /*NOTREACHED*/ + +3: // try way 3 + ldr r4, [r6, #FCS_g3] + cmp r4, r0 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + ldr r1, [r6, #FCS_g2] + ldr r2, [r6, #FCS_h2] + ldr r3, [r6, #FCS_h3] + str r0, [r6, #FCS_g2] + str r3, [r6, #FCS_h2] + str r1, [r6, #FCS_g3] + str r2, [r6, #FCS_h3] + // stats only + movw r4, #:lower16:VG_(stats__n_xIndir_hits3_32) + movt r4, #:upper16:VG_(stats__n_xIndir_hits3_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] + // goto old .host3 a.k.a. 
new .host2 + bx r3 + /*NOTREACHED*/ + +4: // fast lookup failed + movw r4, #:lower16:VG_(stats__n_xIndir_misses_32) + movt r4, #:upper16:VG_(stats__n_xIndir_misses_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] mov r1, #VG_TRC_INNER_FASTMISS mov r2, #0 diff --git a/coregrind/m_dispatch/dispatch-arm64-linux.S b/coregrind/m_dispatch/dispatch-arm64-linux.S index ee289fa..554fa9b 100644 --- a/coregrind/m_dispatch/dispatch-arm64-linux.S +++ b/coregrind/m_dispatch/dispatch-arm64-linux.S @@ -173,42 +173,118 @@ VG_(disp_cp_chain_me_to_fastEP): /* ------ Indirect but boring jump ------ */ .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): - /* Where are we going? */ + // Where are we going? ldr x0, [x21, #OFFSET_arm64_PC] - /* stats only */ - adrp x1, VG_(stats__n_xindirs_32) - add x1, x1, :lo12:VG_(stats__n_xindirs_32) - ldr w2, [x1, #0] - add w2, w2, #1 - str w2, [x1, #0] - - /* try a fast lookup in the translation cache */ - // x0 = next guest, x1,x2,x3,x4 scratch - mov x1, #VG_TT_FAST_MASK // x1 = VG_TT_FAST_MASK - and x2, x1, x0, LSR #2 // x2 = entry # = (x1 & (x0 >> 2)) - + // stats only + adrp x4, VG_(stats__n_xIndirs_32) + add x4, x4, :lo12:VG_(stats__n_xIndirs_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] + + // LIVE: x21 (guest state ptr), x0 (guest address to go to). + // We use 6 temporaries: + // x6 (to point at the relevant FastCacheSet), + // x1, x2, x3 (scratch, for swapping entries within a set) + // x4, x5 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute x6 = VG_TT_FAST_HASH(guest) + lsr x6, x0, #2 // g2 = guest >> 2 + eor x6, x6, x6, LSR #VG_TT_FAST_BITS // (g2 >> VG_TT_FAST_BITS) ^ g2 + mov x4, #VG_TT_FAST_MASK // VG_TT_FAST_MASK + and x6, x6, x4 // setNo + + // Compute x6 = &VG_(tt_fast)[x6] adrp x4, VG_(tt_fast) - add x4, x4, :lo12:VG_(tt_fast) // x4 = &VG_(tt_fast) - - add x1, x4, x2, LSL #4 // r1 = &tt_fast[entry#] + add x4, x4, :lo12:VG_(tt_fast) // &VG_(tt_fast)[0] + add x6, x4, x6, LSL #VG_FAST_CACHE_SET_BITS // &VG_(tt_fast)[setNo] + + // LIVE: x21 (guest state ptr), x0 (guest addr), x6 (cache set) + // try way 0 + ldp x4, x5, [x6, #FCS_g0] // x4 = .guest0, x5 = .host0 + cmp x4, x0 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + br x5 + /*NOTREACHED*/ - ldp x4, x5, [x1, #0] // x4 = .guest, x5 = .host +1: // try way 1 + ldr x4, [x6, #FCS_g1] + cmp x4, x0 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + ldr x1, [x6, #FCS_g0] // x1 = old .guest0 + ldr x2, [x6, #FCS_h0] // x2 = old .host0 + ldr x3, [x6, #FCS_h1] // x3 = old .host1 + str x0, [x6, #FCS_g0] // new .guest0 = guest + str x3, [x6, #FCS_h0] // new .host0 = old .host1 + str x1, [x6, #FCS_g1] // new .guest1 = old .guest0 + str x2, [x6, #FCS_h1] // new .host1 = old .host0 + // stats only + adrp x4, VG_(stats__n_xIndir_hits1_32) + add x4, x4, :lo12:VG_(stats__n_xIndir_hits1_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] + // goto old .host1 a.k.a. 
new .host0 + br x3 + /*NOTREACHED*/ - cmp x4, x0 +2: // try way 2 + ldr x4, [x6, #FCS_g2] + cmp x4, x0 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + ldr x1, [x6, #FCS_g1] + ldr x2, [x6, #FCS_h1] + ldr x3, [x6, #FCS_h2] + str x0, [x6, #FCS_g1] + str x3, [x6, #FCS_h1] + str x1, [x6, #FCS_g2] + str x2, [x6, #FCS_h2] + // stats only + adrp x4, VG_(stats__n_xIndir_hits2_32) + add x4, x4, :lo12:VG_(stats__n_xIndir_hits2_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] + // goto old .host2 a.k.a. new .host1 + br x3 + /*NOTREACHED*/ - // jump to host if lookup succeeded - bne fast_lookup_failed - br x5 +3: // try way 3 + ldr x4, [x6, #FCS_g3] + cmp x4, x0 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + ldr x1, [x6, #FCS_g2] + ldr x2, [x6, #FCS_h2] + ldr x3, [x6, #FCS_h3] + str x0, [x6, #FCS_g2] + str x3, [x6, #FCS_h2] + str x1, [x6, #FCS_g3] + str x2, [x6, #FCS_h3] + // stats only + adrp x4, VG_(stats__n_xIndir_hits3_32) + add x4, x4, :lo12:VG_(stats__n_xIndir_hits3_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] + // goto old .host3 a.k.a. new .host2 + br x3 /*NOTREACHED*/ -fast_lookup_failed: - /* RM ME -- stats only */ - adrp x1, VG_(stats__n_xindir_misses_32) - add x1, x1, :lo12:VG_(stats__n_xindir_misses_32) - ldr w2, [x1, #0] - add w2, w2, #1 - str w2, [x1, #0] +4: // fast lookup failed + adrp x4, VG_(stats__n_xIndir_misses_32) + add x4, x4, :lo12:VG_(stats__n_xIndir_misses_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] mov x1, #VG_TRC_INNER_FASTMISS mov x2, #0 diff --git a/coregrind/m_dispatch/dispatch-mips32-linux.S b/coregrind/m_dispatch/dispatch-mips32-linux.S index 9918403..fdb1e29 100644 --- a/coregrind/m_dispatch/dispatch-mips32-linux.S +++ b/coregrind/m_dispatch/dispatch-mips32-linux.S @@ -175,47 +175,116 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - lw $11, OFFSET_mips32_PC($23) - - lw $13, vgPlain_stats__n_xindirs_32 - addiu $13, $13, 0x1 - sw $13, vgPlain_stats__n_xindirs_32 - - /* try a fast lookup in the translation cache */ - /* t1 = VG_TT_FAST_HASH(addr) * sizeof(ULong*) - = (t8 >> 2 & VG_TT_FAST_MASK) << 3 */ - - move $14, $11 - li $12, VG_TT_FAST_MASK - srl $14, $14, 2 - and $14, $14, $12 - sll $14, $14, 3 - - /* t2 = (addr of VG_(tt_fast)) + t1 */ - la $13, VG_(tt_fast) - addu $13, $13, $14 - - lw $12, 0($13) /* t3 = VG_(tt_fast)[hash] :: ULong* */ - addiu $13, $13, 4 - lw $25, 0($13) /* little-endian, so comparing 1st 32bit word */ - nop - -check: - bne $12, $11, fast_lookup_failed - /* run the translation */ - jr $25 - .long 0x0 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: - /* %PC is up to date */ - /* back out decrement of the dispatch counter */ - /* hold dispatch_ctr in t0 (r8) */ - lw $13, vgPlain_stats__n_xindirs_32 - addiu $13, $13, 0x1 - sw $13, vgPlain_stats__n_xindirs_32 - li $2, VG_TRC_INNER_FASTMISS - li $3, 0 - b postamble + lw $10, OFFSET_mips32_PC($23) + + /* stats only */ + lw $15, VG_(stats__n_xIndirs_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndirs_32) + + // LIVE: r23 (guest state ptr), r10 (guest address to go to). + // We use 6 temporaries: + // r16 (to point at the relevant FastCacheSet), + // r11, r12, r13 (scratch, for swapping entries within a set) + // r14, r15 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). 
*/ + + // Compute r16 = VG_TT_FAST_HASH(guest) + srl $16, $10, 2 // g2 = guest >> 2 + srl $15, $10, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor $16, $16, $15 // (g2 >> VG_TT_FAST_BITS) ^ g2 + li $15, VG_TT_FAST_MASK + and $16, $16, $15 // setNo + + // Compute r16 = &VG_(tt_fast)[r16] + la $15, VG_(tt_fast) + sll $16, $16, VG_FAST_CACHE_SET_BITS + addu $16, $16, $15 + + // LIVE: r23 (guest state ptr), r10 (guest addr), r16 (cache set) + // try way 0 + lw $14, FCS_g0($16) // .guest0 + lw $15, FCS_h0($16) // .host0 + bne $14, $10, 1f // cmp against .guest0 + // hit at way 0 + // goto .host0 + jr $15 + /*NOTREACHED*/ + .long 0x0 + +1: // try way 1 + lw $14, FCS_g1($16) + bne $14, $10, 2f // cmp against .guest1 + // hit at way 1; swap upwards + lw $11, FCS_g0($16) // $11 = old .guest0 + lw $12, FCS_h0($16) // $12 = old .host0 + lw $13, FCS_h1($16) // $13 = old .host1 + sw $10, FCS_g0($16) // new .guest0 = guest + sw $13, FCS_h0($16) // new .host0 = old .host1 + sw $11, FCS_g1($16) // new .guest1 = old .guest0 + sw $12, FCS_h1($16) // new .host1 = old .host0 + // stats only + lw $15, VG_(stats__n_xIndir_hits1_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits1_32) + // goto old .host1 a.k.a. new .host0 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +2: // try way 2 + lw $14, FCS_g2($16) + bne $14, $10, 3f // cmp against .guest2 + // hit at way 2; swap upwards + lw $11, FCS_g1($16) + lw $12, FCS_h1($16) + lw $13, FCS_h2($16) + sw $10, FCS_g1($16) + sw $13, FCS_h1($16) + sw $11, FCS_g2($16) + sw $12, FCS_h2($16) + // stats only + lw $15, VG_(stats__n_xIndir_hits2_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits2_32) + // goto old .host2 a.k.a. new .host1 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +3: // try way 3 + lw $14, FCS_g3($16) + bne $14, $10, 4f // cmp against .guest3 + // hit at way 3; swap upwards + lw $11, FCS_g2($16) + lw $12, FCS_h2($16) + lw $13, FCS_h3($16) + sw $10, FCS_g2($16) + sw $13, FCS_h2($16) + sw $11, FCS_g3($16) + sw $12, FCS_h3($16) + // stats only + lw $15, VG_(stats__n_xIndir_hits3_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits3_32) + // goto old .host3 a.k.a. new .host2 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +4: // fast lookup failed: + /* stats only */ + lw $15, VG_(stats__n_xIndir_misses_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_misses_32) + + li $2, VG_TRC_INNER_FASTMISS + li $3, 0 + b postamble + /*NOTREACHED*/ + .long 0x0 /* ------ Assisted jump ------ */ .global VG_(disp_cp_xassisted) diff --git a/coregrind/m_dispatch/dispatch-mips64-linux.S b/coregrind/m_dispatch/dispatch-mips64-linux.S index 4a2b1b7..5d1efd6 100644 --- a/coregrind/m_dispatch/dispatch-mips64-linux.S +++ b/coregrind/m_dispatch/dispatch-mips64-linux.S @@ -182,47 +182,116 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? 
*/ - ld $11, OFFSET_mips64_PC($23) - - lw $13, vgPlain_stats__n_xindirs_32 - addiu $13, $13, 0x1 - sw $13, vgPlain_stats__n_xindirs_32 - - /* try a fast lookup in the translation cache */ - /* t1 = VG_TT_FAST_HASH(addr) * sizeof(ULong*) - = (t8 >> 2 & VG_TT_FAST_MASK) << 3 */ - - move $14, $11 - li $12, VG_TT_FAST_MASK - srl $14, $14, 2 - and $14, $14, $12 - sll $14, $14, 3 - - /* t2 = (addr of VG_(tt_fast)) + t1 */ - dla $13, VG_(tt_fast) - daddu $13, $13, $14 - - ld $12, 0($13) /* t3 = VG_(tt_fast)[hash] :: ULong* */ - daddiu $13, $13, 8 - ld $25, 0($13) /* little-endian, so comparing 1st 32bit word */ - nop - -check: - bne $12, $11, fast_lookup_failed - /* run the translation */ - jr $25 - .long 0x0 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: - /* %PC is up to date */ - /* back out decrement of the dispatch counter */ - /* hold dispatch_ctr in t0 (r8) */ - lw $13, vgPlain_stats__n_xindirs_32 - addiu $13, $13, 0x1 - sw $13, vgPlain_stats__n_xindirs_32 - li $2, VG_TRC_INNER_FASTMISS - li $3, 0 - b postamble + ld $10, OFFSET_mips64_PC($23) + + /* stats only */ + lw $15, VG_(stats__n_xIndirs_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndirs_32) + + // LIVE: r23 (guest state ptr), r10 (guest address to go to). + // We use 6 temporaries: + // r16 (to point at the relevant FastCacheSet), + // r11, r12, r13 (scratch, for swapping entries within a set) + // r14, r15 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute r16 = VG_TT_FAST_HASH(guest) + dsrl $16, $10, 2 // g2 = guest >> 2 + dsrl $15, $10, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor $16, $16, $15 // (g2 >> VG_TT_FAST_BITS) ^ g2 + li $15, VG_TT_FAST_MASK + and $16, $16, $15 // setNo + + // Compute r16 = &VG_(tt_fast)[r16] + dla $15, VG_(tt_fast) + dsll $16, $16, VG_FAST_CACHE_SET_BITS + daddu $16, $16, $15 + + // LIVE: r23 (guest state ptr), r10 (guest addr), r16 (cache set) + // try way 0 + ld $14, FCS_g0($16) // .guest0 + ld $15, FCS_h0($16) // .host0 + bne $14, $10, 1f // cmp against .guest0 + // hit at way 0 + // goto .host0 + jr $15 + /*NOTREACHED*/ + .long 0x0 + +1: // try way 1 + ld $14, FCS_g1($16) + bne $14, $10, 2f // cmp against .guest1 + // hit at way 1; swap upwards + ld $11, FCS_g0($16) // $11 = old .guest0 + ld $12, FCS_h0($16) // $12 = old .host0 + ld $13, FCS_h1($16) // $13 = old .host1 + sd $10, FCS_g0($16) // new .guest0 = guest + sd $13, FCS_h0($16) // new .host0 = old .host1 + sd $11, FCS_g1($16) // new .guest1 = old .guest0 + sd $12, FCS_h1($16) // new .host1 = old .host0 + // stats only + lw $15, VG_(stats__n_xIndir_hits1_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits1_32) + // goto old .host1 a.k.a. new .host0 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +2: // try way 2 + ld $14, FCS_g2($16) + bne $14, $10, 3f // cmp against .guest2 + // hit at way 2; swap upwards + ld $11, FCS_g1($16) + ld $12, FCS_h1($16) + ld $13, FCS_h2($16) + sd $10, FCS_g1($16) + sd $13, FCS_h1($16) + sd $11, FCS_g2($16) + sd $12, FCS_h2($16) + // stats only + lw $15, VG_(stats__n_xIndir_hits2_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits2_32) + // goto old .host2 a.k.a. 
new .host1 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +3: // try way 3 + ld $14, FCS_g3($16) + bne $14, $10, 4f // cmp against .guest3 + // hit at way 3; swap upwards + ld $11, FCS_g2($16) + ld $12, FCS_h2($16) + ld $13, FCS_h3($16) + sd $10, FCS_g2($16) + sd $13, FCS_h2($16) + sd $11, FCS_g3($16) + sd $12, FCS_h3($16) + // stats only + lw $15, VG_(stats__n_xIndir_hits3_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits3_32) + // goto old .host3 a.k.a. new .host2 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +4: // fast lookup failed: + /* stats only */ + lw $15, VG_(stats__n_xIndir_misses_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_misses_32) + + li $2, VG_TRC_INNER_FASTMISS + li $3, 0 + b postamble + /*NOTREACHED*/ + .long 0x0 /* ------ Assisted jump ------ */ .global VG_(disp_cp_xassisted) diff --git a/coregrind/m_dispatch/dispatch-ppc32-linux.S b/coregrind/m_dispatch/dispatch-ppc32-linux.S index 432306b..d3ff2d1 100644 --- a/coregrind/m_dispatch/dispatch-ppc32-linux.S +++ b/coregrind/m_dispatch/dispatch-ppc32-linux.S @@ -437,44 +437,128 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - lwz 3,OFFSET_ppc32_CIA(31) + lwz 20, OFFSET_ppc32_CIA(31) /* stats only */ - lis 5,VG_(stats__n_xindirs_32)@ha - addi 5,5,VG_(stats__n_xindirs_32)@l - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) + lis 24, VG_(stats__n_xIndirs_32)@ha + addi 24, 24, VG_(stats__n_xIndirs_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + + // LIVE: r31 (guest state ptr), r20 (guest address to go to). + // We use 6 temporaries: + // r26 (to point at the relevant FastCacheSet), + // r21, r22, r23 (scratch, for swapping entries within a set) + // r24, r25 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute r26 = VG_TT_FAST_HASH(guest) + srwi 26, 20, 2 // g2 = guest >> 2 + srwi 25, 20, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor 26, 26, 25 // (g2 >> VG_TT_FAST_BITS) ^ g2 + andi. 26, 26, VG_TT_FAST_MASK // setNo - /* r5 = &VG_(tt_fast) */ - lis 5,VG_(tt_fast)@ha - addi 5,5,VG_(tt_fast)@l /* & VG_(tt_fast) */ - - /* try a fast lookup in the translation cache */ - /* r4 = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry) - = ((r3 >>u 2) & VG_TT_FAST_MASK) << 3 */ - rlwinm 4,3,1, 29-VG_TT_FAST_BITS, 28 /* entry# * 8 */ - add 5,5,4 /* & VG_(tt_fast)[entry#] */ - lwz 6,0(5) /* .guest */ - lwz 7,4(5) /* .host */ - cmpw 3,6 - bne fast_lookup_failed - - /* Found a match. Jump to .host. 
*/ - mtctr 7 + // Compute r6 = &VG_(tt_fast)[r6] + lis 25, VG_(tt_fast)@ha + addi 25, 25, VG_(tt_fast)@l + slwi 26, 26, VG_FAST_CACHE_SET_BITS + add 26, 26, 25 + + // LIVE: r31 (guest state ptr), r20 (guest addr), r26 (cache set) + // try way 0 + lwz 24, FCS_g0(26) // .guest0 + lwz 25, FCS_h0(26) // .host0 + cmpw 24, 20 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + mtctr 25 bctr + /*NOTREACHED*/ + +1: // try way 1 + lwz 24, FCS_g1(26) + cmpw 24, 20 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + lwz 21, FCS_g0(26) // 21 = old .guest0 + lwz 22, FCS_h0(26) // 22 = old .host0 + lwz 23, FCS_h1(26) // 23 = old .host1 + stw 20, FCS_g0(26) // new .guest0 = guest + stw 23, FCS_h0(26) // new .host0 = old .host1 + stw 21, FCS_g1(26) // new .guest1 = old .guest0 + stw 22, FCS_h1(26) // new .host1 = old .host0 + // stats only + lis 24, VG_(stats__n_xIndir_hits1_32)@ha + addi 24, 24, VG_(stats__n_xIndir_hits1_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host1 a.k.a. new .host0 + mtctr 23 + bctr + /*NOTREACHED*/ + +2: // try way 2 + lwz 24, FCS_g2(26) + cmpw 24, 20 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + lwz 21, FCS_g1(26) + lwz 22, FCS_h1(26) + lwz 23, FCS_h2(26) + stw 20, FCS_g1(26) + stw 23, FCS_h1(26) + stw 21, FCS_g2(26) + stw 22, FCS_h2(26) + // stats only + lis 24, VG_(stats__n_xIndir_hits2_32)@ha + addi 24, 24, VG_(stats__n_xIndir_hits2_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host2 a.k.a. new .host1 + mtctr 23 + bctr + /*NOTREACHED*/ + +3: // try way 3 + lwz 24, FCS_g3(26) + cmpw 24, 20 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + lwz 21, FCS_g2(26) + lwz 22, FCS_h2(26) + lwz 23, FCS_h3(26) + stw 20, FCS_g2(26) + stw 23, FCS_h2(26) + stw 21, FCS_g3(26) + stw 22, FCS_h3(26) + // stats only + lis 24, VG_(stats__n_xIndir_hits3_32)@ha + addi 24, 24, VG_(stats__n_xIndir_hits3_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host3 a.k.a. 
new .host2 + mtctr 23 + bctr + /*NOTREACHED*/ -fast_lookup_failed: +4: // fast lookup failed: /* stats only */ - lis 5,VG_(stats__n_xindir_misses_32)@ha - addi 5,5,VG_(stats__n_xindir_misses_32)@l - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - li 6,VG_TRC_INNER_FASTMISS - li 7,0 - b postamble + lis 24, VG_(stats__n_xIndir_misses_32)@ha + addi 24, 24, VG_(stats__n_xIndir_misses_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + + li 6, VG_TRC_INNER_FASTMISS + li 7, 0 + b postamble /*NOTREACHED*/ /* ------ Assisted jump ------ */ diff --git a/coregrind/m_dispatch/dispatch-ppc64be-linux.S b/coregrind/m_dispatch/dispatch-ppc64be-linux.S index 91bd3b2..c5592d4 100644 --- a/coregrind/m_dispatch/dispatch-ppc64be-linux.S +++ b/coregrind/m_dispatch/dispatch-ppc64be-linux.S @@ -45,14 +45,27 @@ .type vgPlain_tt_fast, @object */ .section ".toc","aw" + .tocent__vgPlain_tt_fast: .tc vgPlain_tt_fast[TC],vgPlain_tt_fast -.tocent__vgPlain_stats__n_xindirs_32: - .tc vgPlain_stats__n_xindirs_32[TC],vgPlain_stats__n_xindirs_32 -.tocent__vgPlain_stats__n_xindir_misses_32: - .tc vgPlain_stats__n_xindir_misses_32[TC],vgPlain_stats__n_xindir_misses_32 + +.tocent__vgPlain_stats__n_xIndirs_32: + .tc vgPlain_stats__n_xIndirs_32[TC], vgPlain_stats__n_xIndirs_32 + +.tocent__vgPlain_stats__n_xIndir_hits1_32: + .tc vgPlain_stats__n_xIndir_hits1_32[TC], vgPlain_stats__n_xIndir_hits1_32 + +.tocent__vgPlain_stats__n_xIndir_hits2_32: + .tc vgPlain_stats__n_xIndir_hits2_32[TC], vgPlain_stats__n_xIndir_hits2_32 + +.tocent__vgPlain_stats__n_xIndir_hits3_32: + .tc vgPlain_stats__n_xIndir_hits3_32[TC], vgPlain_stats__n_xIndir_hits3_32 + +.tocent__vgPlain_stats__n_xIndir_misses_32: + .tc vgPlain_stats__n_xIndir_misses_32[TC], vgPlain_stats__n_xIndir_misses_32 + .tocent__vgPlain_machine_ppc64_has_VMX: - .tc vgPlain_machine_ppc64_has_VMX[TC],vgPlain_machine_ppc64_has_VMX + .tc vgPlain_machine_ppc64_has_VMX[TC], vgPlain_machine_ppc64_has_VMX /*------------------------------------------------------------*/ /*--- ---*/ @@ -454,42 +467,122 @@ VG_(disp_cp_xindir): .globl .VG_(disp_cp_xindir) .VG_(disp_cp_xindir): /* Where are we going? */ - ld 3,OFFSET_ppc64_CIA(31) + ld 20, OFFSET_ppc64_CIA(31) /* stats only */ - ld 5, .tocent__vgPlain_stats__n_xindirs_32@toc(2) - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - /* r5 = &VG_(tt_fast) */ - ld 5, .tocent__vgPlain_tt_fast@toc(2) /* &VG_(tt_fast) */ - - /* try a fast lookup in the translation cache */ - /* r4 = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry) - = ((r3 >>u 2) & VG_TT_FAST_MASK) << 4 */ - rldicl 4,3, 62, 64-VG_TT_FAST_BITS /* entry# */ - sldi 4,4,4 /* entry# * sizeof(FastCacheEntry) */ - add 5,5,4 /* & VG_(tt_fast)[entry#] */ - ld 6,0(5) /* .guest */ - ld 7,8(5) /* .host */ - cmpd 3,6 - bne .fast_lookup_failed - - /* Found a match. Jump to .host. */ - mtctr 7 + ld 24, .tocent__vgPlain_stats__n_xIndirs_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + + // LIVE: r31 (guest state ptr), r20 (guest address to go to). + // We use 6 temporaries: + // r26 (to point at the relevant FastCacheSet), + // r21, r22, r23 (scratch, for swapping entries within a set) + // r24, r25 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute r26 = VG_TT_FAST_HASH(guest) + srdi 26, 20, 2 // g2 = guest >> 2 + srdi 25, 20, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor 26, 26, 25 // (g2 >> VG_TT_FAST_BITS) ^ g2 + andi. 
26, 26, VG_TT_FAST_MASK // setNo + + // Compute r6 = &VG_(tt_fast)[r6] + ld 25, .tocent__vgPlain_tt_fast@toc(2) + sldi 26, 26, VG_FAST_CACHE_SET_BITS + add 26, 26, 25 + + // LIVE: r31 (guest state ptr), r20 (guest addr), r26 (cache set) + // try way 0 + ld 24, FCS_g0(26) // .guest0 + ld 25, FCS_h0(26) // .host0 + cmpd 24, 20 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + mtctr 25 bctr + /*NOTREACHED*/ + +1: // try way 1 + ld 24, FCS_g1(26) + cmpd 24, 20 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + ld 21, FCS_g0(26) // 21 = old .guest0 + ld 22, FCS_h0(26) // 22 = old .host0 + ld 23, FCS_h1(26) // 23 = old .host1 + std 20, FCS_g0(26) // new .guest0 = guest + std 23, FCS_h0(26) // new .host0 = old .host1 + std 21, FCS_g1(26) // new .guest1 = old .guest0 + std 22, FCS_h1(26) // new .host1 = old .host0 + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits1_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host1 a.k.a. new .host0 + mtctr 23 + bctr + /*NOTREACHED*/ + +2: // try way 2 + ld 24, FCS_g2(26) + cmpd 24, 20 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + ld 21, FCS_g1(26) + ld 22, FCS_h1(26) + ld 23, FCS_h2(26) + std 20, FCS_g1(26) + std 23, FCS_h1(26) + std 21, FCS_g2(26) + std 22, FCS_h2(26) + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits2_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host2 a.k.a. new .host1 + mtctr 23 + bctr + /*NOTREACHED*/ + +3: // try way 3 + ld 24, FCS_g3(26) + cmpd 24, 20 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + ld 21, FCS_g2(26) + ld 22, FCS_h2(26) + ld 23, FCS_h3(26) + std 20, FCS_g2(26) + std 23, FCS_h2(26) + std 21, FCS_g3(26) + std 22, FCS_h3(26) + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits3_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host3 a.k.a. 
new .host2 + mtctr 23 + bctr + /*NOTREACHED*/ -.fast_lookup_failed: +4: // fast lookup failed: /* stats only */ - ld 5, .tocent__vgPlain_stats__n_xindir_misses_32@toc(2) - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - li 6,VG_TRC_INNER_FASTMISS - li 7,0 - b .postamble + ld 24, .tocent__vgPlain_stats__n_xIndir_misses_32@toc(2) + lwz 25, 0(24) + addi 25 ,25, 1 + stw 25 ,0(24) + + li 6,VG_TRC_INNER_FASTMISS + li 7,0 + b .postamble /*NOTREACHED*/ /* ------ Assisted jump ------ */ diff --git a/coregrind/m_dispatch/dispatch-ppc64le-linux.S b/coregrind/m_dispatch/dispatch-ppc64le-linux.S index 21e4358..3e26d77 100644 --- a/coregrind/m_dispatch/dispatch-ppc64le-linux.S +++ b/coregrind/m_dispatch/dispatch-ppc64le-linux.S @@ -54,14 +54,27 @@ .type vgPlain_tt_fast, @object */ .section ".toc","aw" + .tocent__vgPlain_tt_fast: .tc vgPlain_tt_fast[TC],vgPlain_tt_fast -.tocent__vgPlain_stats__n_xindirs_32: - .tc vgPlain_stats__n_xindirs_32[TC],vgPlain_stats__n_xindirs_32 -.tocent__vgPlain_stats__n_xindir_misses_32: - .tc vgPlain_stats__n_xindir_misses_32[TC],vgPlain_stats__n_xindir_misses_32 + +.tocent__vgPlain_stats__n_xIndirs_32: + .tc vgPlain_stats__n_xIndirs_32[TC], vgPlain_stats__n_xIndirs_32 + +.tocent__vgPlain_stats__n_xIndir_hits1_32: + .tc vgPlain_stats__n_xIndir_hits1_32[TC], vgPlain_stats__n_xIndir_hits1_32 + +.tocent__vgPlain_stats__n_xIndir_hits2_32: + .tc vgPlain_stats__n_xIndir_hits2_32[TC], vgPlain_stats__n_xIndir_hits2_32 + +.tocent__vgPlain_stats__n_xIndir_hits3_32: + .tc vgPlain_stats__n_xIndir_hits3_32[TC], vgPlain_stats__n_xIndir_hits3_32 + +.tocent__vgPlain_stats__n_xIndir_misses_32: + .tc vgPlain_stats__n_xIndir_misses_32[TC], vgPlain_stats__n_xIndir_misses_32 + .tocent__vgPlain_machine_ppc64_has_VMX: - .tc vgPlain_machine_ppc64_has_VMX[TC],vgPlain_machine_ppc64_has_VMX + .tc vgPlain_machine_ppc64_has_VMX[TC], vgPlain_machine_ppc64_has_VMX /*------------------------------------------------------------*/ /*--- ---*/ @@ -518,47 +531,127 @@ VG_(disp_cp_xindir): addi 2,2,.TOC.-0b@l .localentry VG_(disp_cp_xindir), .-VG_(disp_cp_xindir) #endif - /* Where are we going? */ - ld 3,OFFSET_ppc64_CIA(31) + /* Where are we going? */ + ld 20, OFFSET_ppc64_CIA(31) /* stats only */ - ld 5, .tocent__vgPlain_stats__n_xindirs_32@toc(2) - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - /* r5 = &VG_(tt_fast) */ - ld 5, .tocent__vgPlain_tt_fast@toc(2) /* &VG_(tt_fast) */ - - /* try a fast lookup in the translation cache */ - /* r4 = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry) - = ((r3 >>u 2) & VG_TT_FAST_MASK) << 4 */ - rldicl 4,3, 62, 64-VG_TT_FAST_BITS /* entry# */ - sldi 4,4,4 /* entry# * sizeof(FastCacheEntry) */ - add 5,5,4 /* & VG_(tt_fast)[entry#] */ - ld 6,0(5) /* .guest */ - ld 7,8(5) /* .host */ - cmpd 3,6 - bne .fast_lookup_failed - - /* Found a match. Jump to .host. */ - mtctr 7 + ld 24, .tocent__vgPlain_stats__n_xIndirs_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + + // LIVE: r31 (guest state ptr), r20 (guest address to go to). + // We use 6 temporaries: + // r26 (to point at the relevant FastCacheSet), + // r21, r22, r23 (scratch, for swapping entries within a set) + // r24, r25 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute r26 = VG_TT_FAST_HASH(guest) + srdi 26, 20, 2 // g2 = guest >> 2 + srdi 25, 20, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor 26, 26, 25 // (g2 >> VG_TT_FAST_BITS) ^ g2 + andi. 
26, 26, VG_TT_FAST_MASK // setNo + + // Compute r6 = &VG_(tt_fast)[r6] + ld 25, .tocent__vgPlain_tt_fast@toc(2) + sldi 26, 26, VG_FAST_CACHE_SET_BITS + add 26, 26, 25 + + // LIVE: r31 (guest state ptr), r20 (guest addr), r26 (cache set) + // try way 0 + ld 24, FCS_g0(26) // .guest0 + ld 25, FCS_h0(26) // .host0 + cmpd 24, 20 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + mtctr 25 bctr -#if _CALL_ELF == 2 - .size VG_(disp_cp_xindir),.-VG_(disp_cp_xindir) -#endif + /*NOTREACHED*/ + +1: // try way 1 + ld 24, FCS_g1(26) + cmpd 24, 20 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + ld 21, FCS_g0(26) // 21 = old .guest0 + ld 22, FCS_h0(26) // 22 = old .host0 + ld 23, FCS_h1(26) // 23 = old .host1 + std 20, FCS_g0(26) // new .guest0 = guest + std 23, FCS_h0(26) // new .host0 = old .host1 + std 21, FCS_g1(26) // new .guest1 = old .guest0 + std 22, FCS_h1(26) // new .host1 = old .host0 + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits1_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host1 a.k.a. new .host0 + mtctr 23 + bctr + /*NOTREACHED*/ + +2: // try way 2 + ld 24, FCS_g2(26) + cmpd 24, 20 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + ld 21, FCS_g1(26) + ld 22, FCS_h1(26) + ld 23, FCS_h2(26) + std 20, FCS_g1(26) + std 23, FCS_h1(26) + std 21, FCS_g2(26) + std 22, FCS_h2(26) + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits2_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host2 a.k.a. new .host1 + mtctr 23 + bctr + /*NOTREACHED*/ + +3: // try way 3 + ld 24, FCS_g3(26) + cmpd 24, 20 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + ld 21, FCS_g2(26) + ld 22, FCS_h2(26) + ld 23, FCS_h3(26) + std 20, FCS_g2(26) + std 23, FCS_h2(26) + std 21, FCS_g3(26) + std 22, FCS_h3(26) + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits3_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host3 a.k.a. new .host2 + mtctr 23 + bctr + /*NOTREACHED*/ -.fast_lookup_failed: +4: // fast lookup failed: /* stats only */ - ld 5, .tocent__vgPlain_stats__n_xindir_misses_32@toc(2) - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - li 6,VG_TRC_INNER_FASTMISS - li 7,0 - b .postamble + ld 24, .tocent__vgPlain_stats__n_xIndir_misses_32@toc(2) + lwz 25, 0(24) + addi 25 ,25, 1 + stw 25 ,0(24) + + li 6,VG_TRC_INNER_FASTMISS + li 7,0 + b .postamble /*NOTREACHED*/ +#if _CALL_ELF == 2 + .size VG_(disp_cp_xindir),.-VG_(disp_cp_xindir) +#endif /* ------ Assisted jump ------ */ .section ".text" diff --git a/coregrind/m_dispatch/dispatch-s390x-linux.S b/coregrind/m_dispatch/dispatch-s390x-linux.S index 83c2e2a..c31e32a 100644 --- a/coregrind/m_dispatch/dispatch-s390x-linux.S +++ b/coregrind/m_dispatch/dispatch-s390x-linux.S @@ -197,54 +197,121 @@ VG_(disp_cp_chain_me_to_fastEP): /* ------ Indirect but boring jump ------ */ .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): - /* Where are we going? 
*/ - lg %r2, OFFSET_s390x_IA(%r13) - - /* Increment VG_(stats__n_xindirs_32) */ - larl %r8, VG_(stats__n_xindirs_32) - l %r10,0(%r8) - ahi %r10,1 - st %r10,0(%r8) - - /* Try a fast lookup in the translation cache: - Compute offset (not index) into VT_(tt_fast): - - offset = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry) - - with VG_TT_FAST_HASH(addr) == (addr >> 1) & VG_TT_FAST_MASK - and sizeof(FastCacheEntry) == 16 - - offset = ((addr >> 1) & VG_TT_FAST_MASK) << 4 - which is - offset = ((addr & (VG_TT_FAST_MASK << 1) ) << 3 - */ - larl %r8, VG_(tt_fast) - llill %r5,(VG_TT_FAST_MASK << 1) & 0xffff -#if ((( VG_TT_FAST_MASK << 1) & 0xffff0000) >> 16 != 0) - iilh %r5,((VG_TT_FAST_MASK << 1) & 0xffff0000) >> 16 -#endif - ngr %r5,%r2 - sllg %r7,%r5,3 - lg %r11, 8(%r8,%r7) /* .host */ - cg %r2, 0(%r8,%r7) /* next guest address == .guest ? */ - jne fast_lookup_failed - - /* Found a match. Call .host. - r11 is an address. There we will find the instrumented client code. - That code may modify the guest state register r13. */ - br %r11 - .long 0x0 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: - /* Increment VG_(stats__n_xindir_misses_32) */ - larl %r8, VG_(stats__n_xindir_misses_32) - l %r10,0(%r8) - ahi %r10,1 - st %r10,0(%r8) - - lghi %r0,VG_TRC_INNER_FASTMISS - lghi %r1,0 + /* Where are we going? */ + lg %r6, OFFSET_s390x_IA(%r13) // "guest" + + /* stats only */ + larl %r11, VG_(stats__n_xIndirs_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + + // LIVE: r13 (guest state ptr), r6 (guest address to go to). + // We use 6 temporaries: + // r7 (to point at the relevant FastCacheSet), + // r8, r9, r10 (scratch, for swapping entries within a set) + // r11, r12 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute %r7 = VG_TT_FAST_HASH(guest) + srlg %r7, %r6, 1 // g1 = guest >> 1 + srlg %r8, %r6, (VG_TT_FAST_BITS + 1) // (g1 >> VG_TT_FAST_BITS) + xgr %r7, %r8 // (g1 >> VG_TT_FAST_BITS) ^ g1 + llill %r8, VG_TT_FAST_MASK & 0xffff +# if ((VG_TT_FAST_MASK & 0xffff0000) >> 16 != 0) + iilh %r8, (VG_TT_FAST_MASK & 0xffff0000) >> 16 +# endif + ngr %r7, %r8 // setNo + + // Compute %r7 = &VG_(tt_fast)[%r7] + sllg %r7,%r7, VG_FAST_CACHE_SET_BITS // setNo * sizeof(FastCacheSet) + larl %r8, VG_(tt_fast) // &VG_(tt_fast)[0] + agr %r7, %r8 // &VG_(tt_fast)[setNo] + + // LIVE: %r13 (guest state ptr), %r6 (guest addr), %r7 (cache set) + // try way 0 + cg %r6, FCS_g0(%r7) // cmp against .guest0 + lg %r8, FCS_h0(%r7) + jne 1f + // hit at way 0 + // goto .host0 + br %r8 + /*NOTREACHED*/ + .long 0 + +1: // try way 1 + cg %r6, FCS_g1(%r7) // cmp against .guest1 + jne 2f + // hit at way 1; swap upwards + lg %r8, FCS_g0(%r7) // r8 = old .guest0 + lg %r9, FCS_h0(%r7) // r9 = old .host0 + lg %r10, FCS_h1(%r7) // r10 = old .host1 + stg %r6, FCS_g0(%r7) // new .guest0 = guest + stg %r10, FCS_h0(%r7) // new .host0 = old .host1 + stg %r8, FCS_g1(%r7) // new .guest1 = old .guest0 + stg %r9, FCS_h1(%r7) // new .host1 = old .host0 + // stats only + larl %r11, VG_(stats__n_xIndir_hits1_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + // goto old .host1 a.k.a. 
new .host0 + br %r10 + /*NOTREACHED*/ + .long 0 + +2: // try way 2 + cg %r6, FCS_g2(%r7) // cmp against .guest2 + jne 3f + lg %r8, FCS_g1(%r7) + lg %r9, FCS_h1(%r7) + lg %r10, FCS_h2(%r7) + stg %r6, FCS_g1(%r7) + stg %r10, FCS_h1(%r7) + stg %r8, FCS_g2(%r7) + stg %r9, FCS_h2(%r7) + // stats only + larl %r11, VG_(stats__n_xIndir_hits2_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + // goto old .host2 a.k.a. new .host1 + br %r10 + /*NOTREACHED*/ + .long 0 + +3: // try way 3 + cg %r6, FCS_g3(%r7) // cmp against .guest3 + jne 4f + // hit at way 3; swap upwards + lg %r8, FCS_g2(%r7) + lg %r9, FCS_h2(%r7) + lg %r10, FCS_h3(%r7) + stg %r6, FCS_g2(%r7) + stg %r10, FCS_h2(%r7) + stg %r8, FCS_g3(%r7) + stg %r9, FCS_h3(%r7) + // stats only + larl %r11, VG_(stats__n_xIndir_hits3_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + // goto old .host3 a.k.a. new .host2 + br %r10 + .long 0 + +4: // fast lookup failed + larl %r11, VG_(stats__n_xIndir_misses_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + + lghi %r0, VG_TRC_INNER_FASTMISS + lghi %r1, 0 j postamble + /*NOTREACHED*/ /* ------ Assisted jump ------ */ diff --git a/coregrind/m_dispatch/dispatch-x86-darwin.S b/coregrind/m_dispatch/dispatch-x86-darwin.S index 55188e9..467d7d6 100644 --- a/coregrind/m_dispatch/dispatch-x86-darwin.S +++ b/coregrind/m_dispatch/dispatch-x86-darwin.S @@ -194,29 +19... [truncated message content] |
From: Julian S. <se...@so...> - 2019-01-25 08:29:43
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=f96d131ce24cb403cc7a43c19bb651dd25fbe122 commit f96d131ce24cb403cc7a43c19bb651dd25fbe122 Author: Julian Seward <js...@ac...> Date: Fri Jan 25 09:27:23 2019 +0100 Bug 402781 - Redo the cache used to process indirect branch targets. Implementation for x86-solaris and amd64-solaris. This completes the implementations for all targets. Note these two are untested because I don't have any way to test them. Diff: --- coregrind/m_dispatch/dispatch-amd64-solaris.S | 99 ++++++++++++++++++++++----- coregrind/m_dispatch/dispatch-x86-solaris.S | 96 +++++++++++++++++++++----- 2 files changed, 159 insertions(+), 36 deletions(-) diff --git a/coregrind/m_dispatch/dispatch-amd64-solaris.S b/coregrind/m_dispatch/dispatch-amd64-solaris.S index 79bb512..2cccf1f 100644 --- a/coregrind/m_dispatch/dispatch-amd64-solaris.S +++ b/coregrind/m_dispatch/dispatch-amd64-solaris.S @@ -205,28 +205,89 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - movq OFFSET_amd64_RIP(%rbp), %rax + movq OFFSET_amd64_RIP(%rbp), %rax // "guest" /* stats only */ - addl $1, VG_(stats__n_xindirs_32) - - /* try a fast lookup in the translation cache */ - movabsq $VG_(tt_fast), %rcx - movq %rax, %rbx /* next guest addr */ - andq $VG_TT_FAST_MASK, %rbx /* entry# */ - shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */ - movq 0(%rcx,%rbx,1), %r10 /* .guest */ - movq 8(%rcx,%rbx,1), %r11 /* .host */ - cmpq %rax, %r10 - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%r11 - ud2 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: + addl $1, VG_(stats__n_xIndirs_32) + + // LIVE: %rbp (guest state ptr), %rax (guest address to go to). + // We use 4 temporaries: + // %r9 (to point at the relevant FastCacheSet), + // %r10, %r11 and %r12 (scratch). + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute %r9 = VG_TT_FAST_HASH(guest) + movq %rax, %r9 // guest + shrq $VG_TT_FAST_BITS, %r9 // (guest >> VG_TT_FAST_BITS) + xorq %rax, %r9 // (guest >> VG_TT_FAST_BITS) ^ guest + andq $VG_TT_FAST_MASK, %r9 // setNo + + // Compute %r9 = &VG_(tt_fast)[%r9] + shlq $VG_FAST_CACHE_SET_BITS, %r9 // setNo * sizeof(FastCacheSet) + movabsq $VG_(tt_fast), %r10 // &VG_(tt_fast)[0] + leaq (%r10, %r9), %r9 // &VG_(tt_fast)[setNo] + + // LIVE: %rbp (guest state ptr), %rax (guest addr), %r9 (cache set) + // try way 0 + cmpq %rax, FCS_g0(%r9) // cmp against .guest0 + jnz 1f + // hit at way 0 + jmp *FCS_h0(%r9) // goto .host0 + ud2 + +1: // try way 1 + cmpq %rax, FCS_g1(%r9) // cmp against .guest1 + jnz 2f + // hit at way 1; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits1_32) + movq FCS_g0(%r9), %r10 // r10 = old .guest0 + movq FCS_h0(%r9), %r11 // r11 = old .host0 + movq FCS_h1(%r9), %r12 // r12 = old .host1 + movq %rax, FCS_g0(%r9) // new .guest0 = guest + movq %r12, FCS_h0(%r9) // new .host0 = old .host1 + movq %r10, FCS_g1(%r9) // new .guest1 = old .guest0 + movq %r11, FCS_h1(%r9) // new .host1 = old .host0 + jmp *%r12 // goto old .host1 a.k.a. 
new .host0 + ud2 + +2: // try way 2 + cmpq %rax, FCS_g2(%r9) // cmp against .guest2 + jnz 3f + // hit at way 2; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits2_32) + movq FCS_g1(%r9), %r10 + movq FCS_h1(%r9), %r11 + movq FCS_h2(%r9), %r12 + movq %rax, FCS_g1(%r9) + movq %r12, FCS_h1(%r9) + movq %r10, FCS_g2(%r9) + movq %r11, FCS_h2(%r9) + jmp *%r12 + ud2 + +3: // try way 3 + cmpq %rax, FCS_g3(%r9) // cmp against .guest3 + jnz 4f + // hit at way 3; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits3_32) + movq FCS_g2(%r9), %r10 + movq FCS_h2(%r9), %r11 + movq FCS_h3(%r9), %r12 + movq %rax, FCS_g2(%r9) + movq %r12, FCS_h2(%r9) + movq %r10, FCS_g3(%r9) + movq %r11, FCS_h3(%r9) + jmp *%r12 + ud2 + +4: // fast lookup failed /* stats only */ - addl $1, VG_(stats__n_xindir_misses_32) + addl $1, VG_(stats__n_xIndir_misses_32) movq $VG_TRC_INNER_FASTMISS, %rax movq $0, %rdx diff --git a/coregrind/m_dispatch/dispatch-x86-solaris.S b/coregrind/m_dispatch/dispatch-x86-solaris.S index aec5b3a..c7d23f2 100644 --- a/coregrind/m_dispatch/dispatch-x86-solaris.S +++ b/coregrind/m_dispatch/dispatch-x86-solaris.S @@ -198,26 +198,88 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - movl OFFSET_x86_EIP(%ebp), %eax + movl OFFSET_x86_EIP(%ebp), %eax // "guest" /* stats only */ - addl $1, VG_(stats__n_xindirs_32) - - /* try a fast lookup in the translation cache */ - movl %eax, %ebx /* next guest addr */ - andl $VG_TT_FAST_MASK, %ebx /* entry# */ - movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */ - movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */ - cmpl %eax, %esi - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%edi - ud2 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: + addl $1, VG_(stats__n_xIndirs_32) + + // LIVE: %ebp (guest state ptr), %eax (guest address to go to). + // We use 4 temporaries: + // %esi (to point at the relevant FastCacheSet), + // %ebx, %ecx and %edx (scratch). + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute %esi = VG_TT_FAST_HASH(guest) + movl %eax, %esi // guest + shrl $VG_TT_FAST_BITS, %esi // (guest >> VG_TT_FAST_BITS) + xorl %eax, %esi // (guest >> VG_TT_FAST_BITS) ^ guest + andl $VG_TT_FAST_MASK, %esi // setNo + + // Compute %esi = &VG_(tt_fast)[%esi] + shll $VG_FAST_CACHE_SET_BITS, %esi // setNo * sizeof(FastCacheSet) + leal VG_(tt_fast)(%esi), %esi // &VG_(tt_fast)[setNo] + + // LIVE: %ebp (guest state ptr), %eax (guest addr), %esi (cache set) + // try way 0 + cmpl %eax, FCS_g0(%esi) // cmp against .guest0 + jnz 1f + // hit at way 0 + jmp *FCS_h0(%esi) // goto .host0 + ud2 + +1: // try way 1 + cmpl %eax, FCS_g1(%esi) // cmp against .guest1 + jnz 2f + // hit at way 1; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits1_32) + movl FCS_g0(%esi), %ebx // ebx = old .guest0 + movl FCS_h0(%esi), %ecx // ecx = old .host0 + movl FCS_h1(%esi), %edx // edx = old .host1 + movl %eax, FCS_g0(%esi) // new .guest0 = guest + movl %edx, FCS_h0(%esi) // new .host0 = old .host1 + movl %ebx, FCS_g1(%esi) // new .guest1 = old .guest0 + movl %ecx, FCS_h1(%esi) // new .host1 = old .host0 + jmp *%edx // goto old .host1 a.k.a. 
new .host0 + ud2 + +2: // try way 2 + cmpl %eax, FCS_g2(%esi) // cmp against .guest2 + jnz 3f + // hit at way 2; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits2_32) + movl FCS_g1(%esi), %ebx + movl FCS_h1(%esi), %ecx + movl FCS_h2(%esi), %edx + movl %eax, FCS_g1(%esi) + movl %edx, FCS_h1(%esi) + movl %ebx, FCS_g2(%esi) + movl %ecx, FCS_h2(%esi) + jmp *%edx + ud2 + +3: // try way 3 + cmpl %eax, FCS_g3(%esi) // cmp against .guest3 + jnz 4f + // hit at way 3; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits3_32) + movl FCS_g2(%esi), %ebx + movl FCS_h2(%esi), %ecx + movl FCS_h3(%esi), %edx + movl %eax, FCS_g2(%esi) + movl %edx, FCS_h2(%esi) + movl %ebx, FCS_g3(%esi) + movl %ecx, FCS_h3(%esi) + jmp *%edx + ud2 + +4: // fast lookup failed /* stats only */ - addl $1, VG_(stats__n_xindir_misses_32) + addl $1, VG_(stats__n_xIndir_misses_32) movl $VG_TRC_INNER_FASTMISS, %eax movl $0, %edx |
From: Ed M. <em...@fr...> - 2019-02-14 18:42:15
On Fri, 25 Jan 2019 at 03:20, Julian Seward <se...@so...> wrote:
>
> https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=50bb127b1df8d31812141aafa567d325d1fbc1b3
>
> commit 50bb127b1df8d31812141aafa567d325d1fbc1b3
> Author: Julian Seward <js...@ac...>
> Date:   Fri Jan 25 09:14:56 2019 +0100
>
> Bug 402781 - Redo the cache used to process indirect branch targets.
>
> [This commit contains an implementation for all targets except amd64-solaris
> and x86-solaris, which will be completed shortly.]

I'll try to make sure FreeBSD implementations get added as soon as possible.