You can subscribe to this list here.
| 2002 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
(1) |
Oct
(122) |
Nov
(152) |
Dec
(69) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2003 |
Jan
(6) |
Feb
(25) |
Mar
(73) |
Apr
(82) |
May
(24) |
Jun
(25) |
Jul
(10) |
Aug
(11) |
Sep
(10) |
Oct
(54) |
Nov
(203) |
Dec
(182) |
| 2004 |
Jan
(307) |
Feb
(305) |
Mar
(430) |
Apr
(312) |
May
(187) |
Jun
(342) |
Jul
(487) |
Aug
(637) |
Sep
(336) |
Oct
(373) |
Nov
(441) |
Dec
(210) |
| 2005 |
Jan
(385) |
Feb
(480) |
Mar
(636) |
Apr
(544) |
May
(679) |
Jun
(625) |
Jul
(810) |
Aug
(838) |
Sep
(634) |
Oct
(521) |
Nov
(965) |
Dec
(543) |
| 2006 |
Jan
(494) |
Feb
(431) |
Mar
(546) |
Apr
(411) |
May
(406) |
Jun
(322) |
Jul
(256) |
Aug
(401) |
Sep
(345) |
Oct
(542) |
Nov
(308) |
Dec
(481) |
| 2007 |
Jan
(427) |
Feb
(326) |
Mar
(367) |
Apr
(255) |
May
(244) |
Jun
(204) |
Jul
(223) |
Aug
(231) |
Sep
(354) |
Oct
(374) |
Nov
(497) |
Dec
(362) |
| 2008 |
Jan
(322) |
Feb
(482) |
Mar
(658) |
Apr
(422) |
May
(476) |
Jun
(396) |
Jul
(455) |
Aug
(267) |
Sep
(280) |
Oct
(253) |
Nov
(232) |
Dec
(304) |
| 2009 |
Jan
(486) |
Feb
(470) |
Mar
(458) |
Apr
(423) |
May
(696) |
Jun
(461) |
Jul
(551) |
Aug
(575) |
Sep
(134) |
Oct
(110) |
Nov
(157) |
Dec
(102) |
| 2010 |
Jan
(226) |
Feb
(86) |
Mar
(147) |
Apr
(117) |
May
(107) |
Jun
(203) |
Jul
(193) |
Aug
(238) |
Sep
(300) |
Oct
(246) |
Nov
(23) |
Dec
(75) |
| 2011 |
Jan
(133) |
Feb
(195) |
Mar
(315) |
Apr
(200) |
May
(267) |
Jun
(293) |
Jul
(353) |
Aug
(237) |
Sep
(278) |
Oct
(611) |
Nov
(274) |
Dec
(260) |
| 2012 |
Jan
(303) |
Feb
(391) |
Mar
(417) |
Apr
(441) |
May
(488) |
Jun
(655) |
Jul
(590) |
Aug
(610) |
Sep
(526) |
Oct
(478) |
Nov
(359) |
Dec
(372) |
| 2013 |
Jan
(467) |
Feb
(226) |
Mar
(391) |
Apr
(281) |
May
(299) |
Jun
(252) |
Jul
(311) |
Aug
(352) |
Sep
(481) |
Oct
(571) |
Nov
(222) |
Dec
(231) |
| 2014 |
Jan
(185) |
Feb
(329) |
Mar
(245) |
Apr
(238) |
May
(281) |
Jun
(399) |
Jul
(382) |
Aug
(500) |
Sep
(579) |
Oct
(435) |
Nov
(487) |
Dec
(256) |
| 2015 |
Jan
(338) |
Feb
(357) |
Mar
(330) |
Apr
(294) |
May
(191) |
Jun
(108) |
Jul
(142) |
Aug
(261) |
Sep
(190) |
Oct
(54) |
Nov
(83) |
Dec
(22) |
| 2016 |
Jan
(49) |
Feb
(89) |
Mar
(33) |
Apr
(50) |
May
(27) |
Jun
(34) |
Jul
(53) |
Aug
(53) |
Sep
(98) |
Oct
(206) |
Nov
(93) |
Dec
(53) |
| 2017 |
Jan
(65) |
Feb
(82) |
Mar
(102) |
Apr
(86) |
May
(187) |
Jun
(67) |
Jul
(23) |
Aug
(93) |
Sep
(65) |
Oct
(45) |
Nov
(35) |
Dec
(17) |
| 2018 |
Jan
(26) |
Feb
(35) |
Mar
(38) |
Apr
(32) |
May
(8) |
Jun
(43) |
Jul
(27) |
Aug
(30) |
Sep
(43) |
Oct
(42) |
Nov
(38) |
Dec
(67) |
| 2019 |
Jan
(32) |
Feb
(37) |
Mar
(53) |
Apr
(64) |
May
(49) |
Jun
(18) |
Jul
(14) |
Aug
(53) |
Sep
(25) |
Oct
(30) |
Nov
(49) |
Dec
(31) |
| 2020 |
Jan
(87) |
Feb
(45) |
Mar
(37) |
Apr
(51) |
May
(99) |
Jun
(36) |
Jul
(11) |
Aug
(14) |
Sep
(20) |
Oct
(24) |
Nov
(40) |
Dec
(23) |
| 2021 |
Jan
(14) |
Feb
(53) |
Mar
(85) |
Apr
(15) |
May
(19) |
Jun
(3) |
Jul
(14) |
Aug
(1) |
Sep
(57) |
Oct
(73) |
Nov
(56) |
Dec
(22) |
| 2022 |
Jan
(3) |
Feb
(22) |
Mar
(6) |
Apr
(55) |
May
(46) |
Jun
(39) |
Jul
(15) |
Aug
(9) |
Sep
(11) |
Oct
(34) |
Nov
(20) |
Dec
(36) |
| 2023 |
Jan
(79) |
Feb
(41) |
Mar
(99) |
Apr
(169) |
May
(48) |
Jun
(16) |
Jul
(16) |
Aug
(57) |
Sep
(19) |
Oct
|
Nov
|
Dec
|
| S | M | T | W | T | F | S |
|---|---|---|---|---|---|---|
|
|
|
1
|
2
(6) |
3
(1) |
4
(1) |
5
|
|
6
|
7
|
8
(1) |
9
|
10
|
11
(4) |
12
(1) |
|
13
|
14
(1) |
15
|
16
|
17
|
18
|
19
|
|
20
|
21
|
22
|
23
|
24
(1) |
25
(4) |
26
(4) |
|
27
(5) |
28
|
29
(2) |
30
|
31
(1) |
|
|
|
From: Julian S. <se...@so...> - 2019-01-25 11:08:06
|
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=a05a920edcf0490529815d77d2d0cc47c121c120 commit a05a920edcf0490529815d77d2d0cc47c121c120 Author: Julian Seward <js...@ac...> Date: Fri Jan 25 12:06:37 2019 +0100 VG_(discard_translations): try to avoid invalidating the entire VG_(tt_fast) cache. n-i-bz. It is very commonly the case that a call to VG_(discard_translations) results in the discarding of exactly one superblock. In such cases, it's much cheaper to find and invalidate the VG_(tt_fast) cache entry associated with the block, than it is to invalidate the entire cache, because (1) invalidating the fast cache is expensive, and (2) repopulating the fast cache after invalidation is even more expensive. For QEMU, which intensively invalidates individual translations (presumably due to patching them), this reduces the fast-cache miss rate from circa one in 33 lookups to around one in 130 lookups. Diff: --- coregrind/m_transtab.c | 123 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 95 insertions(+), 28 deletions(-) diff --git a/coregrind/m_transtab.c b/coregrind/m_transtab.c index 23ecb11..f7717f6 100644 --- a/coregrind/m_transtab.c +++ b/coregrind/m_transtab.c @@ -1475,6 +1475,34 @@ static void invalidateFastCache ( void ) n_fast_flushes++; } +/* Invalidate a single fast cache entry. */ +static void invalidateFastCacheEntry ( Addr guest ) +{ + /* This shouldn't fail. It should be assured by m_translate + which should reject any attempt to make translation of code + starting at TRANSTAB_BOGUS_GUEST_ADDR. */ + vg_assert(guest != TRANSTAB_BOGUS_GUEST_ADDR); + /* If any entry in the line is the right one, just set it to + TRANSTAB_BOGUS_GUEST_ADDR. Doing so ensure that the entry will never + be used in future, so will eventually fall off the end of the line, + due to LRU replacement, and be replaced with something that's actually + useful. 
*/ + UWord setNo = (UInt)VG_TT_FAST_HASH(guest); + FastCacheSet* set = &VG_(tt_fast)[setNo]; + if (set->guest0 == guest) { + set->guest0 = TRANSTAB_BOGUS_GUEST_ADDR; + } + if (set->guest1 == guest) { + set->guest1 = TRANSTAB_BOGUS_GUEST_ADDR; + } + if (set->guest2 == guest) { + set->guest2 = TRANSTAB_BOGUS_GUEST_ADDR; + } + if (set->guest3 == guest) { + set->guest3 = TRANSTAB_BOGUS_GUEST_ADDR; + } +} + static void setFastCacheEntry ( Addr guest, ULong* tcptr ) { /* This shouldn't fail. It should be assured by m_translate @@ -1984,7 +2012,8 @@ Bool overlaps ( Addr start, ULong range, const TTEntryH* tteH ) /* Delete a tt entry, and update all the eclass data accordingly. */ -static void delete_tte ( /*MOD*/Sector* sec, SECno secNo, TTEno tteno, +static void delete_tte ( /*OUT*/Addr* ga_deleted, + /*MOD*/Sector* sec, SECno secNo, TTEno tteno, VexArch arch_host, VexEndness endness_host ) { Int i, ec_idx; @@ -1999,6 +2028,10 @@ static void delete_tte ( /*MOD*/Sector* sec, SECno secNo, TTEno tteno, vg_assert(tteH->status == InUse); vg_assert(tteC->n_tte2ec >= 1 && tteC->n_tte2ec <= 3); + vg_assert(tteH->vge_n_used >= 1 && tteH->vge_n_used <= 3); + vg_assert(tteH->vge_base[0] != TRANSTAB_BOGUS_GUEST_ADDR); + *ga_deleted = tteH->vge_base[0]; + /* Unchain .. */ unchain_in_preparation_for_deletion(arch_host, endness_host, secNo, tteno); @@ -2053,15 +2086,16 @@ static void delete_tte ( /*MOD*/Sector* sec, SECno secNo, TTEno tteno, only consider translations in the specified eclass. 
*/ static -Bool delete_translations_in_sector_eclass ( /*MOD*/Sector* sec, SECno secNo, - Addr guest_start, ULong range, - EClassNo ec, - VexArch arch_host, - VexEndness endness_host ) +SizeT delete_translations_in_sector_eclass ( /*OUT*/Addr* ga_deleted, + /*MOD*/Sector* sec, SECno secNo, + Addr guest_start, ULong range, + EClassNo ec, + VexArch arch_host, + VexEndness endness_host ) { Int i; TTEno tteno; - Bool anyDeld = False; + SizeT numDeld = 0; vg_assert(ec >= 0 && ec < ECLASS_N); @@ -2079,13 +2113,13 @@ Bool delete_translations_in_sector_eclass ( /*MOD*/Sector* sec, SECno secNo, vg_assert(tteH->status == InUse); if (overlaps( guest_start, range, tteH )) { - anyDeld = True; - delete_tte( sec, secNo, tteno, arch_host, endness_host ); + numDeld++; + delete_tte( ga_deleted, sec, secNo, tteno, arch_host, endness_host ); } } - return anyDeld; + return numDeld; } @@ -2093,13 +2127,14 @@ Bool delete_translations_in_sector_eclass ( /*MOD*/Sector* sec, SECno secNo, slow way, by inspecting all translations in sec. */ static -Bool delete_translations_in_sector ( /*MOD*/Sector* sec, SECno secNo, - Addr guest_start, ULong range, - VexArch arch_host, - VexEndness endness_host ) +SizeT delete_translations_in_sector ( /*OUT*/Addr* ga_deleted, + /*MOD*/Sector* sec, SECno secNo, + Addr guest_start, ULong range, + VexArch arch_host, + VexEndness endness_host ) { TTEno i; - Bool anyDeld = False; + SizeT numDeld = 0; for (i = 0; i < N_TTES_PER_SECTOR; i++) { /* The entire and only purpose of splitting TTEntry into cold @@ -2108,12 +2143,12 @@ Bool delete_translations_in_sector ( /*MOD*/Sector* sec, SECno secNo, of the cold data up the memory hierarchy. 
*/ if (UNLIKELY(sec->ttH[i].status == InUse && overlaps( guest_start, range, &sec->ttH[i] ))) { - anyDeld = True; - delete_tte( sec, secNo, i, arch_host, endness_host ); + numDeld++; + delete_tte( ga_deleted, sec, secNo, i, arch_host, endness_host ); } } - return anyDeld; + return numDeld; } @@ -2123,7 +2158,24 @@ void VG_(discard_translations) ( Addr guest_start, ULong range, Sector* sec; SECno sno; EClassNo ec; - Bool anyDeleted = False; + + /* It is very commonly the case that a call here results in discarding of + exactly one superblock. As an optimisation only, use ga_deleted and + numDeleted to detect this situation and to record the guest addr involved. + That is then used to avoid calling invalidateFastCache in this case. + Instead the individual entry in the fast cache is removed. This can reduce + the overall VG_(fast_cache) miss rate significantly in applications that do + a lot of short code discards (basically jit generated code that is + subsequently patched). + + ga_deleted is made to hold the guest address of the last superblock deleted + (in this call to VG_(discard_translations)). If more than one superblock + is deleted (or none), then we ignore what is written to ga_deleted. If + exactly one superblock is deleted then ga_deleted holds exactly what we + want and will be used. 
+ */ + Addr ga_deleted = TRANSTAB_BOGUS_GUEST_ADDR; + SizeT numDeleted = 0; vg_assert(init_done); @@ -2184,13 +2236,13 @@ void VG_(discard_translations) ( Addr guest_start, ULong range, sec = &sectors[sno]; if (sec->tc == NULL) continue; - anyDeleted |= delete_translations_in_sector_eclass( - sec, sno, guest_start, range, ec, - arch_host, endness_host + numDeleted += delete_translations_in_sector_eclass( + &ga_deleted, sec, sno, guest_start, range, + ec, arch_host, endness_host ); - anyDeleted |= delete_translations_in_sector_eclass( - sec, sno, guest_start, range, ECLASS_MISC, - arch_host, endness_host + numDeleted += delete_translations_in_sector_eclass( + &ga_deleted, sec, sno, guest_start, range, + ECLASS_MISC, arch_host, endness_host ); } @@ -2205,16 +2257,31 @@ void VG_(discard_translations) ( Addr guest_start, ULong range, sec = &sectors[sno]; if (sec->tc == NULL) continue; - anyDeleted |= delete_translations_in_sector( - sec, sno, guest_start, range, + numDeleted += delete_translations_in_sector( + &ga_deleted, sec, sno, guest_start, range, arch_host, endness_host ); } } - if (anyDeleted) + if (numDeleted == 0) { + // "ga_deleted was never set" + vg_assert(ga_deleted == TRANSTAB_BOGUS_GUEST_ADDR); + } else + if (numDeleted == 1) { + // "ga_deleted was set to something valid" + vg_assert(ga_deleted != TRANSTAB_BOGUS_GUEST_ADDR); + // Just invalidate the individual VG_(tt_fast) cache entry \o/ + invalidateFastCacheEntry(ga_deleted); + Addr fake_host = 0; + vg_assert(! VG_(lookupInFastCache)(&fake_host, ga_deleted)); + } else { + // "ga_deleted was set to something valid" + vg_assert(ga_deleted != TRANSTAB_BOGUS_GUEST_ADDR); + // Nuke the entire VG_(tt_fast) cache. Sigh. invalidateFastCache(); + } /* don't forget the no-redir cache */ unredir_discard_translations( guest_start, range ); |
|
From: Julian S. <se...@so...> - 2019-01-25 08:31:49
|
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=f4072abf6bc9e7791e7b34157fb25210a523dba9 commit f4072abf6bc9e7791e7b34157fb25210a523dba9 Author: Julian Seward <js...@ac...> Date: Fri Jan 25 09:31:19 2019 +0100 Update. Diff: --- NEWS | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS b/NEWS index fcd5ad9..01b1018 100644 --- a/NEWS +++ b/NEWS @@ -86,6 +86,7 @@ where XXXXXX is the bug number as listed below. 402515 Implement new option --show-error-list=no|yes / -s 402519 POWER 3.0 addex instruction incorrectly implemented 403552 s390x: wrong facility bit checked for vector facility +402781 Redo the cache used to process indirect branch targets n-i-bz add syswrap for PTRACE_GET|SET_THREAD_AREA on amd64. |
|
From: Julian S. <se...@so...> - 2019-01-25 08:29:43
|
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=f96d131ce24cb403cc7a43c19bb651dd25fbe122 commit f96d131ce24cb403cc7a43c19bb651dd25fbe122 Author: Julian Seward <js...@ac...> Date: Fri Jan 25 09:27:23 2019 +0100 Bug 402781 - Redo the cache used to process indirect branch targets. Implementation for x86-solaris and amd64-solaris. This completes the implementations for all targets. Note these two are untested because I don't have any way to test them. Diff: --- coregrind/m_dispatch/dispatch-amd64-solaris.S | 99 ++++++++++++++++++++++----- coregrind/m_dispatch/dispatch-x86-solaris.S | 96 +++++++++++++++++++++----- 2 files changed, 159 insertions(+), 36 deletions(-) diff --git a/coregrind/m_dispatch/dispatch-amd64-solaris.S b/coregrind/m_dispatch/dispatch-amd64-solaris.S index 79bb512..2cccf1f 100644 --- a/coregrind/m_dispatch/dispatch-amd64-solaris.S +++ b/coregrind/m_dispatch/dispatch-amd64-solaris.S @@ -205,28 +205,89 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - movq OFFSET_amd64_RIP(%rbp), %rax + movq OFFSET_amd64_RIP(%rbp), %rax // "guest" /* stats only */ - addl $1, VG_(stats__n_xindirs_32) - - /* try a fast lookup in the translation cache */ - movabsq $VG_(tt_fast), %rcx - movq %rax, %rbx /* next guest addr */ - andq $VG_TT_FAST_MASK, %rbx /* entry# */ - shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */ - movq 0(%rcx,%rbx,1), %r10 /* .guest */ - movq 8(%rcx,%rbx,1), %r11 /* .host */ - cmpq %rax, %r10 - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%r11 - ud2 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: + addl $1, VG_(stats__n_xIndirs_32) + + // LIVE: %rbp (guest state ptr), %rax (guest address to go to). + // We use 4 temporaries: + // %r9 (to point at the relevant FastCacheSet), + // %r10, %r11 and %r12 (scratch). + + /* Try a fast lookup in the translation cache. 
This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute %r9 = VG_TT_FAST_HASH(guest) + movq %rax, %r9 // guest + shrq $VG_TT_FAST_BITS, %r9 // (guest >> VG_TT_FAST_BITS) + xorq %rax, %r9 // (guest >> VG_TT_FAST_BITS) ^ guest + andq $VG_TT_FAST_MASK, %r9 // setNo + + // Compute %r9 = &VG_(tt_fast)[%r9] + shlq $VG_FAST_CACHE_SET_BITS, %r9 // setNo * sizeof(FastCacheSet) + movabsq $VG_(tt_fast), %r10 // &VG_(tt_fast)[0] + leaq (%r10, %r9), %r9 // &VG_(tt_fast)[setNo] + + // LIVE: %rbp (guest state ptr), %rax (guest addr), %r9 (cache set) + // try way 0 + cmpq %rax, FCS_g0(%r9) // cmp against .guest0 + jnz 1f + // hit at way 0 + jmp *FCS_h0(%r9) // goto .host0 + ud2 + +1: // try way 1 + cmpq %rax, FCS_g1(%r9) // cmp against .guest1 + jnz 2f + // hit at way 1; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits1_32) + movq FCS_g0(%r9), %r10 // r10 = old .guest0 + movq FCS_h0(%r9), %r11 // r11 = old .host0 + movq FCS_h1(%r9), %r12 // r12 = old .host1 + movq %rax, FCS_g0(%r9) // new .guest0 = guest + movq %r12, FCS_h0(%r9) // new .host0 = old .host1 + movq %r10, FCS_g1(%r9) // new .guest1 = old .guest0 + movq %r11, FCS_h1(%r9) // new .host1 = old .host0 + jmp *%r12 // goto old .host1 a.k.a. 
new .host0 + ud2 + +2: // try way 2 + cmpq %rax, FCS_g2(%r9) // cmp against .guest2 + jnz 3f + // hit at way 2; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits2_32) + movq FCS_g1(%r9), %r10 + movq FCS_h1(%r9), %r11 + movq FCS_h2(%r9), %r12 + movq %rax, FCS_g1(%r9) + movq %r12, FCS_h1(%r9) + movq %r10, FCS_g2(%r9) + movq %r11, FCS_h2(%r9) + jmp *%r12 + ud2 + +3: // try way 3 + cmpq %rax, FCS_g3(%r9) // cmp against .guest3 + jnz 4f + // hit at way 3; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits3_32) + movq FCS_g2(%r9), %r10 + movq FCS_h2(%r9), %r11 + movq FCS_h3(%r9), %r12 + movq %rax, FCS_g2(%r9) + movq %r12, FCS_h2(%r9) + movq %r10, FCS_g3(%r9) + movq %r11, FCS_h3(%r9) + jmp *%r12 + ud2 + +4: // fast lookup failed /* stats only */ - addl $1, VG_(stats__n_xindir_misses_32) + addl $1, VG_(stats__n_xIndir_misses_32) movq $VG_TRC_INNER_FASTMISS, %rax movq $0, %rdx diff --git a/coregrind/m_dispatch/dispatch-x86-solaris.S b/coregrind/m_dispatch/dispatch-x86-solaris.S index aec5b3a..c7d23f2 100644 --- a/coregrind/m_dispatch/dispatch-x86-solaris.S +++ b/coregrind/m_dispatch/dispatch-x86-solaris.S @@ -198,26 +198,88 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - movl OFFSET_x86_EIP(%ebp), %eax + movl OFFSET_x86_EIP(%ebp), %eax // "guest" /* stats only */ - addl $1, VG_(stats__n_xindirs_32) - - /* try a fast lookup in the translation cache */ - movl %eax, %ebx /* next guest addr */ - andl $VG_TT_FAST_MASK, %ebx /* entry# */ - movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */ - movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */ - cmpl %eax, %esi - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%edi - ud2 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: + addl $1, VG_(stats__n_xIndirs_32) + + // LIVE: %ebp (guest state ptr), %eax (guest address to go to). 
+ // We use 4 temporaries: + // %esi (to point at the relevant FastCacheSet), + // %ebx, %ecx and %edx (scratch). + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute %esi = VG_TT_FAST_HASH(guest) + movl %eax, %esi // guest + shrl $VG_TT_FAST_BITS, %esi // (guest >> VG_TT_FAST_BITS) + xorl %eax, %esi // (guest >> VG_TT_FAST_BITS) ^ guest + andl $VG_TT_FAST_MASK, %esi // setNo + + // Compute %esi = &VG_(tt_fast)[%esi] + shll $VG_FAST_CACHE_SET_BITS, %esi // setNo * sizeof(FastCacheSet) + leal VG_(tt_fast)(%esi), %esi // &VG_(tt_fast)[setNo] + + // LIVE: %ebp (guest state ptr), %eax (guest addr), %esi (cache set) + // try way 0 + cmpl %eax, FCS_g0(%esi) // cmp against .guest0 + jnz 1f + // hit at way 0 + jmp *FCS_h0(%esi) // goto .host0 + ud2 + +1: // try way 1 + cmpl %eax, FCS_g1(%esi) // cmp against .guest1 + jnz 2f + // hit at way 1; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits1_32) + movl FCS_g0(%esi), %ebx // ebx = old .guest0 + movl FCS_h0(%esi), %ecx // ecx = old .host0 + movl FCS_h1(%esi), %edx // edx = old .host1 + movl %eax, FCS_g0(%esi) // new .guest0 = guest + movl %edx, FCS_h0(%esi) // new .host0 = old .host1 + movl %ebx, FCS_g1(%esi) // new .guest1 = old .guest0 + movl %ecx, FCS_h1(%esi) // new .host1 = old .host0 + jmp *%edx // goto old .host1 a.k.a. 
new .host0 + ud2 + +2: // try way 2 + cmpl %eax, FCS_g2(%esi) // cmp against .guest2 + jnz 3f + // hit at way 2; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits2_32) + movl FCS_g1(%esi), %ebx + movl FCS_h1(%esi), %ecx + movl FCS_h2(%esi), %edx + movl %eax, FCS_g1(%esi) + movl %edx, FCS_h1(%esi) + movl %ebx, FCS_g2(%esi) + movl %ecx, FCS_h2(%esi) + jmp *%edx + ud2 + +3: // try way 3 + cmpl %eax, FCS_g3(%esi) // cmp against .guest3 + jnz 4f + // hit at way 3; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits3_32) + movl FCS_g2(%esi), %ebx + movl FCS_h2(%esi), %ecx + movl FCS_h3(%esi), %edx + movl %eax, FCS_g2(%esi) + movl %edx, FCS_h2(%esi) + movl %ebx, FCS_g3(%esi) + movl %ecx, FCS_h3(%esi) + jmp *%edx + ud2 + +4: // fast lookup failed /* stats only */ - addl $1, VG_(stats__n_xindir_misses_32) + addl $1, VG_(stats__n_xIndir_misses_32) movl $VG_TRC_INNER_FASTMISS, %eax movl $0, %edx |
|
From: Julian S. <se...@so...> - 2019-01-25 08:20:02
|
https://sourceware.org/git/gitweb.cgi?p=valgrind.git;h=50bb127b1df8d31812141aafa567d325d1fbc1b3 commit 50bb127b1df8d31812141aafa567d325d1fbc1b3 Author: Julian Seward <js...@ac...> Date: Fri Jan 25 09:14:56 2019 +0100 Bug 402781 - Redo the cache used to process indirect branch targets. [This commit contains an implementation for all targets except amd64-solaris and x86-solaris, which will be completed shortly.] In the baseline simulator, jumps to guest code addresses that are not known at JIT time have to be looked up in a guest->host mapping table. That means: indirect branches, indirect calls and most commonly, returns. Since there are huge numbers of these (often 10+ million/second) the mapping mechanism needs to be extremely cheap. Currently, this is implemented using a direct-mapped cache, VG_(tt_fast), with 2^15 (guest_addr, host_addr) pairs. This is queried in handwritten assembly in VG_(disp_cp_xindir) in dispatch-<arch>-<os>.S. If there is a miss in the cache then we fall back out to C land, and do a slow lookup using VG_(search_transtab). Given that the size of the translation table(s) in recent years has expanded significantly in order to keep pace with increasing application sizes, two bad things have happened: (1) the cost of a miss in the fast cache has risen significantly, and (2) the miss rate on the fast cache has also increased significantly. This means that large (~ one-million-basic-blocks-JITted) applications that run for a long time end up spending a lot of time in VG_(search_transtab). The proposed fix is to increase associativity of the fast cache, from 1 (direct mapped) to 4. Simulations of various cache configurations using indirect-branch traces from a large application show that this is the best of various configurations. 
In an extreme case with 5.7 billion indirect branches: * The increase of associativity from 1 way to 4 way, whilst keeping the overall cache size the same (32k guest/host pairs), reduces the miss rate by around a factor of 3, from 4.02% to 1.30%. * The use of a slightly better hash function than merely slicing off the bottom 15 bits of the address, reduces the miss rate further, from 1.30% to 0.53%. Overall the VG_(tt_fast) miss rate is almost unchanged on small workloads, but reduced by a factor of up to almost 8 on large workloads. By implementing each (4-entry) cache set using a move-to-front scheme in the case of hits in ways 1, 2 or 3, the vast majority of hits can be made to happen in way 0. Hence the cost of having this extra associativity is almost zero in the case of a hit. The improved hash function costs an extra 2 ALU shots (a shift and an xor) but overall this seems performance neutral to a win. Diff: --- coregrind/m_dispatch/dispatch-amd64-darwin.S | 111 +++++++++++++---- coregrind/m_dispatch/dispatch-amd64-linux.S | 99 ++++++++++++--- coregrind/m_dispatch/dispatch-arm-linux.S | 134 +++++++++++++++----- coregrind/m_dispatch/dispatch-arm64-linux.S | 132 +++++++++++++++----- coregrind/m_dispatch/dispatch-mips32-linux.S | 151 ++++++++++++++++------ coregrind/m_dispatch/dispatch-mips64-linux.S | 151 ++++++++++++++++------ coregrind/m_dispatch/dispatch-ppc32-linux.S | 148 +++++++++++++++++----- coregrind/m_dispatch/dispatch-ppc64be-linux.S | 165 ++++++++++++++++++------ coregrind/m_dispatch/dispatch-ppc64le-linux.S | 173 ++++++++++++++++++++------ coregrind/m_dispatch/dispatch-s390x-linux.S | 161 +++++++++++++++++------- coregrind/m_dispatch/dispatch-x86-darwin.S | 98 ++++++++++++--- coregrind/m_dispatch/dispatch-x86-linux.S | 96 +++++++++++--- coregrind/m_scheduler/scheduler.c | 75 +++++++---- coregrind/m_transtab.c | 104 ++++++++++------ coregrind/pub_core_transtab.h | 101 +++++++++++++-- coregrind/pub_core_transtab_asm.h | 111 ++++++++++++----- 
include/pub_tool_libcbase.h | 1 + 17 files changed, 1544 insertions(+), 467 deletions(-) diff --git a/coregrind/m_dispatch/dispatch-amd64-darwin.S b/coregrind/m_dispatch/dispatch-amd64-darwin.S index d560306..ccf2b91 100644 --- a/coregrind/m_dispatch/dispatch-amd64-darwin.S +++ b/coregrind/m_dispatch/dispatch-amd64-darwin.S @@ -201,33 +201,98 @@ VG_(disp_cp_chain_me_to_fastEP): jmp postamble /* ------ Indirect but boring jump ------ */ -.globl VG_(disp_cp_xindir) +.global VG_(disp_cp_xindir) VG_(disp_cp_xindir): - /* Where are we going? */ - movq OFFSET_amd64_RIP(%rbp), %rax + /* Where are we going? */ + movq OFFSET_amd64_RIP(%rbp), %rax // "guest" /* stats only */ - movabsq $VG_(stats__n_xindirs_32), %r10 - addl $1, (%r10) - - /* try a fast lookup in the translation cache */ - movabsq $VG_(tt_fast), %rcx - movq %rax, %rbx /* next guest addr */ - andq $VG_TT_FAST_MASK, %rbx /* entry# */ - shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */ - movq 0(%rcx,%rbx,1), %r10 /* .guest */ - movq 8(%rcx,%rbx,1), %r11 /* .host */ - cmpq %rax, %r10 - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%r11 - ud2 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: + movabsq $VG_(stats__n_xIndirs_32), %r8 + addl $1, (%r8) + + // LIVE: %rbp (guest state ptr), %rax (guest address to go to). + // We use 5 temporaries: + // %r9 (to point at the relevant FastCacheSet), + // %r10, %r11 and %r12 (scratch). + // %r8 (scratch address) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). 
*/ + + // Compute %r9 = VG_TT_FAST_HASH(guest) + movq %rax, %r9 // guest + shrq $VG_TT_FAST_BITS, %r9 // (guest >> VG_TT_FAST_BITS) + xorq %rax, %r9 // (guest >> VG_TT_FAST_BITS) ^ guest + andq $VG_TT_FAST_MASK, %r9 // setNo + + // Compute %r9 = &VG_(tt_fast)[%r9] + shlq $VG_FAST_CACHE_SET_BITS, %r9 // setNo * sizeof(FastCacheSet) + movabsq $VG_(tt_fast), %r10 // &VG_(tt_fast)[0] + leaq (%r10, %r9), %r9 // &VG_(tt_fast)[setNo] + + // LIVE: %rbp (guest state ptr), %rax (guest addr), %r9 (cache set) + // try way 0 + cmpq %rax, FCS_g0(%r9) // cmp against .guest0 + jnz 1f + // hit at way 0 + jmp *FCS_h0(%r9) // goto .host0 + ud2 + +1: // try way 1 + cmpq %rax, FCS_g1(%r9) // cmp against .guest1 + jnz 2f + // hit at way 1; swap upwards + /* stats only */ + movabsq $VG_(stats__n_xIndir_hits1_32), %r8 + addl $1, (%r8) + movq FCS_g0(%r9), %r10 // r10 = old .guest0 + movq FCS_h0(%r9), %r11 // r11 = old .host0 + movq FCS_h1(%r9), %r12 // r12 = old .host1 + movq %rax, FCS_g0(%r9) // new .guest0 = guest + movq %r12, FCS_h0(%r9) // new .host0 = old .host1 + movq %r10, FCS_g1(%r9) // new .guest1 = old .guest0 + movq %r11, FCS_h1(%r9) // new .host1 = old .host0 + jmp *%r12 // goto old .host1 a.k.a. 
new .host0 + ud2 + +2: // try way 2 + cmpq %rax, FCS_g2(%r9) // cmp against .guest2 + jnz 3f + // hit at way 2; swap upwards + /* stats only */ + movabsq $VG_(stats__n_xIndir_hits2_32), %r8 + addl $1, (%r8) + movq FCS_g1(%r9), %r10 + movq FCS_h1(%r9), %r11 + movq FCS_h2(%r9), %r12 + movq %rax, FCS_g1(%r9) + movq %r12, FCS_h1(%r9) + movq %r10, FCS_g2(%r9) + movq %r11, FCS_h2(%r9) + jmp *%r12 + ud2 + +3: // try way 3 + cmpq %rax, FCS_g3(%r9) // cmp against .guest3 + jnz 4f + // hit at way 3; swap upwards + /* stats only */ + movabsq $VG_(stats__n_xIndir_hits3_32), %r8 + addl $1, (%r8) + movq FCS_g2(%r9), %r10 + movq FCS_h2(%r9), %r11 + movq FCS_h3(%r9), %r12 + movq %rax, FCS_g2(%r9) + movq %r12, FCS_h2(%r9) + movq %r10, FCS_g3(%r9) + movq %r11, FCS_h3(%r9) + jmp *%r12 + ud2 + +4: // fast lookup failed /* stats only */ - movabsq $VG_(stats__n_xindir_misses_32), %r10 - addl $1, (%r10) + movabsq $VG_(stats__n_xIndir_misses_32), %r8 + addl $1, (%r8) movq $VG_TRC_INNER_FASTMISS, %rax movq $0, %rdx diff --git a/coregrind/m_dispatch/dispatch-amd64-linux.S b/coregrind/m_dispatch/dispatch-amd64-linux.S index 62717d3..007c495 100644 --- a/coregrind/m_dispatch/dispatch-amd64-linux.S +++ b/coregrind/m_dispatch/dispatch-amd64-linux.S @@ -205,28 +205,89 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - movq OFFSET_amd64_RIP(%rbp), %rax + movq OFFSET_amd64_RIP(%rbp), %rax // "guest" /* stats only */ - addl $1, VG_(stats__n_xindirs_32) - - /* try a fast lookup in the translation cache */ - movabsq $VG_(tt_fast), %rcx - movq %rax, %rbx /* next guest addr */ - andq $VG_TT_FAST_MASK, %rbx /* entry# */ - shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */ - movq 0(%rcx,%rbx,1), %r10 /* .guest */ - movq 8(%rcx,%rbx,1), %r11 /* .host */ - cmpq %rax, %r10 - jnz fast_lookup_failed - - /* Found a match. Jump to .host. 
*/ - jmp *%r11 - ud2 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: + addl $1, VG_(stats__n_xIndirs_32) + + // LIVE: %rbp (guest state ptr), %rax (guest address to go to). + // We use 4 temporaries: + // %r9 (to point at the relevant FastCacheSet), + // %r10, %r11 and %r12 (scratch). + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute %r9 = VG_TT_FAST_HASH(guest) + movq %rax, %r9 // guest + shrq $VG_TT_FAST_BITS, %r9 // (guest >> VG_TT_FAST_BITS) + xorq %rax, %r9 // (guest >> VG_TT_FAST_BITS) ^ guest + andq $VG_TT_FAST_MASK, %r9 // setNo + + // Compute %r9 = &VG_(tt_fast)[%r9] + shlq $VG_FAST_CACHE_SET_BITS, %r9 // setNo * sizeof(FastCacheSet) + movabsq $VG_(tt_fast), %r10 // &VG_(tt_fast)[0] + leaq (%r10, %r9), %r9 // &VG_(tt_fast)[setNo] + + // LIVE: %rbp (guest state ptr), %rax (guest addr), %r9 (cache set) + // try way 0 + cmpq %rax, FCS_g0(%r9) // cmp against .guest0 + jnz 1f + // hit at way 0 + jmp *FCS_h0(%r9) // goto .host0 + ud2 + +1: // try way 1 + cmpq %rax, FCS_g1(%r9) // cmp against .guest1 + jnz 2f + // hit at way 1; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits1_32) + movq FCS_g0(%r9), %r10 // r10 = old .guest0 + movq FCS_h0(%r9), %r11 // r11 = old .host0 + movq FCS_h1(%r9), %r12 // r12 = old .host1 + movq %rax, FCS_g0(%r9) // new .guest0 = guest + movq %r12, FCS_h0(%r9) // new .host0 = old .host1 + movq %r10, FCS_g1(%r9) // new .guest1 = old .guest0 + movq %r11, FCS_h1(%r9) // new .host1 = old .host0 + jmp *%r12 // goto old .host1 a.k.a. 
new .host0 + ud2 + +2: // try way 2 + cmpq %rax, FCS_g2(%r9) // cmp against .guest2 + jnz 3f + // hit at way 2; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits2_32) + movq FCS_g1(%r9), %r10 + movq FCS_h1(%r9), %r11 + movq FCS_h2(%r9), %r12 + movq %rax, FCS_g1(%r9) + movq %r12, FCS_h1(%r9) + movq %r10, FCS_g2(%r9) + movq %r11, FCS_h2(%r9) + jmp *%r12 + ud2 + +3: // try way 3 + cmpq %rax, FCS_g3(%r9) // cmp against .guest3 + jnz 4f + // hit at way 3; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits3_32) + movq FCS_g2(%r9), %r10 + movq FCS_h2(%r9), %r11 + movq FCS_h3(%r9), %r12 + movq %rax, FCS_g2(%r9) + movq %r12, FCS_h2(%r9) + movq %r10, FCS_g3(%r9) + movq %r11, FCS_h3(%r9) + jmp *%r12 + ud2 + +4: // fast lookup failed /* stats only */ - addl $1, VG_(stats__n_xindir_misses_32) + addl $1, VG_(stats__n_xIndir_misses_32) movq $VG_TRC_INNER_FASTMISS, %rax movq $0, %rdx diff --git a/coregrind/m_dispatch/dispatch-arm-linux.S b/coregrind/m_dispatch/dispatch-arm-linux.S index 3731c2e..b61818c 100644 --- a/coregrind/m_dispatch/dispatch-arm-linux.S +++ b/coregrind/m_dispatch/dispatch-arm-linux.S @@ -154,36 +154,114 @@ VG_(disp_cp_xindir): ldr r0, [r8, #OFFSET_arm_R15T] /* stats only */ - movw r1, #:lower16:vgPlain_stats__n_xindirs_32 - movt r1, #:upper16:vgPlain_stats__n_xindirs_32 - ldr r2, [r1, #0] - add r2, r2, #1 - str r2, [r1, #0] + movw r4, #:lower16:VG_(stats__n_xIndirs_32) + movt r4, #:upper16:VG_(stats__n_xIndirs_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] + + // LIVE: r8 (guest state ptr), r0 (guest address to go to). + // We use 6 temporaries: + // r6 (to point at the relevant FastCacheSet), + // r1, r2, r3 (scratch, for swapping entries within a set) + // r4, r5 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). 
*/ + + // Compute r6 = VG_TT_FAST_HASH(guest) + lsr r6, r0, #1 // g1 = guest >> 1 + eor r6, r6, r6, LSR #VG_TT_FAST_BITS // (g1 >> VG_TT_FAST_BITS) ^ g1 + ubfx r6, r6, #0, #VG_TT_FAST_BITS // setNo - /* try a fast lookup in the translation cache */ - // r0 = next guest, r1,r2,r3,r4 scratch - movw r1, #VG_TT_FAST_MASK // r1 = VG_TT_FAST_MASK + // Compute r6 = &VG_(tt_fast)[r6] movw r4, #:lower16:VG_(tt_fast) - - and r2, r1, r0, LSR #1 // r2 = entry # - movt r4, #:upper16:VG_(tt_fast) // r4 = &VG_(tt_fast) - - add r1, r4, r2, LSL #3 // r1 = &tt_fast[entry#] - - ldrd r4, r5, [r1, #0] // r4 = .guest, r5 = .host - - cmp r4, r0 - - // jump to host if lookup succeeded - bxeq r5 - - /* otherwise the fast lookup failed */ - /* RM ME -- stats only */ - movw r1, #:lower16:vgPlain_stats__n_xindir_misses_32 - movt r1, #:upper16:vgPlain_stats__n_xindir_misses_32 - ldr r2, [r1, #0] - add r2, r2, #1 - str r2, [r1, #0] + movt r4, #:upper16:VG_(tt_fast) + add r6, r4, r6, LSL #VG_FAST_CACHE_SET_BITS // &VG_(tt_fast)[setNo] + + // LIVE: r8 (guest state ptr), r0 (guest addr), r6 (cache set) + // try way 0 + ldr r4, [r6, #FCS_g0] // .guest0 + ldr r5, [r6, #FCS_h0] // .host0 + cmp r4, r0 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + bx r5 + /*NOTREACHED*/ + +1: // try way 1 + ldr r4, [r6, #FCS_g1] + cmp r4, r0 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + ldr r1, [r6, #FCS_g0] // r1 = old .guest0 + ldr r2, [r6, #FCS_h0] // r2 = old .host0 + ldr r3, [r6, #FCS_h1] // r3 = old .host1 + str r0, [r6, #FCS_g0] // new .guest0 = guest + str r3, [r6, #FCS_h0] // new .host0 = old .host1 + str r1, [r6, #FCS_g1] // new .guest1 = old .guest0 + str r2, [r6, #FCS_h1] // new .host1 = old .host0 + // stats only + movw r4, #:lower16:VG_(stats__n_xIndir_hits1_32) + movt r4, #:upper16:VG_(stats__n_xIndir_hits1_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] + // goto old .host1 a.k.a. 
new .host0 + bx r3 + /*NOTREACHED*/ + +2: // try way 2 + ldr r4, [r6, #FCS_g2] + cmp r4, r0 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + ldr r1, [r6, #FCS_g1] + ldr r2, [r6, #FCS_h1] + ldr r3, [r6, #FCS_h2] + str r0, [r6, #FCS_g1] + str r3, [r6, #FCS_h1] + str r1, [r6, #FCS_g2] + str r2, [r6, #FCS_h2] + // stats only + movw r4, #:lower16:VG_(stats__n_xIndir_hits2_32) + movt r4, #:upper16:VG_(stats__n_xIndir_hits2_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] + // goto old .host2 a.k.a. new .host1 + bx r3 + /*NOTREACHED*/ + +3: // try way 3 + ldr r4, [r6, #FCS_g3] + cmp r4, r0 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + ldr r1, [r6, #FCS_g2] + ldr r2, [r6, #FCS_h2] + ldr r3, [r6, #FCS_h3] + str r0, [r6, #FCS_g2] + str r3, [r6, #FCS_h2] + str r1, [r6, #FCS_g3] + str r2, [r6, #FCS_h3] + // stats only + movw r4, #:lower16:VG_(stats__n_xIndir_hits3_32) + movt r4, #:upper16:VG_(stats__n_xIndir_hits3_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] + // goto old .host3 a.k.a. new .host2 + bx r3 + /*NOTREACHED*/ + +4: // fast lookup failed + movw r4, #:lower16:VG_(stats__n_xIndir_misses_32) + movt r4, #:upper16:VG_(stats__n_xIndir_misses_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] mov r1, #VG_TRC_INNER_FASTMISS mov r2, #0 diff --git a/coregrind/m_dispatch/dispatch-arm64-linux.S b/coregrind/m_dispatch/dispatch-arm64-linux.S index ee289fa..554fa9b 100644 --- a/coregrind/m_dispatch/dispatch-arm64-linux.S +++ b/coregrind/m_dispatch/dispatch-arm64-linux.S @@ -173,42 +173,118 @@ VG_(disp_cp_chain_me_to_fastEP): /* ------ Indirect but boring jump ------ */ .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): - /* Where are we going? */ + // Where are we going? 
ldr x0, [x21, #OFFSET_arm64_PC] - /* stats only */ - adrp x1, VG_(stats__n_xindirs_32) - add x1, x1, :lo12:VG_(stats__n_xindirs_32) - ldr w2, [x1, #0] - add w2, w2, #1 - str w2, [x1, #0] - - /* try a fast lookup in the translation cache */ - // x0 = next guest, x1,x2,x3,x4 scratch - mov x1, #VG_TT_FAST_MASK // x1 = VG_TT_FAST_MASK - and x2, x1, x0, LSR #2 // x2 = entry # = (x1 & (x0 >> 2)) - + // stats only + adrp x4, VG_(stats__n_xIndirs_32) + add x4, x4, :lo12:VG_(stats__n_xIndirs_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] + + // LIVE: x21 (guest state ptr), x0 (guest address to go to). + // We use 6 temporaries: + // x6 (to point at the relevant FastCacheSet), + // x1, x2, x3 (scratch, for swapping entries within a set) + // x4, x5 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute x6 = VG_TT_FAST_HASH(guest) + lsr x6, x0, #2 // g2 = guest >> 2 + eor x6, x6, x6, LSR #VG_TT_FAST_BITS // (g2 >> VG_TT_FAST_BITS) ^ g2 + mov x4, #VG_TT_FAST_MASK // VG_TT_FAST_MASK + and x6, x6, x4 // setNo + + // Compute x6 = &VG_(tt_fast)[x6] adrp x4, VG_(tt_fast) - add x4, x4, :lo12:VG_(tt_fast) // x4 = &VG_(tt_fast) - - add x1, x4, x2, LSL #4 // r1 = &tt_fast[entry#] + add x4, x4, :lo12:VG_(tt_fast) // &VG_(tt_fast)[0] + add x6, x4, x6, LSL #VG_FAST_CACHE_SET_BITS // &VG_(tt_fast)[setNo] + + // LIVE: x21 (guest state ptr), x0 (guest addr), x6 (cache set) + // try way 0 + ldp x4, x5, [x6, #FCS_g0] // x4 = .guest0, x5 = .host0 + cmp x4, x0 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + br x5 + /*NOTREACHED*/ - ldp x4, x5, [x1, #0] // x4 = .guest, x5 = .host +1: // try way 1 + ldr x4, [x6, #FCS_g1] + cmp x4, x0 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + ldr x1, [x6, #FCS_g0] // x1 = old .guest0 + ldr x2, [x6, #FCS_h0] // x2 = old .host0 + ldr x3, [x6, #FCS_h1] // x3 = old .host1 + str x0, [x6, #FCS_g0] // new .guest0 = 
guest + str x3, [x6, #FCS_h0] // new .host0 = old .host1 + str x1, [x6, #FCS_g1] // new .guest1 = old .guest0 + str x2, [x6, #FCS_h1] // new .host1 = old .host0 + // stats only + adrp x4, VG_(stats__n_xIndir_hits1_32) + add x4, x4, :lo12:VG_(stats__n_xIndir_hits1_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] + // goto old .host1 a.k.a. new .host0 + br x3 + /*NOTREACHED*/ - cmp x4, x0 +2: // try way 2 + ldr x4, [x6, #FCS_g2] + cmp x4, x0 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + ldr x1, [x6, #FCS_g1] + ldr x2, [x6, #FCS_h1] + ldr x3, [x6, #FCS_h2] + str x0, [x6, #FCS_g1] + str x3, [x6, #FCS_h1] + str x1, [x6, #FCS_g2] + str x2, [x6, #FCS_h2] + // stats only + adrp x4, VG_(stats__n_xIndir_hits2_32) + add x4, x4, :lo12:VG_(stats__n_xIndir_hits2_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] + // goto old .host2 a.k.a. new .host1 + br x3 + /*NOTREACHED*/ - // jump to host if lookup succeeded - bne fast_lookup_failed - br x5 +3: // try way 3 + ldr x4, [x6, #FCS_g3] + cmp x4, x0 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + ldr x1, [x6, #FCS_g2] + ldr x2, [x6, #FCS_h2] + ldr x3, [x6, #FCS_h3] + str x0, [x6, #FCS_g2] + str x3, [x6, #FCS_h2] + str x1, [x6, #FCS_g3] + str x2, [x6, #FCS_h3] + // stats only + adrp x4, VG_(stats__n_xIndir_hits3_32) + add x4, x4, :lo12:VG_(stats__n_xIndir_hits3_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] + // goto old .host3 a.k.a. 
new .host2 + br x3 /*NOTREACHED*/ -fast_lookup_failed: - /* RM ME -- stats only */ - adrp x1, VG_(stats__n_xindir_misses_32) - add x1, x1, :lo12:VG_(stats__n_xindir_misses_32) - ldr w2, [x1, #0] - add w2, w2, #1 - str w2, [x1, #0] +4: // fast lookup failed + adrp x4, VG_(stats__n_xIndir_misses_32) + add x4, x4, :lo12:VG_(stats__n_xIndir_misses_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] mov x1, #VG_TRC_INNER_FASTMISS mov x2, #0 diff --git a/coregrind/m_dispatch/dispatch-mips32-linux.S b/coregrind/m_dispatch/dispatch-mips32-linux.S index 9918403..fdb1e29 100644 --- a/coregrind/m_dispatch/dispatch-mips32-linux.S +++ b/coregrind/m_dispatch/dispatch-mips32-linux.S @@ -175,47 +175,116 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - lw $11, OFFSET_mips32_PC($23) - - lw $13, vgPlain_stats__n_xindirs_32 - addiu $13, $13, 0x1 - sw $13, vgPlain_stats__n_xindirs_32 - - /* try a fast lookup in the translation cache */ - /* t1 = VG_TT_FAST_HASH(addr) * sizeof(ULong*) - = (t8 >> 2 & VG_TT_FAST_MASK) << 3 */ - - move $14, $11 - li $12, VG_TT_FAST_MASK - srl $14, $14, 2 - and $14, $14, $12 - sll $14, $14, 3 - - /* t2 = (addr of VG_(tt_fast)) + t1 */ - la $13, VG_(tt_fast) - addu $13, $13, $14 - - lw $12, 0($13) /* t3 = VG_(tt_fast)[hash] :: ULong* */ - addiu $13, $13, 4 - lw $25, 0($13) /* little-endian, so comparing 1st 32bit word */ - nop - -check: - bne $12, $11, fast_lookup_failed - /* run the translation */ - jr $25 - .long 0x0 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: - /* %PC is up to date */ - /* back out decrement of the dispatch counter */ - /* hold dispatch_ctr in t0 (r8) */ - lw $13, vgPlain_stats__n_xindirs_32 - addiu $13, $13, 0x1 - sw $13, vgPlain_stats__n_xindirs_32 - li $2, VG_TRC_INNER_FASTMISS - li $3, 0 - b postamble + lw $10, OFFSET_mips32_PC($23) + + /* stats only */ + lw $15, VG_(stats__n_xIndirs_32) + addiu $15, $15, 1 + sw $15, 
VG_(stats__n_xIndirs_32) + + // LIVE: r23 (guest state ptr), r10 (guest address to go to). + // We use 6 temporaries: + // r16 (to point at the relevant FastCacheSet), + // r11, r12, r13 (scratch, for swapping entries within a set) + // r14, r15 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute r16 = VG_TT_FAST_HASH(guest) + srl $16, $10, 2 // g2 = guest >> 2 + srl $15, $10, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor $16, $16, $15 // (g2 >> VG_TT_FAST_BITS) ^ g2 + li $15, VG_TT_FAST_MASK + and $16, $16, $15 // setNo + + // Compute r16 = &VG_(tt_fast)[r16] + la $15, VG_(tt_fast) + sll $16, $16, VG_FAST_CACHE_SET_BITS + addu $16, $16, $15 + + // LIVE: r23 (guest state ptr), r10 (guest addr), r16 (cache set) + // try way 0 + lw $14, FCS_g0($16) // .guest0 + lw $15, FCS_h0($16) // .host0 + bne $14, $10, 1f // cmp against .guest0 + // hit at way 0 + // goto .host0 + jr $15 + /*NOTREACHED*/ + .long 0x0 + +1: // try way 1 + lw $14, FCS_g1($16) + bne $14, $10, 2f // cmp against .guest1 + // hit at way 1; swap upwards + lw $11, FCS_g0($16) // $11 = old .guest0 + lw $12, FCS_h0($16) // $12 = old .host0 + lw $13, FCS_h1($16) // $13 = old .host1 + sw $10, FCS_g0($16) // new .guest0 = guest + sw $13, FCS_h0($16) // new .host0 = old .host1 + sw $11, FCS_g1($16) // new .guest1 = old .guest0 + sw $12, FCS_h1($16) // new .host1 = old .host0 + // stats only + lw $15, VG_(stats__n_xIndir_hits1_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits1_32) + // goto old .host1 a.k.a. 
new .host0 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +2: // try way 2 + lw $14, FCS_g2($16) + bne $14, $10, 3f // cmp against .guest2 + // hit at way 2; swap upwards + lw $11, FCS_g1($16) + lw $12, FCS_h1($16) + lw $13, FCS_h2($16) + sw $10, FCS_g1($16) + sw $13, FCS_h1($16) + sw $11, FCS_g2($16) + sw $12, FCS_h2($16) + // stats only + lw $15, VG_(stats__n_xIndir_hits2_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits2_32) + // goto old .host2 a.k.a. new .host1 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +3: // try way 3 + lw $14, FCS_g3($16) + bne $14, $10, 4f // cmp against .guest3 + // hit at way 3; swap upwards + lw $11, FCS_g2($16) + lw $12, FCS_h2($16) + lw $13, FCS_h3($16) + sw $10, FCS_g2($16) + sw $13, FCS_h2($16) + sw $11, FCS_g3($16) + sw $12, FCS_h3($16) + // stats only + lw $15, VG_(stats__n_xIndir_hits3_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits3_32) + // goto old .host3 a.k.a. new .host2 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +4: // fast lookup failed: + /* stats only */ + lw $15, VG_(stats__n_xIndir_misses_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_misses_32) + + li $2, VG_TRC_INNER_FASTMISS + li $3, 0 + b postamble + /*NOTREACHED*/ + .long 0x0 /* ------ Assisted jump ------ */ .global VG_(disp_cp_xassisted) diff --git a/coregrind/m_dispatch/dispatch-mips64-linux.S b/coregrind/m_dispatch/dispatch-mips64-linux.S index 4a2b1b7..5d1efd6 100644 --- a/coregrind/m_dispatch/dispatch-mips64-linux.S +++ b/coregrind/m_dispatch/dispatch-mips64-linux.S @@ -182,47 +182,116 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? 
*/ - ld $11, OFFSET_mips64_PC($23) - - lw $13, vgPlain_stats__n_xindirs_32 - addiu $13, $13, 0x1 - sw $13, vgPlain_stats__n_xindirs_32 - - /* try a fast lookup in the translation cache */ - /* t1 = VG_TT_FAST_HASH(addr) * sizeof(ULong*) - = (t8 >> 2 & VG_TT_FAST_MASK) << 3 */ - - move $14, $11 - li $12, VG_TT_FAST_MASK - srl $14, $14, 2 - and $14, $14, $12 - sll $14, $14, 3 - - /* t2 = (addr of VG_(tt_fast)) + t1 */ - dla $13, VG_(tt_fast) - daddu $13, $13, $14 - - ld $12, 0($13) /* t3 = VG_(tt_fast)[hash] :: ULong* */ - daddiu $13, $13, 8 - ld $25, 0($13) /* little-endian, so comparing 1st 32bit word */ - nop - -check: - bne $12, $11, fast_lookup_failed - /* run the translation */ - jr $25 - .long 0x0 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: - /* %PC is up to date */ - /* back out decrement of the dispatch counter */ - /* hold dispatch_ctr in t0 (r8) */ - lw $13, vgPlain_stats__n_xindirs_32 - addiu $13, $13, 0x1 - sw $13, vgPlain_stats__n_xindirs_32 - li $2, VG_TRC_INNER_FASTMISS - li $3, 0 - b postamble + ld $10, OFFSET_mips64_PC($23) + + /* stats only */ + lw $15, VG_(stats__n_xIndirs_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndirs_32) + + // LIVE: r23 (guest state ptr), r10 (guest address to go to). + // We use 6 temporaries: + // r16 (to point at the relevant FastCacheSet), + // r11, r12, r13 (scratch, for swapping entries within a set) + // r14, r15 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). 
*/ + + // Compute r16 = VG_TT_FAST_HASH(guest) + dsrl $16, $10, 2 // g2 = guest >> 2 + dsrl $15, $10, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor $16, $16, $15 // (g2 >> VG_TT_FAST_BITS) ^ g2 + li $15, VG_TT_FAST_MASK + and $16, $16, $15 // setNo + + // Compute r16 = &VG_(tt_fast)[r16] + dla $15, VG_(tt_fast) + dsll $16, $16, VG_FAST_CACHE_SET_BITS + daddu $16, $16, $15 + + // LIVE: r23 (guest state ptr), r10 (guest addr), r16 (cache set) + // try way 0 + ld $14, FCS_g0($16) // .guest0 + ld $15, FCS_h0($16) // .host0 + bne $14, $10, 1f // cmp against .guest0 + // hit at way 0 + // goto .host0 + jr $15 + /*NOTREACHED*/ + .long 0x0 + +1: // try way 1 + ld $14, FCS_g1($16) + bne $14, $10, 2f // cmp against .guest1 + // hit at way 1; swap upwards + ld $11, FCS_g0($16) // $11 = old .guest0 + ld $12, FCS_h0($16) // $12 = old .host0 + ld $13, FCS_h1($16) // $13 = old .host1 + sd $10, FCS_g0($16) // new .guest0 = guest + sd $13, FCS_h0($16) // new .host0 = old .host1 + sd $11, FCS_g1($16) // new .guest1 = old .guest0 + sd $12, FCS_h1($16) // new .host1 = old .host0 + // stats only + lw $15, VG_(stats__n_xIndir_hits1_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits1_32) + // goto old .host1 a.k.a. new .host0 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +2: // try way 2 + ld $14, FCS_g2($16) + bne $14, $10, 3f // cmp against .guest2 + // hit at way 2; swap upwards + ld $11, FCS_g1($16) + ld $12, FCS_h1($16) + ld $13, FCS_h2($16) + sd $10, FCS_g1($16) + sd $13, FCS_h1($16) + sd $11, FCS_g2($16) + sd $12, FCS_h2($16) + // stats only + lw $15, VG_(stats__n_xIndir_hits2_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits2_32) + // goto old .host2 a.k.a. 
new .host1 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +3: // try way 3 + ld $14, FCS_g3($16) + bne $14, $10, 4f // cmp against .guest3 + // hit at way 3; swap upwards + ld $11, FCS_g2($16) + ld $12, FCS_h2($16) + ld $13, FCS_h3($16) + sd $10, FCS_g2($16) + sd $13, FCS_h2($16) + sd $11, FCS_g3($16) + sd $12, FCS_h3($16) + // stats only + lw $15, VG_(stats__n_xIndir_hits3_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits3_32) + // goto old .host3 a.k.a. new .host2 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +4: // fast lookup failed: + /* stats only */ + lw $15, VG_(stats__n_xIndir_misses_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_misses_32) + + li $2, VG_TRC_INNER_FASTMISS + li $3, 0 + b postamble + /*NOTREACHED*/ + .long 0x0 /* ------ Assisted jump ------ */ .global VG_(disp_cp_xassisted) diff --git a/coregrind/m_dispatch/dispatch-ppc32-linux.S b/coregrind/m_dispatch/dispatch-ppc32-linux.S index 432306b..d3ff2d1 100644 --- a/coregrind/m_dispatch/dispatch-ppc32-linux.S +++ b/coregrind/m_dispatch/dispatch-ppc32-linux.S @@ -437,44 +437,128 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - lwz 3,OFFSET_ppc32_CIA(31) + lwz 20, OFFSET_ppc32_CIA(31) /* stats only */ - lis 5,VG_(stats__n_xindirs_32)@ha - addi 5,5,VG_(stats__n_xindirs_32)@l - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) + lis 24, VG_(stats__n_xIndirs_32)@ha + addi 24, 24, VG_(stats__n_xIndirs_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + + // LIVE: r31 (guest state ptr), r20 (guest address to go to). + // We use 6 temporaries: + // r26 (to point at the relevant FastCacheSet), + // r21, r22, r23 (scratch, for swapping entries within a set) + // r24, r25 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). 
*/ + + // Compute r26 = VG_TT_FAST_HASH(guest) + srwi 26, 20, 2 // g2 = guest >> 2 + srwi 25, 20, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor 26, 26, 25 // (g2 >> VG_TT_FAST_BITS) ^ g2 + andi. 26, 26, VG_TT_FAST_MASK // setNo - /* r5 = &VG_(tt_fast) */ - lis 5,VG_(tt_fast)@ha - addi 5,5,VG_(tt_fast)@l /* & VG_(tt_fast) */ - - /* try a fast lookup in the translation cache */ - /* r4 = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry) - = ((r3 >>u 2) & VG_TT_FAST_MASK) << 3 */ - rlwinm 4,3,1, 29-VG_TT_FAST_BITS, 28 /* entry# * 8 */ - add 5,5,4 /* & VG_(tt_fast)[entry#] */ - lwz 6,0(5) /* .guest */ - lwz 7,4(5) /* .host */ - cmpw 3,6 - bne fast_lookup_failed - - /* Found a match. Jump to .host. */ - mtctr 7 + // Compute r26 = &VG_(tt_fast)[r26] + lis 25, VG_(tt_fast)@ha + addi 25, 25, VG_(tt_fast)@l + slwi 26, 26, VG_FAST_CACHE_SET_BITS + add 26, 26, 25 + + // LIVE: r31 (guest state ptr), r20 (guest addr), r26 (cache set) + // try way 0 + lwz 24, FCS_g0(26) // .guest0 + lwz 25, FCS_h0(26) // .host0 + cmpw 24, 20 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + mtctr 25 bctr + /*NOTREACHED*/ + +1: // try way 1 + lwz 24, FCS_g1(26) + cmpw 24, 20 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + lwz 21, FCS_g0(26) // 21 = old .guest0 + lwz 22, FCS_h0(26) // 22 = old .host0 + lwz 23, FCS_h1(26) // 23 = old .host1 + stw 20, FCS_g0(26) // new .guest0 = guest + stw 23, FCS_h0(26) // new .host0 = old .host1 + stw 21, FCS_g1(26) // new .guest1 = old .guest0 + stw 22, FCS_h1(26) // new .host1 = old .host0 + // stats only + lis 24, VG_(stats__n_xIndir_hits1_32)@ha + addi 24, 24, VG_(stats__n_xIndir_hits1_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host1 a.k.a.
new .host0 + mtctr 23 + bctr + /*NOTREACHED*/ + +2: // try way 2 + lwz 24, FCS_g2(26) + cmpw 24, 20 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + lwz 21, FCS_g1(26) + lwz 22, FCS_h1(26) + lwz 23, FCS_h2(26) + stw 20, FCS_g1(26) + stw 23, FCS_h1(26) + stw 21, FCS_g2(26) + stw 22, FCS_h2(26) + // stats only + lis 24, VG_(stats__n_xIndir_hits2_32)@ha + addi 24, 24, VG_(stats__n_xIndir_hits2_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host2 a.k.a. new .host1 + mtctr 23 + bctr + /*NOTREACHED*/ + +3: // try way 3 + lwz 24, FCS_g3(26) + cmpw 24, 20 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + lwz 21, FCS_g2(26) + lwz 22, FCS_h2(26) + lwz 23, FCS_h3(26) + stw 20, FCS_g2(26) + stw 23, FCS_h2(26) + stw 21, FCS_g3(26) + stw 22, FCS_h3(26) + // stats only + lis 24, VG_(stats__n_xIndir_hits3_32)@ha + addi 24, 24, VG_(stats__n_xIndir_hits3_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host3 a.k.a. new .host2 + mtctr 23 + bctr + /*NOTREACHED*/ -fast_lookup_failed: +4: // fast lookup failed: /* stats only */ - lis 5,VG_(stats__n_xindir_misses_32)@ha - addi 5,5,VG_(stats__n_xindir_misses_32)@l - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - li 6,VG_TRC_INNER_FASTMISS - li 7,0 - b postamble + lis 24, VG_(stats__n_xIndir_misses_32)@ha + addi 24, 24, VG_(stats__n_xIndir_misses_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + + li 6, VG_TRC_INNER_FASTMISS + li 7, 0 + b postamble /*NOTREACHED*/ /* ------ Assisted jump ------ */ diff --git a/coregrind/m_dispatch/dispatch-ppc64be-linux.S b/coregrind/m_dispatch/dispatch-ppc64be-linux.S index 91bd3b2..c5592d4 100644 --- a/coregrind/m_dispatch/dispatch-ppc64be-linux.S +++ b/coregrind/m_dispatch/dispatch-ppc64be-linux.S @@ -45,14 +45,27 @@ .type vgPlain_tt_fast, @object */ .section ".toc","aw" + .tocent__vgPlain_tt_fast: .tc vgPlain_tt_fast[TC],vgPlain_tt_fast -.tocent__vgPlain_stats__n_xindirs_32: - .tc 
vgPlain_stats__n_xindirs_32[TC],vgPlain_stats__n_xindirs_32 -.tocent__vgPlain_stats__n_xindir_misses_32: - .tc vgPlain_stats__n_xindir_misses_32[TC],vgPlain_stats__n_xindir_misses_32 + +.tocent__vgPlain_stats__n_xIndirs_32: + .tc vgPlain_stats__n_xIndirs_32[TC], vgPlain_stats__n_xIndirs_32 + +.tocent__vgPlain_stats__n_xIndir_hits1_32: + .tc vgPlain_stats__n_xIndir_hits1_32[TC], vgPlain_stats__n_xIndir_hits1_32 + +.tocent__vgPlain_stats__n_xIndir_hits2_32: + .tc vgPlain_stats__n_xIndir_hits2_32[TC], vgPlain_stats__n_xIndir_hits2_32 + +.tocent__vgPlain_stats__n_xIndir_hits3_32: + .tc vgPlain_stats__n_xIndir_hits3_32[TC], vgPlain_stats__n_xIndir_hits3_32 + +.tocent__vgPlain_stats__n_xIndir_misses_32: + .tc vgPlain_stats__n_xIndir_misses_32[TC], vgPlain_stats__n_xIndir_misses_32 + .tocent__vgPlain_machine_ppc64_has_VMX: - .tc vgPlain_machine_ppc64_has_VMX[TC],vgPlain_machine_ppc64_has_VMX + .tc vgPlain_machine_ppc64_has_VMX[TC], vgPlain_machine_ppc64_has_VMX /*------------------------------------------------------------*/ /*--- ---*/ @@ -454,42 +467,122 @@ VG_(disp_cp_xindir): .globl .VG_(disp_cp_xindir) .VG_(disp_cp_xindir): /* Where are we going? */ - ld 3,OFFSET_ppc64_CIA(31) + ld 20, OFFSET_ppc64_CIA(31) /* stats only */ - ld 5, .tocent__vgPlain_stats__n_xindirs_32@toc(2) - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - /* r5 = &VG_(tt_fast) */ - ld 5, .tocent__vgPlain_tt_fast@toc(2) /* &VG_(tt_fast) */ - - /* try a fast lookup in the translation cache */ - /* r4 = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry) - = ((r3 >>u 2) & VG_TT_FAST_MASK) << 4 */ - rldicl 4,3, 62, 64-VG_TT_FAST_BITS /* entry# */ - sldi 4,4,4 /* entry# * sizeof(FastCacheEntry) */ - add 5,5,4 /* & VG_(tt_fast)[entry#] */ - ld 6,0(5) /* .guest */ - ld 7,8(5) /* .host */ - cmpd 3,6 - bne .fast_lookup_failed - - /* Found a match. Jump to .host. 
*/ - mtctr 7 + ld 24, .tocent__vgPlain_stats__n_xIndirs_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + + // LIVE: r31 (guest state ptr), r20 (guest address to go to). + // We use 6 temporaries: + // r26 (to point at the relevant FastCacheSet), + // r21, r22, r23 (scratch, for swapping entries within a set) + // r24, r25 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute r26 = VG_TT_FAST_HASH(guest) + srdi 26, 20, 2 // g2 = guest >> 2 + srdi 25, 20, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor 26, 26, 25 // (g2 >> VG_TT_FAST_BITS) ^ g2 + andi. 26, 26, VG_TT_FAST_MASK // setNo + + // Compute r26 = &VG_(tt_fast)[r26] + ld 25, .tocent__vgPlain_tt_fast@toc(2) + sldi 26, 26, VG_FAST_CACHE_SET_BITS + add 26, 26, 25 + + // LIVE: r31 (guest state ptr), r20 (guest addr), r26 (cache set) + // try way 0 + ld 24, FCS_g0(26) // .guest0 + ld 25, FCS_h0(26) // .host0 + cmpd 24, 20 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + mtctr 25 bctr + /*NOTREACHED*/ + +1: // try way 1 + ld 24, FCS_g1(26) + cmpd 24, 20 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + ld 21, FCS_g0(26) // 21 = old .guest0 + ld 22, FCS_h0(26) // 22 = old .host0 + ld 23, FCS_h1(26) // 23 = old .host1 + std 20, FCS_g0(26) // new .guest0 = guest + std 23, FCS_h0(26) // new .host0 = old .host1 + std 21, FCS_g1(26) // new .guest1 = old .guest0 + std 22, FCS_h1(26) // new .host1 = old .host0 + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits1_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host1 a.k.a.
new .host0 + mtctr 23 + bctr + /*NOTREACHED*/ + +2: // try way 2 + ld 24, FCS_g2(26) + cmpd 24, 20 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + ld 21, FCS_g1(26) + ld 22, FCS_h1(26) + ld 23, FCS_h2(26) + std 20, FCS_g1(26) + std 23, FCS_h1(26) + std 21, FCS_g2(26) + std 22, FCS_h2(26) + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits2_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host2 a.k.a. new .host1 + mtctr 23 + bctr + /*NOTREACHED*/ + +3: // try way 3 + ld 24, FCS_g3(26) + cmpd 24, 20 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + ld 21, FCS_g2(26) + ld 22, FCS_h2(26) + ld 23, FCS_h3(26) + std 20, FCS_g2(26) + std 23, FCS_h2(26) + std 21, FCS_g3(26) + std 22, FCS_h3(26) + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits3_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host3 a.k.a. new .host2 + mtctr 23 + bctr + /*NOTREACHED*/ -.fast_lookup_failed: +4: // fast lookup failed: /* stats only */ - ld 5, .tocent__vgPlain_stats__n_xindir_misses_32@toc(2) - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - li 6,VG_TRC_INNER_FASTMISS - li 7,0 - b .postamble + ld 24, .tocent__vgPlain_stats__n_xIndir_misses_32@toc(2) + lwz 25, 0(24) + addi 25 ,25, 1 + stw 25 ,0(24) + + li 6,VG_TRC_INNER_FASTMISS + li 7,0 + b .postamble /*NOTREACHED*/ /* ------ Assisted jump ------ */ diff --git a/coregrind/m_dispatch/dispatch-ppc64le-linux.S b/coregrind/m_dispatch/dispatch-ppc64le-linux.S index 21e4358..3e26d77 100644 --- a/coregrind/m_dispatch/dispatch-ppc64le-linux.S +++ b/coregrind/m_dispatch/dispatch-ppc64le-linux.S @@ -54,14 +54,27 @@ .type vgPlain_tt_fast, @object */ .section ".toc","aw" + .tocent__vgPlain_tt_fast: .tc vgPlain_tt_fast[TC],vgPlain_tt_fast -.tocent__vgPlain_stats__n_xindirs_32: - .tc vgPlain_stats__n_xindirs_32[TC],vgPlain_stats__n_xindirs_32 -.tocent__vgPlain_stats__n_xindir_misses_32: - .tc 
vgPlain_stats__n_xindir_misses_32[TC],vgPlain_stats__n_xindir_misses_32 + +.tocent__vgPlain_stats__n_xIndirs_32: + .tc vgPlain_stats__n_xIndirs_32[TC], vgPlain_stats__n_xIndirs_32 + +.tocent__vgPlain_stats__n_xIndir_hits1_32: + .tc vgPlain_stats__n_xIndir_hits1_32[TC], vgPlain_stats__n_xIndir_hits1_32 + +.tocent__vgPlain_stats__n_xIndir_hits2_32: + .tc vgPlain_stats__n_xIndir_hits2_32[TC], vgPlain_stats__n_xIndir_hits2_32 + +.tocent__vgPlain_stats__n_xIndir_hits3_32: + .tc vgPlain_stats__n_xIndir_hits3_32[TC], vgPlain_stats__n_xIndir_hits3_32 + +.tocent__vgPlain_stats__n_xIndir_misses_32: + .tc vgPlain_stats__n_xIndir_misses_32[TC], vgPlain_stats__n_xIndir_misses_32 + .tocent__vgPlain_machine_ppc64_has_VMX: - .tc vgPlain_machine_ppc64_has_VMX[TC],vgPlain_machine_ppc64_has_VMX + .tc vgPlain_machine_ppc64_has_VMX[TC], vgPlain_machine_ppc64_has_VMX /*------------------------------------------------------------*/ /*--- ---*/ @@ -518,47 +531,127 @@ VG_(disp_cp_xindir): addi 2,2,.TOC.-0b@l .localentry VG_(disp_cp_xindir), .-VG_(disp_cp_xindir) #endif - /* Where are we going? */ - ld 3,OFFSET_ppc64_CIA(31) + /* Where are we going? */ + ld 20, OFFSET_ppc64_CIA(31) /* stats only */ - ld 5, .tocent__vgPlain_stats__n_xindirs_32@toc(2) - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - /* r5 = &VG_(tt_fast) */ - ld 5, .tocent__vgPlain_tt_fast@toc(2) /* &VG_(tt_fast) */ - - /* try a fast lookup in the translation cache */ - /* r4 = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry) - = ((r3 >>u 2) & VG_TT_FAST_MASK) << 4 */ - rldicl 4,3, 62, 64-VG_TT_FAST_BITS /* entry# */ - sldi 4,4,4 /* entry# * sizeof(FastCacheEntry) */ - add 5,5,4 /* & VG_(tt_fast)[entry#] */ - ld 6,0(5) /* .guest */ - ld 7,8(5) /* .host */ - cmpd 3,6 - bne .fast_lookup_failed - - /* Found a match. Jump to .host. */ - mtctr 7 + ld 24, .tocent__vgPlain_stats__n_xIndirs_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + + // LIVE: r31 (guest state ptr), r20 (guest address to go to). 
+ // We use 6 temporaries: + // r26 (to point at the relevant FastCacheSet), + // r21, r22, r23 (scratch, for swapping entries within a set) + // r24, r25 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute r26 = VG_TT_FAST_HASH(guest) + srdi 26, 20, 2 // g2 = guest >> 2 + srdi 25, 20, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor 26, 26, 25 // (g2 >> VG_TT_FAST_BITS) ^ g2 + andi. 26, 26, VG_TT_FAST_MASK // setNo + + // Compute r26 = &VG_(tt_fast)[r26] + ld 25, .tocent__vgPlain_tt_fast@toc(2) + sldi 26, 26, VG_FAST_CACHE_SET_BITS + add 26, 26, 25 + + // LIVE: r31 (guest state ptr), r20 (guest addr), r26 (cache set) + // try way 0 + ld 24, FCS_g0(26) // .guest0 + ld 25, FCS_h0(26) // .host0 + cmpd 24, 20 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + mtctr 25 bctr -#if _CALL_ELF == 2 - .size VG_(disp_cp_xindir),.-VG_(disp_cp_xindir) -#endif + /*NOTREACHED*/ + +1: // try way 1 + ld 24, FCS_g1(26) + cmpd 24, 20 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + ld 21, FCS_g0(26) // 21 = old .guest0 + ld 22, FCS_h0(26) // 22 = old .host0 + ld 23, FCS_h1(26) // 23 = old .host1 + std 20, FCS_g0(26) // new .guest0 = guest + std 23, FCS_h0(26) // new .host0 = old .host1 + std 21, FCS_g1(26) // new .guest1 = old .guest0 + std 22, FCS_h1(26) // new .host1 = old .host0 + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits1_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host1 a.k.a.
new .host0 + mtctr 23 + bctr + /*NOTREACHED*/ + +2: // try way 2 + ld 24, FCS_g2(26) + cmpd 24, 20 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + ld 21, FCS_g1(26) + ld 22, FCS_h1(26) + ld 23, FCS_h2(26) + std 20, FCS_g1(26) + std 23, FCS_h1(26) + std 21, FCS_g2(26) + std 22, FCS_h2(26) + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits2_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host2 a.k.a. new .host1 + mtctr 23 + bctr + /*NOTREACHED*/ + +3: // try way 3 + ld 24, FCS_g3(26) + cmpd 24, 20 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + ld 21, FCS_g2(26) + ld 22, FCS_h2(26) + ld 23, FCS_h3(26) + std 20, FCS_g2(26) + std 23, FCS_h2(26) + std 21, FCS_g3(26) + std 22, FCS_h3(26) + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits3_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host3 a.k.a. new .host2 + mtctr 23 + bctr + /*NOTREACHED*/ -.fast_lookup_failed: +4: // fast lookup failed: /* stats only */ - ld 5, .tocent__vgPlain_stats__n_xindir_misses_32@toc(2) - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - li 6,VG_TRC_INNER_FASTMISS - li 7,0 - b .postamble + ld 24, .tocent__vgPlain_stats__n_xIndir_misses_32@toc(2) + lwz 25, 0(24) + addi 25 ,25, 1 + stw 25 ,0(24) + + li 6,VG_TRC_INNER_FASTMISS + li 7,0 + b .postamble /*NOTREACHED*/ +#if _CALL_ELF == 2 + .size VG_(disp_cp_xindir),.-VG_(disp_cp_xindir) +#endif /* ------ Assisted jump ------ */ .section ".text" diff --git a/coregrind/m_dispatch/dispatch-s390x-linux.S b/coregrind/m_dispatch/dispatch-s390x-linux.S index 83c2e2a..c31e32a 100644 --- a/coregrind/m_dispatch/dispatch-s390x-linux.S +++ b/coregrind/m_dispatch/dispatch-s390x-linux.S @@ -197,54 +197,121 @@ VG_(disp_cp_chain_me_to_fastEP): /* ------ Indirect but boring jump ------ */ .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): - /* Where are we going? 
*/ - lg %r2, OFFSET_s390x_IA(%r13) - - /* Increment VG_(stats__n_xindirs_32) */ - larl %r8, VG_(stats__n_xindirs_32) - l %r10,0(%r8) - ahi %r10,1 - st %r10,0(%r8) - - /* Try a fast lookup in the translation cache: - Compute offset (not index) into VT_(tt_fast): - - offset = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry) - - with VG_TT_FAST_HASH(addr) == (addr >> 1) & VG_TT_FAST_MASK - and sizeof(FastCacheEntry) == 16 - - offset = ((addr >> 1) & VG_TT_FAST_MASK) << 4 - which is - offset = ((addr & (VG_TT_FAST_MASK << 1) ) << 3 - */ - larl %r8, VG_(tt_fast) - llill %r5,(VG_TT_FAST_MASK << 1) & 0xffff -#if ((( VG_TT_FAST_MASK << 1) & 0xffff0000) >> 16 != 0) - iilh %r5,((VG_TT_FAST_MASK << 1) & 0xffff0000) >> 16 -#endif - ngr %r5,%r2 - sllg %r7,%r5,3 - lg %r11, 8(%r8,%r7) /* .host */ - cg %r2, 0(%r8,%r7) /* next guest address == .guest ? */ - jne fast_lookup_failed - - /* Found a match. Call .host. - r11 is an address. There we will find the instrumented client code. - That code may modify the guest state register r13. */ - br %r11 - .long 0x0 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: - /* Increment VG_(stats__n_xindir_misses_32) */ - larl %r8, VG_(stats__n_xindir_misses_32) - l %r10,0(%r8) - ahi %r10,1 - st %r10,0(%r8) - - lghi %r0,VG_TRC_INNER_FASTMISS - lghi %r1,0 + /* Where are we going? */ + lg %r6, OFFSET_s390x_IA(%r13) // "guest" + + /* stats only */ + larl %r11, VG_(stats__n_xIndirs_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + + // LIVE: r13 (guest state ptr), r6 (guest address to go to). + // We use 6 temporaries: + // r7 (to point at the relevant FastCacheSet), + // r8, r9, r10 (scratch, for swapping entries within a set) + // r11, r12 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). 
*/ + + // Compute %r7 = VG_TT_FAST_HASH(guest) + srlg %r7, %r6, 1 // g1 = guest >> 1 + srlg %r8, %r6, (VG_TT_FAST_BITS + 1) // (g1 >> VG_TT_FAST_BITS) + xgr %r7, %r8 // (g1 >> VG_TT_FAST_BITS) ^ g1 + llill %r8, VG_TT_FAST_MASK & 0xffff +# if ((VG_TT_FAST_MASK & 0xffff0000) >> 16 != 0) + iilh %r8, (VG_TT_FAST_MASK & 0xffff0000) >> 16 +# endif + ngr %r7, %r8 // setNo + + // Compute %r7 = &VG_(tt_fast)[%r7] + sllg %r7,%r7, VG_FAST_CACHE_SET_BITS // setNo * sizeof(FastCacheSet) + larl %r8, VG_(tt_fast) // &VG_(tt_fast)[0] + agr %r7, %r8 // &VG_(tt_fast)[setNo] + + // LIVE: %r13 (guest state ptr), %r6 (guest addr), %r7 (cache set) + // try way 0 + cg %r6, FCS_g0(%r7) // cmp against .guest0 + lg %r8, FCS_h0(%r7) + jne 1f + // hit at way 0 + // goto .host0 + br %r8 + /*NOTREACHED*/ + .long 0 + +1: // try way 1 + cg %r6, FCS_g1(%r7) // cmp against .guest1 + jne 2f + // hit at way 1; swap upwards + lg %r8, FCS_g0(%r7) // r8 = old .guest0 + lg %r9, FCS_h0(%r7) // r9 = old .host0 + lg %r10, FCS_h1(%r7) // r10 = old .host1 + stg %r6, FCS_g0(%r7) // new .guest0 = guest + stg %r10, FCS_h0(%r7) // new .host0 = old .host1 + stg %r8, FCS_g1(%r7) // new .guest1 = old .guest0 + stg %r9, FCS_h1(%r7) // new .host1 = old .host0 + // stats only + larl %r11, VG_(stats__n_xIndir_hits1_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + // goto old .host1 a.k.a. new .host0 + br %r10 + /*NOTREACHED*/ + .long 0 + +2: // try way 2 + cg %r6, FCS_g2(%r7) // cmp against .guest2 + jne 3f + lg %r8, FCS_g1(%r7) + lg %r9, FCS_h1(%r7) + lg %r10, FCS_h2(%r7) + stg %r6, FCS_g1(%r7) + stg %r10, FCS_h1(%r7) + stg %r8, FCS_g2(%r7) + stg %r9, FCS_h2(%r7) + // stats only + larl %r11, VG_(stats__n_xIndir_hits2_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + // goto old .host2 a.k.a. 
new .host1 + br %r10 + /*NOTREACHED*/ + .long 0 + +3: // try way 3 + cg %r6, FCS_g3(%r7) // cmp against .guest3 + jne 4f + // hit at way 3; swap upwards + lg %r8, FCS_g2(%r7) + lg %r9, FCS_h2(%r7) + lg %r10, FCS_h3(%r7) + stg %r6, FCS_g2(%r7) + stg %r10, FCS_h2(%r7) + stg %r8, FCS_g3(%r7) + stg %r9, FCS_h3(%r7) + // stats only + larl %r11, VG_(stats__n_xIndir_hits3_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + // goto old .host3 a.k.a. new .host2 + br %r10 + .long 0 + +4: // fast lookup failed + larl %r11, VG_(stats__n_xIndir_misses_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + + lghi %r0, VG_TRC_INNER_FASTMISS + lghi %r1, 0 j postamble + /*NOTREACHED*/ /* ------ Assisted jump ------ */ diff --git a/coregrind/m_dispatch/dispatch-x86-darwin.S b/coregrind/m_dispatch/dispatch-x86-darwin.S index 55188e9..467d7d6 100644 --- a/coregrind/m_dispatch/dispatch-x86-darwin.S +++ b/coregrind/m_dispatch/dispatch-x86-darwin.S @@ -194,29 +19... [truncated message content] |