|
From: <sv...@va...> - 2012-04-02 21:56:12
|
sewardj 2012-04-02 22:56:03 +0100 (Mon, 02 Apr 2012)
New Revision: 12484
Log:
Add translation chaining support for amd64, x86 and ARM
(Valgrind side). See #296422.
Added files:
branches/TCHAIN/docs/internals/t-chaining-notes.txt
Modified files:
branches/TCHAIN/Makefile.all.am
branches/TCHAIN/coregrind/m_dispatch/dispatch-amd64-linux.S
branches/TCHAIN/coregrind/m_dispatch/dispatch-arm-linux.S
branches/TCHAIN/coregrind/m_dispatch/dispatch-x86-linux.S
branches/TCHAIN/coregrind/m_errormgr.c
branches/TCHAIN/coregrind/m_gdbserver/server.c
branches/TCHAIN/coregrind/m_libcproc.c
branches/TCHAIN/coregrind/m_main.c
branches/TCHAIN/coregrind/m_scheduler/scheduler.c
branches/TCHAIN/coregrind/m_translate.c
branches/TCHAIN/coregrind/m_transtab.c
branches/TCHAIN/coregrind/m_xarray.c
branches/TCHAIN/coregrind/pub_core_dispatch.h
branches/TCHAIN/coregrind/pub_core_dispatch_asm.h
branches/TCHAIN/coregrind/pub_core_libcproc.h
branches/TCHAIN/coregrind/pub_core_translate.h
branches/TCHAIN/coregrind/pub_core_transtab.h
branches/TCHAIN/coregrind/pub_core_transtab_asm.h
branches/TCHAIN/docs/Makefile.am
branches/TCHAIN/drd/drd_load_store.c
branches/TCHAIN/drd/tests/unit_bitmap.c
branches/TCHAIN/helgrind/hg_main.c
branches/TCHAIN/include/pub_tool_xarray.h
branches/TCHAIN/memcheck/tests/Makefile.am
branches/TCHAIN/memcheck/tests/unit_oset.c
branches/TCHAIN/none/tests/arm/Makefile.am
Added: branches/TCHAIN/docs/internals/t-chaining-notes.txt (+201 -0)
===================================================================
--- branches/TCHAIN/docs/internals/t-chaining-notes.txt 2012-04-02 22:25:14 +01:00 (rev 12483)
+++ branches/TCHAIN/docs/internals/t-chaining-notes.txt 2012-04-02 22:56:03 +01:00 (rev 12484)
@@ -0,0 +1,201 @@
+
+DO NOT MERGE
+~~~~~~~~~~~~
+
+Changes memcheck/tests/Makefile.am w.r.t. -mfloat-abi=softfp
+Ditto none/tests/arm/Makefile.am
+
+
+Verification todo
+~~~~~~~~~~~~~~~~~
+check that illegal insns on all targets don't cause the _toIR.c's to
+assert.
+
+check also with --vex-guest-chase-cond=yes
+
+check that all targets can run their insn set tests with
+--vex-guest-max-insns=1.
+
+
+Cleanups
+~~~~~~~~
+host_arm_isel.c and host_arm_defs.c: get rid of global var arm_hwcaps.
+
+host_x86_defs.c, host_amd64_defs.c: return proper VexInvalRange
+records from the patchers, instead of {0,0}, so that transparent
+self hosting works properly.
+
+
+Optimisations
+~~~~~~~~~~~~~
+all targets: change VG_(stats__n_xindirs) to a 32 bit counter, and
+empty out every now and again.
+
+amd64: XDirect: write const value to guest_RIP using single
+insn when the value is < 0x8000'0000
+
+arm: chain_XDirect: generate short form jumps when possible
+
+arm codegen: Generate ORRS for CmpwNEZ32(Or32(x,y))
+
+all targets: when nuking an entire sector, don't bother to undo the
+patching for any translations within the sector (nor with their
+invalidations).
+
+(somewhat implausible) for jumps to disp_cp_indir, have multiple
+copies of disp_cp_indir, one for each of the possible registers that
+could have held the target guest address before jumping to the stub.
+Then disp_cp_indir wouldn't have to reload it from memory each time.
+Might also have the effect of spreading out the indirect mispredict
+burden somewhat (across the multiple copies.)
+
+
+Implementation notes
+~~~~~~~~~~~~~~~~~~~~
+T-chaining changes -- summary
+
+* The code generators (host_blah_isel.c, host_blah_defs.[ch]) interact
+ more closely with Valgrind than before. In particular the
+ instruction selectors must use one of 3 different kinds of
+ control-transfer instructions: XDirect, XIndir and XAssisted.
+ All archs must use these the same; no more ad-hoc control transfer
+ instructions.
+ (more detail below)
+
+
+* With T-chaining, translations can jump between each other without
+ going through the dispatcher loop every time. This means that the
+ event check (counter dec, and exit if negative) the dispatcher loop
+ previously did now needs to be compiled into each translation.
+
+
+* The assembly dispatcher code (dispatch-arch-os.S) is still
+ present. It still provides table lookup services for
+ indirect branches, but it also provides a new feature:
+ dispatch points, to which the generated code jumps. There
+ are 5:
+
+ VG_(disp_cp_chain_me_to_slowEP):
+ VG_(disp_cp_chain_me_to_fastEP):
+ These are chain-me requests, used for Boring conditional and
+ unconditional jumps to destinations known at JIT time. The
+ generated code calls these (doesn't jump to them) and the
+ stub recovers the return address. These calls never return;
+ instead the call is done so that the stub knows where the
+ calling point is. It needs to know this so it can patch
+ the calling point to the requested destination.
+ VG_(disp_cp_xindir):
+ Old-style table lookup and go; used for indirect jumps
+ VG_(disp_cp_xassisted):
+ Most general and slowest kind. Can transfer to anywhere, but
+ first returns to scheduler to do some other event (eg a syscall)
+ before continuing.
+ VG_(disp_cp_evcheck_fail):
+ Code jumps here when the event check fails.
+
+
+* new instructions in backends: XDirect, XIndir and XAssisted.
+ XDirect is used for chainable jumps. It is compiled into a
+ call to VG_(disp_cp_chain_me_to_slowEP) or
+ VG_(disp_cp_chain_me_to_fastEP).
+
+ XIndir is used for indirect jumps. It is compiled into a jump
+ to VG_(disp_cp_xindir)
+
+ XAssisted is used for "assisted" (do something first, then jump)
+ transfers. It is compiled into a jump to VG_(disp_cp_xassisted)
+
+ All 3 of these may be conditional.
+
+ More complexity: in some circumstances (no-redir translations)
+ all transfers must be done with XAssisted. In such cases the
+ instruction selector will be told this.
+
+
+* Patching: XDirect is compiled basically into
+ %r11 = &VG_(disp_cp_chain_me_to_{slow,fast}EP)
+ call *%r11
+ Backends must provide a function (eg) chainXDirect_AMD64
+ which converts it into a jump to a specified destination
+ jmp $delta-of-PCs
+ or
+ %r11 = 64-bit immediate
+ jmpq *%r11
+ depending on branch distance.
+
+ Backends must provide a function (eg) unchainXDirect_AMD64
+ which restores the original call-to-the-stub version.
+
+
+* Event checks. Each translation now has two entry points,
+ the slow one (slowEP) and fast one (fastEP). Like this:
+
+ slowEP:
+ counter--
+ if (counter < 0) goto VG_(disp_cp_evcheck_fail)
+ fastEP:
+ (rest of the translation)
+
+ slowEP is used for control flow transfers that are or might be
+ a back edge in the control flow graph. Insn selectors are
+ given the address of the highest guest byte in the block so
+ they can determine which edges are definitely not back edges.
+
+ The counter is placed in the first 8 bytes of the guest state,
+ and the address of VG_(disp_cp_evcheck_fail) is placed in
+ the next 8 bytes. This allows very compact checks on all
+ targets, since no immediates need to be synthesised, eg:
+
+ decq 0(%baseblock-pointer)
+ jns fastEP
+ jmpq *8(baseblock-pointer)
+ fastEP:
+
+ On amd64 a non-failing check is therefore 2 insns; all 3 occupy
+ just 8 bytes.
+
+ On amd64 the event check is created by a special single
+ pseudo-instruction AMD64_EvCheck.
+
+
+* BB profiling (for --profile-flags=). The dispatch assembly
+ dispatch-arch-os.S no longer deals with this and so is much
+ simplified. Instead the profile inc is compiled into each
+ translation, as the insn immediately following the event
+ check. Again, on amd64 a pseudo-insn AMD64_ProfInc is used.
+ Counters are now 64 bit even on 32 bit hosts, to avoid overflow.
+
+ One complexity is that at JIT time it is not known where the
+ address of the counter is. To solve this, VexTranslateResult
+ now returns the offset of the profile inc in the generated
+ code. When the counter address is known, VEX can be called
+ again to patch it in. Backends must supply eg
+ patchProfInc_AMD64 to make this happen.
+
+
+* Front end changes (guest_blah_toIR.c)
+
+ The way the guest program counter is handled has changed
+ significantly. Previously, the guest PC was updated (in IR)
+ at the start of each instruction, except for the first insn
+ in an IRSB. This is inconsistent and doesn't work with the
+ new framework.
+
+ Now, each instruction must update the guest PC as its last
+ IR statement -- not its first. And no special exemption for
+ the first insn in the block. As before most of these are
+ optimised out by ir_opt, so no concerns about efficiency.
+
+ As a logical side effect of this, exits (IRStmt_Exit) and the
+ block-end transfer are both considered to write to the guest state
+ (the guest PC) and so need to be told the offset of it.
+
+ IR generators (eg disInstr_AMD64) are no longer allowed to set the
+ IRSB::next, to specify the block-end transfer address. Instead they
+ now indicate, to the generic steering logic that drives them (iow,
+ guest_generic_bb_to_IR.c), that the block has ended. This then
+ generates effectively "goto GET(PC)" (which, again, is optimised
+ away). What this does mean is that if the IR generator function
+ ends the IR of the last instruction in the block with an incorrect
+ assignment to the guest PC, execution will transfer to an incorrect
+ destination -- making the error obvious quickly.
Modified: branches/TCHAIN/drd/tests/unit_bitmap.c (+2 -0)
===================================================================
--- branches/TCHAIN/drd/tests/unit_bitmap.c 2012-04-02 22:25:14 +01:00 (rev 12483)
+++ branches/TCHAIN/drd/tests/unit_bitmap.c 2012-04-02 22:56:03 +01:00 (rev 12484)
@@ -48,6 +48,8 @@
{ return memset(s, c, sz); }
void* VG_(memcpy)(void *d, const void *s, SizeT sz)
{ return memcpy(d, s, sz); }
+void* VG_(memmove)(void *d, const void *s, SizeT sz)
+{ return memmove(d, s, sz); }
Int VG_(memcmp)(const void* s1, const void* s2, SizeT n)
{ return memcmp(s1, s2, n); }
UInt VG_(printf)(const HChar *format, ...)
Modified: branches/TCHAIN/coregrind/m_xarray.c (+14 -0)
===================================================================
--- branches/TCHAIN/coregrind/m_xarray.c 2012-04-02 22:25:14 +01:00 (rev 12483)
+++ branches/TCHAIN/coregrind/m_xarray.c 2012-04-02 22:56:03 +01:00 (rev 12484)
@@ -311,6 +311,20 @@
xa->usedsizeE -= n;
}
+void VG_(removeIndexXA)( XArray* xao, Word n )
+{
+ struct _XArray* xa = (struct _XArray*)xao;
+ vg_assert(xa);
+ vg_assert(n >= 0);
+ vg_assert(n < xa->usedsizeE);
+ if (n+1 < xa->usedsizeE) {
+ VG_(memmove)( ((char*)xa->arr) + (n+0) * xa->elemSzB,
+ ((char*)xa->arr) + (n+1) * xa->elemSzB,
+ (xa->usedsizeE - n - 1) * xa->elemSzB );
+ }
+ xa->usedsizeE--;
+}
+
void VG_(getContentsXA_UNSAFE)( XArray* xao,
/*OUT*/void** ctsP,
/*OUT*/Word* usedP )
Modified: branches/TCHAIN/none/tests/arm/Makefile.am (+3 -3)
===================================================================
--- branches/TCHAIN/none/tests/arm/Makefile.am 2012-04-02 22:25:14 +01:00 (rev 12483)
+++ branches/TCHAIN/none/tests/arm/Makefile.am 2012-04-02 22:56:03 +01:00 (rev 12484)
@@ -39,14 +39,14 @@
v6media_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 -mthumb
vfp_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 \
- -mfpu=neon -mfloat-abi=softfp \
+ -mfpu=neon \
-mthumb
neon128_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 \
- -mfpu=neon -mfloat-abi=softfp \
+ -mfpu=neon \
-mthumb
neon64_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 \
- -mfpu=neon -mfloat-abi=softfp \
+ -mfpu=neon \
-mthumb
Modified: branches/TCHAIN/coregrind/m_transtab.c (+698 -181)
===================================================================
--- branches/TCHAIN/coregrind/m_transtab.c 2012-04-02 22:25:14 +01:00 (rev 12483)
+++ branches/TCHAIN/coregrind/m_transtab.c 2012-04-02 22:56:03 +01:00 (rev 12484)
@@ -31,8 +31,10 @@
#include "pub_core_basics.h"
#include "pub_core_debuglog.h"
-#include "pub_core_machine.h" // For VG(machine_get_VexArchInfo)
+#include "pub_core_machine.h" // For VG_(machine_get_VexArchInfo)
#include "pub_core_libcbase.h"
+#include "pub_core_vki.h" // to keep pub_core_libproc.h happy, sigh
+#include "pub_core_libcproc.h" // VG_(invalidate_icache)
#include "pub_core_libcassert.h"
#include "pub_core_libcprint.h"
#include "pub_core_options.h"
@@ -40,14 +42,10 @@
#include "pub_core_transtab.h"
#include "pub_core_aspacemgr.h"
#include "pub_core_mallocfree.h" // VG_(out_of_memory_NORETURN)
+#include "pub_core_xarray.h"
+#include "pub_core_dispatch.h" // For VG_(disp_cp*) addresses
-// JRS FIXME get rid of this somehow
-#if defined(VGP_arm_linux)
-# include "pub_core_vkiscnums.h" // __ARM_NR_cacheflush
-# include "pub_core_syscall.h" // VG_(do_syscallN)
-#endif
-
/* #define DEBUG_TRANSTAB */
@@ -67,6 +65,7 @@
'deleted') and it is strongly recommended not to change this.
65521 is the largest prime <= 65535. */
#define N_TTES_PER_SECTOR /*30011*/ /*40009*/ 65521
+//DEBUG-ONLY: #define N_TTES_PER_SECTOR 10007
/* Because each sector contains a hash table of TTEntries, we need to
specify the maximum allowable loading, after which the sector is
@@ -91,6 +90,46 @@
/*------------------ TYPES ------------------*/
+/* In edges ("to-me") in the graph created by chaining. */
+typedef
+ struct {
+ UInt from_sNo; /* sector number */
+ UInt from_tteNo; /* TTE number in given sector */
+ UInt from_offs; /* code offset from TCEntry::tcptr where the patch is */
+ Bool to_fastEP; /* Is the patch to a fast or slow entry point? */
+ }
+ InEdge;
+
+
+/* Out edges ("from-me") in the graph created by chaining. */
+typedef
+ struct {
+ UInt to_sNo; /* sector number */
+ UInt to_tteNo; /* TTE number in given sector */
+ UInt from_offs; /* code offset in owning translation where patch is */
+ }
+ OutEdge;
+
+
+#define N_FIXED_IN_EDGE_ARR 3
+typedef
+ struct {
+ UInt n_fixed; /* 0 .. N_FIXED_IN_EDGE_ARR */
+ InEdge fixed[N_FIXED_IN_EDGE_ARR];
+ XArray* var; /* XArray* of InEdgeArr */
+ }
+ InEdgeArr;
+
+#define N_FIXED_OUT_EDGE_ARR 2
+typedef
+ struct {
+ UInt n_fixed; /* 0 .. N_FIXED_OUT_EDGE_ARR */
+ OutEdge fixed[N_FIXED_OUT_EDGE_ARR];
+ XArray* var; /* XArray* of OutEdgeArr */
+ }
+ OutEdgeArr;
+
+
/* A translation-table entry. This indicates precisely which areas of
guest code are included in the translation, and contains all other
auxiliary info too. */
@@ -102,7 +141,7 @@
Count is an entry count for the translation and is
incremented by 1 every time the translation is used, if we
are profiling. */
- UInt count;
+ ULong count;
UShort weight;
/* Status of the slot. Note, we need to be able to do lazy
@@ -143,15 +182,70 @@
// sec->ec2tte[ tte2ec_ec[i] ][ tte2ec_ix[i] ]
// should be the index
// of this TTEntry in the containing Sector's tt array.
+
+ /* Admin information for chaining. 'in_edges' is a set of the
+ patch points which jump to this translation -- hence are
+ predecessors in the control flow graph. 'out_edges' points
+ to successors in the control flow graph -- translations to
+ which this one has a patched jump. In short these are just
+ backwards and forwards edges in the graph of patched-together
+ blocks. The 'in_edges' contain slightly more info, enough
+ that we can undo the chaining of each mentioned patch point.
+ The 'out_edges' list exists only so that we can visit the
+ 'in_edges' entries of all blocks we're patched through to, in
+ order to remove ourselves from then when we're deleted. */
+
+ /* It is possible, although very unlikely, that a block A has
+ more than one patched jump to block B. This could happen if
+ (eg) A finishes "jcond B; jmp B".
+
+ This means in turn that B's in_edges set can list A more than
+ once (twice in this example). However, each such entry must
+ have a different from_offs, since a patched jump can only
+ jump to one place at once (it's meaningless for it to have
+ multiple destinations.) IOW, the successor and predecessor
+ edges in the graph are not uniquely determined by a
+ TTEntry --> TTEntry pair, but rather by a
+ (TTEntry,offset) --> TTEntry triple.
+
+ If A has multiple edges to B then B will mention A multiple
+ times in its in_edges. To make things simpler, we then
+ require that A mentions B exactly the same number of times in
+ its out_edges. Furthermore, a matching out-in pair must have
+ the same offset (from_offs). This facilitates sanity
+ checking, and it facilitates establishing the invariant that
+ an out_edges set may not have duplicates when using the
+ equality defined by (TTEntry,offset). Hence the out_edges
+ and in_edges sets really do both have set semantics.
+
+ eg if A has been patched to B at offsets 42 and 87 (in A)
+ then A.out_edges = { (B,42), (B,87) } (in any order)
+ and B.in_edges = { (A,42), (A,87) } (in any order)
+
+ Hence for each node pair P->Q in the graph, there's a 1:1
+ mapping between P.out_edges and Q.in_edges.
+ */
+ InEdgeArr in_edges;
+ OutEdgeArr out_edges;
}
TTEntry;
+/* A structure used for mapping host code addresses back to the
+ relevant TTEntry. Used when doing chaining, for finding the
+ TTEntry to which some arbitrary patch address belongs. */
+typedef
+ struct {
+ UChar* start;
+ UInt len;
+ UInt tteNo;
+ }
+ HostExtent;
+
/* Finally, a sector itself. Each sector contains an array of
TCEntries, which hold code, and an array of TTEntries, containing
all required administrative info. Profiling is supported using the
- TTEntry .count and .weight fields, if required. Each sector is
- independent in that no cross-sector references are allowed.
+ TTEntry .count and .weight fields, if required.
If the sector is not in use, all three pointers are NULL and
tt_n_inuse is zero.
@@ -181,6 +275,11 @@
Int ec2tte_size[ECLASS_N];
Int ec2tte_used[ECLASS_N];
UShort* ec2tte[ECLASS_N];
+
+ /* The host extents. The [start, +len) ranges are constructed
+ in strictly non-overlapping order, so we can binary search
+ them at any time. */
+ XArray* host_extents; /* XArray* of HostExtent */
}
Sector;
@@ -238,31 +337,7 @@
*/
/*global*/ __attribute__((aligned(16)))
FastCacheEntry VG_(tt_fast)[VG_TT_FAST_SIZE];
-/*
-#define TRANSTAB_BOGUS_GUEST_ADDR ((Addr)1)
-*/
-/* For profiling, we have a parallel array of pointers to .count
- fields in TT entries. Again, these pointers must be invalidated
- when translations disappear. A NULL pointer suffices to indicate
- an unused slot.
-
- When not profiling (the normal case, VG_(clo_profile_flags) == 0),
- all tt_fastN entries are set to NULL at startup and never read nor
- written after that.
-
- When profiling (VG_(clo_profile_flags) > 0), tt_fast and tt_fastN
- change together: if tt_fast[i].guest is TRANSTAB_BOGUS_GUEST_ADDR
- then the corresponding tt_fastN[i] must be null. If
- tt_fast[i].guest is any other value, then tt_fastN[i] *must* point
- to the .count field of the corresponding TT entry.
-
- tt_fast and tt_fastN are referred to from assembly code
- (dispatch.S).
-*/
-/*global*/ UInt* VG_(tt_fastN)[VG_TT_FAST_SIZE];
-
-
/* Make sure we're not used before initialisation. */
static Bool init_done = False;
@@ -270,30 +345,483 @@
/*------------------ STATS DECLS ------------------*/
/* Number of fast-cache updates and flushes done. */
-ULong n_fast_flushes = 0;
-ULong n_fast_updates = 0;
+static ULong n_fast_flushes = 0;
+static ULong n_fast_updates = 0;
/* Number of full lookups done. */
-ULong n_full_lookups = 0;
-ULong n_lookup_probes = 0;
+static ULong n_full_lookups = 0;
+static ULong n_lookup_probes = 0;
/* Number/osize/tsize of translations entered; also the number of
those for which self-checking was requested. */
-ULong n_in_count = 0;
-ULong n_in_osize = 0;
-ULong n_in_tsize = 0;
-ULong n_in_sc_count = 0;
+static ULong n_in_count = 0;
+static ULong n_in_osize = 0;
+static ULong n_in_tsize = 0;
+static ULong n_in_sc_count = 0;
/* Number/osize of translations discarded due to lack of space. */
-ULong n_dump_count = 0;
-ULong n_dump_osize = 0;
+static ULong n_dump_count = 0;
+static ULong n_dump_osize = 0;
/* Number/osize of translations discarded due to requests to do so. */
-ULong n_disc_count = 0;
-ULong n_disc_osize = 0;
+static ULong n_disc_count = 0;
+static ULong n_disc_osize = 0;
/*-------------------------------------------------------------*/
+/*--- Misc ---*/
+/*-------------------------------------------------------------*/
+
+static void* ttaux_malloc ( HChar* tag, SizeT n )
+{
+ return VG_(arena_malloc)(VG_AR_TTAUX, tag, n);
+}
+
+static void ttaux_free ( void* p )
+{
+ VG_(arena_free)(VG_AR_TTAUX, p);
+}
+
+
+/*-------------------------------------------------------------*/
+/*--- Chaining support ---*/
+/*-------------------------------------------------------------*/
+
+static inline TTEntry* index_tte ( UInt sNo, UInt tteNo )
+{
+ vg_assert(sNo < N_SECTORS);
+ vg_assert(tteNo < N_TTES_PER_SECTOR);
+ Sector* s = &sectors[sNo];
+ vg_assert(s->tt);
+ TTEntry* tte = &s->tt[tteNo];
+ vg_assert(tte->status == InUse);
+ return tte;
+}
+
+static void InEdge__init ( InEdge* ie )
+{
+ ie->from_sNo = -1; /* invalid */
+ ie->from_tteNo = 0;
+ ie->from_offs = 0;
+ ie->to_fastEP = False;
+}
+
+static void OutEdge__init ( OutEdge* oe )
+{
+ oe->to_sNo = -1; /* invalid */
+ oe->to_tteNo = 0;
+ oe->from_offs = 0;
+}
+
+static void TTEntry__init ( TTEntry* tte )
+{
+ VG_(memset)(tte, 0, sizeof(*tte));
+}
+
+static UWord InEdgeArr__size ( InEdgeArr* iea )
+{
+ if (iea->var) {
+ vg_assert(iea->n_fixed == 0);
+ return VG_(sizeXA)(iea->var);
+ } else {
+ vg_assert(iea->n_fixed <= N_FIXED_IN_EDGE_ARR);
+ return iea->n_fixed;
+ }
+}
+
+static void InEdgeArr__makeEmpty ( InEdgeArr* iea )
+{
+ if (iea->var) {
+ vg_assert(iea->n_fixed == 0);
+ VG_(deleteXA)(iea->var);
+ iea->var = NULL;
+ } else {
+ vg_assert(iea->n_fixed <= N_FIXED_IN_EDGE_ARR);
+ iea->n_fixed = 0;
+ }
+}
+
+static
+InEdge* InEdgeArr__index ( InEdgeArr* iea, UWord i )
+{
+ if (iea->var) {
+ vg_assert(iea->n_fixed == 0);
+ return (InEdge*)VG_(indexXA)(iea->var, i);
+ } else {
+ vg_assert(i < iea->n_fixed);
+ return &iea->fixed[i];
+ }
+}
+
+static
+void InEdgeArr__deleteIndex ( InEdgeArr* iea, UWord i )
+{
+ if (iea->var) {
+ vg_assert(iea->n_fixed == 0);
+ VG_(removeIndexXA)(iea->var, i);
+ } else {
+ vg_assert(i < iea->n_fixed);
+ for (; i+1 < iea->n_fixed; i++) {
+ iea->fixed[i] = iea->fixed[i+1];
+ }
+ iea->n_fixed--;
+ }
+}
+
+static
+void InEdgeArr__add ( InEdgeArr* iea, InEdge* ie )
+{
+ if (iea->var) {
+ vg_assert(iea->n_fixed == 0);
+ VG_(addToXA)(iea->var, ie);
+ } else {
+ vg_assert(iea->n_fixed <= N_FIXED_IN_EDGE_ARR);
+ if (iea->n_fixed == N_FIXED_IN_EDGE_ARR) {
+ /* The fixed array is full, so we have to initialise an
+ XArray and copy the fixed array into it. */
+ iea->var = VG_(newXA)(ttaux_malloc, "transtab.IEA__add",
+ ttaux_free,
+ sizeof(InEdge));
+ UWord i;
+ for (i = 0; i < iea->n_fixed; i++) {
+ VG_(addToXA)(iea->var, &iea->fixed[i]);
+ }
+ VG_(addToXA)(iea->var, ie);
+ iea->n_fixed = 0;
+ } else {
+ /* Just add to the fixed array. */
+ iea->fixed[iea->n_fixed++] = *ie;
+ }
+ }
+}
+
+static UWord OutEdgeArr__size ( OutEdgeArr* oea )
+{
+ if (oea->var) {
+ vg_assert(oea->n_fixed == 0);
+ return VG_(sizeXA)(oea->var);
+ } else {
+ vg_assert(oea->n_fixed <= N_FIXED_OUT_EDGE_ARR);
+ return oea->n_fixed;
+ }
+}
+
+static void OutEdgeArr__makeEmpty ( OutEdgeArr* oea )
+{
+ if (oea->var) {
+ vg_assert(oea->n_fixed == 0);
+ VG_(deleteXA)(oea->var);
+ oea->var = NULL;
+ } else {
+ vg_assert(oea->n_fixed <= N_FIXED_OUT_EDGE_ARR);
+ oea->n_fixed = 0;
+ }
+}
+
+static
+OutEdge* OutEdgeArr__index ( OutEdgeArr* oea, UWord i )
+{
+ if (oea->var) {
+ vg_assert(oea->n_fixed == 0);
+ return (OutEdge*)VG_(indexXA)(oea->var, i);
+ } else {
+ vg_assert(i < oea->n_fixed);
+ return &oea->fixed[i];
+ }
+}
+
+static
+void OutEdgeArr__deleteIndex ( OutEdgeArr* oea, UWord i )
+{
+ if (oea->var) {
+ vg_assert(oea->n_fixed == 0);
+ VG_(removeIndexXA)(oea->var, i);
+ } else {
+ vg_assert(i < oea->n_fixed);
+ for (; i+1 < oea->n_fixed; i++) {
+ oea->fixed[i] = oea->fixed[i+1];
+ }
+ oea->n_fixed--;
+ }
+}
+
+static
+void OutEdgeArr__add ( OutEdgeArr* oea, OutEdge* oe )
+{
+ if (oea->var) {
+ vg_assert(oea->n_fixed == 0);
+ VG_(addToXA)(oea->var, oe);
+ } else {
+ vg_assert(oea->n_fixed <= N_FIXED_OUT_EDGE_ARR);
+ if (oea->n_fixed == N_FIXED_OUT_EDGE_ARR) {
+ /* The fixed array is full, so we have to initialise an
+ XArray and copy the fixed array into it. */
+ oea->var = VG_(newXA)(ttaux_malloc, "transtab.OEA__add",
+ ttaux_free,
+ sizeof(OutEdge));
+ UWord i;
+ for (i = 0; i < oea->n_fixed; i++) {
+ VG_(addToXA)(oea->var, &oea->fixed[i]);
+ }
+ VG_(addToXA)(oea->var, oe);
+ oea->n_fixed = 0;
+ } else {
+ /* Just add to the fixed array. */
+ oea->fixed[oea->n_fixed++] = *oe;
+ }
+ }
+}
+
+static
+Int HostExtent__cmpOrd ( void* v1, void* v2 )
+{
+ HostExtent* hx1 = (HostExtent*)v1;
+ HostExtent* hx2 = (HostExtent*)v2;
+ if (hx1->start + hx1->len <= hx2->start) return -1;
+ if (hx2->start + hx2->len <= hx1->start) return 1;
+ return 0; /* partial overlap */
+}
+
+static __attribute__((noinline))
+Bool find_TTEntry_from_hcode( /*OUT*/UInt* from_sNo,
+ /*OUT*/UInt* from_tteNo,
+ void* hcode )
+{
+ Int i;
+
+ /* Search order logic copied from VG_(search_transtab). */
+ for (i = 0; i < N_SECTORS; i++) {
+ Int sno = sector_search_order[i];
+ if (UNLIKELY(sno == -1))
+ return False; /* run out of sectors to search */
+
+ Sector* sec = &sectors[sno];
+ XArray* /* of HostExtent */ host_extents = sec->host_extents;
+ vg_assert(host_extents);
+
+ HostExtent key;
+ VG_(memset)(&key, 0, sizeof(key));
+ key.start = hcode;
+ key.len = 1;
+ Word firstW = -1, lastW = -1;
+ Bool found = VG_(lookupXA_UNSAFE)(
+ host_extents, &key, &firstW, &lastW,
+ (Int(*)(void*,void*))HostExtent__cmpOrd
+ );
+ vg_assert(firstW == lastW); // always true, even if not found
+ if (found) {
+ HostExtent* hx = VG_(indexXA)(host_extents, firstW);
+ UInt tteNo = hx->tteNo;
+ /* Do some additional sanity checks. */
+ vg_assert(tteNo <= N_TTES_PER_SECTOR);
+ vg_assert(sec->tt[tteNo].status == InUse);
+ /* Can only half check that the found TTEntry contains hcode,
+ due to not having a length value for the hcode in the
+ TTEntry. */
+ vg_assert((UChar*)sec->tt[tteNo].tcptr <= (UChar*)hcode);
+ /* Looks plausible */
+ *from_sNo = sno;
+ *from_tteNo = (UInt)tteNo;
+ return True;
+ }
+ }
+ return False;
+}
+
+
+/* Figure out whether or not hcode is jitted code present in the main
+ code cache (but not in the no-redir cache). Used for sanity
+ checking. */
+static Bool is_in_the_main_TC ( void* hcode )
+{
+ Int i, sno;
+ for (i = 0; i < N_SECTORS; i++) {
+ sno = sector_search_order[i];
+ if (sno == -1)
+ break; /* run out of sectors to search */
+ if ((UChar*)hcode >= (UChar*)sectors[sno].tc
+ && (UChar*)hcode <= (UChar*)sectors[sno].tc_next
+ + sizeof(ULong) - 1)
+ return True;
+ }
+ return False;
+}
+
+
+/* Fulfill a chaining request, and record admin info so we
+ can undo it later, if required.
+*/
+void VG_(tt_tc_do_chaining) ( void* from__patch_addr,
+ UInt to_sNo,
+ UInt to_tteNo,
+ Bool to_fastEP )
+{
+ /* Get the CPU info established at startup. */
+ VexArch vex_arch = VexArch_INVALID;
+ VG_(machine_get_VexArchInfo)( &vex_arch, NULL );
+
+ // host_code is where we're patching to. So it needs to
+ // take into account, whether we're jumping to the slow
+ // or fast entry point. By definition, the fast entry point
+ // is exactly one event check's worth of code along from
+ // the slow (tcptr) entry point.
+ TTEntry* to_tte = index_tte(to_sNo, to_tteNo);
+ void* host_code = ((UChar*)to_tte->tcptr)
+ + (to_fastEP ? LibVEX_evCheckSzB(vex_arch) : 0);
+
+ // stay sane -- the patch point (dst) is in this sector's code cache
+ vg_assert( (UChar*)host_code >= (UChar*)sectors[to_sNo].tc );
+ vg_assert( (UChar*)host_code <= (UChar*)sectors[to_sNo].tc_next
+ + sizeof(ULong) - 1 );
+ // stay sane -- the patch src is in some sector's code cache
+ vg_assert( is_in_the_main_TC(from__patch_addr) );
+
+ /* Get VEX to do the patching itself. We have to hand it off
+ since it is host-dependent. */
+ VexInvalRange vir
+ = LibVEX_Chain( vex_arch,
+ from__patch_addr,
+ to_fastEP ? &VG_(disp_cp_chain_me_to_fastEP)
+ : &VG_(disp_cp_chain_me_to_slowEP),
+ (void*)host_code );
+ VG_(invalidate_icache)( (void*)vir.start, vir.len );
+
+ /* Now do the tricky bit -- update the ch_succs and ch_preds info
+ for the two translations involved, so we can undo the chaining
+ later, which we will have to do if the to_ block gets removed
+ for whatever reason. */
+ /* Find the TTEntry for the from__ code. This isn't simple since
+ we only know the patch address, which is going to be somewhere
+ inside the from_ block. */
+ UInt from_sNo = (UInt)-1;
+ UInt from_tteNo = (UInt)-1;
+ Bool from_found
+ = find_TTEntry_from_hcode( &from_sNo, &from_tteNo,
+ from__patch_addr );
+ vg_assert(from_found);
+ TTEntry* from_tte = index_tte(from_sNo, from_tteNo);
+
+ /* This is the new from_ -> to_ link to add. */
+ InEdge ie;
+ InEdge__init(&ie);
+ ie.from_sNo = from_sNo;
+ ie.from_tteNo = from_tteNo;
+ ie.to_fastEP = to_fastEP;
+ HWord from_offs = (HWord)( (UChar*)from__patch_addr
+ - (UChar*)from_tte->tcptr );
+ vg_assert(from_offs < 100000/* let's say */);
+ ie.from_offs = (UInt)from_offs;
+
+ /* This is the new to_ -> from_ backlink to add. */
+ OutEdge oe;
+ OutEdge__init(&oe);
+ oe.to_sNo = to_sNo;
+ oe.to_tteNo = to_tteNo;
+ oe.from_offs = (UInt)from_offs;
+
+ /* Add .. */
+ InEdgeArr__add(&to_tte->in_edges, &ie);
+ OutEdgeArr__add(&from_tte->out_edges, &oe);
+}
+
+
+/* Unchain one patch, as described by the specified InEdge. For
+ sanity check purposes only (to check that the patched location is
+ as expected) it also requires the fast and slow entry point
+ addresses of the destination block (that is, the block that owns
+ this InEdge). */
+__attribute__((noinline))
+static void unchain_one ( VexArch vex_arch,
+ InEdge* ie,
+ void* to_fastEPaddr, void* to_slowEPaddr )
+{
+ vg_assert(ie);
+ TTEntry* tte
+ = index_tte(ie->from_sNo, ie->from_tteNo);
+ UChar* place_to_patch
+ = ((HChar*)tte->tcptr) + ie->from_offs;
+ UChar* disp_cp_chain_me
+ = ie->to_fastEP ? &VG_(disp_cp_chain_me_to_fastEP)
+ : &VG_(disp_cp_chain_me_to_slowEP);
+ UChar* place_to_jump_to_EXPECTED
+ = ie->to_fastEP ? to_fastEPaddr : to_slowEPaddr;
+
+ // stay sane: both src and dst for this unchaining are
+ // in the main code cache
+ vg_assert( is_in_the_main_TC(place_to_patch) ); // src
+ vg_assert( is_in_the_main_TC(place_to_jump_to_EXPECTED) ); // dst
+ // dst check is ok because LibVEX_UnChain checks that
+ // place_to_jump_to_EXPECTED really is the current dst, and
+ // asserts if it isn't.
+ VexInvalRange vir
+ = LibVEX_UnChain( vex_arch, place_to_patch,
+ place_to_jump_to_EXPECTED, disp_cp_chain_me );
+ VG_(invalidate_icache)( (void*)vir.start, vir.len );
+}
+
+
+/* The specified block is about to be deleted. Update the preds and
+ succs of its associated blocks accordingly. This includes undoing
+ any chained jumps to this block. */
+static
+void unchain_in_preparation_for_deletion ( VexArch vex_arch,
+ UInt here_sNo, UInt here_tteNo )
+{
+ if (0)
+ VG_(printf)("QQQ unchain_in_prep %u.%u\n", here_sNo, here_tteNo);
+ UWord i, j, n, m;
+ Int evCheckSzB = LibVEX_evCheckSzB(vex_arch);
+ TTEntry* here_tte = index_tte(here_sNo, here_tteNo);
+ vg_assert(here_tte->status == InUse);
+
+ /* Visit all InEdges owned by here_tte. */
+ n = InEdgeArr__size(&here_tte->in_edges);
+ for (i = 0; i < n; i++) {
+ InEdge* ie = InEdgeArr__index(&here_tte->in_edges, i);
+ // Undo the chaining.
+ UChar* here_slow_EP = (UChar*)here_tte->tcptr;
+ UChar* here_fast_EP = here_slow_EP + evCheckSzB;
+ unchain_one(vex_arch, ie, here_fast_EP, here_slow_EP);
+ // Find the corresponding entry in the "from" node's out_edges,
+ // and remove it.
+ TTEntry* from_tte = index_tte(ie->from_sNo, ie->from_tteNo);
+ m = OutEdgeArr__size(&from_tte->out_edges);
+ vg_assert(m > 0); // it must have at least one entry
+ for (j = 0; j < m; j++) {
+ OutEdge* oe = OutEdgeArr__index(&from_tte->out_edges, j);
+ if (oe->to_sNo == here_sNo && oe->to_tteNo == here_tteNo
+ && oe->from_offs == ie->from_offs)
+ break;
+ }
+ vg_assert(j < m); // "oe must be findable"
+ OutEdgeArr__deleteIndex(&from_tte->out_edges, j);
+ }
+
+ /* Visit all OutEdges owned by here_tte. */
+ n = OutEdgeArr__size(&here_tte->out_edges);
+ for (i = 0; i < n; i++) {
+ OutEdge* oe = OutEdgeArr__index(&here_tte->out_edges, i);
+ // Find the corresponding entry in the "to" node's in_edges,
+ // and remove it.
+ TTEntry* to_tte = index_tte(oe->to_sNo, oe->to_tteNo);
+ m = InEdgeArr__size(&to_tte->in_edges);
+ vg_assert(m > 0); // it must have at least one entry
+ for (j = 0; j < m; j++) {
+ InEdge* ie = InEdgeArr__index(&to_tte->in_edges, j);
+ if (ie->from_sNo == here_sNo && ie->from_tteNo == here_tteNo
+ && ie->from_offs == oe->from_offs)
+ break;
+ }
+ vg_assert(j < m); // "ie must be findable"
+ InEdgeArr__deleteIndex(&to_tte->in_edges, j);
+ }
+
+ InEdgeArr__makeEmpty(&here_tte->in_edges);
+ OutEdgeArr__makeEmpty(&here_tte->out_edges);
+}
+
+
+/*-------------------------------------------------------------*/
/*--- Address-range equivalence class stuff ---*/
/*-------------------------------------------------------------*/
@@ -398,12 +926,12 @@
old_sz = sec->ec2tte_size[ec];
old_ar = sec->ec2tte[ec];
new_sz = old_sz==0 ? 8 : old_sz<64 ? 2*old_sz : (3*old_sz)/2;
- new_ar = VG_(arena_malloc)(VG_AR_TTAUX, "transtab.aECN.1",
- new_sz * sizeof(UShort));
+ new_ar = ttaux_malloc("transtab.aECN.1",
+ new_sz * sizeof(UShort));
for (i = 0; i < old_sz; i++)
new_ar[i] = old_ar[i];
if (old_ar)
- VG_(arena_free)(VG_AR_TTAUX, old_ar);
+ ttaux_free(old_ar);
sec->ec2tte_size[ec] = new_sz;
sec->ec2tte[ec] = new_ar;
@@ -575,7 +1103,6 @@
/* forwards */
static Bool sanity_check_redir_tt_tc ( void );
-static Bool sanity_check_fastcache ( void );
static Bool sanity_check_sector_search_order ( void )
{
@@ -630,8 +1157,6 @@
}
if ( !sanity_check_redir_tt_tc() )
return False;
- if ( !sanity_check_fastcache() )
- return False;
if ( !sanity_check_sector_search_order() )
return False;
return True;
@@ -669,13 +1194,11 @@
return k32 % N_TTES_PER_SECTOR;
}
-static void setFastCacheEntry ( Addr64 key, ULong* tcptr, UInt* count )
+static void setFastCacheEntry ( Addr64 key, ULong* tcptr )
{
UInt cno = (UInt)VG_TT_FAST_HASH(key);
VG_(tt_fast)[cno].guest = (Addr)key;
VG_(tt_fast)[cno].host = (Addr)tcptr;
- if (VG_(clo_profile_flags) > 0)
- VG_(tt_fastN)[cno] = count;
n_fast_updates++;
/* This shouldn't fail. It should be assured by m_translate
which should reject any attempt to make translation of code
@@ -683,23 +1206,7 @@
vg_assert(VG_(tt_fast)[cno].guest != TRANSTAB_BOGUS_GUEST_ADDR);
}
-/* Invalidate the fast cache's counter array, VG_(tt_fastN). */
-static void invalidateFastNCache ( void )
-{
- UInt j;
- vg_assert(VG_TT_FAST_SIZE > 0 && (VG_TT_FAST_SIZE % 4) == 0);
- for (j = 0; j < VG_TT_FAST_SIZE; j += 4) {
- VG_(tt_fastN)[j+0] = NULL;
- VG_(tt_fastN)[j+1] = NULL;
- VG_(tt_fastN)[j+2] = NULL;
- VG_(tt_fastN)[j+3] = NULL;
- }
- vg_assert(j == VG_TT_FAST_SIZE);
-}
-
-/* Invalidate the fast cache VG_(tt_fast). If profiling, also
- invalidate the fast cache's counter array VG_(tt_fastN), otherwise
- don't touch it. */
+/* Invalidate the fast cache VG_(tt_fast). */
static void invalidateFastCache ( void )
{
UInt j;
@@ -713,42 +1220,19 @@
VG_(tt_fast)[j+3].guest = TRANSTAB_BOGUS_GUEST_ADDR;
}
- if (VG_(clo_profile_flags) > 0)
- invalidateFastNCache();
-
vg_assert(j == VG_TT_FAST_SIZE);
n_fast_flushes++;
}
-static Bool sanity_check_fastcache ( void )
+/* Returns True if the sector has been used before (hence, if we have
+ to eject existing code in it), False if it's never been used
+ before. */
+static Bool initialiseSector ( Int sno )
{
- UInt j;
- if (0) VG_(printf)("sanity check fastcache\n");
- if (VG_(clo_profile_flags) > 0) {
- /* profiling */
- for (j = 0; j < VG_TT_FAST_SIZE; j++) {
- if (VG_(tt_fastN)[j] == NULL
- && VG_(tt_fast)[j].guest != TRANSTAB_BOGUS_GUEST_ADDR)
- return False;
- if (VG_(tt_fastN)[j] != NULL
- && VG_(tt_fast)[j].guest == TRANSTAB_BOGUS_GUEST_ADDR)
- return False;
- }
- } else {
- /* not profiling */
- for (j = 0; j < VG_TT_FAST_SIZE; j++) {
- if (VG_(tt_fastN)[j] != NULL)
- return False;
- }
- }
- return True;
-}
-
-static void initialiseSector ( Int sno )
-{
- Int i;
- SysRes sres;
+ Int i;
+ SysRes sres;
Sector* sec;
+ Bool has_been_used_before = False;
vg_assert(isValidSector(sno));
{ Bool sane = sanity_check_sector_search_order();
@@ -768,6 +1252,7 @@
vg_assert(sec->ec2tte_used[i] == 0);
vg_assert(sec->ec2tte[i] == NULL);
}
+ vg_assert(sec->host_extents == NULL);
VG_(debugLog)(1,"transtab", "allocate sector %d\n", sno);
@@ -793,6 +1278,12 @@
sec->tt[i].n_tte2ec = 0;
}
+ /* Set up the host_extents array. */
+ sec->host_extents
+ = VG_(newXA)(ttaux_malloc, "transtab.initialiseSector(host_extents)",
+ ttaux_free,
+ sizeof(HostExtent));
+
/* Add an entry in the sector_search_order */
for (i = 0; i < N_SECTORS; i++) {
if (sector_search_order[i] == -1)
@@ -808,11 +1299,16 @@
/* Sector has been used before. Dump the old contents. */
VG_(debugLog)(1,"transtab", "recycle sector %d\n", sno);
+ has_been_used_before = True;
vg_assert(sec->tt != NULL);
vg_assert(sec->tc_next != NULL);
n_dump_count += sec->tt_n_inuse;
+ VexArch vex_arch = VexArch_INVALID;
+ VG_(machine_get_VexArchInfo)( &vex_arch, NULL );
+
/* Visit each just-about-to-be-abandoned translation. */
+VG_(printf)("QQQ unlink-entire-sector: %d START\n", sno);
for (i = 0; i < N_TTES_PER_SECTOR; i++) {
if (sec->tt[i].status == InUse) {
vg_assert(sec->tt[i].n_tte2ec >= 1);
@@ -824,12 +1320,14 @@
sec->tt[i].entry,
sec->tt[i].vge );
}
+ unchain_in_preparation_for_deletion(vex_arch, sno, i);
} else {
vg_assert(sec->tt[i].n_tte2ec == 0);
}
sec->tt[i].status = Empty;
sec->tt[i].n_tte2ec = 0;
}
+VG_(printf)("QQQ unlink-entire-sector: %d END\n", sno);
/* Free up the eclass structures. */
for (i = 0; i < ECLASS_N; i++) {
@@ -838,13 +1336,18 @@
vg_assert(sec->ec2tte[i] == NULL);
} else {
vg_assert(sec->ec2tte[i] != NULL);
- VG_(arena_free)(VG_AR_TTAUX, sec->ec2tte[i]);
+ ttaux_free(sec->ec2tte[i]);
sec->ec2tte[i] = NULL;
sec->ec2tte_size[i] = 0;
sec->ec2tte_used[i] = 0;
}
}
+ /* Empty out the host extents array. */
+ vg_assert(sec->host_extents != NULL);
+ VG_(dropTailXA)(sec->host_extents, VG_(sizeXA)(sec->host_extents));
+ vg_assert(VG_(sizeXA)(sec->host_extents) == 0);
+
/* Sanity check: ensure it is already in
sector_search_order[]. */
for (i = 0; i < N_SECTORS; i++) {
@@ -865,54 +1368,8 @@
{ Bool sane = sanity_check_sector_search_order();
vg_assert(sane);
}
-}
-static void invalidate_icache ( void *ptr, Int nbytes )
-{
-# if defined(VGA_ppc32) || defined(VGA_ppc64)
- Addr startaddr = (Addr) ptr;
- Addr endaddr = startaddr + nbytes;
- Addr cls;
- Addr addr;
- VexArchInfo vai;
-
- if (nbytes == 0) return;
- vg_assert(nbytes > 0);
-
- VG_(machine_get_VexArchInfo)( NULL, &vai );
- cls = vai.ppc_cache_line_szB;
-
- /* Stay sane .. */
- vg_assert(cls == 32 || cls == 64 || cls == 128);
-
- startaddr &= ~(cls - 1);
- for (addr = startaddr; addr < endaddr; addr += cls) {
- __asm__ __volatile__("dcbst 0,%0" : : "r" (addr));
- }
- __asm__ __volatile__("sync");
- for (addr = startaddr; addr < endaddr; addr += cls) {
- __asm__ __volatile__("icbi 0,%0" : : "r" (addr));
- }
- __asm__ __volatile__("sync; isync");
-
-# elif defined(VGA_x86)
- /* no need to do anything, hardware provides coherence */
-
-# elif defined(VGA_amd64)
- /* no need to do anything, hardware provides coherence */
-
-# elif defined(VGA_s390x)
- /* no need to do anything, hardware provides coherence */
-
-# elif defined(VGP_arm_linux)
- /* ARM cache flushes are privileged, so we must defer to the kernel. */
- Addr startaddr = (Addr) ptr;
- Addr endaddr = startaddr + nbytes;
- VG_(do_syscall2)(__NR_ARM_cacheflush, startaddr, endaddr);
-
-# else
-# error "Unknown ARCH"
-# endif
+ return has_been_used_before;
}
@@ -921,18 +1378,28 @@
pre: youngest_sector points to a valid (although possibly full)
sector.
+
+ Returns True if the call caused any existing translation(s) to get
+ thrown away in order to make space for this one.
*/
-void VG_(add_to_transtab)( VexGuestExtents* vge,
+Bool VG_(add_to_transtab)( VexGuestExtents* vge,
Addr64 entry,
AddrH code,
UInt code_len,
- Bool is_self_checking )
+ Bool is_self_checking,
+ Int offs_profInc,
+ VexArch arch_host )
{
Int tcAvailQ, reqdQ, y, i;
ULong *tcptr, *tcptr2;
UChar* srcP;
UChar* dstP;
+ /* We need to tell the caller whether this call caused any code to
+ be thrown away due to the TC becoming full, and hence the oldest
+ Sector to be emptied out and recycled. */
+ Bool caused_code_discarding = False;
+
vg_assert(init_done);
vg_assert(vge->n_used >= 1 && vge->n_used <= 3);
@@ -952,8 +1419,10 @@
y = youngest_sector;
vg_assert(isValidSector(y));
- if (sectors[y].tc == NULL)
- initialiseSector(y);
+ if (sectors[y].tc == NULL) {
+ Bool used_before = initialiseSector(y);
+ vg_assert(!used_before);
+ }
/* Try putting the translation in this sector. */
reqdQ = (code_len + 7) >> 3;
@@ -983,7 +1452,8 @@
if (youngest_sector >= N_SECTORS)
youngest_sector = 0;
y = youngest_sector;
- initialiseSector(y);
+ caused_code_discarding = initialiseSector(y);
+
}
/* Be sure ... */
@@ -1002,13 +1472,10 @@
dstP = (UChar*)tcptr;
srcP = (UChar*)code;
- for (i = 0; i < code_len; i++)
- dstP[i] = srcP[i];
+ VG_(memcpy)(dstP, srcP, code_len);
sectors[y].tc_next += reqdQ;
sectors[y].tt_n_inuse++;
- invalidate_icache( dstP, code_len );
-
/* more paranoia */
tcptr2 = sectors[y].tc_next;
vg_assert(tcptr2 >= &sectors[y].tc[0]);
@@ -1027,6 +1494,7 @@
i = 0;
}
+ TTEntry__init(&sectors[y].tt[i]);
sectors[y].tt[i].status = InUse;
sectors[y].tt[i].tcptr = tcptr;
sectors[y].tt[i].count = 0;
@@ -1034,11 +1502,42 @@
sectors[y].tt[i].vge = *vge;
sectors[y].tt[i].entry = entry;
+ /* Patch in the profile counter location, if necessary. */
+ if (offs_profInc != -1) {
+ vg_assert(offs_profInc >= 0 && offs_profInc < code_len);
+ VexInvalRange vir
+ = LibVEX_PatchProfInc( arch_host,
+ dstP + offs_profInc,
+ &sectors[y].tt[i].count );
+ VG_(invalidate_icache)( (void*)vir.start, vir.len );
+ }
+
+ VG_(invalidate_icache)( dstP, code_len );
+
+ /* Add this entry to the host_extents map, checking that we're
+ adding in order. */
+ { HostExtent hx;
+ hx.start = (UChar*)tcptr;
+ hx.len = code_len;
+ hx.tteNo = i;
+ vg_assert(hx.len > 0); /* bsearch fails w/ zero length entries */
+ XArray* hx_array = sectors[y].host_extents;
+ vg_assert(hx_array);
+ Word n = VG_(sizeXA)(hx_array);
+ if (n > 0) {
+ HostExtent* hx_prev = (HostExtent*)VG_(indexXA)(hx_array, n-1);
+ vg_assert(hx_prev->start + hx_prev->len <= hx.start);
+ }
+ VG_(addToXA)(hx_array, &hx);
+ }
+
/* Update the fast-cache. */
- setFastCacheEntry( entry, tcptr, &sectors[y].tt[i].count );
+ setFastCacheEntry( entry, tcptr );
/* Note the eclass numbers for this translation. */
upd_eclasses_after_add( &sectors[y], i );
+
+ return caused_code_discarding;
}
@@ -1046,7 +1545,9 @@
requested, a successful search can also cause the fast-caches to be
updated.
*/
-Bool VG_(search_transtab) ( /*OUT*/AddrH* result,
+Bool VG_(search_transtab) ( /*OUT*/AddrH* res_hcode,
+ /*OUT*/UInt* res_sNo,
+ /*OUT*/UInt* res_tteNo,
Addr64 guest_addr,
Bool upd_cache )
{
@@ -1076,10 +1577,13 @@
/* found it */
if (upd_cache)
setFastCacheEntry(
- guest_addr, sectors[sno].tt[k].tcptr,
- &sectors[sno].tt[k].count );
- if (result)
- *result = (AddrH)sectors[sno].tt[k].tcptr;
+ guest_addr, sectors[sno].tt[k].tcptr );
+ if (res_hcode)
+ *res_hcode = (AddrH)sectors[sno].tt[k].tcptr;
+ if (res_sNo)
+ *res_sNo = sno;
+ if (res_tteNo)
+ *res_tteNo = k;
/* pull this one one step closer to the front. For large
apps this more or less halves the number of required
probes. */
@@ -1147,16 +1651,23 @@
/* Delete a tt entry, and update all the eclass data accordingly. */
-static void delete_tte ( /*MOD*/Sector* sec, Int tteno )
+static void delete_tte ( /*MOD*/Sector* sec, UInt secNo, Int tteno,
+ VexArch vex_arch )
{
Int i, ec_num, ec_idx;
TTEntry* tte;
+ /* sec and secNo are mutually redundant; cross-check. */
+ vg_assert(sec == &sectors[secNo]);
+
vg_assert(tteno >= 0 && tteno < N_TTES_PER_SECTOR);
tte = &sec->tt[tteno];
vg_assert(tte->status == InUse);
vg_assert(tte->n_tte2ec >= 1 && tte->n_tte2ec <= 3);
+ /* Unchain .. */
+ unchain_in_preparation_for_deletion(vex_arch, secNo, tteno);
+
/* Deal with the ec-to-tte links first. */
for (i = 0; i < tte->n_tte2ec; i++) {
ec_num = (Int)tte->tte2ec_ec[i];
@@ -1192,9 +1703,10 @@
only consider translations in the specified eclass. */
static
-Bool delete_translations_in_sector_eclass ( /*MOD*/Sector* sec,
+Bool delete_translations_in_sector_eclass ( /*MOD*/Sector* sec, UInt secNo,
Addr64 guest_start, ULong range,
- Int ec )
+ Int ec,
+ VexArch vex_arch )
{
Int i;
UShort tteno;
@@ -1218,7 +1730,7 @@
if (overlaps( guest_start, range, &tte->vge )) {
anyDeld = True;
- delete_tte( sec, (Int)tteno );
+ delete_tte( sec, secNo, (Int)tteno, vex_arch );
}
}
@@ -1231,8 +1743,9 @@
slow way, by inspecting all translations in sec. */
static
-Bool delete_translations_in_sector ( /*MOD*/Sector* sec,
- Addr64 guest_start, ULong range )
+Bool delete_translations_in_sector ( /*MOD*/Sector* sec, UInt secNo,
+ Addr64 guest_start, ULong range,
+ VexArch vex_arch )
{
Int i;
Bool anyDeld = False;
@@ -1241,7 +1754,7 @@
if (sec->tt[i].status == InUse
&& overlaps( guest_start, range, &sec->tt[i].vge )) {
anyDeld = True;
- delete_tte( sec, i );
+ delete_tte( sec, secNo, i, vex_arch );
}
}
@@ -1271,6 +1784,9 @@
if (range == 0)
return;
+ VexArch vex_arch = VexArch_INVALID;
+ VG_(machine_get_VexArchInfo)( &vex_arch, NULL );
+
/* There are two different ways to do this.
If the range fits within a single address-range equivalence
@@ -1310,9 +1826,13 @@
if (sec->tc == NULL)
continue;
anyDeleted |= delete_translations_in_sector_eclass(
- sec, guest_start, range, ec );
+ sec, sno, guest_start, range, ec,
+ vex_arch
+ );
anyDeleted |= delete_translations_in_sector_eclass(
- sec, guest_start, range, ECLASS_MISC );
+ sec, sno, guest_start, range, ECLASS_MISC,
+ vex_arch
+ );
}
} else {
@@ -1327,7 +1847,7 @@
if (sec->tc == NULL)
continue;
anyDeleted |= delete_translations_in_sector(
- sec, guest_start, range );
+ sec, sno, guest_start, range, vex_arch );
}
}
@@ -1483,7 +2003,7 @@
for (j = 0; j < code_len; j++)
dstP[j] = srcP[j];
- invalidate_icache( dstP, code_len );
+ VG_(invalidate_icache)( dstP, code_len );
unredir_tt[i].inUse = True;
unredir_tt[i].vge = *vge;
@@ -1573,18 +2093,15 @@
sectors[i].ec2tte_used[j] = 0;
sectors[i].ec2tte[j] = NULL;
}
+ sectors[i].host_extents = NULL;
}
/* Initialise the sector_search_order hint table. */
for (i = 0; i < N_SECTORS; i++)
sector_search_order[i] = -1;
- /* Initialise the fast caches. If not profiling (the usual case),
- we have to explicitly invalidate the fastN cache as
- invalidateFastCache() won't do that for us. */
+ /* Initialise the fast cache. */
invalidateFastCache();
- if (VG_(clo_profile_flags) == 0)
- invalidateFastNCache();
/* and the unredir tt/tc */
init_unredir_tt_tc();
Modified: branches/TCHAIN/coregrind/m_dispatch/dispatch-amd64-linux.S (+127 -227)
===================================================================
--- branches/TCHAIN/coregrind/m_dispatch/dispatch-amd64-linux.S 2012-04-02 22:25:14 +01:00 (rev 12483)
+++ branches/TCHAIN/coregrind/m_dispatch/dispatch-amd64-linux.S 2012-04-02 22:56:03 +01:00 (rev 12484)
@@ -39,30 +39,36 @@
/*------------------------------------------------------------*/
/*--- ---*/
-/*--- The dispatch loop. VG_(run_innerloop) is used to ---*/
-/*--- run all translations except no-redir ones. ---*/
+/*--- The dispatch loop. VG_(disp_run_translations) is ---*/
+/*--- used to run all translations, ---*/
+/*--- including no-redir ones. ---*/
/*--- ---*/
/*------------------------------------------------------------*/
/*----------------------------------------------------*/
-/*--- Preamble (set everything up) ---*/
+/*--- Entry and preamble (set everything up) ---*/
/*----------------------------------------------------*/
/* signature:
-UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
+UWord VG_(disp_run_translations)( UWord* two_words,
+ void* guest_state,
+ Addr host_addr );
*/
+.text
+.globl VG_(disp_run_translations)
+.type VG_(disp_run_translations), @function
+VG_(disp_run_translations):
+ /* %rdi holds two_words */
+ /* %rsi holds guest_state */
+ /* %rdx holds host_addr */
-.text
-.globl VG_(run_innerloop)
-.type VG_(run_innerloop), @function
-VG_(run_innerloop):
- /* %rdi holds guest_state */
- /* %rsi holds do_profiling */
-
- /* ----- entry point to VG_(run_innerloop) ----- */
+ /* The preamble */
+
+ /* Save integer registers, since this is a pseudo-function. */
+ pushq %rax
pushq %rbx
pushq %rcx
- pushq %rdx
+ pushq %rdx
pushq %rsi
pushq %rbp
pushq %r8
@@ -73,21 +79,11 @@
pushq %r13
pushq %r14
pushq %r15
- pushq %rdi /* guest_state */
+ /* %rdi must be saved last */
+ pushq %rdi
- movq VG_(dispatch_ctr)@GOTPCREL(%rip), %r15
- movl (%r15), %r15d
- pushq %r15
+ /* Get the host CPU in the state expected by generated code. */
- /* 8(%rsp) holds cached copy of guest_state ptr */
- /* 0(%rsp) holds cached copy of VG_(dispatch_ctr) */
-
- /* Set up the guest state pointer */
- movq %rdi, %rbp
-
- /* fetch %RIP into %rax */
- movq OFFSET_amd64_RIP(%rbp), %rax
-
/* set host FPU control word to the default mode expected
by VEX-generated code. See comments in libvex.h for
more info. */
@@ -105,158 +101,37 @@
/* set dir flag to known value */
cld
- /* fall into main loop (the right one) */
- cmpq $0, %rsi
- je VG_(run_innerloop__dispatch_unassisted_unprofiled)
- jmp VG_(run_innerloop__dispatch_unassisted_profiled)
- /*NOTREACHED*/
+ /* Set up the guest state pointer */
+ movq %rsi, %rbp
-/*----------------------------------------------------*/
-/*--- NO-PROFILING (standard) dispatcher ---*/
-/*----------------------------------------------------*/
+ /* and jump into the code cache. Chained translations in
+ the code cache run, until for whatever reason, they can't
+ continue. When that happens, the translation in question
+ will jump (or call) to one of the continuation points
+ VG_(cp_...) below. */
+ jmpq *%rdx
+ /*NOTREACHED*/
-.align 16
-.global VG_(run_innerloop__dispatch_unassisted_unprofiled)
-VG_(run_innerloop__dispatch_unassisted_unprofiled):
- /* AT ENTRY: %rax is next guest addr, %rbp is the
- unmodified guest state ptr */
-
- /* save the jump address in the guest state */
- movq %rax, OFFSET_amd64_RIP(%rbp)
-
- /* Are we out of timeslice? If yes, defer to scheduler. */
- subl $1, 0(%rsp)
- jz counter_is_zero
-
- /* try a fast lookup in the translation cache */
- movabsq $VG_(tt_fast), %rcx
- movq %rax, %rbx /* next guest addr */
- andq $VG_TT_FAST_MASK, %rbx /* entry# */
- shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */
- movq 0(%rcx,%rbx,1), %r10 /* .guest */
- movq 8(%rcx,%rbx,1), %r11 /* .host */
- cmpq %rax, %r10
- jnz fast_lookup_failed
-
- /* Found a match. Jump to .host. */
- jmp *%r11
- ud2 /* persuade insn decoders not to speculate past here */
- /* generated code should run, then jump back to either
- VG_(run_innerloop__dispatch_unassisted_unprofiled)
- VG_(run_innerloop__dispatch_assisted_unprofiled). */
- /*NOTREACHED*/
-
-.align 16
-.global VG_(run_innerloop__dispatch_assisted_unprofiled)
-VG_(run_innerloop__dispatch_assisted_unprofiled):
- /* AT ENTRY: %rax is next guest addr, %rbp is the
- modified guest state ptr */
- /* We know the guest state pointer has been modified.
- So jump directly to gsp_changed. */
- jmp gsp_changed
- ud2
- /*NOTREACHED*/
-
/*----------------------------------------------------*/
-/*--- PROFILING dispatcher (can be much slower) ---*/
+/*--- Postamble and exit. ---*/
/*----------------------------------------------------*/
-.align 16
-.global VG_(run_innerloop__dispatch_unassisted_profiled)
-VG_(run_innerloop__dispatch_unassisted_profiled):
- /* AT ENTRY: %rax is next guest addr, %rbp is the
- unmodified guest state ptr */
-
- /* save the jump address in the guest state */
- movq %rax, OFFSET_amd64_RIP(%rbp)
-
- /* Are we out of timeslice? If yes, defer to scheduler. */
- subl $1, 0(%rsp)
- jz counter_is_zero
-
- /* try a fast lookup in the translation cache */
- movabsq $VG_(tt_fast), %rcx
- movq %rax, %rbx
- andq $VG_TT_FAST_MASK, %rbx /* entry# */
- shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */
- movq 0(%rcx,%rbx,1), %r10 /* .guest */
- movq 8(%rcx,%rbx,1), %r11 /* .host */
- cmpq %rax, %r10
- jnz fast_lookup_failed
-
- /* increment bb profile counter */
- movabsq $VG_(tt_fastN), %rdx
- shrq $1, %rbx /* entry# * sizeof(UInt*) */
- movq (%rdx,%rbx,1), %rdx
- addl $1, (%rdx)
-
- /* Found a match. Jump to .host. */
- jmp *%r11
- ud2 /* persuade insn decoders not to speculate past here */
- /* generated code should run, then jump back to either
- VG_(run_innerloop__dispatch_unassisted_profiled)
- VG_(run_innerloop__dispatch_assisted_profiled). */
- /*NOTREACHED*/
-
-.align 16
-.global VG_(run_innerloop__dispatch_assisted_profiled)
-VG_(run_innerloop__dispatch_assisted_profiled):
- /* AT ENTRY: %rax is next guest addr, %rbp is the
- modified guest state ptr */
-
- /* Well, we know the guest state pointer has been modified.
- So jump directly to gsp_changed. */
- jmp gsp_changed
- ud2
- /*NOTREACHED*/
-
-/*----------------------------------------------------*/
-/*--- exit points ---*/
-/*----------------------------------------------------*/
-
-gsp_changed:
- /* Someone messed with the gsp. Have to
- defer to scheduler to resolve this. dispatch ctr
- is not yet decremented, so no need to inc...
[truncated message content] |