|
From: <sv...@va...> - 2005-10-12 10:09:28
|
Author: sewardj
Date: 2005-10-12 11:09:23 +0100 (Wed, 12 Oct 2005)
New Revision: 4903
Log:
Redo the way cachegrind generates instrumentation code, so that it can
deal with any IR that happens to show up. This makes it work on ppc32
and should fix occasionally-reported bugs on x86/amd64 where it bombs
due to having to deal with multiple data references in a single
instruction.
The new scheme is based around the idea of a queue of memory events
which are outstanding, in the sense that no IR has yet been generated
to do the relevant helper calls. The presence of the queue --
currently 16 entries deep -- gives cachegrind more scope for combining
multiple memory references into a single helper function call. As a
result it runs 3%-5% faster than the previous version, on x86.
This commit also changes the type of the tool interface function
'tool_discard_basic_block_info' and clarifies its meaning. See
comments in include/pub_tool_tooliface.h.
Modified:
trunk/cachegrind/cg_main.c
trunk/coregrind/m_tooliface.c
trunk/coregrind/m_transtab.c
trunk/coregrind/pub_core_tooliface.h
trunk/include/pub_tool_tooliface.h
Modified: trunk/cachegrind/cg_main.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- trunk/cachegrind/cg_main.c 2005-10-12 10:00:56 UTC (rev 4902)
+++ trunk/cachegrind/cg_main.c 2005-10-12 10:09:23 UTC (rev 4903)
@@ -51,6 +51,9 @@
/*--- Constants ---*/
/*------------------------------------------------------------*/
=20
+/* Set to 1 for very verbose debugging */
+#define DEBUG_CG 0
+
#define MIN_LINE_SIZE 16
#define FILE_LEN 256
#define FN_LEN 256
@@ -131,7 +134,6 @@
struct _instr_info {
Addr instr_addr;
UChar instr_len;
- UChar data_size;
lineCC* parent; // parent line-CC
};
=20
@@ -292,8 +294,8 @@
static VG_REGPARM(1)
void log_1I_0D_cache_access(instr_info* n)
{
- //VG_(printf)("1I_0D : CCaddr=3D0x%x, iaddr=3D0x%x, isize=3D%u\n",
- // n, n->instr_addr, n->instr_len);
+ //VG_(printf)("1I_0D : CCaddr=3D0x%010lx, iaddr=3D0x%010lx, isize=3D=
%lu\n",
+ // n, n->instr_addr, n->instr_len);
VGP_PUSHCC(VgpCacheSimulate);
cachesim_I1_doref(n->instr_addr, n->instr_len,=20
&n->parent->Ir.m1, &n->parent->Ir.m2);
@@ -302,62 +304,214 @@
}
=20
static VG_REGPARM(2)
-void log_1I_1Dr_cache_access(instr_info* n, Addr data_addr)
+void log_2I_0D_cache_access(instr_info* n, instr_info* n2)
{
- //VG_(printf)("1I_1Dr: CCaddr=3D%p, iaddr=3D%p, isize=3D%u, daddr=3D%=
p, dsize=3D%u\n",
- // n, n->instr_addr, n->instr_len, data_addr, n->data_size=
);
+ //VG_(printf)("2I_0D : CC1addr=3D0x%010lx, i1addr=3D0x%010lx, i1size=3D=
%lu\n"
+ // " CC2addr=3D0x%010lx, i2addr=3D0x%010lx, i2size=3D=
%lu\n",
+ // n, n->instr_addr, n->instr_len,
+ // n2, n2->instr_addr, n2->instr_len);
VGP_PUSHCC(VgpCacheSimulate);
cachesim_I1_doref(n->instr_addr, n->instr_len,=20
&n->parent->Ir.m1, &n->parent->Ir.m2);
n->parent->Ir.a++;
+ cachesim_I1_doref(n2->instr_addr, n2->instr_len,=20
+ &n2->parent->Ir.m1, &n2->parent->Ir.m2);
+ n2->parent->Ir.a++;
+ VGP_POPCC(VgpCacheSimulate);
+}
=20
- cachesim_D1_doref(data_addr, n->data_size,=20
+static VG_REGPARM(3)
+void log_3I_0D_cache_access(instr_info* n, instr_info* n2, instr_info* n=
3)
+{
+ //VG_(printf)("3I_0D : CC1addr=3D0x%010lx, i1addr=3D0x%010lx, i1size=3D=
%lu\n"
+ // " CC2addr=3D0x%010lx, i2addr=3D0x%010lx, i2size=3D=
%lu\n"
+ // " CC3addr=3D0x%010lx, i3addr=3D0x%010lx, i3size=3D=
%lu\n",
+ // n, n->instr_addr, n->instr_len,
+ // n2, n2->instr_addr, n2->instr_len,
+ // n3, n3->instr_addr, n3->instr_len);
+ VGP_PUSHCC(VgpCacheSimulate);
+ cachesim_I1_doref(n->instr_addr, n->instr_len,=20
+ &n->parent->Ir.m1, &n->parent->Ir.m2);
+ n->parent->Ir.a++;
+ cachesim_I1_doref(n2->instr_addr, n2->instr_len,=20
+ &n2->parent->Ir.m1, &n2->parent->Ir.m2);
+ n2->parent->Ir.a++;
+ cachesim_I1_doref(n3->instr_addr, n3->instr_len,=20
+ &n3->parent->Ir.m1, &n3->parent->Ir.m2);
+ n3->parent->Ir.a++;
+ VGP_POPCC(VgpCacheSimulate);
+}
+
+static VG_REGPARM(3)
+void log_1I_1Dr_cache_access(instr_info* n, Addr data_addr, Word data_si=
ze)
+{
+ //VG_(printf)("1I_1Dr: CCaddr=3D0x%010lx, iaddr=3D0x%010lx, isize=3D=
%lu\n"
+ // " daddr=3D0x%010lx, dsiz=
e=3D%lu\n",
+ // n, n->instr_addr, n->instr_len, data_addr, data_size);
+ VGP_PUSHCC(VgpCacheSimulate);
+ cachesim_I1_doref(n->instr_addr, n->instr_len,=20
+ &n->parent->Ir.m1, &n->parent->Ir.m2);
+ n->parent->Ir.a++;
+
+ cachesim_D1_doref(data_addr, data_size,=20
&n->parent->Dr.m1, &n->parent->Dr.m2);
n->parent->Dr.a++;
VGP_POPCC(VgpCacheSimulate);
}
=20
-static VG_REGPARM(2)
-void log_1I_1Dw_cache_access(instr_info* n, Addr data_addr)
+static VG_REGPARM(3)
+void log_1I_1Dw_cache_access(instr_info* n, Addr data_addr, Word data_si=
ze)
{
- //VG_(printf)("1I_1Dw: CCaddr=3D%p, iaddr=3D%p, isize=3D%u, daddr=3D%=
p, dsize=3D%u\n",
- // n, n->instr_addr, n->instr_len, data_addr, n->data_size=
);
+ //VG_(printf)("1I_1Dw: CCaddr=3D0x%010lx, iaddr=3D0x%010lx, isize=3D=
%lu\n"
+ // " daddr=3D0x%010lx, dsiz=
e=3D%lu\n",
+ // n, n->instr_addr, n->instr_len, data_addr, data_size);
VGP_PUSHCC(VgpCacheSimulate);
cachesim_I1_doref(n->instr_addr, n->instr_len,=20
&n->parent->Ir.m1, &n->parent->Ir.m2);
n->parent->Ir.a++;
=20
- cachesim_D1_doref(data_addr, n->data_size,=20
+ cachesim_D1_doref(data_addr, data_size,=20
&n->parent->Dw.m1, &n->parent->Dw.m2);
n->parent->Dw.a++;
VGP_POPCC(VgpCacheSimulate);
}
=20
static VG_REGPARM(3)
-void log_1I_2D_cache_access(instr_info* n, Addr data_addr1, Addr data_ad=
dr2)
+void log_0I_1Dr_cache_access(instr_info* n, Addr data_addr, Word data_si=
ze)
{
- //VG_(printf)("1I_2D: CCaddr=3D%p, iaddr=3D%p, isize=3D%u, daddr1=3D%=
p, daddr2=3D%p, dsize=3D%u\n",
- // n, n->instr_addr, n->instr_len, data_addr1, data_addr2,=
n->data_size);
+ //VG_(printf)("0I_1Dr: CCaddr=3D0x%010lx, daddr=3D0x%010lx, dsize=3D=
%lu\n",
+ // n, data_addr, data_size);
VGP_PUSHCC(VgpCacheSimulate);
- cachesim_I1_doref(n->instr_addr, n->instr_len,
- &n->parent->Ir.m1, &n->parent->Ir.m2);
- n->parent->Ir.a++;
-
- cachesim_D1_doref(data_addr1, n->data_size,
+ cachesim_D1_doref(data_addr, data_size,=20
&n->parent->Dr.m1, &n->parent->Dr.m2);
n->parent->Dr.a++;
- cachesim_D1_doref(data_addr2, n->data_size,
+ VGP_POPCC(VgpCacheSimulate);
+}
+
+static VG_REGPARM(3)
+void log_0I_1Dw_cache_access(instr_info* n, Addr data_addr, Word data_si=
ze)
+{
+ //VG_(printf)("0I_1Dw: CCaddr=3D0x%010lx, daddr=3D0x%010lx, dsize=3D=
%lu\n",
+ // n, data_addr, data_size);
+ VGP_PUSHCC(VgpCacheSimulate);
+ cachesim_D1_doref(data_addr, data_size,=20
&n->parent->Dw.m1, &n->parent->Dw.m2);
n->parent->Dw.a++;
VGP_POPCC(VgpCacheSimulate);
}
=20
/*------------------------------------------------------------*/
-/*--- Instrumentation ---*/
+/*--- Instrumentation types and structures ---*/
/*------------------------------------------------------------*/
=20
+/* Maintain an ordered list of memory events which are outstanding, in
+ the sense that no IR has yet been generated to do the relevant
+ helper calls. The BB is scanned top to bottom and memory events
+ are added to the end of the list, merging with the most recent
+ notified event where possible (Dw immediately following Dr and
+ having the same size and EA can be merged).
+
+ This merging is done so that for architectures which have
+ load-op-store instructions (x86, amd64), the insn is treated as if
+ it makes just one memory reference (a modify), rather than two (a
+ read followed by a write at the same address).
+
+ At various points the list will need to be flushed, that is, IR
+ generated from it. That must happen before any possible exit from
+ the block (the end, or an IRStmt_Exit). Flushing also takes place
+ when there is no space to add a new event.
+
+ If we require the simulation statistics to be up to date with
+ respect to possible memory exceptions, then the list would have to
+ be flushed before each memory reference. That would however lose
+ performance by inhibiting event-merging during flushing.
+
+ Flushing the list consists of walking it start to end and emitting
+ instrumentation IR for each event, in the order in which they
+ appear. It may be possible to emit a single call for two adjacent
+ events in order to reduce the number of helper function calls made.
+ For example, it could well be profitable to handle two adjacent Ir
+ events with a single helper call. */
+
+typedef
+ IRExpr=20
+ IRAtom;
+
+typedef=20
+ enum { Event_Ir=3D0, Event_Dr=3D1, Event_Dw=3D2, Event_Dm=3D3 }
+ EventKind;
+
+typedef
+ struct {
+ EventKind ekind;
+ Int size; /* ALL */
+ Addr64 iaddr; /* ALL. For Dr/Dw/Dm is & of parent insn. */
+ IRAtom* dataEA; /* Dr/Dw/Dm only */ /* IR ATOM ONLY */
+ }
+ Event;
+
+/* Up to this many unnotified events are allowed. Number is
+ arbitrary. Larger numbers allow more event merging to occur, but
+ potentially induce more spilling due to extending live ranges of
+ address temporaries. */
+#define N_EVENTS 16
+
+
+/* A struct which holds all the running state during instrumentation.
+ Mostly to avoid passing loads of parameters everywhere. */
+typedef
+ struct {
+ /* The current outstanding-memory-event list. */
+ Event events[N_EVENTS];
+ Int events_used;
+
+ /* The array of instr_info bins for the BB. */
+ BB_info* bbInfo;
+
+ /* Number instr_info bins 'used' so far. */
+ Int bbInfo_i;
+
+ /* Not sure what this is for (jrs 20051009) */
+ Bool bbSeenBefore;
+
+ /* The output BB being constructed. */
+ IRBB* bbOut;
+ }
+ CgState;
+
+
+static Int index3 ( EventKind k1, EventKind k2, EventKind k3 )
+{
+ Int i1 =3D k1;
+ Int i2 =3D k2;
+ Int i3 =3D k3;
+ Int r;
+ tl_assert(i1 >=3D 0 && i1 < 4);
+ tl_assert(i2 >=3D 0 && i2 < 4);
+ tl_assert(i3 >=3D 0 && i3 < 4);
+ r =3D 16*i1 + 4*i2 + i3;
+ tl_assert(r >=3D 0 && r < 64);
+ return r;
+}
+
+static void show3 ( Int idx )
+{
+ HChar* names =3D "IRWM";
+ Int i1 =3D (idx >> 4) & 3;
+ Int i2 =3D (idx >> 2) & 3;
+ Int i3 =3D idx & 3;
+ VG_(printf)("%c%c%c", names[i1], names[i2], names[i3]);
+}
+
+static Int trigrams[64];
+
+
+/*------------------------------------------------------------*/
+/*--- Instrumentation main ---*/
+/*------------------------------------------------------------*/
+
static
-BB_info* get_BB_info(IRBB* bbIn, Addr origAddr, Bool* bbSeenBefore)
+BB_info* get_BB_info(IRBB* bbIn, Addr origAddr, /*OUT*/Bool* bbSeenBefor=
e)
{
Int i, n_instrs;
IRStmt* st;
@@ -389,143 +543,14 @@
return bbInfo;
}
=20
-static=20
-Bool handleOneStatement(IRTypeEnv* tyenv, IRBB* bbOut, IRStmt* st, IRStm=
t* st2,
- Addr* instrAddr, UInt* instrLen,
- IRExpr** loadAddrExpr, IRExpr** storeAddrExpr,
- UInt* dataSize)
-{
- tl_assert(isFlatIRStmt(st));
=20
- switch (st->tag) {
- case Ist_NoOp:
- case Ist_AbiHint:
- case Ist_Put:
- case Ist_PutI:
- case Ist_MFence:
- break;
-
- case Ist_Exit: {
- // This is a conditional jump. Most of the time, we want to add t=
he
- // instrumentation before it, to ensure it gets executed. Eg, (1)=
if
- // this conditional jump is just before an IMark:
- //
- // t108 =3D Not1(t107)
- // [add instrumentation here]
- // if (t108) goto {Boring} 0x3A96637D:I32
- // ------ IMark(0x3A966370, 7) ------
- //
- // or (2) if this conditional jump is the last thing before the
- // block-ending unconditional jump:
- //
- // t111 =3D Not1(t110)
- // [add instrumentation here]
- // if (t111) goto {Boring} 0x3A96637D:I32
- // goto {Boring} 0x3A966370:I32
- //
- // One case (3) where we want the instrumentation after the condit=
ional
- // jump is when the conditional jump is for an x86 REP instruction=
:
- //
- // ------ IMark(0x3A967F13, 2) ------
- // t1 =3D GET:I32(4)
- // t6 =3D CmpEQ32(t1,0x0:I32)=20
- // if (t6) goto {Boring} 0x3A967F15:I32 # ignore this cond jm=
p
- // t7 =3D Sub32(t1,0x1:I32)
- // PUT(4) =3D t7
- // ...
- // t56 =3D Not1(t55)
- // [add instrumentation here]
- // if (t56) goto {Boring} 0x3A967F15:I32
- //
- // Therefore, we return true if the next statement is an IMark, or=
if
- // there is no next statement (which matches case (2), as the fina=
l
- // unconditional jump is not represented in the IRStmt list).
- //
- // Note that this approach won't do in the long run for supporting
- // PPC, but it's good enough for x86/AMD64 for the 3.0.X series.
- if (NULL =3D=3D st2 || Ist_IMark =3D=3D st2->tag)
- return True;
- else
- return False;
- }
-
- case Ist_IMark:
- /* st->Ist.IMark.addr is a 64-bit int. ULong_to_Ptr casts this
- to the host's native pointer type; if that is 32 bits then it
- discards the upper 32 bits. If we are cachegrinding on a
- 32-bit host then we are also ensured that the guest word size
- is 32 bits, due to the assertion in cg_instrument that the
- host and guest word sizes must be the same. Hence
- st->Ist.IMark.addr will have been derived from a 32-bit guest
- code address and truncation of it is safe. I believe this
- assignment should be correct for both 32- and 64-bit
- machines. */
- *instrAddr =3D (Addr)ULong_to_Ptr(st->Ist.IMark.addr);
- *instrLen =3D st->Ist.IMark.len;
- break;
-
- case Ist_Tmp: {
- IRExpr* data =3D st->Ist.Tmp.data;
- if (data->tag =3D=3D Iex_Load) {
- IRExpr* aexpr =3D data->Iex.Load.addr;
- tl_assert( isIRAtom(aexpr) );
- // Note also, endianness info is ignored. I guess that's not
- // interesting.
- // XXX: repe cmpsb does two loads... the first one is ignored h=
ere!
- //tl_assert( NULL =3D=3D *loadAddrExpr ); // XXX: ???
- *loadAddrExpr =3D aexpr;
- *dataSize =3D sizeofIRType(data->Iex.Load.ty);
- }
- break;
- }
- =20
- case Ist_Store: {
- IRExpr* data =3D st->Ist.Store.data;
- IRExpr* aexpr =3D st->Ist.Store.addr;
- tl_assert( isIRAtom(aexpr) );
- tl_assert( NULL =3D=3D *storeAddrExpr ); // XXX: ???
- *storeAddrExpr =3D aexpr;
- *dataSize =3D sizeofIRType(typeOfIRExpr(tyenv, data));
- break;
- }
- =20
- case Ist_Dirty: {
- IRDirty* d =3D st->Ist.Dirty.details;
- if (d->mFx !=3D Ifx_None) {
- /* This dirty helper accesses memory. Collect the
- details. */
- tl_assert(d->mAddr !=3D NULL);
- tl_assert(d->mSize !=3D 0);
- *dataSize =3D d->mSize;
- if (d->mFx =3D=3D Ifx_Read || d->mFx =3D=3D Ifx_Modify)
- *loadAddrExpr =3D d->mAddr;
- if (d->mFx =3D=3D Ifx_Write || d->mFx =3D=3D Ifx_Modify)
- *storeAddrExpr =3D d->mAddr;
- } else {
- tl_assert(d->mAddr =3D=3D NULL);
- tl_assert(d->mSize =3D=3D 0);
- }
- break;
- }
-
- default:
- VG_(printf)("\n");
- ppIRStmt(st);
- VG_(printf)("\n");
- VG_(tool_panic)("Cachegrind: unhandled IRStmt");
- }
-
- return False;
-}
-
static
-void do_details( instr_info* n, Bool bbSeenBefore,
- Addr instr_addr, Int instr_len, Int data_size )
+void init_instr_info( instr_info* n, Bool bbSeenBefore,
+ Addr instr_addr, Int instr_len )
{
if (bbSeenBefore) {
tl_assert( n->instr_addr =3D=3D instr_addr );
tl_assert( n->instr_len =3D=3D instr_len );
- tl_assert( n->data_size =3D=3D data_size );
// Don't check that (n->parent =3D=3D parent)... it's conceivable =
that
// the debug info might change; the other asserts should be enoug=
h to
// detect anything strange.
@@ -533,202 +558,451 @@
lineCC* parent =3D get_lineCC(instr_addr);
n->instr_addr =3D instr_addr;
n->instr_len =3D instr_len;
- n->data_size =3D data_size;
n->parent =3D parent;
}
}
=20
-static Bool loadStoreAddrsMatch(IRExpr* loadAddrExpr, IRExpr* storeAddrE=
xpr)
+static void showEvent ( Event* ev )
{
- // I'm assuming that for 'modify' instructions, that Vex always makes
- // the loadAddrExpr and storeAddrExpr be of the same type, ie. both Tm=
p
- // expressions, or both Const expressions.
- tl_assert(isIRAtom(loadAddrExpr));
- tl_assert(isIRAtom(storeAddrExpr));
- return eqIRAtom(loadAddrExpr, storeAddrExpr);
+ switch (ev->ekind) {
+ case Event_Ir:=20
+ VG_(printf)("Ir %d 0x%llx\n", ev->size, ev->iaddr);
+ break;
+ case Event_Dr:
+ VG_(printf)("Dr %d 0x%llx EA=3D", ev->size, ev->iaddr);
+ ppIRExpr(ev->dataEA);=20
+ VG_(printf)("\n");
+ break;
+ case Event_Dw:
+ VG_(printf)("Dw %d 0x%llx EA=3D", ev->size, ev->iaddr);
+ ppIRExpr(ev->dataEA);=20
+ VG_(printf)("\n");
+ break;
+ case Event_Dm:
+ VG_(printf)("Dm %d 0x%llx EA=3D", ev->size, ev->iaddr);
+ ppIRExpr(ev->dataEA);=20
+ VG_(printf)("\n");
+ break;
+ default:=20
+ tl_assert(0);
+ break;
+ }
}
=20
-// Instrumentation for the end of each original instruction.
-static
-void instrumentInstr(IRBB* bbOut, instr_info* i_node, Bool bbSeenBefore,
- UInt instrAddr, UInt instrLen, UInt dataSize,
- IRExpr* loadAddrExpr, IRExpr* storeAddrExpr)
+/* Reserve instr_info for the first mention of a new insn. */
+
+static instr_info* reserve_instr_info ( CgState* cgs )
{
- IRDirty* di;
- IRExpr *arg1, *arg2, *arg3, **argv;
- Int argc;
- Char* helperName;
- void* helperAddr;
- IRType wordTy;
+ instr_info* i_node;
+ tl_assert(cgs->bbInfo_i >=3D 0);
+ tl_assert(cgs->bbInfo_i < cgs->bbInfo->n_instrs);
+ i_node =3D &cgs->bbInfo->instrs[ cgs->bbInfo_i ];
+ cgs->bbInfo_i++;
+ return i_node;
+}
=20
- // Stay sane ...
- tl_assert(sizeof(HWord) =3D=3D sizeof(void*));
- if (sizeof(HWord) =3D=3D 4) {
- wordTy =3D Ity_I32;
- } else
- if (sizeof(HWord) =3D=3D 8) {
- wordTy =3D Ity_I64;
- } else {
- VG_(tool_panic)("instrumentInstr: strange word size");
- }
=20
- if (loadAddrExpr)=20
- tl_assert(wordTy =3D=3D typeOfIRExpr(bbOut->tyenv, loadAddrExpr));
- if (storeAddrExpr)=20
- tl_assert(wordTy =3D=3D typeOfIRExpr(bbOut->tyenv, storeAddrExpr))=
;
+/* Find the most recently allocated instr_info. */
=20
- // Large (eg. 28B, 108B, 512B on x86) data-sized instructions will be
- // done inaccurately, but they're very rare and this avoids errors fr=
om
- // hitting more than two cache lines in the simulation.
- if (dataSize > MIN_LINE_SIZE) dataSize =3D MIN_LINE_SIZE;
+static instr_info* find_most_recent_instr_info ( CgState* cgs )
+{
+ tl_assert(cgs->bbInfo_i >=3D 0);
+ tl_assert(cgs->bbInfo_i <=3D cgs->bbInfo->n_instrs);
+ if (cgs->bbInfo_i =3D=3D 0)
+ return NULL;
+ else
+ return &cgs->bbInfo->instrs[ cgs->bbInfo_i - 1 ];
+}
=20
- // Setup 1st arg: instr_info node's address
- // Believed to be 64-bit clean
- do_details(i_node, bbSeenBefore, instrAddr, instrLen, dataSize );
- arg1 =3D mkIRExpr_HWord( (HWord)i_node );
=20
- if (!loadAddrExpr && !storeAddrExpr) {
- // no load/store
- tl_assert(0 =3D=3D dataSize);
- helperName =3D "log_1I_0D_cache_access";
- helperAddr =3D &log_1I_0D_cache_access;
- argc =3D 1;
- argv =3D mkIRExprVec_1(arg1);
+/* Generate code for all outstanding memory events, and mark the queue
+ empty. Code is generated into cgs->bbOut, and this activity
+ 'consumes' slots in cgs->bbInfo. */
=20
- } else if (loadAddrExpr && !storeAddrExpr) {
- // load
- tl_assert( isIRAtom(loadAddrExpr) );
- helperName =3D "log_1I_1Dr_cache_access";
- helperAddr =3D &log_1I_1Dr_cache_access;
- argc =3D 2;
- arg2 =3D loadAddrExpr;
- argv =3D mkIRExprVec_2(arg1, arg2);
+static void flushEvents ( CgState* cgs )
+{
+ Int i, regparms;
+ Char* helperName;
+ void* helperAddr;
+ IRExpr** argv;
+ IRExpr* i_node_expr;
+ IRExpr* i_node2_expr;
+ IRExpr* i_node3_expr;
+ IRDirty* di;
+ instr_info* i_node;
+ instr_info* i_node2;
+ instr_info* i_node3;
=20
- } else if (!loadAddrExpr && storeAddrExpr) {
- // store
- tl_assert( isIRAtom(storeAddrExpr) );
- helperName =3D "log_1I_1Dw_cache_access";
- helperAddr =3D &log_1I_1Dw_cache_access;
- argc =3D 2;
- arg2 =3D storeAddrExpr;
- argv =3D mkIRExprVec_2(arg1, arg2);
-=20
- } else {
- tl_assert( loadAddrExpr && storeAddrExpr );
- tl_assert( isIRAtom(loadAddrExpr) );
- tl_assert( isIRAtom(storeAddrExpr) );
+ for (i =3D 0; i < cgs->events_used-2; i++)
+ trigrams [ index3( cgs->events[i].ekind, cgs->events[i+1].ekind,cgs=
->events[i+2].ekind ) ]++;
=20
- if ( loadStoreAddrsMatch(loadAddrExpr, storeAddrExpr) ) {
- // modify
- helperName =3D "log_1I_1Dr_cache_access";
- helperAddr =3D &log_1I_1Dr_cache_access;
- argc =3D 2;
- arg2 =3D loadAddrExpr;
- argv =3D mkIRExprVec_2(arg1, arg2);
+ i =3D 0;
+ while (i < cgs->events_used) {
=20
+ helperName =3D NULL;
+ helperAddr =3D NULL;
+ argv =3D NULL;
+ regparms =3D 0;
+
+ /* generate IR to notify event i and possibly the ones
+ immediately following it. */
+ tl_assert(i >=3D 0 && i < cgs->events_used);
+ if (DEBUG_CG) {
+ VG_(printf)(" flush ");=20
+ showEvent( &cgs->events[i] );
+ }
+
+ /* For any event we find the relevant instr_info. The following
+ assumes that Event_Ir is the first event to refer to any
+ specific insn, and so a new entry in the cgs->bbInfo->instrs
+ is allocated. All other events (Dr,Dw,Dm) must refer to the
+ most recently encountered IMark and so we use the
+ most-recently allocated instrs[] entry, which must exist. */
+
+ if (cgs->events[i].ekind =3D=3D Event_Ir) {
+ /* allocate an instr_info and fill in its addr/size. */
+ i_node =3D reserve_instr_info( cgs );
+ tl_assert(i_node);
+ init_instr_info( i_node, cgs->bbSeenBefore,
+ (Addr)cgs->events[i].iaddr, /* i addr */
+ cgs->events[i].size /* i size */);
} else {
- // load/store
- helperName =3D "log_1I_2D_cache_access";
- helperAddr =3D &log_1I_2D_cache_access;
- argc =3D 3;
- arg2 =3D loadAddrExpr;
- arg3 =3D storeAddrExpr;
- argv =3D mkIRExprVec_3(arg1, arg2, arg3);
+ /* use the most-recently allocated i_node but don't mess with
+ its internals */
+ i_node =3D find_most_recent_instr_info( cgs );
+ /* it must actually exist */
+ tl_assert(i_node);
+ /* it must match the declared parent instruction of this
+ event. */
+ tl_assert(i_node->instr_addr =3D=3D cgs->events[i].iaddr);
}
+
+ i_node_expr =3D mkIRExpr_HWord( (HWord)i_node );
+
+ /* Decide on helper fn to call and args to pass it, and advance
+ i appropriately. */
+ switch (cgs->events[i].ekind) {
+ case Event_Ir:
+ /* Merge with a following Dr/Dm if it is from this insn. */
+ if (i < cgs->events_used-1=20
+ && cgs->events[i+1].iaddr =3D=3D cgs->events[i].iaddr
+ && (cgs->events[i+1].ekind =3D=3D Event_Dr
+ || cgs->events[i+1].ekind =3D=3D Event_Dm)) {
+ helperName =3D "log_1I_1Dr_cache_access";
+ helperAddr =3D &log_1I_1Dr_cache_access;
+ argv =3D mkIRExprVec_3( i_node_expr,
+ cgs->events[i+1].dataEA,
+ mkIRExpr_HWord( cgs->events[i+1].si=
ze ) );
+ regparms =3D 3;
+ i +=3D 2;
+ }
+ /* Merge with a following Dw if it is from this insn. */
+ else
+ if (i < cgs->events_used-1=20
+ && cgs->events[i+1].iaddr =3D=3D cgs->events[i].iaddr
+ && cgs->events[i+1].ekind =3D=3D Event_Dw) {
+ helperName =3D "log_1I_1Dw_cache_access";
+ helperAddr =3D &log_1I_1Dw_cache_access;
+ argv =3D mkIRExprVec_3( i_node_expr,
+ cgs->events[i+1].dataEA,
+ mkIRExpr_HWord( cgs->events[i+1].si=
ze ) );
+ regparms =3D 3;
+ i +=3D 2;
+ }
+ /* Merge with two following Irs if possible. */
+ else
+ if (i < cgs->events_used-2=20
+ && cgs->events[i+1].ekind =3D=3D Event_Ir
+ && cgs->events[i+2].ekind =3D=3D Event_Ir) {
+ helperName =3D "log_3I_0D_cache_access";
+ helperAddr =3D &log_3I_0D_cache_access;
+
+ i_node2 =3D reserve_instr_info( cgs );
+ tl_assert(i_node2);
+ init_instr_info( i_node2, cgs->bbSeenBefore,
+ (Addr)cgs->events[i+1].iaddr, /* i addr =
*/
+ cgs->events[i+1].size /* i size */);
+ i_node2_expr =3D mkIRExpr_HWord( (HWord)i_node2 );
+
+ i_node3 =3D reserve_instr_info( cgs );
+ tl_assert(i_node3);
+ init_instr_info( i_node3, cgs->bbSeenBefore,
+ (Addr)cgs->events[i+2].iaddr, /* i addr =
*/
+ cgs->events[i+2].size /* i size */);
+ i_node3_expr =3D mkIRExpr_HWord( (HWord)i_node3 );
+
+ argv =3D mkIRExprVec_3( i_node_expr, i_node2_expr, i_node=
3_expr );
+ regparms =3D 3;
+ i +=3D 3;
+ }
+ /* Merge with a following Ir if possible. */
+ else
+ if (i < cgs->events_used-1=20
+ && cgs->events[i+1].ekind =3D=3D Event_Ir) {
+ helperName =3D "log_2I_0D_cache_access";
+ helperAddr =3D &log_2I_0D_cache_access;
+ i_node2 =3D reserve_instr_info( cgs );
+ tl_assert(i_node2);
+ init_instr_info( i_node2, cgs->bbSeenBefore,
+ (Addr)cgs->events[i+1].iaddr, /* i addr =
*/
+ cgs->events[i+1].size /* i size */);
+ i_node2_expr =3D mkIRExpr_HWord( (HWord)i_node2 );
+ argv =3D mkIRExprVec_2( i_node_expr, i_node2_expr );
+ regparms =3D 2;
+ i +=3D 2;
+ }
+ /* No merging possible; emit as-is. */
+ else {
+ helperName =3D "log_1I_0D_cache_access";
+ helperAddr =3D &log_1I_0D_cache_access;
+ argv =3D mkIRExprVec_1( i_node_expr );
+ regparms =3D 1;
+ i++;
+ }
+ break;
+ case Event_Dr:
+ case Event_Dm:
+ helperName =3D "log_0I_1Dr_cache_access";
+ helperAddr =3D &log_0I_1Dr_cache_access;
+ argv =3D mkIRExprVec_3( i_node_expr,=20
+ cgs->events[i].dataEA,=20
+ mkIRExpr_HWord( cgs->events[i].size ) =
);
+ regparms =3D 3;
+ i++;
+ break;
+ case Event_Dw:
+ helperName =3D "log_0I_1Dw_cache_access";
+ helperAddr =3D &log_0I_1Dw_cache_access;
+ argv =3D mkIRExprVec_3( i_node_expr,
+ cgs->events[i].dataEA,=20
+ mkIRExpr_HWord( cgs->events[i].size ) =
);
+ regparms =3D 3;
+ i++;
+ break;
+ default:
+ tl_assert(0);
+ }
+
+ /* Add the helper. */
+ tl_assert(helperName);
+ tl_assert(helperAddr);
+ tl_assert(argv);
+ di =3D unsafeIRDirty_0_N( regparms, helperName, helperAddr, argv);
+ addStmtToIRBB( cgs->bbOut, IRStmt_Dirty(di) );
}
=20
- // Add call to the instrumentation function
- di =3D unsafeIRDirty_0_N( argc, helperName, helperAddr, argv);
- addStmtToIRBB( bbOut, IRStmt_Dirty(di) );
+ cgs->events_used =3D 0;
}
=20
+
+static void addEvent_Ir ( CgState* cgs, Int size, Addr64 iaddr )
+{
+ Event* evt;
+ tl_assert(size >=3D 0 && size <=3D MIN_LINE_SIZE);
+ if (cgs->events_used =3D=3D N_EVENTS)
+ flushEvents(cgs);
+ tl_assert(cgs->events_used >=3D 0 && cgs->events_used < N_EVENTS);
+ /* If vex fails to decode an insn, the size will be zero, but that
+ can't really be true -- the cpu couldn't have determined the
+ insn was undecodable without looking at it. Hence: */
+ if (size =3D=3D 0)
+ size =3D 1;
+ evt =3D &cgs->events[cgs->events_used];
+ evt->ekind =3D Event_Ir;
+ evt->size =3D size;
+ evt->iaddr =3D iaddr;
+ evt->dataEA =3D NULL; /*paranoia*/
+ cgs->events_used++;
+}
+
+static void addEvent_Dr ( CgState* cgs, Int size, Addr64 iaddr, IRAtom* =
ea )
+{
+ Event* evt;
+ tl_assert(isIRAtom(ea));
+ tl_assert(size >=3D 1 && size <=3D MIN_LINE_SIZE);
+ if (cgs->events_used =3D=3D N_EVENTS)
+ flushEvents(cgs);
+ tl_assert(cgs->events_used >=3D 0 && cgs->events_used < N_EVENTS);
+ evt =3D &cgs->events[cgs->events_used];
+ evt->ekind =3D Event_Dr;
+ evt->size =3D size;
+ evt->iaddr =3D iaddr;
+ evt->dataEA =3D ea;
+ cgs->events_used++;
+}
+
+static void addEvent_Dw ( CgState* cgs, Int size, Addr64 iaddr, IRAtom* =
ea )
+{
+ tl_assert(isIRAtom(ea));
+ tl_assert(size >=3D 1 && size <=3D MIN_LINE_SIZE);
+
+ /* Is it possible to merge this write into an immediately preceding
+ read? */
+ if (cgs->events_used > 0
+ && cgs->events[cgs->events_used-1].ekind =3D=3D Event_Dr
+ && cgs->events[cgs->events_used-1].size =3D=3D size
+ && cgs->events[cgs->events_used-1].iaddr =3D=3D iaddr
+ && eqIRAtom(cgs->events[cgs->events_used-1].dataEA, ea)) {
+ cgs->events[cgs->events_used-1].ekind =3D Event_Dm;
+ return;
+ }
+
+ /* No. Add as normal. */
+ if (cgs->events_used =3D=3D N_EVENTS)
+ flushEvents(cgs);
+ tl_assert(cgs->events_used >=3D 0 && cgs->events_used < N_EVENTS);
+ cgs->events[cgs->events_used].ekind =3D Event_Dw;
+ cgs->events[cgs->events_used].size =3D size;
+ cgs->events[cgs->events_used].iaddr =3D iaddr;
+ cgs->events[cgs->events_used].dataEA =3D ea;
+ cgs->events_used++;
+}
+
+////////////////////////////////////////////////////////////
+
+
static IRBB* cg_instrument ( IRBB* bbIn, VexGuestLayout* layout,=20
IRType gWordTy, IRType hWordTy )
{
- Int i, dataSize =3D 0, bbInfo_i;
- IRBB* bbOut;
- IRStmt* st;
- BB_info* bbInfo;
- Bool bbSeenBefore =3D False, addedInstrumentation, addInstNow;
- Addr instrAddr, origAddr;
- UInt instrLen;
- IRExpr *loadAddrExpr, *storeAddrExpr;
+ Int i;
+ IRStmt* st;
+ Addr64 cia; /* address of current insn */
+ CgState cgs;
+ IRTypeEnv* tyenv =3D bbIn->tyenv;
=20
+
if (gWordTy !=3D hWordTy) {
/* We don't currently support this case. */
VG_(tool_panic)("host/guest word size mismatch");
}
=20
/* Set up BB */
- bbOut =3D emptyIRBB();
- bbOut->tyenv =3D dopyIRTypeEnv(bbIn->tyenv);
- bbOut->next =3D dopyIRExpr(bbIn->next);
- bbOut->jumpkind =3D bbIn->jumpkind;
+ cgs.bbOut =3D emptyIRBB();
+ cgs.bbOut->tyenv =3D dopyIRTypeEnv(tyenv);
=20
- // Get the first statement, and origAddr from it
+ // Get the first statement, and initial cia from it
i =3D 0;
tl_assert(bbIn->stmts_used > 0);
st =3D bbIn->stmts[0];
tl_assert(Ist_IMark =3D=3D st->tag);
- origAddr =3D (Addr)st->Ist.IMark.addr;
- tl_assert(origAddr =3D=3D st->Ist.IMark.addr); // XXX: check no over=
flow
+ cia =3D st->Ist.IMark.addr;
=20
- // Get block info
- bbInfo =3D get_BB_info(bbIn, origAddr, &bbSeenBefore);
- bbInfo_i =3D 0;
+ // Set up running state and get block info
+ cgs.events_used =3D 0;
+ cgs.bbInfo =3D get_BB_info(bbIn, (Addr)cia, &cgs.bbSeenBefore);
+ cgs.bbInfo_i =3D 0;
=20
- do {
- // We should be at an IMark statement
- tl_assert(Ist_IMark =3D=3D st->tag);
+ if (DEBUG_CG)
+ VG_(printf)("\n\n---------- cg_instrument ----------\n");
=20
- // Reset stuff for this original instruction
- loadAddrExpr =3D storeAddrExpr =3D NULL;
- dataSize =3D 0;
- addedInstrumentation =3D False;
+ // Traverse the block, adding events and flushing as necessary.
+ for (i =3D 0; i < bbIn->stmts_used; i++) {
=20
- // Process all the statements for this original instruction (ie. u=
ntil
- // the next IMark statement, or the end of the block)
- do {
- IRStmt* st2 =3D ( i+1 < bbIn->stmts_used ? bbIn->stmts[i+1] : N=
ULL );
+ st =3D bbIn->stmts[i];
+ tl_assert(isFlatIRStmt(st));
=20
- addInstNow =3D handleOneStatement(bbIn->tyenv, bbOut, st, st2,
- &instrAddr, &instrLen, &loadAdd=
rExpr,
- &storeAddrExpr, &dataSize);
- if (addInstNow) {
- tl_assert(!addedInstrumentation);
- addedInstrumentation =3D True;
- =20
- // Nb: instrLen will be zero if Vex failed to decode it.
- // Also Client requests can appear to be very large (eg. 18
- // bytes on x86) because they are really multiple instructio=
ns.
- tl_assert( 0 =3D=3D instrLen ||
- bbIn->jumpkind =3D=3D Ijk_ClientReq ||
- (instrLen >=3D VG_MIN_INSTR_SZB &&=20
- instrLen <=3D VG_MAX_INSTR_SZB) );
+ switch (st->tag) {
+ case Ist_NoOp:
+ case Ist_AbiHint:
+ case Ist_Put:
+ case Ist_PutI:
+ case Ist_MFence:
+ break;
=20
- // Add instrumentation before this statement.
- instrumentInstr(bbOut, &bbInfo->instrs[ bbInfo_i ], bbSeenBe=
fore,
- instrAddr, instrLen, dataSize, loadAddrExpr, store=
AddrExpr);
+ case Ist_IMark:
+ cia =3D st->Ist.IMark.addr;
+ addEvent_Ir( &cgs, st->Ist.IMark.len, cia );
+ break;
+
+ case Ist_Tmp: {
+ IRExpr* data =3D st->Ist.Tmp.data;
+ if (data->tag =3D=3D Iex_Load) {
+ IRExpr* aexpr =3D data->Iex.Load.addr;
+ tl_assert( isIRAtom(aexpr) );
+ // Note also, endianness info is ignored. I guess
+ // that's not interesting.
+ addEvent_Dr( &cgs, sizeofIRType(data->Iex.Load.ty),=20
+ cia, aexpr );
+ }
+ break;
}
=20
- addStmtToIRBB( bbOut, st );
+ case Ist_Store: {
+ IRExpr* data =3D st->Ist.Store.data;
+ IRExpr* aexpr =3D st->Ist.Store.addr;
+ tl_assert( isIRAtom(aexpr) );
+ addEvent_Dw( &cgs,=20
+ sizeofIRType(typeOfIRExpr(tyenv, data)),=20
+ cia, aexpr );
+ break;
+ }
=20
- i++;
- st =3D st2;
- }=20
- while (st && Ist_IMark !=3D st->tag);
+ case Ist_Dirty: {
+ Int dataSize;
+ IRDirty* d =3D st->Ist.Dirty.details;
+ if (d->mFx !=3D Ifx_None) {
+ /* This dirty helper accesses memory. Collect the
+ details. */
+ tl_assert(d->mAddr !=3D NULL);
+ tl_assert(d->mSize !=3D 0);
+ dataSize =3D d->mSize;
+ // Large (eg. 28B, 108B, 512B on x86) data-sized
+ // instructions will be done inaccurately, but they're
+ // very rare and this avoids errors from hitting more
+ // than two cache lines in the simulation.
+ if (dataSize > MIN_LINE_SIZE)
+ dataSize =3D MIN_LINE_SIZE;
+ if (d->mFx =3D=3D Ifx_Read || d->mFx =3D=3D Ifx_Modify)
+ addEvent_Dr( &cgs, dataSize, cia, d->mAddr );
+ if (d->mFx =3D=3D Ifx_Write || d->mFx =3D=3D Ifx_Modify)
+ addEvent_Dw( &cgs, dataSize, cia, d->mAddr );
+ } else {
+ tl_assert(d->mAddr =3D=3D NULL);
+ tl_assert(d->mSize =3D=3D 0);
+ }
+ break;
+ }
=20
- if (!addedInstrumentation) {
- // Add instrumentation now, after all the instruction's stateme=
nts.
- instrumentInstr(bbOut, &bbInfo->instrs[ bbInfo_i ], bbSeenBefor=
e,
- instrAddr, instrLen, dataSize, loadAddrExpr, st=
oreAddrExpr);
+ case Ist_Exit:
+ /* We may never reach the next statement, so need to flush
+ all outstanding transactions now. */
+ flushEvents( &cgs );
+ break;
+
+ default:
+ tl_assert(0);
+ break;
}
=20
- bbInfo_i++;
+ /* Copy the original statement */
+ addStmtToIRBB( cgs.bbOut, st );
+
+ if (DEBUG_CG) {
+ ppIRStmt(st);
+ VG_(printf)("\n");
+ }
}
- while (st);
=20
- return bbOut;
+ /* At the end of the bb. Flush outstandings. */
+ tl_assert(isIRAtom(bbIn->next));
+ flushEvents( &cgs );
+
+ /* copy where-next stuff. */
+ cgs.bbOut->next =3D dopyIRExpr(bbIn->next);
+ cgs.bbOut->jumpkind =3D bbIn->jumpkind;
+
+ /* done. stay sane ... */
+ tl_assert(cgs.bbInfo_i =3D=3D cgs.bbInfo->n_instrs);
+
+ if (DEBUG_CG) {
+ VG_(printf)( "goto {");
+ ppIRJumpKind(bbIn->jumpkind);
+ VG_(printf)( "} ");
+ ppIRExpr( bbIn->next );
+ VG_(printf)( "}\n");
+ }
+
+ return cgs.bbOut;
}
=20
/*------------------------------------------------------------*/
@@ -1077,6 +1351,12 @@
VG_(message)(Vg_DebugMsg, "BBs Retranslated: %d", BB_retranslations);
}
VGP_POPCC(VgpCacheResults);
+
+ if (0) { Int i;
+ for (i =3D 0; i < 64; i++) {
+ show3(i); VG_(printf)(" %5d\n", trigrams[i] );
+ }
+ }
}
=20
/*--------------------------------------------------------------------*/
@@ -1084,15 +1364,20 @@
/*--------------------------------------------------------------------*/
=20
// Called when a translation is invalidated due to code unloading.
-static void cg_discard_basic_block_info ( Addr a, SizeT size )
+static void cg_discard_basic_block_info ( VexGuestExtents vge )
{
- VgHashNode* bbInfo;
+ VgHashNode* bbInfo;
=20
- if (0) VG_(printf)( "discard_basic_block_info: %p, %llu\n", a, (ULong)size);
+ tl_assert(vge.n_used > 0);
=20
+ if (DEBUG_CG)
+ VG_(printf)( "discard_basic_block_info: %p, %llu\n",
+ (void*)(Addr)vge.base[0], (ULong)vge.len[0]);
+
// Get BB info, remove from table, free BB info. Simple!
- bbInfo =3D VG_(HT_remove)(instr_info_table, a);
+ bbInfo =3D VG_(HT_remove)(instr_info_table, (UWord)vge.base[0]);
tl_assert(NULL !=3D bbInfo);
+
VG_(free)(bbInfo);
}
=20
@@ -1192,7 +1477,7 @@
VG_(details_copyright_author)(
"Copyright (C) 2002-2005, and GNU GPL'd, by Nicholas Nethercote et=
al.");
VG_(details_bug_reports_to) (VG_BUGS_TO);
- VG_(details_avg_translation_sizeB) ( 155 );
+ VG_(details_avg_translation_sizeB) ( 245 );
=20
VG_(basic_tool_funcs) (cg_post_clo_init,
cg_instrument,
Modified: trunk/coregrind/m_tooliface.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- trunk/coregrind/m_tooliface.c 2005-10-12 10:00:56 UTC (rev 4902)
+++ trunk/coregrind/m_tooliface.c 2005-10-12 10:09:23 UTC (rev 4903)
@@ -154,7 +154,7 @@
NEEDS(data_syms)
=20
void VG_(needs_basic_block_discards)(
- void (*discard)(Addr, SizeT)
+ void (*discard)(VexGuestExtents)
)
{
VG_(needs).basic_block_discards =3D True;
Modified: trunk/coregrind/m_transtab.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- trunk/coregrind/m_transtab.c 2005-10-12 10:00:56 UTC (rev 4902)
+++ trunk/coregrind/m_transtab.c 2005-10-12 10:09:23 UTC (rev 4903)
@@ -610,9 +610,14 @@
&& overlaps( guest_start, range, &sectors[sno].tt[i].vge )) {
sectors[sno].tt[i].status =3D Deleted;
sectors[sno].tt_n_inuse--;
- anyDeleted =3D True;
+ anyDeleted =3D True;
n_disc_count++;
n_disc_osize += vge_osize(&sectors[sno].tt[i].vge);
+ /* Tell the tool too. */
+ if (VG_(needs).basic_block_discards) {
+ VG_TDICT_CALL( tool_discard_basic_block_info,
+ sectors[sno].tt[i].vge );
+ }
}
} =20
}
Modified: trunk/coregrind/pub_core_tooliface.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- trunk/coregrind/pub_core_tooliface.h 2005-10-12 10:00:56 UTC (rev 490=
2)
+++ trunk/coregrind/pub_core_tooliface.h 2005-10-12 10:09:23 UTC (rev 490=
3)
@@ -121,7 +121,7 @@
void (*tool_print_extra_suppression_info)(Error*);
=20
// VG_(needs).basic_block_discards
- void (*tool_discard_basic_block_info)(Addr, SizeT);
+ void (*tool_discard_basic_block_info)(VexGuestExtents);
=20
// VG_(needs).command_line_options
Bool (*tool_process_cmd_line_option)(Char*);
Modified: trunk/include/pub_tool_tooliface.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- trunk/include/pub_tool_tooliface.h 2005-10-12 10:00:56 UTC (rev 4902)
+++ trunk/include/pub_tool_tooliface.h 2005-10-12 10:09:23 UTC (rev 4903)
@@ -186,17 +186,28 @@
void (*print_extra_suppression_info)(Error* err)
);
=20
-
-/* Is information kept about specific individual basic blocks? (Eg. for
- cachegrind there are cost-centres for every instruction, stored at a
- basic block level.) If so, it sometimes has to be discarded, because
- .so mmap/munmap-ping or self-modifying code (informed by the
- DISCARD_TRANSLATIONS user request) can cause one instruction address
- to be used for more than one instruction in one program run... */
+/* Is information kept by the tool about specific instructions or
+ translations? (Eg. for cachegrind there are cost-centres for every
+ instruction, stored in a per-translation fashion.) If so, the info
+ may have to be discarded when translations are unloaded (eg. due to
+ .so unloading, or otherwise at the discretion of m_transtab, eg
+ when the table becomes too full) to avoid stale information being
+ reused for new translations. */
extern void VG_(needs_basic_block_discards) (
- // Should discard any information that pertains to specific basic blocks
- // or instructions within the address range given.
- void (*discard_basic_block_info)(Addr a, SizeT size)
+ // Discard any information that pertains to specific translations
+ // or instructions within the address range given. The "extents"
+ // arg can be used in two ways.
+ // - If info is being stored at a per-translation level, the first
+ // address in the extents can be used to identify which translation
+ // is being discarded. Each translation will be discarded exactly
+ // once.
+ // - If info is being stored at a per-instruction level, you can get
+ //   the address range(s) being discarded by stepping through "extents".
+ //   Note that any single instruction may belong to more than one
+ //   translation, and so could be covered by the "extents" of more than
+ //   one call to this function.
+ // Doing it the first way (as eg. Cachegrind does) is probably easier.
+ void (*discard_basic_block_info)(VexGuestExtents vge)
);
=20
/* Tool defines its own command line options? */
|
|
From: Josef W. <Jos...@gm...> - 2005-10-12 14:12:14
|
On Wednesday 12 October 2005 12:09, sv...@va... wrote:
> Author: sewardj
> Date: 2005-10-12 11:09:23 +0100 (Wed, 12 Oct 2005)
> New Revision: 4903
>
> Log:
> Redo the way cachegrind generates instrumentation code, so that it can
> deal with any IR that happens to show up. This makes it work on ppc32
> and should fix occasionally-reported bugs on x86/amd64 where it bombs
> due to having to deal with multiple data references in a single
> instruction.
Good.
I wonder why you pulled the data size out of the instruction info into
a parameter. Are there cases on ppc32 where one instruction does
multiple data accesses with different sizes?
In callgrind, I did something similar in the last version:
Generate helpers into the cache simulator before every Ist_Exit,
and at end of a BB. For this, I also added 0I1Dr and so on.
Why did you get rid of xI2D? Is this forced because of the maximum
number of helper arguments, or because of profiling (i.e. it is
not worth it)?
> The new scheme is based around the idea of a queue of memory events
> which are outstanding, in the sense that no IR has yet been generated
> to do the relevant helper calls. The presence of the queue --
> currently 16 entries deep -- gives cachegrind more scope for combining
> multiple memory references into a single helper function call. As a
> result it runs 3%-5% faster than the previous version, on x86.
Not bad. I did not expect that putting 2 or 3 Ir's into one helper
call would provide such a speedup; especially, as the number of calls
into the cache simulator itself (cachesim_I1_doref) are the same.
Hmmm... do you mind if I take most of your changes over to Callgrind?
It is easier for me to keep up with needed changes for new architectures,
if the code base is the same. Especially as I don't have a ppc32
machine available here.
> This commit also changes the type of the tool interface function
> 'tool_discard_basic_block_info' and clarifies its meaning. See
> comments in include/pub_tool_tooliface.h.
BTW: Callgrind does not use tool_discard_basic_block_info at all:
I keep (dso, offset) instead of pure addresses in my structs.
This way, all the counters can stay attributed to debug info, and
it can cope quite fine with remapping of the same dso at different
addresses, using the same counters.
> +/* A struct which holds all the running state during instrumentation.
> + Mostly to avoid passing loads of parameters everywhere. */
> +typedef
> + struct {
> ...
> + /* Not sure what this is for (jrs 20051009) */
> + Bool bbSeenBefore;
This is set to true if a BB is instrumented more than once, e.g.
> because of a flush of the translation cache. As Cachegrind's
data structures for this BB are still intact, you can use this
for sanity checks.
Josef
|
|
From: Nicholas N. <nj...@cs...> - 2005-10-12 14:38:08
|
On Wed, 12 Oct 2005, Josef Weidendorfer wrote:
> I wonder why you pulled the data size out of the instruction info into
> a parameter. Are there cases on ppc32 where one instruction does
> multiple data accesses with different sizes?
Julian and I discussed these changes a bit. I can't recall if we saw it
in practice, but it seemed like a dangerous assumption to have.
> In callgrind, I did something similar in the last version:
> Generate helpers into the cache simulator before every Ist_Exit,
> and at end of a BB. For this, I also added 0I1Dr and so on.
> Why did you get rid of xI2D? Is this forced because of the maximum
> number of helper arguments, or because of profiling (i.e. it is
> not worth it)?
I think the latter; Julian found the most common 2 and 3 event sequences.
> Not bad. I did not expect that putting 2 or 3 Ir's into one helper
> call would provide such a speedup; especially, as the number of calls
> into the cache simulator itself (cachesim_I1_doref) are the same.
The saving comes from doing fewer C calls from the instrumented code.
>> This commit also changes the type of the tool interface function
>> 'tool_discard_basic_block_info' and clarifies its meaning. See
>> comments in include/pub_tool_tooliface.h.
>
> BTW: Callgrind does not use tool_discard_basic_block_info at all:
> I keep (dso, offset) instead of pure addresses in my structs.
> This way, all the counters can stay attributed to debug info, and
> it can cope quite fine with remapping of the same dso at different
> addresses, using the same counters.
Cachegrind stores the access/hit/miss counters in cost centres (CCs) which
are indexed by source location (filename, function name, line number). It
stores other information (eg. instruction address and length) on a
per-instruction basis in the instr-info table, which is just used to
reduce the number of parameters passed to the C calls. It is these
instr-info nodes that must be removed when translations are discarded;
the CCs are still valid because they are in terms of source code locations
and thus unaffected by code loading/unloading.
>> +/* A struct which holds all the running state during instrumentation.
>> + Mostly to avoid passing loads of parameters everywhere. */
>> +typedef
>> + struct {
>> ...
>> + /* Not sure what this is for (jrs 20051009) */
>> + Bool bbSeenBefore;
>
> This is set to true if a BB is instrumented more than once, e.g.
> because of a flush of the translation cache. As Cachegrinds
> data structures for this BB are still intact, you can use this
> for sanity checks.
Actually, when a translation is discarded the BB's instr-info nodes get
removed from the instr-info table. From cg_discard_basic_block_info():
// Get BB info, remove from table, free BB info. Simple!
bbInfo = VG_(HT_remove)(instr_info_table, (UWord)vge.base[0]);
tl_assert(NULL != bbInfo);
So when the BB gets re-translated, its instr-info nodes entry shouldn't be
there. In practice, it sometimes was there which was why I added this
bbSeenBefore. But it turns out that m_transtab was not calling
discard_basic_block_info() when translations were discarded due to the
table being full, but only when code was unloaded due to munmap()... so
some BB instr-info entries were erroneously being left behind in the
table. Julian fixed that, and so now bbSeenBefore is never true, so it
can be removed. Make sense?
BTW Josef I have another Cachegrind patch that I will commit in the next
few days when I have time. It replaces the three-level hash table with an
OSet, which makes the code simpler and should scale better.
Nick
|
|
From: Julian S. <js...@ac...> - 2005-10-12 14:29:14
|
> I wonder why you pulled the data size out of the instruction info into > a parameter. Fundamentally, it's not really meaningful to say that an instruction has a specific data size. The data size is a property of each memory reference that an insn might make, but it is not a property of the insn itself. > Are there cases on ppc32 where one instruction does > multiple data accesses with different sizes? There is lsw, for which the amount of data transferred to/from memory is not known until run time (0-128 bytes). > Why did you get rid of xI2D? Is this forced because of the maximum > number of helper arguments, no > or because of profiling (i.e. it is not worth it)? yes. It's clear we could do a bit better on ppc with a couple more combinations, but I don't think it's worth the extra code bloat. What is there currently captures the majority of the opportunities for merging already. > Hmmm... do you mind if I take most of your changes over to Callgrind? Please do. Note, there will be some followup commits in the next few days to do some final cleanups, but nothing major. J |
|
From: Julian S. <js...@ac...> - 2005-10-12 15:14:01
|
> > or because of profiling (i.e. it is not worth it)? > > yes. It's clear we could do a bit better on ppc with a couple > more combinations, but I don't think it's worth the extra code > bloat. What is there currently captures the majority of the > opportunities for merging already. I did have a different speedup idea however. I don't know if it is valid, but might be worth looking into. cg_sim.c simulates general 2^n-way associative caches. Doing that involves endlessly rearranging entries in this set[] array when a tag matches set[i] for i > 0. In that case (which is the second most common case after matching set[0]) the section set[0 .. i] is rotated to put set[i] at the start. This seems expensive, and it could be that simply swapping set[i] and set[i-1] would be cheaper -- it would remove the loop setup/control overhead whilst still bringing a popular line to the top quickly given a sequence of references to it. What I can't figure out is if this would change the actual simulated behaviour or not. It's clear that the 'A miss' case needs to remain as it is in order to cause an entry to fall off the end of the array. J |
|
From: Josef W. <Jos...@gm...> - 2005-10-12 16:19:05
|
On Wednesday 12 October 2005 17:10, Julian Seward wrote: > I did have a different speedup idea however. I don't know if it > is valid, but might be worth looking into. > > cg_sim.c simulates general 2^n-way associative caches. Doing > that involves endlessly rearranging entries in this set[] array > when a tag matches set[i] for i > 0. In that case (which > is the second most common case after matching set[0]) the > section set[0 .. i] is rotated to put set[i] at the start. > > This seems expensive, and it could be that simply swapping > set[i] and set[i-1] would be cheaper -- it would remove the > loop setup/control overhead whilst still bringing a popular > line to the top quickly given a sequence of references to it. Do you want to unroll case i==1 ? Swapping set[i] and set[i-1] would be only valid for i==1. At the end, set[0] always has to hold the least recent access. Cachegrind (and Callgrind) currently simulates LRU replacement. So set[] is ordered according access time. If you do not rotate, you violate this property. Of course, Cachegrind could use a pseudo-LRU replacement strategy. This will lead to slightly changed hit/miss numbers. AFAIK, hardware cache implementations often do not use real LRU, as this is quite expensive. Another thing: With C++ and templates, we could get rid of the loop (and macro) by letting the compiler generate unrolled code for a fixed number of associativities. Is there any hope that tools can use C++ in the future? > What I can't figure out is if this would change the actual > simulated behaviour or not. It's clear that the 'A miss' > case needs to remain as it is in order to cause an entry to > fall off the end of the array. I you do not rotate in all cases, the entry which will fall off on a miss, will not always be the one which was accessed the longest time ago. Josef |
|
From: Nicholas N. <nj...@cs...> - 2005-10-12 15:26:05
|
On Wed, 12 Oct 2005, Julian Seward wrote: > I did have a different speedup idea however. I don't know if it > is valid, but might be worth looking into. > > cg_sim.c simulates general 2^n-way associative caches. Doing > that involves endlessly rearranging entries in this set[] array > when a tag matches set[i] for i > 0. In that case (which > is the second most common case after matching set[0]) the > section set[0 .. i] is rotated to put set[i] at the start. > > This seems expensive, and it could be that simply swapping > set[i] and set[i-1] would be cheaper -- it would remove the > loop setup/control overhead whilst still bringing a popular > line to the top quickly given a sequence of references to it. > > What I can't figure out is if this would change the actual > simulated behaviour or not. Sounds like it would -- it wouldn't be LRU any more. Besides, who knows if that loop is a big cost? Someone should profile Cachegrind with Cachegrind to work out where the main costs are in cg_sim.c. It would require un-macrofying cachesim_*_doref so that the counts get attributed to individual lines. Nick |
|
From: Josef W. <Jos...@gm...> - 2005-10-12 16:36:07
|
On Wednesday 12 October 2005 17:25, Nicholas Nethercote wrote: > Besides, who knows if that loop is a big cost? Someone should profile > Cachegrind with Cachegrind to work out where the main costs are in > cg_sim.c. Yes. Or use OProfile. set[] with typical associativity 8 fits into a cache line itself. > It would require un-macrofying cachesim_*_doref so that the > counts get attributed to individual lines. I did this in Callgrind 0.10.0 already. It is a little bit slower, but way easier to look at. BTW, Cache simulation is switched off by default in Callgrind. See section "Write Through Cache Simulation" in http://cvs.sourceforge.net/viewcvs.py/kcachegrind/clg3/src/sim.c?rev=1.11&view=auto I use a little different interface (returning an enum), but that should be easy to rewrite to fit cachegrind. Josef |
|
From: Paul M. <pa...@sa...> - 2005-10-13 01:24:25
|
Nicholas Nethercote writes: > On Wed, 12 Oct 2005, Julian Seward wrote: > > > I did have a different speedup idea however. I don't know if it > > is valid, but might be worth looking into. > > > > cg_sim.c simulates general 2^n-way associative caches. Doing > > that involves endlessly rearranging entries in this set[] array > > when a tag matches set[i] for i > 0. In that case (which > > is the second most common case after matching set[0]) the > > section set[0 .. i] is rotated to put set[i] at the start. > > > > This seems expensive, and it could be that simply swapping > > set[i] and set[i-1] would be cheaper -- it would remove the > > loop setup/control overhead whilst still bringing a popular > > line to the top quickly given a sequence of references to it. > > > > What I can't figure out is if this would change the actual > > simulated behaviour or not. > > Sounds like it would -- it wouldn't be LRU any more. Well, it depends on how the code chooses a line to cast out. If the algorithm is to pick set[N-1], then maybe the change to the algorithm would change the actual simulated behaviour. If the cast-out algorithm doesn't depend on the order of the entries, then moving them around is just an optimization. The multi-way set associative caches that I have seen actually use a pseudo-LRU scheme that uses a binary tree, with 1 bit per node, where the ways are the leaves of the tree (so the tree has lg(N)+1 levels for an N-way cache). Updating the tree for an access (i.e. marking one way as most recently used) involves setting 1 bit at level of the tree (except for the leaves). Finding the "LRU" element involves traversing the tree from the top down; the bit on each interior node tells you whether to go left or right from that node. The MRU update involves setting each of the bits on the path from the root to the leaf being accessed to point away from the path down to the leaf. It doesn't do exactly LRU but it does a good approximation and it is easy to implement. 
Regards, Paul. |
|
From: Josef W. <Jos...@gm...> - 2005-10-12 15:41:38
|
On Wednesday 12 October 2005 16:37, Nicholas Nethercote wrote: > On Wed, 12 Oct 2005, Josef Weidendorfer wrote: > > I wonder why you pulled the data size out of the instruction info into > > a parameter. Are there cases on ppc32 where one instruction does > > multiple data accesses with different sizes? > > Julian and I discussed these changes a bit. I can't recall if we saw it > in practice, but it seemed like a dangerous assumption to have. OK. For Super-CISC architectures VG will support in the future ;-) > > In callgrind, I did something similar in the last version: > > Generate helpers into the cache simulator before every Ist_Exit, > > and at end of a BB. For this, I also added 0I1Dr and so on. > > Why did you get rid of xI2D? Is this forced because of the maximum > > number of helper arguments, or because of profiling (i.e. it is > > not worth it)? > > I think the latter; Julian found the most common 2 and 3 event sequences. Either way; I think I can do the same in Callgrind. > > Not bad. I did not expect that putting 2 or 3 Ir's into one helper > > call would provide such a speedup; especially, as the number of calls > > into the cache simulator itself (cachesim_I1_doref) are the same. > > The saving comes from doing fewer C calls from the instrumented code. I thought these are already quite optimized, with parameters in registers and so on. Are multiple C calls so expensive because all the registers have to be saved before and restored afterwards (including flags, x87-state, ...)? > >> This commit also changes the type of the tool interface function > >> 'tool_discard_basic_block_info' and clarifies its meaning. See > >> comments in include/pub_tool_tooliface.h. > > > > BTW: Callgrind does not use tool_discard_basic_block_info at all: > > I keep (dso, offset) instead of pure addresses in my structs. 
> > This way, all the counters can stay attributed to debug info, and > > it can cope quite fine with remapping of the same dso at different > > addresses, using the same counters. > > Cachegrind stores the access/hit/miss counters in cost centres (CCs) which > are indexed by source location (filename, function name, line number). It > stores other information (eg. instruction address and length) on a > per-instruction basis in the instr-info table, which is just used to > reduce the number of parameters passed to the C calls. It is these > instr-info nodes that must be removed when translations are discarded; > the CCs are still valid because they are in terms of source code locations > and thus unaffected by code loading/unloading. Ah, yes, OK. You changed this some time ago, didn't you? I had the old way in mind. > So when the BB gets re-translated, its instr-info nodes entry shouldn't be > there. In practice, it sometimes was there which was why I added this > bbSeenBefore. But it turns out that m_transtab was not calling > discard_basic_block_info() when translations were discarded due to the > table being full, but only when code was unloaded due to munmap()... so > some BB instr-info entries were erroneously being left behind in the > table. Julian fixed that, and so now bbSeenBefore is never true, so it > can be removed. Make sense? Yes, now I see. If a tool would like to get rid of some private structures in response to a munmap (of e.g. code), it should track the munmap instead. There is no issue with adding a boolean to discard_basic_block_info(), as this could provoke the same problem as you had. > BTW Josef I have another Cachegrind patch that I will commit in the next > few days when I have time. It replaces the three-level hash table with an > OSet, which makes the code simpler and should scale better. Ah, what was OSet about? An ordered set? How is that implemented? With an AVL tree? In Callgrind, I often use resizable hash tables. 
I thought myself of using these instead. Do you see a problem with hash tables? Josef > Nick |
|
From: Nicholas N. <nj...@cs...> - 2005-10-12 16:39:14
|
On Wed, 12 Oct 2005, Josef Weidendorfer wrote: >> The saving comes from doing fewer C calls from the instrumented code. > > I thought these are already quite optimized, with parameters in registers > and so on. Are multiple C calls so expensive because all the registers > have to be saved before and restored afterwards (including flags, > x87-state, ...)? That, and it's just more control transfers. As Julian said it saves about 3--5% so it's not a huge saving. > Ah, what was OSet about? An ordered set? > How is that implemented? With an AVL tree? Yes and yes. > In Callgrind, I often use resizable hash tables. I thought myself of using > these instead. Do you see a problem with hash tables? Not really. It's the usual trade-offs -- all data structures have certain characteristics, the best one to use depends on the situation. Having said that, I put a lot of effort into OSet to give it a general and clean interface as well as making it fast (the original implementation was borrowed from elsewhere). The intention was for it to be the generic data structure of first choice for use within Valgrind, to replace VgHashTable which can have scaling issues because it's not resizable. I thought about doing a resizable hash table instead but then Julian found the AVL tree implementation so I used that. VgHashTable still does better in some circumstances (eg. tracking heap blocks in Memcheck), but its interface isn't as nice. One nice thing about changing Cachegrind to an OSet is that all the entries in the cachegrind.out file will be sorted; this isn't terribly important but does make them easier to read. Nick |
|
From: Josef W. <Jos...@gm...> - 2005-10-13 08:59:40
|
On Thursday 13 October 2005 03:21, Paul Mackerras wrote: > The multi-way set associative caches that I have seen actually use a > pseudo-LRU scheme that uses a binary tree, with 1 bit per node, where > the ways are the leaves of the tree (so the tree has lg(N)+1 levels > for an N-way cache). Updating the tree for an access (i.e. marking > one way as most recently used) involves setting 1 bit at level of the > tree (except for the leaves). Finding the "LRU" element involves > traversing the tree from the top down; the bit on each interior node > tells you whether to go left or right from that node. The MRU update > involves setting each of the bits on the path from the root to the > leaf being accessed to point away from the path down to the leaf. It > doesn't do exactly LRU but it does a good approximation and it is easy > to implement. So you wouldn't shuffle around the tags in the set, but update a bit field? I suppose these needs an additional indirection for the most common case (checking MRU), but it could be worth it as only the bitfield has to be updated. Another problem is how to do this bit shuffling with a variable associativity. Still, if we do this, we have to make it clear to cachegrind users that we use another replacement strategy, i.e. hit/miss numbers change slightly. Josef |
|
From: Nicholas N. <nj...@cs...> - 2005-10-13 12:53:13
|
On Thu, 13 Oct 2005, Josef Weidendorfer wrote: > On Thursday 13 October 2005 03:21, Paul Mackerras wrote: >> The multi-way set associative caches that I have seen actually use a >> pseudo-LRU scheme that uses a binary tree, with 1 bit per node, where >> [...] > > So you wouldn't shuffle around the tags in the set, but update > a bit field? I suppose these needs an additional indirection for > the most common case (checking MRU), but it could be worth it as > only the bitfield has to be updated. > Another problem is how to do this bit shuffling with a variable > associativity. If this part of the simulation is taking a lot of time (and I'd like to see profiling evidence) I'd suggest keeping the current algorithm and having several specialised versions, for associativities of 1, 2, 4, and 8. It should be ugly but doable with some macro magic. Nick |
|
From: Paul M. <pa...@sa...> - 2005-10-15 07:47:34
|
Nicholas Nethercote writes: > If this part of the simulation is taking a lot of time (and I'd like to > see profiling evidence) I'd suggest keeping the current algorithm and > having several specialised versions, for associativities of 1, 2, 4, and > 8. It should be ugly but doable with some macro magic. My point was that if the cpu we are running on uses this pseudo-LRU algorithm, cachegrind should use it too, because it will give results that more accurately reflect what the actual hardware will do. I'm pretty sure the PPC970 (G5) uses pseudo-LRU replacement for the L2 cache, which is 8-way set associative. I have no idea what intel or AMD cpus use, though. Paul. |
|
From: Josef W. <Jos...@gm...> - 2005-10-15 15:18:55
|
On Saturday 15 October 2005 08:58, you wrote: > Nicholas Nethercote writes: > > If this part of the simulation is taking a lot of time (and I'd like to > > see profiling evidence) I'd suggest keeping the current algorithm and > > having several specialised versions, for associativities of 1, 2, 4, and > > 8. It should be ugly but doable with some macro magic. > > My point was that if the cpu we are running on uses this pseudo-LRU > algorithm, cachegrind should use it too, because it will give results > that more accurately reflect what the actual hardware will do. I do not think it matters much. If we want to match the hardware in every detail, we would have to change the simulation in other ways: e.g. the tracecache for L1I in P4, the "sectored" caches on intel (always prefetch the neighboured line in addition on a read), the write-back behavior of L2, the hardware prefetcher, the multiple outstanding loads because of OOO, and so on. There is not much point in adding all these things, as a lot is not really documented, and it makes the simulation much slower. You can have a look at callgrind: I optionally added write-back events, and put in a hardware prefetcher to get a best-case szenario. The nice thing of LRU is that it is a quite simply model, the results are somewhat meaningful, and it is easy to check if the simulator is really doing LRU instead of being buggy, given a specially tailored program. If you have a pseudo-LRU, it is more difficult to check that. > I'm > pretty sure the PPC970 (G5) uses pseudo-LRU replacement for the L2 > cache, which is 8-way set associative. Is this officially documented somewhere? > I have no idea what intel or > AMD cpus use, though. They are quite probably doing pseudo-LRU too, but I never saw something hint how this is implemented in detail. So I would say, switching to a pseuo-LRU scheme is worth it if it really speeds up the simulation. It would be nice to choose between the schemes via CLO, and default to the pseudo-LRU. 
This way, you can compare the 2 schemes. Julian: I really would like to add handling of software-prefetching instructions. Is there a way for a tool to get some hints from IR in this regard? Similar for non-temporal loads. These things can make quite a huge difference for cache simulation. If there is a cache invalidation instruction on ppc32, this also would be a candidate to be detectable by tools. Josef > > Paul. |