|
From: <sv...@va...> - 2012-12-21 10:34:20
|
sewardj 2012-12-21 10:34:08 +0000 (Fri, 21 Dec 2012)
New Revision: 13194
Log:
Teach Cachegrind about IRLoadG and IRStoreG.
Modified files:
branches/COMEM/cachegrind/cg_main.c
Modified: branches/COMEM/cachegrind/cg_main.c (+80 -4)
===================================================================
--- branches/COMEM/cachegrind/cg_main.c 2012-12-21 04:25:10 +00:00 (rev 13193)
+++ branches/COMEM/cachegrind/cg_main.c 2012-12-21 10:34:08 +00:00 (rev 13194)
@@ -419,6 +419,9 @@
n->parent->Dw.a++;
}
+/* Note that addEvent_D_guarded assumes that log_0Ir_1Dr_cache_access
+ and log_0Ir_1Dw_cache_access have exactly the same prototype. If
+ you change them, you must change addEvent_D_guarded too. */
static VG_REGPARM(3)
void log_0Ir_1Dr_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
{
@@ -429,6 +432,7 @@
n->parent->Dr.a++;
}
+/* See comment on log_0Ir_1Dr_cache_access. */
static VG_REGPARM(3)
void log_0Ir_1Dw_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
{
@@ -930,10 +934,10 @@
/* Is it possible to merge this write with the preceding read? */
lastEvt = &cgs->events[cgs->events_used-1];
if (cgs->events_used > 0
- && lastEvt->tag == Ev_Dr
- && lastEvt->Ev.Dr.szB == datasize
- && lastEvt->inode == inode
- && eqIRAtom(lastEvt->Ev.Dr.ea, ea))
+ && lastEvt->tag == Ev_Dr
+ && lastEvt->Ev.Dr.szB == datasize
+ && lastEvt->inode == inode
+ && eqIRAtom(lastEvt->Ev.Dr.ea, ea))
{
lastEvt->tag = Ev_Dm;
return;
@@ -953,6 +957,52 @@
}
static
+void addEvent_D_guarded ( CgState* cgs, InstrInfo* inode,
+ Int datasize, IRAtom* ea, IRAtom* guard,
+ Bool isWrite )
+{
+ tl_assert(isIRAtom(ea));
+ tl_assert(guard);
+ tl_assert(isIRAtom(guard));
+ tl_assert(datasize >= 1 && datasize <= min_line_size);
+
+ if (!clo_cache_sim)
+ return;
+
+ /* Adding guarded memory actions and merging them with the existing
+ queue is too complex. Simply flush the queue and add this
+ action immediately. Since guarded loads and stores are pretty
+ rare, this is not thought likely to cause any noticeable
+ performance loss as a result of the loss of event-merging
+ opportunities. */
+ tl_assert(cgs->events_used >= 0);
+ flushEvents(cgs);
+ tl_assert(cgs->events_used == 0);
+ /* Same as case Ev_Dw / case Ev_Dr in flushEvents, except with guard */
+ IRExpr* i_node_expr;
+ const HChar* helperName;
+ void* helperAddr;
+ IRExpr** argv;
+ Int regparms;
+ IRDirty* di;
+ i_node_expr = mkIRExpr_HWord( (HWord)inode );
+ helperName = isWrite ? "log_0Ir_1Dw_cache_access"
+ : "log_0Ir_1Dr_cache_access";
+ helperAddr = isWrite ? &log_0Ir_1Dw_cache_access
+ : &log_0Ir_1Dr_cache_access;
+ argv = mkIRExprVec_3( i_node_expr,
+ ea, mkIRExpr_HWord( datasize ) );
+ regparms = 3;
+ di = unsafeIRDirty_0_N(
+ regparms,
+ helperName, VG_(fnptr_to_fnentry)( helperAddr ),
+ argv );
+ di->guard = guard;
+ addStmtToIRSB( cgs->sbOut, IRStmt_Dirty(di) );
+}
+
+
+static
void addEvent_Bc ( CgState* cgs, InstrInfo* inode, IRAtom* guard )
{
Event* evt;
@@ -1101,6 +1151,31 @@
break;
}
+ case Ist_StoreG: {
+ IRStoreG* sg = st->Ist.StoreG.details;
+ IRExpr* data = sg->data;
+ IRExpr* addr = sg->addr;
+ IRType type = typeOfIRExpr(tyenv, data);
+ tl_assert(type != Ity_INVALID);
+ addEvent_D_guarded( &cgs, curr_inode,
+ sizeofIRType(type), addr, sg->guard,
+ True/*isWrite*/ );
+ break;
+ }
+
+ case Ist_LoadG: {
+ IRLoadG* lg = st->Ist.LoadG.details;
+ IRType type = Ity_INVALID; /* loaded type */
+ IRType typeWide = Ity_INVALID; /* after implicit widening */
+ IRExpr* addr = lg->addr;
+ typeOfIRLoadGOp(lg->cvt, &typeWide, &type);
+ tl_assert(type != Ity_INVALID);
+ addEvent_D_guarded( &cgs, curr_inode,
+ sizeofIRType(type), addr, lg->guard,
+ False/*!isWrite*/ );
+ break;
+ }
+
case Ist_Dirty: {
Int dataSize;
IRDirty* d = st->Ist.Dirty.details;
@@ -1231,6 +1306,7 @@
}
default:
+ ppIRStmt(st);
tl_assert(0);
break;
}
|
|
From: Josef W. <Jos...@gm...> - 2012-12-21 10:57:34
|
Am 21.12.2012 11:34, schrieb sv...@va...: > + /* Adding guarded memory actions and merging them with the existing > + queue is too complex. Simply flush the queue and add this > + action immediately. Since guarded loads and stores are pretty > + rare, this is not thought likely to cause any noticeable > + performance loss as a result of the loss of event-merging > + opportunities. */ On x86, for sure they are rare. I am not so sure for ARM. Apart from merging events, another benefit of the queue is to do the dirty helper calls in chunks, reducing register save/restore overhead. I really need to check out Valgrind on my recently bought Nexus 7 ;-) The changes for Callgrind should be exactly the same. Thanks, Josef |
|
From: Julian S. <js...@ac...> - 2012-12-27 12:12:24
|
On Friday, December 21, 2012, Josef Weidendorfer wrote: > On x86, for sure they are rare. On x86 they do not exist at all for any instruction set < AVX, IIUC, since AVX has conditional scatter/gather loads. Did you mean something else? > I am not so sure for ARM. From my experimentation so far, they do exist on ARM (obviously) but are still quite rare. One reason they might be rare is that flattening an if-then-else into conditionalised straight-line code on ARM is only going to be a performance win if the then- and else- branches are very small (or empty). Also, on Thumb, the conditionalisation scheme -- which is different from ARM -- makes it impossible to write straight-line if-then-else for more than 4 instructions total. > The changes for Callgrind should be exactly the same. I think so, but it would be good if you could look at what I just committed for Callgrind (r13206). It does not produce the same numbers for insn reads or data accesses for "perf/bz2 x" compiled -g -O, which worries me. But IIRC you fixed some bug on the trunk relating to cachegrind/callgrind inconsistencies, and maybe you fixed it after I made the COMEM branch from the trunk. So maybe that is the reason for the differences. J |
|
From: Josef W. <Jos...@gm...> - 2013-01-04 22:33:38
|
Am 27.12.2012 13:12, schrieb Julian Seward: > > On Friday, December 21, 2012, Josef Weidendorfer wrote: >> On x86, for sure they are rare. > > On x86 they do not exist at all for any instruction set < AVX, IIUC, > since AVX has conditional scatter/gather loads. Did you mean something > else? I thought you may use this also on x86/amd64 for cmov... >> I am not so sure for ARM. > > From my experimentation so far, they do exist on ARM (obviously) but are > still quite rare. Ok. > I think so, but it would be good if you could look at what I just > committed for Callgrind (r13206). Looks fine aside from one issue: The handlers CLG_(cachesim).log_0I1Dw/Dr may be 0 if cache simulation is off. Thus, in addEvent_D_guarded it should not add the helper call if helperAddr is 0. > It does not produce the same > numbers for insn reads or data accesses for "perf/bz2 x" compiled > -g -O, which worries me. But IIRC you fixed some bug on the trunk > relating to cachegrind/callgrind inconsistencies, and maybe you > fixed it after I made the COMEM branch from the trunk. So maybe > that is the reason for the differences. Maybe. That fix was for the case when cache simulation is off in callgrind. If you start callgrind with --cache-sim=yes, it should show the same numbers. Josef