|
From: Julian S. <js...@ac...> - 2012-06-01 16:28:49
|
Hi Josef, Nick,
Do either of you have any objection if I commit basically the patch
below, after some testing? I am trying to finish up the AVX infrastructure
changes this weekend and this is the last thing on my list.
Thanks.
J
On Tuesday, May 22, 2012, Josef Weidendorfer wrote:
> Am 22.05.2012 01:24, schrieb Nicholas Nethercote:
> > On Mon, May 21, 2012 at 7:22 AM, Julian Seward<js...@ac...> wrote:
> >> I recently added some AVX support to V, and as a result added a new type
> >> of 32-byte values (Ity_V256) to IR. Loads and stores of such values
> >> cause Cachegrind and Callgrind to assert, because the size (32 bytes)
> >> is larger than MIN_LINE_SIZE, which is 16.
> >>
> >> As they currently are, both tools refuse to process memory accesses
> >> bigger than 16, on the basis that the minimum possible line size is 16,
> >> and so a 16 byte access could access 2 adjacent lines, which is a
> >> situation they are prepared to handle. But not 3 lines, which is a
> >> possible case for a 32 byte access w/ 16 byte lines (something for
> >> which I'm sure no hardware actually exists).
> >
> > Can we model the 32-byte accesses correctly and just assert if the 3
> > line case occurs?
>
> As far as I understand, the assertion in the simulator should not
> trigger in any case, as memory access lengths are always cut down to
> MIN_LINE_SIZE at the moment.
>
> This is needed, as there are some rare instructions with larger memory
> accesses, such as PUSHA.
>
> I think it makes more sense to cut access lengths down to the real
> minimum of used line sizes, using the patch below.
>
> Hmm. If people explicitly configure for 16-byte line size, and use AVX,
> this will result in wrong simulations because of the large number of
> cut down access sizes. I see these options:
> * if we detect a processor with AVX, make 32 the minimal accepted
> line size in manual configuration
> * count the number of cuts, and print a warning if a threshold of e.g.
> 10000 is reached?
> * handle accesses straddling >2 lines correctly
>
> Here is the result of vg_perf (vg_minline is with the patch), also
> running "none" to see the measurement noise produced on my laptop. It
> looks like all numbers more or less stay the same.
>
> Josef
>
> weidendo@lapbode134:~/SW/GitRepos/vg/valgrind (master)> perl perf/vg_perf --tools=none,cachegrind --vg=. --vg=../vg-minline perf/
> -- Running tests in perf ----------------------------------------------
> -- bigcode1 --
> bigcode1 . :0.14s no: 2.0s (14.5x, -----) ca: 6.3s (44.7x, -----)
> bigcode1 vg-minline:0.14s no: 2.0s (14.3x, 1.5%) ca: 6.3s (44.7x, 0.0%)
> -- bigcode2 --
> bigcode2 . :0.14s no: 4.8s (34.6x, -----) ca:10.4s (74.5x, -----)
> bigcode2 vg-minline:0.14s no: 4.8s (34.3x, 0.8%) ca:10.4s (74.3x, 0.3%)
> -- bz2 --
> bz2 . :0.67s no: 2.6s ( 3.9x, -----) ca:18.4s (27.5x, -----)
> bz2 vg-minline:0.67s no: 2.6s ( 3.9x, -0.4%) ca:18.1s (27.1x, 1.6%)
> -- fbench --
> fbench . :0.28s no: 1.2s ( 4.4x, -----) ca: 5.1s (18.1x, -----)
> fbench vg-minline:0.28s no: 1.2s ( 4.4x, 0.0%) ca: 5.1s (18.1x, 0.4%)
> -- ffbench --
> ffbench . :0.26s no: 1.2s ( 4.5x, -----) ca: 6.4s (24.5x, -----)
> ffbench vg-minline:0.26s no: 1.2s ( 4.5x, 0.9%) ca: 6.4s (24.5x, -0.2%)
> -- heap --
> heap . :0.12s no: 0.8s ( 6.4x, -----) ca: 5.3s (44.3x, -----)
> heap vg-minline:0.12s no: 0.8s ( 6.8x, -5.2%) ca: 5.3s (44.6x, -0.6%)
> -- heap_pdb4 --
> heap_pdb4 . :0.12s no: 0.9s ( 7.3x, -----) ca: 5.8s (48.3x, -----)
> heap_pdb4 vg-minline:0.12s no: 0.8s ( 7.1x, 3.4%) ca: 5.8s (48.1x, 0.5%)
> -- many-loss-records --
> many-loss-records . :0.01s no: 0.3s (32.0x, -----) ca: 1.0s (98.0x, -----)
> many-loss-records vg-minline:0.01s no: 0.3s (29.0x, 9.4%) ca: 1.0s (97.0x, 1.0%)
> -- many-xpts --
> many-xpts . :0.05s no: 0.4s ( 7.8x, -----) ca: 1.4s (28.0x, -----)
> many-xpts vg-minline:0.05s no: 0.4s ( 7.6x, 2.6%) ca: 1.4s (28.4x, -1.4%)
> -- sarp --
> sarp . :0.03s no: 0.3s (11.0x, -----) ca: 1.4s (47.3x, -----)
> sarp vg-minline:0.03s no: 0.3s (11.0x, 0.0%) ca: 1.4s (46.7x, 1.4%)
> -- tinycc --
> tinycc . :0.21s no: 1.7s ( 7.9x, -----) ca:11.5s (54.8x, -----)
> tinycc vg-minline:0.21s no: 1.7s ( 8.0x, -1.8%) ca:11.4s (54.3x, 0.9%)
> -- Finished tests in perf ----------------------------------------------
>
>
>
> ====================================================================
>
> diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c
> index 4b36204..9982f23 100644
> --- a/cachegrind/cg_main.c
> +++ b/cachegrind/cg_main.c
> @@ -69,6 +69,13 @@ static Bool clo_cache_sim = True; /* do cache simulation? */
> static Bool clo_branch_sim = False; /* do branch simulation? */
> static Char* clo_cachegrind_out_file = "cachegrind.out.%p";
>
> +
> +/*------------------------------------------------------------*/
> +/*--- Cachesim configuration ---*/
> +/*------------------------------------------------------------*/
> +
> +static Int min_line_size = 0; /* min of L1 and LL cache line sizes */
> +
> /*------------------------------------------------------------*/
> /*--- Types and Data Structures ---*/
> /*------------------------------------------------------------*/
> @@ -846,7 +853,7 @@ void addEvent_Dr ( CgState* cgs, InstrInfo* inode, Int datasize, IRAtom*
> {
> Event* evt;
> tl_assert(isIRAtom(ea));
> - tl_assert(datasize >= 1 && datasize <= MIN_LINE_SIZE);
> + tl_assert(datasize >= 1 && datasize <= min_line_size);
> if (!clo_cache_sim)
> return;
> if (cgs->events_used == N_EVENTS)
> @@ -868,7 +875,7 @@ void addEvent_Dw ( CgState* cgs, InstrInfo* inode, Int datasize, IRAtom*
> Event* evt;
>
> tl_assert(isIRAtom(ea));
> - tl_assert(datasize >= 1 && datasize <= MIN_LINE_SIZE);
> + tl_assert(datasize >= 1 && datasize <= min_line_size);
>
> if (!clo_cache_sim)
> return;
> @@ -1058,8 +1065,8 @@ IRSB* cg_instrument ( VgCallbackClosure* closure,
> // instructions will be done inaccurately, but they're
> // very rare and this avoids errors from hitting more
> // than two cache lines in the simulation.
> - if (dataSize > MIN_LINE_SIZE)
> - dataSize = MIN_LINE_SIZE;
> + if (dataSize > min_line_size)
> + dataSize = min_line_size;
> if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify)
> addEvent_Dr( &cgs, curr_inode, dataSize, d->mAddr );
> if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify)
> @@ -1085,8 +1092,8 @@ IRSB* cg_instrument ( VgCallbackClosure* closure,
> if (cas->dataHi != NULL)
> dataSize *= 2; /* since it's a doubleword-CAS */
> /* I don't think this can ever happen, but play safe. */
> - if (dataSize > MIN_LINE_SIZE)
> - dataSize = MIN_LINE_SIZE;
> + if (dataSize > min_line_size)
> + dataSize = min_line_size;
> addEvent_Dr( &cgs, curr_inode, dataSize, cas->addr );
> addEvent_Dw( &cgs, curr_inode, dataSize, cas->addr );
> break;
> @@ -1724,6 +1731,12 @@ static void cg_post_clo_init(void)
> &clo_D1_cache,
> &clo_LL_cache);
>
> + // min_line_size is used to make sure that we never feed
> + // accesses to the simulator straddling more than two
> + // cache lines at any cache level
> + min_line_size = (I1c.size < D1c.size) ? I1c.size : D1c.size;
> + min_line_size = (LLc.size < min_line_size) ? LLc.size : min_line_size;
> +
> cachesim_I1_initcache(I1c);
> cachesim_D1_initcache(D1c);
> cachesim_LL_initcache(LLc);
> ====================================================================
|