From: <sv...@va...> - 2014-04-22 22:07:15
Author: philippe
Date: Tue Apr 22 22:07:01 2014
New Revision: 13904
Log:
* Implement an optimisation in storev8/16/64 similar to the optimisations in storev32
See comments in MC_(helperc_STOREV8) for a detailed explanation.
On modern CPUs this gives a speed improvement of a few percent on
some perf tests, and reduces the number of undistinguished secondary
maps.
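To make the idea concrete before the diff: here is a minimal standalone sketch of the fast path this change adds. SecMapModel, storev_slow and the table size are inventions of this sketch; the VA_BITS8_* values and the distinguished-map test mirror mc_main.c.

/* Minimal model of the new STOREV fast path.  memcheck packs the
   validity state of 4 client bytes into one shadow byte ("vabits8");
   0xaa means all 4 bytes defined, 0x55 means all 4 undefined. */
#include <stdbool.h>
#include <stdint.h>

#define VA_BITS8_DEFINED   0xaa
#define VA_BITS8_UNDEFINED 0x55

typedef struct {
   uint8_t vabits8[65536];   /* shadow bytes; size is illustrative only */
   bool    distinguished;    /* shared read-only map: must not be written */
} SecMapModel;

static void storev_slow(void) { /* stand-in for mc_STOREVn_slow */ }

static void storev32_all_defined(SecMapModel* sm, unsigned off)
{
   uint8_t vabits8 = sm->vabits8[off];
   if (vabits8 == VA_BITS8_DEFINED)
      return;                               /* defined on defined: no write */
   if (!sm->distinguished && vabits8 == VA_BITS8_UNDEFINED) {
      sm->vabits8[off] = VA_BITS8_DEFINED;  /* direct mod */
      return;
   }
   storev_slow();                           /* general (slow) case */
}

Skipping the write in the first case avoids dirtying a shadow cache line, and it avoids ever taking a writable (COW) copy of a distinguished secondary map; the latter is where the reduction in undistinguished secondary maps comes from.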
Modified:
trunk/memcheck/mc_main.c
Modified: trunk/memcheck/mc_main.c
==============================================================================
--- trunk/memcheck/mc_main.c (original)
+++ trunk/memcheck/mc_main.c Tue Apr 22 22:07:01 2014
@@ -607,7 +607,7 @@
return &am->sm;
}
-static SecMap** get_secmap_ptr ( Addr a )
+static INLINE SecMap** get_secmap_ptr ( Addr a )
{
return ( a <= MAX_PRIMARY_ADDRESS
? get_secmap_low_ptr(a)
@@ -659,7 +659,7 @@
writable copy of it, install it, and return the copy instead. (COW
semantics).
*/
-static SecMap* get_secmap_for_writing ( Addr a )
+static INLINE SecMap* get_secmap_for_writing ( Addr a )
{
return ( a <= MAX_PRIMARY_ADDRESS
? get_secmap_for_writing_low (a)
@@ -1235,7 +1235,7 @@
ULong pessim64 = V_BITS64_DEFINED;
UWord long_index = byte_offset_w(szL, bigendian, j);
for (i = 8-1; i >= 0; i--) {
- PROF_EVENT(31, "mc_LOADV_128_or_256_slow(loop)");
+ PROF_EVENT(29, "mc_LOADV_128_or_256_slow(loop)");
ai = a + 8*long_index + byte_offset_w(8, bigendian, i);
ok = get_vbits8(ai, &vbits8);
vbits64 <<= 8;
@@ -1472,8 +1472,9 @@
cases on 64-bit platforms. Are merely a speedup hack; can be
omitted without loss of correctness/functionality. Note that in
both cases the "sizeof(void*) == 8" causes these cases to be
- folded out by compilers on 32-bit platforms. These are derived
- from STOREV64 and STOREV32.
+ folded out by compilers on 32-bit platforms. The logic below
+ is similar to the cases extensively commented in
+ MC_(helperc_STOREV8).
*/
if (LIKELY(sizeof(void*) == 8
&& nBits == 64 && VG_IS_8_ALIGNED(a))) {
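(The "folded out" remark above is ordinary constant propagation; a hedged illustration, with a function name invented for this sketch:)

#include <stddef.h>
#include <stdint.h>

/* On a 32-bit build, sizeof(void*) == 8 is a compile-time-false
   constant, so the whole branch below is dead code and is deleted by
   the compiler; on a 64-bit build the test itself costs nothing. */
static int take_64bit_fast_case(size_t nBits, uintptr_t a)
{
   if (sizeof(void*) == 8 && nBits == 64 && (a & 7) == 0 /* 8-aligned */)
      return 1;   /* 64-bit fast case */
   return 0;      /* fall through to the generic code */
}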
@@ -1796,7 +1797,7 @@
void MC_(make_mem_undefined_w_otag) ( Addr a, SizeT len, UInt otag )
{
- PROF_EVENT(41, "MC_(make_mem_undefined)");
+ PROF_EVENT(43, "MC_(make_mem_undefined)");
DEBUG("MC_(make_mem_undefined)(%p, %lu)\n", a, len);
set_address_range_perms ( a, len, VA_BITS16_UNDEFINED, SM_DIST_UNDEFINED );
if (UNLIKELY( MC_(clo_mc_level) == 3 ))
@@ -4428,27 +4429,35 @@
sm_off16 = SM_OFF_16(a);
vabits16 = ((UShort*)(sm->vabits8))[sm_off16];
- if (LIKELY( !is_distinguished_sm(sm) &&
- (VA_BITS16_DEFINED == vabits16 ||
- VA_BITS16_UNDEFINED == vabits16) ))
- {
- /* Handle common case quickly: a is suitably aligned, */
- /* is mapped, and is addressible. */
- // Convert full V-bits in register to compact 2-bit form.
- if (V_BITS64_DEFINED == vbits64) {
+ // To understand the cleverness below, see the extensive comments
+ // in MC_(helperc_STOREV8).
+ if (LIKELY(V_BITS64_DEFINED == vbits64)) {
+ if (LIKELY(vabits16 == (UShort)VA_BITS16_DEFINED)) {
+ return;
+ }
+ if (!is_distinguished_sm(sm) && VA_BITS16_UNDEFINED == vabits16) {
((UShort*)(sm->vabits8))[sm_off16] = (UShort)VA_BITS16_DEFINED;
- } else if (V_BITS64_UNDEFINED == vbits64) {
- ((UShort*)(sm->vabits8))[sm_off16] = (UShort)VA_BITS16_UNDEFINED;
- } else {
- /* Slow but general case -- writing partially defined bytes. */
- PROF_EVENT(212, "mc_STOREV64-slow2");
- mc_STOREVn_slow( a, 64, vbits64, isBigEndian );
+ return;
}
- } else {
- /* Slow but general case. */
- PROF_EVENT(213, "mc_STOREV64-slow3");
+ PROF_EVENT(212, "mc_STOREV64-slow2");
mc_STOREVn_slow( a, 64, vbits64, isBigEndian );
+ return;
}
+ if (V_BITS64_UNDEFINED == vbits64) {
+ if (vabits16 == (UShort)VA_BITS16_UNDEFINED) {
+ return;
+ }
+ if (!is_distinguished_sm(sm) && VA_BITS16_DEFINED == vabits16) {
+ ((UShort*)(sm->vabits8))[sm_off16] = (UShort)VA_BITS16_UNDEFINED;
+ return;
+ }
+ PROF_EVENT(213, "mc_STOREV64-slow3");
+ mc_STOREVn_slow( a, 64, vbits64, isBigEndian );
+ return;
+ }
+
+ PROF_EVENT(214, "mc_STOREV64-slow4");
+ mc_STOREVn_slow( a, 64, vbits64, isBigEndian );
}
#endif
}
@@ -4536,35 +4545,35 @@
sm_off = SM_OFF(a);
vabits8 = sm->vabits8[sm_off];
- // Cleverness: sometimes we don't have to write the shadow memory at
- // all, if we can tell that what we want to write is the same as what is
- // already there. The 64/16/8 bit cases also have cleverness at this
- // point, but it works a little differently to the code below.
- if (V_BITS32_DEFINED == vbits32) {
- if (vabits8 == (UInt)VA_BITS8_DEFINED) {
+ // To understand the cleverness below, see the extensive comments
+ // in MC_(helperc_STOREV8).
+ if (LIKELY(V_BITS32_DEFINED == vbits32)) {
+ if (LIKELY(vabits8 == (UInt)VA_BITS8_DEFINED)) {
return;
- } else if (!is_distinguished_sm(sm) && VA_BITS8_UNDEFINED == vabits8) {
+ }
+ if (!is_distinguished_sm(sm) && VA_BITS8_UNDEFINED == vabits8) {
sm->vabits8[sm_off] = (UInt)VA_BITS8_DEFINED;
- } else {
- // not defined/undefined, or distinguished and changing state
- PROF_EVENT(232, "mc_STOREV32-slow2");
- mc_STOREVn_slow( a, 32, (ULong)vbits32, isBigEndian );
+ return;
}
- } else if (V_BITS32_UNDEFINED == vbits32) {
+ PROF_EVENT(232, "mc_STOREV32-slow2");
+ mc_STOREVn_slow( a, 32, (ULong)vbits32, isBigEndian );
+ return;
+ }
+ if (V_BITS32_UNDEFINED == vbits32) {
if (vabits8 == (UInt)VA_BITS8_UNDEFINED) {
return;
- } else if (!is_distinguished_sm(sm) && VA_BITS8_DEFINED == vabits8) {
+ }
+ if (!is_distinguished_sm(sm) && VA_BITS8_DEFINED == vabits8) {
sm->vabits8[sm_off] = (UInt)VA_BITS8_UNDEFINED;
- } else {
- // not defined/undefined, or distinguished and changing state
- PROF_EVENT(233, "mc_STOREV32-slow3");
- mc_STOREVn_slow( a, 32, (ULong)vbits32, isBigEndian );
+ return;
}
- } else {
- // Partially defined word
- PROF_EVENT(234, "mc_STOREV32-slow4");
+ PROF_EVENT(233, "mc_STOREV32-slow3");
mc_STOREVn_slow( a, 32, (ULong)vbits32, isBigEndian );
+ return;
}
+
+ PROF_EVENT(234, "mc_STOREV32-slow4");
+ mc_STOREVn_slow( a, 32, (ULong)vbits32, isBigEndian );
}
#endif
}
@@ -4631,6 +4640,18 @@
return mc_LOADV16(a, False);
}
+/* True if the vabits4 in vabits8 indicate a and a+1 are accessible. */
+static INLINE
+Bool accessible_vabits4_in_vabits8 ( Addr a, UChar vabits8 )
+{
+ UInt shift;
+ tl_assert(VG_IS_2_ALIGNED(a)); // Must be 2-aligned
+ shift = (a & 2) << 1; // shift by 0 or 4
+ vabits8 >>= shift; // shift the four bits to the bottom
+ // check 2 x vabits2 != VA_BITS2_NOACCESS
+ return ((0x3 & vabits8) != VA_BITS2_NOACCESS)
+ && ((0xc & vabits8) != VA_BITS2_NOACCESS << 2);
+}
static INLINE
void mc_STOREV16 ( Addr a, UWord vbits16, Bool isBigEndian )
@@ -4653,29 +4674,39 @@
sm = get_secmap_for_reading_low(a);
sm_off = SM_OFF(a);
vabits8 = sm->vabits8[sm_off];
- if (LIKELY( !is_distinguished_sm(sm) &&
- (VA_BITS8_DEFINED == vabits8 ||
- VA_BITS8_UNDEFINED == vabits8) ))
- {
- /* Handle common case quickly: a is suitably aligned, */
- /* is mapped, and is addressible. */
- // Convert full V-bits in register to compact 2-bit form.
- if (V_BITS16_DEFINED == vbits16) {
- insert_vabits4_into_vabits8( a, VA_BITS4_DEFINED ,
+
+ // To understand the cleverness below, see the extensive comments
+ // in MC_(helperc_STOREV8).
+ if (LIKELY(V_BITS16_DEFINED == vbits16)) {
+ if (LIKELY(vabits8 == VA_BITS8_DEFINED)) {
+ return;
+ }
+ if (!is_distinguished_sm(sm)
+ && accessible_vabits4_in_vabits8(a, vabits8)) {
+ insert_vabits4_into_vabits8( a, VA_BITS4_DEFINED,
&(sm->vabits8[sm_off]) );
- } else if (V_BITS16_UNDEFINED == vbits16) {
+ return;
+ }
+ PROF_EVENT(252, "mc_STOREV16-slow2");
+ mc_STOREVn_slow( a, 16, (ULong)vbits16, isBigEndian );
+ return;
+ }
+ if (V_BITS16_UNDEFINED == vbits16) {
+ if (vabits8 == VA_BITS8_UNDEFINED) {
+ return;
+ }
+ if (!is_distinguished_sm(sm)
+ && accessible_vabits4_in_vabits8(a, vabits8)) {
insert_vabits4_into_vabits8( a, VA_BITS4_UNDEFINED,
&(sm->vabits8[sm_off]) );
- } else {
- /* Slow but general case -- writing partially defined bytes. */
- PROF_EVENT(252, "mc_STOREV16-slow2");
- mc_STOREVn_slow( a, 16, (ULong)vbits16, isBigEndian );
+ return;
}
- } else {
- /* Slow but general case. */
- PROF_EVENT(253, "mc_STOREV16-slow3");
+ PROF_EVENT(253, "mc_STOREV16-slow3");
mc_STOREVn_slow( a, 16, (ULong)vbits16, isBigEndian );
+ return;
}
+
+ PROF_EVENT(254, "mc_STOREV16-slow4");
+ mc_STOREVn_slow( a, 16, (ULong)vbits16, isBigEndian );
}
#endif
}
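A standalone check of what the new accessible_vabits4_in_vabits8 helper computes may help here. The VA_BITS2_* values below match memcheck's 2-bit encoding; the test addresses and vabits8 value are this sketch's own.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* memcheck's 2-bit VA encoding: 4 bytes of client memory share one
   shadow byte, byte k's state living in bits [2k+1:2k]. */
#define VA_BITS2_NOACCESS  0x0
#define VA_BITS2_UNDEFINED 0x1
#define VA_BITS2_DEFINED   0x2

static bool accessible_vabits4_in_vabits8(uintptr_t a, uint8_t vabits8)
{
   unsigned shift = (a & 2) << 1;  /* 0 for bytes 0-1, 4 for bytes 2-3 */
   vabits8 >>= shift;              /* move the two fields to the bottom */
   return ((vabits8 & 0x3) != VA_BITS2_NOACCESS)
       && ((vabits8 & 0xc) != (VA_BITS2_NOACCESS << 2));
}

int main(void)
{
   /* byte0 defined, byte1 undefined, byte2 noaccess, byte3 defined:
      vabits8 = 10 00 01 10 (b3 b2 b1 b0) = 0x86 */
   uint8_t vabits8 = (VA_BITS2_DEFINED << 6) | (VA_BITS2_NOACCESS << 4)
                   | (VA_BITS2_UNDEFINED << 2) | VA_BITS2_DEFINED;
   assert( accessible_vabits4_in_vabits8(0x1000, vabits8)); /* bytes 0,1 ok */
   assert(!accessible_vabits4_in_vabits8(0x1002, vabits8)); /* byte 2 noaccess */
   return 0;
}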
@@ -4756,33 +4787,78 @@
sm = get_secmap_for_reading_low(a);
sm_off = SM_OFF(a);
vabits8 = sm->vabits8[sm_off];
- if (LIKELY
- ( !is_distinguished_sm(sm) &&
- ( (VA_BITS8_DEFINED == vabits8 || VA_BITS8_UNDEFINED == vabits8)
- || (VA_BITS2_NOACCESS != extract_vabits2_from_vabits8(a, vabits8))
- )
- )
- )
- {
- /* Handle common case quickly: a is mapped, the entire word32 it
- lives in is addressible. */
- // Convert full V-bits in register to compact 2-bit form.
- if (V_BITS8_DEFINED == vbits8) {
+
+ // Clevernesses to speed up storing V bits.
+ // The 64/32/16 bit cases also have similar clevernesses, but they
+ // work a little differently from the code below.
+ //
+ // Cleverness 1: sometimes we don't have to write the shadow memory at
+ // all, if we can tell that what we want to write is the same as what is
+ // already there. These cases are marked below as "defined on defined" and
+ // "undefined on undefined".
+ //
+ // Cleverness 2:
+ // We also avoid calling mc_STOREVn_slow if the V bits can be
+ // written directly into the secondary map. V bits can be written
+ // directly if four conditions hold:
+ // * The address for which V bits are written is naturally aligned:
+ // on 1 byte for STOREV8 (this is always true),
+ // on 2 bytes for STOREV16,
+ // on 4 bytes for STOREV32,
+ // on 8 bytes for STOREV64.
+ // * The V bits being written are either fully defined or fully
+ // undefined (partially defined V bits cannot be written directly,
+ // as the secondary vbits table must be maintained).
+ // * The secmap is not distinguished (distinguished maps cannot be
+ // modified).
+ // * The memory corresponding to the V bits being written is
+ // accessible (if one or more bytes are not accessible, we must
+ // call mc_STOREVn_slow in order to report accessibility errors).
+ // Note that for STOREV32 and STOREV64, verifying the accessibility
+ // of each byte is too expensive for the benefit it brings. Instead,
+ // a quicker check compares the vabits against
+ // VA_BITS(8|16)_(UN)DEFINED. This guarantees accessibility but
+ // misses some opportunities for direct modification. Checking the
+ // accessibility of each byte was measured for STOREV32 with the
+ // perf tests and slowed down all perf tests.
+ // The cases corresponding to cleverness 2 are marked below as
+ // "direct mod".
+ if (LIKELY(V_BITS8_DEFINED == vbits8)) {
+ if (LIKELY(vabits8 == VA_BITS8_DEFINED)) {
+ return; // defined on defined
+ }
+ if (!is_distinguished_sm(sm)
+ && VA_BITS2_NOACCESS != extract_vabits2_from_vabits8(a, vabits8)) {
+ // direct mod
insert_vabits2_into_vabits8( a, VA_BITS2_DEFINED,
- &(sm->vabits8[sm_off]) );
- } else if (V_BITS8_UNDEFINED == vbits8) {
+ &(sm->vabits8[sm_off]) );
+ return;
+ }
+ PROF_EVENT(272, "mc_STOREV8-slow2");
+ mc_STOREVn_slow( a, 8, (ULong)vbits8, False/*irrelevant*/ );
+ return;
+ }
+ if (V_BITS8_UNDEFINED == vbits8) {
+ if (vabits8 == VA_BITS8_UNDEFINED) {
+ return; // undefined on undefined
+ }
+ if (!is_distinguished_sm(sm)
+ && (VA_BITS2_NOACCESS
+ != extract_vabits2_from_vabits8(a, vabits8))) {
+ // direct mod
insert_vabits2_into_vabits8( a, VA_BITS2_UNDEFINED,
- &(sm->vabits8[sm_off]) );
- } else {
- /* Slow but general case -- writing partially defined bytes. */
- PROF_EVENT(272, "mc_STOREV8-slow2");
- mc_STOREVn_slow( a, 8, (ULong)vbits8, False/*irrelevant*/ );
+ &(sm->vabits8[sm_off]) );
+ return;
}
- } else {
- /* Slow but general case. */
- PROF_EVENT(273, "mc_STOREV8-slow3");
+ PROF_EVENT(273, "mc_STOREV8-slow3");
mc_STOREVn_slow( a, 8, (ULong)vbits8, False/*irrelevant*/ );
+ return;
}
+
+ // Partially defined word
+ PROF_EVENT(274, "mc_STOREV8-slow4");
+ mc_STOREVn_slow( a, 8, (ULong)vbits8, False/*irrelevant*/ );
}
#endif
}
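To close, the four-condition "direct mod" test described in the big comment in MC_(helperc_STOREV8) above, restated as a minimal predicate. The struct layout and function names are simplifications belonging to this sketch, not memcheck's; the bit arithmetic mirrors extract_vabits2_from_vabits8 in mc_main.c.

#include <stdbool.h>
#include <stdint.h>

#define VA_BITS2_NOACCESS 0x0

typedef struct { bool distinguished; } SecMapModel;

static uint8_t extract_vabits2_from_vabits8(uintptr_t a, uint8_t vabits8)
{
   unsigned shift = (a & 3) << 1;   /* byte k lives in bits [2k+1:2k] */
   return (vabits8 >> shift) & 3;
}

/* True if STOREV8 may write the 2-bit VA field in place instead of
   taking mc_STOREVn_slow ("direct mod" in the comment above). */
static bool storev8_direct_mod_ok(const SecMapModel* sm, uintptr_t a,
                                  uint8_t vabits8, bool vbits_uniform)
{
   return vbits_uniform            /* V bits fully defined or undefined */
       && !sm->distinguished       /* secmap is writable in place */
       && extract_vabits2_from_vabits8(a, vabits8) != VA_BITS2_NOACCESS;
   /* (STOREV8 needs no alignment test: a single byte is always aligned.) */
}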