|
From: <sv...@va...> - 2005-12-15 21:18:41
|
Author: njn
Date: 2005-12-15 21:18:34 +0000 (Thu, 15 Dec 2005)
New Revision: 5351
Log:
Minor secondary V bit table optimisations:
- Make each node cover 4 bytes instead of one. This costs no extra space
because of alignment, and reduces the number of nodes by 4x in the worst
case. On real programs it seems to only make a small difference, though
(< 3%).
- STOREVn_slow() now has code to detect if the byte being overwritten has a
sec V bit entry that can be removed. This can reduce the number of stale
nodes in the sec V bit table, although in practice not by very much
because most of the staleness is caused by set_address_range_perms() and
make_aligned*(). Currently it's commented out because it slowed things
down a little bit for little gain.
Modified:
branches/COMPVBITS/memcheck/mc_main.c
Modified: branches/COMPVBITS/memcheck/mc_main.c
===================================================================
--- branches/COMPVBITS/memcheck/mc_main.c	2005-12-15 19:53:50 UTC (rev 5350)
+++ branches/COMPVBITS/memcheck/mc_main.c	2005-12-15 21:18:34 UTC (rev 5351)
@@ -398,52 +398,106 @@
=20
/* --------------- Secondary V bit table ------------ */
=20
-// XXX: this table can hold out-of-date stuff. Eg. write a partially
-// defined byte, then overwrite it with a fully defined byte. The info =
for
-// the partially defined bytes will still be here. But it shouldn't eve=
r
-// get accessed, I think...
+// Note: the nodes in this table can become stale.  Eg. if you write a
+// partially defined byte (PDB), then overwrite the same address with a
+// fully defined byte, the sec-V-bit node will not necessarily be removed.
+// This is because checking for whether removal is necessary would slow down
+// the fast paths.  Hopefully this is not a problem.  If it becomes a
+// problem, we may have to consider doing a clean-up pass every so often.
=20
-// XXX: profile, esp. with Julian's random-ORing stress test. Could may=
be
-// store in chunks up to a page size.
+static OSet* secVBitTable;
=20
-OSet* secVBitTable;
+static ULong sec_vbits_bytes_allocd =3D 0;
+static ULong sec_vbits_bytes_freed =3D 0;
+static ULong sec_vbits_bytes_curr =3D 0;
+static ULong sec_vbits_bytes_peak =3D 0;
=20
+// 4 is the best value here.  We can go from 1 to 4 for free -- it doesn't
+// change the size of the SecVBitNode because of padding.  If we make it
+// larger, we have bigger nodes, but can possibly fit more partially defined
+// bytes in each node.  In practice it seems that partially defined bytes
+// are not clustered close to each other, so going bigger than 4 does not
+// save space.
+#define BYTES_PER_SEC_VBIT_NODE 4
+
typedef=20
struct {
Addr a;
- UWord vbits8;
+ UChar vbits8[BYTES_PER_SEC_VBIT_NODE];
}=20
SecVBitNode;
=20
static UWord get_sec_vbits8(Addr a)
{
- SecVBitNode* n;
- n =3D VG_(OSet_Lookup)(secVBitTable, &a);
+ Addr aAligned =3D VG_ROUNDDN(a, BYTES_PER_SEC_VBIT_NODE);
+ Int amod =3D a % BYTES_PER_SEC_VBIT_NODE;
+ SecVBitNode* n =3D VG_(OSet_Lookup)(secVBitTable, &aAligned);
+ UChar vbits8;
tl_assert(n);
// Shouldn't be fully defined or fully undefined -- those cases shoul=
dn't
// make it to the secondary V bits table.
- tl_assert(V_BITS8_VALID !=3D n->vbits8 && V_BITS8_INVALID !=3D n->vbi=
ts8 );
- return n->vbits8;
+ vbits8 =3D n->vbits8[amod];
+ tl_assert(V_BITS8_VALID !=3D vbits8 && V_BITS8_INVALID !=3D vbits8);
+ return vbits8;
}
=20
static void set_sec_vbits8(Addr a, UWord vbits8)
{
- SecVBitNode* n;
- n =3D VG_(OSet_Lookup)(secVBitTable, &a);
+ Addr aAligned =3D VG_ROUNDDN(a, BYTES_PER_SEC_VBIT_NODE);
+ Int i, amod =3D a % BYTES_PER_SEC_VBIT_NODE;
+ SecVBitNode* n =3D VG_(OSet_Lookup)(secVBitTable, &aAligned);
// Shouldn't be fully defined or fully undefined -- those cases shoul=
dn't
// make it to the secondary V bits table.
- tl_assert(V_BITS8_VALID !=3D vbits8 && V_BITS8_INVALID !=3D vbits8 );
+ tl_assert(V_BITS8_VALID !=3D vbits8 && V_BITS8_INVALID !=3D vbits8);
if (n) {
- n->vbits8 =3D vbits8; // update
+ n->vbits8[amod] =3D vbits8; // update
} else {
+ // New node: assign the specific byte, make the rest invalid (the=
y
+ // should never be read as-is, but be cautious).
+ sec_vbits_bytes_allocd +=3D sizeof(SecVBitNode);
+ sec_vbits_bytes_curr +=3D sizeof(SecVBitNode);
+ if (sec_vbits_bytes_curr > sec_vbits_bytes_peak)
+ sec_vbits_bytes_peak =3D sec_vbits_bytes_curr;
n =3D VG_(OSet_AllocNode)(secVBitTable, sizeof(SecVBitNode));
- n->a =3D a;
- n->vbits8 =3D vbits8;
+ n->a =3D aAligned;
+ for (i =3D 0; i < BYTES_PER_SEC_VBIT_NODE; i++) {
+ n->vbits8[i] =3D V_BITS8_INVALID;
+ }
+ n->vbits8[amod] =3D vbits8;
VG_(OSet_Insert)(secVBitTable, n);
}
}
=20
+// Remove the node if its V bytes (other than the one for 'a') are all f=
ully
+// defined or fully undefined. We ignore the V byte for 'a' because it'=
s
+// about to be overwritten with a fully defined or fully undefined value=
.
+__attribute__((unused))
+static void maybe_remove_sec_vbits8(Addr a)
+{
+ Addr aAligned =3D VG_ROUNDDN(a, BYTES_PER_SEC_VBIT_NODE);
+ Int i, amod =3D a % BYTES_PER_SEC_VBIT_NODE;
+ SecVBitNode* n =3D VG_(OSet_Lookup)(secVBitTable, &aAligned);
+ tl_assert(n);
+ for (i =3D 0; i < BYTES_PER_SEC_VBIT_NODE; i++) {
+ UChar vbits8 =3D n->vbits8[i];
=20
+ // Ignore the V byte for 'a'.
+ if (i =3D=3D amod)
+ continue;
+ =20
+ // One of the other V bytes is still partially defined -- don't re=
move
+ // this entry from the table.
+ if (V_BITS8_VALID !=3D vbits8 && V_BITS8_INVALID !=3D vbits8)
+ return;
+ }
+ n =3D VG_(OSet_Remove)(secVBitTable, &aAligned);
+ VG_(OSet_FreeNode)(secVBitTable, n);
+ sec_vbits_bytes_freed +=3D sizeof(SecVBitNode);
+ sec_vbits_bytes_curr -=3D sizeof(SecVBitNode);
+ tl_assert(n);
+}
+
+
/* --------------- Endianness helpers --------------- */
=20
/* Returns the offset in memory of the byteno-th most significant byte
@@ -457,7 +511,7 @@
/* --------------- Fundamental functions --------------- */
=20
static inline
-void insert_vabit8_into_vabits32 ( Addr a, UChar vabits8, UChar* vabits3=
2 )
+void insert_vabits8_into_vabits32 ( Addr a, UChar vabits8, UChar* vabits=
32 )
{
UInt shift =3D (a & 3) << 1; // shift by 0, 2, 4, or 6
*vabits32 &=3D ~(0x3 << shift); // mask out the two old bits
@@ -489,10 +543,7 @@
{
SecMap* sm =3D get_secmap_writable(a);
UWord sm_off =3D SM_OFF(a);
-// VG_(printf)("se:%p, %d\n", a, sm_off);
-// VG_(printf)("s1:%p (0x%x)\n", &(sm->vabits32[sm_off]), vabits8);
- insert_vabit8_into_vabits32( a, vabits8, &(sm->vabits32[sm_off]) );
-// VG_(printf)("s2: 0x%x\n", sm->vabits32[sm_off]);
+ insert_vabits8_into_vabits32( a, vabits8, &(sm->vabits32[sm_off]) );
}
=20
static inline
@@ -518,16 +569,16 @@
static
ULong mc_LOADVn_slow ( Addr a, SizeT szB, Bool bigendian )
{
- /* Make up a result V word, which contains the loaded data for
+ /* Make up a 64-bit result V word, which contains the loaded data for
valid addresses and Defined for invalid addresses. Iterate over
the bytes in the word, from the most significant down to the
least. */
- ULong vw =3D V_BITS64_INVALID;
+ ULong vbits64 =3D V_BITS64_INVALID;
SizeT i =3D szB-1;
SizeT n_addrs_bad =3D 0;
Addr ai;
Bool partial_load_exemption_applies;
- UWord vbyte, vabits8;
+ UWord vbits8, vabits8;
=20
PROF_EVENT(30, "mc_LOADVn_slow");
tl_assert(szB =3D=3D 8 || szB =3D=3D 4 || szB =3D=3D 2 || szB =3D=3D =
1);
@@ -541,17 +592,17 @@
// XXX: We check in order of most likely to least likely...
// XXX: could maybe have a little lookup table instead of these
// chained conditionals? and elsewhere?
- if ( VA_BITS8_READABLE =3D=3D vabits8 ) { vbyte =3D V_BITS8_V=
ALID; }
- else if ( VA_BITS8_WRITABLE =3D=3D vabits8 ) { vbyte =3D V_BITS8_I=
NVALID; }
+ if ( VA_BITS8_READABLE =3D=3D vabits8 ) { vbits8 =3D V_BITS8_=
VALID; }
+ else if ( VA_BITS8_WRITABLE =3D=3D vabits8 ) { vbits8 =3D V_BITS8_=
INVALID; }
else if ( VA_BITS8_NOACCESS =3D=3D vabits8 ) {
- vbyte =3D V_BITS8_VALID; // Make V bits defined!
+ vbits8 =3D V_BITS8_VALID; // Make V bits defined!
n_addrs_bad++;
} else {
tl_assert( VA_BITS8_OTHER =3D=3D vabits8 );
- vbyte =3D get_sec_vbits8(ai);
+ vbits8 =3D get_sec_vbits8(ai);
}
- vw <<=3D 8;=20
- vw |=3D vbyte;
+ vbits64 <<=3D 8;=20
+ vbits64 |=3D vbits8;
if (i =3D=3D 0) break;
i--;
}
@@ -577,7 +628,7 @@
if (n_addrs_bad > 0 && !partial_load_exemption_applies)
mc_record_address_error( VG_(get_running_tid)(), a, szB, False );
=20
- return vw;
+ return vbits64;
}
=20
=20
@@ -585,7 +636,7 @@
void mc_STOREVn_slow ( Addr a, SizeT szB, ULong vbytes, Bool bigendian )
{
SizeT i, n_addrs_bad =3D 0;
- UWord vbyte, vabits8;
+ UWord vbits8, vabits8;
Addr ai;
=20
PROF_EVENT(35, "mc_STOREVn_slow");
@@ -597,17 +648,33 @@
for (i =3D 0; i < szB; i++) {
PROF_EVENT(36, "mc_STOREVn_slow(loop)");
ai =3D a+byte_offset_w(szB,bigendian,i);
- vbyte =3D vbytes & 0xff;
+ vbits8 =3D vbytes & 0xff;
vabits8 =3D get_vabits8(ai);
if ( VA_BITS8_NOACCESS !=3D vabits8 ) {
// Addressable. Convert in-register format to in-memory format=
.
- if ( V_BITS8_VALID =3D=3D vbyte ) { vabits8 =3D VA_BITS8=
_READABLE; }
- else if ( V_BITS8_INVALID =3D=3D vbyte ) { vabits8 =3D VA_BITS8=
_WRITABLE; }
- else {=20
+ // Also remove any existing sec V bit entry for the byte if no
+ // longer necessary.
+ //
+ // XXX: the calls to maybe_remove_sec_vbits8() are commented ou=
t
+ // because they slow things down a bit (eg. 10% for perf/bz2)
+ // and the space saving is quite small (eg. 1--2% reduction in =
the
+ // size of the sec-V-bit-table?)
+ if ( V_BITS8_VALID =3D=3D vbits8 ) {=20
+// if (VA_BITS8_OTHER =3D=3D vabits8)
+// maybe_remove_sec_vbits8(ai);
+ vabits8 =3D VA_BITS8_READABLE;=20
+
+ } else if ( V_BITS8_INVALID =3D=3D vbits8 ) {=20
+// if (VA_BITS8_OTHER =3D=3D vabits8)
+// maybe_remove_sec_vbits8(ai);
+ vabits8 =3D VA_BITS8_WRITABLE;=20
+
+ } else {=20
vabits8 =3D VA_BITS8_OTHER;
- set_sec_vbits8(ai, vbyte);
+ set_sec_vbits8(ai, vbits8);
}
set_vabits8(ai, vabits8);
+
} else {
// Unaddressable! Do nothing -- when writing to unaddressable
// memory it acts as a black hole, and the V bits can never be =
seen
@@ -827,7 +894,7 @@
if (lenA < 1) break;
PROF_EVENT(156, "set_address_range_perms-loop1a");
sm_off =3D SM_OFF(a);
- insert_vabit8_into_vabits32( a, vabits8, &(sm->vabits32[sm_off]) )=
;
+ insert_vabits8_into_vabits32( a, vabits8, &(sm->vabits32[sm_off]) =
);
a +=3D 1;
lenA -=3D 1;
}
@@ -845,7 +912,7 @@
if (lenA < 1) break;
PROF_EVENT(158, "set_address_range_perms-loop1b");
sm_off =3D SM_OFF(a);
- insert_vabit8_into_vabits32( a, vabits8, &(sm->vabits32[sm_off]) )=
;
+ insert_vabits8_into_vabits32( a, vabits8, &(sm->vabits32[sm_off]) =
);
a +=3D 1;
lenA -=3D 1;
}
@@ -916,7 +983,7 @@
if (lenB < 1) return;
PROF_EVENT(164, "set_address_range_perms-loop1c");
sm_off =3D SM_OFF(a);
- insert_vabit8_into_vabits32( a, vabits8, &(sm->vabits32[sm_off]) )=
;
+ insert_vabits8_into_vabits32( a, vabits8, &(sm->vabits32[sm_off]) =
);
a +=3D 1;
lenB -=3D 1;
}
@@ -3078,11 +3145,11 @@
// Convert full V-bits in register to compact 2-bit form.
// XXX: is it best to check for VALID before INVALID?
if (V_BITS8_VALID =3D=3D vbyte) {
- insert_vabit8_into_vabits32( a, VA_BITS8_READABLE,
- &(sm->vabits32[sm_off]) );
+ insert_vabits8_into_vabits32( a, VA_BITS8_READABLE,
+ &(sm->vabits32[sm_off]) );
} else if (V_BITS8_INVALID =3D=3D vbyte) {
- insert_vabit8_into_vabits32( a, VA_BITS8_WRITABLE,
- &(sm->vabits32[sm_off]) );
+ insert_vabits8_into_vabits32( a, VA_BITS8_WRITABLE,
+ &(sm->vabits32[sm_off]) );
} else {
/* Slow but general case -- writing partially defined bytes. */
PROF_EVENT(272, "helperc_STOREV1-slow2");
|
|
From: Oswald B. <os...@kd...> - 2005-12-16 06:15:57
|
On Thu, Dec 15, 2005 at 09:18:34PM +0000, sv...@va... wrote: > +// 4 is the best value here. We can go from 1 to 4 for free -- it doesn't > +// change the size of the SecVBitNode because of padding. If we make it > +// larger, we have bigger nodes, but can possibly fit more partially defined > +// bytes in each node. In practice it seems that partially defined bytes > +// are not clustered close to each other, so going bigger than 4 does not > +// save space. > +#define BYTES_PER_SEC_VBIT_NODE 4 > + this is a very 32 bit assumption. i'd define it to sizeof(Addr). just being consequent about the "for free" principle ... unless i missed something about v's internals. :) -- Hi! I'm a .signature virus! Copy me into your ~/.signature, please! -- Chaos, panic, and disorder - my work here is done. |
|
From: Julian S. <js...@ac...> - 2005-12-16 09:54:29
|
> > +#define BYTES_PER_SEC_VBIT_NODE 4 > > + > > this is a very 32 bit assumption. i'd define it to sizeof(Addr). just > being consequent about the "for free" principle ... unless i missed > something about v's internals. :) In this particular case the value is completely unrelated to the word size, so it has no bad effect on 64-bit systems. J |
|
From: Nicholas N. <nj...@cs...> - 2005-12-16 16:11:51
|
On Fri, 16 Dec 2005, Julian Seward wrote: >>> +#define BYTES_PER_SEC_VBIT_NODE 4 >>> + >> >> this is a very 32 bit assumption. i'd define it to sizeof(Addr). just >> being consequent about the "for free" principle ... unless i missed >> something about v's internals. :) > > In this particular case the value is completely unrelated to the > word size, so it has no bad effect on 64-bit systems. It doesn't affect correctness, but Oswald is right that changing it to sizeof(UWord) will get us an extra 4 bytes of coverage per node for free on 64-bit systems. I've made the change. Nick |