|
From: Matthias S. <zz...@ge...> - 2015-07-23 19:12:37
|
There are two testcases using partial defined bytes (bz2 and heap_pdb4).
Adding larger caches did not increase the hit rate, so I implemented only
a one-element cache.
Pdb testcases get faster by 2 to 2.9 percent.
The only strange observation is that bz2 gets slower on amd64.
hitrate:
perf/bz2: 8142653/8405577 = 96.9%
perf/heap_pdb4: 6000000/9600000 = 62.5%
x86 performance (--reps=10):
-- bz2 --
bz2 orig :0.42s me:5.08s (12.10x, -----)
bz2 withcache :0.42s me:4.98s (11.86x, 2.0%)
-- heap_pdb4 --
heap_pdb4 orig :0.08s me:7.18s (89.75x, -----)
heap_pdb4 withcache :0.08s me:6.97s (87.12x, 2.9%)
== 2 programs, 4 timings =================
amd64 performance:
-- bz2 --
bz2 orig :0.42s me:4.47s (10.64x, -----)
bz2 withcache :0.42s me:4.54s (10.81x, -1.6%)
-- heap_pdb4 --
heap_pdb4 orig :0.07s me:6.42s (91.71x, -----)
heap_pdb4 withcache :0.07s me:6.29s (89.86x, 2.0%)
== 2 programs, 4 timings =================
---
memcheck/mc_main.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 66 insertions(+), 2 deletions(-)
diff --git a/memcheck/mc_main.c b/memcheck/mc_main.c
index f30460a..ed0b73e 100644
--- a/memcheck/mc_main.c
+++ b/memcheck/mc_main.c
@@ -918,6 +918,53 @@ typedef
}
SecVBitNode;
+// Additional cache of one element.
+// No perf testcase performed better with a larger cache.
+typedef
+ struct {
+ Addr alignedAddr;
+ SecVBitNode *node;
+ }
+ SecVBitCache;
+static SecVBitCache secVBitCache;
+
+static void SecVBitCacheAdd(Addr aAligned, SecVBitNode * n)
+{
+ DEBUG("SecVBitCacheAdd %#lx %#lx\n", (long)aAligned, (long)n);
+ PROF_EVENT(141, "SecVBitCacheAdd");
+ secVBitCache.alignedAddr = aAligned;
+ secVBitCache.node = n;
+}
+
+static SecVBitNode *SecVBitCacheGet(Addr aAligned)
+{
+ SecVBitNode * n;
+ PROF_EVENT(142, "SecVBitCacheGet");
+ if (LIKELY(secVBitCache.alignedAddr == aAligned))
+ {
+ PROF_EVENT(143, "SecVBitCacheGet_hit");
+ DEBUG("SecVBitCacheGet hit: %#lx %#lx\n", (long)aAligned, (long)secVBitCache.node);
+ return secVBitCache.node;
+ }
+
+ PROF_EVENT(144, "SecVBitCacheGet_miss");
+ n = VG_(OSetGen_Lookup)(secVBitTable, &aAligned);
+ DEBUG("SecVBitCacheGet miss %#lx %#lx\n", (long)aAligned, (long)n);
+
+ // Is unconditionally writing faster than branching?
+ // If the value is NULL, a node will be created directly after this call.
+ if (LIKELY(n))
+ SecVBitCacheAdd(aAligned, n);
+ return n;
+}
+
+static void SecVBitCacheClear(void)
+{
+ PROF_EVENT(140, "SecVBitCacheClear");
+ DEBUG("SecVBitCacheClear\n");
+ secVBitCache.alignedAddr = 0;
+}
+
static OSet* createSecVBitTable(void)
{
OSet* newSecVBitTable;
@@ -938,6 +985,7 @@ static void gcSecVBitTable(void)
Int i, n_nodes = 0, n_survivors = 0;
GCs_done++;
+ SecVBitCacheClear();
// Create the new table.
secVBitTable2 = createSecVBitTable();
@@ -999,7 +1047,7 @@ static UWord get_sec_vbits8(Addr a)
{
Addr aAligned = VG_ROUNDDN(a, BYTES_PER_SEC_VBIT_NODE);
Int amod = a % BYTES_PER_SEC_VBIT_NODE;
- SecVBitNode* n = VG_(OSetGen_Lookup)(secVBitTable, &aAligned);
+ SecVBitNode* n = SecVBitCacheGet(aAligned);
UChar vbits8;
tl_assert2(n, "get_sec_vbits8: no node for address %p (%p)\n", aAligned, a);
// Shouldn't be fully defined or fully undefined -- those cases shouldn't
@@ -1013,7 +1061,7 @@ static void set_sec_vbits8(Addr a, UWord vbits8)
{
Addr aAligned = VG_ROUNDDN(a, BYTES_PER_SEC_VBIT_NODE);
Int i, amod = a % BYTES_PER_SEC_VBIT_NODE;
- SecVBitNode* n = VG_(OSetGen_Lookup)(secVBitTable, &aAligned);
+ SecVBitNode* n = SecVBitCacheGet(aAligned);
// Shouldn't be fully defined or fully undefined -- those cases shouldn't
// make it to the secondary V bits table.
tl_assert(V_BITS8_DEFINED != vbits8 && V_BITS8_UNDEFINED != vbits8);
@@ -1043,6 +1091,8 @@ static void set_sec_vbits8(Addr a, UWord vbits8)
n_secVBit_nodes = VG_(OSetGen_Size)(secVBitTable);
if (n_secVBit_nodes > max_secVBit_nodes)
max_secVBit_nodes = n_secVBit_nodes;
+
+ SecVBitCacheAdd(aAligned, n);
}
}
@@ -1359,6 +1409,13 @@ ULong mc_LOADVn_slow ( Addr a, SizeT nBits, Bool bigendian )
UChar vbits8;
Bool ok;
+ if ((nBits == 16 && a & 1)
+ || (nBits == 32 && a & 3)
+ || (nBits == 64 && a & 7)
+ )
+ {
+ PROF_EVENT(32, "mc_LOADVn_slow_unaligned");
+ }
tl_assert(nBits == 64 || nBits == 32 || nBits == 16 || nBits == 8);
/* Make up a 64-bit result V word, which contains the loaded data
@@ -1525,6 +1582,13 @@ void mc_STOREVn_slow ( Addr a, SizeT nBits, ULong vbytes, Bool bigendian )
}
/* ------------ END semi-fast cases ------------ */
+ if ((nBits == 16 && a & 1)
+ || (nBits == 32 && a & 3)
+ || (nBits == 64 && a & 7)
+ )
+ {
+ PROF_EVENT(37, "mc_STOREVn_slow_unaligned");
+ }
tl_assert(nBits == 64 || nBits == 32 || nBits == 16 || nBits == 8);
/* Dump vbytes in memory, iterating from least to most significant
--
2.4.5
|