Author: sewardj
Date: Thu Oct 23 20:48:01 2014
New Revision: 14661
Log:
Darwin only: add a filter mechanism that aims to remove pointless
memory-map resync operations. Without the filter, such operations
come to dominate the running time of complex apps with thousands of
memory segments (eg Firefox) and it becomes unusably slow. With
the filter in place, the huge performance loss is mostly avoided.
Has no meaning and no effect on non-Darwin targets. Controlled by
flag --resync-filter=no|yes|verbose [yes]. Filter is currently only
set up for Mac OS X 10.9 (Mavericks) 64 bit and will not produce
any performance benefit on any other configuration.
Modified:
trunk/coregrind/m_main.c
trunk/coregrind/m_options.c
trunk/coregrind/m_syswrap/priv_syswrap-darwin.h
trunk/coregrind/m_syswrap/syswrap-darwin.c
trunk/coregrind/m_syswrap/syswrap-main.c
trunk/coregrind/pub_core_options.h
Modified: trunk/coregrind/m_main.c
==============================================================================
--- trunk/coregrind/m_main.c (original)
+++ trunk/coregrind/m_main.c Thu Oct 23 20:48:01 2014
@@ -807,6 +807,13 @@
else if VG_BINT_CLO(arg, "--unw-stack-scan-frames",
VG_(clo_unw_stack_scan_frames), 0, 32) {}
+ else if VG_XACT_CLO(arg, "--resync-filter=no",
+ VG_(clo_resync_filter), 0) {}
+ else if VG_XACT_CLO(arg, "--resync-filter=yes",
+ VG_(clo_resync_filter), 1) {}
+ else if VG_XACT_CLO(arg, "--resync-filter=verbose",
+ VG_(clo_resync_filter), 2) {}
+
else if ( ! VG_(needs).command_line_options
|| ! VG_TDICT_CALL(tool_process_cmd_line_option, arg) ) {
VG_(fmsg_bad_option)(arg, "");
@@ -870,6 +877,14 @@
"because it doesn't generate errors.\n", VG_(details).name);
}
+# if !defined(VGO_darwin)
+ if (VG_(clo_resync_filter) != 0) {
+ VG_(fmsg_bad_option)("--resync-filter=yes or =verbose",
+ "--resync-filter= is only available on MacOS X.\n");
+ /*NOTREACHED*/
+ }
+# endif
+
/* If XML output is requested, check that the tool actually
supports it. */
if (VG_(clo_xml) && !VG_(needs).xml_output) {
Modified: trunk/coregrind/m_options.c
==============================================================================
--- trunk/coregrind/m_options.c (original)
+++ trunk/coregrind/m_options.c Thu Oct 23 20:48:01 2014
@@ -129,6 +129,12 @@
UInt VG_(clo_unw_stack_scan_thresh) = 0; /* disabled by default */
UInt VG_(clo_unw_stack_scan_frames) = 5;
+#if defined(VGO_darwin)
+UInt VG_(clo_resync_filter) = 1; /* enabled, but quiet */
+#else
+UInt VG_(clo_resync_filter) = 0; /* disabled */
+#endif
+
/*====================================================================*/
/*=== File expansion ===*/
Modified: trunk/coregrind/m_syswrap/priv_syswrap-darwin.h
==============================================================================
--- trunk/coregrind/m_syswrap/priv_syswrap-darwin.h (original)
+++ trunk/coregrind/m_syswrap/priv_syswrap-darwin.h Thu Oct 23 20:48:01 2014
@@ -51,7 +51,7 @@
void VG_(show_open_ports)(void);
-Bool ML_(sync_mappings)(const HChar *when, const HChar *where, Int num);
+Bool ML_(sync_mappings)(const HChar *when, const HChar *where, UWord num);
// Unix syscalls.
// GEN = it uses the generic wrapper
Modified: trunk/coregrind/m_syswrap/syswrap-darwin.c
==============================================================================
--- trunk/coregrind/m_syswrap/syswrap-darwin.c (original)
+++ trunk/coregrind/m_syswrap/syswrap-darwin.c Thu Oct 23 20:48:01 2014
@@ -612,9 +612,132 @@
sync_mappings
------------------------------------------------------------------ */
-Bool ML_(sync_mappings)(const HChar *when, const HChar *where, Int num)
+typedef
+ enum { CheckAlways=1, CheckEvery20, CheckNever }
+ CheckHowOften;
+
+static const HChar* show_CheckHowOften ( CheckHowOften cho ) {
+ switch (cho) {
+ case CheckAlways: return "Always ";
+ case CheckEvery20: return "Every20";
+ case CheckNever: return "Never ";
+ default: vg_assert(0);
+ }
+}
+
+/* Statistics for one particular resync-call set of arguments,
+ as specified by key1, key2 and key3. */
+typedef
+ struct {
+ CheckHowOften cho;
+ const HChar* key1;
+ const HChar* key2;
+ UWord key3;
+ ULong n_checks;
+ ULong n_mappings_added;
+ ULong n_mappings_removed;
+ }
+ SyncStats;
+
+static Bool cmp_eqkeys_SyncStats ( SyncStats* ss1, SyncStats* ss2 ) {
+ return ss1->key3 == ss2->key3
+ && 0 == VG_(strcmp)(ss1->key1, ss2->key1)
+ && 0 == VG_(strcmp)(ss1->key2, ss2->key2);
+}
+
+/* The filter data. */
+#define N_SYNCSTATS 1000
+static Int syncstats_used = 0;
+static SyncStats syncstats[N_SYNCSTATS];
+
+/* Statistics overall, for the filter. */
+static ULong n_syncsRequested = 0; // Total number requested
+static ULong n_syncsPerformed = 0; // Number carried out (the rest skipped)
+
+
+static
+void update_syncstats ( CheckHowOften cho,
+ const HChar* key1, const HChar* key2,
+ UWord key3,
+ UInt n_mappings_added, UInt n_mappings_removed )
+{
+ SyncStats dummy = { CheckAlways, key1, key2, key3, 0, 0, 0 };
+ Int i;
+ for (i = 0; i < syncstats_used; i++) {
+ if (cmp_eqkeys_SyncStats(&syncstats[i], &dummy))
+ break;
+ }
+ vg_assert(i >= 0 && i <= syncstats_used);
+ if (i == syncstats_used) {
+ // alloc new
+ vg_assert(syncstats_used < N_SYNCSTATS);
+ syncstats_used++;
+ syncstats[i] = dummy;
+ syncstats[i].cho = cho;
+ }
+ vg_assert(cmp_eqkeys_SyncStats(&syncstats[i], &dummy));
+ syncstats[i].n_checks++;
+ syncstats[i].n_mappings_added += (ULong)n_mappings_added;
+ syncstats[i].n_mappings_removed += (ULong)n_mappings_removed;
+ // reorder
+ static UInt reorder_ctr = 0;
+ if (i > 0 && 0 == (1 & reorder_ctr++)) {
+ SyncStats tmp = syncstats[i-1];
+ syncstats[i-1] = syncstats[i];
+ syncstats[i] = tmp;
+ }
+}
+
+
+static void maybe_show_syncstats ( void )
+{
+ Int i;
+
+ // display
+ if (0 == (n_syncsRequested & 0xFF)) {
+ VG_(printf)("Resync filter: %'llu requested, %'llu performed (%llu%%)\n",
+ n_syncsRequested, n_syncsPerformed,
+ (100 * n_syncsPerformed) /
+ (n_syncsRequested == 0 ? 1 : n_syncsRequested));
+ for (i = 0; i < syncstats_used; i++) {
+ if (i >= 40) break; // just show the top 40
+ VG_(printf)(" [%3d] (%s) upd %6llu diff %4llu+,%3llu-"
+ " %s %s 0x%08llx\n",
+ i, show_CheckHowOften(syncstats[i].cho),
+ syncstats[i].n_checks,
+ syncstats[i].n_mappings_added,
+ syncstats[i].n_mappings_removed,
+ syncstats[i].key1, syncstats[i].key2,
+ (ULong)syncstats[i].key3);
+ }
+ if (i < syncstats_used) {
+ VG_(printf)(" and %d more entries not shown.\n", syncstats_used - i);
+ }
+ VG_(printf)("\n");
+ }
+}
+
+
+Bool ML_(sync_mappings)(const HChar* when, const HChar* where, UWord num)
{
- // Usually the number of segments added/removed in a single calls is very
+ // If VG(clo_resync_filter) == 0, the filter is disabled, and
+ // we must always honour the resync request.
+ //
+ // If VG(clo_resync_filter) == 1, the filter is enabled,
+ // so we try to avoid doing the sync if possible, but keep
+ // quiet.
+ //
+ // If VG(clo_resync_filter) == 2, the filter is enabled,
+ // so we try to avoid doing the sync if possible, and also
+ // periodically show stats, so that the filter can be updated.
+ // (by hand).
+
+ if (VG_(clo_resync_filter) >= 2)
+ maybe_show_syncstats();
+
+ n_syncsRequested++;
+
+ // Usually the number of segments added/removed in a single call is very
// small e.g. 1. But it sometimes gets up to at least 100 or so (eg. for
// Quicktime). So we use a repeat-with-bigger-buffers-until-success model,
// because we can't do dynamic allocation within VG_(get_changed_segments),
@@ -625,10 +748,137 @@
Int i;
Bool ok;
- if (VG_(clo_trace_syscalls)) {
+ // -------------- BEGIN resync-filter-kludge --------------
+ //
+ // Some kludges to try and avoid the worst case cost hit of doing
+ // zillions of resyncs (huge). The idea is that many of the most
+ // common resyncs never appear to cause a delta, so we just ignore
+ // them (CheckNever). Then, a bunch of them also happen a lot, but
+ // only very occasionally cause a delta. We resync after 20 of those
+ // (CheckEvery20). Finally, the rest form a long tail, so we always
+ // resync after those (CheckAlways).
+ //
+ // Assume this is kernel-version and word-size specific, so develop
+ // filters accordingly. This might be overly conservative --
+ // I don't know.
+
+# define STREQ(_s1, _s2) (0 == VG_(strcmp)((_s1),(_s2)))
+ Bool when_in = STREQ(when, "in");
+ Bool when_after = STREQ(when, "after");
+ Bool where_mmr = STREQ(where, "mach_msg_receive");
+ Bool where_mmrU = STREQ(where, "mach_msg_receive-UNHANDLED");
+ Bool where_iuct = STREQ(where, "iokit_user_client_trap");
+ Bool where_MwcN = STREQ(where, "ML_(wqthread_continue_NORETURN)");
+ Bool where_woQR = STREQ(where, "workq_ops(QUEUE_REQTHREADS)");
+ Bool where_woTR = STREQ(where, "workq_ops(THREAD_RETURN)");
+ Bool where_ke64 = STREQ(where, "kevent64");
+# undef STREQ
+
+ vg_assert(
+ 1 >= ( (where_mmr ? 1 : 0) + (where_mmrU ? 1 : 0)
+ + (where_iuct ? 1 : 0) + (where_MwcN ? 1 : 0)
+ + (where_woQR ? 1 : 0) + (where_woTR ? 1 : 0)
+ + (where_ke64 ? 1 : 0)
+ ));
+ // merely to stop gcc complaining of non-use in the case where
+ // there's no filter:
+ vg_assert(when_in == True || when_in == False);
+ vg_assert(when_after == True || when_after == False);
+
+ CheckHowOften check = CheckAlways;
+
+# if DARWIN_VERS == DARWIN_10_9 && VG_WORDSIZE == 8
+ /* ------ BEGIN filter for 64-bit 10.9.x ------ */
+ if (when_after && where_mmr) {
+ // "after mach_msg_receive <number>"
+ switch (num) {
+ case 0x00000000: // upd 12414 diff 36+,0-
+ check = CheckEvery20;
+ break;
+ default:
+ break;
+ }
+ }
+ else
+ if (when_after && where_mmrU) {
+ // "after mach_msg_receive-UNHANDLED <number>"
+ switch (num) {
+ case 0x00000000: // upd 16687 diff 73+,0-
+ case 0x00000001: // upd 5106 diff 89+,0-
+ case 0x00000002: // upd 1609 diff 1+,0-
+ case 0x00000003: // upd 1987 diff 6+,0-
+ // case 0x00000b95: // upd 2894 diff 57+,1- <==dangerous
+ case 0x000072d9: // upd 2616 diff 11+,0-
+ case 0x000072cb: // upd 2616 diff 9+,0-
+ case 0x000074d5: // upd 172 diff 0+,0-
+ check = CheckEvery20;
+ break;
+ default:
+ break;
+ }
+ }
+ else
+ if (when_in && where_MwcN && num == 0x00000000) {
+ // in ML_(wqthread_continue_NORETURN) 0x00000000
+ // upd 4346 diff 0+,0-
+ check = CheckEvery20;
+ }
+ else
+ if (when_after && where_woQR && num == 0x00000000) {
+ // after workq_ops(QUEUE_REQTHREADS) 0x00000000
+ // upd 14434 diff 102+,0-
+ check = CheckEvery20;
+ }
+ else
+ if (when_after && where_woTR && num == 0x00000000) {
+ // after workq_ops(THREAD_RETURN) 0x00000000
+ // upd 14434 diff 102+,0-
+ check = CheckEvery20;
+ }
+ else
+ if (when_after && where_ke64 && num == 0x00000000) {
+ // after kevent64 0x00000000
+ // upd 1736 diff 78+,0-
+ check = CheckEvery20;
+ }
+ /* ------- END filter for 64-bit 10.9.x ------- */
+# endif /* DARWIN_VERS == DARWIN_10_9 && VG_WORDSIZE == 8 */
+
+ /* Regardless of what the filter says, force a sync every 1 time in
+ 1000, to stop things getting too far out of sync. */
+ {
+ static UInt ctr1k = 0;
+ ctr1k++;
+ if ((ctr1k % 1000) == 0)
+ check = CheckAlways;
+ }
+
+ /* If the filter is disabled, we must always check. */
+ if (VG_(clo_resync_filter) == 0)
+ check = CheckAlways;
+
+ switch (check) {
+ case CheckAlways:
+ break;
+ case CheckEvery20: {
+ // only resync once every 20th time
+ static UInt ctr10 = 0;
+ ctr10++;
+ if ((ctr10 % 20) != 0) return False;
+ break;
+ }
+ case CheckNever:
+ return False;
+ default:
+ vg_assert(0);
+ }
+ //
+ // --------------- END resync-filter-kludge ---------------
+
+ if (0 || VG_(clo_trace_syscalls)) {
VG_(debugLog)(0, "syswrap-darwin",
- "sync_mappings(\"%s\", \"%s\", %d)\n",
- when, where, num);
+ "sync_mappings (%s) (\"%s\", \"%s\", 0x%llx)\n",
+ show_CheckHowOften(check), when, where, (ULong)num);
}
// 16 is enough for most cases, but small enough that overflow happens
@@ -637,20 +887,25 @@
ok = False;
while (!ok) {
VG_(free)(css); // css is NULL on first iteration; that's ok.
- css = VG_(calloc)("sys_wrap.sync_mappings", css_size, sizeof(ChangedSeg));
+ css = VG_(calloc)("sys_wrap.sync_mappings",
+ css_size, sizeof(ChangedSeg));
ok = VG_(get_changed_segments)(when, where, css, css_size, &css_used);
css_size *= 2;
}
+ UInt css_added = 0, css_removed = 0;
+
// Now add/remove them.
for (i = 0; i < css_used; i++) {
ChangedSeg* cs = &css[i];
if (cs->is_added) {
+ css_added++;
ML_(notify_core_and_tool_of_mmap)(
cs->start, cs->end - cs->start + 1,
cs->prot, VKI_MAP_PRIVATE, 0, cs->offset);
// should this call VG_(di_notify_mmap) also?
} else {
+ css_removed++;
ML_(notify_core_and_tool_of_munmap)(
cs->start, cs->end - cs->start + 1);
}
@@ -669,6 +924,14 @@
VG_(free)(css);
+ if (0)
+ VG_(debugLog)(0, "syswrap-darwin", "SYNC: %d %s %s\n",
+ css_used, when, where);
+
+ // Update the stats, so we can derive the filter above.
+ n_syncsPerformed++;
+ update_syncstats(check, when, where, num, css_added, css_removed);
+
return css_used > 0;
}
@@ -7128,14 +7391,14 @@
POST(mach_msg_receive)
{
- // mach_msg_header_t *mh = (mach_msg_header_t *)ARG1;
+ mach_msg_header_t *mh = (mach_msg_header_t *)ARG1;
// GrP fixme don't know of anything interesting here currently
// import_complex_message handles everything
// PRINT("UNHANDLED reply %d", mh->msgh_id);
// Assume the call may have mapped or unmapped memory
- ML_(sync_mappings)("after", "mach_msg_receive", 0);
+ ML_(sync_mappings)("after", "mach_msg_receive", mh->msgh_id);
}
PRE(mach_msg_receive)
@@ -7698,7 +7961,8 @@
POST(mach_msg_unhandled)
{
- ML_(sync_mappings)("after", "mach_msg_receive (unhandled)", 0);
+ mach_msg_header_t *mh = (mach_msg_header_t *)ARG1;
+ ML_(sync_mappings)("after", "mach_msg_receive-UNHANDLED", mh->msgh_id);
}
POST(mach_msg_unhandled_check)
Modified: trunk/coregrind/m_syswrap/syswrap-main.c
==============================================================================
--- trunk/coregrind/m_syswrap/syswrap-main.c (original)
+++ trunk/coregrind/m_syswrap/syswrap-main.c Thu Oct 23 20:48:01 2014
@@ -1404,6 +1404,11 @@
vg_assert(tid >= 1 && tid < VG_N_THREADS);
vg_assert(VG_(is_running_thread)(tid));
+# if !defined(VGO_darwin)
+ // Resync filtering is meaningless on non-Darwin targets.
+ vg_assert(VG_(clo_resync_filter) == 0);
+# endif
+
tst = VG_(get_ThreadState)(tid);
/* BEGIN ensure root thread's stack is suitably mapped */
Modified: trunk/coregrind/pub_core_options.h
==============================================================================
--- trunk/coregrind/pub_core_options.h (original)
+++ trunk/coregrind/pub_core_options.h Thu Oct 23 20:48:01 2014
@@ -372,6 +372,12 @@
low by default. Default: 5 */
extern UInt VG_(clo_unw_stack_scan_frames);
+/* Controls the resync-filter on MacOS. Has no effect on Linux.
+ 0=disabled [default on Linux] "no"
+ 1=enabled [default on MacOS] "yes"
+ 2=enabled and verbose. "verbose" */
+extern UInt VG_(clo_resync_filter);
+
#endif // __PUB_CORE_OPTIONS_H
/*--------------------------------------------------------------------*/
|