|
From: <sv...@va...> - 2013-03-27 12:41:09
|
sewardj 2013-03-27 12:40:48 +0000 (Wed, 27 Mar 2013)
New Revision: 13341
Log:
Commit Philippe Waroquiers' initial work multithread valgrind (mtV)
to branches/MTV. See #301830.
(Philippe Waroquiers, phi...@sk...)
Added files:
branches/MTV/coregrind/m_lock.c
branches/MTV/coregrind/m_scheduler/sched-lock-rwlock.c
branches/MTV/docs/internals/mtV.txt
branches/MTV/include/priv_atomic_amd64.h
branches/MTV/include/priv_atomic_x86.h
branches/MTV/include/pub_tool_atomic.h
branches/MTV/include/pub_tool_lock.h
branches/MTV/perf/big_parallel_sleepers.c
branches/MTV/perf/big_parallel_sleepers.vgperf
Modified files:
branches/MTV/coregrind/Makefile.am
branches/MTV/coregrind/m_aspacemgr/aspacemgr-linux.c
branches/MTV/coregrind/m_aspacemgr/priv_aspacemgr.h
branches/MTV/coregrind/m_dispatch/dispatch-amd64-linux.S
branches/MTV/coregrind/m_dispatch/dispatch-x86-linux.S
branches/MTV/coregrind/m_gdbserver/m_gdbserver.c
branches/MTV/coregrind/m_libcassert.c
branches/MTV/coregrind/m_main.c
branches/MTV/coregrind/m_scheduler/priv_sched-lock-impl.h
branches/MTV/coregrind/m_scheduler/priv_sched-lock.h
branches/MTV/coregrind/m_scheduler/priv_sema.h
branches/MTV/coregrind/m_scheduler/sched-lock-generic.c
branches/MTV/coregrind/m_scheduler/sched-lock.c
branches/MTV/coregrind/m_scheduler/scheduler.c
branches/MTV/coregrind/m_scheduler/sema.c
branches/MTV/coregrind/m_scheduler/ticket-lock-linux.c
branches/MTV/coregrind/m_signals.c
branches/MTV/coregrind/m_stacks.c
branches/MTV/coregrind/m_syswrap/syswrap-amd64-linux.c
branches/MTV/coregrind/m_syswrap/syswrap-linux.c
branches/MTV/coregrind/m_syswrap/syswrap-main.c
branches/MTV/coregrind/m_syswrap/syswrap-x86-linux.c
branches/MTV/coregrind/m_threadstate.c
branches/MTV/coregrind/m_transtab.c
branches/MTV/coregrind/pub_core_scheduler.h
branches/MTV/coregrind/pub_core_threadstate.h
branches/MTV/gdbserver_tests/sleepers.c
branches/MTV/include/Makefile.am
branches/MTV/memcheck/mc_main.c
branches/MTV/perf/Makefile.am
Modified: branches/MTV/coregrind/m_signals.c (+6 -5)
===================================================================
--- branches/MTV/coregrind/m_signals.c 2013-03-27 11:43:20 +00:00 (rev 13340)
+++ branches/MTV/coregrind/m_signals.c 2013-03-27 12:40:48 +00:00 (rev 13341)
@@ -2116,7 +2116,8 @@
/* The thread isn't currently running, make it so before going on */
vg_assert(tst->status == VgTs_WaitSys);
- VG_(acquire_BigLock)(tid, "async_signalhandler");
+ VG_(acquire_BigLock)(tid, VgTs_ReadLock, "async_signalhandler");
+ //mtV? unclear which kind of lock we need here. Normally a readlock I guess.
info->si_code = sanitize_si_code(info->si_code);
@@ -2414,12 +2415,13 @@
void sync_signalhandler_from_kernel ( ThreadId tid,
Int sigNo, vki_siginfo_t *info, struct vki_ucontext *uc )
{
+ ThreadState *tst = VG_(get_ThreadState)(tid);
/* Check to see if some part of Valgrind itself is interested in faults.
The fault catcher should never be set whilst we're in generated code, so
check for that. AFAIK the only use of the catcher right now is
memcheck's leak detector. */
if (fault_catcher) {
- vg_assert(VG_(in_generated_code) == False);
+ vg_assert(tst->in_generated_code == False);
(*fault_catcher)(sigNo, (Addr)info->VKI_SIGINFO_si_addr);
/* If the catcher returns, then it didn't handle the fault,
@@ -2434,14 +2436,13 @@
/* OK, this is a signal we really have to deal with. If it came
from the client's code, then we can jump back into the scheduler
and have it delivered. Otherwise it's a Valgrind bug. */
- ThreadState *tst = VG_(get_ThreadState)(tid);
if (VG_(sigismember)(&tst->sig_mask, sigNo)) {
/* signal is blocked, but they're not allowed to block faults */
VG_(set_default_handler)(sigNo);
}
- if (VG_(in_generated_code)) {
+ if (tst->in_generated_code) {
if (VG_(gdbserver_report_signal) (sigNo, tid)
|| VG_(sigismember)(&tst->sig_mask, sigNo)) {
/* Can't continue; must longjmp back to the scheduler and thus
@@ -2553,7 +2554,7 @@
if (VG_(clo_trace_signals))
VG_(dmsg)("sigvgkill for lwp %d tid %d\n", VG_(gettid)(), tid);
- VG_(acquire_BigLock)(tid, "sigvgkill_handler");
+ VG_(acquire_BigLock)(tid, VgTs_ReadLock, "sigvgkill_handler"); //mtV? see async_signalhandler
vg_assert(signo == VG_SIGVGKILL);
vg_assert(si->si_signo == signo);
Modified: branches/MTV/coregrind/m_scheduler/priv_sema.h (+6 -0)
===================================================================
--- branches/MTV/coregrind/m_scheduler/priv_sema.h 2013-03-27 11:43:20 +00:00 (rev 13340)
+++ branches/MTV/coregrind/m_scheduler/priv_sema.h 2013-03-27 12:40:48 +00:00 (rev 13341)
@@ -34,10 +34,16 @@
/* Not really a semaphore, but use a pipe for a token-passing scheme */
typedef struct {
Int pipe[2];
+ Char sema_char;
Int owner_lwpid; /* who currently has it */
Bool held_as_LL; /* if held, True == held by a _LL call */
} vg_sema_t;
+/* Cycle the sema_char passed through the pipe through 'A' .. 'Z' to make
+ it easier to make sense of strace/truss output - makes it possible
+ to see more clearly the change of ownership of the lock. Need to
+ be careful to reinitialise it at fork() time. */
+
// Nb: this may be OS-specific, but let's not factor it out until we
// implement an OS port for which this isn't ok.
void ML_(sema_init) ( vg_sema_t *sema );
Modified: branches/MTV/coregrind/m_syswrap/syswrap-amd64-linux.c (+5 -1)
===================================================================
--- branches/MTV/coregrind/m_syswrap/syswrap-amd64-linux.c 2013-03-27 11:43:20 +00:00 (rev 13340)
+++ branches/MTV/coregrind/m_syswrap/syswrap-amd64-linux.c 2013-03-27 12:40:48 +00:00 (rev 13341)
@@ -57,7 +57,6 @@
#include "priv_syswrap-linux-variants.h" /* decls of linux variant wrappers */
#include "priv_syswrap-main.h"
-
/* ---------------------------------------------------------------------
clone() handling
------------------------------------------------------------------ */
@@ -290,7 +289,12 @@
know that this thread has come into existence. If the clone
fails, we'll send out a ll_exit notification for it at the out:
label below, to clean up. */
+#ifdef SINGLEV
vg_assert(VG_(owns_BigLock_LL)(ptid));
+#else
+ vg_assert(ptst->slk == VgTs_ReadLock);
+ //mtV? or should we acquire the write lock ?
+#endif
VG_TRACK ( pre_thread_ll_create, ptid, ctid );
if (flags & VKI_CLONE_SETTLS) {
Modified: branches/MTV/coregrind/m_aspacemgr/aspacemgr-linux.c (+26 -4)
===================================================================
--- branches/MTV/coregrind/m_aspacemgr/aspacemgr-linux.c 2013-03-27 11:43:20 +00:00 (rev 13340)
+++ branches/MTV/coregrind/m_aspacemgr/aspacemgr-linux.c 2013-03-27 12:40:48 +00:00 (rev 13341)
@@ -300,6 +300,8 @@
static NSegment nsegments[VG_N_SEGMENTS];
static Int nsegments_used = 0;
+static Mutex ns_mutex;
+
#define Addr_MIN ((Addr)0)
#define Addr_MAX ((Addr)(-1ULL))
@@ -1196,14 +1198,22 @@
readonly data. */
NSegment const * VG_(am_find_nsegment) ( Addr a )
{
+ // mtV? Well, we are locking the data structure while we search it.
+ // But we return a pointer to it ???
+ // So, what if nsegments[i] is changed afterwards by another thread ?
+ VG_(mutex_lock)(&ns_mutex);
Int i = find_nsegment_idx(a);
aspacem_assert(i >= 0 && i < nsegments_used);
aspacem_assert(nsegments[i].start <= a);
aspacem_assert(a <= nsegments[i].end);
- if (nsegments[i].kind == SkFree)
+ if (nsegments[i].kind == SkFree) {
+ VG_(mutex_unlock)(&ns_mutex);
return NULL;
- else
+ }
+ else {
+ VG_(mutex_unlock)(&ns_mutex);
return &nsegments[i];
+ }
}
@@ -1323,7 +1333,10 @@
Bool VG_(am_is_valid_for_client)( Addr start, SizeT len,
UInt prot )
{
- return is_valid_for_client( start, len, prot, False/*free not OK*/ );
+ VG_(mutex_lock)(&ns_mutex);
+ Bool res = is_valid_for_client( start, len, prot, False/*free not OK*/ );
+ VG_(mutex_unlock)(&ns_mutex);
+ return res;
}
/* Variant of VG_(am_is_valid_for_client) which allows free areas to
@@ -1333,7 +1346,10 @@
Bool VG_(am_is_valid_for_client_or_free_or_resvn)
( Addr start, SizeT len, UInt prot )
{
- return is_valid_for_client( start, len, prot, True/*free is OK*/ );
+ VG_(mutex_lock)(&ns_mutex);
+ Bool res = is_valid_for_client( start, len, prot, True/*free is OK*/ );
+ VG_(mutex_unlock)(&ns_mutex);
+ return res;
}
@@ -1598,6 +1614,8 @@
NSegment seg;
Addr suggested_clstack_top;
+ VG_(mutex_init)(&ns_mutex);
+
aspacem_assert(sizeof(Word) == sizeof(void*));
aspacem_assert(sizeof(Addr) == sizeof(void*));
aspacem_assert(sizeof(SizeT) == sizeof(void*));
@@ -2023,6 +2041,7 @@
NSegment seg;
Bool needDiscard;
+ VG_(mutex_lock)(&ns_mutex);
aspacem_assert(len > 0);
aspacem_assert(VG_IS_PAGE_ALIGNED(a));
aspacem_assert(VG_IS_PAGE_ALIGNED(len));
@@ -2052,6 +2071,7 @@
}
add_segment( &seg );
AM_SANITY_CHECK;
+ VG_(mutex_unlock)(&ns_mutex);
return needDiscard;
}
@@ -2748,11 +2768,13 @@
segment. */
void VG_(am_set_segment_hasT_if_SkFileC_or_SkAnonC)( const NSegment* seg )
{
+ VG_(mutex_lock)(&ns_mutex);
Int i = segAddr_to_index( seg );
aspacem_assert(i >= 0 && i < nsegments_used);
if (nsegments[i].kind == SkAnonC || nsegments[i].kind == SkFileC) {
nsegments[i].hasT = True;
}
+ VG_(mutex_unlock)(&ns_mutex);
}
Modified: branches/MTV/perf/Makefile.am (+5 -1)
===================================================================
--- branches/MTV/perf/Makefile.am 2013-03-27 11:43:20 +00:00 (rev 13340)
+++ branches/MTV/perf/Makefile.am 2013-03-27 12:40:48 +00:00 (rev 13341)
@@ -6,6 +6,7 @@
EXTRA_DIST = \
bigcode1.vgperf \
bigcode2.vgperf \
+ big_parallel_sleepers.vgperf \
bz2.vgperf \
fbench.vgperf \
ffbench.vgperf \
@@ -18,7 +19,8 @@
test_input_for_tinycc.c
check_PROGRAMS = \
- bigcode bz2 fbench ffbench heap many-loss-records many-xpts sarp tinycc
+ bigcode big_parallel_sleepers bz2 fbench ffbench heap \
+ many-loss-records many-xpts sarp tinycc
AM_CFLAGS += -O $(AM_FLAG_M3264_PRI)
AM_CXXFLAGS += -O $(AM_FLAG_M3264_PRI)
@@ -30,6 +32,8 @@
fbench_CFLAGS = $(AM_CFLAGS) -O2
ffbench_LDADD = -lm
+big_parallel_sleepers_CFLAGS = $(AM_CFLAGS) -lpthread
+
tinycc_CFLAGS = $(AM_CFLAGS) -Wno-shadow -Wno-inline
if HAS_POINTER_SIGN_WARNING
tinycc_CFLAGS += -Wno-pointer-sign
Added: branches/MTV/include/priv_atomic_x86.h (+399 -0)
===================================================================
--- branches/MTV/include/priv_atomic_x86.h 2013-03-27 11:43:20 +00:00 (rev 13340)
+++ branches/MTV/include/priv_atomic_x86.h 2013-03-27 12:40:48 +00:00 (rev 13341)
@@ -0,0 +1,399 @@
+
+/*--------------------------------------------------------------------*/
+/*--- Atomic primitives for x86. priv_atomic_x86.h ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+ Derived from glibc 2.13
+ Copyright (C) 2002-2004, 2006, 2007, 2009 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <dr...@re...>, 2002.
+
+ Copyright (C) 2012-2012 Philippe Waroquiers
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+#ifndef __PRIV_ATOMIC_X86_H
+#define __PRIV_ATOMIC_X86_H
+
+#define LOCK_PREFIX "lock;"
+
+
+# define __arch_compare_and_exchange_val_8_acq(mem, newval, oldval) \
+ ({ __typeof (*mem) ret; \
+ __asm __volatile (LOCK_PREFIX "cmpxchgb %b2, %1" \
+ : "=a" (ret), "=m" (*mem) \
+ : "q" (newval), "m" (*mem), "0" (oldval)); \
+ ret; })
+
+# define __arch_compare_and_exchange_val_16_acq(mem, newval, oldval) \
+ ({ __typeof (*mem) ret; \
+ __asm __volatile (LOCK_PREFIX "cmpxchgw %w2, %1" \
+ : "=a" (ret), "=m" (*mem) \
+ : "r" (newval), "m" (*mem), "0" (oldval)); \
+ ret; })
+
+# define __arch_compare_and_exchange_val_32_acq(mem, newval, oldval) \
+ ({ __typeof (*mem) ret; \
+ __asm __volatile (LOCK_PREFIX "cmpxchgl %2, %1" \
+ : "=a" (ret), "=m" (*mem) \
+ : "r" (newval), "m" (*mem), "0" (oldval)); \
+ ret; })
+
+# define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval) \
+ ({ __typeof (*mem) ret; \
+ __asm __volatile (LOCK_PREFIX "cmpxchg8b %1" \
+ : "=A" (ret), "=m" (*mem) \
+ : "b" (((unsigned long long int) (newval)) \
+ & 0xffffffff), \
+ "c" (((unsigned long long int) (newval)) >> 32), \
+ "m" (*mem), "a" (((unsigned long long int) (oldval)) \
+ & 0xffffffff), \
+ "d" (((unsigned long long int) (oldval)) >> 32)); \
+ ret; })
+
+
+/* Note that we need no lock prefix. */
+#define atomic_exchange_acq(mem, newvalue) \
+ ({ __typeof (*mem) result; \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile ("xchgb %b0, %1" \
+ : "=q" (result), "=m" (*mem) \
+ : "0" (newvalue), "m" (*mem)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile ("xchgw %w0, %1" \
+ : "=r" (result), "=m" (*mem) \
+ : "0" (newvalue), "m" (*mem)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile ("xchgl %0, %1" \
+ : "=r" (result), "=m" (*mem) \
+ : "0" (newvalue), "m" (*mem)); \
+ else \
+ { \
+ result = 0; \
+ vg_assert (False); \
+ } \
+ result; })
+
+
+#define __arch_exchange_and_add_body(lock, pfx, mem, value) \
+ ({ __typeof (*mem) __result; \
+ __typeof (value) __addval = (value); \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (lock "xaddb %b0, %1" \
+ : "=q" (__result), "=m" (*mem) \
+ : "0" (__addval), "m" (*mem)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (lock "xaddw %w0, %1" \
+ : "=r" (__result), "=m" (*mem) \
: "0" (__addval), "m" (*mem)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (lock "xaddl %0, %1" \
+ : "=r" (__result), "=m" (*mem) \
+ : "0" (__addval), "m" (*mem)); \
+ else \
+ { \
+ __typeof (mem) __memp = (mem); \
+ __typeof (*mem) __tmpval; \
+ __result = *__memp; \
+ do \
+ __tmpval = __result; \
+ while ((__result = pfx##_compare_and_exchange_val_64_acq \
+ (__memp, __result + __addval, __result)) == __tmpval); \
+ } \
+ __result; })
+
+# define atomic_exchange_and_add(mem, value) \
+ __arch_exchange_and_add_body (LOCK_PREFIX, __arch, mem, value)
+
+#define __arch_add_body(lock, pfx, mem, value) \
+ do { \
+ if (__builtin_constant_p (value) && (value) == 1) \
+ atomic_increment (mem); \
+ else if (__builtin_constant_p (value) && (value) == -1) \
+ atomic_decrement (mem); \
+ else if (sizeof (*mem) == 1) \
+ __asm __volatile (lock "addb %b1, %0" \
+ : "=m" (*mem) \
+ : "iq" (value), "m" (*mem)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (lock "addw %w1, %0" \
+ : "=m" (*mem) \
+ : "ir" (value), "m" (*mem)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (lock "addl %1, %0" \
+ : "=m" (*mem) \
+ : "ir" (value), "m" (*mem)); \
+ else \
+ __sync_add_and_fetch(mem, value); \
+ } while (0)
+#if 0
+//mtV?
+ The below gives compilation problems:
+error: impossible register constraint in ‘asm’
+Waiting for that, we use __sync_add_and_fetch
+To be fixed ...
+ else \
+ { \
+ __typeof (value) __addval = (value); \
+ __typeof (mem) __memp = (mem); \
+ __typeof (*mem) __oldval = *__memp; \
+ __typeof (*mem) __tmpval; \
+ do \
+ __tmpval = __oldval; \
+ while ((__oldval = pfx##_compare_and_exchange_val_64_acq \
+ (__memp, __oldval + __addval, __oldval)) == __tmpval); \
+ }} while (0)
+#endif
+
+
+#define atomic_add(mem, value) \
+ __arch_add_body (LOCK_PREFIX, __arch, mem, value)
+
+
+#define atomic_add_negative(mem, value) \
+ ({ unsigned char __result; \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (LOCK_PREFIX "addb %b2, %0; sets %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "iq" (value), "m" (*mem)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (LOCK_PREFIX "addw %w2, %0; sets %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "ir" (value), "m" (*mem)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (LOCK_PREFIX "addl %2, %0; sets %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "ir" (value), "m" (*mem)); \
+ else \
+ vg_assert (False); \
+ __result; })
+
+
+#define atomic_add_zero(mem, value) \
+ ({ unsigned char __result; \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (LOCK_PREFIX "addb %b2, %0; setz %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "iq" (value), "m" (*mem)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (LOCK_PREFIX "addw %w2, %0; setz %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "ir" (value), "m" (*mem)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (LOCK_PREFIX "addl %2, %0; setz %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "ir" (value), "m" (*mem)); \
+ else \
+ vg_assert (False); \
+ __result; })
+
+
+#define __arch_increment_body(lock, pfx, mem) \
+ do { \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (lock "incb %b0" \
+ : "=m" (*mem) \
+ : "m" (*mem)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (lock "incw %w0" \
+ : "=m" (*mem) \
+ : "m" (*mem)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (lock "incl %0" \
+ : "=m" (*mem) \
+ : "m" (*mem)); \
+ else \
+ __sync_add_and_fetch(mem, 1); \
+ } while (0)
+
+#if 0
+ //mtV?
+ //????? the below gives asm error on x86 at least ?????
+ /// replacing by __sync_add_and_fetch
+ else \
+ { \
+ __typeof (mem) __memp = (mem); \
+ __typeof (*mem) __oldval = *__memp; \
+ __typeof (*mem) __tmpval; \
+ do \
+ __tmpval = __oldval; \
+ while ((__oldval = pfx##_compare_and_exchange_val_64_acq \
+ (__memp, __oldval + 1, __oldval)) == __tmpval); \
+ }
+#endif
+
+#define atomic_increment(mem) __arch_increment_body (LOCK_PREFIX, __arch, mem)
+
+
+#define atomic_increment_and_test(mem) \
+ ({ unsigned char __result; \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (LOCK_PREFIX "incb %0; sete %b1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "m" (*mem)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (LOCK_PREFIX "incw %0; sete %w1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "m" (*mem)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (LOCK_PREFIX "incl %0; sete %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "m" (*mem)); \
+ else \
+ vg_assert (False); \
+ __result; })
+
+
+#define __arch_decrement_body(lock, pfx, mem) \
+ do { \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (lock "decb %b0" \
+ : "=m" (*mem) \
+ : "m" (*mem)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (lock "decw %w0" \
+ : "=m" (*mem) \
+ : "m" (*mem)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (lock "decl %0" \
+ : "=m" (*mem) \
+ : "m" (*mem)); \
+ else \
+ { \
+ __typeof (mem) __memp = (mem); \
+ __typeof (*mem) __oldval = *__memp; \
+ __typeof (*mem) __tmpval; \
+ do \
+ __tmpval = __oldval; \
+ while ((__oldval = pfx##_compare_and_exchange_val_64_acq \
+ (__memp, __oldval - 1, __oldval)) == __tmpval); \
+ } \
+ } while (0)
+
+#define atomic_decrement(mem) __arch_decrement_body (LOCK_PREFIX, __arch, mem)
+
+#define atomic_decrement_and_test(mem) \
+ ({ unsigned char __result; \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (LOCK_PREFIX "decb %b0; sete %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "m" (*mem)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (LOCK_PREFIX "decw %w0; sete %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "m" (*mem)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (LOCK_PREFIX "decl %0; sete %1" \
+ : "=m" (*mem), "=qm" (__result) \
+ : "m" (*mem)); \
+ else \
+ vg_assert (False); \
+ __result; })
+
+
+#define atomic_bit_set(mem, bit) \
+ do { \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (LOCK_PREFIX "orb %b2, %0" \
+ : "=m" (*mem) \
+ : "m" (*mem), "iq" (1 << (bit))); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (LOCK_PREFIX "orw %w2, %0" \
+ : "=m" (*mem) \
+ : "m" (*mem), "ir" (1 << (bit))); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (LOCK_PREFIX "orl %2, %0" \
+ : "=m" (*mem) \
+ : "m" (*mem), "ir" (1 << (bit))); \
+ else \
+ vg_assert (False); \
+ } while (0)
+
+
+#define atomic_bit_test_set(mem, bit) \
+ ({ unsigned char __result; \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (LOCK_PREFIX "btsb %3, %1; setc %0" \
+ : "=q" (__result), "=m" (*mem) \
+ : "m" (*mem), "ir" (bit)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (LOCK_PREFIX "btsw %3, %1; setc %0" \
+ : "=q" (__result), "=m" (*mem) \
+ : "m" (*mem), "ir" (bit)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (LOCK_PREFIX "btsl %3, %1; setc %0" \
+ : "=q" (__result), "=m" (*mem) \
+ : "m" (*mem), "ir" (bit)); \
+ else \
+ vg_assert (False); \
+ __result; })
+
+
+#define atomic_delay() asm ("rep; nop")
+
+
+#define __arch_and_body(lock, mem, mask) \
+ do { \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (lock "andb %b1, %0" \
+ : "=m" (*mem) \
+ : "iq" (mask), "m" (*mem)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (lock "andw %w1, %0" \
+ : "=m" (*mem) \
+ : "ir" (mask), "m" (*mem)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (lock "andl %1, %0" \
+ : "=m" (*mem) \
+ : "ir" (mask), "m" (*mem)); \
+ else \
+ vg_assert (False); \
+ } while (0)
+
+
+#define atomic_and(mem, mask) __arch_and_body (LOCK_PREFIX, mem, mask)
+
+#define __arch_or_body(lock, mem, mask) \
+ do { \
+ if (sizeof (*mem) == 1) \
+ __asm __volatile (lock "orb %b1, %0" \
+ : "=m" (*mem) \
+ : "iq" (mask), "m" (*mem)); \
+ else if (sizeof (*mem) == 2) \
+ __asm __volatile (lock "orw %w1, %0" \
+ : "=m" (*mem) \
+ : "ir" (mask), "m" (*mem)); \
+ else if (sizeof (*mem) == 4) \
+ __asm __volatile (lock "orl %1, %0" \
+ : "=m" (*mem) \
+ : "ir" (mask), "m" (*mem)); \
+ else \
+ vg_assert (False); \
+ } while (0)
+
+#define atomic_or(mem, mask) __arch_or_body (LOCK_PREFIX, mem, mask)
+
+#endif /* __PRIV_ATOMIC_X86_H */
+
+/*--------------------------------------------------------------------*/
+/*--- end priv_atomic_x86.h ---*/
+/*--------------------------------------------------------------------*/
Modified: branches/MTV/gdbserver_tests/sleepers.c (+1 -1)
===================================================================
--- branches/MTV/gdbserver_tests/sleepers.c 2013-03-27 11:43:20 +00:00 (rev 13340)
+++ branches/MTV/gdbserver_tests/sleepers.c 2013-03-27 12:40:48 +00:00 (rev 13341)
@@ -132,7 +132,7 @@
pthread_t ebbr, egll, zzzz;
struct spec b, l, p, m;
char *some_mem __attribute__((unused)) = malloc(100);
- setaffinity();
+ if (argc < 6) setaffinity(); // add -p to have parallel sleepers
if (argc > 1)
loops = atoi(argv[1]);
Modified: branches/MTV/coregrind/m_transtab.c (+54 -9)
===================================================================
--- branches/MTV/coregrind/m_transtab.c 2013-03-27 11:43:20 +00:00 (rev 13340)
+++ branches/MTV/coregrind/m_transtab.c 2013-03-27 12:40:48 +00:00 (rev 13341)
@@ -44,8 +44,8 @@
#include "pub_core_mallocfree.h" // VG_(out_of_memory_NORETURN)
#include "pub_core_xarray.h"
#include "pub_core_dispatch.h" // For VG_(disp_cp*) addresses
+#include "pub_tool_lock.h"
-
#define DEBUG_TRANSTAB 0
@@ -348,6 +348,20 @@
N_TC_SECTORS. The initial -1 value indicates the TT/TC system is
not yet initialised.
*/
+
+/* translation table search/add/... protected by the below mutex.
+ Why is this mutex needed ?
+ Why can't we just use the Big Lock e.g. in Read mode ?
+ Having the Big Lock in read mode does not provide
+ a safe access to the translation table. A.o., a search in
+ the translation table will modify various "small caches"
arrays (such as the order in which sectors are searched).
+ It would be possible to use the Big Lock in write mode.
This was even tried. This gave catastrophic performance:
+ more total CPU and longer total elapsed time than the
+ serialised Valgrind. */
+static Mutex tt_mutex;
+
static Sector sectors[N_SECTORS];
static Int youngest_sector = -1;
@@ -673,6 +687,7 @@
{
Int i;
+ VG_(mutex_lock) (&tt_mutex);
/* Search order logic copied from VG_(search_transtab). */
for (i = 0; i < N_SECTORS; i++) {
Int sno = sector_search_order[i];
@@ -700,9 +715,10 @@
/* if this hx entry corresponds to dead host code, we must
tell this code has not been found, as it cannot be patched. */
- if (HostExtent__is_dead (hx, sec))
+ if (HostExtent__is_dead (hx, sec)) {
+ VG_(mutex_unlock) (&tt_mutex);
return False;
-
+ }
vg_assert(sec->tt[tteNo].status == InUse);
/* Can only half check that the found TTEntry contains hcode,
due to not having a length value for the hcode in the
@@ -711,9 +727,11 @@
/* Looks plausible */
*from_sNo = sno;
*from_tteNo = (UInt)tteNo;
+ VG_(mutex_unlock) (&tt_mutex);
return True;
}
}
+ VG_(mutex_unlock) (&tt_mutex);
return False;
}
@@ -783,7 +801,27 @@
}
TTEntry* from_tte = index_tte(from_sNo, from_tteNo);
+ HWord from_offs = (HWord)( (UChar*)from__patch_addr
+ - (UChar*)from_tte->tcptr );
+ vg_assert(from_offs < 100000/* let's say */);
+ /* The chaining might have been done already by another thread.
+ In such a case, don't redo the chaining. */
+ /* Visit all OutEdges owned by from_tte. */
+ UWord n = OutEdgeArr__size(&from_tte->out_edges);
+ UWord i;
+ for (i = 0; i < n; i++) {
+ OutEdge* oe = OutEdgeArr__index(&from_tte->out_edges, i);
+ if (oe->to_sNo == to_sNo && oe->to_tteNo == to_tteNo
+ && oe->from_offs == from_offs) {
+ VG_(debugLog)(1,"transtab",
+ "host code %p already chained"
+ " => no chaining redone\n",
+ from__patch_addr);
+ return;
+ }
+ }
+
/* Get VEX to do the patching itself. We have to hand it off
since it is host-dependent. */
VexInvalRange vir
@@ -808,9 +846,6 @@
ie.from_sNo = from_sNo;
ie.from_tteNo = from_tteNo;
ie.to_fastEP = to_fastEP;
- HWord from_offs = (HWord)( (UChar*)from__patch_addr
- - (UChar*)from_tte->tcptr );
- vg_assert(from_offs < 100000/* let's say */);
ie.from_offs = (UInt)from_offs;
/* This is the new to_ -> from_ backlink to add. */
@@ -1528,6 +1563,7 @@
VG_(printf)("add_to_transtab(entry = 0x%llx, len = %d) ...\n",
entry, code_len);
+ VG_(mutex_lock) (&tt_mutex);
n_in_count++;
n_in_tsize += code_len;
n_in_osize += vge_osize(vge);
@@ -1660,9 +1696,9 @@
/* Note the eclass numbers for this translation. */
upd_eclasses_after_add( §ors[y], i );
+ VG_(mutex_unlock) (&tt_mutex);
}
-
/* Search for the translation of the given guest address. If
requested, a successful search can also cause the fast-caches to be
updated.
@@ -1678,18 +1714,22 @@
vg_assert(init_done);
/* Find the initial probe point just once. It will be the same in
all sectors and avoids multiple expensive % operations. */
- n_full_lookups++;
k = -1;
kstart = HASH_TT(guest_addr);
vg_assert(kstart >= 0 && kstart < N_TTES_PER_SECTOR);
+ VG_(mutex_lock) (&tt_mutex);
+ n_full_lookups++;
+
/* Search in all the sectors,using sector_search_order[] as a
heuristic guide as to what order to visit the sectors. */
for (i = 0; i < N_SECTORS; i++) {
sno = sector_search_order[i];
- if (UNLIKELY(sno == -1))
+ if (UNLIKELY(sno == -1)) {
+ VG_(mutex_unlock) (&tt_mutex);
return False; /* run out of sectors to search */
+ }
k = kstart;
for (j = 0; j < N_TTES_PER_SECTOR; j++) {
@@ -1714,6 +1754,7 @@
sector_search_order[i-1] = sector_search_order[i];
sector_search_order[i] = tmp;
}
+ VG_(mutex_unlock) (&tt_mutex);
return True;
}
if (sectors[sno].tt[k].status == Empty)
@@ -1729,6 +1770,7 @@
}
/* Not found in any sector. */
+ VG_(mutex_unlock) (&tt_mutex);
return False;
}
@@ -2175,6 +2217,8 @@
{
Int i, j, avg_codeszQ;
+ VG_(mutex_init) (&tt_mutex);
+ VG_(mutex_lock)(&tt_mutex);
vg_assert(!init_done);
init_done = True;
@@ -2249,6 +2293,7 @@
N_SECTORS * N_TTES_PER_SECTOR,
N_SECTORS * N_TTES_PER_SECTOR_USABLE,
SECTOR_TT_LIMIT_PERCENT );
+ VG_(mutex_unlock)(&tt_mutex);
}
Modified: branches/MTV/coregrind/m_syswrap/syswrap-linux.c (+8 -1)
===================================================================
--- branches/MTV/coregrind/m_syswrap/syswrap-linux.c 2013-03-27 11:43:20 +00:00 (rev 13340)
+++ branches/MTV/coregrind/m_syswrap/syswrap-linux.c 2013-03-27 12:40:48 +00:00 (rev 13341)
@@ -80,7 +80,7 @@
vg_assert(tst->status == VgTs_Init);
/* make sure we get the CPU lock before doing anything significant */
- VG_(acquire_BigLock)(tid, "thread_wrapper(starting new thread)");
+ VG_(acquire_BigLock)(tid, VgTs_WriteLock, "thread_wrapper(starting new thread)");
if (0)
VG_(printf)("thread tid %d started: stack = %p\n",
@@ -424,6 +424,13 @@
VG_(do_atfork_pre)(tid);
+#ifdef SINGLEV
+ vg_assert(VG_(owns_BigLock_LL)(ptid));
+#else
+ vg_assert(VG_(threads)[tid].slk == VgTs_ReadLock);
+ //mtV? or should we rather have the write lock ?
+#endif
+
/* Since this is the fork() form of clone, we don't need all that
VG_(clone) stuff */
#if defined(VGP_x86_linux) \
Modified: branches/MTV/coregrind/m_libcassert.c (+6 -2)
===================================================================
--- branches/MTV/coregrind/m_libcassert.c 2013-03-27 11:43:20 +00:00 (rev 13340)
+++ branches/MTV/coregrind/m_libcassert.c 2013-03-27 12:40:48 +00:00 (rev 13341)
@@ -230,11 +230,15 @@
{
Int i;
VG_(printf)("\nsched status:\n");
+#ifdef SINGLEV
VG_(printf)(" running_tid=%d\n", VG_(get_running_tid)());
+#endif
for (i = 1; i < VG_N_THREADS; i++) {
if (VG_(threads)[i].status == VgTs_Empty) continue;
- VG_(printf)( "\nThread %d: status = %s\n", i,
- VG_(name_of_ThreadStatus)(VG_(threads)[i].status) );
+ VG_(printf)( "\nThread %d: status = %s slk = %s in_gen_code %d\n", i,
+ VG_(name_of_ThreadStatus)(VG_(threads)[i].status),
+ VG_(name_of_SchedLockKind)(VG_(threads)[i].slk),
+ VG_(threads)[i].in_generated_code);
VG_(get_and_pp_StackTrace)( i, BACKTRACE_DEPTH );
}
VG_(printf)("\n");
Modified: branches/MTV/coregrind/m_dispatch/dispatch-x86-linux.S (+2 -2)
===================================================================
--- branches/MTV/coregrind/m_dispatch/dispatch-x86-linux.S 2013-03-27 11:43:20 +00:00 (rev 13340)
+++ branches/MTV/coregrind/m_dispatch/dispatch-x86-linux.S 2013-03-27 12:40:48 +00:00 (rev 13341)
@@ -200,7 +200,7 @@
movl OFFSET_x86_EIP(%ebp), %eax
/* stats only */
- addl $1, VG_(stats__n_xindirs_32)
+ lock addl $1, VG_(stats__n_xindirs_32)
/* try a fast lookup in the translation cache */
movl %eax, %ebx /* next guest addr */
@@ -216,7 +216,7 @@
fast_lookup_failed:
/* stats only */
- addl $1, VG_(stats__n_xindir_misses_32)
+ lock addl $1, VG_(stats__n_xindir_misses_32)
movl $VG_TRC_INNER_FASTMISS, %eax
movl $0, %edx
Added: branches/MTV/docs/internals/mtV.txt (+982 -0)
===================================================================
--- branches/MTV/docs/internals/mtV.txt 2013-03-27 11:43:20 +00:00 (rev 13340)
+++ branches/MTV/docs/internals/mtV.txt 2013-03-27 12:40:48 +00:00 (rev 13341)
@@ -0,0 +1,982 @@
+Valgrind supports multi-threaded applications, but schedules
+only one thread at a time. In other words, a multi-threaded application
+running under Valgrind will not benefit from multiple CPUs.
+
+A prototype has been developed of a "really" multi-threaded Valgrind
+(mtV). The prototype is made of ugly and/or inefficient
+kludges/trials/...
+Now, you have been warned that you will encounter horrors.
+The only objective of this prototype is to understand the problems of
+doing an mtV, see what are the possible approaches, possible
+performance gains.
+The current state of the patches and/or tests results are maintained
+in bugzilla in bug
+https://bugs.kde.org/show_bug.cgi?id=301830
+
+
+The document starts with a description of the current behaviour and
+structure of Valgrind.
+It then contains a description of the prototype developed,
+the techniques used, the problems still opened.
+The document below has some sections.
+Section title starts with a #.
+The first section is some general information, which might
+be skipped by most V developers.
+
+
+# The 3 Valgrind "layers":
+==========================
+ ------------------------------------------------------------------------
+ | | |
+ | TOOL "runtime code" | "instrument function" |
+ ------------------------------------------------------------------------
+ | |
+ | GUEST JIT code |
+ | (generated/instrumented code) |
+ --------------------------------------------------------------------
+ | |
+ | CORE layer |
+ --------------------------------------------------------------------
+
+
+ CORE layer : this contains the V framework & modules
+ used to support the TOOL code.
+ TOOL layer : made of two big parts.
+ 1. The instrument function is called by the CORE layer.
+ It instruments the code of the executable running
+ under V small blocks at a time, producing
+ "translated and instrumented" small pieces of JIT code.
+ These pieces of JIT codes are called by the scheduler
+ contained in the CORE layer.
+ 2. the TOOL "runtime code" (C helper functions and data
+ structures needed by the GUEST JIT code).
+ For example, for memcheck, it contains the data
+ structures to track the allocated and freed memory.
+ GUEST layer : contains GUEST JIT : this is the process guest code
+ translated and instrumented by the TOOL instrument function.
+
+
+ The typical control flow is:
+
+ 1. CORE layer calls the "instrument function", producing
+ instrumented GUEST JIT code
+
+ 2. CORE layer calls the GUEST JIT generated code.
+
+ 3. the GUEST JIT code runs. It will typically do many calls to
+ the tool runtime code.
+
+ Note: the none tool is special : the "instrument function"
+ does not do any code transformation. The none tool has no
+ runtime code.
+
+ The tool runtime code can itself do calls to the CORE layer.
+
+ Example:
+ the guest process code is: char *s;
+ s = malloc (10);
+ strcpy(s, "abc");
+ (*fnptr)();
+ The following sequence will happen:
+ 1. CORE will decode the guest process code.
+ 2. CORE will call the TOOL function to instrument this decoded code
+ For memcheck, the instrumented code will be made of:
+ a call to the memcheck malloc replacement function
+ (this replacement function is part of memcheck runtime code,
+ it a.o. tracks the allocated memory to allow leak search).
+ an assignment of the result of the malloc replacement to s
+ a call to a tool C helper to indicate that s (the ptr) is now
+ initialized
+ a call to the function strcpy
+ a (dynamic) call to the function pointed to by fnptr.
+ 3. CORE will translate the instrumented code to executable code (JIT)
+ and will store this code in a data structure (the translation table).
+ Basically, the translation table is a mapping between a guest code
+ address and the corresponding translated JIT code.
+ 4. CORE scheduler will call the JIT code (the just translated piece of
+ code)
+ 5. the JIT code does the following:
+ call the tool malloc replacement function, which does:
+ call the CORE code to allocate a block of memory
+ (10 bytes + some admin overhead).
+ call the CORE code to compute (and store) a stack trace of
+ the guest code call stack.
+ call the CORE code to store in an hash table
+ the address of the allocated block
+ + the stack trace reference
+ maintain in a memcheck data structure that the 10 bytes are
+ accessible but not initialized.
+ the JIT code should normally call strcpy. Because strcpy
+ is not translated yet, the call code is exiting to the scheduler.
+ So, the JIT code executes a return, giving back the control to
+ the CORE scheduler
+ 6. the scheduler will translate the strcpy function (generating a new
+ piece of instrumented JIT code) and add it in the translation table.
+ Possibly, the table is full. Then the old translations have to be
+ removed from the table.
+ 7. The scheduler then calls the new JIT code
+ The JIT code will execute the strcpy instrumented JIT code
+ (this code will typically do multiple calls to the tool runtime
+ code to indicate that the 4 first bytes pointed to by s are now
+ initialized).
+ 8. The JIT code will search in the translation table for the
+ translated piece of code corresponding to *fnptr.
+ If existing, it is called.
+ Otherwise, control is given back to scheduler for instrumentation.
+
+
+For a typical tool and guest application being executed, most of the
+time is spent in GUEST JIT code, in the TOOL runtime code and in
+the CORE code. It is deemed that the time spent in TOOL instrument
+function is not a major part : usually, a translated piece of code
+is executed often.
+
+If we have an application containing multiple threads, there is no
+(to be more precise, almost no) parallel execution : at any moment
+in time, there is maximum one thread which "really" executes some code.
+This thread can be busy either in the CORE layer (possibly calling
+the TOOL instrument function), or can be busy in GUEST JIT code
+or in the TOOL runtime code.
+
+This "single thread busy" model is needed because neither the CORE
+layer, nor the TOOL layer (runtime or instrument) are re-entrant.
+These 3 layers are containing global variables/data structures/...
+which can only be used by one thread at a time.
+
+To ensure thread-safety of this non-reentrant code, V uses the very
+simple approach of "single thread busy". The "single thread busy" is
+ensured by the "Big Lock".
+
+A thread must acquire the Big Lock before it executes CORE or GUEST or
+TOOL code. When a thread acquires the Big Lock, it becomes THE running
+thread.
+
+The running thread releases the Big Lock after it has executed a certain
+quantity of GUEST code. Releasing the Big lock will allow another thread
+to run during some time.
+
+The other threads (ready to run) are all blocked, trying
+to acquire the Big Lock. One of these threads will acquire the Big Lock
+and start to execute some code. Typically, it will start to execute
+GUEST JIT code. But it might also start to execute CORE code because
+some guest code has to be instrumented.
+
+If the running thread executing GUEST JIT code has to execute a
+syscall (e.g. reading data on a socket), the running thread must
+release the Big Lock : if this thread would not release the Big Lock
+before doing a syscall, then the whole application will be blocked
+waiting for this thread to finish its syscall.
+
+In other words, the only real parallelism provided by Valgrind is
+between one thread doing some CPU in CORE/GUEST/TOOL
+and all other threads doing a system call called by GUEST JIT code.
+
+This is a very low level of parallelism. If an application does
+an intensive usage of threads, the typically slowdown factor
+of the V tool (e.g. 5 to 20 for memcheck) will be multiplied
+by the serial execution of the CORE/GUEST/TOOL code.
+
+The objective of the prototype is to determine how to best increase
+the parallelism of Valgrind.
+
+# the non-thread safe code
+==========================
+To increase the performance, we want to have multiple
+threads running in parallel (e.g. executing the code of the example).
+For this, the Big Lock must be removed and replaced by other
+techniques : in all the above steps, global data structures are used
+that will be corrupted by parallel access : almost all the steps are
+not thread safe if the Big Lock is removed.
+Typically, for the translation part, the following are not thread safe :
+ the VEX lib (used by the CORE and tool instrument function)
+ the tool instrument function itself
+ the CORE translation framework
+ ...
+
+At runtime (executing JIT code), e.g. the following are not thread safe
+ the CORE scheduler
+ the memcheck C helpers
+ memcheck malloc replacement
+ memcheck V and A bits C helpers
+ the CORE malloc/free
+ the CORE aspacemgr (used e.g. by the CORE malloc/free)
+ ...
+
+At many places in the code, we have statistical counters
+to help understand the behaviour/performance of V.
+(e.g. CORE malloc is counting how many bytes have been malloc-ed).
+These counters are all non-thread safe.
+
+The guest code might or might not be thread safe (e.g. might contain
+race condition). Such non thread safe guest code is not a problem
+for our objective, as such non thread safety will only corrupt the
+guest process data structure (the developer of the guest code might
+use helgrind to search for such non thread safe code).
+
+However, currently, even thread-safe guest code can result in non
+thread safe JIT code. For example, two threads executing
+at the same time a call to a not yet translated function (e.g. strcpy)
+will corrupt the JIT code (t-chaining).
+
+Making all the above thread safe can be done using various techniques:
+ using thread local storage instead of global variables
+ using mutex locks
+ using read/write locks
+ using atomic instructions
+ (e.g. for statistical counters)
+ using lock-less algorithms
+ (e.g. to maintain data structures without
+ using locks. Such lock-less algorithms are
+ based on atomic instructions)
+ Note: mutex can be implemented using atomic instructions and
+ OS syscalls (e.g. based on futex). If there is no contention,
+ the perf. of mutex based approach will be very probably similar
+ (or even maybe better?) than a lock-less algorithm.
+ Lock-less algorithms might however help either to increase
+ the throughput or to avoid difficulties such as deadlocks.
+
+The prototype has explored some of the above techniques.
+Some "testing" (ha ha) has been done. See # testing below.
+
+At this stage, only the thread safe aspect of CORE and JIT layer
+was looked at for the prototype. Nothing has been done to look
+at the tool runtime layer.
+
+# prototype
+===========
+
+The very first version of the prototype was based on the following
+idea (looking somewhat wrong a posteriori):
+ When a thread executes guest code, it has to use some data
+ structures (e.g. the translation table, the JIT code itself, ...).
+ These structures can't be protected by a mutex, otherwise
+ we are back to stage 1: no parallelism.
+ So, let's transform the Big Lock into a rwlock.
+ Before executing guest code, a thread must acquire
+ the Big Lock in read mode.
+ Before doing any modifications to any data structure, the
+ thread must acquire the Big Lock in write mode.
+
+So, the Big Lock was transformed into a rwlock by using
+low level "semaphores". These semaphores are the
+same as the current trunk scheduler "big lock", i.e. implemented
+using a pipe. !!! Fair scheduler has not been looked at.
+The rwlock implemented on top of these semaphores is not
+efficient, as it implies several read/write syscalls
+to acquire/release the lock -- really inefficient.
+
+With this, a bunch of changes were needed to avoid assertions
+being raised. A.o.:
+ * the concept of THE lock owner disappears
+ so this cannot be checked anymore
+ * the concept of THE running thread disappears similarly
+ * the concept of a global 'in generated code' disappears
+ * a bunch of failing asserts have been #ifdef-ed or commented.
+
+After fixing the above, the 4 threads were able to run in parallel.
+One observes the following on the "big_parallel_sleepers" test:
+ * during +- 40 seconds, about 125% CPU is used
+ * after that, jump to 400%.
+ Interpretation of this: making the translations and storing them
+ in the transtab is still very serialized by the Big Lock.
+ Once all the code is translated, full speed is possible.
+ See also the "duplicate t-chaining" below.
+
+
+helgrind (trunk) was then used in a setup "outer trunk helgrind/inner
+none tool prototype" to find race conditions.
+
+A bunch of such race conditions were found.
+Typically:
+ * race conditions in statistical counters
+ * ... need to retrieve and document more of these changes done.
+
+There are some tricky races as some actions looks like they are just
+reading some data, but in fact they do modifications to global
+variables. E.g. all the functions which are maintaining a small static
+local cache will write to the local cache. Typical example in the
+aspacemgr or in the transtab.
+The transtab (VG_(tt_fast)) is the worst case because:
+ 1. it is used a lot
+ 2. it is used by the asm dispatch code.
+=> using a rw lock for this might not be an ideal solution.
+See # tt_fast below for another suggested approach.
+Currently the VG_(tt_fast) race condition is not fixed
+in the prototype.
+The other data structures related to translation tables were
+first tried to be protected the following way:
+ Before having to search or modify the translation table,
+ the Big Lock has to be acquired in W mode.
+ So, if a thread had the Big Lock in R mode, it
+ was reacquiring it in W mode (by releasing and re-acquiring).
+ Really not efficient, the effect of this was really bad:
+ The max CPU usage dropped from 400% to 140%.
+ Worse, the total elapsed time to run the test was
+ bigger than with the V trunk.
+
+The conclusion of this was : a (reasonable) mtV cannot be based
+purely on a RW Big Lock.
+
+* So, started to make a pub_tool_lock.h and m_lock.c module.
+ (currently only contains a mutex, based on atomic instructions
+ and futex syscall). It should at least be completed with
+ a rwlock. Maybe also spin locks ?
+ Some constraints for this module:
+ * it should be initialisable very early in the startup sequence
+ (as e.g. aspacemgr will need it. Even maybe DebugLog might
+ need locks).
+
+ The m_lock.c is derived from (lgpl 2.1) NPTL glibc 2.13 code.
+ Atomic instructions for x86 and amd64 were also copied from
+ the NPTL pthread lib and slightly transformed.
+ Basically, removed the 'catomic*' kind of actions, which are
+ optimisations done referencing a global pthread NPTL var
+ to "skip" the LOCK prefix when there is only one thread.
+ !!! There is one asm statement in priv_atomic_x86.h
+ giving a compile/assembly error.
+ Replaced by __sync_add_and_fetch temporarily. This should
+ be fixed.
+ !!! the fair scheduler should be rewritten based on the
+ atomic instructions rather than the __sync.. and __builtin
+ so that it will be available everywhere.
+ !!! need to see if all 32 bits platforms are supporting
+ an atomic increment of a 64 bit counter. Might either not
+ be supported or cause problems like the x86 compile error above.
+ Then these will have to be emulated. This might all be
+ costly so we might have to avoid some of the 64 bits statistical
+ counters.
+
+* The above efficient futex based mutex was used to protect
+ the translation table. With this, many race conditions
+ disappeared. CPU usage back to 400%.
+
+* VG_(unknown_SP_update) is a perf. critical function.
+ This function contains a piece of non thread safe code
+ for detecting stack changes. This code has been
+ disabled waiting for a proper solution.
+ A possible approach is to use TLS : the current_stack
+ global variable would become a 'per thread' variable.
+ See # TLS below for the trial of thread local storage.
+
+* aspacemgr locking: race condition detected e.g. between
+ VG_(am_find_nsegment)
+ and a mmap syscall, calling VG_(am_notify_client_mmap)
+ => a mutex was added in aspacemgr, protecting at least
+ the race between these 2 functions.
+ However, it is far from clear that the aspacemgr is
+ "safe" with that. Effectively, VG_(am_find_nsegment)
+ returns a pointer to an element of the segment array.
+ So, if another thread is modifying the entry in the
+ array once the first thread has got an access to it,
+ then what ? It looks however that VG_(am_find_nsegment)
+ is used for "small durations". But still it is not
+ clear this is a "clean thread safe interface".
+ It is unclear exactly what and how should be protected
+ in the aspace mgr. Protecting all the public interface
+ of aspacemgr could be done (need to avoid recursive locking,
+ as pub_tool_lock.h does not detect that). However, it is unclear
+ if this is good enough (or not).
+
+* When doing the trial of protecting aspacemgr globally,
+ deadlock obtained due to recursive locking.
+ => one might need to make "safe" locks (currently,
+ m_lock.c does not verify non-recursive locking).
+ Also, if we do plenty of fine grained locking everywhere,
+ deadlocks might be difficult to avoid.
+ Lock-free algorithms might help to avoid these.
+ See # Lock free algorithms below.
+
+* Need to avoid "duplicate t-chaining".
+ Got an assert failure in VEX, as the place to t-chain did not
+ contain the expected bytes. This was created by two threads
+ detecting at the same time that a t-chaining is to be done.
+ Then, even if Write Locked one after each other, the 2nd
+ thread would assert as the expected bytes to replace
+ for t-chain are not there anymore.
+ So, in the t-chaining protected critical section, there
+ is a verification that the t-chaining has not been done
+ in the meantime.
+ This causes a really bizarre perf impact (search for really bizarre
+ in the # Performance measurements) :
+ running 150 iterations with 4 parallel threads is total less cpu
+ than in serial. But running only 2 iterations is significantly slower
+ in parallel than serial.
+ The most probable explanation: I suppose there is some useless work
+ done e.g. during translation which consumes CPU : a thread detects it
+ must do "t-chaining" : it exits to the scheduler, takes the write big
+ lock, detects that this t-chaining has already been done by another
+ thread and so has done all this work for nothing.
+ This hypothesis is confirmed when giving -d.
+ This causes plenty of traces:
+ --22919:1:transtab host code 0x40537438F already chained => no chaining redone
+ But by which miracle are the parallel threads "recuperating" this cpu burned
+ for nothing later on when doing more iterations ?
+
+ Whatever: a possible solution might be to have VEX provide an
+ efficient way to check that the current ip of the thread has
+ been already chained. Then a "already done t-chaining" would only
+ cost an exit to the C scheduler, and a few "VEX" if-s.
+ This condition looks easy to implement in VEX.
+ Is it safe ? I believe yes:
+ At least VEX can detect that the t-chaining has been done
+ already (or rather that the place is not t-chainable anymore).
+ I suppose just a "!" on this condition would do it :
+ if a place has to be chained, then it contains (for x86)
+ BA <4 bytes value == disp_cp_chain_me_EXPECTED>
+ FF D2
+ otherwise it contains something different.
+
+ ??? is it so clear that this will avoid burning cpu : when a thread
+ exits to C, it will try to acquire the big lock to do t-chaining,
+ but will not be able to until all threads go out of the JIT code
+ (either due to need of t-chaining or due to QUANTUM expired).
+ If all exits due to QUANTUM expired, our thread will get
+ the big lock, do the t-chaining and all is well.
+ But if the other threads have to do the same t-chaining,
+ they will all exit, see that the t-chaining is not done,
+ and then queue to acquire the write big lock.
+ One will get it, do the t-chaining, then all others will one
+ by one get the write lock, and see the t-chaining is done.
+ Threads have quite some chance to get to the same not
+ done t-chaining if they all execute the same code.
+
+ ??? or even isn't it that the translation table and
+ the rd/wr big lock protecting it is just not scalable :
+ whenever there are some threads which are executing
+ JITted code not yet translated and/or t-chained, there
+ is a high level of contention, causing a lot of
+ lock/unlock, consuming user and sys cpu ?
+
+
+# tt_fast "xor" approach
+========================
+(the tt_fast race condition has not been solved yet.
+Here is a suggested very elegant approach. But is
+this really working ?)
+[after a lot of discussion, it became clear that this does
+not work. There is a small probability of a wrong
+value being read. See at the end of the section the
+failing case and probability analysis]
+
+tt_fast is used (read only) by the asm dispatcher.
+tt_fast is accessed for every 'dynamic' call/jump/...
+(the 'static' call/jump/... are resolved using translation
+chaining).
+tt_fast is modified either when a new translation is
+done or when an already translated piece of code is
+found in the translation cache.
+In other words, one sees that even if tt_fast+translation
+table is logically only read, it is also modified by a search.
+
+http://www.cis.uab.edu/hyatt/hashing.html
+describes a technique which (I believe) would allow to use
+tt_fast without locking and without atomic instruction,
+with very few changes in the asm dispatcher.
+
+Basically, the idea of the paper applied on tt_fast would be:
+ tt_fast is an array of pair (G, H)
+ where G is a guest code address and H is the address
+ of the JIT translation of G.
+ (G, H) is stored at a position in tt_fast obtained by
+ "hashing" G (basically, shifting and masking some bits of G).
+If we have a pair (G1,H1) and (G2,H2) which have the same
+hash value, and these pairs are inserted (or modified)
+in parallel, a third thread reading this table might get
+one of the following 4 pairs:
+ (G1,H1) (good)
+ (G2,H2) (good)
+ (G1,H2) (bad)
+ (G2,H1) (bad)
+The idea is that the asm dispatcher would detect the
+bad cases, and then just fall back to the normal
+search (exiting the asm dispatcher to do a full search
+in the translation table).
+To differentiate the good from the bad (without an ugly lock :),
+the idea is to store in tt_fast the following:
+ (G xor H, H)
+Then when searching for G1, one does the following:
+ ...
[truncated message content] |