|
From: <sv...@va...> - 2005-12-26 17:59:03
|
Author: sewardj
Date: 2005-12-26 17:58:58 +0000 (Mon, 26 Dec 2005)
New Revision: 5441
Log:
More dispatcher tuning for ppc32/64. Makes a big difference for
perf/tinycc.
- run_thread_for_a_while: just clear this thread's reservation when
starting, not all of them.
- use a different fast-cache hashing function for ppc32/64 than for
x86/amd64. This allows the former to use all the fast-cache entries
rather than just 1/4 of them.
Modified:
trunk/coregrind/m_dispatch/dispatch-ppc32-linux.S
trunk/coregrind/m_scheduler/scheduler.c
trunk/coregrind/m_transtab.c
trunk/coregrind/pub_core_transtab_asm.h
Modified: trunk/coregrind/m_dispatch/dispatch-ppc32-linux.S
===================================================================
--- trunk/coregrind/m_dispatch/dispatch-ppc32-linux.S 2005-12-26 17:50:22 UTC (rev 5440)
+++ trunk/coregrind/m_dispatch/dispatch-ppc32-linux.S 2005-12-26 17:58:58 UTC (rev 5441)
@@ -254,14 +254,14 @@
stw 3,OFFSET_ppc32_CIA(31)
=20
/* Are we out of timeslice? If yes, defer to scheduler. */
-// subic. 29,29,1
subi 29,29,1
cmplwi 29,0
beq counter_is_zero
=20
/* try a fast lookup in the translation cache */
- /* r4=((r3<<2) & (VG_TT_FAST_MASK<<2)) */
- rlwinm 4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2
+ /* r4 = VG_TT_FAST_HASH(addr) * sizeof(ULong)
+ = ((r3 >>u 2) & VG_TT_FAST_MASK) << 2 */
+ rlwinm 4,3, 0, 32-2-VG_TT_FAST_BITS, 31-2
addis 5,4,VG_(tt_fast)@ha
lwz 5,VG_(tt_fast)@l(5)
lwz 6,4(5) /* big-endian, so comparing 2nd 32bit word */
@@ -310,12 +310,14 @@
stw 3,OFFSET_ppc32_CIA(31)
=20
/* Are we out of timeslice? If yes, defer to scheduler. */
- addic. 29,29,-1
+ subi 29,29,1
+ cmplwi 29,0
beq counter_is_zero
=20
/* try a fast lookup in the translation cache */
- /* r4=((r3<<2) & (VG_TT_FAST_MASK<<2)) */
- rlwinm 4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2
+ /* r4 = VG_TT_FAST_HASH(addr) * sizeof(ULong)
+ = ((r3 >>u 2) & VG_TT_FAST_MASK) << 2 */
+ rlwinm 4,3, 0, 32-2-VG_TT_FAST_BITS, 31-2
addis 5,4,VG_(tt_fast)@ha
lwz 5,VG_(tt_fast)@l(5)
lwz 6,4(5) /* big-endian, so comparing 2nd 32bit word */
Modified: trunk/coregrind/m_scheduler/scheduler.c
===================================================================
--- trunk/coregrind/m_scheduler/scheduler.c 2005-12-26 17:50:22 UTC (rev 5440)
+++ trunk/coregrind/m_scheduler/scheduler.c 2005-12-26 17:58:58 UTC (rev 5441)
@@ -331,8 +331,8 @@
VG_(sigprocmask)(VKI_SIG_SETMASK, &mask, NULL);
}
=20
-/* Use libc setjmp/longjmp. longjmp must not restore signal mask
- state, but does need to pass "val" through. */
+/* Use gcc's built-in setjmp/longjmp. longjmp must not restore signal
+ mask state, but does need to pass "val" through. */
#define SCHEDSETJMP(tid, jumped, stmt) \
do { \
ThreadState * volatile _qq_tst = VG_(get_ThreadState)(tid); \
@@ -343,7 +343,8 @@
_qq_tst->sched_jmpbuf_valid = True; \
stmt; \
} else if (VG_(clo_trace_sched)) \
- VG_(printf)("SCHEDSETJMP(line %d) tid %d, jumped=%d\n", __LINE__, tid, jumped); \
+ VG_(printf)("SCHEDSETJMP(line %d) tid %d, jumped=%d\n", \
+ __LINE__, tid, jumped); \
vg_assert(_qq_tst->sched_jmpbuf_valid); \
_qq_tst->sched_jmpbuf_valid = False; \
} while(0)
@@ -370,7 +371,6 @@
=20
/* Paranoia */
vg_assert(VG_(is_valid_tid)(tid));
- vg_assert(VG_(is_valid_tid)(tid));
vg_assert(VG_(is_running_thread)(tid));
vg_assert(!VG_(is_exiting)(tid));
=20
@@ -408,11 +408,9 @@
=20
This should be abstractified and lifted out.
*/
- { Int i;
- /* Clear any existing reservation. Be paranoid and clear them all. */
- for (i = 0; i < VG_N_THREADS; i++)
- VG_(threads)[i].arch.vex.guest_RESVN = 0;
- }
+ /* Clear any existing reservation that this thread might have made
+ last time it was running. */
+ VG_(threads)[tid].arch.vex.guest_RESVN = 0;
=20
/* ppc guest_state vector regs must be 16byte aligned for loads/stores */
vg_assert(VG_IS_16_ALIGNED(VG_(threads)[tid].arch.vex.guest_VR0));
@@ -422,7 +420,8 @@
/* there should be no undealt-with signals */
//vg_assert(VG_(threads)[tid].siginfo.si_signo == 0);
=20
- //VG_(printf)("running EIP = %p ESP=%p\n", VG_(threads)[tid].arch.m_eip, VG_(threads)[tid].arch.m_esp);
+ //VG_(printf)("running EIP = %p ESP=%p\n",
+ //VG_(threads)[tid].arch.m_eip, VG_(threads)[tid].arch.m_esp);
=20
vg_assert(VG_(my_fault));
VG_(my_fault) = False;
Modified: trunk/coregrind/m_transtab.c
===================================================================
--- trunk/coregrind/m_transtab.c 2005-12-26 17:50:22 UTC (rev 5440)
+++ trunk/coregrind/m_transtab.c 2005-12-26 17:58:58 UTC (rev 5441)
@@ -606,7 +606,7 @@
=20
static void setFastCacheEntry ( Addr64 key, ULong* tce, UInt* count )
{
- UInt cno = ((UInt)key) & VG_TT_FAST_MASK;
+ UInt cno = (UInt)VG_TT_FAST_HASH(key);
VG_(tt_fast)[cno] =3D tce;
VG_(tt_fastN)[cno] =3D count;
n_fast_updates++;
Modified: trunk/coregrind/pub_core_transtab_asm.h
===================================================================
--- trunk/coregrind/pub_core_transtab_asm.h 2005-12-26 17:50:22 UTC (rev 5440)
+++ trunk/coregrind/pub_core_transtab_asm.h 2005-12-26 17:58:58 UTC (rev 5441)
@@ -31,11 +31,31 @@
#ifndef __PUB_CORE_TRANSTAB_ASM_H
#define __PUB_CORE_TRANSTAB_ASM_H
=20
-/* Constants for the fast translation lookup cache. */
+/* Constants for the fast translation lookup cache. It is a direct
+ mapped cache, with 2^VG_TT_FAST_BITS entries.
+
+ On x86/amd64, the cache index is computed as
+ 'address[VG_TT_FAST_BITS-1 : 0]'.
+
+ On ppc32/ppc64, the bottom two bits of instruction addresses are
+ zero, which means that function causes only 1/4 of the entries to
+ ever be used. So instead the function is '(address >>u
+ 2)[VG_TT_FAST_BITS-1 : 0]' on those targets. */
+
#define VG_TT_FAST_BITS 15
#define VG_TT_FAST_SIZE (1 << VG_TT_FAST_BITS)
#define VG_TT_FAST_MASK ((VG_TT_FAST_SIZE) - 1)
=20
+/* This macro isn't usable in asm land; nevertheless this seems
+ like a good place to put it. */
+#if defined(VGA_x86) || defined(VGA_amd64)
+# define VG_TT_FAST_HASH(_addr) ((((UWord)(_addr)) ) & VG_TT_FAST_MASK)
+#elif defined(VGA_ppc32) || defined(VGA_ppc64)
+# define VG_TT_FAST_HASH(_addr) ((((UWord)(_addr)) >> 2) & VG_TT_FAST_MASK)
+#else
+# error "VG_TT_FAST_HASH: unknown platform"
+#endif
+
#endif // __PUB_CORE_TRANSTAB_ASM_H
=20
/*--------------------------------------------------------------------*/
|