|
From: <sv...@va...> - 2005-12-15 21:40:40
|
Author: sewardj
Date: 2005-12-15 21:40:34 +0000 (Thu, 15 Dec 2005)
New Revision: 5352
Log:
Rewrite ppc32 dispatch loop to avoid profiling overhead, as per
today's x86 and amd64 rewrites.
Modified:
trunk/coregrind/m_dispatch/dispatch-ppc32-linux.S
trunk/docs/internals/performance.txt
Modified: trunk/coregrind/m_dispatch/dispatch-ppc32-linux.S
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- trunk/coregrind/m_dispatch/dispatch-ppc32-linux.S 2005-12-15 21:18:34=
UTC (rev 5351)
+++ trunk/coregrind/m_dispatch/dispatch-ppc32-linux.S 2005-12-15 21:40:34=
UTC (rev 5352)
@@ -1,8 +1,8 @@
=20
-##--------------------------------------------------------------------##
-##--- The core dispatch loop, for jumping to a code address. ---##
-##--- dispatch-ppc32.S ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- The core dispatch loop, for jumping to a code address. ---*/
+/*--- dispatch-ppc32.S ---*/
+/*--------------------------------------------------------------------*/
=20
/*
This file is part of Valgrind, a dynamic binary instrumentation
@@ -38,12 +38,20 @@
/*--- The dispatch loop. ---*/
/*------------------------------------------------------------*/
=20
-/* signature: UWord VG_(run_innerloop) ( void* guest_state ) */
+/*----------------------------------------------------*/
+/*--- Preamble (set everything up) ---*/
+/*----------------------------------------------------*/
=20
- .globl VG_(run_innerloop)
+/* signature:
+UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
+*/
+.text
+.globl VG_(run_innerloop)
VG_(run_innerloop):
+ /* r3 holds guest_state */
+ /* r4 holds do_profiling */
+
/* ----- entry point to VG_(run_innerloop) ----- */
-
/* For Linux/ppc32 we need the SysV ABI, which uses
LR->4(parent_sp), CR->anywhere.
(The AIX ABI, used on Darwin, and maybe Linux/ppc64?,
@@ -58,10 +66,10 @@
stwu 1,-496(1) /* sp should maintain 16-byte alignment */
=20
/* Save callee-saved registers... */
- /* r3 is live here (guest state ptr), so use r4 */
- lis 4,VG_(machine_ppc32_has_FP)@ha
- lwz 4,VG_(machine_ppc32_has_FP)@l(4)
- cmplwi 4,0
+ /* r3, r4 are live here, so use r5 */
+ lis 5,VG_(machine_ppc32_has_FP)@ha
+ lwz 5,VG_(machine_ppc32_has_FP)@l(5)
+ cmplwi 5,0
beq LafterFP1
=20
/* Floating-point reg save area : 144 bytes */
@@ -111,43 +119,43 @@
/* It's necessary to save/restore VRSAVE in the AIX / Darwin ABI=
.
The Linux kernel might not actually use VRSAVE for its intend=
ed
purpose, but it should be harmless to preserve anyway. */
- /* r3 is live here (guest state ptr), so use r4 */
- lis 4,VG_(machine_ppc32_has_VMX)@ha
- lwz 4,VG_(machine_ppc32_has_VMX)@l(4)
- cmplwi 4,0
+ /* r3, r4 are live here (guest state ptr), so use r5 */
+ lis 5,VG_(machine_ppc32_has_VMX)@ha
+ lwz 5,VG_(machine_ppc32_has_VMX)@l(5)
+ cmplwi 5,0
beq LafterVMX1
=20
/* VRSAVE save word : 32 bytes */
- mfspr 4,256 /* vrsave reg is spr number 256 */
- stw 4,244(1)
+ mfspr 5,256 /* vrsave reg is spr number 256 */
+ stw 5,244(1)
=20
/* Alignment padding : 4 bytes */
=20
/* Vector reg save area (quadword aligned) : 192 bytes */
- li 4,224
- stvx 31,4,1
- li 4,208
- stvx 30,4,1
- li 4,192
- stvx 29,4,1
- li 4,176
- stvx 28,4,1
- li 4,160
- stvx 27,4,1
- li 4,144
- stvx 26,4,1
- li 4,128
- stvx 25,4,1
- li 4,112
- stvx 24,4,1
- li 4,96
- stvx 23,4,1
- li 4,80
- stvx 22,4,1
- li 4,64
- stvx 21,4,1
- li 4,48
- stvx 20,4,1
+ li 5,224
+ stvx 31,5,1
+ li 5,208
+ stvx 30,5,1
+ li 5,192
+ stvx 29,5,1
+ li 5,176
+ stvx 28,5,1
+ li 5,160
+ stvx 27,5,1
+ li 5,144
+ stvx 26,5,1
+ li 5,128
+ stvx 25,5,1
+ li 5,112
+ stvx 24,5,1
+ li 5,96
+ stvx 23,5,1
+ li 5,80
+ stvx 22,5,1
+ li 5,64
+ stvx 21,5,1
+ li 5,48
+ stvx 20,5,1
LafterVMX1:
=20
/* Save cr */
@@ -159,8 +167,9 @@
/* 32(sp) used later to check FPSCR[RM] */
=20
/* r3 holds guest_state */
- mr 31,3
- stw 3,28(1) /* spill orig guest_state ptr */
+ /* r4 holds do_profiling */
+ mr 31,3 /* r31 (generated code gsp) =3D r3 */
+ stw 3,28(1) /* spill orig guest_state ptr */
=20
/* 24(sp) used later to stop ctr reg being clobbered */
/* 20(sp) used later to load fpscr with zero */
@@ -171,40 +180,37 @@
0(sp) : back-chain
*/
=20
-// CAB TODO: Use a caller-saved reg for orig guest_state ptr
-// - rem to set non-allocateable in isel.c
+ /* CAB TODO: Use a caller-saved reg for orig guest_state ptr
+ - rem to set non-allocateable in isel.c */
=20
/* hold dispatch_ctr in ctr reg */
- lis 17,VG_(dispatch_ctr)@ha
- lwz 17,VG_(dispatch_ctr)@l(17)
- mtctr 17
+ lis 5,VG_(dispatch_ctr)@ha
+ lwz 5,VG_(dispatch_ctr)@l(5)
+ mtctr 5
=20
- /* fetch %CIA into r30 */
- lwz 30,OFFSET_ppc32_CIA(31)
-
/* set host FPU control word to the default mode expected=20
by VEX-generated code. See comments in libvex.h for
more info. */
- lis 3,VG_(machine_ppc32_has_FP)@ha
- lwz 3,VG_(machine_ppc32_has_FP)@l(3)
- cmplwi 3,0
+ lis 5,VG_(machine_ppc32_has_FP)@ha
+ lwz 5,VG_(machine_ppc32_has_FP)@l(5)
+ cmplwi 5,0
beq LafterFP2
=20
- /* get zero into f3 (tedious) */
- /* note: fsub 3,3,3 is not a reliable way to do this,=20
- since if f3 holds a NaN or similar then we don't necessarily
- wind up with zero. */
- li 3,0
- stw 3,20(1)
+ /* get zero into f3 (tedious) */
+ /* note: fsub 3,3,3 is not a reliable way to do this,=20
+ since if f3 holds a NaN or similar then we don't necessarily
+ wind up with zero. */
+ li 5,0
+ stw 5,20(1)
lfs 3,20(1)
mtfsf 0xFF,3 /* fpscr =3D f3 */
LafterFP2:
=20
/* set host AltiVec control word to the default mode expected=20
by VEX-generated code. */
- lis 3,VG_(machine_ppc32_has_VMX)@ha
- lwz 3,VG_(machine_ppc32_has_VMX)@l(3)
- cmplwi 3,0
+ lis 5,VG_(machine_ppc32_has_VMX)@ha
+ lwz 5,VG_(machine_ppc32_has_VMX)@l(5)
+ cmplwi 5,0
beq LafterVMX2
=20
vspltisw 3,0x0 /* generate zero */
@@ -214,36 +220,108 @@
/* make a stack frame for the code we are calling */
stwu 1,-16(1)
=20
- /* fall into main loop */
+ /* fetch %CIA into r3 */
+ lwz 3,OFFSET_ppc32_CIA(31)
=20
-/* Live regs:
- r1 (=3Dsp)
- r30 (=3DCIA =3D jump address)
- r31 (=3Dguest_state)
- ctr (=3Ddispatch_ctr)
- Stack state:
- 44(r1) (=3Dorig guest_state)
-*/
+ /* fall into main loop (the right one) */
+ /* r4 =3D do_profiling. It's probably trashed after here,
+ but that's OK: we don't need it after here. */
+ cmplwi 4,0
+ beq VG_(run_innerloop__dispatch_unprofiled)
+ b VG_(run_innerloop__dispatch_profiled)
+ /*NOTREACHED*/
=20
-dispatch_boring:
+/*----------------------------------------------------*/
+/*--- NO-PROFILING (standard) dispatcher ---*/
+/*----------------------------------------------------*/
+
+.global VG_(run_innerloop__dispatch_unprofiled)
+VG_(run_innerloop__dispatch_unprofiled):
+ /* At entry: Live regs:
+ r1 (=3Dsp)
+ r3 (=3DCIA =3D next guest address)
+ r31 (=3Dguest_state)
+ ctr (=3Ddispatch_ctr)
+ Stack state:
+ 44(r1) (=3Dorig guest_state)
+ */
+
+ /* Has the guest state pointer been messed with? If yes, exit. */
+ lwz 5,44(1) /* original guest_state ptr */
+ cmpw 5,31
+ bne gsp_changed
+
/* save the jump address in the guest state */
- stw 30,OFFSET_ppc32_CIA(31)
+ stw 3,OFFSET_ppc32_CIA(31)
=20
/* Are we out of timeslice? If yes, defer to scheduler. */
bdz counter_is_zero /* decrements ctr reg */
=20
/* try a fast lookup in the translation cache */
/* r4=3D((r30<<2) & (VG_TT_FAST_MASK<<2)) */
- rlwinm 4,30, 2, 32-2-VG_TT_FAST_BITS, 31-2 =20
-// CAB: use a caller-saved reg for this ?
+ rlwinm 4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2 =20
addis 5,4,VG_(tt_fast)@ha
lwz 5,VG_(tt_fast)@l(5)
lwz 6,4(5) /* big-endian, so comparing 2nd 32bit word */
- cmpw 30,6
+ cmpw 3,6
bne fast_lookup_failed
=20
+ /* Found a match. Call tce[1], which is 8 bytes along, since
+ each tce element is a 64-bit int. */
+ addi 8,5,8
+ mtlr 8
+
+ /* stop ctr being clobbered */
+ mfctr 5
+ stw 5,40(1) /* =3D> 40-16 =3D 24(1) on our parent stack */
+
+ /* run the translation */
+ blrl
+
+ /* reinstate clobbered ctr */
+ lwz 5,40(1)
+ mtctr 5
+
+ /* start over */
+ b VG_(run_innerloop__dispatch_unprofiled)
+ /*NOTREACHED*/
+
+/*----------------------------------------------------*/
+/*--- PROFILING dispatcher (can be much slower) ---*/
+/*----------------------------------------------------*/
+
+.global VG_(run_innerloop__dispatch_profiled)
+VG_(run_innerloop__dispatch_profiled):
+ /* At entry: Live regs:
+ r1 (=3Dsp)
+ r3 (=3DCIA =3D next guest address)
+ r31 (=3Dguest_state)
+ ctr (=3Ddispatch_ctr)
+ Stack state:
+ 44(r1) (=3Dorig guest_state)
+ */
+
+ /* Has the guest state pointer been messed with? If yes, exit. */
+ lwz 5,44(1) /* original guest_state ptr */
+ cmpw 5,31
+ bne gsp_changed
+
+ /* save the jump address in the guest state */
+ stw 3,OFFSET_ppc32_CIA(31)
+
+ /* Are we out of timeslice? If yes, defer to scheduler. */
+ bdz counter_is_zero /* decrements ctr reg */
+
+ /* try a fast lookup in the translation cache */
+ /* r4=3D((r3<<2) & (VG_TT_FAST_MASK<<2)) */
+ rlwinm 4,3, 2, 32-2-VG_TT_FAST_BITS, 31-2 =20
+ addis 5,4,VG_(tt_fast)@ha
+ lwz 5,VG_(tt_fast)@l(5)
+ lwz 6,4(5) /* big-endian, so comparing 2nd 32bit word */
+ cmpw 3,6
+ bne fast_lookup_failed
+
/* increment bb profile counter */
-// CAB: use a caller-saved reg for this ?
addis 6,4,VG_(tt_fastN)@ha
lwz 7,VG_(tt_fastN)@l(6)
lwz 8,0(7)
@@ -256,37 +334,57 @@
mtlr 8
=20
/* stop ctr being clobbered */
-// CAB: use a caller-saved reg for this ?
-// but then (bdz) =3D> (decr, cmp, bc)... still better than a stw?
- mfctr 9
- stw 9,40(1) /* =3D> 40-16 =3D 24(1) on our parent stack */
+ mfctr 5
+ stw 5,40(1) /* =3D> 40-16 =3D 24(1) on our parent stack */
=20
+ /* run the translation */
blrl
=20
+ /* reinstate clobbered ctr */
+ lwz 5,40(1)
+ mtctr 5
=20
- /* On return from guest code:
- r3 holds destination (original) address.
+ /* start over */
+ b VG_(run_innerloop__dispatch_profiled)
+ /*NOTREACHED*/
=20
- r31 may be unchanged (guest_state), or may indicate further
- details of the control transfer requested to *r3.
+/*----------------------------------------------------*/
+/*--- exit points ---*/
+/*----------------------------------------------------*/
=20
- If r31 is unchanged (=3D=3D 44(r1)), just jump next to r3.
+gsp_changed:
+ /* Someone messed with the gsp (in r31). Have to
+ defer to scheduler to resolve this. dispatch ctr
+ is not yet decremented, so no need to increment. */
+ /* %CIA is NOT up to date here. First, need to write
+ %r3 back to %CIA, but without trashing %r31 since
+ that holds the value we want to return to the scheduler.
+ Hence use %r5 transiently for the guest state pointer. */
+ lwz 5,44(1) /* original guest_state ptr */
+ stw 3,OFFSET_ppc32_CIA(5)
+ mr 3,31 /* r3 =3D new gsp value */
+ b run_innerloop_exit
+ /*NOTREACHED*/
=20
- Otherwise fall out, back to the scheduler, and let it
- figure out what to do next.
- */
+counter_is_zero:
+ /* %CIA is up to date */
+ /* back out decrement of the dispatch counter */
+ mfctr 5
+ addi 5,5,1
+ mtctr 5
+ li 3,VG_TRC_INNER_COUNTERZERO
+ b run_innerloop_exit
=20
- /* reinstate clobbered ctr */
- lwz 9,40(1)
- mtctr 9
+fast_lookup_failed:
+ /* %CIA is up to date */
+ /* back out decrement of the dispatch counter */
+ mfctr 5
+ addi 5,5,1
+ mtctr 5
+ li 3,VG_TRC_INNER_FASTMISS
+ b run_innerloop_exit
=20
- mr 30,3 /* put CIA (=3Dr3) in r30 */
- lwz 16,44(1) /* original guest_state ptr */
- cmpw 16,31
- beq dispatch_boring /* r31 unchanged... */
=20
- mr 3,31 /* put return val (=3Dr31) in r3 */
- b dispatch_exceptional
=20
/* All exits from the dispatcher go through here.
r3 holds the return value.=20
@@ -301,8 +399,9 @@
cmplwi 10,0
beq LafterFP8
=20
-/* This check avoidance may be removable if stfiwx is implemented. */
-#if !defined(ENABLE_INNER)
+ /* This check avoidance may be removable if stfiwx is
+ implemented. */
+# if !defined(ENABLE_INNER)
/* Check FPSCR & 0xFF =3D=3D 0 (lowest 8bits are controls) */
mffs 4 /* fpscr -> fpr */
li 5,48
@@ -311,7 +410,7 @@
andi. 6,6,0xFF /* mask wanted bits */
cmplwi 6,0x0 /* cmp with zero */
bne invariant_violation /* branch if not zero */
-#endif
+# endif
LafterFP8:
=20
/* Using r11 - value used again further on, so don't trash! */
@@ -445,36 +544,9 @@
addi 1,1,496 /* stack_size */
blr
=20
-
-/* Other ways of getting out of the inner loop. Placed out-of-line to
- make it look cleaner.=20
-*/
-dispatch_exceptional:
- /* this is jumped to only, not fallen-through from above */
- /* save r30 in %CIA and defer to sched */
- lwz 16,44(1)
- stw 30,OFFSET_ppc32_CIA(16)
- b run_innerloop_exit
-
-fast_lookup_failed:
- /* %CIA is up to date here since dispatch_boring dominates */
- mfctr 17
- addi 17,17,1
- mtctr 17
- li 3,VG_TRC_INNER_FASTMISS
- b run_innerloop_exit
-
-counter_is_zero:
- /* %CIA is up to date here since dispatch_boring dominates */
- mfctr 17
- addi 17,17,1
- mtctr 17
- li 3,VG_TRC_INNER_COUNTERZERO
- b run_innerloop_exit
-
/* Let the linker know we don't need an executable stack */
.section .note.GNU-stack,"",@progbits
=20
-##--------------------------------------------------------------------##
-##--- end ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- end ---*/
+/*--------------------------------------------------------------------*/
Modified: trunk/docs/internals/performance.txt
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- trunk/docs/internals/performance.txt 2005-12-15 21:18:34 UTC (rev 535=
1)
+++ trunk/docs/internals/performance.txt 2005-12-15 21:40:34 UTC (rev 535=
2)
@@ -14,11 +14,12 @@
- Nick improved vg_SP_update_pass() to identify more small constant
increments/decrements of SP, so the fast cases can be used more often.
Saved 1--3% on a few programs.
-- r5345,r5346: Julian improved the dispatcher so that x86 and AMD64 use
- jumps instead of call/return for calling translations, and also remove=
d
- the --profile-flags profiling from the dispatcher unless --profile-fla=
gs
- is being used. Improved Nulgrind performance typically by 10--20%,
- and Memcheck performance typically by 2--20%.
+- r5345,r5346,r5352: Julian improved the dispatcher so that x86 and
+ AMD64 use jumps instead of call/return for calling translations.
+ Also, on x86, amd64 and ppc32, --profile-flags style profiling was
+ removed from the dispatch loop unless --profile-flags is being used.
+ Improved Nulgrind performance typically by 10--20%, and Memcheck
+ performance typically by 2--20%.
=20
COMPVBITS branch:
- Nick converted to compress V bits, initial version saved 0--5% on most
|