|
From: <sv...@va...> - 2005-11-08 22:03:18
|
Author: cerion
Date: 2005-11-08 22:03:07 +0000 (Tue, 08 Nov 2005)
New Revision: 5047
Log:
store & load callee-saved floating-point and vector registers in core dispatch loop.
Modified:
trunk/coregrind/m_dispatch/dispatch-ppc32-linux.S
Modified: trunk/coregrind/m_dispatch/dispatch-ppc32-linux.S
===================================================================
--- trunk/coregrind/m_dispatch/dispatch-ppc32-linux.S 2005-11-08 20:59:14 UTC (rev 5046)
+++ trunk/coregrind/m_dispatch/dispatch-ppc32-linux.S 2005-11-08 22:03:07 UTC (rev 5047)
@@ -48,50 +48,104 @@
mflr 0
stw 0,4(1)
=20
- /* New stack frame: save callee-saved regs */
- stwu 1,-88(1)
- stw 31,84(1)
- stw 30,80(1)
- stw 29,76(1)
- stw 28,72(1)
- stw 27,68(1)
- stw 26,64(1)
- stw 25,60(1)
- stw 24,56(1)
- stw 23,52(1)
- stw 22,48(1)
- stw 21,44(1)
- stw 20,40(1)
- stw 19,36(1)
- stw 18,32(1)
- stw 17,28(1)
- stw 16,24(1)
- stw 15,20(1)
- stw 14,16(1)
+ /* New stack frame */
+ stwu 1,-432(1) /* sp should maintain 16-byte alignment */
=20
+ /* CAB: should put this gap somewhere else - see ppc-abi */
+
+ /* callee-saved regs
+ http://developer.apple.com : PowerPCRuntime.pdf : p27 */
+ stw 31,424(1)
+ stw 30,420(1)
+ stw 29,416(1)
+ stw 28,412(1)
+ stw 27,408(1)
+ stw 26,404(1)
+ stw 25,400(1)
+ stw 24,396(1)
+ stw 23,392(1)
+ stw 22,388(1)
+ stw 21,384(1)
+ stw 20,380(1)
+ stw 19,376(1)
+ stw 18,372(1)
+ stw 17,368(1)
+ stw 16,364(1)
+ stw 15,360(1)
+ stw 14,356(1)
+ stw 13,352(1)
+
+ stfd 31,344(1)
+ stfd 30,336(1)
+ stfd 29,328(1)
+ stfd 28,320(1)
+ stfd 27,312(1)
+ stfd 26,304(1)
+ stfd 25,296(1)
+ stfd 24,288(1)
+ stfd 23,280(1)
+ stfd 22,272(1)
+ stfd 21,264(1)
+ stfd 20,256(1)
+ stfd 19,248(1)
+ stfd 18,240(1)
+ stfd 17,232(1)
+ stfd 16,224(1)
+ stfd 15,216(1)
+ stfd 14,208(1)
+
+ li 4,192
+ stvx 31,4,1
+ li 4,176
+ stvx 30,4,1
+ li 4,160
+ stvx 29,4,1
+ li 4,144
+ stvx 28,4,1
+ li 4,128
+ stvx 27,4,1
+ li 4,112
+ stvx 26,4,1
+ li 4,96
+ stvx 25,4,1
+ li 4,80
+ stvx 24,4,1
+ li 4,64
+ stvx 23,4,1
+ li 4,48
+ stvx 22,4,1
+ li 4,32
+ stvx 21,4,1
+ li 4,16
+ stvx 20,4,1
+
/* r3 holds guest_state */
mr 31,3
stw 3,12(1) /* spill orig guest_state ptr */
=20
+ /* 8(1) used later to stop ctr reg being clobbered
+ 4(1) = standard LR-save space
+ */
+
// CAB TODO: Use a caller-saved reg for orig guest_state ptr
// - rem to set non-allocateable in isel.c
=20
/* hold dispatch_ctr in ctr reg */
lis 17,VG_(dispatch_ctr)@ha
lwz 17,VG_(dispatch_ctr)@l(17)
- mtctr 17
+ mtctr 17
=20
/* fetch %CIA into r30 */
lwz 30,OFFSET_ppc32_CIA(31)
=20
- /* set host FPU control word to the default mode expected
+ /* set host FPU control word to the default mode expected
by VEX-generated code. See comments in libvex.h for
more info. */
fsub 3,3,3 /* generate zero */
mtfsf 0xFF,3
=20
- /* set host AltiVec control word to the default mode expected
- by VEX-generated code. */
+ /* set host AltiVec control word to the default mode expected
+ by VEX-generated code. */
lis 3,VG_(have_altivec_ppc32)@ha
lwz 3,VG_(have_altivec_ppc32)@l(3)
cmplwi 3,0
@@ -124,14 +178,14 @@
bdz counter_is_zero /* decrements ctr reg */
=20
/* try a fast lookup in the translation cache */
- /* r4=((r30<<2) & (VG_TT_FAST_MASK<<2)) */
+ /* r4=((r30<<2) & (VG_TT_FAST_MASK<<2)) */
rlwinm 4,30, 2, 32-2-VG_TT_FAST_BITS, 31-2
// CAB: use a caller-saved reg for this ?
addis 5,4,VG_(tt_fast)@ha
lwz 5,VG_(tt_fast)@l(5)
lwz 6,4(5) /* big-endian, so comparing 2nd 32bit word */
cmpw 30,6
- bne fast_lookup_failed
+ bne fast_lookup_failed
=20
/* increment bb profile counter */
// CAB: use a caller-saved reg for this ?
@@ -150,7 +204,7 @@
// CAB: use a caller-saved reg for this ?
// but then (bdz) => (decr, cmp, bc)... still better than a stw?
mfctr 9
- stw 9,24(1)
+ stw 9,24(1) /* => 24-16 = 8(1) on our parent stack */
=20
blrl
=20
@@ -171,12 +225,12 @@
lwz 9,24(1)
mtctr 9
=20
- mr 30,3 /* put CIA (=r3) in r30 */
+ mr 30,3 /* put CIA (=r3) in r30 */
lwz 16,28(1) /* original guest_state ptr */
cmpw 16,31
beq dispatch_boring /* r31 unchanged... */
=20
- mr 3,31 /* put return val (=r31) in r3 */
+ mr 3,31 /* put return val (=r31) in r3 */
b dispatch_exceptional
=20
/* All exits from the dispatcher go through here.
@@ -218,27 +272,75 @@
lis 18,VG_(dispatch_ctr)@ha
stw 17,VG_(dispatch_ctr)@l(18)
=20
- lwz 14,16(1)
- lwz 15,20(1)
- lwz 16,24(1)
- lwz 17,28(1)
- lwz 18,32(1)
- lwz 19,36(1)
- lwz 20,40(1)
- lwz 21,44(1)
- lwz 22,48(1)
- lwz 23,52(1)
- lwz 24,56(1)
- lwz 25,60(1)
- lwz 26,64(1)
- lwz 27,68(1)
- lwz 28,72(1)
- lwz 29,76(1)
- lwz 30,80(1)
- lwz 31,84(1)
- lwz 0,92(1)
+ /* restore callee-saved registers */
+ li 4,16
+ lvx 20,4,1
+ li 4,32
+ lvx 21,4,1
+ li 4,48
+ lvx 22,4,1
+ li 4,64
+ lvx 23,4,1
+ li 4,80
+ lvx 24,4,1
+ li 4,96
+ lvx 25,4,1
+ li 4,112
+ lvx 26,4,1
+ li 4,128
+ lvx 27,4,1
+ li 4,144
+ lvx 28,4,1
+ li 4,160
+ lvx 29,4,1
+ li 4,176
+ lvx 30,4,1
+ li 4,192
+ lvx 31,4,1
+
+ lfd 14,208(1)
+ lfd 15,216(1)
+ lfd 16,224(1)
+ lfd 17,232(1)
+ lfd 18,240(1)
+ lfd 19,248(1)
+ lfd 20,256(1)
+ lfd 21,264(1)
+ lfd 22,272(1)
+ lfd 23,280(1)
+ lfd 24,288(1)
+ lfd 25,296(1)
+ lfd 26,304(1)
+ lfd 27,312(1)
+ lfd 28,320(1)
+ lfd 29,328(1)
+ lfd 30,336(1)
+ lfd 31,344(1)
+
+ lwz 13,352(1)
+ lwz 14,356(1)
+ lwz 15,360(1)
+ lwz 16,364(1)
+ lwz 17,368(1)
+ lwz 18,372(1)
+ lwz 19,376(1)
+ lwz 20,380(1)
+ lwz 21,384(1)
+ lwz 22,388(1)
+ lwz 23,392(1)
+ lwz 24,396(1)
+ lwz 25,400(1)
+ lwz 26,404(1)
+ lwz 27,408(1)
+ lwz 28,412(1)
+ lwz 29,416(1)
+ lwz 30,420(1)
+ lwz 31,424(1)
+
+ /* reset lr & sp */
+ lwz 0,436(1) /* stack_size + 4 */
mtlr 0
- addi 1,1,88
+ addi 1,1,432 /* stack_size */
blr
=20
=20
@@ -259,7 +361,6 @@
mtctr 17
li 3,VG_TRC_INNER_FASTMISS
b run_innerloop_exit
- =20
=20
counter_is_zero:
/* %CIA is up to date here since dispatch_boring dominates */
|
|
From: Greg P. <gp...@us...> - 2005-11-08 22:26:05
|
sv...@va... writes:
> store & load callee-saved floating-point and vector registers in
> core dispatch loop.
>
> Modified: trunk/coregrind/m_dispatch/dispatch-ppc32-linux.S

You probably also need to save the condition register, because some
of the condition register fields are callee-saved in the AIX ABI.

--
Greg Parker gp...@us... |
|
From: Paul M. <pa...@sa...> - 2005-11-09 01:02:43
|
Greg Parker writes:
> You probably also need to save the condition register, because some
> of the condition register fields are callee-saved in the AIX ABI.

And, more relevantly, in the 32-bit PowerPC ELF ABI. Specifically
cr2 - cr4 are callee-saved.

Paul. |
|
From: Cerion Armour-B. <ce...@op...> - 2005-11-09 14:34:48
|
On Wednesday 09 November 2005 01:53, Paul Mackerras wrote:
> Greg Parker writes:
> > You probably also need to save the condition register, because some
> > of the condition register fields are callee-saved in the AIX ABI.
>
> And, more relevantly, in the 32-bit PowerPC ELF ABI. Specifically
> cr2 - cr4 are callee-saved.
>
> Paul.

Ok, saving/restoring CR now (r5055).
But a couple of questions:
Most ppc32 abi references I find ask for LR to be stored at 8(parent_sp), and
CR at 4(parent_sp). But doing this seems to corrupt my stack: I get

valgrind: m_scheduler/scheduler.c:442 (run_thread_for_a_while): Assertion 'trc
== 0' failed.

If I conform to SysV ppc32 abi (as I have been doing), setting LR at 4
(parent_sp), and CR in current stack, all's (seemingly) well.
Am wondering if there's a 'right way' to do this?

Also, I'm now saving/restoring the VRSAVE register - is this
correct/necessary?

Cheers,
Cerion |
|
From: Greg P. <gp...@us...> - 2005-11-10 19:04:54
|
Cerion Armour-Brown writes:
> Most ppc32 abi references I find ask for LR to be stored at 8(parent_sp), and
> CR at 4(parent_sp). But doing this seems to corrupt my stack: I get
>
> valgrind: m_scheduler/scheduler.c:442 (run_thread_for_a_while): Assertion 'trc
> == 0' failed.
>
> If i conform to SysV ppc32 abi (as I have been doing), setting LR at 4
> (parent_sp), and CR in current stack, all's (seemingly) well.

The LR->8(sp) / CR->4(sp) convention is part of the AIX ABI, which
is also used on Darwin and (I think) Linux/ppc64. For Linux/ppc32
you do need the SysV ABI, which uses LR->4(sp) / CR->anywhere. In the
SysV ABI, 8(sp) is usually part of the parent frame's local variable
storage, so it's reasonable that using it clobbers local variable trc.

> Also, I'm now saving/restoring the VRSAVE register - is this
> correct/necessary?

It's necessary in the AIX / Darwin ABI. The Linux kernel might
not actually use vrsave for its intended purpose, but it should
be harmless to preserve vrsave anyway.

--
Greg Parker gp...@us... |
|
From: Nicholas N. <nj...@cs...> - 2005-11-10 19:25:59
|
On Thu, 10 Nov 2005, Greg Parker wrote:
>> If i conform to SysV ppc32 abi (as I have been doing), setting LR at 4
>> (parent_sp), and CR in current stack, all's (seemingly) well.
>
> The LR->8(sp) / CR->4(sp) convention is part of the AIX ABI, which
> is also used on Darwin and (I think) Linux/ppc64. For Linux/ppc32
> you do need the SysV ABI, which uses LR->4(sp) / CR->anywhere. In the
> SysV ABI, 8(sp) is usually part of the parent frame's local variable
> storage, so it's reasonable that using it clobbers local variable trc.
>
>
>> Also, I'm now saving/restoring the VRSAVE register - is this
>> correct/necessary?
>
> It's necessary in the AIX / Darwin ABI. The Linux kernel might
> not actually use vrsave for its intended purpose, but it should
> be harmless to preserve vrsave anyway.

This looks like good info to stick in the code as comments.

Nick |
|
From: Cerion Armour-B. <ce...@op...> - 2005-11-11 01:01:23
|
On Thursday 10 November 2005 20:25, Nicholas Nethercote wrote:
> On Thu, 10 Nov 2005, Greg Parker wrote:
> >> If i conform to SysV ppc32 abi (as I have been doing), setting LR at 4
> >> (parent_sp), and CR in current stack, all's (seemingly) well.
> >
> > The LR->8(sp) / CR->4(sp) convention is part of the AIX ABI, which
> > is also used on Darwin and (I think) Linux/ppc64. For Linux/ppc32
> > you do need the SysV ABI, which uses LR->4(sp) / CR->anywhere. In the
> > SysV ABI, 8(sp) is usually part of the parent frame's local variable
> > storage, so it's reasonable that using it clobbers local variable trc.
> >
> >> Also, I'm now saving/restoring the VRSAVE register - is this
> >> correct/necessary?
> >
> > It's necessary in the AIX / Darwin ABI. The Linux kernel might
> > not actually use vrsave for its intended purpose, but it should
> > be harmless to preserve vrsave anyway.
>
> This looks like good info to stick in the code as comments.
>
> Nick

Done, thanks Greg.

Cerion |