|
From: <sv...@va...> - 2005-12-15 15:46:54
|
Author: sewardj
Date: 2005-12-15 15:46:43 +0000 (Thu, 15 Dec 2005)
New Revision: 5346
Log:
Rewrite amd64 dispatch loop to add performance enhancements as per x86
reorganisation of r5345.
Modified:
trunk/coregrind/m_dispatch/dispatch-amd64-linux.S
Modified: trunk/coregrind/m_dispatch/dispatch-amd64-linux.S
===================================================================
--- trunk/coregrind/m_dispatch/dispatch-amd64-linux.S 2005-12-15 14:07:07 UTC (rev 5345)
+++ trunk/coregrind/m_dispatch/dispatch-amd64-linux.S 2005-12-15 15:46:43 UTC (rev 5346)
@@ -1,8 +1,8 @@
=20
-##--------------------------------------------------------------------##
-##--- The core dispatch loop, for jumping to a code address. ---##
-##--- dispatch-amd64.S ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- The core dispatch loop, for jumping to a code address. ---*/
+/*--- dispatch-amd64.S ---*/
+/*--------------------------------------------------------------------*/
=20
/*
This file is part of Valgrind, a dynamic binary instrumentation
@@ -39,11 +39,19 @@
/*--- The dispatch loop. ---*/
/*------------------------------------------------------------*/
=20
-/* signature: UWord VG_(run_innerloop) ( void* guest_state ) */
+/*----------------------------------------------------*/
+/*--- Preamble (set everything up) ---*/
+/*----------------------------------------------------*/
=20
+/* signature:
+UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
+*/
+
+.text
.globl VG_(run_innerloop)
VG_(run_innerloop):
/* %rdi holds guest_state */
+ /* %rsi holds do_profiling */
=09
/* ----- entry point to VG_(run_innerloop) ----- */
pushq %rbx
@@ -59,12 +67,13 @@
pushq %r13
pushq %r14
pushq %r15
- pushq %rdi
+ pushq %rdi /* guest_state */
=20
- movq VG_(dispatch_ctr)@GOTPCREL(%rip), %rsi
- pushq (%rsi)
+ movq VG_(dispatch_ctr)@GOTPCREL(%rip), %r15
+ movl (%r15), %r15d
+ pushq %r15
=20
- /* 8(%rsp) holds cached copy of guest_state */
+ /* 8(%rsp) holds cached copy of guest_state ptr */
/* 0(%rsp) holds cached copy of VG_(dispatch_ctr) */
=20
/* Set up the guest state pointer */
@@ -90,12 +99,26 @@
/* set dir flag to known value */
cld
=20
- /* fall into main loop */
+ /* fall into main loop (the right one) */
+ cmpq $0, %rsi
+ je VG_(run_innerloop__dispatch_unprofiled)
+ jmp VG_(run_innerloop__dispatch_profiled)
+ /*NOTREACHED*/=09
=20
- /* Here, %rax is the only live (real) register. The entire
- simulated state is saved in the ThreadState. */
+/*----------------------------------------------------*/
+/*--- NO-PROFILING (standard) dispatcher ---*/
+/*----------------------------------------------------*/
=20
-dispatch_boring:
+.align 16
+.global VG_(run_innerloop__dispatch_unprofiled)
+VG_(run_innerloop__dispatch_unprofiled):
+ /* AT ENTRY: %rax is next guest addr, %rbp is possibly
+ modified guest state ptr */
+
+ /* Has the guest state pointer been messed with? If yes, exit. */
+ cmpq 8(%rsp), %rbp
+ jnz gsp_changed
+
/* save the jump address in the guest state */
movq %rax, OFFSET_amd64_RIP(%rbp)
=20
@@ -104,40 +127,99 @@
jz counter_is_zero
=20
/* try a fast lookup in the translation cache */
- movq %rax, %rbx
- andq $VG_TT_FAST_MASK, %rbx
- movq VG_(tt_fast)@GOTPCREL(%rip), %rcx
- movq (%rcx,%rbx,8), %rcx
- cmpq %rax, (%rcx)
- jnz fast_lookup_failed
- /* increment bb profile counter */
- movq VG_(tt_fastN)@GOTPCREL(%rip), %rdx
- movq (%rdx,%rbx,8), %rdx
- incl (%rdx)
+ movq VG_(tt_fast)@GOTPCREL(%rip), %rcx
+ movq %rax, %rbx
+ andq $VG_TT_FAST_MASK, %rbx
+ movq (%rcx,%rbx,8), %rcx
+ cmpq %rax, (%rcx)
+ jnz fast_lookup_failed
=20
/* Found a match. Call tce[1], which is 8 bytes along, since
each tce element is a 64-bit int. */
addq $8, %rcx
- call *%rcx
+ jmp *%rcx
+ ud2 /* persuade insn decoders not to speculate past here */
+ /* generated code should run, then jump back to
+ VG_(run_innerloop__dispatch_unprofiled). */
+ /*NOTREACHED*/
=20
- /*=20
- %rax holds destination (original) address.
- %rbp indicates further details of the control transfer
- requested to the address in %rax.
-=09
- If rbp is unchanged (== * 8(%rsp)), just jump next to %rax.
+/*----------------------------------------------------*/
+/*--- PROFILING dispatcher (can be much slower) ---*/
+/*----------------------------------------------------*/
=20
- Otherwise fall out, back to the scheduler, and let it
- figure out what to do next.
- */
+.align 16
+.global VG_(run_innerloop__dispatch_profiled)
+VG_(run_innerloop__dispatch_profiled):
+ /* AT ENTRY: %rax is next guest addr, %rbp is possibly
+ modified guest state ptr */
=20
+ /* Has the guest state pointer been messed with? If yes, exit. */
cmpq 8(%rsp), %rbp
- jz dispatch_boring
+ jnz gsp_changed
=20
- jmp dispatch_exceptional
+ /* save the jump address in the guest state */
+ movq %rax, OFFSET_amd64_RIP(%rbp)
=20
+ /* Are we out of timeslice? If yes, defer to scheduler. */
+ subl $1, 0(%rsp)
+ jz counter_is_zero
=20
+ /* try a fast lookup in the translation cache */
+ movq VG_(tt_fast)@GOTPCREL(%rip), %rcx
+ movq %rax, %rbx
+ andq $VG_TT_FAST_MASK, %rbx
+ movq (%rcx,%rbx,8), %rcx
+ cmpq %rax, (%rcx)
+ jnz fast_lookup_failed
=20
+ /* increment bb profile counter */
+ movq VG_(tt_fastN)@GOTPCREL(%rip), %rdx
+ movq (%rdx,%rbx,8), %rdx
+ addl $1, (%rdx)
+
+ /* Found a match. Call tce[1], which is 8 bytes along, since
+ each tce element is a 64-bit int. */
+ addq $8, %rcx
+ jmp *%rcx
+ ud2 /* persuade insn decoders not to speculate past here */
+ /* generated code should run, then jump back to
+ VG_(run_innerloop__dispatch_profiled). */
+ /*NOTREACHED*/
+
+/*----------------------------------------------------*/
+/*--- exit points ---*/
+/*----------------------------------------------------*/
+
+gsp_changed:
+ /* Someone messed with the gsp. Have to
+ defer to scheduler to resolve this. dispatch ctr
+ is not yet decremented, so no need to increment. */
+ /* %RIP is NOT up to date here. First, need to write
+ %rax back to %RIP, but without trashing %rbp since
+ that holds the value we want to return to the scheduler.
+ Hence use %r15 transiently for the guest state pointer. */
+ movq 8(%rsp), %r15
+ movq %rax, OFFSET_amd64_RIP(%r15)
+ movq %rbp, %rax
+ jmp run_innerloop_exit
+ /*NOTREACHED*/
+
+counter_is_zero:
+ /* %RIP is up to date here */
+ /* back out decrement of the dispatch counter */
+ addl $1, 0(%rsp)
+ movq $VG_TRC_INNER_COUNTERZERO, %rax
+ jmp run_innerloop_exit
+
+fast_lookup_failed:
+ /* %RIP is up to date here */
+ /* back out decrement of the dispatch counter */
+ addl $1, 0(%rsp)
+ movq $VG_TRC_INNER_FASTMISS, %rax
+ jmp run_innerloop_exit
+
+
+
/* All exits from the dispatcher go through here. %rax holds
the return value.=20
*/
@@ -150,14 +232,14 @@
pushq $0
fstcw (%rsp)
cmpl $0x027F, (%rsp)
- popq %r11 /* get rid of the word without trashing %eflags */
+ popq %r15 /* get rid of the word without trashing %eflags */
jnz invariant_violation
#endif
pushq $0
stmxcsr (%rsp)
andl $0xFFFFFFC0, (%rsp) /* mask out status flags */
cmpl $0x1F80, (%rsp)
- popq %r11
+ popq %r15
jnz invariant_violation
/* otherwise we're OK */
jmp run_innerloop_exit_REALLY
@@ -167,8 +249,12 @@
jmp run_innerloop_exit_REALLY
=20
run_innerloop_exit_REALLY:
- movq VG_(dispatch_ctr)@GOTPCREL(%rip), %rsi
- popq (%rsi)
+
+ /* restore VG_(dispatch_ctr) */=09
+ popq %r14
+ movq VG_(dispatch_ctr)@GOTPCREL(%rip), %r15
+ movl %r14d, (%r15)
+
popq %rdi
popq %r15
popq %r14
@@ -190,31 +276,13 @@
/* Other ways of getting out of the inner loop. Placed out-of-line to
make it look cleaner.=20
*/
-dispatch_exceptional:
- /* this is jumped to only, not fallen-through from above */
=20
- /* save %rax in %RIP and defer to sched */
- movq 8(%rsp), %rdi
- movq %rax, OFFSET_amd64_RIP(%rdi)
- movq %rbp, %rax
- jmp run_innerloop_exit
=20
-fast_lookup_failed:
- /* %RIP is up to date here since dispatch_boring dominates */
- addl $1, 0(%rsp)
- movq $VG_TRC_INNER_FASTMISS, %rax
- jmp run_innerloop_exit
=20
-counter_is_zero:
- /* %RIP is up to date here since dispatch_boring dominates */
- addl $1, 0(%rsp)
- movq $VG_TRC_INNER_COUNTERZERO, %rax
- jmp run_innerloop_exit
=20
-
/* Let the linker know we don't need an executable stack */
.section .note.GNU-stack,"",@progbits
=20
-##--------------------------------------------------------------------##
-##--- end ---##
-##--------------------------------------------------------------------##
+/*--------------------------------------------------------------------*/
+/*--- end ---*/
+/*--------------------------------------------------------------------*/
|