|
From: <sv...@va...> - 2008-02-08 15:17:20
|
Author: tom
Date: 2008-02-08 15:17:07 +0000 (Fri, 08 Feb 2008)
New Revision: 7383
Log:
Make the clone system call wrappers call VG_(register_stack) to record
the new thread's stack, then make the stack unwinder use that information
to make a better guess at the stack bounds.
This helps avoid crashes trying to unwind the stack under wine when
the starting point is a routine without a proper stack frame.
Modified:
trunk/coregrind/m_stacks.c
trunk/coregrind/m_stacktrace.c
trunk/coregrind/m_syswrap/syswrap-amd64-linux.c
trunk/coregrind/m_syswrap/syswrap-ppc32-linux.c
trunk/coregrind/m_syswrap/syswrap-ppc64-linux.c
trunk/coregrind/m_syswrap/syswrap-x86-linux.c
trunk/coregrind/pub_core_stacks.h
Modified: trunk/coregrind/m_stacks.c
===================================================================
--- trunk/coregrind/m_stacks.c 2008-02-08 00:37:18 UTC (rev 7382)
+++ trunk/coregrind/m_stacks.c 2008-02-08 15:17:07 UTC (rev 7383)
@@ -195,6 +195,20 @@
}
}
+/*
+ * Find the bounds of the stack (if any) which includes the
+ * specified stack pointer.
+ */
+void VG_(stack_limits)(Addr SP, Addr *start, Addr *end )
+{
+ Stack* stack = find_stack_by_addr(SP);
+
+ if (stack) {
+ *start = stack->start;
+ *end = stack->end;
+ }
+}
+
/* This function gets called if new_mem_stack and/or die_mem_stack are
tracked by the tool, and one of the specialised cases
(eg. new_mem_stack_4) isn't used in preference.
Modified: trunk/coregrind/m_stacktrace.c
===================================================================
--- trunk/coregrind/m_stacktrace.c 2008-02-08 00:37:18 UTC (rev 7382)
+++ trunk/coregrind/m_stacktrace.c 2008-02-08 15:17:07 UTC (rev 7383)
@@ -398,6 +398,7 @@
Addr sp = VG_(get_SP)(tid);
Addr lr = VG_(get_LR)(tid);
Addr stack_highest_word = VG_(threads)[tid].client_stack_highest_word;
+ Addr stack_lowest_word = 0;
# if defined(VGP_x86_linux)
/* Nasty little hack to deal with syscalls - if libc is using its
@@ -426,6 +427,9 @@
}
# endif
+ /* See if we can get a better idea of the stack limits */
+ VG_(stack_limits)(sp, &stack_lowest_word, &stack_highest_word);
+
/* Take into account the first_ip_delta. */
vg_assert( sizeof(Addr) == sizeof(Word) );
ip += first_ip_delta;
Modified: trunk/coregrind/m_syswrap/syswrap-amd64-linux.c
===================================================================
--- trunk/coregrind/m_syswrap/syswrap-amd64-linux.c 2008-02-08 00:37:18 UTC (rev 7382)
+++ trunk/coregrind/m_syswrap/syswrap-amd64-linux.c 2008-02-08 15:17:07 UTC (rev 7383)
@@ -258,6 +258,8 @@
ctst->client_stack_highest_word = (Addr)VG_PGROUNDUP(rsp);
ctst->client_stack_szB = ctst->client_stack_highest_word - seg->start;
+ VG_(register_stack)(seg->start, ctst->client_stack_highest_word);
+
if (debug)
VG_(printf)("tid %d: guessed client stack range %p-%p\n",
ctid, seg->start, VG_PGROUNDUP(rsp));
Modified: trunk/coregrind/m_syswrap/syswrap-ppc32-linux.c
===================================================================
--- trunk/coregrind/m_syswrap/syswrap-ppc32-linux.c 2008-02-08 00:37:18 UTC (rev 7382)
+++ trunk/coregrind/m_syswrap/syswrap-ppc32-linux.c 2008-02-08 15:17:07 UTC (rev 7383)
@@ -304,6 +304,8 @@
ctst->client_stack_highest_word = (Addr)VG_PGROUNDUP(sp);
ctst->client_stack_szB = ctst->client_stack_highest_word - seg->start;
+ VG_(register_stack)(seg->start, ctst->client_stack_highest_word);
+
if (debug)
VG_(printf)("\ntid %d: guessed client stack range %p-%p\n",
ctid, seg->start, VG_PGROUNDUP(sp));
Modified: trunk/coregrind/m_syswrap/syswrap-ppc64-linux.c
===================================================================
--- trunk/coregrind/m_syswrap/syswrap-ppc64-linux.c 2008-02-08 00:37:18 UTC (rev 7382)
+++ trunk/coregrind/m_syswrap/syswrap-ppc64-linux.c 2008-02-08 15:17:07 UTC (rev 7383)
@@ -332,6 +332,8 @@
ctst->client_stack_highest_word = (Addr)VG_PGROUNDUP(sp);
ctst->client_stack_szB = ctst->client_stack_highest_word - seg->start;
+ VG_(register_stack)(seg->start, ctst->client_stack_highest_word);
+
if (debug)
VG_(printf)("\ntid %d: guessed client stack range %p-%p\n",
ctid, seg->start, VG_PGROUNDUP(sp));
Modified: trunk/coregrind/m_syswrap/syswrap-x86-linux.c
===================================================================
--- trunk/coregrind/m_syswrap/syswrap-x86-linux.c 2008-02-08 00:37:18 UTC (rev 7382)
+++ trunk/coregrind/m_syswrap/syswrap-x86-linux.c 2008-02-08 15:17:07 UTC (rev 7383)
@@ -270,6 +270,8 @@
ctst->client_stack_highest_word = (Addr)VG_PGROUNDUP(esp);
ctst->client_stack_szB = ctst->client_stack_highest_word - seg->start;
+ VG_(register_stack)(seg->start, ctst->client_stack_highest_word);
+
if (debug)
VG_(printf)("tid %d: guessed client stack range %p-%p\n",
ctid, seg->start, VG_PGROUNDUP(esp));
Modified: trunk/coregrind/pub_core_stacks.h
===================================================================
--- trunk/coregrind/pub_core_stacks.h 2008-02-08 00:37:18 UTC (rev 7382)
+++ trunk/coregrind/pub_core_stacks.h 2008-02-08 15:17:07 UTC (rev 7383)
@@ -39,6 +39,7 @@
extern UWord VG_(register_stack) ( Addr start, Addr end );
extern void VG_(deregister_stack) ( UWord id );
extern void VG_(change_stack) ( UWord id, Addr start, Addr end );
+extern void VG_(stack_limits) ( Addr SP, Addr *start, Addr *end );
extern VG_REGPARM(2)
void VG_(unknown_SP_update) ( Addr old_SP, Addr new_SP );
|
|
From: Julian S. <js...@ac...> - 2008-02-09 11:35:16
|
On Friday 08 February 2008 16:17, sv...@va... wrote: > Author: tom > Date: 2008-02-08 15:17:07 +0000 (Fri, 08 Feb 2008) > New Revision: 7383 > > Log: > Make the clone system call wrappers call VG_(register_stack) to record > the new thread's stack, then make the stack unwinder use that information > to make a better guess at the stack bounds. Seems like a good plan, perhaps a long-overdue thing to do. Some comments: * Would it perhaps be prudent to deregister the stack at thread exit? Otherwise, the linked list of stacks grows without bound (afaics -- VG_(deregister_stack) is only ever called as a result of a client request at the moment). Which clearly isn't too good for eg none/tests/manythreads.c. * In VG_(get_StackTrace), the call to VG_(stack_limits) potentially gets a new value for stack_highest_word, and that could (?) be higher than the previous value, VG_(threads)[tid].client_stack_highest_word (I don't see how, but still ..) Maybe more conservative to use the min of the values from VG_(threads)[tid].client_stack_highest_word and VG_(stack_limits)? * This might give a bit of a performance hit in unwind- intensive programs as the stacks list now has to be searched for each snapshot. I guess we could mostly ameliorate this by the usual trick of incrementally rearranging the list to diffuse frequently-requested entries towards the front. J |
|
From: Tom H. <to...@co...> - 2008-02-12 10:31:07
|
In message <200...@ac...>
Julian Seward <js...@ac...> wrote:
> * Would it perhaps be prudent to deregister the stack at thread
> exit? Otherwise, the linked list of stacks grows without bound
> (afaics -- VG_(deregister_stack) is only ever called as a result
> of a client request at the moment). Which clearly isn't too good
> for eg none/tests/manythreads.c.
Probably a good idea, yes. It will probably mean storing the ID value
returned by register_stack in the thread state.
> * In VG_(get_StackTrace), the call to VG_(stack_limits)
> potentially gets a new value for stack_highest_word, and
> that could (?) be higher than the previous value,
> VG_(threads)[tid].client_stack_highest_word (I don't see how,
> but still ..) Maybe more conservative to use the min of
> the values from VG_(threads)[tid].client_stack_highest_word
> and VG_(stack_limits)?
Well really we should probably change client_stack_highest_word when
a stack change happens surely? That was actually my first plan, but
it turned out to be quite hard as VG_(unknown_SP_update) is the place
that detects the change but it is called from generated code without
any easy access to the thread state, and is probably fairly sensitive
to performance issues.
I did consider updating it each time control returned from the client
to valgrind, but that proved non-trivial as well as m_stacks.c is
tracking the concept of the current stack via a global variable.
> * This might give a bit of a performance hit in unwind-
> intensive programs as the stacks list now has to be searched for
> each snapshot. I guess we could mostly ameliorate this by the
> usual trick of incrementally rearranging the list to diffuse
> frequently-requested entries towards the front.
It may be an issue, yes. In fact my original solution to the
problem of stopping the unwinder crashing was to simply ask the
address space manager whether the stack frame we were about to
try and read existed and was readable. That should be completely
bulletproof, but I was worried about the cost as that was done
for every stack entry not just once per unwind.
Tom
--
Tom Hughes (to...@co...)
http://www.compton.nu/
|
|
From: Julian S. <js...@ac...> - 2008-02-13 02:26:42
|
> > * This might give a bit of a performance hit in unwind- > > intensive programs as the stacks list now has to be searched for > > each snapshot. I guess we could mostly ameliorate this by the > > usual trick of incrementally rearranging the list to diffuse > > frequently-requested entries towards the front. > > It may be an issue, yes. In fact my original solution to the > problem of stopping the unwinder crashing was to simply ask the > address space manager whether the stack frame we were about to > try and read existed and was readable. That should be completely > bulletproof, but I was worried about the cost as that was done > for every stack entry not just once per unwind. The business of tracking the bounds of each stack is a pain, and makes for fragility -- for example, does it now work correctly with user-defined stacks? On consideration, your original solution seems preferable as it sidesteps all the stack bounds stuff. I wonder if the performance hit could be ameliorated by (1) adding a new function (in m_aspace's interface) to ask "is the page containing a given address mapped and readable, and (2) cacheing the results of such queries (inside m_aspacem) Clearly the cache would have to be flushed each time the mapping state changed, or at least partially flushed. However, unwind intensive code would likely do a lot of snapshots relative to map state changes, so the caching would be (temporally) effective. In addition, I'd bet that most stack frames are a lot smaller than a page, so the caching is also spatially effective: if we know that this frame is safe to poke around in, then it's likely that the next frame is in the same page and so we don't even need to ask aspacem about its safety. What do you reckon? Availability of a low-overhead page-safety check facility would be more generally useful too. 
For example in various syscall handlers we need to poke at the client supplied args and we're never 100% it won't fault, especially if the client is passing bogus pointers to the kernel. J |
|
From: Tom H. <to...@co...> - 2008-02-13 11:02:16
|
In message <200...@ac...>
Julian Seward <js...@ac...> wrote:
> The business of tracking the bounds of each stack is a pain, and
> makes for fragility -- for example, does it now work correctly with
> user-defined stacks?
Well it should do so long as they are registered with the
appropriate client request (oddly wine seems to register the
initial stack for each process but not those for any other
threads it creates).
> On consideration, your original solution seems preferable as it
> sidesteps all the stack bounds stuff. I wonder if the performance
> hit could be ameliorated by
>
> (1) adding a new function (in m_aspace's interface) to ask "is the
> page containing a given address mapped and readable, and
Well it really needs to check a range of addresses - unwinding
a stack frame on x86 using the frame pointer requires reading
a group of 8 bytes, which can cross a page boundary.
> (2) cacheing the results of such queries (inside m_aspacem)
Interesting - using an oset or something presumably?
As Nick said, we may as well enter the entire segment into the
cache rather than just the one page - it will only be one cache
entry anyway and it is likely to cover the entire stack with a
single entry that way.
> Clearly the cache would have to be flushed each time the mapping
> state changed, or at least partially flushed. However, unwind
> intensive code would likely do a lot of snapshots relative to
> map state changes, so the caching would be (temporally) effective.
Indeed.
> In addition, I'd bet that most stack frames are a lot smaller
> than a page, so the caching is also spatially effective: if we
> know that this frame is safe to poke around in, then it's likely
> that the next frame is in the same page and so we don't even need
> to ask aspacem about its safety.
If we cache the entire segment then we get even better spatial
effectiveness from the cache.
> What do you reckon? Availability of a low-overhead page-safety
> check facility would be more generally useful too. For example in
> various syscall handlers we need to poke at the client supplied
> args and we're never 100% sure it won't fault, especially if the client
> is passing bogus pointers to the kernel.
Well the system call stuff could use VG_(am_is_valid_for_client) at
the moment, though that has to binary search all the current segments
every time.
Note that stack unwinding needs to allow reading of V segments as
well as C segments, as we are sometimes unwinding valgrind's stack
rather than the client's. The system call check will only want to
allow C segments.
Tom
--
Tom Hughes (to...@co...)
http://www.compton.nu/
|
|
From: Julian S. <js...@ac...> - 2008-02-13 12:05:23
|
> Well it really needs to check a range of addresses - unwinding
> a stack frame on x86 using the frame pointer requires reading
> a group of 8 bytes, which can cross a page boundary.
Hmm, true. Still, checking for page boundary crossing is something
that can be pushed into the query function.
> Interesting - using an oset or something presumably?
OSets are problematic in m_aspacemgr because we can't use dynamic
memory allocation there. I was thinking of something along the
lines of a small fixed size array of known-safe segments, perhaps
arranged as a fully associative or 2 or 4 way set associative cache.
> Well the system call stuff could use VG_(am_is_valid_for_client) at
> the moment, though that has to binary search all the current segments
> every time.
>
> Note that stack unwinding needs to allow reading of V segments as
> well as C segments, as we are sometimes unwinding valgrind's stack
> rather than the client's. The system call check will only want to
> allow C segments.
Yes. So at least as a simple start, we need a function "Bool
VG_(am_dword_is_readable)(Addr)", which returns True if arg ..
arg+2*sizeof(Word)-1 is safe to read.
One way it could be done is to have a fixed-size (16-ish?) array
of pointers to NSegments. Each NSegment is in the array only if
it is safe to read from it. Array is searched from index 0 onwards
and a hit at index > 0 causes that entry to be moved forward one
place.
Assuming that most queries hit entry 0 -- as they should do, since
this is now a cache of segments, not pages -- then the fast case is
if cache[0] != NULL // entry present
&& a >= cache[0]->start
&& a+8 <= cache[0]->end return True
So that's 3 highly predictable conditionals, plus the call and return
branches. I'd say doable in 30 ish cycles in the common case, considering
the call/return overhead. So that's an extra 360 cycles for a common-case
12-frame stack unwind. Not bad really.
J
|
|
From: Eric P. <eri...@or...> - 2008-02-13 20:51:30
|
Tom Hughes a écrit : > In message <200...@ac...> > Julian Seward <js...@ac...> wrote: > > >> The business of tracking the bounds of each stack is a pain, and >> makes for fragility -- for example, does it now work correctly with >> user-defined stacks? >> > > Well it should do so long as they are registered with the > appropriate client request (oddly wine seems to register the > initial stack for each process but not those for any other > threads it creates). > it's mainly because we (wine project) create a specific stack for the first thread in a process (so the first thread of a process gets two stacks: the one allocated by the system, and the one wine creates) each other threads gets only one stack, and it's bound to the thread at thread creation time at the time I looked at this it wasn't necessary to add any instrumentation for this case as valgrind happily handled the one thread / one stack case A+ -- Eric Pouech "The problem with designing something completely foolproof is to underestimate the ingenuity of a complete idiot." (Douglas Adams) |
|
From: Nicholas N. <nj...@cs...> - 2008-02-13 02:35:44
|
On Wed, 13 Feb 2008, Julian Seward wrote: > In addition, I'd bet that most stack frames are a lot smaller > than a page, so the caching is also spatially effective: if we > know that this frame is safe to poke around in, then it's likely > that the next frame is in the same page and so we don't even need > to ask aspacem about its safety. Perhaps you could ask for the extent of the contiguous addressable memory around the particular address? N |
|
From: Julian S. <js...@ac...> - 2008-02-13 02:50:01
|
On Wednesday 13 February 2008 03:35, Nicholas Nethercote wrote: > On Wed, 13 Feb 2008, Julian Seward wrote: > > In addition, I'd bet that most stack frames are a lot smaller > > than a page, so the caching is also spatially effective: if we > > know that this frame is safe to poke around in, then it's likely > > that the next frame is in the same page and so we don't even need > > to ask aspacem about its safety. > > Perhaps you could ask for the extent of the contiguous addressable memory > around the particular address? Yes, that would I guess be a logical generalisation. It does however put the administrative burden more on the caller, since then the caller first has to ask how big is the accessible area in which address X lies; then keep track of when it has gone outside the area, in which case it needs to ask again, etc. Also, the caller(s) have no obvious way to know when any information they have cached, has gone stale. It would be simpler for the caller if all such trickery were pushed into the is-this-page-safe function and we took care to ensure that said function returned extremely quickly in the majority of cases. J |