From: Denys V. <dvl...@re...> - 2009-02-23 13:47:24
|
diff -x CVS -urpN 2009-01-27/ChangeLog 2009-01-28/ChangeLog --- 2009-01-27/ChangeLog 2009-01-26 20:09:35.000000000 +0100 +++ 2009-01-28/ChangeLog 2009-01-27 20:38:44.000000000 +0100 @@ -1,3 +1,8 @@ +2009-01-27 Denys Vlasenko <dvl...@re...> + + * strace.c (collect_stopped_tcbs): Guard against the case when + waitpid() reports the same task multiple times. + 2009-01-26 Denys Vlasenko <dvl...@re...> * process.c (printwaitn): Add comment about wait4() pid expansion. diff -x CVS -urpN 2009-01-27/strace.c 2009-01-28/strace.c --- 2009-01-27/strace.c 2009-01-21 20:05:43.000000000 +0100 +++ 2009-01-28/strace.c 2009-01-27 20:38:44.000000000 +0100 @@ -27,7 +27,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $Id: strace.c,v 1.103 2009/01/21 19:05:43 vda_linux Exp $ + * $Id: strace.c,v 1.104 2009/01/27 19:38:44 vda_linux Exp $ */ #include "defs.h" @@ -2255,21 +2255,39 @@ handle_group_exit(struct tcb *tcp, int s static struct tcb * collect_stopped_tcbs(void) { +#ifdef LINUX + static int remembered_pid; + static int remembered_status; +#endif int pid; int wait_errno; int status; struct tcb *tcp; + struct tcb *found_tcps; #ifdef LINUX + struct tcb **nextp; struct rusage ru; struct rusage* ru_ptr = cflag ? &ru : NULL; int wnohang = 0; - struct tcb *found_tcps; - struct tcb **nextp = &found_tcps; #ifdef __WALL int wait4_options = __WALL; #endif + + if (remembered_pid > 0) { + pid = remembered_pid; + remembered_pid = 0; + if (debug) + fprintf(stderr, " [remembered wait(%#x) = %u]\n", + remembered_status, pid); + tcp = pid2tcb(pid); /* can't be NULL */ + tcp->wait_status = remembered_status; + tcp->next_need_service = NULL; + return tcp; + } + nextp = &found_tcps; #endif /* LINUX */ + found_tcps = NULL; while (1) { #ifdef LINUX #ifdef __WALL @@ -2338,6 +2356,14 @@ collect_stopped_tcbs(void) if (debug) fprintf(stderr, " [wait(%#x) = %u]\n", status, pid); + /* RHEL5 bug workaround. + * It can re-report stopped tasks. Happens on SIGSTOPs here. + * Second (bogus) report has signal# set to 0. + * Stop collecting and process what we have. + */ + if (WIFSTOPPED(status) && WSTOPSIG(status) == 0) + break; + /* Look up `pid' in our table. */ if ((tcp = pid2tcb(pid)) == NULL) { #ifdef LINUX @@ -2384,31 +2410,58 @@ Process %d attached (waiting for parent) * process has not been actually restarted. * Since we have inspected the arguments of suspended * processes we end up here testing for this case. + * + * We also end up here when we catch new pid of + * CLONE_PTRACEd process. Do not process/restart it + * until we see corresponding clone() syscall exit + * in its parent. */ continue; } - tcp->wait_status = status; #ifdef LINUX + /* So far observed only on RHEL5 ia64, but I imagine this + * can legitimately happen elsewhere. + * If we waited and got a stopped task notification, + * subsequent wait may return the same pid again, for example, + * with SIGKILL notification. SIGKILL kills even stopped tasks. + * We must not add it to the list + * (one task can't be inserted twice in the list). + */ + { + struct tcb *f = found_tcps; + while (f) { + if (f == tcp) { + remembered_pid = pid; + remembered_status = status; + return found_tcps; + } + f = f->next_need_service; + } + } /* It is important to not invert the order of tasks * to process. For one, alloc_tcb() above picks newly forked * threads in some order, processing of them and their parent * should be in the same order, otherwise bad things happen * (misinterpreted SIGSTOPs and such). */ + tcp->wait_status = status; *nextp = tcp; nextp = &tcp->next_need_service; + *nextp = NULL; wnohang = WNOHANG; #endif #ifdef SUNOS4 /* Probably need to replace wait with waitpid * and loop on Sun too, but I can't test it. Volunteers? */ + tcp->wait_status = status; + tcp->next_need_service = NULL; + found_tcps = tcp; break; #endif - } + } /* while (1) - collecting all stopped/exited tracees */ - *nextp = NULL; return found_tcps; } |