On Tue, 2005-04-26 at 06:32, Laura Ramirez wrote:
> Hi Roopa,
>
> The panic as you said was because there was no vproc. The problem
> was that the child's task struct was released, but the vproc was not
> cleaned up properly because the vpop_report_state() changed the
> "zombie" state to a run state in the middle of the cleanup. I have
> added locking in the vpop_report_state() to prevent this. Attached is
> a patchfile...can you see if it works?
Thanks Laura, I used your patch on my cluster and I did not see any
panic.
Regards,
Roopa
>
> thanks
>
> laura
>
> Roopa Prabhu wrote:
> > Hello Laura,
> >
> > I hit another panic during reboot when it was trying to kill all
> > processes. Attached is the stack trace at kdb.
> >
> > From what I understand, in ssi_get_ppid_task() the VPROCPTR() call below
> > is called with an illegal pid. And, hence the returned vproc ptr is
> > NULL. There should be a check for vp returned by VPROCPTR()
> >
> > if (t) {
> > vp = VPROCPTR(t->ppid);
> > return PVP(vp)->pvp_pproc;
> > }
> >
> > Above is a small part of the problem.
> > I think this is another task struct reference count problem.
> > Because the task struct on which vpop_report_state() is called is freed
> > already. Tried looking at the code on the grounds of your previous
> > reference count fix. But, wasn't able to locate where.
> >
> > Is the __kill_all routine where you actually call vpop_vproc_pid() the
> > culprit ?
> >
> >
> >
> >
> > ------------------------------------------------------------------------
> >
> >
> >
> >
> > <1>Unable to handle kernel NULL pointer dereference at virtual address 0000000c
> > <1> printing eip:
> > [0]more>
> > Only 'q' or 'Q' are processed at more prompt, input ignored
> > <4>c021fe76
> > <1>*pde = 00000000
> > <1>Oops: 0000 [#1]
> > <4>SMP
> > <4>Modules linked in: autofs4 microcode video button battery ac cciss ext3 jbd d
> > <4>CPU: 0
> > <4>EIP: 0060:[<c021fe76>] Not tainted VLI
> > <4>EFLAGS: 00010292 (2.6.10)
> > <4>EIP is at ssi_get_ppid_task+0x26/0x30
> > <4>eax: 00000000 ebx: 00000001 ecx: 00000000 edx: 00000000
> > <4>esi: f7d23750 edi: f6f30000 ebp: f6f31e38 esp: f6f31e30
> > <4>ds: 007b es: 007b ss: 0068
> > <4>Process killall5 (pid: 67971, threadinfo=f6f30000 task=f7cbd550)
> > <4>Stack: 6b6b6b6b 00000001 f6f31e80 c02177dd f52b6550 0000007e 00000246 000000
> > <4> 00000003 f6f3007b f6e89550 00000000 f6f31e80 00000000 c03c7fbf 000000
> > <4> 00000000 00000001 f7d23730 00000000 f6f31eb8 c01303e5 f7d23730 000000
> > <4>Call Trace:
> > <4> [<c010680f>] show_stack+0x7f/0xa0
> > <4> [<c01069c4>] show_registers+0x164/0x220
> > <4> [<c0106d54>] die+0xf4/0x1a0
> > <4> [<c011c8c5>] do_page_fault+0x375/0x695
> > <4> [<c010645b>] error_code+0x2b/0x30
> > [0]more>
> > Only 'q' or 'Q' are processed at more prompt, input ignored
> > <4> [<c02177dd>] vpop_report_state+0xbd/0x310
> > <4> [<c01303e5>] __kill_all+0x235/0x290
> > <4> [<c0219892>] ssi_kill_all+0x22/0x120
> > <4> [<c01304a7>] kill_something_info+0x67/0x90
> > <4> [<c0132510>] sys_kill+0x50/0x60
> > <4> [<c0105915>] sysenter_past_esp+0x52/0x75
> > <4>Code: ec 5d c3 89 f6 55 31 c0 89 e5 83 ec 08 8b 55 08 85 d2 74 1d b8 01 00 0
> > <4>
> >
> > [0]kdb> bt
> > Stack traceback for pid 67971
> > 0xf7cbd550 67971 67640 1 0 R 0xf7cbd720 *killall5
> > EBP EIP Function (args)
> > 0xf6f31e38 0xc021fe76 ssi_get_ppid_task+0x26 (0xf52b6550, 0x7e, 0x246, 0x0, 0x3)
> > 0xf6f31e80 0xc02177dd vpop_report_state+0xbd (0xf7d23730, 0x12, 0x0, 0x3, 0x0)
> > 0xf6f31eb8 0xc01303e5 __kill_all+0x235 (0x12, 0xf6f31f3c, 0xffffffff, 0xf, 0xf6)
> > 0xf6f31f14 0xc0219892 ssi_kill_all+0x22 (0x12, 0xf6f31f3c, 0xffffffff)
> > 0xf6f31f28 0xc01304a7 kill_something_info+0x67 (0x12, 0xf6f31f3c, 0xffffffff, 0)
> > 0xf6f31fbc 0xc0132510 sys_kill+0x50
> > 0xc0105915 sysenter_past_esp+0x52
> > [0]kdb> rd
> > eax = 0x00000000 ebx = 0x00000001 ecx = 0x00000000 edx = 0x00000000
> > esi = 0xf7d23750 edi = 0xf6f30000 esp = 0xf6f31e30 eip = 0xc021fe76
> > ebp = 0xf6f31e38 xss = 0xc0320068 xcs = 0x00000060 eflags = 0x00010292
> > xds = 0x0000007b xes = 0x0000007b origeax = 0xffffffff &regs = 0xf6f31dfc
> >
> >
> > [0]kdb> call print_vproc 0xf7d23730 /*Roopa:first argument of vpop_set_state */
> > vp_magic=0x63727076 (should be 0x63727076)
> > vp_pid=67901
> > vp_ref_cnt=11
> > vp_data=0xf7d23750
> > vp_hashfwd=0x00000000
> > vp_hashbwd=0x00000000
> > Function print_vproc returned 0x0
> > [0]kdb> call print_pvproc 0xf7d23750
> > pvp_flag=0x84f05e
> > pvp_wstate=0x1
> > pvp_pproc=0xf52b6550
> > pvp_head_childl=0x00000000
> > pvp_childl=0x00000000
> > pvp_head_pgrpl=0xf7d23730
> > pvp_pgrpl=0x00000000
> > pvp_sessionl=0x00000000
> > pvp_head_oclist=0x00000000
> > pvp_oclist=0x00000000
> > pvp_ppid=1
> > pvp_oppid=0
> > pvp_sid=67901
> > pvp_pgid=67901
> > pvp_pp_sid=1
> > pvp_pp_pgid=1
> > pvp_fromnode=1
> > pvp_tonode=1
> > pvp_cttynode=1048577
> > pvp_cttydev=0x0
> > pvp_jobc=0
> > pvp_pgrp_ldr_seqno=0
> > pvp_pgrp_mem_seqno=0
> > pvp_fork_sigmigarg=0
> > pvp.ml.ml_flag=0
> > pvp.ml.ml_shr_count=0
> > pvp.ml.ml_excl_count=0
> > pvp_loadlevel=0
> > pvp_pin=0
> > pvp_localview=1
> >
> > [0]kdb> call print_task_struct 0xf52b6550
> > state=0x6b6b6b6b
> > flags=0x6b6b6b6b
> > ptrace=0x6b6b6b6b
> > lock_depth=1802201963
> > prio=1802201963
> > static_prio=1802201963
> > array=6b6b6b6b
> > sleep_avg=1802201963
> > interactive_credit=1802201963
> > timestamp=7740398493674204011
> > activated=0x6b6b6b6b
> > policy=1802201963
> > &cpus_allowed=0xf52b659c
> > time_slice=1802201963
> > first_time_slice=1802201963
> > tasks.next 0x6b6b6b6b, tasks.prev 0x6b6b6b6b
> > mm=6b6b6b6b
> > active_mm=6b6b6b6b
> > binfmt=6b6b6b6b
> > exit_code=1802201963
> > exit_signal=1802201963
> > pdeath_signal=1802201963
> > personality=0x6b6b6b6b
> > did_exec=1
> > pid=1802201963
> > epid=1802201963
> > ppid=1802201963
> > tgid=1802201963
> > cltnode=1802201963
> > p_vproc=0x6b6b6b6b
> > p_vfparent=0x6b6b6b6b
> > group_leader=0x6b6b6b6b
> > &pids=0xf52b65f4
> > set_child_tid 0x6b6b6b6b
> > clear_child_tid 0x6b6b6b6b
> > rt_priority=0x6b6b6b6b
> > it_real_value=0x6b6b6b6b
> > it_prof_value=0x6b6b6b6b
> > it_virt_value=0x6b6b6b6b
> > it_real_incr=0x6b6b6b6b
> > it_prof_incr=0x6b6b6b6b
> > it_virt_incr=0x6b6b6b6b
> > utime=1802201963
> > stime=1802201963
> >
> > [0]kdb> id ssi_get_ppid_task
> > 0xc021fe50 ssi_get_ppid_task: push %ebp
> > 0xc021fe51 ssi_get_ppid_task+0x1: xor %eax,%eax
> > 0xc021fe53 ssi_get_ppid_task+0x3: mov %esp,%ebp
> > 0xc021fe55 ssi_get_ppid_task+0x5: sub $0x8,%esp
> > 0xc021fe58 ssi_get_ppid_task+0x8: mov 0x8(%ebp),%edx
> > 0xc021fe5b ssi_get_ppid_task+0xb: test %edx,%edx
> > 0xc021fe5d ssi_get_ppid_task+0xd: je 0xc021fe7c ssi_get_ppid_task+0x2c
> > 0xc021fe5f ssi_get_ppid_task+0xf: mov $0x1,%eax
> > 0xc021fe64 ssi_get_ppid_task+0x14: mov %eax,0x4(%esp,1)
> > 0xc021fe68 ssi_get_ppid_task+0x18: mov 0x8c(%edx),%eax
> > 0xc021fe6e ssi_get_ppid_task+0x1e: mov %eax,(%esp,1)
> > 0xc021fe71 ssi_get_ppid_task+0x21: call 0xc021faa0 vprocptr
> > 0xc021fe76 ssi_get_ppid_task+0x26: mov 0xc(%eax),%eax
> > 0xc021fe79 ssi_get_ppid_task+0x29: mov 0x8(%eax),%eax
> > 0xc021fe7c ssi_get_ppid_task+0x2c: mov %ebp,%esp
> > 0xc021fe7e ssi_get_ppid_task+0x2e: pop %ebp
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
> >
>
> ______________________________________________________________________
> Index: cluster/ssi/vproc/dvp_vpops.c
> ===================================================================
> RCS file: /cvsroot/ssic-linux/openssi/kernel/cluster/ssi/vproc/dvp_vpops.c,v
> retrieving revision 1.28
> diff -u -p -r1.28 dvp_vpops.c
> --- cluster/ssi/vproc/dvp_vpops.c 25 Mar 2005 03:40:42 -0000 1.28
> +++ cluster/ssi/vproc/dvp_vpops.c 26 Apr 2005 00:52:14 -0000
> @@ -1291,6 +1291,18 @@ vpop_report_state(
> struct vproc *vp;
> register int old_state, new_state;
> int error = 0;
> + int held = VPROC_LOCK_EXCL_HELD(v);
> +
> + if (!local_only) {
> + if (!held)
> + VPROC_LOCK_EXCL(v, "temp report_state");
> + if ((PVP(v)->pvp_wstate == PVWS_SZOMB) &&
> + (state != VPROC_ZOMBIE)) {
> + if (!held)
> + VPROC_UNLOCK_EXCL(v, "temp report_state");
> + return 0;
> + }
> + }
>
> /* set the local vproc */
> VPROC_LOCK_FLAG(v, "vpop_report_state");
> @@ -1342,15 +1354,11 @@ vpop_report_state(
> * perform SIGCONT processing in the context of the posting process.
> */
> else {
> - int held = VPROC_LOCK_EXCL_HELD(v);
> struct task_struct *tsk = current;
> int current_state = tsk->state;
> int ppid;
> struct task_struct *parent;
>
> - if (!held)
> - VPROC_LOCK_EXCL(v, "temp report_state");
> -
> ppid = PVP(v)->pvp_ppid;
> parent = tsk = GET_PPID_TASK(PVP(v)->pvp_pproc);
> vp = LOCATE_VPROC_PID(ppid, "vpop_report_state");
> Index: cluster/ssi/vproc/vp_subr.c
> ===================================================================
> RCS file: /cvsroot/ssic-linux/openssi/kernel/cluster/ssi/vproc/vp_subr.c,v
> retrieving revision 1.6
> diff -u -p -r1.6 vp_subr.c
> --- cluster/ssi/vproc/vp_subr.c 21 Oct 2004 23:40:50 -0000 1.6
> +++ cluster/ssi/vproc/vp_subr.c 26 Apr 2005 00:52:14 -0000
> @@ -212,7 +212,7 @@ struct task_struct *ssi_get_ppid_task(st
>
> if (t) {
> vp = VPROCPTR(t->ppid);
> - return PVP(vp)->pvp_pproc;
> + return (vp ? PVP(vp)->pvp_pproc : NULL);
> }
>
> return NULL;
|