From: jez <je...@ji...> - 2007-02-20 03:07:34
|
Hi Guys, We're trying to run a pure 64-bit environment (the debian etch amd64 port on both host and guests) but we're having serious problems running our server applications on the guests. To my untrained eyes, it looks like a process calls clone() and then its child and itself get killed off by a SIGSEGV. I've included some straces down below. Any insight or direction anyone can offer would be much appreciated. A solution that doesn't involve changing the host kernel wins full points! :-) We tested this stuff using host: * debian stock 2.6.18-3-amd64 kernel and guests: * 2.6.20 from kernel.org * 2.6.18 with debian patches applied Most of the testing was done using the default configuration (ARCH=um make defconfig) but statically linked. We tried a few other configurations as well but the problem remained. We usually run UML instances inside chroot jails, but we've also tested all this stuff in the wild with: ./vmlinux umid=tuff mem=160M ubda=fs.cow,fs.base \ ubdb=swapfile eth0=tuntap,ituff con=pts ssl=pts uml_dir=tmp Startup output looks like: Checking that ptrace can change system call numbers...OK Checking syscall emulation patch for ptrace...missing Checking for tmpfs mount on /dev/shm...OK Checking PROT_EXEC mmap in /dev/shm/...OK Checking for the skas3 patch in the host: - /proc/mm...not found - PTRACE_FAULTINFO...not found - PTRACE_LDT...not found UML running in SKAS0 mode Checking that ptrace can change system call numbers...OK Checking syscall emulation patch for ptrace...missing The server software that we've been testing with (Asterisk and Apache) are standard debian packages that seem to work fine on the host platform. We also compiled Asterisk from its original source on a guest and tried that as well. All this stuff (configurations, packages, etc) work fine for us on x86. I'm including three sample strace outputs: (1) A trace taken from the host when sshd is the only server running on the guest. (2) A guest trace of Asterisk dying. 
(3) A guest trace of Apache running. Unlike Asterisk, Apache doesn't get killed. I reckon this is because it registers a signal handler, but what do I know. :-) I've got plenty more traces but the other ones tend to be very verbose. (1) Sample trace of the guest doing nothing much: ... --- SIGALRM (Alarm clock) @ 0 (0) --- setitimer(ITIMER_REAL, {it_interval={0, 0}, it_value={0, 0}}, NULL) = 0 setitimer(ITIMER_VIRTUAL, {it_interval={0, 10000}, it_value={0, 10000}}, NULL) = 0 rt_sigprocmask(SIG_UNBLOCK, [USR1], [USR1 ALRM WINCH IO], 8) = 0 setitimer(ITIMER_VIRTUAL, {it_interval={0, 0}, it_value={0, 0}}, NULL) = 0 setitimer(ITIMER_REAL, {it_interval={0, 10000}, it_value={0, 10000}}, NULL) = 0 rt_sigreturn(0) = -1 EINTR (Interrupted system call) nanosleep({10, 0}, 0) = ? ERESTART_RESTARTBLOCK (To be restarted) --- SIGALRM (Alarm clock) @ 0 (0) --- ... --- SIGCHLD (Child exited) @ 0 (0) --- wait4(5627, [{WIFSTOPPED(s) && WSTOPSIG(s) == 133}], WSTOPPED, NULL) = 5627 ptrace(PTRACE_GETREGS, 5627, 0, 0x60e9f188) = 0 ptrace(PTRACE_GETFPREGS, 5627, 0, 0x60e9f260) = 0 ptrace(PTRACE_POKEUSER, 5627, 8*ORIG_RAX, 0x27) = 0 ptrace(PTRACE_SYSCALL, 5627, 0, SIG_0) = 0 --- SIGCHLD (Child exited) @ 0 (0) --- wait4(5627, [{WIFSTOPPED(s) && WSTOPSIG(s) == 133}], WSTOPPED, NULL) = 5627 ptrace(PTRACE_SETREGS, 5627, 0, 0x60e9f188) = 0 ptrace(PTRACE_SETFPREGS, 5627, 0, 0x60e9f260) = 0 ptrace(PTRACE_SYSCALL, 5627, 0, SIG_0) = 0 ... (2) Sample trace of Asterisk's demise: 1785 clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x4001f640) = 1786 1785 exit_group(0) = ? 
1786 setsid() = 1786 1786 chdir("/") = 0 1786 open("/dev/null", O_RDWR) = 3 1786 fstat(3, {st_mode=S_IFCHR|0666, st_rdev=makedev(1, 3), ...}) = 0 1786 dup2(3, 0) = 0 1786 dup2(3, 1) = 1 1786 dup2(3, 2) = 2 1786 close(3) = 0 1786 unlink("/var/run/asterisk/asterisk.pid") = 0 1786 open("/var/run/asterisk/asterisk.pid", O_WRONLY|O_CREAT|O_TRUNC, 0666) = 3 1786 fstat(3, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0 1786 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x40019000 1786 write(3, "1786\n", 5) = 5 1786 close(3) = 0 1786 munmap(0x40019000, 4096) = 0 1786 mmap(NULL, 266240, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = 0x40020000 1786 mprotect(0x40020000, 4096, PROT_NONE) = 0 1786 clone(child_stack=0x40060280, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID|CLONE_DETACHED, parent_tidptr=0x400609f0, tls=0x40060960, child_tidptr=0x400609f0) = 1787 1786 nanosleep({0, 100000}, <unfinished ...> 1787 --- SIGSEGV (Segmentation fault) @ 0 (0) --- (3) Sample trace of Apache: ... 945 rt_sigaction(SIGSEGV, {0x43ace0, [], SA_RESTORER|SA_ONESHOT, 0x4113f410}, NULL, 8) = 0 945 rt_sigaction(SIGBUS, {0x43ace0, [], SA_RESTORER|SA_ONESHOT, 0x4113f410}, NULL, 8) = 0 ... 
945 select(0, NULL, NULL, NULL, {1, 0}) = 0 (Timeout) 945 clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x40025a60) = 961 945 wait4(-1, 0x7f7fc3267c, WNOHANG|WSTOPPED, NULL) = 0 945 select(0, NULL, NULL, NULL, {1, 0} <unfinished ...> 961 rt_sigaction(SIGTERM, {0x446b10, [], SA_RESTORER|SA_INTERRUPT, 0x4113f410}, {0x444fd0, [], SA_RESTORER, 0x4113f410}, 8) = 0 961 geteuid() = 0 961 setgid(33) = 0 961 open("/proc/sys/kernel/ngroups_max", O_RDONLY) = 8 961 read(8, "65536\n", 31) = 6 961 close(8) = 0 961 open("/etc/group", O_RDONLY) = 8 961 fcntl(8, F_GETFD) = 0 961 fcntl(8, F_SETFD, FD_CLOEXEC) = 0 961 lseek(8, 0, SEEK_CUR) = 0 961 fstat(8, {st_mode=S_IFREG|0644, st_size=485, ...}) = 0 961 mmap(NULL, 485, PROT_READ, MAP_SHARED, 8, 0) = 0x40019000 961 lseek(8, 485, SEEK_SET) = 485 961 fstat(8, {st_mode=S_IFREG|0644, st_size=485, ...}) = 0 961 munmap(0x40019000, 485) = 0 961 close(8) = 0 961 setgroups(1, [33]) = 0 961 geteuid() = 0 961 setuid(33) = 0 961 rt_sigprocmask(SIG_SETMASK, ~[ILL TRAP ABRT BUS FPE SEGV USR2 PIPE SYS RTMIN RT_1], NULL, 8) = 0 961 mmap(NULL, 8392704, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = 0x43981000 961 mprotect(0x43981000, 4096, PROT_NONE) = 0 961 clone(child_stack=0x44181280, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID|CLONE_DETACHED, parent_tidptr=0x441819f0, tls=0x44181960, child_tidptr=0x441819f0) = 962 961 rt_sigprocmask(SIG_UNBLOCK, [TERM], NULL, 8) = 0 961 rt_sigaction(SIGTERM, {0x445040, [], SA_RESTORER|SA_INTERRUPT, 0x4113f410}, {0x446b10, [], SA_RESTORER|SA_INTERRUPT, 0x4113f410}, 8) = 0 961 read(4, <unfinished ...> 962 --- SIGSEGV (Segmentation fault) @ 0 (0) --- 962 chdir("/etc/apache2") = 0 962 rt_sigaction(SIGSEGV, {SIG_DFL}, {SIG_DFL}, 8) = 0 962 kill(961, SIGSEGV) = 0 961 <... read resumed> 0x7f7fc32657, 1) = ? 
ERESTARTSYS (To be restarted) 961 --- SIGSEGV (Segmentation fault) @ 0 (0) --- 962 +++ killed by SIGSEGV +++ 945 <... select resumed> ) = ? ERESTARTNOHAND (To be restarted) 945 --- SIGCHLD (Child exited) @ 0 (0) --- 945 select(0, NULL, NULL, NULL, {1, 0}) = 0 (Timeout) 945 wait4(-1, [{WIFSIGNALED(s) && WTERMSIG(s) == SIGSEGV}], WNOHANG|WSTOPPED, NULL) = 961 945 write(6, "[Tue Feb 20 02:46:59 2007] [notice] child pid 961 exit signal Segmentation fault (11)\n", 86) = 86 945 wait4(-1, 0x7f7fc3267c, WNOHANG|WSTOPPED, NULL) = 0 945 select(0, NULL, NULL, NULL, {1, 0}) = 0 (Timeout) 945 clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x40025a60) = 963 ... -- a la groundhog day Even if you can't help, thanks for reading this far! jez |