From: Denys V. <dvl...@re...> - 2012-01-20 11:24:22
|
Currently, we use PTRACE_PEEKDATA to read things like filenames and data passed by I/O syscalls. PTRACE_PEEKDATA gets one word per syscall. This is VERY expensive. For example, in order to print fstat syscall, we need to perform this: write(3, "fstat64(1, ", 11) = 11 ptrace(PTRACE_SYSCALL, 3983, 0x1, SIG_0) = 0 --- {si_signo=SIGCHLD, si_code=CLD_TRAPPED, si_pid=3983, si_status=SIGTRAP, si_utime=0, si_stime=0} (Child exited) --- wait4(-1, [{WIFSTOPPED(s) && WSTOPSIG(s) == 133}], __WALL, NULL) = 3983 ptrace(PTRACE_GETREGS, 3983, 0, 0x80885a0) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ecc, [0xa]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ed0, [0]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ed4, [0]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ed8, [0x3]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1edc, [0x2190]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ee0, [0x1]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ee4, [0]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ee8, [0x5]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1eec, [0x8800]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ef0, [0]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ef4, [0]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ef8, [0]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1efc, [0]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f00, [0x400]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f04, [0]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f08, [0]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f0c, [0x4f192228]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f10, [0x18f0e6ba]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f14, [0x4f194c1f]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f18, [0x1b365bd4]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f1c, [0x4f192153]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f20, [0x246674e9]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f24, [0x3]) = 0 ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f28, [0]) = 0 write(3, "{st_mode=S_IFCHR|0620, st_rdev=m"..., 58) = 58 Twenty four trips into kernel to fetch one struct stat! Kernel just got a new syscall, process_vm_readv(), which can be used to copy data out of process' address space. Looks like it's ideal for this goal. This is a patch which uses process_vm_readv() in umoven() and umovestr() functions if possible, with fallback to old method if process_vm_readv() returns ENOSYS. There is a slight change in API: since I read data in blocks of 256 bytes, now umovestr() may overwrite data in the buffer past terminating NUL. One caller used to depend on guarantee that this never happens, but I just committed a small patch which reworks that place to remove this assumption. I don't have a kernel where process_vm_readv() syscall actually works, so the patch definitely needs to wait until it can be tested for real. So far I only tested that on ENOSYS it indeed falls back to old code and everything continues to work :) What do you guys think about it? -- vda diff -d -urpN strace.5/linux/i386/syscallent.h strace.6/linux/i386/syscallent.h --- strace.5/linux/i386/syscallent.h 2012-01-04 15:09:05.000000000 +0100 +++ strace.6/linux/i386/syscallent.h 2012-01-20 12:01:46.824097896 +0100 @@ -377,8 +377,8 @@ { 1, TD, sys_syncfs, "syncfs" }, /* 344 */ { 4, TN, sys_sendmmsg, "sendmmsg" }, /* 345 */ { 2, TD, sys_setns, "setns" }, /* 346 */ - { 5, 0, printargs, "SYS_347" }, /* 347 */ - { 5, 0, printargs, "SYS_348" }, /* 348 */ + { 6, 0, printargs, "process_vm_readv" }, /* 347 */ + { 6, 0, printargs, "process_vm_writev" }, /* 348 */ { 5, 0, printargs, "SYS_349" }, /* 349 */ { 5, 0, printargs, "SYS_350" }, /* 350 */ { 5, 0, printargs, "SYS_351" }, /* 351 */ diff -d -urpN strace.5/linux/powerpc/syscallent.h strace.6/linux/powerpc/syscallent.h --- strace.5/linux/powerpc/syscallent.h 2012-01-10 16:48:13.000000000 +0100 +++ strace.6/linux/powerpc/syscallent.h 2012-01-20 12:01:46.824097896 +0100 @@ -379,8 +379,8 @@ { 1, TD, sys_syncfs, "syncfs" }, /* 348 */ { 4, TN, sys_sendmmsg, "sendmmsg" }, /* 349 */ { 2, TD, sys_setns, "setns" }, /* 350 */ - { 5, 0, printargs, "SYS_351" }, /* 351 */ - { 5, 0, printargs, "SYS_352" }, /* 352 */ + { 6, 0, printargs, "process_vm_readv" }, /* 351 */ + { 6, 0, printargs, "process_vm_writev" }, /* 352 */ { 5, 0, printargs, "SYS_353" }, /* 353 */ { 5, 0, printargs, "SYS_354" }, /* 354 */ { 5, 0, printargs, "SYS_355" }, /* 355 */ diff -d -urpN strace.5/linux/x86_64/syscallent.h strace.6/linux/x86_64/syscallent.h --- strace.5/linux/x86_64/syscallent.h 2012-01-04 15:09:05.000000000 +0100 +++ strace.6/linux/x86_64/syscallent.h 2012-01-20 12:01:46.824097896 +0100 @@ -308,3 +308,5 @@ { 4, TN, sys_sendmmsg, "sendmmsg" }, /* 307 */ { 2, TD, sys_setns, "setns" }, /* 308 */ { 3, 0, sys_getcpu, "getcpu" }, /* 309 */ + { 6, 0, printargs, "process_vm_readv" }, /* 310 */ + { 6, 0, printargs, "process_vm_writev" }, /* 311 */ diff -d -urpN strace.5/util.c strace.6/util.c --- strace.5/util.c 2012-01-20 11:55:31.000000000 +0100 +++ strace.6/util.c 2012-01-20 12:08:23.222153067 +0100 @@ -769,6 +769,35 @@ dumpstr(struct tcb *tcp, long addr, int } } + + +#if !defined(__NR_process_vm_readv) +# if defined(I386) +# define __NR_process_vm_readv 347 +# define __NR_process_vm_writev 348 +# elif defined(X86_64) +# define __NR_process_vm_readv 310 +# define __NR_process_vm_writev 311 +# endif +#endif + +#if defined(__NR_process_vm_readv) +static bool process_vm_readv_not_supported = 0; +static ssize_t process_vm_readv(pid_t pid, + const struct iovec *lvec, + unsigned long liovcnt, + const struct iovec *rvec, + unsigned long riovcnt, + unsigned long flags) +{ + return syscall(__NR_process_vm_readv, (long)pid, lvec, liovcnt, rvec, riovcnt, flags); +} +#else +static bool process_vm_readv_not_supported = 1; +# define process_vm_readv(...) (errno = ENOSYS, -1) +#endif + + #define PAGMASK (~(PAGSIZ - 1)) /* * move `len' bytes of data from process `pid' @@ -786,6 +815,29 @@ umoven(struct tcb *tcp, long addr, int l char x[sizeof(long)]; } u; + if (!process_vm_readv_not_supported) { + struct iovec local[1], remote[1]; + int r; + + local[0].iov_base = laddr; + remote[0].iov_base = (void*)addr; + local[0].iov_len = remote[0].iov_len = len; + r = process_vm_readv(pid, + local, 1, + remote, 1, + /*flags:*/ 0 + ); + if (r < 0) { + if (errno == ENOSYS) { + process_vm_readv_not_supported = 1; + goto vm_readv_not_supported; + } + perror("process_vm_readv"); + } + return r; + } + vm_readv_not_supported: + #if SUPPORTED_PERSONALITIES > 1 if (personality_wordsize[current_personality] < sizeof(addr)) addr &= (1ul << 8 * personality_wordsize[current_personality]) - 1; @@ -921,6 +973,55 @@ umovestr(struct tcb *tcp, long addr, int addr &= (1ul << 8 * personality_wordsize[current_personality]) - 1; #endif + if (!process_vm_readv_not_supported) { + struct iovec local[1], remote[1]; + + local[0].iov_base = laddr; + remote[0].iov_base = (void*)addr; + + while (len > 0) { + int end_in_page; + int r; + int chunk_len; + + /* Don't read kilobytes: most strings are short */ + chunk_len = len; + if (chunk_len > 256) + chunk_len = 256; + /* Don't cross pages. I guess otherwise we can get EFAULT + * and fail to notice that terminating NUL lies + * in the existing (first) page. + * (I hope there aren't arches with pages < 4K) + */ + end_in_page = ((addr + chunk_len) & 4095); + r = chunk_len - end_in_page; + if (r > 0) /* if chunk_len > end_in_page */ + chunk_len = r; /* chunk_len -= end_in_page */ + + local[0].iov_len = remote[0].iov_len = chunk_len; + r = process_vm_readv(pid, + local, 1, + remote, 1, + /*flags:*/ 0 + ); + if (r < 0) { + if (errno == ENOSYS) { + process_vm_readv_not_supported = 1; + goto vm_readv_not_supported; + } + perror("process_vm_readv"); + return -1; + } + if (memchr(local[0].iov_base, 0, r)) + return 1; + local[0].iov_base += r; + remote[0].iov_base += r; + len -= r; + } + return 0; + } + vm_readv_not_supported: + if (addr & (sizeof(long) - 1)) { /* addr not a multiple of sizeof(long) */ n = addr - (addr & -sizeof(long)); /* residue */ |