|
From: Carl L. <ce...@li...> - 2024-04-26 20:40:41
|
Paul:
On Power, there are only 6 arguments maximum for a system call. They are passed in regs 3 to 8 and get
mapped to args 1 to 6. The mmap call has agruments:
__addr, __len, __prot, __flags, __fd, __offset
So, it does seem a little strange for the first agument to be zero???? As I mentioned, I do see call that succeed where arg1 is zero So, I am still missing something.
Carl
On 4/26/24 11:24, Carl Love wrote:
> Paul:
>
> I put some print statements into void VG_(client_syscall) before the call to
> putSyscallStatusIntoGuestState. I put the patch at the bottom so you can see exactly what I did.
>
> Here is what I get right before the seg fault:
>
>
> CARLL, VG_(client_syscall), tid 1
> CARLL, sysno = 90, SYSCALL[871316,1](90)
>
> CARLL, call VG_(client_syscall) 1
> CARLL, syscall info, sci->orig_args.arg1 = 0x0
> CARLL, syscall info, sci->orig_args.arg2 = 0x100000
> CARLL, syscall info, sci->orig_args.arg3 = 0x3
> CARLL, syscall info, sci->orig_args.arg4 = 0x22
> CARLL, syscall info, sci->orig_args.arg5 = 0xffffffffffffffff
> CARLL, syscall info, sci->orig_args.arg6 = 0x0
> CARLL, syscall info, sci->orig_args.arg7 = 0x1
> CARLL, syscall info, sci->orig_args.arg8 = 0x0
> CARLL, syscall info, sci->args.arg1 = 0x0
> CARLL, syscall info, sci->args.arg2 = 0x100000
> CARLL, syscall info, sci->args.arg3 = 0x3
> CARLL, syscall info, sci->args.arg4 = 0x22
> CARLL, syscall info, sci->args.arg5 = 0xffffffffffffffff
> CARLL, syscall info, sci->args.arg6 = 0x0
> CARLL, syscall info, sci->args.arg7 = 0x1
> CARLL, syscall info, sci->args.arg8 = 0x0
> CARLL, syscall info, sci->status.what = 0x1
>
> CARLL, syscall info, flags = 0x0
> CARLL, thread state tid = 0x1
> CARLL, thread state sig_mask = 0x0
> CARLL, call_num = 2477
> CARLL, call VG_(client_syscall) 2
> CARLL, call_num = 2478
> CARLL, SCHEDSETJMP host_code_addr = 0x100302ec08
> CARLL, SCHEDSETJMP host_code_addr = 0x100303d910
> CARLL, SCHEDSETJMP host_code_addr = 0x10030cb7d0
> ....
> CARLL, SCHEDSETJMP host_code_addr = 0x10037230f0
> CARLL, SCHEDSETJMP host_code_addr = 0x1003723180
> CARLL, SCHEDSETJMP host_code_addr = 0x10030d29b0
> CARLL, SCHEDSETJMP host_code_addr = 0x100321f6c8
> ==871316==
> ==871316== Process terminating with default action of signal 11 (SIGSEGV)
> ==871316== Bad permissions for mapped region at address 0x4730000
> ==871316== at 0x43962B8: __memset_power10 (in /usr/lib64/glibc-hwcaps/power9\
> /libc-2.28.so)
>
>
> I looked at some of the previous calls for sysno = 90 and don't see anything that looks really suspicious. I was looking for the documentation on what the syscall arguments are in each of the registers. I have seen this in the past but can't remember where.. I am thinking arg1 is the address, so the fact that it is zero for this call did seem strange. But I see previous calls where it is also zero for example this sequence:
>
> CARLL, call VG_(client_syscall) 1
> CARLL, syscall info, sci->orig_args.arg1 = 0x0
> CARLL, syscall info, sci->orig_args.arg2 = 0x30428
> CARLL, syscall info, sci->orig_args.arg3 = 0x5
> CARLL, syscall info, sci->orig_args.arg4 = 0x802
> CARLL, syscall info, sci->orig_args.arg5 = 0x3
> CARLL, syscall info, sci->orig_args.arg6 = 0x0
> CARLL, syscall info, sci->orig_args.arg7 = 0x1
> CARLL, syscall info, sci->orig_args.arg8 = 0x0
> CARLL, syscall info, sci->args.arg1 = 0x0
> CARLL, syscall info, sci->args.arg2 = 0x30428
> CARLL, syscall info, sci->args.arg3 = 0x5
> CARLL, syscall info, sci->args.arg4 = 0x802
> CARLL, syscall info, sci->args.arg5 = 0x3
> CARLL, syscall info, sci->args.arg6 = 0x0
> CARLL, syscall info, sci->args.arg7 = 0x1
> CARLL, syscall info, sci->args.arg8 = 0x0
> CARLL, syscall info, sci->status.what = 0x1
>
> CARLL, syscall info, flags = 0x0
>
> CARLL, thread state tid = 0x1
> CARLL, thread state sig_mask = 0x0
> CARLL, call_num = 2397
> CARLL, call VG_(client_syscall) 2
> CARLL, call_num = 2398
> CARLL, SCHEDSETJMP host_code_addr = 0x1002f758e0
>
> CARLL, VG_(client_syscall), tid 1
> CARLL, sysno = 90, SYSCALL[871316,1](90)
> CARLL, call VG_(client_syscall) 1
> CARLL, syscall info, sci->orig_args.arg1 = 0x6860000
> CARLL, syscall info, sci->orig_args.arg2 = 0x20000
> CARLL, syscall info, sci->orig_args.arg3 = 0x3
> CARLL, syscall info, sci->orig_args.arg4 = 0x812
> CARLL, syscall info, sci->orig_args.arg5 = 0x3
> CARLL, syscall info, sci->orig_args.arg6 = 0x10000
> CARLL, syscall info, sci->orig_args.arg7 = 0x1
> CARLL, syscall info, sci->orig_args.arg8 = 0x0
> CARLL, syscall info, sci->args.arg1 = 0x6860000
> CARLL, syscall info, sci->args.arg2 = 0x20000
> CARLL, syscall info, sci->args.arg3 = 0x3
> CARLL, syscall info, sci->args.arg4 = 0x812
> CARLL, syscall info, sci->args.arg5 = 0x3
> CARLL, syscall info, sci->args.arg6 = 0x10000
> CARLL, syscall info, sci->args.arg7 = 0x1
> CARLL, syscall info, sci->args.arg8 = 0x0
> CARLL, syscall info, sci->status.what = 0x1
>
> CARLL, syscall info, flags = 0x0
>
> CARLL, thread state tid = 0x1
> CARLL, thread state sig_mask = 0x0
> CARLL, call_num = 2399
> CARLL, call VG_(client_syscall) 2
> CARLL, call_num = 2400
> CARLL, SCHEDSETJMP host_code_addr = 0x1002f758e0
>
> CARLL, VG_(client_syscall), tid 1
> CARLL, sysno = 6, SYSCALL[871316,1](6)
>
> CARLL, call VG_(client_syscall) 1
> CARLL, syscall info, sci->orig_args.arg1 = 0x3
> CARLL, syscall info, sci->orig_args.arg2 = 0x6fffffff
> CARLL, syscall info, sci->orig_args.arg3 = 0xfffffffc80000118
> CARLL, syscall info, sci->orig_args.arg4 = 0x6840610
> CARLL, syscall info, sci->orig_args.arg5 = 0x1079f110
>
> etc.
>
> Where things appear to have worked OK. Really need to know what the register values correspond to. I will keep looking for the documentation on the system call args.
>
> Not sure if this helps you or not? Thanks for the help.
>
>
> On 4/25/24 12:44, Paul Floyd wrote:
>>
>>
>> On 23-04-24 15:52, Carl Love wrote:
>>> Paul:
>>>
>>> I have been digging some more with gdb. I have also put in some print statements to try and figure out when and what syscall the issue occurs on. The issue occurs after processing a system call and we return to running the user code. While running the user code we encounter the seg fault.
>>>
>>> Valgrind calls void VG_(client_syscall) in syswrap-main.c to process a system call. The function calls putSyscallStatusIntoGuestState, twice, as part of processing the system call. I see a variable number of calls to putSyscallStatusIntoGuestState before hitting the seg fault. Note, the number of calls to putSyscallStatusIntoGuestState before the seg fault varies when just running valgrind. I see 2474 or 2478, calls to the function while processing system call 90 or I see 3604 or 3608 function calls for system call 6 before we hit the seg fault. I am puzzled by the inconsistency in the number of calls/sys call number before the segmentation fault, it doesn't "feel" like it is the system call processing per say that is the issue but maybe some other issue, just guessing???? Also, when the failure occurs, it isn't the first time that system call has been handled. It looked like we have processed that system call 10 to 20 times previously without a failure. Don't know
>>> if that helps or not?
>>>
>>> If you have any thoughts as to a possible root cause, please let me know and I will look into it. Thanks.
>>>
>>
>> Hi Carl
>>
>> For client syscalls Valgrind will make several syscalls itself. The sequeence is
>>
>> 1. gettid
>> 2. write (that's the syscall 6) to release the fifo big lock
>> 3. sigprocmask to set the guest signal mask
>> 4. the actual client syscall
>> 5. sigprocmask to set the Valgrind signal mask
>> 6. read to acquire the fifo big lock
>>
>> On multithreaded apps it might be a different thread that performs the read.
>>
>> Going back to your first error
>>
>> ==2282131== Process terminating with default action of signal 11 (SIGSEGV)
>> ==2282131== Bad permissions for mapped region at address 0x6B50000
>> ==2282131== at 0x43962B8: __memset_power10 (in /usr/lib64/glibc-hwcaps/power9/libc-2.28.so)
>> ==2282131== by 0x1013FBFF: PyTuple_New (in /home/carll/anaconda3/envs/faiss/bin/python3.11)
>>
>>
>> I would expect memcheck to redirect the __memset_power10 call which was why I was wondering if other tools were OK.
>>
>> Do the SyscallInfo and ThreadState data structures look OK in gdb when you hit the segfault?
>>
>>
>> Does running with --sanity-level=4 change anything (other than making it a lot slower)?
>>
>> A+
>> Paul
>>
> -----------------------------------------------------------------------------------
>
> valgrind debug prints
>
> ---
> callgrind/callstack.c | 2 +-
> coregrind/m_scheduler/scheduler.c | 1 +
> coregrind/m_syswrap/syswrap-main.c | 62 +++++++++++++++++++++++++++---
> 3 files changed, 59 insertions(+), 6 deletions(-)
>
> diff --git a/callgrind/callstack.c b/callgrind/callstack.c
> index fc1d6eeab..9afff2a37 100644
> --- a/callgrind/callstack.c
> +++ b/callgrind/callstack.c
> @@ -39,7 +39,7 @@
> *
> */
>
> -#define N_CALL_STACK_INITIAL_ENTRIES 500
> +#define N_CALL_STACK_INITIAL_ENTRIES 1000
>
> call_stack CLG_(current_call_stack);
>
> diff --git a/coregrind/m_scheduler/scheduler.c b/coregrind/m_scheduler/scheduler.c
> index fc8cf7c9c..ecbe243a4 100644
> --- a/coregrind/m_scheduler/scheduler.c
> +++ b/coregrind/m_scheduler/scheduler.c
> @@ -1028,6 +1028,7 @@ void run_thread_for_a_while ( /*OUT*/HWord* two_words,
> vg_assert(VG_(in_generated_code) == False);
> VG_(in_generated_code) = True;
>
> + VG_(printf)("CARLL, SCHEDSETJMP host_code_addr = 0x%lx\n",host_code_addr);
> SCHEDSETJMP(
> tid,
> jumped,
> diff --git a/coregrind/m_syswrap/syswrap-main.c b/coregrind/m_syswrap/syswrap-main.c
> index be4ac0f08..7c03a28ae 100644
> --- a/coregrind/m_syswrap/syswrap-main.c
> +++ b/coregrind/m_syswrap/syswrap-main.c
> @@ -1413,12 +1413,16 @@ void getSyscallStatusFromGuestState ( /*OUT*/SyscallStatus* canonical,
> # error "getSyscallStatusFromGuestState: unknown arch"
> # endif
> }
> -
> +int call_num=0;
> static
> void putSyscallStatusIntoGuestState ( /*IN*/ ThreadId tid,
> /*IN*/ SyscallStatus* canonical,
> /*OUT*/VexGuestArchState* gst_vanilla )
> {
> + VG_(printf)("CARLL, call_num = %d\n", ++call_num);
> + if (call_num == 3608) // 3596 for stand alone, 3608 when run under GDB
> + VG_(printf)("CARLL, found bad call\n");
> +
> # if defined(VGP_x86_linux)
> VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
> vg_assert(canonical->what == SsComplete);
> @@ -2150,6 +2154,8 @@ void VG_(client_syscall) ( ThreadId tid, UInt trc )
> SyscallArgLayout layout;
> SyscallInfo* sci;
>
> + VG_(printf)("\nCARLL, VG_(client_syscall), tid %u\n", (unsigned int)tid);
> +
> ensure_initialised();
>
> vg_assert(VG_(is_valid_tid)(tid));
> @@ -2338,6 +2344,9 @@ void VG_(client_syscall) ( ThreadId tid, UInt trc )
> PRINT("SYSCALL[%d,%u](%s) ",
> VG_(getpid)(), tid, VG_SYSNUM_STRING(sysno));
>
> + VG_(printf)("CARLL, sysno = %ld, SYSCALL[%d,%u](%s)\n ",
> + sysno, VG_(getpid)(), tid, VG_SYSNUM_STRING(sysno));
> +
> /* Do any pre-syscall actions */
> if (VG_(needs).syscall_wrapper) {
> UWord tmpv[8];
> @@ -2516,8 +2525,50 @@ void VG_(client_syscall) ( ThreadId tid, UInt trc )
>
> /* Dump the syscall result back in the guest state. This is
> a platform-specific action. */
> - if (!(sci->flags & SfNoWriteResult))
> + if (!(sci->flags & SfNoWriteResult)) {
> + VG_(printf)("\nCARLL, call VG_(client_syscall) 1\n");
> +
> + VG_(printf)("CARLL, syscall info, sci->orig_args.arg1 = 0x%lx\n",
> + sci->orig_args.arg1);
> + VG_(printf)("CARLL, syscall info, sci->orig_args.arg2 = 0x%lx\n",
> + sci->orig_args.arg2);
> + VG_(printf)("CARLL, syscall info, sci->orig_args.arg3 = 0x%lx\n",
> + sci->orig_args.arg3);
> + VG_(printf)("CARLL, syscall info, sci->orig_args.arg4 = 0x%lx\n",
> + sci->orig_args.arg4);
> + VG_(printf)("CARLL, syscall info, sci->orig_args.arg5 = 0x%lx\n",
> + sci->orig_args.arg5);
> + VG_(printf)("CARLL, syscall info, sci->orig_args.arg6 = 0x%lx\n",
> + sci->orig_args.arg6);
> + VG_(printf)("CARLL, syscall info, sci->orig_args.arg7 = 0x%lx\n",
> + sci->orig_args.arg7);
> + VG_(printf)("CARLL, syscall info, sci->orig_args.arg8 = 0x%lx\n",
> + sci->orig_args.arg8);
> + VG_(printf)("CARLL, syscall info, sci->args.arg1 = 0x%lx\n",
> + sci->args.arg1);
> + VG_(printf)("CARLL, syscall info, sci->args.arg2 = 0x%lx\n",
> + sci->args.arg2);
> + VG_(printf)("CARLL, syscall info, sci->args.arg3 = 0x%lx\n",
> + sci->args.arg3);
> + VG_(printf)("CARLL, syscall info, sci->args.arg4 = 0x%lx\n",
> + sci->args.arg4);
> + VG_(printf)("CARLL, syscall info, sci->args.arg5 = 0x%lx\n",
> + sci->args.arg5);
> + VG_(printf)("CARLL, syscall info, sci->args.arg6 = 0x%lx\n",
> + sci->args.arg6);
> + VG_(printf)("CARLL, syscall info, sci->args.arg7 = 0x%lx\n",
> + sci->args.arg7);
> + VG_(printf)("CARLL, syscall info, sci->args.arg8 = 0x%lx\n",
> + sci->args.arg8);
> + VG_(printf)("CARLL, syscall info, sci->status.what = 0x%x\n\n",
> + sci->status.what);
> + VG_(printf)("CARLL, syscall info, flags = 0x%lx\n\n", sci->flags);
> +
> + VG_(printf)("CARLL, thread state tid = 0x%x\n", tst->tid);
> + VG_(printf)("CARLL, thread state sig_mask = 0x%x\n",
> + tst->sig_mask);
> putSyscallStatusIntoGuestState( tid, &sci->status, &tst->arch.vex );
> + }
>
> /* If needed, gdbserver will report syscall return to GDB */
> VG_(gdbserver_report_syscall)(False, sysno, tid);
> @@ -2616,9 +2667,10 @@ void VG_(post_syscall) (ThreadId tid)
> post-handler for sys_open can change the result from success to
> failure if the kernel supplied a fd that it doesn't like), once
> again dump the syscall result back in the guest state.*/
> - if (!(sci->flags & SfNoWriteResult))
> + if (!(sci->flags & SfNoWriteResult)) {
> + VG_(printf)("CARLL, call VG_(client_syscall) 2\n");
> putSyscallStatusIntoGuestState( tid, &sci->status, &tst->arch.vex );
> -
> + }
> /* Do any post-syscall actions required by the tool. */
> if (VG_(needs).syscall_wrapper) {
> UWord tmpv[8];
> @@ -2648,7 +2700,7 @@ void VG_(post_syscall) (ThreadId tid)
>
> /* Similarly, the wrappers might have asked for a yield
> afterwards. */
> - if (sci->flags & SfYieldAfter)
> + if (sci->flags & SfYieldAfter)
> VG_(vg_yield)();
> }
>
|