From: Maynard J. <may...@us...> - 2013-02-23 00:13:04
|
This patch exposed a latent bug . . . The operf_record destructor was incorrectly updating "op_header info" for both types of output, file or pipe. It should only re-write the op_header info for file type of output, since the only new information in the re-write is the data size which is needed when reading the file. Version 2 of the patch fixes this. -Maynard ------------------ Background info for this patch: I've been debugging a nasty problem recently that occurs under certain conditions on large servers when profiling a threaded application at a high sampling rate (100000 or 200000 cycles per sample). The symptom is corrupted sample data, but this was not obvious in my initial debugging sessions because of the way that the operf_read process would die in a very ungraceful way. I had to make some changes to handle various corner cases and to detect the bad data and exit gracefully before I was able to even get a handle on what was causing the process to die (i.e., the corrupted data). This issue is still being investigated, but I think the changes to make the reading/conversion process more robust are definitely beneficial in their own right -- thus, I submit this patch for review. Thanks. -Maynard ------------------------------------------------------------ Make convertPerfData procedure more robust The operf_read::convertPerfData function reads sample data in perf_events format from either the temporary operf.data file or a pipe, depending on whether or not operf is run with the --lazy-conversion option. This patch makes the reading/conversion process more robust so that if bad data is found in the file or pipe, the process will display helpful messages and end gracefully. This patch also makes some other minor cleanups, correcting some misspellings, etc. Signed-off-by: Maynard Johnson <may...@us...> --- libperf_events/operf_counter.cpp | 168 ++++++++++++++++++++++++++++---------- libperf_events/operf_counter.h | 7 +- libperf_events/operf_utils.cpp | 80 +++++++++++++----- libperf_events/operf_utils.h | 10 ++- pe_profiling/operf.cpp | 22 +++-- 5 files changed, 206 insertions(+), 81 deletions(-) Index: oprof-work/libperf_events/operf_counter.cpp =================================================================== --- oprof-work.orig/libperf_events/operf_counter.cpp +++ oprof-work/libperf_events/operf_counter.cpp @@ -57,14 +57,23 @@ static const char *__op_magic = "OPFILE" #define OP_MAGIC (*(u64 *)__op_magic) - -int _get_perf_event_from_pipe(event_t * event, int sample_data_fd) +/* This function for reading an event from the sample data pipe must + * be robust enough to handle the situation where the operf_record process + * writes an event record to the pipe in multiple chunks. + */ +#define OP_PIPE_READ_OK 0 +#define OP_PIPE_CLOSED -1 +static int _get_perf_event_from_pipe(event_t * event, int sample_data_fd) { static size_t pe_header_size = sizeof(perf_event_header); + size_t read_size = pe_header_size; + int rc = OP_PIPE_READ_OK; char * evt = (char *)event; ssize_t num_read; perf_event_header * header = (perf_event_header *)event; + memset(header, '\0', pe_header_size); + /* A signal handler was setup for the operf_read process to handle interrupts * (i.e., from ctrl-C), so the read syscalls below may get interrupted. But the * operf_read process should ignore the interrupt and continue processing @@ -74,52 +83,87 @@ int _get_perf_event_from_pipe(event_t * */ again: errno = 0; - if ((num_read = read(sample_data_fd, header, pe_header_size)) < 0) { + if ((num_read = read(sample_data_fd, header, read_size)) < 0) { cverb << vdebug << "Read 1 of sample data pipe returned with " << strerror(errno) << endl; - if (errno == EINTR) + if (errno == EINTR) { goto again; - else - return -1; + } else { + rc = OP_PIPE_CLOSED; + goto out; + } } else if (num_read == 0) { - return -1; + // Implies pipe has been closed on the write end, so return -1 to quit reading + rc = OP_PIPE_CLOSED; + goto out; + } else if (num_read != read_size) { + header += num_read; + read_size -= num_read; + goto again; } + + read_size = header->size - pe_header_size; + if (read_size == 0) + /* This is technically a valid record -- it's just empty. I'm not + * sure if this can happen (i.e., if the kernel ever creates empty + * records), but we'll handle it just in case. + */ + goto again; + + if (!header->size || (header->size < pe_header_size)) + /* Bogus header size detected. In this case, we don't set rc to -1, + * because the caller will catch this error when it calls is_header_valid(). + * I've seen such bogus stuff occur when profiling lots of processes at + * a very high sampling frequency. This issue is still being investigated, + * so for now, we'll just do our best to detect and handle gracefully. + */ + goto out; + evt += pe_header_size; - if (!header->size) - return -1; again2: - if ((num_read = read(sample_data_fd, evt, header->size - pe_header_size)) < 0) { + if ((num_read = read(sample_data_fd, evt, read_size)) < 0) { cverb << vdebug << "Read 2 of sample data pipe returned with " << strerror(errno) << endl; - if (errno == EINTR) + if (errno == EINTR) { goto again2; - else - return -1; + } else { + rc = OP_PIPE_CLOSED; + if (errno == EFAULT) + cerr << "Size of event record: " << header->size << endl; + goto out; + } } else if (num_read == 0) { - return -1; + // Implies pipe has been closed on the write end, so return -1 to quit reading + rc = OP_PIPE_CLOSED; + goto out; + } else if (num_read != read_size) { + evt += num_read; + read_size -= num_read; + goto again; } - return 0; + +out: + return rc; } -event_t * _get_perf_event_from_file(struct mmap_info & info) +static event_t * _get_perf_event_from_file(struct mmap_info & info) { uint32_t size; + static int num_remaps = 0; event_t * event; - - if (info.offset + info.head >= info.file_data_offset + info.file_data_size) - return NULL; - - if (!pg_sz) - pg_sz = sysconf(_SC_PAGESIZE); + size_t pe_header_size = sizeof(struct perf_event_header); try_again: - event = (event_t *)(info.buf + info.head); + event = NULL; + if (info.offset + info.head + pe_header_size > info.file_data_size) + goto out; + + if (info.head + pe_header_size <= mmap_size) + event = (event_t *)(info.buf + info.head); - if ((mmap_size != info.file_data_size) && - (((info.head + sizeof(event->header)) > mmap_size) || - (info.head + event->header.size > mmap_size))) { + if (unlikely(!event || (info.head + event->header.size > mmap_size))) { int ret; u64 shift = pg_sz * (info.head / pg_sz); - cverb << vconvert << "Remapping perf data file" << endl; + cverb << vdebug << "Remapping perf data file: " << dec << ++num_remaps << endl; ret = munmap(info.buf, mmap_size); if (ret) { string errmsg = "Internal error: munmap of perf data file failed with errno: "; @@ -139,16 +183,13 @@ try_again: } size = event->header.size; - - // The tail end of the operf data file may be zero'ed out, so we assume if we - // find size==0, we're now in that area of the file, so we're done. - if (size == 0) - return NULL; - info.head += size; - if (info.offset + info.head >= info.file_data_offset + info.file_data_size) - return NULL; - +out: + if (unlikely(!event)) { + cverb << vdebug << "No more event records in file. info.offset: " << dec << info.offset + << "; info.head: " << info.head << "; info.file_data_size: " << info.file_data_size + << endl << "; mmap_size: " << mmap_size << "; current record size: " << size << endl; + } return event; } @@ -236,12 +277,15 @@ operf_record::~operf_record() { cverb << vrecord << "operf_record::~operf_record()" << endl; opHeader.data_size = total_bytes_recorded; - if (total_bytes_recorded) + // If recording to a file, we re-write the op_header info + // in order to update the data_size field. + if (total_bytes_recorded && write_to_file) write_op_header_info(); if (poll_data) delete[] poll_data; - close(output_fd); + if (write_to_file) + close(output_fd); for (int i = 0; i < samples_array.size(); i++) { struct mmap_data *md = &samples_array[i]; munmap(md->base, (num_mmap_pages + 1) * pagesize); @@ -253,7 +297,7 @@ operf_record::~operf_record() operf_record::operf_record(int out_fd, bool sys_wide, pid_t the_pid, bool pid_running, vector<operf_event_t> & events, vmlinux_info_t vi, bool do_cg, -bool separate_by_cpu, bool out_fd_is_file) + bool separate_by_cpu, bool out_fd_is_file) { int flags = O_CREAT|O_RDWR|O_TRUNC; struct sigaction sa; @@ -678,19 +722,27 @@ void operf_record::recordPerfData(void) while (1) { int prev = sample_reads; + if (quit && disabled) + break; for (int i = 0; i < samples_array.size(); i++) { - if (samples_array[i].base) + if ((samples_array[i].base) && !quit) op_get_kernel_event_data(&samples_array[i], this); } - if (quit && disabled) - break; if (prev == sample_reads) { poll(poll_data, poll_count, -1); } if (quit) { + /* If we're writing to a pipe, we want to close it immediately so + * as to let the reader know to quit reading from the pipe. If we + * don't close the pipe right away, I've seen cases (with very high + * sampling frequencies collecting samples from many processes) where + * the reader somehow gets bad data from the pipe and will abend. + */ + if (!write_to_file) + close(output_fd); for (unsigned int i = 0; i < perfCounters.size(); i++) ioctl(perfCounters[i].get_fd(), PERF_EVENT_IOC_DISABLE); disabled = true; @@ -901,21 +953,27 @@ int operf_read::get_eventnum_by_perf_eve return -1; } -int operf_read::convertPerfData(void) + +unsigned int operf_read::convertPerfData(void) { - int num_bytes = 0; + unsigned int num_bytes = 0; struct mmap_info info; + bool error = false; event_t * event; if (!inputFname.empty()) { info.file_data_offset = opHeader.data_offset; info.file_data_size = opHeader.data_size; + cverb << vdebug << "Expecting to read approximately " << dec + << info.file_data_size - info.file_data_offset + << " bytes from operf sample data file." << endl; info.traceFD = open(inputFname.c_str(), O_RDONLY); if (info.traceFD == -1) { cerr << "Error: open failed with errno:\n\t" << strerror(errno) << endl; throw runtime_error("Error: Unable to open operf data file"); } cverb << vdebug << "operf_read opened " << inputFname << endl; + pg_sz = sysconf(_SC_PAGESIZE); if (op_mmap_trace_file(info, true) < 0) { close(info.traceFD); throw runtime_error("Error: Unable to mmap operf data file"); @@ -933,6 +991,7 @@ int operf_read::convertPerfData(void) cverb << vdebug << "sample type is " << hex << opHeader.h_attrs[0].attr.sample_type << endl; first_time_processing = true; int num_recs = 0; + struct perf_event_header last_header; bool print_progress = !inputFname.empty() && syswide; if (print_progress) cerr << "Converting profile data to OProfile format" << endl; @@ -947,17 +1006,38 @@ int operf_read::convertPerfData(void) break; } rec_size = event->header.size; - op_write_event(event, opHeader.h_attrs[0].attr.sample_type); + + if ((!is_header_valid(event->header)) || + ((op_write_event(event, opHeader.h_attrs[0].attr.sample_type)) < 0)) { + error = true; + last_header = event->header; + break; + } num_bytes += rec_size; num_recs++; if ((num_recs % 1000000 == 0) && print_progress) cerr << "."; } + + if (unlikely(error)) { + if (!inputFname.empty()) { + cerr << "ERROR: operf_read::convertPerfData quitting. Bad data read from file." << endl; + } else { + cerr << "ERROR: operf_read::convertPerfData quitting. Bad data read from pipe." << endl; + cerr << "Closing read end of data pipe. operf-record process will stop with SIGPIPE (13)." + << endl; + } + cerr << "Try lowering the sample frequency to avoid this error; e.g., double the 'count'" + << endl << "value in your event specification." << endl; + cverb << vdebug << "Event header type: " << last_header.type << "; size: " << last_header.size << endl; + } + if (print_progress) cerr << endl; first_time_processing = false; - op_reprocess_unresolved_events(opHeader.h_attrs[0].attr.sample_type); + if (!error) + op_reprocess_unresolved_events(opHeader.h_attrs[0].attr.sample_type); op_release_resources(); operf_print_stats(operf_options::session_dir, start_time_human_readable, throttled); Index: oprof-work/libperf_events/operf_counter.h =================================================================== --- oprof-work.orig/libperf_events/operf_counter.h +++ oprof-work/libperf_events/operf_counter.h @@ -88,7 +88,7 @@ public: int out_fd(void) const { return output_fd; } void add_to_total(int n) { total_bytes_recorded += n; } void add_process(struct comm_event proc) { procs.push_back(proc); } - int get_total_bytes_recorded(void) const { return total_bytes_recorded; } + unsigned int get_total_bytes_recorded(void) const { return total_bytes_recorded; } void register_perf_event_id(unsigned counter, u64 id, perf_event_attr evt_attr); bool get_valid(void) { return valid; } @@ -119,7 +119,7 @@ private: bool callgraph; bool separate_cpu; std::vector<operf_counter> perfCounters; - int total_bytes_recorded; + unsigned int total_bytes_recorded; int poll_count; struct OP_header opHeader; std::vector<operf_event_t> evts; @@ -135,7 +135,7 @@ public: std::vector<operf_event_t> & evts, bool systemwide); ~operf_read(); int readPerfHeader(void); - int convertPerfData(void); + unsigned int convertPerfData(void); bool is_valid(void) {return valid; } int get_eventnum_by_perf_event_id(u64 id) const; inline const operf_event_t * get_event_by_counter(u32 counter) { return &evts[counter]; } @@ -150,7 +150,6 @@ private: bool valid; bool syswide; op_cpu cpu_type; - int _get_one_perf_event(event_t *); int _read_header_info_with_ifstream(void); int _read_perf_header_from_file(void); int _read_perf_header_from_pipe(void); Index: oprof-work/libperf_events/operf_utils.cpp =================================================================== --- oprof-work.orig/libperf_events/operf_utils.cpp +++ oprof-work/libperf_events/operf_utils.cpp @@ -346,7 +346,7 @@ static void __handle_comm_event(event_t * calling disassociate_from_parent(). */ if (cverb << vconvert) - cout << "Disassociating forked proc " << event->comm.pid + cout << "Dis-associating forked proc " << event->comm.pid << " from parent" << endl; it->second->disassociate_from_parent(event->comm.comm); } else { @@ -672,25 +672,38 @@ static void __map_hypervisor_sample(u64 proc->process_hypervisor_mapping(ip); } -static void __handle_throttle_event(event_t * event, u64 sample_type) +static int __handle_throttle_event(event_t * event, u64 sample_type) { + int rc = 0; trans.event = operfRead.get_eventnum_by_perf_event_id(event->throttle.id); - - __set_event_throttled(trans.event); + if (trans.event >= 0) + __set_event_throttled(trans.event); + else + rc = -1; + return rc; } -static void __handle_sample_event(event_t * event, u64 sample_type) +static int __handle_sample_event(event_t * event, u64 sample_type) { struct sample_data data; bool found_trans = false; bool in_kernel; + int rc = 0; const struct operf_mmap * op_mmap = NULL; bool hypervisor = (event->header.misc == PERF_RECORD_MISC_HYPERVISOR); u64 *array = event->sample.array; + /* As we extract the various pieces of information from the sample data array, + * if we find that the sample type does not match up with an expected mandatory + * perf_event_sample_format, we consider this as corruption of the sample data + * stream. Since it wouldn't make sense to continue with suspect data, we quit. + */ if (sample_type & PERF_SAMPLE_IP) { data.ip = event->ip.ip; array++; + } else { + rc = -1; + goto done; } if (sample_type & PERF_SAMPLE_TID) { @@ -698,14 +711,21 @@ static void __handle_sample_event(event_ data.pid = p[0]; data.tid = p[1]; array++; + } else { + rc = -1; + goto done; } data.id = ~0ULL; if (sample_type & PERF_SAMPLE_ID) { data.id = *array; array++; + } else { + rc = -1; + goto done; } + // PERF_SAMPLE_CPU is optional (see --separate-cpu). if (sample_type & PERF_SAMPLE_CPU) { u_int32_t *p = (u_int32_t *)array; data.cpu = *p; @@ -778,7 +798,8 @@ static void __handle_sample_event(event_ trans.event = operfRead.get_eventnum_by_perf_event_id(data.id); if (trans.event < 0) { cerr << "Event num " << trans.event << " for id " << data.id - << " is invalid. Skipping sample." << endl; + << " is invalid. Sample data appears to be corrupted." << endl; + rc = -1; goto out; } @@ -863,7 +884,7 @@ static void __handle_sample_event(event_ out: clear_trans(&trans); done: - return; + return rc; } @@ -875,8 +896,11 @@ done: * when profiling began. Additional PERF_RECORD_MMAP records may appear later in the data * stream (e.g., dlopen for single-process profiling or new process startup for system-wide * profiling. + * + * This function returns '0' on success and '-1' on failure. A failure implies the sample + * data is probably corrupt and the calling function should handle appropriately. */ -void OP_perf_utils::op_write_event(event_t * event, u64 sample_type) +int OP_perf_utils::op_write_event(event_t * event, u64 sample_type) { #if 0 if (event->header.type < PERF_RECORD_MAX) { @@ -886,33 +910,38 @@ void OP_perf_utils::op_write_event(event switch (event->header.type) { case PERF_RECORD_SAMPLE: - __handle_sample_event(event, sample_type); - return; + return __handle_sample_event(event, sample_type); case PERF_RECORD_MMAP: __handle_mmap_event(event); - return; + return 0; case PERF_RECORD_COMM: if (!sfile_init_done) { operf_sfile_init(); sfile_init_done = true; } __handle_comm_event(event); - return; + return 0; case PERF_RECORD_FORK: __handle_fork_event(event); - return; + return 0; case PERF_RECORD_THROTTLE: - __handle_throttle_event(event, sample_type); - return; + return __handle_throttle_event(event, sample_type); case PERF_RECORD_LOST: operf_stats[OPERF_RECORD_LOST_SAMPLE] += event->lost.lost; - return; + return 0; case PERF_RECORD_EXIT: - return; + return 0; default: - // OK, ignore all other header types. - cverb << vconvert << "No matching event type for " << hex << event->header.type << endl; - return; + if (event->header.type > PERF_RECORD_MAX) { + // Bad header + cerr << "Invalid event type " << hex << event->header.type << endl; + cerr << "Sample data is probably corrupted." << endl; + return -1; + } else { + cverb << vconvert << "Event type "<< hex << event->header.type + << " is ignored." << endl; + return 0; + } } } @@ -979,6 +1008,7 @@ static int __mmap_trace_file(struct mmap mmap_flags, info.traceFD, info.offset); if (info.buf == MAP_FAILED) { cerr << "Error: mmap failed with errno:\n\t" << strerror(errno) << endl; + cerr << "\tmmap_size: 0x" << hex << mmap_size << "; offset: 0x" << info.offset << endl; return -1; } else { @@ -994,8 +1024,6 @@ int OP_perf_utils::op_mmap_trace_file(st { u64 shift; if (init) { - if (!pg_sz) - pg_sz = sysconf(_SC_PAGESIZE); if (!mmap_size) { if (MMAP_WINDOW_SZ > info.file_data_size) { mmap_size = info.file_data_size; @@ -1020,7 +1048,10 @@ int OP_perf_utils::op_write_output(int o int ret = write(output, buf, size); if (ret < 0) { - string errmsg = "Internal error: Failed to write sample data to pipe. errno is "; + if (errno == EINTR) + continue; + + string errmsg = "Internal error: Failed to write sample data to output fd. errno is "; errmsg += strerror(errno); throw runtime_error(errmsg); } @@ -1359,6 +1390,9 @@ void OP_perf_utils::op_get_kernel_event_ void *buf; int64_t diff; + if (old == head) + return; + diff = head - old; if (diff < 0) { throw runtime_error("ERROR: event buffer wrapped, which should NEVER happen."); Index: oprof-work/libperf_events/operf_utils.h =================================================================== --- oprof-work.orig/libperf_events/operf_utils.h +++ oprof-work/libperf_events/operf_utils.h @@ -45,6 +45,14 @@ extern bool throttled; #define MMAP_WINDOW_SZ (32 * 1024 * 1024ULL) #endif +# define likely(x) __builtin_expect(!!(x), 1) +# define unlikely(x) __builtin_expect(!!(x), 0) + +#define is_header_valid(hdr) \ + ((hdr.size > sizeof(hdr)) && \ + (hdr.type) && (hdr.size)) + + extern unsigned int op_nr_counters; static inline size_t align_64bit(u64 x) @@ -68,7 +76,7 @@ void op_perfrecord_sigusr1_handler(int s int op_get_process_info(bool system_wide, pid_t pid, operf_record * pr); void op_record_process_exec_mmaps(pid_t pid, pid_t tgid, int output_fd, operf_record * pr); int op_write_output(int output, void *buf, size_t size); -void op_write_event(event_t * event, u64 sample_type); +int op_write_event(event_t * event, u64 sample_type); int op_read_from_stream(std::ifstream & is, char * buf, std::streamsize sz); int op_mmap_trace_file(struct mmap_info & info, bool init); int op_get_next_online_cpu(DIR * dir, struct dirent *entry); Index: oprof-work/pe_profiling/operf.cpp =================================================================== --- oprof-work.orig/pe_profiling/operf.cpp +++ oprof-work/pe_profiling/operf.cpp @@ -92,7 +92,8 @@ static vector<operf_event_t> events; static bool jit_conversion_running; static void convert_sample_data(void); static int sample_data_pipe[2]; -static bool ctl_c = false; +bool ctl_c = false; +bool pipe_closed = false; namespace operf_options { @@ -370,7 +371,7 @@ int start_profiling(void) // start recording operfRecord.recordPerfData(); - cverb << vmisc << "Total bytes recorded from perf events: " << dec + cverb << vdebug << "Total bytes recorded from perf events: " << dec << operfRecord.get_total_bytes_recorded() << endl; } catch (runtime_error re) { /* If the user does ctl-c, the operf-record process may get interrupted @@ -496,7 +497,7 @@ static end_code_t _kill_operf_read_pid(e if (!WEXITSTATUS(waitpid_status)) { cverb << vdebug << "operf-read process returned OK" << endl; } else if (WIFEXITED(waitpid_status)) { - /* If user did ctl-c, operf-record may get spurious errors, like + /* If user did ctl-c, operf-read may get spurious errors, like * broken pipe, etc. We ignore these unless the user asks for * debug output. */ @@ -508,7 +509,7 @@ static end_code_t _kill_operf_read_pid(e } } else if (WIFSIGNALED(waitpid_status)) { keep_trying = false; - /* If user did ctl-c, operf-record may get spurious errors, like + /* If user did ctl-c, operf-read may get spurious errors, like * broken pipe, etc. We ignore these unless the user asks for * debug output. */ @@ -529,9 +530,13 @@ static end_code_t _kill_operf_record_pid end_code_t rc = ALL_OK; // stop operf-record process + errno = 0; if (kill(operf_record_pid, SIGUSR1) < 0) { - perror("Attempt to stop operf-record process failed"); - rc = PERF_RECORD_ERROR; + // If operf-record process is already ended, don't consider this an error. + if (errno != ESRCH) { + perror("Attempt to stop operf-record process failed"); + rc = PERF_RECORD_ERROR; + } } else { if (waitpid(operf_record_pid, &waitpid_status, 0) < 0) { perror("waitpid for operf-record process failed"); @@ -590,7 +595,7 @@ static end_code_t _run(void) /* If we're not doing system wide profiling and no app is started, then * there's no profile data to convert. So if this condition is NOT true, * then we'll do the convert. - * Note that if --lazy-connversion is passed, then operf_options::post_conversion + * Note that if --lazy-conversion is passed, then operf_options::post_conversion * will be set, and we will defer conversion until after the operf-record * process is done. */ @@ -605,6 +610,7 @@ static end_code_t _run(void) close(sample_data_pipe[1]); _set_basic_SIGINT_handler_for_child(); convert_sample_data(); + _exit(EXIT_SUCCESS); } // parent close(sample_data_pipe[0]); @@ -921,7 +927,7 @@ static void convert_sample_data(void) cverb << vdebug << "Successfully read header info for sample data " << endl; if (operfRead.is_valid()) { try { - int num = operfRead.convertPerfData(); + unsigned int num = operfRead.convertPerfData(); cverb << vdebug << "operf_read: Total bytes received from operf_record process: " << dec << num << endl; } catch (runtime_error e) { cerr << "Caught runtime error from operf_read::convertPerfData" << endl; |