From: Maynard J. <may...@us...> - 2012-06-09 00:43:10
|
[PATCH] Fix for operf userland/kernelspace profiling and improvements to mmap of kernel data Some fixes in this patch were accidentally left out of the previous patch I posted -- in particular, the pieces relating to the problem where samples were sometimes mis-attributed due to having an "operf_transient" object in an inconsistent state. Additionally, changes were made to make mmap'ing of the perf_events kernel data more efficient. This patch has been pushed into the perf-events branch, but review comments are welcome Signed-off-by: Maynard Johnson <may...@us...> --- libperf_events/operf_counter.cpp | 57 ++++++++++----- libperf_events/operf_counter.h | 4 +- libperf_events/operf_event.h | 1 - libperf_events/operf_utils.cpp | 156 +++++++++++++++++++++----------------- libperf_events/operf_utils.h | 1 + pe_profiling/operf.cpp | 5 +- 6 files changed, 130 insertions(+), 94 deletions(-) diff --git a/libperf_events/operf_counter.cpp b/libperf_events/operf_counter.cpp index 280a7aa..5152ba3 100644 --- a/libperf_events/operf_counter.cpp +++ b/libperf_events/operf_counter.cpp @@ -42,6 +42,7 @@ unsigned int pagesize; verbose vperf("perf_events"); extern bool first_time_processing; +extern bool throttled; namespace { @@ -125,6 +126,10 @@ operf_record::~operf_record() if (poll_data) delete[] poll_data; close(output_fd); + for (int i = 0; i < samples_array.size(); i++) { + struct mmap_data *md = &samples_array[i]; + munmap(md->base, (NUM_MMAP_PAGES + 1) * pagesize); + } samples_array.clear(); evts.clear(); perfCounters.clear(); @@ -213,18 +218,16 @@ void operf_record::write_op_header_info() } } -int operf_record::prepareToRecord(int counter, int cpu, int fd) +int operf_record::prepareToRecord(int cpu, int fd) { struct mmap_data md;; - - md.counter = counter; md.prev = 0; md.mask = NUM_MMAP_PAGES * pagesize - 1; fcntl(fd, F_SETFL, O_NONBLOCK); - poll_data[cpu * evts.size() + counter].fd = fd; - poll_data[cpu * evts.size() + counter].events = POLLIN; + poll_data[cpu].fd = fd; + poll_data[cpu].events = POLLIN; poll_count++; md.base = mmap(NULL, (NUM_MMAP_PAGES + 1) * pagesize, @@ -233,8 +236,7 @@ int operf_record::prepareToRecord(int counter, int cpu, int fd) perror("failed to mmap"); return -1; } - samples_array[cpu].push_back(md); - ioctl(fd, PERF_EVENT_IOC_ENABLE); + samples_array.push_back(md); return 0; } @@ -248,6 +250,8 @@ void operf_record::setup() DIR *dir = NULL; string err_msg; char cpus_online[129]; + bool need_IOC_enable = (system_wide || pid_started); + if (system_wide) cverb << vperf << "operf_record::setup() for system-wide profiling" << endl; @@ -276,7 +280,7 @@ void operf_record::setup() if (!num_cpus) throw runtime_error("Number of online CPUs is zero; cannot continue");; - poll_data = new struct pollfd [evts.size() * num_cpus]; + poll_data = new struct pollfd [num_cpus]; cverb << vperf << "calling perf_event_open for pid " << pid << " on " << num_cpus << " cpus" << endl; @@ -304,6 +308,8 @@ void operf_record::setup() for (int cpu = 0; cpu < num_cpus; cpu++) { int real_cpu; + int mmap_fd; + bool mmap_done_for_cpu = false; if (all_cpus_avail) { real_cpu = cpu; } else { @@ -318,10 +324,8 @@ void operf_record::setup() // Create new row to hold operf_counter objects since we need one // row for each cpu. Do the same for samples_array. - vector<struct mmap_data> tmp_mdvec; vector<operf_counter> tmp_pcvec; - samples_array.push_back(tmp_mdvec); perfCounters.push_back(tmp_pcvec); for (unsigned event = 0; event < evts.size(); event++) { evts[event].counter = event; @@ -332,10 +336,21 @@ void operf_record::setup() err_msg = "Internal Error. Perf event setup failed."; goto error; } - if (((rc = prepareToRecord(event, cpu, perfCounters[cpu][event].get_fd()))) < 0) { - err_msg = "Internal Error. Perf event setup failed."; - goto error; + if (!mmap_done_for_cpu) { + if (((rc = prepareToRecord(cpu, perfCounters[cpu][event].get_fd()))) < 0) { + err_msg = "Internal Error. Perf event setup failed."; + goto error; + } + mmap_fd = perfCounters[cpu][event].get_fd(); + mmap_done_for_cpu = true; + } else { + if (ioctl(perfCounters[cpu][event].get_fd(), + PERF_EVENT_IOC_SET_OUTPUT, mmap_fd) < 0) + goto error; } + if (need_IOC_enable) + if (ioctl(perfCounters[cpu][event].get_fd(), PERF_EVENT_IOC_ENABLE) < 0) + goto error; } } if (!all_cpus_avail) @@ -348,6 +363,11 @@ void operf_record::setup() error: delete[] poll_data; + for (int i = 0; i < samples_array.size(); i++) { + struct mmap_data *md = &samples_array[i]; + munmap(md->base, (NUM_MMAP_PAGES + 1) * pagesize); + } + samples_array.clear(); close(output_fd); if (rc != OP_PERF_HANDLED_ERROR) throw runtime_error(err_msg); @@ -370,11 +390,10 @@ void operf_record::recordPerfData(void) while (1) { int prev = sample_reads; - for (int cpu = 0; cpu < num_cpus; cpu++) { - for (unsigned int evt = 0; evt < evts.size(); evt++) { - if (samples_array[cpu][evt].base) - op_get_kernel_event_data(&samples_array[cpu][evt], this); - } + + for (int i = 0; i < samples_array.size(); i++) { + if (samples_array[i].base) + op_get_kernel_event_data(&samples_array[i], this); } if (quit && disabled) break; @@ -551,6 +570,8 @@ int operf_read::convertPerfData(void) op_reprocess_unresolved_events(opHeader.h_attrs[0].attr.sample_type); op_release_resources(); + if (throttled) + cout << endl << "* * * * ATTENTION: Profiling rate was throttled back by the kernel * * * *" << endl; char * cbuf; cbuf = (char *)xmalloc(operf_options::session_dir.length() + 5); diff --git a/libperf_events/operf_counter.h b/libperf_events/operf_counter.h index ce1198e..ec8eafe 100644 --- a/libperf_events/operf_counter.h +++ b/libperf_events/operf_counter.h @@ -91,11 +91,11 @@ public: private: void create(std::string outfile, std::vector<operf_event_t> & evts); void setup(void); - int prepareToRecord(int counter, int cpu, int fd); + int prepareToRecord(int cpu, int fd); void write_op_header_info(void); int output_fd; struct pollfd * poll_data; - std::vector< std::vector<struct mmap_data> > samples_array; + std::vector<struct mmap_data> samples_array; int num_cpus; pid_t pid; bool pid_started; diff --git a/libperf_events/operf_event.h b/libperf_events/operf_event.h index b4e06d1..aa94767 100644 --- a/libperf_events/operf_event.h +++ b/libperf_events/operf_event.h @@ -84,7 +84,6 @@ typedef union event_union { } event_t; struct mmap_data { - int counter; void *base; u64 mask; u64 prev; diff --git a/libperf_events/operf_utils.cpp b/libperf_events/operf_utils.cpp index 601b20a..0eb099a 100644 --- a/libperf_events/operf_utils.cpp +++ b/libperf_events/operf_utils.cpp @@ -52,6 +52,7 @@ multimap<string, struct operf_mmap *> all_images_map; map<u64, struct operf_mmap *> kernel_modules; struct operf_mmap * kernel_mmap; bool first_time_processing; +bool throttled; static list<event_t *> unresolved_events; static struct operf_transient trans; @@ -165,9 +166,17 @@ static void __handle_comm_event(event_t * event) if (cverb << vperf) cout << "PERF_RECORD_COMM for " << event->comm.comm << ", tgid/tid = " << event->comm.pid << "/" << event->comm.tid << endl; + map<pid_t, operf_process_info *>::iterator it; it = process_map.find(event->comm.pid); if (it == process_map.end()) { + /* TODO: Handle system housekeeping tasks. For certain kinds of processes, + * we will get a COMM event, but never get an MMAP event (e.g, kpsmoused). + * Without receiving an MMAP event, we have no clue whether the name given + * with the COMM event is a full "appname" or not, so the operf_process_info + * is marked invalid. We end up dropping all samples for such tasks when + * doing a system-wide profile. + */ operf_process_info * proc = new operf_process_info(event->comm.pid, app_name ? app_name : event->comm.comm, app_name != NULL, true); @@ -175,10 +184,14 @@ static void __handle_comm_event(event_t * event) cout << "Adding new proc info to collection for PID " << event->comm.pid << endl; process_map[event->comm.pid] = proc; } else { - // sanity check -- should not get a second COMM event for same PID + /* Sanity check -- should not get a second COMM event for same PID/TID. + * Most likely, this COMM event is for a different TID than the original, + * but since we don't record the TID with our operf_process_info handling, + * we'll just display a debug message when requested. + */ if (it->second->is_valid()) { if (cverb << vperf) - cout << "Received extraneous COMM event for " << event->comm.comm + cout << "Received COMM event for " << event->comm.comm << ", PID " << event->comm.pid << endl; } else { if (cverb << vperf) @@ -290,7 +303,8 @@ static void __handle_mmap_event(event_t * event) } } -static struct operf_transient * __get_operf_trans(struct sample_data * data, bool hypervisor_domain) +static struct operf_transient * __get_operf_trans(struct sample_data * data, bool hypervisor_domain, + bool kernel_mode) { operf_process_info * proc = NULL; const struct operf_mmap * op_mmap = NULL; @@ -306,43 +320,13 @@ static struct operf_transient * __get_operf_trans(struct sample_data * data, boo std::map<pid_t, operf_process_info *>::const_iterator it = process_map.find(data->pid); if (it != process_map.end() && (it->second->is_appname_valid())) { proc = it->second; - trans.cur_procinfo = proc; - } else if (trans.in_kernel) { - if (data->ip >= kernel_mmap->start_addr && - data->ip < kernel_mmap->end_addr) { - op_mmap = kernel_mmap; - } else { - map<u64, struct operf_mmap *>::iterator it; - it = kernel_modules.begin(); - while (it != kernel_modules.end()) { - if (data->ip >= it->second->start_addr && - data->ip < it->second->end_addr) { - op_mmap = it->second; - break; - } - it++; - } - } if (!op_mmap) { - if ((kernel_mmap->start_addr == 0ULL) && - (kernel_mmap->end_addr == 0ULL)) - op_mmap = kernel_mmap; - } - if (op_mmap) { - // Create a process_info object for this process running in kernel mode - proc = new operf_process_info(data->pid, op_mmap->filename, true, true); - proc->process_new_mapping((struct operf_mmap *)op_mmap); - if (cverb << vperf) - cout << "Created process_info for kernel task " << data->pid << " using image " - << op_mmap->filename << endl; - process_map[data->pid] = proc; - } } else { - /* We may reach this point for the following reasons: - * - We get a sample without ever getting a COMM + /* This can happen for the following reasons: + * - We get a sample before getting a COMM or MMAP * event for the process being profiled - * - The COMM event has been processed, but no MMAP events. - * And since the COMM event only gives 16 chars of the - * app name, we don't have a valid app name yet. + * - The COMM event has been processed, but since that + * only gives 16 chars of the app name, we don't + * have a valid app name yet * - The kernel incorrectly records a sample for a * process other than the one we requested (not * likely -- this would be a kernel bug if it did) @@ -357,7 +341,33 @@ static struct operf_transient * __get_operf_trans(struct sample_data * data, boo // Now find mmapping that contains the data.ip address. // Use that mmapping to set fields in trans. - if (!trans.in_kernel) { + if (kernel_mode) { + if (data->ip >= kernel_mmap->start_addr && + data->ip <= kernel_mmap->end_addr) { + op_mmap = kernel_mmap; + } else { + map<u64, struct operf_mmap *>::iterator it; + it = kernel_modules.begin(); + while (it != kernel_modules.end()) { + if (data->ip >= it->second->start_addr && + data->ip <= it->second->end_addr) { + op_mmap = it->second; + break; + } + it++; + } + } if (!op_mmap) { + if ((kernel_mmap->start_addr == 0ULL) && + (kernel_mmap->end_addr == 0ULL)) + op_mmap = kernel_mmap; + } + if (!op_mmap) { + /* This can happen if a kernel module is loaded after profiling + * starts, and then we get samples for that kernel module. + * TODO: Fix this. + */ + } + } else { op_mmap = proc->find_mapping_for_sample(data->ip); // TODO log dropped sample if (op_mmap && op_mmap->is_hypervisor && !hypervisor_domain) { @@ -367,7 +377,8 @@ static struct operf_transient * __get_operf_trans(struct sample_data * data, boo } if (op_mmap) { if (cverb << vperf) - cout << "Found mmap for sample; image_name is " << op_mmap->filename << endl; + cout << "Found mmap for sample; image_name is " << op_mmap->filename << + " and app name is " <<proc->get_app_name() << endl; trans.image_name = op_mmap->filename; trans.app_filename = proc->get_app_name().c_str(); trans.image_len = strlen(trans.image_name); @@ -376,8 +387,10 @@ static struct operf_transient * __get_operf_trans(struct sample_data * data, boo trans.end_addr = op_mmap->end_addr; trans.tgid = data->pid; trans.tid = data->tid; + trans.cur_procinfo = proc; trans.cpu = data->cpu; trans.is_anon = op_mmap->is_anon_mapping; + trans.in_kernel = kernel_mode; if (trans.in_kernel || trans.is_anon) trans.pc = data->ip; else @@ -387,11 +400,10 @@ static struct operf_transient * __get_operf_trans(struct sample_data * data, boo retval = &trans; } else { // TODO: log lost sample due to no mapping - // if ((cverb << vmisc) && !first_time_processing) { - if (!first_time_processing) { + if ((cverb << vmisc) && !first_time_processing) { string domain = trans.in_kernel ? "kernel" : "userspace"; - cerr << "Discarding " << domain - << " sample where no appropriate mapping was found. (pc=0x" + cerr << "Discarding " << domain << " sample for process " << data->pid + << " where no appropriate mapping was found. (pc=0x" << hex << data->ip <<")" << endl; } retval = NULL; @@ -402,6 +414,7 @@ out: static void __handle_callchain(u64 * array, struct sample_data * data) { + bool in_kernel = false; data->callchain = (struct ip_callchain *) array; if (data->callchain->nr) { if (cverb << vperf) @@ -415,17 +428,17 @@ static void __handle_callchain(u64 * array, struct sample_data * data) // TODO: log lost callgraph arc break; case PERF_CONTEXT_KERNEL: - trans.in_kernel = 1; + in_kernel = true; break; case PERF_CONTEXT_USER: - trans.in_kernel = 0; + in_kernel = false; break; default: break; } continue; } - if (data->ip && __get_operf_trans(data, false)) { + if (data->ip && __get_operf_trans(data, false, in_kernel)) { if ((trans.current = operf_sfile_find(&trans))) { operf_sfile_log_arc(&trans); update_trans_last(&trans); @@ -461,6 +474,7 @@ static void __handle_sample_event(event_t * event, u64 sample_type) { struct sample_data data; bool found_trans = false; + bool in_kernel; const struct operf_mmap * op_mmap = NULL; bool hypervisor = (event->header.misc == PERF_RECORD_MISC_HYPERVISOR); u64 *array = event->sample.array; @@ -488,11 +502,10 @@ static void __handle_sample_event(event_t * event, u64 sample_type) data.cpu = *p; array++; } - if (event->header.misc == PERF_RECORD_MISC_KERNEL) { - trans.in_kernel = 1; + in_kernel = true; } else if (event->header.misc == PERF_RECORD_MISC_USER) { - trans.in_kernel = 0; + in_kernel = false; } #if (defined(__powerpc__) || defined(__powerpc64__)) else if (event->header.misc == PERF_RECORD_MISC_HYPERVISOR) { @@ -503,7 +516,7 @@ static void __handle_sample_event(event_t * event, u64 sample_type) << hex << data.ip << endl; goto out; } - trans.in_kernel = 0; + in_kernel = false; if (first_time_processing) { __map_hypervisor_sample(data.ip, data.pid); } @@ -534,15 +547,13 @@ static void __handle_sample_event(event_t * event, u64 sample_type) goto out; } - - /* If the static variable trans.tgid is still holding its initial value of 0, - * then we would incorrectly find trans.tgid and data.pid matching, and - * and make wrong assumptions from that match -- ending seg fault. So we - * will bail out early if we see a sample for PID 0 coming in and trans.image_name - * is NULL (implying the trans object is still in its initial state). - */ + /* If the static variable trans.tgid is still holding its initial value of 0, + * then we would incorrectly find trans.tgid and data.pid matching, and + * and make wrong assumptions from that match -- ending seg fault. So we + * will bail out early if we see a sample for PID 0 coming in and trans.image_name + * is NULL (implying the trans object is still in its initial state). + */ if (!trans.image_name && (data.pid == 0)) { - //cverb << vmisc << "Discarding sample for PID 0" << endl; cverb << vmisc << "Discarding sample for PID 0" << endl; goto out; } @@ -561,14 +572,13 @@ static void __handle_sample_event(event_t * event, u64 sample_type) } /* Only need to check for "no_user" since "no_kernel" is done by - * perf_events code. - */ - if ((operfRead.get_event_by_counter(trans.event)->no_user) && - (event->header.misc == PERF_RECORD_MISC_USER)) { - // Dropping user domain sample by user request in event spec. - goto out; - } - + * perf_events code. + */ + if ((operfRead.get_event_by_counter(trans.event)->no_user) && + (event->header.misc == PERF_RECORD_MISC_USER)) { + // Dropping user domain sample by user request in event spec. + goto out; + } if ((event->header.misc == PERF_RECORD_MISC_HYPERVISOR) && first_time_processing) { /* We defer processing hypervisor samples until all the samples @@ -593,13 +603,13 @@ static void __handle_sample_event(event_t * event, u64 sample_type) * The last resort (and most expensive) is to call __get_operf_trans() if the * sample cannot be matched up with a previous tran object. */ - if (trans.in_kernel) { + if (in_kernel) { if (trans.image_name && trans.tgid == data.pid) { // For the no-vmlinux case . . . if ((trans.start_addr == 0ULL) && (trans.end_addr == 0ULL)) { trans.pc = data.ip; found_trans = true; - // For the case where a vmlinux file is passed in . . . + // For samples in vmlinux or kernel module } else if (data.ip >= trans.start_addr && data.ip <= trans.end_addr) { trans.pc = data.ip; found_trans = true; @@ -613,7 +623,8 @@ static void __handle_sample_event(event_t * event, u64 sample_type) trans.pc = data.ip - trans.start_addr; found_trans = true; } - if (!found_trans && __get_operf_trans(&data, hypervisor)) { + + if (!found_trans && __get_operf_trans(&data, hypervisor, in_kernel)) { trans.current = operf_sfile_find(&trans); found_trans = true; } @@ -674,6 +685,9 @@ void OP_perf_utils::op_write_event(event_t * event, u64 sample_type) } __handle_comm_event(event); return; + case PERF_RECORD_THROTTLE: + throttled = true; + return; case PERF_RECORD_EXIT: return; default: @@ -1056,7 +1070,7 @@ void OP_perf_utils::op_record_kernel_info(string vmlinux_file, u64 start_addr, u int num = op_write_output(output_fd, &mmap, mmap.header.size); if (cverb << vperf) cout << "Created MMAP event of size " << mmap.header.size << " for " <<mmap.filename << ". length: " - << hex << mmap.len << "; start addr: " << mmap.start << endl; + << hex << mmap.len << "; start addr: " << mmap.start << endl; pr->add_to_total(num); _record_module_info(output_fd, pr); } diff --git a/libperf_events/operf_utils.h b/libperf_events/operf_utils.h index 1681895..aedd0f2 100644 --- a/libperf_events/operf_utils.h +++ b/libperf_events/operf_utils.h @@ -36,6 +36,7 @@ extern bool separate_thread; extern bool no_vmlinux; extern int kptr_restrict; extern uid_t my_uid; +extern bool throttled; #define OP_APPNAME_LEN 1024 diff --git a/pe_profiling/operf.cpp b/pe_profiling/operf.cpp index d7f7c53..dbb275c 100644 --- a/pe_profiling/operf.cpp +++ b/pe_profiling/operf.cpp @@ -1029,7 +1029,6 @@ static void _process_events_list(void) char * event_str = op_xstrndup(event_spec.c_str(), event_spec.length()); operf_event_t event; strncpy(event.name, strtok(event_str, ":"), OP_MAX_EVT_NAME_LEN); - free(event_str); event.count = atoi(strtok(NULL, ":")); /* Name and count are required in the event spec in order for * 'ophelp --check-events' to pass. But since unit mask and domain @@ -1060,6 +1059,7 @@ static void _process_events_list(void) } place++; } + free(event_str); event.op_evt_code = _get_event_code(event.name); event.evt_code = event.op_evt_code; events.push_back(event); @@ -1337,6 +1337,7 @@ static void process_args(int argc, char const ** argv) } else { _process_events_list(); } + op_nr_counters = events.size(); if (operf_options::vmlinux.empty()) { no_vmlinux = true; @@ -1415,6 +1416,7 @@ static int _get_sys_value(const char * filename) int main(int argc, char const *argv[]) { int rc; + throttled = false; if ((rc = _check_perf_events_cap())) { if (rc == EBUSY) { cerr << "Performance monitor unit is busy. Do 'opcontrol --deinit' and try again." << endl; @@ -1448,7 +1450,6 @@ int main(int argc, char const *argv[]) cleanup(); exit(1); } - op_nr_counters = op_get_nr_counters(cpu_type); if (my_uid != 0) { bool for_current = true; -- 1.6.2.rc2 |