From: Suravee S. <sur...@am...> - 2013-01-23 16:37:34
|
Maynard, I'll try to reproduce, review the patch and get back to you this week. Suravee. On 1/22/2013 4:02 PM, Maynard Johnson wrote: > *Suravee*, could you please review this patch? It would also be really nice > if you could find a SLES 11 SP1/AMD system where you could reproduce the problem > and then test the patch. > > Thanks. > > -Maynard > > ----------------------------------------------------------------------- > operf: Fix 'Permission denied' error on early perf_events kernels > > The new operf tool available with OProfile 0.9.8 uses the perf_event_open > syscall to obtain access to the performance monitor counters and registers. > This syscall is implemented by the Linux Kernel Performance Events Subsystem > (aka "perf_events"). This perf_events subsystem was introduced in kernel > version 2.6.31, and it underwent a lot of changes in the first several versions > thereafter. Apparently, the operf tool, as currently written and operating today, > relies on certain kernel functionality that was introduced later than some > kernels provided with some Linux distributions that supported perf_events in the > very early stages (e.g., SLES 11 SP1). When attempting to profile with operf > (e.g., 'operf ls'), it fails with the message: > > Unexpected error running operf: Permission denied > Please use the opcontrol command instead of operf. > > The fix for this problem is to pass '-1' for the cpu arg on the > perf_event_open syscall when running on an early perf_events kernel. > Passing '-1' for the cpu arg was a requirement (in most circumstances) > on early perf_events kernels. Later kernels removed this requirement > so perf_event_open could be called for each cpu, even for single-app > profiling by non-root users. This is the standard usage model employed > by operf, which allows us to mmap kernel data space for each cpu, thus > giving a lot more memory for the kernel to record sample data. 
> > Signed-off-by: Maynard Johnson <may...@us...> > --- > libperf_events/operf_counter.cpp | 11 +++-- > pe_profiling/operf.cpp | 112 ++++++++++++++++++++++++++++++++------ > 2 files changed, 101 insertions(+), 22 deletions(-) > > diff --git a/libperf_events/operf_counter.cpp b/libperf_events/operf_counter.cpp > index a02c566..1949c23 100644 > --- a/libperf_events/operf_counter.cpp > +++ b/libperf_events/operf_counter.cpp > @@ -48,6 +48,7 @@ extern bool first_time_processing; > extern bool throttled; > extern size_t mmap_size; > extern size_t pg_sz; > +extern bool try_cpu_minus_one; > > namespace { > > @@ -421,7 +422,7 @@ void operf_record::setup() > struct dirent *entry = NULL; > DIR *dir = NULL; > string err_msg; > - char cpus_online[129]; > + char cpus_online[257]; > bool need_IOC_enable = (system_wide || pid_started); > > > @@ -449,7 +450,7 @@ void operf_record::setup() > } > pagesize = sysconf(_SC_PAGE_SIZE); > num_mmap_pages = (512 * 1024)/pagesize; > - num_cpus = sysconf(_SC_NPROCESSORS_ONLN); > + num_cpus = try_cpu_minus_one ? 1 : sysconf(_SC_NPROCESSORS_ONLN); > if (!num_cpus) > throw runtime_error("Number of online CPUs is zero; cannot continue");; > > @@ -472,7 +473,7 @@ void operf_record::setup() > goto error; > > } > - if (index(cpus_online, ',')) { > + if (index(cpus_online, ',') || cpus_online[0] != '0') { > all_cpus_avail = false; > if ((dir = opendir("/sys/devices/system/cpu")) == NULL) { > fclose(online_cpus); > @@ -487,7 +488,9 @@ void operf_record::setup() > int real_cpu; > int mmap_fd; > bool mmap_done_for_cpu = false; > - if (all_cpus_avail) { > + if (try_cpu_minus_one) { > + real_cpu = -1; > + } else if (all_cpus_avail) { > real_cpu = cpu; > } else { > real_cpu = op_get_next_online_cpu(dir, entry); > diff --git a/pe_profiling/operf.cpp b/pe_profiling/operf.cpp > index e4d675b..1f3fc2d 100644 > --- a/pe_profiling/operf.cpp > +++ b/pe_profiling/operf.cpp > @@ -11,7 +11,7 @@ > * (C) Copyright IBM Corp. 
2011 > * > * Modified by Maynard Johnson <may...@us...> > - * (C) Copyright IBM Corporation 2012 > + * (C) Copyright IBM Corporation 2012, 2013 > * > */ > > @@ -59,6 +59,7 @@ typedef enum END_CODE { > > // Globals > char * app_name = NULL; > +bool try_cpu_minus_one = false; > pid_t app_PID = -1; > uint64_t kernel_start, kernel_end; > operf_read operfRead; > @@ -1724,7 +1725,60 @@ static void process_args(int argc, char * const argv[]) > return; > } > > -static int _check_perf_events_cap(void) > +static int _get_cpu_for_perf_events_cap(void) > +{ > + int retval; > + string err_msg; > + char cpus_online[257]; > + FILE * online_cpus; > + DIR *dir = NULL; > + > + int total_cpus = sysconf(_SC_NPROCESSORS_ONLN); > + if (!total_cpus) { > + err_msg = "Internal Error (1): Number of online cpus cannot be determined."; > + retval = -1; > + goto error; > + } > + > + online_cpus = fopen("/sys/devices/system/cpu/online", "r"); > + if (!online_cpus) { > + err_msg = "Internal Error (2): Number of online cpus cannot be determined."; > + retval = -1; > + goto error; > + } > + memset(cpus_online, 0, sizeof(cpus_online)); > + fgets(cpus_online, sizeof(cpus_online), online_cpus); > + if (!cpus_online[0]) { > + fclose(online_cpus); > + err_msg = "Internal Error (3): Number of online cpus cannot be determined."; > + retval = -1; > + goto error; > + > + } > + if (index(cpus_online, ',') || cpus_online[0] != '0') { > + // A comma in cpus_online implies a gap, which in turn implies that not all > + // CPUs are online. > + if ((dir = opendir("/sys/devices/system/cpu")) == NULL) { > + fclose(online_cpus); > + err_msg = "Internal Error (4): Number of online cpus cannot be determined."; > + retval = -1; > + goto error; > + } else { > + struct dirent *entry = NULL; > + retval = OP_perf_utils::op_get_next_online_cpu(dir, entry); > + closedir(dir); > + } > + } else { > + // All CPUs are available, so we just arbitrarily choose CPU 0. 
> + retval = 0; > + } > + fclose(online_cpus); > +error: > + return retval; > +} > + > + > +static int _check_perf_events_cap(bool try_cpu_minus_one) > { > /* If perf_events syscall is not implemented, the syscall below will fail > * with ENOSYS (38). If implemented, but the processor type on which this > @@ -1733,12 +1787,14 @@ static int _check_perf_events_cap(void) > */ > struct perf_event_attr attr; > pid_t pid ; > + int cpu_to_try = try_cpu_minus_one ? -1 : _get_cpu_for_perf_events_cap(); > + errno = 0; > memset(&attr, 0, sizeof(attr)); > attr.size = sizeof(attr); > attr.sample_type = PERF_SAMPLE_IP; > > pid = getpid(); > - syscall(__NR_perf_event_open, &attr, pid, 0, -1, 0); > + syscall(__NR_perf_event_open, &attr, pid, cpu_to_try, -1, 0); > return errno; > > } > @@ -1790,21 +1846,43 @@ static int _get_sys_value(const char * filename) > int main(int argc, char * const argv[]) > { > int rc; > + int perf_event_paranoid = _get_sys_value("/proc/sys/kernel/perf_event_paranoid"); > + > + my_uid = geteuid(); > throttled = false; > - if ((rc = _check_perf_events_cap())) { > - if (rc == EBUSY) { > - cerr << "Performance monitor unit is busy. Do 'opcontrol --deinit' and try again." << endl; > - exit(1); > - } > - if (rc == ENOSYS) { > - cerr << "Your kernel does not implement a required syscall" > - << " for the operf program." << endl; > - } else if (rc == ENOENT) { > - cerr << "Your kernel's Performance Events Subsystem does not support" > - << " your processor type." << endl; > - } else { > - cerr << "Unexpected error running operf: " << strerror(rc) << endl; > + rc = _check_perf_events_cap(try_cpu_minus_one); > + if (rc == EACCES) { > + /* Early perf_events kernels required the cpu argument to perf_event_open > + * to be '-1' when setting up to profile a single process if 1) the user is > + * not root; and 2) perf_event_paranoid is > 0. An EACCES error would be > + * returned if passing '0' or greater for the cpu arg and the above criteria > + * was not met. 
Unfortunately, later kernels turned this requirement around > + * such that the passed cpu arg must be '0' or greater when the user is not > + * root. > + * > + * We don't really have a good way to check whether we're running on such an > + * early kernel except to try the perf_event_open with different values to see > + * what works. > + */ > + if (my_uid != 0 && perf_event_paranoid > 0) { > + try_cpu_minus_one = true; > + rc = _check_perf_events_cap(try_cpu_minus_one); > } > + } > + if (rc == EBUSY) { > + cerr << "Performance monitor unit is busy. Do 'opcontrol --deinit' and try again." << endl; > + exit(1); > + } > + if (rc == ENOSYS) { > + cerr << "Your kernel does not implement a required syscall" > + << " for the operf program." << endl; > + } else if (rc == ENOENT) { > + cerr << "Your kernel's Performance Events Subsystem does not support" > + << " your processor type." << endl; > + } else if (rc) { > + cerr << "Unexpected error running operf: " << strerror(rc) << endl; > + } > + if (rc) { > cerr << "Please use the opcontrol command instead of operf." << endl; > exit(1); > } > @@ -1813,8 +1891,6 @@ int main(int argc, char * const argv[]) > cpu_speed = op_cpu_frequency(); > process_args(argc, argv); > > - int perf_event_paranoid = _get_sys_value("/proc/sys/kernel/perf_event_paranoid"); > - my_uid = geteuid(); > if (operf_options::system_wide && ((my_uid != 0) && (perf_event_paranoid > 0))) { > cerr << "To do system-wide profiling, either you must be root or" << endl; > cerr << "/proc/sys/kernel/perf_event_paranoid must be set to 0 or -1." << endl; |