Thread: [PATCH] operf: Fix 'Permission denied' error on early perf_events kernels

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

*Suravee*, could you please review this patch?  It would also be really nice
if you could find a SLES 11 SP1/AMD system where you could reproduce the problem
and then test the patch.

Thanks.

-Maynard

-----------------------------------------------------------------------
operf: Fix 'Permission denied' error on early perf_events kernels

The new operf tool available with OProfile 0.9.8 uses the perf_event_open
syscall to obtain access to the performance monitor counters and registers.
This syscall is implemented by the Linux Kernel Performance Events Subsystem
(aka "perf_events").  This perf_events subsystem was introduced in kernel
version 2.6.31, and it underwent a lot of changes in the first several versions
thereafter.  Apparently, the operf tool, as currently written and operating today,
relies on certain kernel functionality that was introduced later than some
kernels provided with some Linux distributions that supported perf_events in the
very early stages (e.g., SLES 11 SP1).  When attempting to profile with operf
(e.g., 'operf ls'), it fails with the message:

Unexpected error running operf: Permission denied
Please use the opcontrol command instead of operf.

The fix for this problem is to pass '-1' for the cpu arg on the
perf_event_open syscall when running on an early perf_events kernel.
Passing '-1' for the cpu arg was a requirement (in most circumstances)
on early perf_events kernels.  Later kernels removed this requirement
so perf_event_open could be called for each cpu, even for single-app
profiling by non-root users.  This is the standard usage model employed
by operf, which allows us to mmap kernel data space for each cpu, thus
giving a lot more memory for the kernel to record sample data.

Signed-off-by: Maynard Johnson <may...@us...>
---
 libperf_events/operf_counter.cpp |   11 +++--
 pe_profiling/operf.cpp           |  112 ++++++++++++++++++++++++++++++++------
 2 files changed, 101 insertions(+), 22 deletions(-)

diff --git a/libperf_events/operf_counter.cpp b/libperf_events/operf_counter.cpp
index a02c566..1949c23 100644
--- a/libperf_events/operf_counter.cpp
+++ b/libperf_events/operf_counter.cpp
@@ -48,6 +48,7 @@ extern bool first_time_processing;
 extern bool throttled;
 extern size_t mmap_size;
 extern size_t pg_sz;
+extern bool try_cpu_minus_one;
 
 namespace {
 
@@ -421,7 +422,7 @@ void operf_record::setup()
 	struct dirent *entry = NULL;
 	DIR *dir = NULL;
 	string err_msg;
-	char cpus_online[129];
+	char cpus_online[257];
 	bool need_IOC_enable = (system_wide || pid_started);
 
 
@@ -449,7 +450,7 @@ void operf_record::setup()
 	}
 	pagesize = sysconf(_SC_PAGE_SIZE);
 	num_mmap_pages = (512 * 1024)/pagesize;
-	num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	num_cpus = try_cpu_minus_one ? 1 : sysconf(_SC_NPROCESSORS_ONLN);
 	if (!num_cpus)
 		throw runtime_error("Number of online CPUs is zero; cannot continue");;
 
@@ -472,7 +473,7 @@ void operf_record::setup()
 		goto error;
 
 	}
-	if (index(cpus_online, ',')) {
+	if (index(cpus_online, ',') || cpus_online[0] != '0') {
 		all_cpus_avail = false;
 		if ((dir = opendir("/sys/devices/system/cpu")) == NULL) {
 			fclose(online_cpus);
@@ -487,7 +488,9 @@ void operf_record::setup()
 		int real_cpu;
 		int mmap_fd;
 		bool mmap_done_for_cpu = false;
-		if (all_cpus_avail) {
+		if (try_cpu_minus_one) {
+			real_cpu = -1;
+		} else if (all_cpus_avail) {
 			real_cpu = cpu;
 		} else {
 			real_cpu = op_get_next_online_cpu(dir, entry);
diff --git a/pe_profiling/operf.cpp b/pe_profiling/operf.cpp
index e4d675b..1f3fc2d 100644
--- a/pe_profiling/operf.cpp
+++ b/pe_profiling/operf.cpp
@@ -11,7 +11,7 @@
  * (C) Copyright IBM Corp. 2011
  *
  * Modified by Maynard Johnson <may...@us...>
- * (C) Copyright IBM Corporation 2012
+ * (C) Copyright IBM Corporation 2012, 2013
  *
  */
 
@@ -59,6 +59,7 @@ typedef enum END_CODE {
 
 // Globals
 char * app_name = NULL;
+bool try_cpu_minus_one = false;
 pid_t app_PID = -1;
 uint64_t kernel_start, kernel_end;
 operf_read operfRead;
@@ -1724,7 +1725,60 @@ static void process_args(int argc, char * const argv[])
 	return;
 }
 
-static int _check_perf_events_cap(void)
+static int _get_cpu_for_perf_events_cap(void)
+{
+	int retval;
+	string err_msg;
+	char cpus_online[257];
+	FILE * online_cpus;
+	DIR *dir = NULL;
+
+	int total_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	if (!total_cpus) {
+		err_msg = "Internal Error (1): Number of online cpus cannot be determined.";
+		retval = -1;
+		goto error;
+	}
+
+	online_cpus = fopen("/sys/devices/system/cpu/online", "r");
+	if (!online_cpus) {
+		err_msg = "Internal Error (2): Number of online cpus cannot be determined.";
+		retval = -1;
+		goto error;
+	}
+	memset(cpus_online, 0, sizeof(cpus_online));
+	fgets(cpus_online, sizeof(cpus_online), online_cpus);
+	if (!cpus_online[0]) {
+		fclose(online_cpus);
+		err_msg = "Internal Error (3): Number of online cpus cannot be determined.";
+		retval = -1;
+		goto error;
+
+	}
+	if (index(cpus_online, ',') || cpus_online[0] != '0') {
+		// A comma in cpus_online implies a gap, which in turn implies that not all
+		// CPUs are online.
+		if ((dir = opendir("/sys/devices/system/cpu")) == NULL) {
+			fclose(online_cpus);
+			err_msg = "Internal Error (4): Number of online cpus cannot be determined.";
+			retval = -1;
+			goto error;
+		} else {
+			struct dirent *entry = NULL;
+			retval = OP_perf_utils::op_get_next_online_cpu(dir, entry);
+			closedir(dir);
+		}
+	} else {
+		// All CPUs are available, so we just arbitrarily choose CPU 0.
+		retval = 0;
+	}
+	fclose(online_cpus);
+error:
+	return retval;
+}
+
+
+static int _check_perf_events_cap(bool try_cpu_minus_one)
 {
 	/* If perf_events syscall is not implemented, the syscall below will fail
 	 * with ENOSYS (38).  If implemented, but the processor type on which this
@@ -1733,12 +1787,14 @@ static int _check_perf_events_cap(void)
 	 */
 	struct perf_event_attr attr;
 	pid_t pid ;
+	int cpu_to_try = try_cpu_minus_one ? -1 : _get_cpu_for_perf_events_cap();
+	errno = 0;
         memset(&attr, 0, sizeof(attr));
         attr.size = sizeof(attr);
         attr.sample_type = PERF_SAMPLE_IP;
 
 	pid = getpid();
-	syscall(__NR_perf_event_open, &attr, pid, 0, -1, 0);
+	syscall(__NR_perf_event_open, &attr, pid, cpu_to_try, -1, 0);
 	return errno;
 
 }
@@ -1790,21 +1846,43 @@ static int _get_sys_value(const char * filename)
 int main(int argc, char * const argv[])
 {
 	int rc;
+	int perf_event_paranoid = _get_sys_value("/proc/sys/kernel/perf_event_paranoid");
+
+	my_uid = geteuid();
 	throttled = false;
-	if ((rc = _check_perf_events_cap())) {
-		if (rc == EBUSY) {
-			cerr << "Performance monitor unit is busy.  Do 'opcontrol --deinit' and try again." << endl;
-			exit(1);
-		}
-		if (rc == ENOSYS) {
-			cerr << "Your kernel does not implement a required syscall"
-			     << "  for the operf program." << endl;
-		} else if (rc == ENOENT) {
-			cerr << "Your kernel's Performance Events Subsystem does not support"
-			     << " your processor type." << endl;
-		} else {
-			cerr << "Unexpected error running operf: " << strerror(rc) << endl;
+	rc = _check_perf_events_cap(try_cpu_minus_one);
+	if (rc == EACCES) {
+		/* Early perf_events kernels required the cpu argument to perf_event_open
+		 * to be '-1' when setting up to profile a single process if 1) the user is
+		 * not root; and 2) perf_event_paranoid is > 0.  An EACCES error would be
+		 * returned if passing '0' or greater for the cpu arg and the above criteria
+		 * was not met.  Unfortunately, later kernels turned this requirement around
+		 * such that the passed cpu arg must be '0' or greater when the user is not
+		 * root.
+		 *
+		 * We don't really have a good way to check whether we're running on such an
+		 * early kernel except to try the perf_event_open with different values to see
+		 * what works.
+		 */
+		if (my_uid != 0 && perf_event_paranoid > 0) {
+			try_cpu_minus_one = true;
+			rc = _check_perf_events_cap(try_cpu_minus_one);
 		}
+	}
+	if (rc == EBUSY) {
+		cerr << "Performance monitor unit is busy.  Do 'opcontrol --deinit' and try again." << endl;
+		exit(1);
+	}
+	if (rc == ENOSYS) {
+		cerr << "Your kernel does not implement a required syscall"
+		     << " for the operf program." << endl;
+	} else if (rc == ENOENT) {
+		cerr << "Your kernel's Performance Events Subsystem does not support"
+		     << " your processor type." << endl;
+	} else if (rc) {
+		cerr << "Unexpected error running operf: " << strerror(rc) << endl;
+	}
+	if (rc) {
 		cerr << "Please use the opcontrol command instead of operf." << endl;
 		exit(1);
 	}
@@ -1813,8 +1891,6 @@ int main(int argc, char * const argv[])
 	cpu_speed = op_cpu_frequency();
 	process_args(argc, argv);
 
-	int perf_event_paranoid = _get_sys_value("/proc/sys/kernel/perf_event_paranoid");
-	my_uid = geteuid();
 	if (operf_options::system_wide && ((my_uid != 0) && (perf_event_paranoid > 0))) {
 		cerr << "To do system-wide profiling, either you must be root or" << endl;
 		cerr << "/proc/sys/kernel/perf_event_paranoid must be set to 0 or -1." << endl;
-- 
1.7.1






Thread: [PATCH] operf: Fix 'Permission denied' error on early perf_events kernels

oprofile-list