Re: [PATCH] operf: Fix 'Permission denied' error on early perf_events kernels

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Maynard,

I'll try to reproduce, review the patch and get back to you this week.

Suravee.

On 1/22/2013 4:02 PM, Maynard Johnson wrote:

> *Suravee*, could you please review this patch?  It would also be really nice
> if you could find a SLES 11 SP1/AMD system where you could reproduce the problem
> and then test the patch.
>
> Thanks.
>
> -Maynard
>
> -----------------------------------------------------------------------
> operf: Fix 'Permission denied' error on early perf_events kernels
>
> The new operf tool available with OProfile 0.9.8 uses the perf_event_open
> syscall to obtain access to the performance monitor counters and registers.
> This syscall is implemented by the Linux Kernel Performance Events Subsystem
> (aka "perf_events").  This perf_events subsystem was introduced in kernel
> version 2.6.31, and it underwent a lot of changes in the first several versions
> thereafter.  Apparently, the operf tool, as currently written and operating today,
> relies on certain kernel functionality that was introduced later than some
> kernels provided with some Linux distributions that supported perf_events in the
> very early stages (e.g., SLES 11 SP1).  When attempting to profile with operf
> (e.g., 'operf ls'), it fails with the message:
>
> Unexpected error running operf: Permission denied
> Please use the opcontrol command instead of operf.
>
> The fix for this problem is to pass '-1' for the cpu arg on the
> perf_event_open syscall when running on an early perf_events kernel.
> Passing '-1' for the cpu arg was a requirement (in most circumstances)
> on early perf_events kernels.  Later kernels removed this requirement
> so perf_event_open could be called for each cpu, even for single-app
> profiling by non-root users.  This is the standard usage model employed
> by operf, which allows us to mmap kernel data space for each cpu, thus
> giving a lot more memory for the kernel to record sample data.
>
> Signed-off-by: Maynard Johnson <may...@us...>
> ---
>   libperf_events/operf_counter.cpp |   11 +++--
>   pe_profiling/operf.cpp           |  112 ++++++++++++++++++++++++++++++++------
>   2 files changed, 101 insertions(+), 22 deletions(-)
>
> diff --git a/libperf_events/operf_counter.cpp b/libperf_events/operf_counter.cpp
> index a02c566..1949c23 100644
> --- a/libperf_events/operf_counter.cpp
> +++ b/libperf_events/operf_counter.cpp
> @@ -48,6 +48,7 @@ extern bool first_time_processing;
>   extern bool throttled;
>   extern size_t mmap_size;
>   extern size_t pg_sz;
> +extern bool try_cpu_minus_one;
>   
>   namespace {
>   
> @@ -421,7 +422,7 @@ void operf_record::setup()
>   	struct dirent *entry = NULL;
>   	DIR *dir = NULL;
>   	string err_msg;
> -	char cpus_online[129];
> +	char cpus_online[257];
>   	bool need_IOC_enable = (system_wide || pid_started);
>   
>   
> @@ -449,7 +450,7 @@ void operf_record::setup()
>   	}
>   	pagesize = sysconf(_SC_PAGE_SIZE);
>   	num_mmap_pages = (512 * 1024)/pagesize;
> -	num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
> +	num_cpus = try_cpu_minus_one ? 1 : sysconf(_SC_NPROCESSORS_ONLN);
>   	if (!num_cpus)
>   		throw runtime_error("Number of online CPUs is zero; cannot continue");;
>   
> @@ -472,7 +473,7 @@ void operf_record::setup()
>   		goto error;
>   
>   	}
> -	if (index(cpus_online, ',')) {
> +	if (index(cpus_online, ',') || cpus_online[0] != '0') {
>   		all_cpus_avail = false;
>   		if ((dir = opendir("/sys/devices/system/cpu")) == NULL) {
>   			fclose(online_cpus);
> @@ -487,7 +488,9 @@ void operf_record::setup()
>   		int real_cpu;
>   		int mmap_fd;
>   		bool mmap_done_for_cpu = false;
> -		if (all_cpus_avail) {
> +		if (try_cpu_minus_one) {
> +			real_cpu = -1;
> +		} else if (all_cpus_avail) {
>   			real_cpu = cpu;
>   		} else {
>   			real_cpu = op_get_next_online_cpu(dir, entry);
> diff --git a/pe_profiling/operf.cpp b/pe_profiling/operf.cpp
> index e4d675b..1f3fc2d 100644
> --- a/pe_profiling/operf.cpp
> +++ b/pe_profiling/operf.cpp
> @@ -11,7 +11,7 @@
>    * (C) Copyright IBM Corp. 2011
>    *
>    * Modified by Maynard Johnson <may...@us...>
> - * (C) Copyright IBM Corporation 2012
> + * (C) Copyright IBM Corporation 2012, 2013
>    *
>    */
>   
> @@ -59,6 +59,7 @@ typedef enum END_CODE {
>   
>   // Globals
>   char * app_name = NULL;
> +bool try_cpu_minus_one = false;
>   pid_t app_PID = -1;
>   uint64_t kernel_start, kernel_end;
>   operf_read operfRead;
> @@ -1724,7 +1725,60 @@ static void process_args(int argc, char * const argv[])
>   	return;
>   }
>   
> -static int _check_perf_events_cap(void)
> +static int _get_cpu_for_perf_events_cap(void)
> +{
> +	int retval;
> +	string err_msg;
> +	char cpus_online[257];
> +	FILE * online_cpus;
> +	DIR *dir = NULL;
> +
> +	int total_cpus = sysconf(_SC_NPROCESSORS_ONLN);
> +	if (!total_cpus) {
> +		err_msg = "Internal Error (1): Number of online cpus cannot be determined.";
> +		retval = -1;
> +		goto error;
> +	}
> +
> +	online_cpus = fopen("/sys/devices/system/cpu/online", "r");
> +	if (!online_cpus) {
> +		err_msg = "Internal Error (2): Number of online cpus cannot be determined.";
> +		retval = -1;
> +		goto error;
> +	}
> +	memset(cpus_online, 0, sizeof(cpus_online));
> +	fgets(cpus_online, sizeof(cpus_online), online_cpus);
> +	if (!cpus_online[0]) {
> +		fclose(online_cpus);
> +		err_msg = "Internal Error (3): Number of online cpus cannot be determined.";
> +		retval = -1;
> +		goto error;
> +
> +	}
> +	if (index(cpus_online, ',') || cpus_online[0] != '0') {
> +		// A comma in cpus_online implies a gap, which in turn implies that not all
> +		// CPUs are online.
> +		if ((dir = opendir("/sys/devices/system/cpu")) == NULL) {
> +			fclose(online_cpus);
> +			err_msg = "Internal Error (4): Number of online cpus cannot be determined.";
> +			retval = -1;
> +			goto error;
> +		} else {
> +			struct dirent *entry = NULL;
> +			retval = OP_perf_utils::op_get_next_online_cpu(dir, entry);
> +			closedir(dir);
> +		}
> +	} else {
> +		// All CPUs are available, so we just arbitrarily choose CPU 0.
> +		retval = 0;
> +	}
> +	fclose(online_cpus);
> +error:
> +	return retval;
> +}
> +
> +
> +static int _check_perf_events_cap(bool try_cpu_minus_one)
>   {
>   	/* If perf_events syscall is not implemented, the syscall below will fail
>   	 * with ENOSYS (38).  If implemented, but the processor type on which this
> @@ -1733,12 +1787,14 @@ static int _check_perf_events_cap(void)
>   	 */
>   	struct perf_event_attr attr;
>   	pid_t pid ;
> +	int cpu_to_try = try_cpu_minus_one ? -1 : _get_cpu_for_perf_events_cap();
> +	errno = 0;
>           memset(&attr, 0, sizeof(attr));
>           attr.size = sizeof(attr);
>           attr.sample_type = PERF_SAMPLE_IP;
>   
>   	pid = getpid();
> -	syscall(__NR_perf_event_open, &attr, pid, 0, -1, 0);
> +	syscall(__NR_perf_event_open, &attr, pid, cpu_to_try, -1, 0);
>   	return errno;
>   
>   }
> @@ -1790,21 +1846,43 @@ static int _get_sys_value(const char * filename)
>   int main(int argc, char * const argv[])
>   {
>   	int rc;
> +	int perf_event_paranoid = _get_sys_value("/proc/sys/kernel/perf_event_paranoid");
> +
> +	my_uid = geteuid();
>   	throttled = false;
> -	if ((rc = _check_perf_events_cap())) {
> -		if (rc == EBUSY) {
> -			cerr << "Performance monitor unit is busy.  Do 'opcontrol --deinit' and try again." << endl;
> -			exit(1);
> -		}
> -		if (rc == ENOSYS) {
> -			cerr << "Your kernel does not implement a required syscall"
> -			     << "  for the operf program." << endl;
> -		} else if (rc == ENOENT) {
> -			cerr << "Your kernel's Performance Events Subsystem does not support"
> -			     << " your processor type." << endl;
> -		} else {
> -			cerr << "Unexpected error running operf: " << strerror(rc) << endl;
> +	rc = _check_perf_events_cap(try_cpu_minus_one);
> +	if (rc == EACCES) {
> +		/* Early perf_events kernels required the cpu argument to perf_event_open
> +		 * to be '-1' when setting up to profile a single process if 1) the user is
> +		 * not root; and 2) perf_event_paranoid is > 0.  An EACCES error would be
> +		 * returned if passing '0' or greater for the cpu arg and the above criteria
> +		 * was not met.  Unfortunately, later kernels turned this requirement around
> +		 * such that the passed cpu arg must be '0' or greater when the user is not
> +		 * root.
> +		 *
> +		 * We don't really have a good way to check whether we're running on such an
> +		 * early kernel except to try the perf_event_open with different values to see
> +		 * what works.
> +		 */
> +		if (my_uid != 0 && perf_event_paranoid > 0) {
> +			try_cpu_minus_one = true;
> +			rc = _check_perf_events_cap(try_cpu_minus_one);
>   		}
> +	}
> +	if (rc == EBUSY) {
> +		cerr << "Performance monitor unit is busy.  Do 'opcontrol --deinit' and try again." << endl;
> +		exit(1);
> +	}
> +	if (rc == ENOSYS) {
> +		cerr << "Your kernel does not implement a required syscall"
> +		     << " for the operf program." << endl;
> +	} else if (rc == ENOENT) {
> +		cerr << "Your kernel's Performance Events Subsystem does not support"
> +		     << " your processor type." << endl;
> +	} else if (rc) {
> +		cerr << "Unexpected error running operf: " << strerror(rc) << endl;
> +	}
> +	if (rc) {
>   		cerr << "Please use the opcontrol command instead of operf." << endl;
>   		exit(1);
>   	}
> @@ -1813,8 +1891,6 @@ int main(int argc, char * const argv[])
>   	cpu_speed = op_cpu_frequency();
>   	process_args(argc, argv);
>   
> -	int perf_event_paranoid = _get_sys_value("/proc/sys/kernel/perf_event_paranoid");
> -	my_uid = geteuid();
>   	if (operf_options::system_wide && ((my_uid != 0) && (perf_event_paranoid > 0))) {
>   		cerr << "To do system-wide profiling, either you must be root or" << endl;
>   		cerr << "/proc/sys/kernel/perf_event_paranoid must be set to 0 or -1." << endl;