From: Maynard J. <may...@us...> - 2011-12-29 20:07:09
The attached patch (excuse the size) is a proof of concept port of OProfile
to the Linux kernel's Performance Events Subsystem (aka "perf_events").
This enhancement allows users to profile an application without the need
for root authority. This patch fairly closely follows the proposal laid
out in my Oct 27 posting (subject: "[RFC - v2] Porting oprofile to
perf_events"), although not all features presented in that proposal have
been implemented yet.

To try out this patch, apply it to the current source tree and build as
usual. The configure script checks whether the running kernel supports
perf_events; if so, it then checks for /usr/include/linux/perf_event.h
(typically available in a kernel headers package). Both checks must pass
in order to build the code for the new "operf" program. Either way, legacy
oprofile (using opcontrol) will still be built and can be used as normal.

To use operf, do the following:

	operf --help
	operf --reset [--events=<comma_separated_list_of_event_specs>] <app> [<app_args>]

If no event spec is given, the default event/count will be used. By
default, operf runs in a "--separate=lib,thread" mode. There is no option
to disable this mode, so when profiling multi-threaded apps, you will
likely want to use "--merge=tgid" when generating reports. When operf
completes, it prints the following message:

	Use '--session-dir=<current_dir>/oprofile_data' with opreport
	and other post processing tools to view your profile data.

Some major pieces of work that are still needed are as follows:

 1) Add statistics gathering similar to daemon/opd_stats (as described in
    the Oct 27 proposal).
 2) Add support for vdso and anon mmaps.
 3) Add support for profiling the kernel.
 4) Add support for callgraph profiling.
 5) Add support for profiling by PID.
 6) Add support for system-wide profiling.
 7) Add support for extended features (e.g., AMD IBS -- is there support
    for IBS upstream in perf_events yet?).
 8) Flesh out cpu type support.
Currently, operf (when run by itself) is able to ascertain the cpu type
only for ppc64 processors and for Intel processors that support
architectural perfmon. If operf fails with the message:

	Unable to open cpu_type file for reading
	Make sure you have done opcontrol --init

you can run 'opcontrol --init' as a workaround, since operf will use
/dev/oprofile/cpu_type if it exists. operf needs the cpu type when asking
ophelp to validate a passed event spec, or for finding the default event
when no event spec is passed. Fleshing out the cpu type support will
probably require help from people knowledgeable about each supported
architecture.

I would like to get some volunteers to help with the above tasks. Please
join in the fun by either helping with some development or trying out the
new code. Feedback would be greatly appreciated.

Thanks.
-Maynard

-----------------------------------------------------------------------------------
 Makefile.am                         |    1 +
 configure.in                        |   55 +++-
 libabi/opimport.cpp                 |    2 +-
 libop/op_cpu_type.c                 |   71 +++-
 libopt++/popt_options.cpp           |   37 ++-
 libopt++/popt_options.h             |    5 +-
 libpp/op_header.cpp                 |    1 -
 libutil/op_popt.c                   |   59 +++
 libutil/op_popt.h                   |   16 +
 m4/kernelversion.m4                 |   16 +
 pe_profiling/Makefile.am            |   41 ++
 pe_profiling/operf.cpp              |  858 +++++++++++++++++++++++++++++++++++
 pe_profiling/operf.h                |  164 +++++++
 pe_profiling/operf_counter.cpp      |  488 ++++++++++++++++++++
 pe_profiling/operf_counter.h        |  121 +++++
 pe_profiling/operf_event.h          |  163 +++++++
 pe_profiling/operf_kernel.cpp       |  199 ++++++++
 pe_profiling/operf_kernel.h         |   39 ++
 pe_profiling/operf_mangling.cpp     |  198 ++++++++
 pe_profiling/operf_mangling.h       |   35 ++
 pe_profiling/operf_process_info.cpp |  160 +++++++
 pe_profiling/operf_process_info.h   |   89 ++++
 pe_profiling/operf_sfile.cpp        |  589 ++++++++++++++++++++++++
 pe_profiling/operf_sfile.h          |  152 ++++++
 pe_profiling/operf_utils.cpp        |  710 +++++++++++++++++++++++++++++
 pp/common_option.cpp                |    2 +-
 utils/Makefile.am                   |    4 +-
 utils/op_perf_events_checker.c      |   40 ++
 utils/opcontrol                     |    5 +
 29 files changed, 4302 insertions(+), 18 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index 5aa6ce5..7df5a4b 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -19,6 +19,7 @@ SUBDIRS = \
 	events \
 	doc \
 	gui \
+	pe_profiling \
 	agents
 
 #### ATTENTION ####
 # The agents directory must be kept as the last subdir
diff --git a/configure.in b/configure.in
index 24912cc..bd64773 100644
--- a/configure.in
+++ b/configure.in
@@ -12,7 +12,7 @@
 AC_PREREQ(2.13)
 
 AC_INIT(libop/op_config.h)
-AM_INIT_AUTOMAKE(oprofile, 0.9.8git)
+AM_INIT_AUTOMAKE(oprofile, 0.9.8_perf_events)
 AM_CONFIG_HEADER(config.h)
 
 AC_CHECK_DECLS([basename], [], [], [[#include <libgen.h>]])
@@ -63,6 +63,53 @@ AC_PROG_CXX
 AC_CHECK_PROG(LD,ld,ld,)
 test "$LD" || AC_ERROR(ld not found)
 
+dnl Check kernel version for perf_events supported
+AC_MSG_CHECKING([kernel version supports perf_events])
+AX_KERNEL_VERSION(2, 6, 31, <=, kernel_has_perf_events_support="yes",
+kernel_has_perf_events_support="no")
+
+if test "$kernel_has_perf_events_support" = "no"; then
+	AC_MSG_RESULT([This kernel does not have perf_events support; falling back to legacy oprofile])
+else
+	AC_MSG_RESULT([This kernel has perf_events support])
+fi
+
+AC_CHECK_FILE("/usr/include/linux/perf_event.h",PERF_EVENT_H_EXISTS="yes")
+AM_CONDITIONAL(BUILD_FOR_PERF_EVENT, test -n "$PERF_EVENT_H_EXISTS")
+if test "$PERF_EVENT_H_EXISTS" = "yes"; then
+	HAVE_PERF_EVENTS='1'
+else
+	HAVE_PERF_EVENTS='0'
+fi
+AC_DEFINE_UNQUOTED(HAVE_PERF_EVENTS, $HAVE_PERF_EVENTS, [Kernel support for perf_events exists])
+
+if test "$HAVE_PERF_EVENTS" = "1"; then
+	AC_CHECK_HEADER(libelf.h,,[
+	if test -f /usr/include/elfutils/libelf.h; then
+		CXXFLAGS="$CXXFLAGS -I /usr/include/elfutils"
+		CFLAGS="$CFLAGS -I /usr/include/elfutils"
+	elif test -f /usr/include/libelf/libelf.h; then
+		CXXFLAGS="$CXXFLAGS -I /usr/include/libelf"
+		CFLAGS="$CFLAGS -I /usr/include/libelf"
+	else
+		AC_MSG_ERROR(libelf.h not found)
+	fi])
+
+	AC_CHECK_LIB(elf,elf_begin,,
+		AC_MSG_ERROR([elf library not found]))
+	ELF_LIB="-lelf"
+	AC_SUBST(ELF_LIB)
+
+	PFM_LIB=
+	arch="`uname -m`"
+	if test "$arch" = "ppc64"; then
+		AC_CHECK_HEADER(perfmon/pfmlib.h,,[AC_MSG_ERROR([pfmlib.h not found; usually provided in papi devel package])])
+		AC_CHECK_LIB(pfm,pfm_find_event,, [AC_MSG_ERROR([libpfm not found; usually provided in papi devel package])])
+		PFM_LIB="-lpfm"
+	fi
+	AC_SUBST(PFM_LIB)
+
+fi
+
 AC_ARG_WITH(java,
 [  --with-java=java-home        Path to Java home directory (default is "no"; "yes" will use /usr as Java home)],
 JAVA_HOMEDIR=$with_java, [with_java=no])
@@ -278,6 +325,7 @@ OP_DOCDIR=`eval echo "${my_op_prefix}/share/doc/$PACKAGE/"`
 AC_SUBST(OP_DOCDIR)
 
 AC_OUTPUT(Makefile \
+	pe_profiling/Makefile \
 	m4/Makefile \
 	libutil/Makefile \
 	libutil/tests/Makefile \
@@ -352,3 +400,8 @@ elif test "`getent passwd oprofile 2>/dev/null`" == "" || \
 	fi
 fi
 
+if test "$PERF_EVENT_H_EXISTS" != "yes" && test "$kernel_has_perf_events_support" = "yes"; then
+	echo "Warning: perf_event.h not found. Please install the kernel headers package if you"
+	echo "         want non-root support built into OProfile."
+fi
+
diff --git a/libabi/opimport.cpp b/libabi/opimport.cpp
index d268293..b65ade2 100644
--- a/libabi/opimport.cpp
+++ b/libabi/opimport.cpp
@@ -169,7 +169,7 @@ int main(int argc, char const ** argv)
 {
 	vector<string> inputs;
 
-	popt::parse_options(argc, argv, inputs);
+	popt::parse_options(argc, argv, inputs, false /*non_option is NOT an app*/);
 
 	if (inputs.size() != 1) {
 		cerr << "error: must specify exactly 1 input file" << endl;
diff --git a/libop/op_cpu_type.c b/libop/op_cpu_type.c
index 6aa604f..2f96408 100644
--- a/libop/op_cpu_type.c
+++ b/libop/op_cpu_type.c
@@ -12,6 +12,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/utsname.h>
+#include <ctype.h>
 
 #include "op_cpu_type.h"
 #include "op_hw_specific.h"
@@ -101,6 +103,69 @@ static struct cpu_descr const cpu_descrs[MAX_CPU_TYPE] = {
 
 static size_t const nr_cpu_descrs = sizeof(cpu_descrs) / sizeof(struct cpu_descr);
 
+static op_cpu _get_ppc64_cpu_type(void)
+{
+	char line[100];
+	op_cpu cpu_type = CPU_NO_GOOD;
+	FILE * fp;
+
+	fp = fopen("/proc/cpuinfo", "r");
+	if (!fp) {
+		perror("Unable to open /proc/cpuinfo\n");
+		return cpu_type;
+	}
+	memset(line, 0, 100);
+	while (cpu_type == CPU_NO_GOOD) {
+		if (fgets(line, sizeof(line), fp) == NULL) {
+			fprintf(stderr, "Did not find processor type in /proc/cpuinfo.\n");
+			goto out;
+		}
+		if (!strncmp(line, "cpu", 3)) {
+			char cpu_name[64];
+			char *cpu = line + 3;
+			while (*cpu && isspace(*cpu))
+				++cpu;
+			if (sscanf(cpu, ":%s ", cpu_name) == 1) {
+				int i;
+				char cpu_type_str[64], cpu_name_lowercase[64];
+				size_t len = strlen(cpu_name);
+				for (i = 0; i < (int)len ; i++)
+					cpu_name_lowercase[i] = tolower(cpu_name[i]);
+
+				cpu_type_str[0] = '\0';
+				strcat(cpu_type_str, "ppc64/");
+				strncat(cpu_type_str, cpu_name_lowercase, len);
+				cpu_type = op_get_cpu_number(cpu_type_str);
+			}
+		}
+	}
+out:
+	fclose(fp);
+	return cpu_type;
+}
+
+static op_cpu _get_x86_64_cpu_type(void)
+{
+	// TODO: Horrible HACK!!!
+	fprintf(stderr, "!!! WARNING !!! operf currently supports only arch_perfmon CPUs.\n");
+	return op_cpu_specific_type(CPU_ARCH_PERFMON);
+}
+
+static op_cpu __get_cpu_type_alt_method(void)
+{
+	struct utsname uname_info;
+	if (uname(&uname_info) < 0) {
+		perror("uname failed");
+		return CPU_NO_GOOD;
+	}
+	if (strncmp(uname_info.machine, "x86_64", 6) == 0) {
+		return _get_x86_64_cpu_type();
+	}
+	if (strncmp(uname_info.machine, "ppc64", 5) == 0) {
+		return _get_ppc64_cpu_type();
+	}
+	return CPU_NO_GOOD;
+}
+
 int op_cpu_variations(op_cpu cpu_type)
 {
 	switch (cpu_type) {
@@ -140,8 +205,10 @@ op_cpu op_get_cpu_type(void)
 		/* Try 2.6's oprofilefs one instead. */
 		fp = fopen("/dev/oprofile/cpu_type", "r");
 		if (!fp) {
-			fprintf(stderr, "Unable to open cpu_type file for reading\n");
-			fprintf(stderr, "Make sure you have done opcontrol --init\n");
+			if ((cpu_type = __get_cpu_type_alt_method()) == CPU_NO_GOOD) {
+				fprintf(stderr, "Unable to open cpu_type file for reading\n");
+				fprintf(stderr, "Make sure you have done opcontrol --init\n");
+			}
 			return cpu_type;
 		}
 	}
diff --git a/libopt++/popt_options.cpp b/libopt++/popt_options.cpp
index 53a8c80..a6604a5 100644
--- a/libopt++/popt_options.cpp
+++ b/libopt++/popt_options.cpp
@@ -10,7 +10,8 @@
  */
 
 #include <iostream>
-
+#include <stdlib.h>
+#include <string.h>
 #include "op_popt.h"
 #include "op_version.h"
 
@@ -80,39 +81,55 @@ static struct poptOption appended_options[] = {
  * returned poptContext which contains pointer inside the options array
  */
 static poptContext do_parse_options(int argc, char const ** argv,
                                     vector<poptOption> & options,
-                                    vector<string> & additional_params)
+                                    vector<string> & additional_params,
+                                    bool non_option_is_app)
 {
+	poptContext con;
 	options = popt_options();
 
 	int const nr_appended_options =
 		sizeof(appended_options) / sizeof(appended_options[0]);
 
-	options.insert(options.end(), appended_options, appended_options + nr_appended_options);
-	poptContext con = op_poptGetContext(NULL, argc, argv, &options[0], 0);
+	if (non_option_is_app) {
+		char ** app_params = (char **)calloc(2, sizeof(char **));
+		con = op_poptGetOptions_getApp(NULL, argc, argv, &options[0], app_params, 0);
+		if (app_params[0]) {
+			for (int i = 0; i < 2; i++) {
+				additional_params.push_back(app_params[i]);
+				if (strlen(app_params[i]))
+					free(app_params[i]);
+			}
+		}
+	} else {
+		con = op_poptGetContext(NULL, argc, argv, &options[0], 0);
+		char const * file;
+		while ((file = poptGetArg(con)) != 0) {
+			additional_params.push_back(file);
+		}
+	}
 
 	if (showvers)
 		show_version(argv[0]);
 
-	char const * file;
-	while ((file = poptGetArg(con)) != 0)
-		additional_params.push_back(file);
-
 	for (size_t i = 0 ; i < options_list().size() ; ++i)
 		options_list()[i]->post_process();
 
+	return con;
 }
 
 void parse_options(int argc, char const ** argv,
-                   vector<string> & additional_params)
+                   vector<string> & additional_params,
+                   bool non_option_is_app)
 {
 	vector<poptOption> options;
 	poptContext con =
-		do_parse_options(argc, argv, options, additional_params);
+		do_parse_options(argc, argv, options, additional_params,
+		                 non_option_is_app);
 	poptFreeContext(con);
 }
diff --git a/libopt++/popt_options.h b/libopt++/popt_options.h
index 7f72aa6..3958c7e 100644
--- a/libopt++/popt_options.h
+++ b/libopt++/popt_options.h
@@ -59,6 +59,8 @@ namespace popt {
  * @param argc like the parameter of main()
  * @param argv like the parameter of main()
  * @param additional_params additional options are stored here
+ * @param non_option_is_app true if app name is expected; otherwise
+ *	profile spec is expected
  *
  * Parse the given command line with the previous
  * options created. Multiple additional arguments
@@ -66,7 +68,8 @@ namespace popt {
 * vector.
 */
 void parse_options(int argc, char const ** argv,
-                   std::vector<std::string> & additional_params);
+                   std::vector<std::string> & additional_params,
+                   bool non_option_is_app);
 
 class option_base;
diff --git a/libpp/op_header.cpp b/libpp/op_header.cpp
index 754015a..4d081c0 100644
--- a/libpp/op_header.cpp
+++ b/libpp/op_header.cpp
@@ -164,7 +164,6 @@ string const op_print_event(op_cpu cpu_type, u32 type, u32 um, u32 count)
 		str += "Profiling through timer interrupt";
 		return str;
 	}
-
 	struct op_event * event = op_find_event(cpu_type, type, um);
 
 	if (!event) {
diff --git a/libutil/op_popt.c b/libutil/op_popt.c
index de96364..e412b9a 100644
--- a/libutil/op_popt.c
+++ b/libutil/op_popt.c
@@ -11,6 +11,7 @@
  */
 
 #include <stdlib.h>
+#include <string.h>
 #include "op_libiberty.h"
 #include "op_popt.h"
 
@@ -41,3 +42,61 @@ poptContext op_poptGetContext(char const * name,
 
 	return optcon;
 }
+
+poptContext op_poptGetOptions_getApp(char const * name, int argc,
+                                     char const ** argv,
+                                     struct poptOption const * options,
+                                     char **app_options, int flags)
+{
+	poptContext optcon;
+	const char * leftover = NULL;
+	int c;
+
+	xmalloc_set_program_name(argv[0]);
+
+#ifdef CONST_POPT
+	optcon = poptGetContext(name, argc, argv, options, flags);
+#else
+	optcon = poptGetContext((char *)name, argc, (char **)argv, options, flags);
+#endif
+
+	c = poptGetNextOpt(optcon);
+	if (c < 0) {
+		leftover = poptGetArg(optcon);
+		if (leftover) {
+			int arg_idx, app_name_found, length, num_app_args;
+			char * app_name = (char *)xcalloc(strlen(leftover) + 1, 1);
+			strncpy(app_name, leftover, strlen(leftover) + 1);
+			app_options[0] = app_name;
+			for (arg_idx = 1, app_name_found = 0, length = 0, num_app_args = 0;
+			     arg_idx < argc;
+			     arg_idx++) {
+				if (app_name_found) {
+					num_app_args++;
+					length += strlen(argv[arg_idx]) + 1;
+				}
+				if (!strcmp(argv[arg_idx], app_name)) {
+					app_name_found = 1;
+				}
+			}
+			if (num_app_args)
+				app_options[1] = (char *)xcalloc(length, 1);
+			else
+				app_options[1] = "";
+			for (arg_idx = argc - num_app_args; arg_idx < argc; arg_idx++) {
+				if (arg_idx > (argc - num_app_args))
+					app_options[1] = strcat(app_options[1], " ");
+				app_options[1] = strcat(app_options[1], argv[arg_idx]);
+			}
+		} else if (c < -1) {
+			fprintf(stderr, "%s: %s: %s\n", argv[0],
+				poptBadOption(optcon, POPT_BADOPTION_NOALIAS),
+				poptStrerror(c));
+			poptPrintHelp(optcon, stderr, 0);
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	return optcon;
+
+}
diff --git a/libutil/op_popt.h b/libutil/op_popt.h
index c3dfa6c..0a8d465 100644
--- a/libutil/op_popt.h
+++ b/libutil/op_popt.h
@@ -35,6 +35,22 @@ poptContext op_poptGetContext(char const * name, int argc,
 			      char const ** argv,
 			      struct poptOption const * options, int flags);
 
+/**
+ * op_poptGetOptions_getApp
+ *
+ * Use this function when the argv array may be of the form:
+ *	<pgm> [options] <app-to-profile> [app-args]
+ * The <app-to-profile> and app-args are passed back in app_options.
+ * The caller MUST allocate a char * array of size 2 and pass that
+ * array in the app_options argument. The first member of this array will
+ * be set to the app-to-profile pathname; the second member will be
+ * set to the app's args.
+ */
+poptContext op_poptGetOptions_getApp(char const * name, int argc,
+                                     char const ** argv,
+                                     struct poptOption const * options,
+                                     char **app_options, int flags);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/m4/kernelversion.m4 b/m4/kernelversion.m4
new file mode 100644
index 0000000..db4b805
--- /dev/null
+++ b/m4/kernelversion.m4
@@ -0,0 +1,16 @@
+dnl AX_KERNEL_VERSION(major, minor, level, comparison, action-if-true, action-if-false)
+AC_DEFUN([AX_KERNEL_VERSION], [
+SAVE_CFLAGS=$CFLAGS
+CFLAGS="-I$KINC -D__KERNEL__ -Werror"
+AC_TRY_COMPILE(
+  [
+  #include <linux/version.h>
+  ],
+  [
+  #if LINUX_VERSION_CODE $4 KERNEL_VERSION($1, $2, $3)
+  break_me_hard(\\\);
+  #endif
+  ],
+[$5],[$6],)
+CFLAGS=$SAVE_CFLAGS
+])
diff --git a/pe_profiling/Makefile.am b/pe_profiling/Makefile.am
new file mode 100644
index 0000000..44a2e40
--- /dev/null
+++ b/pe_profiling/Makefile.am
@@ -0,0 +1,41 @@
+LIBS=@POPT_LIBS@ @LIBERTY_LIBS@ @ELF_LIB@ @PFM_LIB@
+if BUILD_FOR_PERF_EVENT
+
+AM_CPPFLAGS = \
+	-I ${top_srcdir}/libabi \
+	-I ${top_srcdir}/libutil \
+	-I ${top_srcdir}/libop \
+	-I ${top_srcdir}/libopt++ \
+	-I ${top_srcdir}/libutil++ \
+	-I ${top_srcdir}/libdb \
+	-I ${top_srcdir}/pe_profiling
+
+# -I ${top_srcdir}/daemon
+
+operf_SOURCES = operf.cpp \
+	operf.h \
+	operf_utils.cpp \
+	operf_counter.cpp \
+	operf_counter.h \
+	operf_event.h \
+	operf_process_info.h \
+	operf_process_info.cpp \
+	operf_sfile.cpp \
+	operf_sfile.h \
+	operf_kernel.h \
+	operf_kernel.cpp \
+	operf_mangling.h \
+	operf_mangling.cpp
+
+AM_CXXFLAGS = @OP_CXXFLAGS@
+bin_PROGRAMS = operf
+operf_LDADD = ../libopt++/libopt++.a \
+	../libutil++/libutil++.a \
+	../libdb/libodb.a \
+	../libop/libop.a \
+	../libutil/libutil.a \
+	../libabi/libabi.a
+
+#operf_LINK = $(CXX) $(AM_CXXFLAGS) $(AM_CPPFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+
+endif
diff --git a/pe_profiling/operf.cpp b/pe_profiling/operf.cpp
new file mode 100644
index 0000000..6a07cd6
--- /dev/null
+++ b/pe_profiling/operf.cpp
@@ -0,0 +1,858 @@
+/**
+ * @file operf.cpp
+ * Front-end (containing main) for handling a user request to run a profile
+ * using the new Linux Performance Events Subsystem.
+ *
+ * @remark Copyright 2011 OProfile authors
+ * @remark Read the file COPYING
+ *
+ * Created on: Dec 7, 2011
+ * @author Maynard Johnson
+ * (C) Copyright IBM Corp. 2011
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <dirent.h>
+#include <exception>
+#include <pwd.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <ftw.h>
+#include <iostream>
+#include "operf.h"
+#include "popt_options.h"
+#include "op_libiberty.h"
+#include "string_manip.h"
+#include "cverb.h"
+#include "operf_counter.h"
+#include "op_cpu_type.h"
+#include "op_cpufreq.h"
+#include "op_abi.h"
+#include "op_events.h"
+#include "op_string.h"
+
+using namespace std;
+
+typedef enum END_CODE {
+	ALL_OK = 0,
+	APP_ABNORMAL_END = -2,
+	PERF_RECORD_ERROR = -3
+} end_code_t;
+
+// Globals
+char * app_name = NULL;
+operf_read operfRead;
+op_cpu cpu_type;
+double cpu_speed;
+char op_samples_current_dir[PATH_MAX];
+
+#define DEFAULT_OPERF_OUTFILE "operf.data"
+
+static char full_pathname[PATH_MAX];
+static char * app_name_SAVE = NULL;
+static char * app_args = NULL;
+static pid_t app_PID;
+static bool app_started;
+static pid_t operf_pid;
+static string samples_dir;
+static string outputfile;
+uint op_nr_counters;
+vector<operf_event_t> events;
+
+verbose vmisc("misc");
+
+namespace operf_options {
+bool system_wide;
+bool reset;
+int pid;
+int callgraph_depth;
+int mmap_pages_mult;
+string session_dir;
+bool separate_cpu;
+vector<string> evts;
+}
+
+namespace {
+vector<string> verbose_string;
+
+popt::option options_array[] = {
+	popt::option(verbose_string, "verbose", 'V',
+	             "verbose output", "debug,perf_events,misc,all"),
+	popt::option(operf_options::session_dir, "session-dir", '\0',
+	             "specify session path to hold sample data", "path"),
+	popt::option(operf_options::callgraph_depth, "callgraph", 'g',
+	             "callgraph depth", "depth"),
+	popt::option(operf_options::system_wide, "system-wide", 's',
+	             "profile entire system"),
+	popt::option(operf_options::reset, "reset", 'r',
+	             "remove old profile data"),
+	popt::option(operf_options::pid, "pid", 'p',
+	             "process ID to profile", "PID"),
+	popt::option(operf_options::mmap_pages_mult, "kernel-buffersize-multiplier", 'k',
+	             "factor by which kernel buffer size should be increased", "buffersize"),
+	popt::option(operf_options::evts, "events", 'e',
+	             "profile on these comma separated events", "events"),
+	popt::option(operf_options::separate_cpu, "separate-cpu", 'c',
+	             "Categorize samples by cpu"),
+};
+}
+
+
+static void op_sig_stop(int val __attribute__((unused)))
+{
+	// Received a signal to quit, so we need to stop the
+	// app being profiled.
+	if (cverb << vdebug)
+		write(1, "in op_sig_stop ", 15);
+	kill(app_PID, SIGKILL);
+}
+
+void set_signals(void)
+{
+	struct sigaction act;
+	sigset_t ss;
+
+	sigfillset(&ss);
+	sigprocmask(SIG_UNBLOCK, &ss, NULL);
+
+	act.sa_handler = op_sig_stop;
+	act.sa_flags = 0;
+	sigemptyset(&act.sa_mask);
+	sigaddset(&act.sa_mask, SIGINT);
+
+	if (sigaction(SIGINT, &act, NULL)) {
+		perror("operf: install of SIGINT handler failed: ");
+		exit(EXIT_FAILURE);
+	}
+}
+
+static int app_ready_pipe[2], start_app_pipe[2], app_started_pipe[2], valid_profile_pipe[2];
+
+void run_app(void)
+{
+	char * app_fname = rindex(app_name, '/') + 1;
+	if (!app_fname) {
+		cerr << "Error trying to parse app name " << app_name << endl;
+		cerr << "usage: operf [options] --pid=<PID> | appname [args]" << endl;
+		exit(EXIT_FAILURE);
+	}
+
+	vector<string> exec_args_str;
+	if (app_args) {
+		size_t end_pos;
+		string app_args_str = app_args;
+		// Since the strings returned from substr would otherwise be ephemeral, we
+		// need to store them into the exec_args_str vector so we can reference
+		// them later when we call execvp.
+		do {
+			end_pos = app_args_str.find_first_of(' ', 0);
+			if (end_pos != string::npos) {
+				exec_args_str.push_back(app_args_str.substr(0, end_pos));
+				app_args_str = app_args_str.substr(end_pos + 1);
+			} else {
+				exec_args_str.push_back(app_args_str);
+			}
+		} while (end_pos != string::npos);
+	}
+
+	vector<const char *> exec_args;
+	exec_args.push_back(app_fname);
+	vector<string>::iterator it;
+	cverb << vdebug << "Exec args are: " << app_fname << " ";
+	// Now transfer the args from the intermediate exec_args_str container to the
+	// exec_args container that can be passed to execvp.
+	for (it = exec_args_str.begin(); it != exec_args_str.end(); it++) {
+		exec_args.push_back((*it).c_str());
+		cverb << vdebug << (*it).c_str() << " ";
+	}
+	exec_args.push_back((char *) NULL);
+	cverb << vdebug << endl;
+	// Fake an exec to warm-up the resolver
+	execvp("", ((char * const *)&exec_args[0]));
+	// signal to the parent that we're ready to exec
+	int startup = 1;
+	if (write(app_ready_pipe[1], &startup, sizeof(startup)) < 0) {
+		perror("Internal error on start_app_pipe");
+		_exit(EXIT_FAILURE);
+	}
+
+	// wait for parent to tell us to start
+	int startme = 0;
+	if (read(start_app_pipe[0], &startme, sizeof(startme)) == -1) {
+		perror("Internal error in run_app on start_app_pipe");
+		_exit(EXIT_FAILURE);
+	}
+	if (startme != 1)
+		goto fail;
+
+	cverb << vdebug << "parent says start app " << app_name << endl;
+	//sleep(1);
+	execvp(app_name, ((char * const *)&exec_args[0]));
+	cerr << "Failed to exec " << exec_args[0] << ": " << strerror(errno) << endl;
+fail:
+	/* We don't want any cleanup in the child */
+	_exit(EXIT_FAILURE);
+
+}
+
+int start_profiling_app(void)
+{
+	// The only process that should return from this function is the process
+	// which invoked it. Any forked process must do _exit() rather than return().
+
+	if (pipe(app_ready_pipe) < 0 || pipe(start_app_pipe) < 0) {
+		perror("Internal error: operf-record could not create pipe");
+		_exit(EXIT_FAILURE);
+	}
+	app_PID = fork();
+	if (app_PID < 0) {
+		perror("Internal error: fork failed");
+		_exit(EXIT_FAILURE);
+	} else if (app_PID == 0) { // child process for exec'ing app
+		run_app();
+	} else { //parent
+		if (pipe(app_started_pipe) < 0) {
+			perror("Internal error: could not create pipe");
+			return -1;
+		}
+		if (pipe(valid_profile_pipe) < 0) {
+			perror("Internal error: could not create pipe");
+			return -1;
+		}
+		operf_pid = fork();
+		if (operf_pid < 0) {
+			return -1;
+		} else if (operf_pid == 0) { // operf-record process
+			pid_t appID = 0;
+			if (read(app_started_pipe[0], &appID, sizeof(appID)) == -1) {
+				perror("Internal error in _run: app_started_pipe");
+				return -1;
+			} else if (app_PID == 0) {
+				cerr << "Failed to start app. Exiting." << endl;
+				_exit(EXIT_FAILURE);
+			}
+
+			// setup PerfRecord
+			operf_record operfRecord(outputfile, app_PID, events);
+			if (operfRecord.get_valid() == false) {
+				/* If valid is false, it means that one of the "known" errors has
+				 * occurred:
+				 *   - profiled process has already ended
+				 *   - device or resource busy
+				 * Since an informative message has already been displayed to
+				 * the user, we don't want to blow chunks here; instead, we'll
+				 * exit gracefully. Clear out the operf.data file as an indication
+				 * to the parent process that the profile data isn't valid.
+				 */
+				ofstream of;
+				of.open(outputfile.c_str(), ios_base::trunc);
+				of.close();
+				_exit(EXIT_SUCCESS);
+			}
+			// start recording
+			operfRecord.recordPerfData();
+			cerr << "Total bytes recorded from perf events: "
+			     << operfRecord.get_total_bytes_recorded() << endl;
+
+			operfRecord.~operf_record();
+			// done
+			_exit(EXIT_SUCCESS);
+		} else { // parent
+			int startup;
+			if (read(app_ready_pipe[0], &startup, sizeof(startup)) == -1) {
+				perror("Internal error on app_ready_pipe");
+				return -1;
+			} else if (startup != 1) {
+				cerr << "app is not ready to start; exiting" << endl;
+				return -1;
+			}
+			app_started = true;
+			// Tell app_PID to start the app
+			cverb << vdebug << "operf-record telling child to start app" << endl;
+			if (write(start_app_pipe[1], &startup, sizeof(startup)) < 0) {
+				perror("Internal error on start_app_pipe");
+				return -1;
+			}
+
+			// Let operf-record process know the app PID
+			cverb << vdebug << "writing " << app_PID << " app pid to pipe" << endl;
+			if (write(app_started_pipe[1], &app_PID, sizeof(app_PID)) < 0) {
+				perror("Internal error on app_started_pipe");
+				return -1;
+			}
+		}
+	}
+	// parent returns
+	return 0;
+}
+
+static end_code_t _run(void)
+{
+	int waitpid_status = 0;
+	end_code_t rc = ALL_OK;
+
+	// Fork processes with signals blocked.
+	sigset_t ss;
+	sigfillset(&ss);
+	sigprocmask(SIG_BLOCK, &ss, NULL);
+
+	if (start_profiling_app() < 0) {
+		perror("Internal error: fork failed");
+		return PERF_RECORD_ERROR;
+	}
+	// parent continues here
+	cverb << vdebug << "app " << app_PID << " is started" << endl;
+	set_signals();
+	cverb << vdebug << "going into waitpid on profiled app " << app_PID << endl;
+	if (waitpid(app_PID, &waitpid_status, 0) < 0) {
+		if (errno == EINTR) {
+			cverb << vdebug << "Caught ctrl-C. Killed profiled app." << endl;
+		} else {
+			cerr << "waitpid errno is " << errno << endl;
+			perror("waitpid for profiled app failed");
+			rc = APP_ABNORMAL_END;
+		}
+	} else {
+		if (WIFEXITED(waitpid_status) && (!WEXITSTATUS(waitpid_status))) {
+			cverb << vdebug << "waitpid for profiled app returned OK" << endl;
+		} else if (WIFEXITED(waitpid_status)) {
+			cerr << "profiled app ended abnormally: "
+			     << WEXITSTATUS(waitpid_status) << endl;
+			rc = APP_ABNORMAL_END;
+		}
+	}
+
+	// stop operf-record process
+	if (kill(operf_pid, SIGUSR1) < 0) {
+		perror("Attempt to stop operf-record process failed");
+		rc = PERF_RECORD_ERROR;
+	} else {
+		if (waitpid(operf_pid, &waitpid_status, 0) < 0) {
+			perror("waitpid for operf-record process failed");
+			rc = PERF_RECORD_ERROR;
+		} else {
+			if (WIFEXITED(waitpid_status) && (!WEXITSTATUS(waitpid_status))) {
+				cverb << vdebug << "waitpid for operf-record process returned OK" << endl;
+			} else {
+				cerr << "operf-record process ended abnormally: "
+				     << WEXITSTATUS(waitpid_status) << endl;
+				rc = PERF_RECORD_ERROR;
+			}
+		}
+	}
+	return rc;
+}
+
+static void cleanup(void)
+{
+	string cmd = "rm -f " + outputfile;
+	free(app_name_SAVE);
+	free(app_args);
+	events.clear();
+	verbose_string.clear();
+	system(cmd.c_str());
+}
+
+static int __delete_sample_data(const char *fpath,
+				const struct stat *sb __attribute__((unused)),
+				int tflag __attribute__((unused)),
+				struct FTW *ftwbuf __attribute__((unused)))
+{
+	if (remove(fpath)) {
+		perror("sample data removal error");
+		return FTW_STOP;
+	} else {
+		return FTW_CONTINUE;
+	}
+}
+
+static void complete(void)
+{
+	int rc;
+	string current_sampledir = samples_dir + "/current/";
+	current_sampledir.copy(op_samples_current_dir, current_sampledir.length(), 0);
+
+	if (!app_started) {
+		cleanup();
+		return;
+	}
+	if (operf_options::reset) {
+		int flags = FTW_DEPTH | FTW_ACTIONRETVAL;
+
+		if (nftw(current_sampledir.c_str(), __delete_sample_data, 32, flags) < 0 &&
+		    errno != ENOENT) {
+			perror("nftw");
+			cleanup();
+			exit(1);
+		}
+	}
+	rc = mkdir(current_sampledir.c_str(), S_IRWXU);
+	if (rc && (errno != EEXIST)) {
+		cerr << "Error trying to create " << current_sampledir << " dir." << endl;
+		perror("mkdir failed with");
+		exit(EXIT_FAILURE);
+	}
+
+	operfRead.init(outputfile, current_sampledir, cpu_type, events);
+	if ((rc = operfRead.readPerfHeader()) < 0) {
+		if (rc != OP_PERF_HANDLED_ERROR)
+			cerr << "Error: Cannot create read header info for sample file " << outputfile << endl;
+		cleanup();
+		exit(1);
+	}
+	cverb << vdebug << "Successfully read header info for sample file " << outputfile << endl;
+	// TODO: We may want to do incremental conversion of the perf data, since the perf sample format
+	// is very inefficient to store. For example, using a simple test program that does many
+	// millions of memcpy's over a 12 second span of time, a profile taken via legacy oprofile,
+	// with --separate=all and --image=<app_name> requires ~300K of storage. Using the perf tool
+	// (not operf) to profile the same application creates an 18MB perf.data file!!
+	if (operfRead.is_valid()) {
+		try {
+			operfRead.convertPerfData();
+			char * cbuf;
+			cbuf = (char *)xmalloc(operf_options::session_dir.length() + 5);
+			strcpy(cbuf, operf_options::session_dir.c_str());
+			strcat(cbuf, "/abi");
+			op_write_abi_to_file(cbuf);
+			cerr << endl << "Use '--session-dir=" << operf_options::session_dir << "'" << endl
+			     << "with opreport and other post processing tools to view your profile data."
+			     << endl;
+			cerr << "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" << endl;
+			free(cbuf);
+		} catch (runtime_error e) {
+			cerr << "Caught exception from operf_read::convertPerfData" << endl;
+			cerr << e.what() << endl;
+		}
+	}
+	cleanup();
+}
+
+static int _get_sys_value(const char * filename)
+{
+	char str[10];
+	int _val = -999;
+	FILE * fp = fopen(filename, "r");
+	if (fp == NULL)
+		return _val;
+	if (!fgets(str, 9, fp))
+		return _val;
+	sscanf(str, "%d", &_val);
+	return _val;
+}
+
+static void handle_sys_values(void)
+{
+	int value;
+	// TODO: Initial iterations of the perf tool required perf_event_paranoid to be -1
+	// in order for it to work with non-root users; however, this is not the case with
+	// recent distros (e.g., RHEL 6.2). Need to figure out how perf can work with a
+	// paranoid setting of '1' so that we don't have this restriction with operf.
+	value = _get_sys_value("/proc/sys/kernel/perf_event_paranoid");
+	if (value != -1) {
+		if (value == -999) {
+			cerr << "--------------------------------------------------------------------------" << endl;
+			cerr << "WARNING: operf requires the /proc/sys/kernel/perf_event_paranoid system value" << endl
+			     << "         to be set to '-1', but we were unable to verify the setting." << endl;
+			if (errno)
+				cerr << "The following error message was received when trying to access this system value:" << endl
+				     << strerror(errno) << endl;
+			cerr << endl
+			     << "If you receive the following message:" << endl << endl
+			     << "\t\"failed to mmap: Operation not permitted\"" << endl << endl
+			     << "please ask your system administrator to change this value to '-1'." << endl;
+			cerr << "--------------------------------------------------------------------------" << endl;
+		} else {
+			cerr << "ERROR: operf requires the /proc/sys/kernel/perf_event_paranoid system value" << endl
+			     << "       to be set to '-1', but the detected value is '" << value << "'." << endl
+			     << "       Please ask your system administrator to change this value to '-1'." << endl;
+			exit(EXIT_FAILURE);
+		}
+	}
+}
+
+static int find_app_file_in_dir(const struct dirent * d)
+{
+	if (!strcmp(d->d_name, app_name))
+		return 1;
+	else
+		return 0;
+}
+
+static int get_PATH_based_pathname(char * path_holder, size_t n)
+{
+	int retval = -1;
+
+	char * real_path = getenv("PATH");
+	char * path = (char *) xstrdup(real_path);
+	char * segment = strtok(path, ":");
+	while (segment) {
+		struct dirent ** namelist;
+		int rc = scandir(segment, &namelist, find_app_file_in_dir, NULL);
+		if (rc < 0) {
+			cerr << app_name << " cannot be found in your PATH." << endl;
+			break;
+		} else if (rc == 1) {
+			size_t applen = strlen(app_name);
+			size_t dirlen = strlen(segment);
+
+			if (applen + dirlen + 2 > n) {
+				cerr << "Path segment " << segment
+				     << " prepended to the passed app name is too long"
+				     << endl;
+				retval = -1;
+				break;
+			}
+			strncpy(path_holder, segment, dirlen);
+			strcat(path_holder, "/");
+			strncat(path_holder, app_name, applen);
+			retval = 0;
+			break;
+		}
+		segment = strtok(NULL, ":");
+	}
+	return retval;
+}
+
+int validate_app_name(void)
+{
+	int rc = 0;
+	struct stat filestat;
+	size_t len = strlen(app_name);
+
+	if (len > (size_t) (OP_APPNAME_LEN - 1)) {
+		cerr << "app name longer than max allowed (" << OP_APPNAME_LEN
+		     << " chars)\n";
+		cerr << app_name << endl;
+		rc = -1;
+		goto out;
+	}
+
+	if (index(app_name, '/')) {
+		strncpy(full_pathname, app_name, len);
+	} else {
+		rc = get_PATH_based_pathname(full_pathname, PATH_MAX);
+	}
+
+	if (rc) {
+		cerr << "Problem finding app name " << app_name << ". Aborting."
+ << endl; + goto out; + } + app_name_SAVE = app_name; + app_name = full_pathname; + if (stat(app_name, &filestat)) { + char msg[OP_APPNAME_LEN + 50]; + snprintf(msg, OP_APPNAME_LEN + 50, "Non-existent app name \"%s\"", + app_name); + perror(msg); + rc = -1; + } + + out: return rc; +} + +static void __print_usage_and_exit(char * extra_msg) +{ + if (extra_msg) + cerr << extra_msg << endl; + cerr << "usage: operf [options] --pid=<PID> | appname [args]" << endl; + exit(EXIT_FAILURE); + +} + +static u32 _get_event_code(char name[]) +{ + FILE * fp; + char oprof_event_code[9]; + string command; + command = "ophelp "; + command += name; + + fp = popen(command.c_str(), "r"); + if (fp == NULL) { + cerr << "Unable to execute ophelp to get info for event " + << name << endl; + exit(EXIT_FAILURE); + } + if (fgets(oprof_event_code, sizeof(oprof_event_code), fp) == NULL) { + cerr << "Unable to find info for event " + << name << endl; + exit(EXIT_FAILURE); + } + + return atoi(oprof_event_code); +} + +static void _process_events_list(void) +{ + string cmd = "ophelp --check-events "; + for (unsigned int i = 0; i < operf_options::evts.size(); i++) { + FILE * fp; + string full_cmd = cmd; + string event_spec = operf_options::evts[i]; + full_cmd += event_spec; + fp = popen(full_cmd.c_str(), "r"); + if (fp == NULL) { + cerr << "Unable to execute ophelp to get info for event " + << event_spec << endl; + exit(EXIT_FAILURE); + } + if (fgetc(fp) == EOF) { + cerr << "Unable to find info for event " + << event_spec << endl; + exit(EXIT_FAILURE); + } + char * event_str = op_xstrndup(event_spec.c_str(), event_spec.length()); + operf_event_t event; + strncpy(event.name, strtok(event_str, ":"), OP_MAX_EVT_NAME_LEN); + event.count = atoi(strtok(NULL, ":")); + /* Name and count are required in the event spec in order for + * 'ophelp --check-events' to pass. But since unit mask is + * optional, we need to ensure the result of strtok is valid. 
+ */ + char * um = strtok(NULL, ":"); + if (um) + event.evt_um = atoi(um); + else + event.evt_um = 0; + event.op_evt_code = _get_event_code(event.name); + event.evt_code = event.op_evt_code; + events.push_back(event); + } +#if (defined(__powerpc__) || defined(__powerpc64__)) + { + using namespace OP_perf_utils; + if (!op_convert_event_vals(&events)) { + cerr << "Unable to convert all oprofile event values to perf_event values" << endl; + exit(EXIT_FAILURE); + } + } +#endif +} + +static void get_default_event(void) +{ + operf_event_t dft_evt; + struct op_default_event_descr descr; + vector<operf_event_t> tmp_events; + + op_default_event(cpu_type, &descr); + if (descr.name[0] == '\0') { + cerr << "Unable to find default event" << endl; + exit(EXIT_FAILURE); + } + + memset(&dft_evt, 0, sizeof(dft_evt)); + dft_evt.count = descr.count; + dft_evt.evt_um = descr.um; + strncpy(dft_evt.name, descr.name, OP_MAX_EVT_NAME_LEN - 1); + dft_evt.op_evt_code = _get_event_code(dft_evt.name); + dft_evt.evt_code = dft_evt.op_evt_code; + events.push_back(dft_evt); + +#if (defined(__powerpc__) || defined(__powerpc64__)) + { + using namespace OP_perf_utils; + if (!op_convert_event_vals(&events)) { + cerr << "Unable to convert all oprofile event values to perf_event values" << endl; + exit(EXIT_FAILURE); + } + } +#endif +} + +static void _process_session_dir(void) +{ + if (operf_options::session_dir.empty()) { + char * cwd = NULL; + int rc; + cwd = (char *) xmalloc(PATH_MAX); + // set default session dir + cwd = getcwd(cwd, PATH_MAX); + operf_options::session_dir = cwd; + operf_options::session_dir +="/oprofile_data"; + samples_dir = operf_options::session_dir + "/samples"; + free(cwd); + rc = mkdir(operf_options::session_dir.c_str(), S_IRWXU); + if (rc && (errno != EEXIST)) { + cerr << "Error trying to create " << operf_options::session_dir << " dir." 
<< endl; + perror("mkdir failed with"); + exit(EXIT_FAILURE); + } + rc = mkdir(samples_dir.c_str(), S_IRWXU); + if (rc && (errno != EEXIST)) { + cerr << "Error trying to create " << samples_dir << " dir." << endl; + perror("mkdir failed with"); + exit(EXIT_FAILURE); + } + } else { + struct stat filestat; + int rc; + if (stat(operf_options::session_dir.c_str(), &filestat)) { + perror("stat operation on passed session-dir failed"); + exit(EXIT_FAILURE); + } + if (!S_ISDIR(filestat.st_mode)) { + cerr << "Passed session-dir " << operf_options::session_dir + << " is not a directory" << endl; + exit(EXIT_FAILURE); + } + string tmp = operf_options::session_dir + "/oprofile_data"; + rc = mkdir(tmp.c_str(), S_IRWXU); + if (rc && (errno != EEXIST)) { + cerr << "Error trying to create " << tmp << " dir." << endl; + perror("mkdir failed with"); + exit(EXIT_FAILURE); + } + samples_dir = tmp + "/samples"; + rc = mkdir(samples_dir.c_str(), S_IRWXU); + if (rc && (errno != EEXIST)) { + cerr << "Error trying to create " << samples_dir << " dir." << endl; + perror("mkdir failed with"); + exit(EXIT_FAILURE); + } + } + cverb << vdebug << "Using samples dir " << samples_dir << endl; +} + +static void process_args(int argc, char const ** argv) +{ + vector<string> non_options; + popt::parse_options(argc, argv, non_options, true/*non-options IS an app*/); + + if (operf_options::callgraph_depth) { + cerr << "The --callgraph option is not yet supported." 
<< endl; + exit(EXIT_FAILURE); + } + + if (!non_options.empty()) { + if (operf_options::pid) + __print_usage_and_exit(NULL); + + vector<string>::iterator it = non_options.begin(); + app_name = (char *) xmalloc((*it).length() + 1); + strncpy(app_name, ((char *)(*it).c_str()), (*it).length() + 1); + if (it++ != non_options.end()) { + if ((*it).length() > 0) { + app_args = (char *) xmalloc((*it).length() + 1); + strncpy(app_args, ((char *)(*it).c_str()), (*it).length() + 1); + } + } + if (validate_app_name() < 0) { + exit(1); + } + } else if (operf_options::pid) { + if (operf_options::system_wide) + __print_usage_and_exit(NULL); + cerr << "The --pid option is not yet supported." << endl; + exit(EXIT_FAILURE); + } else if (operf_options::system_wide) { + cerr << "The --system-wide option is not yet supported." << endl; + exit(EXIT_FAILURE); + } + else { + __print_usage_and_exit(NULL); + } + /* At this point, we know what kind of profile the user requested: + * - profile app by name + * - profile app by PID + * - profile whole system + */ + + if (!verbose::setup(verbose_string)) { + cerr << "unknown --verbose= options\n"; + exit(EXIT_FAILURE); + } + + _process_session_dir(); + outputfile = samples_dir + "/" + DEFAULT_OPERF_OUTFILE; + + // TODO: Need to examine ocontrol to see what (if any) additional + // event verification is needed here. + if (operf_options::evts.empty()) { + // Use default event + get_default_event(); + } else { + _process_events_list(); + } + + return; +} + +static int _check_perf_events_cap(void) +{ + /* If perf_events syscall is not implemented, the syscall below will fail + * with ENOSYS (38). If implemented, but the processor type on which this + * program is running is not supported by perf_events, the syscall returns + * ENOENT (2). 
+ */ + struct perf_event_attr attr; + pid_t pid; + int fd; + memset(&attr, 0, sizeof(attr)); + attr.size = sizeof(attr); + attr.sample_type = PERF_SAMPLE_IP; + + pid = getpid(); + errno = 0; + fd = syscall(__NR_perf_event_open, &attr, pid, 0, -1, 0); + if (fd >= 0) { + close(fd); + return 0; + } + return errno; + +} + +bool no_vmlinux; +int main(int argc, char const *argv[]) +{ + int rc; + // TODO: For now, set no_vmlinux to true. Need to implement vmlinux handling (with /proc/kallsyms?). + no_vmlinux = true; + if ((rc = _check_perf_events_cap())) { + if (rc == ENOSYS) + cerr << "Your kernel does not implement a required syscall" + << " for the operf program." << endl; + else if (rc == ENOENT) + cerr << "Your kernel's Performance Events Subsystem does not support" + << " your processor type." << endl; + else + cerr << "Unexpected error running operf: " << strerror(rc) << endl; + cerr << "Please use the opcontrol command instead of operf." << endl; + exit(1); + } + + cpu_type = op_get_cpu_type(); + cpu_speed = op_cpu_frequency(); + process_args(argc, argv); + + if (cpu_type == CPU_NO_GOOD) { + cerr << "Unable to ascertain cpu type. Exiting." << endl; + exit(1); + } + op_nr_counters = op_get_nr_counters(cpu_type); + + uid_t uid = geteuid(); + if (uid != 0) + handle_sys_values(); + + end_code_t run_result; + if ((run_result = _run())) { + if (app_started && (run_result != APP_ABNORMAL_END)) { + int rc; + cverb << vdebug << "Killing profiled app . . ." << endl; + rc = kill(app_PID, SIGKILL); + if (rc) + perror("Attempt to kill profiled app failed."); + } + if (run_result == PERF_RECORD_ERROR) { + cerr << "Error running profiler" << endl; + exit(1); + } else { + cerr << "WARNING: Profile results may be incomplete due to abend of profiled app." 
<< endl; + } + } + complete(); + return 0; +} diff --git a/pe_profiling/operf.h b/pe_profiling/operf.h new file mode 100644 index 0000000..186ee8b --- /dev/null +++ b/pe_profiling/operf.h @@ -0,0 +1,164 @@ +/** + * @file operf.h + * Header file containing definitions for handling a user request to profile + * using the new Linux Performance Events Subsystem. + * + * @remark Copyright 2011 OProfile authors + * @remark Read the file COPYING + * + * Created on: Dec 7, 2011 + * @author Maynard Johnson + * (C) Copyright IBM Corp. 2011 + * + */ + +#ifndef OPERF_H_ +#define OPERF_H_ + +#include <linux/perf_event.h> +#include <vector> +#include "config.h" +#include "op_config.h" +#include "op_types.h" +#include "operf_event.h" +#include "operf_counter.h" + + +using namespace std; +namespace operf_options { +extern bool system_wide; +extern bool reset; +extern int pid; +extern int callgraph_depth; +extern int mmap_pages_mult; +extern string session_dir; +extern bool separate_cpu; +} + +extern bool no_vmlinux; + +#define OP_APPNAME_LEN 1024 + +#define OP_EXEC_ARGS_LIST exec_args[0], \ + exec_args[1], \ + exec_args[2], \ + exec_args[3], \ + exec_args[4], \ + exec_args[5] + +#if BITS_PER_LONG == 64 +#define MMAP_WINDOW_SZ ULLONG_MAX +#else +#define MMAP_WINDOW_SZ (32 * 1024 * 1024ULL) +#endif + + +extern unsigned int op_nr_counters; + +static inline size_t align_64bit(u64 x) +{ + u64 mask = 7ULL; + return (x + mask) & ~mask; +} + + +// extern declarations +namespace OP_perf_utils { +void op_get_kernel_event_data(struct mmap_data *md, operf_record * pr); +void op_perfrecord_sigusr1_handler(int sig __attribute__((unused)), + siginfo_t * siginfo __attribute__((unused)), + void *u_context __attribute__((unused))); +void op_record_process_info(pid_t pid, operf_record * pr, int output_fd); +int op_read_input(int input, void * buf, size_t size); +int op_read_from_stream(ifstream & is, char * buf, streamsize sz); +int op_write_output(int output, void *buf, size_t size); +int 
op_write_event(event_t * event); +int op_mmap_trace_file(struct mmap_info & info); +event_t * op_get_perf_event(struct mmap_info & info); +int op_get_next_online_cpu(DIR * dir, struct dirent *entry); +bool op_convert_event_vals(vector<operf_event_t> * evt_vec); +} + +// The rmb() macros were borrowed from perf.h in the kernel tree +#if defined(__i386__) +#include <asm/unistd.h> +#define rmb() asm volatile("lock; addl $0,0(%%esp)" ::: "memory") +#define cpu_relax() asm volatile("rep; nop" ::: "memory"); +#endif + +#if defined(__x86_64__) +#include <asm/unistd.h> +#define rmb() asm volatile("lfence" ::: "memory") +#define cpu_relax() asm volatile("rep; nop" ::: "memory"); +#endif + +#ifdef __powerpc__ +#include <asm/unistd.h> +#define rmb() asm volatile ("sync" ::: "memory") +#define cpu_relax() asm volatile ("" ::: "memory"); +#endif + +#ifdef __s390__ +#include <asm/unistd.h> +#define rmb() asm volatile("bcr 15,0" ::: "memory") +#define cpu_relax() asm volatile("" ::: "memory"); +#endif + +#ifdef __sh__ +#include <asm/unistd.h> +#if defined(__SH4A__) || defined(__SH5__) +# define rmb() asm volatile("synco" ::: "memory") +#else +# define rmb() asm volatile("" ::: "memory") +#endif +#define cpu_relax() asm volatile("" ::: "memory") +#endif + +#ifdef __hppa__ +#include <asm/unistd.h> +#define rmb() asm volatile("" ::: "memory") +#define cpu_relax() asm volatile("" ::: "memory"); +#endif + +#ifdef __sparc__ +#include <asm/unistd.h> +#define rmb() asm volatile("":::"memory") +#define cpu_relax() asm volatile("":::"memory") +#endif + +#ifdef __alpha__ +#include <asm/unistd.h> +#define rmb() asm volatile("mb" ::: "memory") +#define cpu_relax() asm volatile("" ::: "memory") +#endif + +#ifdef __ia64__ +#include <asm/unistd.h> +#define rmb() asm volatile ("mf" ::: "memory") +#define cpu_relax() asm volatile ("hint @pause" ::: "memory") +#endif + +#ifdef __arm__ +#include <asm/unistd.h> +/* + * Use the __kuser_memory_barrier helper in the CPU helper page. 
See + * arch/arm/kernel/entry-armv.S in the kernel source for details. + */ +#define rmb() ((void(*)(void))0xffff0fa0)() +#define cpu_relax() asm volatile("":::"memory") +#endif + +#ifdef __mips__ +#include <asm/unistd.h> +#define rmb() asm volatile( \ + ".set mips2\n\t" \ + "sync\n\t" \ + ".set mips0" \ + : /* no output */ \ + : /* no input */ \ + : "memory") +#define cpu_relax() asm volatile("" ::: "memory") +#endif + + +#endif // OPERF_H_ diff --git a/pe_profiling/operf_counter.cpp b/pe_profiling/operf_counter.cpp new file mode 100644 index 0000000..1d2b74d --- /dev/null +++ b/pe_profiling/operf_counter.cpp @@ -0,0 +1,488 @@ +/** + * @file operf_counter.cpp + * C++ class implementation that abstracts the user-to-kernel interface + * for using Linux Performance Events Subsystem. + * + * @remark Copyright 2011 OProfile authors + * @remark Read the file COPYING + * + * Created on: Dec 7, 2011 + * @author Maynard Johnson + * (C) Copyright IBM Corp. 2011 + * + */ + +#include <unistd.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <signal.h> +#include <errno.h> +#include <string.h> +#include <iostream> +#ifdef HAVE_LIBPFM +#include <perfmon/pfmlib.h> +#endif +#include <stdlib.h> +#include "op_events.h" +#include "operf_counter.h" +#include "cverb.h" +#include "operf_process_info.h" +#include "operf.h" + +using namespace std; +using namespace OP_perf_utils; + + +volatile bool quit; +int sample_reads; +unsigned int pagesize; +verbose vperf("perf_events"); + +namespace { + +vector<string> event_names; + +static const char *__op_magic = "OPFILE"; + +#define OP_MAGIC (*(u64 *)__op_magic) + +} // end anonymous namespace + +operf_counter::operf_counter(operf_event_t evt) { + memset(&attr, 0, sizeof(attr)); + attr.size = sizeof(attr); + attr.sample_type = OP_BASIC_SAMPLE_FORMAT; + attr.type = PERF_TYPE_RAW; + attr.config = evt.evt_code; + attr.sample_period = evt.count; + attr.inherit = 1; + attr.enable_on_exec = 1; + attr.disabled = 1; + attr.exclude_idle = 1; + 
attr.exclude_kernel = evt.no_kernel; + attr.exclude_hv = evt.no_hv; + attr.sample_id_all = 0; + attr.read_format = PERF_FORMAT_ID; + event_name = evt.name; +} + +operf_counter::~operf_counter() { +} + + +int operf_counter::perf_event_open(pid_t ppid, int cpu, unsigned event, operf_record * rec) +{ + struct { + u64 count; + u64 id; + } read_data; + + if (event == 0) { + attr.mmap = 1; + attr.comm = 1; + attr.mmap_data = 1; + } + + fd = op_perf_event_open(&attr, ppid, cpu, -1, 0); + if (fd < 0) { + int ret = -1; + cverb << vperf << "perf_event_open failed: " << strerror(errno) << endl; + if (errno == EBUSY) { + cerr << "The performance monitoring hardware reports EBUSY. Is another profiling tool in use?" << endl + << "On some architectures, tools such as oprofile and perf being used in system-wide " + << "mode can cause this problem." << endl; + ret = OP_PERF_HANDLED_ERROR; + } else if (errno == ESRCH) { + cerr << "!!!! No samples collected !!!" << endl; + cerr << "The target program/command ended before profiling was started." 
<< endl; + ret = OP_PERF_HANDLED_ERROR; + } + return ret; + } + if (read(fd, &read_data, sizeof(read_data)) == -1) { + perror("Error reading perf_event fd"); + return -1; + } + rec->register_perf_event_id(event, read_data.id, attr); + + cverb << vperf << "perf_event_open returned fd " << fd << endl; + return fd; +} + +operf_record::~operf_record() +{ + cverb << vperf << "operf_record::~operf_record()" << endl; + opHeader.data_size = total_bytes_recorded; + if (total_bytes_recorded) + write_op_header_info(); + delete[] poll_data; + close(outputFile); + samples_array.clear(); + evts.clear(); + perfCounters.clear(); +} + +operf_record::operf_record(string outfile, pid_t the_pid, vector<operf_event_t> & events) +{ + int flags = O_CREAT|O_RDWR|O_TRUNC; + struct sigaction sa; + sigset_t ss; + + pid = the_pid; + total_bytes_recorded = 0; + poll_count = 0; + evts = events; + valid = false; + + opHeader.data_size = 0; + outputFile = open(outfile.c_str(), flags, S_IRUSR|S_IWUSR); + if (outputFile < 0) { + string errmsg = "Internal error: Could not create output file. 
errno is "; + errmsg += strerror(errno); + throw runtime_error(errmsg); + } + cverb << vperf << "operf_record ctor: successfully opened output file " << outfile << endl; + + memset(&sa, 0, sizeof(struct sigaction)); + sa.sa_sigaction = op_perfrecord_sigusr1_handler; + sigemptyset(&sa.sa_mask); + sigemptyset(&ss); + sigaddset(&ss, SIGUSR1); + sigprocmask(SIG_UNBLOCK, &ss, NULL); + sa.sa_mask = ss; + sa.sa_flags = SA_NOCLDSTOP | SA_SIGINFO; + cverb << vperf << "calling sigaction" << endl; + if (sigaction(SIGUSR1, &sa, NULL) == -1) { + cverb << vperf << "operf_record ctor: sigaction failed; errno is: " + << strerror(errno) << endl; + _exit(EXIT_FAILURE); + } + cverb << vperf << "calling setup" << endl; + setup(); +} + + +void operf_record::register_perf_event_id(unsigned event, u64 id, perf_event_attr attr) +{ + // It's overkill to blindly do this assignment below every time, since this function + // is invoked once for each event for each cpu; but it's not worth the bother of trying + // to avoid it. 
+ opHeader.h_attrs[event].attr = attr; + cverb << vperf << "Perf header: id = " << hex << (unsigned long long)id << " for event num " + << event << ", code " << attr.config << endl; + opHeader.h_attrs[event].ids.push_back(id); +} + +void operf_record::write_op_header_info() +{ + struct OP_file_header f_header; + struct op_file_attr f_attr; + + lseek(outputFile, sizeof(f_header), SEEK_SET); + + for (unsigned i = 0; i < evts.size(); i++) { + opHeader.h_attrs[i].id_offset = lseek(outputFile, 0, SEEK_CUR); + add_to_total(op_write_output(outputFile, &opHeader.h_attrs[i].ids[0], + opHeader.h_attrs[i].ids.size() * sizeof(u64))); + } + + opHeader.attr_offset = lseek(outputFile, 0, SEEK_CUR); + + for (unsigned i = 0; i < evts.size(); i++) { + struct op_header_evt_info attr = opHeader.h_attrs[i]; + f_attr.attr = attr.attr; + f_attr.ids.offset = attr.id_offset; + f_attr.ids.size =attr.ids.size() * sizeof(u64); + add_to_total(op_write_output(outputFile, &f_attr, sizeof(f_attr))); + } + + opHeader.data_offset = lseek(outputFile, 0, SEEK_CUR); + + f_header.magic = OP_MAGIC; + f_header.size = sizeof(f_header); + f_header.attr_size = sizeof(f_attr); + f_header.attrs.offset = opHeader.attr_offset; + f_header.attrs.size = evts.size() * sizeof(f_attr); + f_header.data.offset = opHeader.data_offset; + f_header.data.size = opHeader.data_size; + + lseek(outputFile, 0, SEEK_SET); + add_to_total(op_write_output(outputFile, &f_header, sizeof(f_header))); + lseek(outputFile, opHeader.data_offset + opHeader.data_size, SEEK_SET); +} + +int operf_record::prepareToRecord(int counter, int cpu, int fd) +{ + struct mmap_data md;; + + md.counter = counter; + md.prev = 0; + md.mask = NUM_MMAP_PAGES * pagesize - 1; + + fcntl(fd, F_SETFL, O_NONBLOCK); + + poll_data[cpu * evts.size() + counter].fd = fd; + poll_data[cpu * evts.size() + counter].events = POLLIN; + poll_count++; + + md.base = mmap(NULL, (NUM_MMAP_PAGES + 1) * pagesize, + PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (md.base == 
MAP_FAILED) { + perror("failed to mmap"); + return -1; + } + samples_array[cpu].push_back(md); + ioctl(fd, PERF_EVENT_IOC_ENABLE); + + return 0; +} + + +void operf_record::setup() +{ + bool all_cpus_avail = true; + int rc = 0; + struct dirent *entry = NULL; + DIR *dir = NULL; + string err_msg; + char cpus_online[129]; + + cverb << vperf << "operf_record::setup()" << endl; + pagesize = sysconf(_SC_PAGE_SIZE); + num_cpus = sysconf(_SC_NPROCESSORS_ONLN); + if (!num_cpus) + throw runtime_error("Number of online CPUs is zero; cannot continue"); + + poll_data = new struct pollfd [evts.size() * num_cpus]; + + cverb << vperf << "calling perf_event_open for pid " << pid << " on " + << num_cpus << " cpus" << endl; + FILE * online_cpus = fopen("/sys/devices/system/cpu/online", "r"); + if (!online_cpus) { + err_msg = "Internal Error: Number of online cpus cannot be determined."; + rc = -1; + goto error; + } + memset(cpus_online, 0, sizeof(cpus_online)); + fgets(cpus_online, sizeof(cpus_online), online_cpus); + if (!cpus_online[0]) { + fclose(online_cpus); + err_msg = "Internal Error: Number of online cpus cannot be determined."; + rc = -1; + goto error; + } + if (index(cpus_online, ',')) { + all_cpus_avail = false; + dir = opendir("/sys/devices/system/cpu"); + } + fclose(online_cpus); + + for (int cpu = 0; cpu < num_cpus; cpu++) { + int real_cpu; + if (all_cpus_avail) { + real_cpu = cpu; + } else { + real_cpu = op_get_next_online_cpu(dir, entry); + if (real_cpu < 0) { + closedir(dir); + err_msg = "Internal Error: Number of online cpus cannot be determined."; + rc = -1; + goto error; + } + } + + // Create new row to hold operf_counter objects since we need one + // row for each cpu. Do the same for samples_array. 
+ vector<struct mmap_data> tmp_mdvec; + vector<operf_counter> tmp_pcvec; + + samples_array.push_back(tmp_mdvec); + perfCounters.push_back(tmp_pcvec); + for (unsigned event = 0; event < evts.size(); event++) { + evts[event].counter = event; + perfCounters[cpu].push_back(operf_counter(evts[event])); + if ((rc = perfCounters[cpu][event].perf_event_open(pid, real_cpu, event, this)) < 0) { + err_msg = "Internal Error. Perf event setup failed."; + goto error; + } + if (((rc = prepareToRecord(event, cpu, perfCounters[cpu][event].get_fd()))) < 0) { + err_msg = "Internal Error. Perf event setup failed."; + goto error; + } + } + } + if (!all_cpus_avail) + closedir(dir); + write_op_header_info(); + + op_record_process_info(pid, this, outputFile); + // Set bit to indicate we're set to go. + valid = true; + return; + +error: + delete[] poll_data; + close(outputFile); + if (rc != OP_PERF_HANDLED_ERROR) + throw runtime_error(err_msg); +} + +void operf_record::recordPerfData(void) +{ + while (1) { + int prev = sample_reads; + + for (int cpu = 0; cpu < num_cpus; cpu++) { + for (unsigned int evt = 0; evt < evts.size(); evt++) { + if (samples_array[cpu][evt].base) + op_get_kernel_event_data(&samples_array[cpu][evt], this); + } + } + if (quit) + break; + + if (prev == sample_reads) { + poll(poll_data, poll_count, -1); + } + + if (quit) { + for (int i = 0; i < num_cpus; i++) { + for (unsigned int evt = 0; evt < evts.size(); evt++) + ioctl(perfCounters[i][evt].get_fd(), PERF_EVENT_IOC_DISABLE); + } + } + } + cverb << vdebug << "operf recording finished." 
<< endl; +} + +void operf_read::init(string infilename, string samples_loc, op_cpu cputype, vector<operf_event_t> & events) +{ + inputFname = infilename; + sampledir = samples_loc; + evts = events; + cpu_type = cputype; +} + +operf_read::~operf_read() +{ + evts.clear(); +} + +int operf_read::readPerfHeader(void) +{ + int ret = 0; + + opHeader.data_size = 0; + istrm.open(inputFname.c_str(), ios_base::in); + if (!istrm.good()) { + return -1; + } + istrm.peek(); + if (istrm.eof()) { + cverb << vperf << "operf_read::readPerfHeader: Empty profile data file." << endl; + valid = false; + return OP_PERF_HANDLED_ERROR; + } + cverb << vperf << "operf_read: successfully opened input file " << inputFname << endl; + read_op_header_info_with_ifstream(); + valid = true; + cverb << vperf << "Successfully read perf header" << endl; + + return ret; +} + +void operf_read::r... [truncated message content] |
From: Maynard J. <may...@us...> - 2012-01-14 20:55:15
|
On 12/29/2011 02:06 PM, Maynard Johnson wrote: > The attached patch (excuse the size) is a proof of concept port of OProfile to the Linux kernel's > Performance Events Subsystem (aka "perf_events"). This enhancement allows users to profile an > application without the need for root authority. This patch fairly closely follows the proposal > laid out in my Oct 27 posting (subject: "[RFC - v2] Porting oprofile to perf_events"), although > not all features presented in that proposal have been implemented yet. To try out this patch, > apply to the current source tree and build as usual. The configure script checks if the running > kernel supports perf_events; if so, it will check for /usr/include/linux/perf_event.h (typically > available in a kernel headers package) -- these two checks must pass in order to build the code > for the new "operf" program. Either way, legacy oprofile (using opcontrol) will still be built > and can be used as normal. > > To use operf, do the following: > > operf --help > operf --reset [--events=<comma_separated_list_of_event_specs>] <app> [<app_args>] > > If no event spec is given, the default event/count will be used. By default, operf runs in a > "--separate=lib,thread" mode. There is no option to disable this mode, so when profiling multi- > threaded apps, you will likely want to use "--merge=tgid" when generating reports. When operf > completes, it prints the following message: > > Use '--session-dir=<current_dir>/oprofile_data' > with opreport and other post processing tools to view your profile data. > > > Some major pieces of work that are still needed are as follows: > > 1) Add statistics gathering similar to daemon/opd_stats (as described in the Oct 27 proposal). > > 2) Add support for vdso and anon mmaps. > > 3) Add support for profiling the kernel. > > 4) Add support for callgraph profiling. > > 5) Add support for profiling by PID. > > 6) Add support for system-wide profiling. 
> > 7) Add support for extended features (e.g, AMD IBS -- Is there support for IBS upstream in > perf_events yet?). > > 8) Flesh out cpu type support. Currently, operf (when run by itself) is able to ascertain the > cpu type for only ppc64 processors and Intel processors that support architectural perfmon. > If operf fails with the message . . . > > Unable to open cpu_type file for reading > Make sure you have done opcontrol --init Adding to the list . . . 9) After we have a more complete implementation, the documentation and man pages will need updating. I created a new branch called 'perf-events' in the shared git repo. I also pushed a fix upstream to that branch today that addresses some problems I ran into when building/running on an early perf_events kernel. Please refresh your local git repos and checkout/test the perf-events branch. Thanks. -Maynard > > you can do 'opcontrol --init' as a workaround since operf will use /dev/oprofile/cpu_type if it > exists. operf needs the cpu type when asking ophelp to validate a passed event spec or for > finding the default event when no event spec is passed. Fleshing out the cpu type support will > probably require help from some knowledgeable people for each supported architecture. > > I would like to get some volunteers to help with the above tasks. Please join in the fun by > either helping with some development or trying out the new code. Feedback would be greatly > appreciated. > > Thanks. > -Maynard > |
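[Editor's note on the /dev/oprofile/cpu_type workaround mentioned above: the fallback amounts to reading a one-line cpu type string (e.g. "ppc64/power7") from that file when opcontrol --init has mounted it. A rough sketch follows; the path is a function parameter purely for illustration and testing, and the real detection logic in op_get_cpu_type is considerably more involved.]

```cpp
#include <cstdio>
#include <cstring>
#include <string>

// Hypothetical sketch: read a one-line cpu type string from a file
// such as /dev/oprofile/cpu_type. Returns "" if the file is absent
// or empty, in which case the caller falls back to other detection.
static std::string read_cpu_type_file(const char * path)
{
	char buf[64];
	FILE * fp = fopen(path, "r");
	if (!fp)
		return "";
	if (!fgets(buf, sizeof(buf), fp)) {
		fclose(fp);
		return "";
	}
	fclose(fp);
	buf[strcspn(buf, "\n")] = '\0';	// strip trailing newline, if any
	return std::string(buf);
}
```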
From: Maynard J. <may...@us...> - 2012-01-16 20:54:24
|
Added oprofile-list to cc. On 01/16/2012 09:10 AM, William Cohen wrote: > On 01/14/2012 03:54 PM, Maynard Johnson wrote: >> On 12/29/2011 02:06 PM, Maynard Johnson wrote: >>> The attached patch (excuse the size) is a proof of concept port of OProfile to the Linux kernel's >>> Performance Events Subsystem (aka "perf_events"). This enhancement allows users to profile an >>> application without the need for root authority. This patch fairly closely follows the proposal >>> laid out in my Oct 27 posting (subject: "[RFC - v2] Porting oprofile to perf_events"), although >>> not all features presented in that proposal have been implemented yet. To try out this patch, >>> apply to the current source tree and build as usual. The configure script checks if the running >>> kernel supports perf_events; if so, it will check for /usr/include/linux/perf_event.h (typically >>> available in a kernel headers package) -- these two checks must pass in order to build the code >>> for the new "operf" program. Either way, legacy oprofile (using opcontrol) will still be built >>> and can be used as normal. >>> >>> To use operf, do the following: >>> >>> operf --help >>> operf --reset [--events=<comma_separated_list_of_event_specs>] <app> [<app_args>] >>> >>> If no event spec is given, the default event/count will be used. By default, operf runs in a >>> "--separate=lib,thread" mode. There is no option to disable this mode, so when profiling multi- >>> threaded apps, you will likely want to use "--merge=tgid" when generating reports. When operf >>> completes, it prints the following message: >>> >>> Use '--session-dir=<current_dir>/oprofile_data' >>> with opreport and other post processing tools to view your profile data. > > Hi Maynard, > > Would it be possible to make this work more in the manner that the oprofile uses the ia64 perfmon support? This would allow people to use the same oprofile data collection scripts as before rather than using a new command. 
This would make the perf counters work as systemwide, so it would also address 3 and 6, but make it so people would need to be root. Will, Thanks for the feedback. If I understand you correctly, you are suggesting we keep oprofile's mode of operation as system-wide only, and modify it to use the perf_events subsystem. As far as a "system-wide *only*" mode . . . perhaps you were unable to find time to review the RFC design proposal I posted on October 18, but one of the main points I make in regards to the rationale for doing the port was to provide non-root profiling of a user's own application. I believe this to be critical function for the future of oprofile, in particular for its use on server systems where normal developers would not have root authority. However, if there is no backing for this concept in the oprofile community, then I should pull the plug on this effort and not waste any more time. (Hint to all mailing list readers: If you want non-root profiling in oprofile, please SPEAK UP NOW.) Assuming for the moment that there *is* enough support in the community for non-root profiling . . . Let me address the part of your comment about making the port work more like the existing oprofile, in a similar fashion to the ia64 perfmon userspace code. It's certainly possible to do it this way, although a perf-events base will require a complete change of the user-to-kernel interface, unlike the perfmon base, which is able to retain the interface to the oprofile kernel driver for reading sample data. But as I explained in my Oct 26 followup to the Oct 18 posting, using the existing opcontrol method of 1. Set up profiling 2. Start profiling 3. Start application to be profiled 4. Stop profiling is less than ideal when applied to profiling a single application (see Oct 26 posting for reasons why). As a matter of fact, this multi-step process has been one of the issues I've heard people complain about (usage complexity). 
However, I am very open to the idea of converting opcontrol (where it would be used only in system-wide mode) to use the perf_events. Thus, users can get the benefits of perf_events (smoother integration with other perf_events users, like NMI watchdog) and still use their existing profiling scripts. We may wish to add an option to force the use of "legacy" mode, at least initially. But for non-root profiling of a single application, the user would use the new operf program. Does that sound reasonable? > >>> Some major pieces of work that are still needed are as follows: >>> >>> 1) Add statistics gathering similar to daemon/opd_stats (as described in the Oct 27 proposal). >>> >>> 2) Add support for vdso and anon mmaps. >>> >>> 3) Add support for profiling the kernel. >>> >>> 4) Add support for callgraph profiling. >>> >>> 5) Add support for profiling by PID. >>> >>> 6) Add support for system-wide profiling. >>> >>> 7) Add support for extended features (e.g, AMD IBS -- Is there support for IBS upstream in >>> perf_events yet?). >>> >>> 8) Flesh out cpu type support. Currently, operf (when run by itself) is able to ascertain the >>> cpu type for only ppc64 processors and Intel processors that support architectural perfmon. >>> If operf fails with the message . . . >>> >>> Unable to open cpu_type file for reading >>> Make sure you have done opcontrol --init >> Adding to the list . . . > > Maybe change how op_cpu_specific_type() is used. Make the function make an attempt at guessing the processor type without looking at the /dev/oprofile/cpu_type. Expand this function to provide information for architecture in addition to x86. Yes, I did make some code modifications in this area, but it's only a start. So for ppc64 and Intel processors supporting arch_perfmon, we do *not* need /dev/oprofile to be mounted. Since most of my experience has been with the ppc64 arch, I'll need help from community members familiar with other arch's to flesh this out. Thanks! 
-Maynard > >> 9) After we have a more complete implementation, the documentation and man pages will need updating. >> >> I created a new branch called 'perf-events' in the shared git repo. I also pushed a fix upstream to that branch today that addresses some problems I ran into when building/running on an early perf_events kernel. Please refresh your local git repos and checkout/test the perf-events branch. >> >> Thanks. >> -Maynard >>> >>> you can do 'opcontrol --init' as a workaround since operf will use /dev/oprofile/cpu_type if it >>> exists. operf needs the cpu type when asking ophelp to validate a passed event spec or for >>> finding the default event when no event spec is passed. Fleshing out the cpu type support will >>> probably require help from some knowledgeable people for each supported architecture. >>> >>> I would like to get some volunteers to help with the above tasks. Please join in the fun by >>> either helping with some development or trying out the new code. Feedback would be greatly >>> appreciated. >>> >>> Thanks. >>> -Maynard >>> > > -Will > |
From: Robert R. <rob...@am...> - 2012-02-10 15:54:19
|
On 16.01.12 14:54:13, Maynard Johnson wrote: > On 01/16/2012 09:10 AM, William Cohen wrote: > > On 01/14/2012 03:54 PM, Maynard Johnson wrote: > >> On 12/29/2011 02:06 PM, Maynard Johnson wrote: > >>> operf --help > >>> operf --reset [--events=<comma_separated_list_of_event_specs>] <app> [<app_args>] > > Would it be possible to make this work more in the manner that > > the oprofile uses the ia64 perfmon support? This would allow > > people to use the same oprofile data collection scripts as before > > rather than using a new command. This would make the perf > > counters work as systemwide, so it would also address 3 and 6, but > > make it so people would need to be root.

I also lean toward first adding support for the oprofile daemon to use the perf syscall interface. That way, we don't introduce new user interfaces.

This would also allow non-root profiling, e.g. by profiling the daemon's parent process and its children, which should all be owned by the user. The user could then use oprofile in the same way as if they were root.

However, I am not against having operf, but I fear a new user interface to use oprofile will increase development and maintenance efforts while the advantage we get is not essential.

> >>> 7) Add support for extended features (e.g, AMD IBS -- Is there support for IBS upstream in > >>> perf_events yet?).

No, IBS is not yet upstream. Kernel patches have been accepted, but perf tool support needs to be improved to get the feature in. See here for the latest patch set:

http://git.kernel.org/?p=linux/kernel/git/rric/oprofile.git;a=shortlog;h=refs/heads/perf-ibs

Another point is the use of /usr/include/linux/perf_event.h. I would prefer to work with the perf maintainers to have some sort of libperf which we could link against. We would then be more independent of the kernel version/sources currently installed. A library can better handle the user/kernel interface. It would also keep the perf maintainers from changing the interface too frequently. 
There is already some initial code for this, but it is not yet ready to use. -Robert -- Advanced Micro Devices, Inc. Operating System Research Center |
From: Maynard J. <may...@us...> - 2012-02-10 17:56:24
|
On 02/10/2012 09:54 AM, Robert Richter wrote: > On 16.01.12 14:54:13, Maynard Johnson wrote: >> On 01/16/2012 09:10 AM, William Cohen wrote: >>> On 01/14/2012 03:54 PM, Maynard Johnson wrote: >>>> On 12/29/2011 02:06 PM, Maynard Johnson wrote: >>>>> operf --help >>>>> operf --reset [--events=<comma_separated_list_of_event_specs>] <app> [<app_args>] > >>> Would it be possible to make this work more in the manner that >>> the oprofile uses the ia64 perfmon support? This would allow >>> people to use the same oprofile data collection scripts as before >>> rather than using a new command. This would make the perf >>> counters work as systemwide, so it would also address 3 and 6, but >>> make it so people would need to be root. > > I also tend to first have support of the oprofile daemon to use the > perf syscall interface. Doing so we don't introduce new user > interfaces. > > This would also allow non-root profiling e.g. by profiling the > daemon's parent process and its children which all should be owned by > the user. The user could use oprofile then in the same way as if being > root. > > However, I am not against having operf, but I fear a new user > interface to use oprofile will increase development and maintenance > efforts while the advantage we get is not essential. Robert, Thanks for the feedback. As you may recall, my first proposal was to do just what you suggest. But after getting deeper into the details, I saw a lot of drawbacks to that design. On Oct 26 (subject: "Re: [RFC] Porting oprofile to perf_events"), I described some of the problems of this approach versus a new, separate program. Some of the issues include: - Most of the function in opcontrol requires root authority. Using opcontrol to try to do non-root profiling of a single app would add a lot of complexity to an already pretty fragile script. - Currently, the profiling parameters are cached in /root/.oprofile/daemonrc. 
If we were to allow non-root execution of opcontrol, we would have to store profiling parameters in a different location -- one where the user has write access. Then if the same user is sometimes doing single app profiling and other times system-wide profiling (where root authority is required), the two places where profiling parameters have been stored could lead to confusion. - If we retained the daemon behaviour when profiling a single app, output from that app would get spewed to the display in rather ugly fashion. - As users of perf know, when you profile a single application with 'perf record', perf simply ends when the app finishes or when the user presses ctl-C. This wouldn't be the case with a daemon-based mechanism. The user would have to run 'opcontrol --shutdown' to finish their profiling session. As for development and maintenance efforts . . . I'm very confident that the development of my proof-of-concept implementation (posted on Dec 29) would have been much more difficult had I tried to shoe-horn it into opcontrol. Maintenance of a separate program has its advantages and disadvantages. Most of the code in the new 'operf' program is new, so for the most part, the separation makes maintenance simpler. However, there are a few files I copied from the daemon directory and then modified to work specifically for this new model. Once we have consensus on a direction (and assuming we keep the separate 'operf' program), we should take a look at the feasibility of librarizing those files and generalizing their function to work with either the daemon or with operf. > >>>>> 7) Add support for extended features (e.g, AMD IBS -- Is there support for IBS upstream in >>>>> perf_events yet?). > > No, IBS is not yet upstream. Kernel patches are actually accepted, but > perf tool support needs to be improved to get the feature in. 
See here > for the latest patch set: > > http://git.kernel.org/?p=linux/kernel/git/rric/oprofile.git;a=shortlog;h=refs/heads/perf-ibs > > Another point is the use of /usr/include/linux/perf_event.h. I would > rather prefer to work with the perf maintainers to have some sort of > libperf which we could link against. We are then more independent from > the current kernel version/sources installed. A library can better > handle the user/kernel interface. It would also prevent perf > maintainers from changing the interface too frequent. There is already > some initial code for this, but still it is not ready to use. Definitely, a library would be ideal. But for now, I think we're fairly safe with using the perf_event.h file. The kernel guys are cognizant of trying to keep the ABI backward compatible. OProfile's needs are pretty basic -- everything we need insofar as the kernel interface and the sample data has been available since early versions of the perf_events design. Thanks! -Maynard > > -Robert > |
From: Robert R. <rob...@am...> - 2012-02-13 11:46:13
|
On 10.02.12 11:55:56, Maynard Johnson wrote: > - Most of the function in opcontrol requires root authority. Using > opcontrol to try to do non-root profiling of a single app would > add a lot of complexity to an already pretty fragile script. > - Currently, the profiling parameters are cached in > /root/.oprofile/daemonrc. If we were to allow non-root > execution of opcontrol, we would have to store profiling > parameters in a different location -- one where the user has > write access. Then if the same user is sometimes doing single > app profiling and other times system-wide profiling (where root > authority is required), the two places where profiling > parameters have been stored could lead to confusion. $HOME/.oprofile/daemonrc would serve both cases. > - If we retained the daemon behaviour when profiling a single app, > output from that app would get spewed to the display in rather > ugly fashion. > - As users of perf know, when you profile a single application > with 'perf record', perf simply ends when the app finishes or > when the user presses ctl-C. This wouldn't be the case with a > daemon-based mechanism. The user would have to run 'opcontrol > --shutdown' to finish their profiling session. > > As for development and maintenance efforts . . . I'm very confident > that the development of my proof-of-concept implementation (posted > on Dec 29) would have been much more difficult had I tried to > shoe-horn it into opcontrol. Maintenance of a separate program has > its advantages and disadvantages. My point is more the introduction of a new user i/f with operf. This will change the way of using oprofile. We have then daemon and single-app modes which introduces more complexity. Just for switching to the perf syscall we actually don't need this. This is the concern I have with this. With switching to perf we could provide single app and non-root profiling. But as single app requires a new user interface, we should decide if we really want and need this. 
I agree with you that it is better to have perf code separate from the daemon in operf and that we want to keep changes to the opcontrol script small. But we should support the current daemon mode with the perf kernel i/f. To do so, I suggest the following:

* Use operf to implement perf syscall kernel access.

* Use opcontrol to drive operf in daemon mode. If the kernel is accessed via the perf i/f, opcontrol is used as a wrapper and user requests should be passed through it to operf. The opcontrol user interface should not change. operf should implement behaviour compatible with the daemon (similar data files, etc.).

* operf should not be used directly by the user; in particular, I wouldn't introduce a new single-app interface. Maybe it is there, but I wouldn't officially support it.

Once we have converted the daemon mode to use perf, we could introduce a single-app interface in a second step.

Hope this makes sense.

-Robert -- Advanced Micro Devices, Inc. Operating System Research Center |
From: Maynard J. <may...@us...> - 2012-02-13 15:49:56
|
Robert, Thanks for your time to comment on this. My replies are below. -Maynard On 02/13/2012 05:45 AM, Robert Richter wrote: > On 10.02.12 11:55:56, Maynard Johnson wrote: >> - Most of the function in opcontrol requires root authority. Using >> opcontrol to try to do non-root profiling of a single app would >> add a lot of complexity to an already pretty fragile script. >> - Currently, the profiling parameters are cached in >> /root/.oprofile/daemonrc. If we were to allow non-root >> execution of opcontrol, we would have to store profiling >> parameters in a different location -- one where the user has >> write access. Then if the same user is sometimes doing single >> app profiling and other times system-wide profiling (where root >> authority is required), the two places where profiling >> parameters have been stored could lead to confusion. > > $HOME/.oprofile/daemonrc would serve both cases. True, but the scenario that concerns me is where a user sometimes uses the non-root mode and sets up certain profiling parameters (using opcontrol) and then later wants to do a system-wide profile (so is operating as root). The user may expect that the profiling parameters previously set up when doing non-root profiling would still be in effect, but that would not be the case. > >> - If we retained the daemon behaviour when profiling a single app, >> output from that app would get spewed to the display in rather >> ugly fashion. >> - As users of perf know, when you profile a single application >> with 'perf record', perf simply ends when the app finishes or >> when the user presses ctl-C. This wouldn't be the case with a >> daemon-based mechanism. The user would have to run 'opcontrol >> --shutdown' to finish their profiling session. >> >> As for development and maintenance efforts . . . I'm very confident >> that the development of my proof-of-concept implementation (posted >> on Dec 29) would have been much more difficult had I tried to >> shoe-horn it into opcontrol. 
Maintenance of a separate program has >> its advantages and disadvantages. > > My point is more the introduction of a new user i/f with operf. This > will change the way of using oprofile. We have then daemon and > single-app modes which introduces more complexity. Actually, the design is for the operf program to support both single-app and system-wide mode. The "legacy" opcontrol way of running oprofile would still be available to users, but on systems where perf_events is available, a message is printed that recommends they use operf. The "legacy" mode would continue to use the oprofile kernel driver -- at least, initially. The benefit of doing so is not only to support older processors that don't have perf_events support, but can also be used in situations where perf_events kernel code is broken or buggy (as has happened in the early days with lots of new code being added). > Just for switching > to the perf syscall we actually don't need this. This is the concern I > have with this. > > With switching to perf we could provide single app and non-root > profiling. But as single app requires a new user interface, we should > decide if we really want and need this. > > I aggree with you that it is better to have perf code separate from > the daemon in operf and that we want to keep changes to the opcontrol > script small. But we should support the current daemon mode with the > perf kernel i/f. Doing so I suggest the following: > > * Using operf to implement perf syscall kernel access. > > * Using opcontrol to use operf in daemon mode. If the kernel is > accessed via the perf i/f, opcontrol is used as wrapper and user > requests should be passed through it to operf. The opcontrol user > interface should not change. Yes, ideally, it would be best if this enhancement would not require any interface changes. But I don't think that ideal is attainable, even if we stick with opcontrol as a wrapper. 
For an opcontrol-based design, three new options would need to be added to opcontrol – 'all-cpus', 'pid', and 'app-to-profile'. One could argue that we don't need 'all-cpus' and 'pid'; any extra stuff at the end of the opcontrol command would just be interpreted as an appname, followed by its arguments. But even then, you still have an interface change to support the single-app mode.

Additionally, for single-app mode (as I mentioned before), I really don't think it would be good to run the app via a profiling daemon. Take this scenario as an example:

1. 'opcontrol --start <my_app>' (the <my_app> program is now running in the background via the profiling daemon).
2. The user gets their command line back and starts doing some other work -- perhaps editing a file. Then the <my_app> program starts printing messages to stdout and/or stderr. These messages get spewed across the screen.

On the other hand, if we do *not* use a daemon when doing single-app profiling and, instead, run in a 'perf record' mode (where the profiling simply ends when the app completes or when the user presses ctl-c), we now have the situation where opcontrol [dump|stop|shutdown|deinit] is needed after doing a system-wide profile, but is not needed after profiling a single app. While this might not technically be considered an interface change, it's a change in operation that can lead to confusion.

> operf should implement compatible > behaviour as the daemon (similar data files, etc.).

Yes, that is the intended behaviour of operf.

> > * operf should not be used directly by the user, esp. I wouldn't > introduce a new single-app interface. Maybe it is there but I > wouldn't officially support it. > > Once we converted the daemon mode to use perf we could introduce a > single-app interface in a second step. > > Hope this makes sense. > > -Robert > |
From: William C. <wc...@re...> - 2012-02-13 20:54:08
|
On 02/13/2012 11:07 AM, Robert Richter wrote: > On 13.02.12 09:49:21, Maynard Johnson wrote: >> Yes, ideally, it would be best if this enhancement would not require >> any interface changes. But I don't think that ideal is attainable, >> even if we stick with opcontrol as a wrapper. For an >> opcontrol-based design, three new options would need to be added to >> opcontrol – 'all-cpus', 'pid', and 'app-to-profile'. One could >> argue that we don't need 'all-cpus' and 'pid'; any extra stuff at >> the end of the opcontrol command would just be interpreted as an >> appname, followed by its arguments. But even then, you still have >> an interface change to support the single-app mode. > >> Additionally, for single-app mode (as I mentioned before), I really >> don't think it would be good to run the app via a profiling daemon. >> Take this scenario for an example: >> >> 1. 'opcontrol --start <my_app>' (the <my_app> program is now running >> in the background via the profiling daemon. >> 2. The user gets their command line back and starts doing some >> other work -- perhaps editing a file. Then the <my_app> program >> starts printing messages to stdout and/or stderr. These messages >> get spewed across the screen. >> >> On the other hand, if we do *not* use a daemon when doing single-app >> profiling and, instead, run in a 'perf record' mode (where the >> profiling simply ends when the app completes or when the user >> presses ctl-c), we now have the situation where opcontrol >> [dump|stop|shutdown|deinit] is needed after doing a system-wide >> profile, but is not needed after profiling a single app. While this >> might not technically be considered an interface change, it's a >> change in operation that can lead to confusion. > > All the questions arise due to the introduction of the single-app > mode. That's the reason I only want to have current daemon mode to be > ported to use perf. There are no changes at all to the user interface > in this case. 
We still can introduce single-app mode in a second step > after we switched to perf. > > -Robert >

Hi all,

It is a rather high priority to eliminate OProfile's need for its own kernel driver. The oprofile kernel driver doesn't work well with the perf infrastructure in the new kernels, requiring people to do things like turn off nmi_watchdog. Also, the oprofile drivers are unlikely to work in guest VMs. Moving oprofile to the perf infrastructure will help address those problems. People that have been using oprofile in the past should be able to use their existing scripts for oprofile measurements and analysis without changes.

opcontrol is fragile and needs work, but that can be done after porting oprofile to perf.

-Will |
From: Maynard J. <may...@us...> - 2012-02-23 16:31:53
|
The public comments received so far on this PoC perf_events port (from Will and Robert) have two common concerns: - The opcontrol facility should be retained with no interface changes so as not to break users' oprofile scripts - The current system-wide behaviour of opcontrol should be ported from using the oprofile kernel driver to using perf_events These are valid points, and I agree with them. There has also been some discussion regarding the new single-app, non-root model of oprofile usage that I have implemented in my proof-of-concept. I've gotten support for this model in private chats with some of the main oprofile stakeholders (John, Suravee), as well as some lukewarm support from others. But in my opinion, this aspect of the port is key to keeping the oprofile project alive and healthy. OProfile has certain advantages over perf (symbolic event names for native events; better post-processing analysis tools; a large, active community), but the root authority requirement has always been problematic for many users. And now that perf has introduced single-app, non-root profiling, users are expecting the same from OProfile. OProfile must change or die. Here's my stake in the ground. The next release of OProfile will have these two new features: - Standard system-wide opcontrol profiling will use perf_events where available - A new 'operf' program will be introduced for those who want single-app, non-root profiling, but still want to use oprofile's analysis tools I will soon re-post my original (overly large) PoC patch as a patch series. The patch series will add the new 'operf' program for single-app profiling, but many of the techniques used there (interfacing with perf_events, converting perf data to oprofile sample data) will also be used for the conversion of opcontrol system-wide profiling to use perf_events versus the oprofile kernel driver. 
If either of these two new features are of interest and are important to you, I ask that you take the time to help review the patches. Thanks. -Maynard -------------------------------------------------------------------- On 02/13/2012 02:54 PM, William Cohen wrote: > On 02/13/2012 11:07 AM, Robert Richter wrote: >> On 13.02.12 09:49:21, Maynard Johnson wrote: >>> Yes, ideally, it would be best if this enhancement would not require >>> any interface changes. But I don't think that ideal is attainable, >>> even if we stick with opcontrol as a wrapper. For an >>> opcontrol-based design, three new options would need to be added to >>> opcontrol – 'all-cpus', 'pid', and 'app-to-profile'. One could >>> argue that we don't need 'all-cpus' and 'pid'; any extra stuff at >>> the end of the opcontrol command would just be interpreted as an >>> appname, followed by its arguments. But even then, you still have >>> an interface change to support the single-app mode. >> >>> Additionally, for single-app mode (as I mentioned before), I really >>> don't think it would be good to run the app via a profiling daemon. >>> Take this scenario for an example: >>> >>> 1. 'opcontrol --start <my_app>' (the <my_app> program is now running >>> in the background via the profiling daemon. >>> 2. The user gets their command line back and starts doing some >>> other work -- perhaps editing a file. Then the <my_app> program >>> starts printing messages to stdout and/or stderr. These messages >>> get spewed across the screen. >>> >>> On the other hand, if we do *not* use a daemon when doing single-app >>> profiling and, instead, run in a 'perf record' mode (where the >>> profiling simply ends when the app completes or when the user >>> presses ctl-c), we now have the situation where opcontrol >>> [dump|stop|shutdown|deinit] is needed after doing a system-wide >>> profile, but is not needed after profiling a single app. 
While this >>> might not technically be considered an interface change, it's a >>> change in operation that can lead to confusion. >> >> All the questions arise due to the introduction of the single-app >> mode. That's the reason I only want to have current daemon mode to be >> ported to use perf. There are no changes at all to the user interface >> in this case. We still can introduce single-app mode in a second step >> after we switched to perf. >> >> -Robert >> > > Hi all, > > It is rather high priority is to eliminate OProfile need for its own kernel driver. The oprofile kernel driver doesn't work well with the perf infrastructure in the new kernels, requiring people are having to do things like turn off nmi_watchdog. Also the oprofile drivers are unlikely to work in guest VMs. Moving oprofile to using the perf infrastructure will help address those problems. People that have been using oprofile in the past should be able to use their existing scripts for oprofile measurements and analysis without changes to their scripts. > > ocpontrol is fragile and needs work, but that can be done after porting oprofile to perf. > > -Will |
From: John L. <le...@mo...> - 2012-02-23 17:17:04
|
On Thu, Feb 23, 2012 at 10:30:09AM -0600, Maynard Johnson wrote: > Here's my stake in the ground. The next release of OProfile will have these two new features: > - Standard system-wide opcontrol profiling will use perf_events where available > - A new 'operf' program will be introduced for those who want single-app, non-root profiling, but still want to use oprofile's analysis tools This seems the only sensible route to take for oprofile to continue to be of use. regards john |
From: Robert R. <rob...@am...> - 2012-02-13 16:07:39
|
On 13.02.12 09:49:21, Maynard Johnson wrote: > Yes, ideally, it would be best if this enhancement would not require > any interface changes. But I don't think that ideal is attainable, > even if we stick with opcontrol as a wrapper. For an > opcontrol-based design, three new options would need to be added to > opcontrol – 'all-cpus', 'pid', and 'app-to-profile'. One could > argue that we don't need 'all-cpus' and 'pid'; any extra stuff at > the end of the opcontrol command would just be interpreted as an > appname, followed by its arguments. But even then, you still have > an interface change to support the single-app mode. > Additionally, for single-app mode (as I mentioned before), I really > don't think it would be good to run the app via a profiling daemon. > Take this scenario for an example: > > 1. 'opcontrol --start <my_app>' (the <my_app> program is now running > in the background via the profiling daemon. > 2. The user gets their command line back and starts doing some > other work -- perhaps editing a file. Then the <my_app> program > starts printing messages to stdout and/or stderr. These messages > get spewed across the screen. > > On the other hand, if we do *not* use a daemon when doing single-app > profiling and, instead, run in a 'perf record' mode (where the > profiling simply ends when the app completes or when the user > presses ctl-c), we now have the situation where opcontrol > [dump|stop|shutdown|deinit] is needed after doing a system-wide > profile, but is not needed after profiling a single app. While this > might not technically be considered an interface change, it's a > change in operation that can lead to confusion. All the questions arise due to the introduction of the single-app mode. That's the reason I only want to have current daemon mode to be ported to use perf. There are no changes at all to the user interface in this case. We still can introduce single-app mode in a second step after we switched to perf. 
-Robert -- Advanced Micro Devices, Inc. Operating System Research Center |
From: Maynard J. <may...@us...> - 2012-02-23 19:25:28
|
As I stated in a previous posting today, my stake in the ground for the next OProfile release includes the following new features: - Standard system-wide opcontrol profiling, using perf_events where available - A new 'operf' program for those who want single-app, non-root profiling, but still want to use oprofile's analysis tools The series of 5 patches to be posted is just a re-packaging of the original proof-of-concept patch I posted on Dec 29, so it currently includes *only* the new 'operf' program for single-app profiling. Once we have agreement on the overall strategy and techniques employed by operf, we will then port opcontrol system-wide profiling to perf_events, leveraging the techniques developed in operf. Please note that this is a "proof-of-concept" at this stage. There are a number of gaps that must be filled before this code is ready for upstream. Besides code review, I hope to get some community involvement in filling some of these gaps (described below). This process will move forward, one way or another, so I encourage as much involvement as possible up-front. Description ----------- This enhancement allows users to profile an application without the need for root authority. This patch fairly closely follows the proposal laid out in my Oct 27, 2011 posting (subject: "[RFC - v2] Porting oprofile to perf_events"), although not all features presented in that proposal have been implemented yet. To try out these patches, apply to the current source tree and build as usual. The whole patch series has also been added to a new git branch I created called 'perf-events' in the shared git repo, so you can do a clone and then 'git checkout --track origin/perf-events'. The configure script checks if the running kernel supports perf_events; if so, it will check for /usr/include/linux/perf_event.h (typically available in a kernel headers package) -- these two checks must pass in order to build the code for the new "operf" program. 
Either way, legacy oprofile (using opcontrol) will still be built and can be used as normal.

To use operf, do the following:

operf --help
operf --reset [--events=<comma_separated_list_of_event_specs>] <app> [<app_args>]

If no event spec is given, the default event/count will be used. By default, operf runs in a "--separate=lib,thread" mode. There is no option to disable this mode, so when profiling multi-threaded apps, you will likely want to use "--merge=tgid" when generating reports. When operf completes, it prints the following message:

Use '--session-dir=<current_dir>/oprofile_data'
with opreport and other post processing tools to view your profile data.

Gaps
-----
Some major pieces of work that are still needed are as follows:

1) Add statistics gathering similar to daemon/opd_stats (as described in the Oct 27 proposal).

2) Add support for vdso and anon mmaps.

3) Add support for JIT profiling.

4) Add support for profiling the kernel.

5) Add support for callgraph profiling.

6) Add support for profiling by PID.

7) Add support for system-wide profiling.

8) Add support for extended features (e.g., AMD IBS -- is there support for IBS upstream in perf_events yet?).

9) After we have a more complete implementation, the documentation and man pages will need updating.

10) Flesh out cpu type support. Currently, operf is able to ascertain the cpu type for Intel processors that support architectural perfmon and for ppc64 processors. For other processor types, operf fails with the message . . .

Unable to open cpu_type file for reading
Make sure you have done opcontrol --init

You can do 'opcontrol --init' as a workaround since operf will use /dev/oprofile/cpu_type if it exists. operf needs the cpu type when asking ophelp to validate a passed event spec or for finding the default event when no event spec is passed. Fleshing out the cpu type support will probably require help from some knowledgeable people for each supported architecture.
--------
Please get involved with this exciting transition of oprofile by testing and/or reviewing the patches, and by volunteering to fill in some of the above-mentioned gaps.

Thanks!
-Maynard
|
From: Maynard J. <may...@us...> - 2012-03-06 18:03:09
|
On 02/23/2012 1:24 PM, Maynard Johnson wrote:
> As I stated in a previous posting today, my stake in the ground for the next OProfile release includes the following new features:
> - Standard system-wide opcontrol profiling, using perf_events where available
> - A new 'operf' program for those who want single-app, non-root profiling, but still want to use oprofile's analysis tools
>
> The series of 5 patches to be posted is just a re-packaging of the original proof-of-concept patch I posted on Dec 29, so it currently includes *only* the new 'operf' program for single-app profiling. Once we have agreement on the overall strategy and techniques employed by operf, we will then port opcontrol system-wide profiling to perf_events, leveraging the techniques developed in operf.
>
> Please note that this is a "proof-of-concept" at this stage. There are a number of gaps that must be filled before this code is ready for upstream. Besides code review, I hope to get some community involvement in filling some of these gaps (described below). This process will move forward, one way or another, so I encourage as much involvement as possible up-front.
>
[snip]
> Gaps
> -----
> Some major pieces of work that are still needed are as follows:
>
[snip]
> 4) Add support for profiling the kernel.

I'm going to start working on the kernel profiling support now.

-Maynard
|
From: Maynard J. <may...@us...> - 2012-03-08 23:55:39
|
On 03/06/2012 12:02 PM, Maynard Johnson wrote:
> On 02/23/2012 1:24 PM, Maynard Johnson wrote:
>> As I stated in a previous posting today, my stake in the ground for the next OProfile release includes the following new features:
>> - Standard system-wide opcontrol profiling, using perf_events where available
>> - A new 'operf' program for those who want single-app, non-root profiling, but still want to use oprofile's analysis tools
>>
>> [snip]
>>
>> Gaps
>> -----
>> Some major pieces of work that are still needed are as follows:
>>
>> [snip]
>> 4) Add support for profiling the kernel.
>
> I'm going to start working on the kernel profiling support now.

I decided to do a little cleanup of error handling first. I also removed the restriction that perf_event_paranoid must be set to -1. operf now works with the default setting of perf_event_paranoid, like perf does. These changes have been pushed up to the shared perf-events branch.

-Maynard

> ------------------------------------------------------------------------------
> Keep Your Developer Skills Current with LearnDevNow!
> The most comprehensive online learning library for Microsoft developers
> is just $99.99! Visual Studio, SharePoint, SQL - plus HTML5, CSS3, MVC3,
> Metro Style Apps, more. Free future releases when you subscribe now!
> http://p.sf.net/sfu/learndevnow-d2d
> _______________________________________________
> oprofile-list mailing list
> opr...@li...
> https://lists.sourceforge.net/lists/listinfo/oprofile-list
|
From: Maynard J. <may...@us...> - 2012-03-14 14:08:15
|
On 03/08/2012 5:55 PM, Maynard Johnson wrote:
> On 03/06/2012 12:02 PM, Maynard Johnson wrote:
>> On 02/23/2012 1:24 PM, Maynard Johnson wrote:
>>> [snip]
>>> 4) Add support for profiling the kernel.
>>
>> I'm going to start working on the kernel profiling support now.
>
> I decided to do a little cleanup of error handling first. I also removed the restriction that perf_event_paranoid must be set to -1. operf now works with the default setting of perf_event_paranoid, like perf does. These changes have been pushed up to the shared perf-events branch.

By rights, I should have attached that cleanup patch to my previous message. Doing so now in case anyone has any comments.
---------------------------------------------

This patch makes error handling more robust by catching C++ runtime errors. I also removed the restriction that perf_event_paranoid must be set to -1. The number of mmap pages had to be decreased to get around the perf_event_paranoid restriction. Removing this restriction also removed the need for the handle_sys_values() function.

Signed-off-by: Maynard Johnson <may...@us...>
---
 pe_profiling/operf.cpp         | 110 +++++++++++++--------------------------
 pe_profiling/operf_counter.cpp |   5 +-
 pe_profiling/operf_counter.h   |   2 +-
 3 files changed, 41 insertions(+), 76 deletions(-)

diff --git a/pe_profiling/operf.cpp b/pe_profiling/operf.cpp
index d7c248d..f21c2b5 100644
--- a/pe_profiling/operf.cpp
+++ b/pe_profiling/operf.cpp
@@ -242,30 +242,36 @@ int start_profiling_app(void)
 	}
 	// setup operf recording
-	operf_record operfRecord(outputfile, app_PID, events);
-	if (operfRecord.get_valid() == false) {
-		/* If valid is false, it means that one of the "known" errors has
-		 * occurred:
-		 *   - profiled process has already ended
-		 *   - device or resource busy
-		 * Since an informative message has already been displayed to
-		 * the user, we don't want to blow chunks here; instead, we'll
-		 * exit gracefully. Clear out the operf.data file as an indication
-		 * ti the parent process that the profile data isn't valid.
-		 */
-		ofstream of;
-		of.open(outputfile.c_str(), ios_base::trunc);
-		of.close();
+	try {
+		operf_record operfRecord(outputfile, app_PID, events);
+		if (operfRecord.get_valid() == false) {
+			/* If valid is false, it means that one of the "known" errors has
+			 * occurred:
+			 *   - profiled process has already ended
+			 *   - device or resource busy
+			 * Since an informative message has already been displayed to
+			 * the user, we don't want to blow chunks here; instead, we'll
+			 * exit gracefully. Clear out the operf.data file as an indication
+			 * to the parent process that the profile data isn't valid.
+			 */
+			ofstream of;
+			of.open(outputfile.c_str(), ios_base::trunc);
+			of.close();
+			_exit(EXIT_SUCCESS);
+		}
+		// start recording
+		operfRecord.recordPerfData();
+		cerr << "Total bytes recorded from perf events: "
+		     << operfRecord.get_total_bytes_recorded() << endl;
+
+		operfRecord.~operf_record();
+		// done
 		_exit(EXIT_SUCCESS);
+	} catch (runtime_error re) {
+		cerr << "Caught runtime_error: " << re.what() << endl;
+		kill(app_PID, SIGKILL);
+		_exit(EXIT_FAILURE);
 	}
-	// start recording
-	operfRecord.recordPerfData();
-	cerr << "Total bytes recorded from perf events: "
-	     << operfRecord.get_total_bytes_recorded() << endl;
-
-	operfRecord.~operf_record();
-	// done
-	_exit(EXIT_SUCCESS);
 } else { // parent
 	int startup;
 	if (read(app_ready_pipe[0], &startup, sizeof(startup)) == -1) {
@@ -436,48 +442,6 @@ static void complete(void)
 	cleanup();
 }
-static int _get_sys_value(const char * filename)
-{
-	char str[10];
-	int _val = -999;
-	FILE * fp = fopen(filename, "r");
-	if (fp == NULL)
-		return _val;
-	if (!fgets(str, 9, fp))
-		return _val;
-	sscanf(str, "%d", &_val);
-	return _val;
-}
-
-static void handle_sys_values(void)
-{
-	int value;
-	// TODO: Iniital iterations of the perf tool required perf_event_paranoid to be -1
-	// in order for it to work with non-root users; however, this is not the case with
-	// recent distros (e.g., RHEL 6.2). Need to figure out how perf can work with a
-	// paranoid setting of '1' so that we don't have this restriction with operf.
-	value = _get_sys_value("/proc/sys/kernel/perf_event_paranoid");
-	if (value != -1) {
-		if (value == -999) {
-			cerr << "--------------------------------------------------------------------------" << endl;
-			cerr << "WARNING: operf requires the /proc/sys/kernel/perf_event_paranoid system value" << endl
-			     << " to be set to '-1', but we were unable to verify the setting." << endl;
-			if (errno)
-				cerr << "The following error message was received when trying to access this system value:" << endl
-				     << strerror(errno) << endl;
-			cerr << endl
-			     << "If you receive the following message:" << endl << endl
-			     << "\t\"failed to mmap: Operation not permitted\"" << endl << endl
-			     << "please ask your system administrator to change this value to '-1'." << endl;
-			cerr << "--------------------------------------------------------------------------" << endl;
-		} else {
-			cerr << "ERROR: operf requires the /proc/sys/kernel/perf_event_paranoid system value" << endl
-			     << " to be set to '-1', but the detected value is '" << value << "'." << endl
-			     << " Please ask your system administrator to change this value to '-1'." << endl;
-			exit(EXIT_FAILURE);
-		}
-	}
-}
 static int find_app_file_in_dir(const struct dirent * d)
 {
@@ -829,8 +793,7 @@ int main(int argc, char const *argv[])
 	} else if (rc == ENOENT) {
 		cerr << "Your kernel's Performance Events Subsystem does not support"
 		     << " your processor type." << endl;
-	} else if (rc == EACCES) {
-		handle_sys_values();
+	} else {
 		cerr << "Unexpected error running operf: " << strerror(rc) << endl;
 	}
 	cerr << "Please use the opcontrol command instead of operf." << endl;
@@ -846,19 +809,20 @@ int main(int argc, char const *argv[])
 		exit(1);
 	}
 	op_nr_counters = op_get_nr_counters(cpu_type);
-
-	uid_t uid = geteuid();
-	if (uid != 0)
-		handle_sys_values();
-
 	end_code_t run_result;
 	if ((run_result = _run())) {
 		if (app_started && (run_result != APP_ABNORMAL_END)) {
 			int rc;
 			cverb << vdebug << "Killing profiled app . . ." << endl;
 			rc = kill(app_PID, SIGKILL);
-			if (rc)
-				perror("Attempt to kill profiled app failed.");
+			if (rc) {
+				if (errno == ESRCH)
+					cverb << vdebug
+					      << "Unable to kill profiled app because it has already ended"
+					      << endl;
+				else
+					perror("Attempt to kill profiled app failed.");
+			}
 		}
 		if (run_result == PERF_RECORD_ERROR) {
 			cerr << "Error running profiler" << endl;
diff --git a/pe_profiling/operf_counter.cpp b/pe_profiling/operf_counter.cpp
index 4797511..bcc1a57 100644
--- a/pe_profiling/operf_counter.cpp
+++ b/pe_profiling/operf_counter.cpp
@@ -80,7 +80,6 @@ int operf_counter::perf_event_open(pid_t ppid, int cpu, unsigned event, operf_re
 		attr.mmap = 1;
 		attr.comm = 1;
 	}
-
 	fd = op_perf_event_open(&attr, ppid, cpu, -1, 0);
 	if (fd < 0) {
 		int ret = -1;
@@ -94,6 +93,8 @@ int operf_counter::perf_event_open(pid_t ppid, int cpu, unsigned event, operf_re
 			cerr << "!!!! No samples collected !!!" << endl;
 			cerr << "The target program/command ended before profiling was started." << endl;
 			ret = OP_PERF_HANDLED_ERROR;
+		} else {
+			cerr << "perf_event_open failed with " << strerror(errno) << endl;
 		}
 		return ret;
 	}
@@ -103,7 +104,7 @@ int operf_counter::perf_event_open(pid_t ppid, int cpu, unsigned event, operf_re
 	}
 	rec->register_perf_event_id(event, read_data.id, attr);
-	cverb << vperf << "perf_event_open returned fd " << fd << endl;
+	cverb << vperf << "perf_event_open returning fd " << fd << endl;
 	return fd;
 }
diff --git a/pe_profiling/operf_counter.h b/pe_profiling/operf_counter.h
index 4ae5f60..dd9ffb1 100644
--- a/pe_profiling/operf_counter.h
+++ b/pe_profiling/operf_counter.h
@@ -45,7 +45,7 @@ op_perf_event_open(struct perf_event_attr * attr,
 			group_fd, flags);
 }
-#define NUM_MMAP_PAGES 128
+#define NUM_MMAP_PAGES 8
 #define OP_PERF_HANDLED_ERROR -101
|
From: Maynard J. <may...@us...> - 2012-03-15 17:15:10
|
On 03/06/2012 12:02 PM, Maynard Johnson wrote:
> On 02/23/2012 1:24 PM, Maynard Johnson wrote:
>> As I stated in a previous posting today, my stake in the ground for the next OProfile release includes the following new features:
>> - Standard system-wide opcontrol profiling, using perf_events where available
>> - A new 'operf' program for those who want single-app, non-root profiling, but still want to use oprofile's analysis tools
>>
>> [snip]
>>
>> Gaps
>> -----
>> Some major pieces of work that are still needed are as follows:
>>
>> [snip]
>> 4) Add support for profiling the kernel.
>
> I'm going to start working on the kernel profiling support now.

I have this kernel profiling support just about done, but I got sidetracked when Breno reported a bug to me (off-list). While debugging and fixing that issue, it seemed like the right time to add support for the --pid option. So I'll be posting a patch momentarily that fixes the bug that Breno reported and also adds support for --pid.

-Maynard
|
From: Maynard J. <may...@us...> - 2012-03-19 22:22:23
|
On 02/23/2012 01:24 PM, Maynard Johnson wrote:
> As I stated in a previous posting today, my stake in the ground for the next OProfile release includes the following new features:
> - Standard system-wide opcontrol profiling, using perf_events where available

The next task on my list for this porting effort will be a reorganization of the current perf-events branch in our git repo to allow for both operf and the oprofile daemon (for system-wide profiling) to leverage the code I've written already for interfacing with the perf_events kernel subsystem. So I'll be pulling out some pieces from the pe_profiling directory and putting them in a new location so we can build an internal library. Simultaneously, I'll modify the existing operf program to use this new library (libperf_events ?).

Once this new internal libperf_events library is in place, I hope someone from the community will consider volunteering to port the existing daemon to use perf_events instead of the legacy kernel driver. We should also provide a way that users can force the use of the kernel driver (at least temporarily). I can think of at least one case where this would be needed: for AMD IBS support, since this is not yet supported in perf_events.

-Maynard

> [snip]
|
From: Maynard J. <may...@us...> - 2012-03-21 23:29:56
|
On 03/19/2012 05:21 PM, Maynard Johnson wrote:
> On 02/23/2012 01:24 PM, Maynard Johnson wrote:
>> As I stated in a previous posting today, my stake in the ground for the next OProfile release includes the following new features:
>> - Standard system-wide opcontrol profiling, using perf_events where available
>
> The next task on my list for this porting effort will be a reorganization of the current perf-events branch in our git repo to allow for both operf and the oprofile daemon (for system-wide profiling) to leverage the code I've written already for interfacing with the perf_events kernel subsystem. So I'll be pulling out some pieces from the pe_profiling directory and putting them in a new location so we can build an internal library. Simultaneously, I'll modify the existing operf program to use this new library (libperf_events ?).

The reorganization is complete and pushed up to the perf-events branch. This was largely just a matter of moving code into the new libperf_events directory and creating/modifying some Makefile.am files. It's probably not interesting enough for anyone to look at, so I'm not going to bother attaching the patch here. But I have another patch ready to post that modifies libperf_events to add support for the system-wide profiling that will be needed when the oprofile daemon is ported over to perf_events. In that same patch, I went ahead and added the system-wide capability to operf as a proof point.

-Maynard

> [snip]
|
From: Maynard J. <may...@us...> - 2012-04-06 14:21:26
|
Hello, oprofile community, Since I've made a number of enhancements and fixes lately to the perf-events branch, I thought it would be good to recap where we are insofar as the gaps that need to be filled. See below. On 02/23/2012 01:24 PM, Maynard Johnson wrote: > As I stated in a previous posting today, my stake in the ground for the next OProfile release includes the following new features: > - Standard system-wide opcontrol profiling, using perf_events where available > - A new 'operf' program for those who want single-app, non-root profiling, but still want to use oprofile's analysis tools > > [snip] > > Gaps > ----- > Some major pieces of work that are still needed are as follows: > > 1) Add statistics gathering similar to daemon/opd_stats (as described in the Oct 27 proposal). > > 2) Add support for vdso and anon mmaps. > > 3) Add support for JIT profiling > > 4) Add support for profiling the kernel. Done. > > 5) Add support for callgraph profiling. Adding callgraph support is my next task. > > 6) Add support for profiling by PID. Done. > > 7) Add support for system-wide profiling. Done in operf. ***A volunteer is needed to port the oprofile daemon code.*** The code for interfacing with perf_events has been library-ized (in libperf_events), and operf is an example implementation on how to use that library interface, so this task is not as big as it may seem at face value. > > 8) Add support for extended features (e.g, AMD IBS -- Is there support for IBS upstream in > perf_events yet?). > > 9) After we have a more complete implementation, the documentation and man pages will need updating. Breno Leitao contributed a man page for operf. From this man page, the current synopsis for the operf command is: operf [ options ] [ --system-wide | --pid <pid> | [ command [ args ] ] ] This man page is just a beginning, as we will need updates to the user manual, the opcontrol man page, and probably some minor updates to the website, too. 
> 10) Flesh out cpu type support. Currently, operf is able to ascertain the cpu type for Intel processors that support architectural perfmon and for ppc64 processors. For other processor types, operf fails with the message:
>
>     Unable to open cpu_type file for reading
>     Make sure you have done opcontrol --init

If you are an arch maintainer, you had better take notice of this issue, or your arch may get left behind when this port is complete and ready to be rolled out in a new release.

> You can do 'opcontrol --init' as a workaround, since operf will use /dev/oprofile/cpu_type if it exists. operf needs the cpu type when asking ophelp to validate a passed event spec, or for finding the default event when no event spec is passed. Fleshing out the cpu type support will probably require help from some knowledgeable people for each supported architecture.
>
> --------
> Please get involved with this exciting transition of oprofile by testing and/or reviewing the patches, and by volunteering to fill in some of the above-mentioned gaps.
>
> Thanks!
> -Maynard
>
> _______________________________________________
> oprofile-list mailing list
> opr...@li...
> https://lists.sourceforge.net/lists/listinfo/oprofile-list
|
From: Bill B. <wb...@us...> - 2012-04-06 18:44:02
|
On 4/6/2012 9:19 AM, Maynard Johnson wrote:
> Hello, oprofile community,
> Since I've made a number of enhancements and fixes lately to the perf-events branch, I thought it would be good to recap where we stand on the gaps that need to be filled. See below.

Sounds promising. We'll get more people testing this out across a number of Fedora, RHEL and SLES distributions.

The Java JIT profiling would be next on our list in the context of customer usage. Do you need help from the Java teams there? Or is that simply a "need time to do it" exercise?

Finally, just checking on the wording being used. For customers and support teams who use oprofile extensively today in many scenarios, will future updates of oprofile change the basic way things are done? I ask in the context of the many scripts which expect a very standard approach to setting up oprofile and gathering data points. In other words, are the new functions and capabilities being "added to" oprofile, or are they changing oprofile? Mentally, I'm picturing "operf" as the new way and "oprofile" as the legacy approach.

> [snip]
|
From: Maynard J. <may...@us...> - 2012-04-07 19:54:57
|
On 04/06/2012 01:43 PM, Bill Buros wrote:
> On 4/6/2012 9:19 AM, Maynard Johnson wrote:
>> Hello, oprofile community,
>> Since I've made a number of enhancements and fixes lately to the perf-events branch, I thought it would be good to recap where we stand on the gaps that need to be filled. See below.
>
> Sounds promising. We'll get more people testing this out across a number of Fedora, RHEL and SLES distributions.

That would be great.

> The Java JIT profiling would be next on our list in the context of customer usage. Do you need help from the Java teams there? Or is that simply a "need time to do it" exercise?

Yeah, JIT profiling is next on my list after callgraph support. At this time, I don't see a need for any outside help with that task, but thanks anyway.

> Finally, just checking on the wording being used. For customers and support teams who use oprofile extensively today in many scenarios, will future updates of oprofile change the basic way things are done? I ask in the context of the many scripts which expect a very standard approach to setting up oprofile and gathering data points. In other words, are the new functions and capabilities being "added to" oprofile, or are they changing oprofile? Mentally, I'm picturing "operf" as the new way and "oprofile" as the legacy approach.

Others in the oprofile community have also expressed a desire to keep the opcontrol interface of oprofile unchanged, for the same reasons you stated. For various reasons (which I've explained at length on this list), it would be very impractical and downright ugly to try to shoehorn the non-root, single-application profiling capability into the legacy opcontrol interface. That's the main reason for the new operf program. But since it wasn't much work to add system-wide profiling (requiring root authority) to operf, users can use operf for both purposes and not use opcontrol at all.
On the other hand, if system-wide profiling is all you do, and you'd like to keep using the scripts you already have, then you'll be able to do that. Some members of the community have indicated they consider it important to convert the legacy oprofile daemon to use perf_events instead of the old oprofile kernel driver, so that opcontrol usage will not interfere with other tools using perf_events. IMHO, this would be a "nice to have" feature, but not a requirement, since operf can serve as a replacement. I am waiting for someone to volunteer to port the oprofile daemon to perf_events.

-Maynard

> [snip]
|