From: Carl E. L. <ce...@li...> - 2013-02-14 23:07:28
|
On Tue, 2013-02-12 at 16:38 -0600, Maynard Johnson wrote: > The difference between this "version 2" patch and the original patch is > that it fixes the bug Carl identified earlier today where --system-wide > mode took several seconds before it would start to record any samples. > > In my original patch, I had changed things around with how we were collecting > and recording information about already-running processes -- separating out > the info collection phase from the phase where we write PERF_RECORD_COMMs > (for apps) and PERF_RECORD_MMAPs (for the shared libraries used by the apps). > These PERF_RECORD_* events are written by operf_record into the sample data > stream that operf_read processes in convertSampleData. Anyway, my patch was > mistakenly blindly calling op_record_process_exec_mmaps() for *every* process > I had stored into the 'procs' vector. But many of the processes collected > into the 'procs' vector were child processes, so their memory mappings would > be the same as the parent. The upshot was that I was creating many unnecessary > PERF_RECORD_MMAPs. So I put some code in to make sure I didn't call > op_record_process_exec_mmaps() unnecessarily -- i.e., if it was called once > for a given tgid, it didn't need to be called again. This change was made > in operf_counter.cpp:operf_record::record_process_info. I also moved the > location of the "Profiler started" message from operf.cpp:_run to > operf_counter.cpp:operf_record::recordPerfData. > > *Suravee and Carl*, please review. Maynard: I don't see anything specific with the patch. The updated patch does seem to fix the issue I had with short sampling. Looks good. Carl Love > > -Maynard > > -------------------------------------------------------------------- > operf does not properly sample already-running Java apps > > When passing the 'java' command directly to operf, samples are nicely > collected for all of the threads created by the JVM. 
However, if the > Java app is already running when the user starts operf with either > '--pid' or '--system-wide' option, zero samples are collected on the > child threads of the JVM. Note: The user program that is JITed by the > JVM is executed by a child thread. > > This patch addresses the problem by: > - Keeping a list of child processes > - Synthesizing PERF_RECORD_COMM events for the main JVM process and all > the child processes > - Calling perf_event_open for the main JVM process and all child processes > > These changes entailed some fairly major restructuring of some functions > and data structures of the operf_record class. > > Signed-off-by: Maynard Johnson <may...@us...> > --- > libperf_events/operf_counter.cpp | 247 ++++++++++++++++++++++++++------------ > libperf_events/operf_counter.h | 22 +++- > libperf_events/operf_event.h | 1 - > libperf_events/operf_stats.cpp | 17 ++-- > libperf_events/operf_stats.h | 2 +- > libperf_events/operf_utils.cpp | 42 +++---- > libperf_events/operf_utils.h | 3 +- > pe_profiling/operf.cpp | 12 +- > 8 files changed, 225 insertions(+), 121 deletions(-) > > Index: op-master/libperf_events/operf_counter.cpp > =================================================================== > --- op-master.orig/libperf_events/operf_counter.cpp > +++ op-master/libperf_events/operf_counter.cpp > @@ -47,7 +47,7 @@ extern bool first_time_processing; > extern bool throttled; > extern size_t mmap_size; > extern size_t pg_sz; > -extern bool try_cpu_minus_one; > +extern bool use_cpu_minus_one; > > namespace { > > @@ -155,7 +155,7 @@ try_again: > } // end anonymous namespace > > operf_counter::operf_counter(operf_event_t & evt, bool enable_on_exec, bool do_cg, > - bool separate_cpu) > + bool separate_cpu, bool inherit, int event_number) > { > memset(&attr, 0, sizeof(attr)); > attr.size = sizeof(attr); > @@ -174,7 +174,7 @@ operf_counter::operf_counter(operf_event > > attr.config = evt.evt_code; > attr.sample_period = evt.count; > - attr.inherit = 
1; > + attr.inherit = inherit ? 1 : 0; > attr.enable_on_exec = enable_on_exec ? 1 : 0; > attr.disabled = 1; > attr.exclude_idle = 0; > @@ -184,13 +184,14 @@ operf_counter::operf_counter(operf_event > PERF_FORMAT_TOTAL_TIME_ENABLED; > event_name = evt.name; > fd = id = -1; > + evt_num = event_number; > } > > operf_counter::~operf_counter() { > } > > > -int operf_counter::perf_event_open(pid_t ppid, int cpu, unsigned event, operf_record * rec) > +int operf_counter::perf_event_open(pid_t pid, int cpu, operf_record * rec) > { > struct { > u64 count; > @@ -199,11 +200,11 @@ int operf_counter::perf_event_open(pid_t > u64 id; > } read_data; > > - if (event == 0) { > + if (evt_num == 0) { > attr.mmap = 1; > attr.comm = 1; > } > - fd = op_perf_event_open(&attr, ppid, cpu, -1, 0); > + fd = op_perf_event_open(&attr, pid, cpu, -1, 0); > if (fd < 0) { > int ret = -1; > cverb << vrecord << "perf_event_open failed: " << strerror(errno) << endl; > @@ -225,7 +226,7 @@ int operf_counter::perf_event_open(pid_t > perror("Error reading perf_event fd"); > return -1; > } > - rec->register_perf_event_id(event, read_data.id, attr); > + rec->register_perf_event_id(evt_num, read_data.id, attr); > > cverb << vrecord << "perf_event_open returning fd " << fd << endl; > return fd; > @@ -260,7 +261,7 @@ bool separate_by_cpu, bool out_fd_is_fil > vmlinux_file = vi.image_name; > kernel_start = vi.start; > kernel_end = vi.end; > - pid = the_pid; > + pid_to_profile = the_pid; > pid_started = pid_running; > system_wide = sys_wide; > callgraph = do_cg; > @@ -275,7 +276,7 @@ bool separate_by_cpu, bool out_fd_is_fil > opHeader.data_size = 0; > num_cpus = -1; > > - if (system_wide && (pid != -1 || pid_started)) > + if (system_wide && (pid_to_profile != -1 || pid_started)) > return; // object is not valid > > cverb << vrecord << "operf_record ctor using output fd " << output_fd << endl; > @@ -385,16 +386,16 @@ void operf_record::write_op_header_info( > add_to_total(_write_header_to_pipe()); > } > > -int 
operf_record::prepareToRecord(int cpu, int fd) > +int operf_record::_prepare_to_record_one_fd(int idx, int fd) > { > - struct mmap_data md;; > + struct mmap_data md; > md.prev = 0; > md.mask = num_mmap_pages * pagesize - 1; > > fcntl(fd, F_SETFL, O_NONBLOCK); > > - poll_data[cpu].fd = fd; > - poll_data[cpu].events = POLLIN; > + poll_data[idx].fd = fd; > + poll_data[idx].events = POLLIN; > poll_count++; > > md.base = mmap(NULL, (num_mmap_pages + 1) * pagesize, > @@ -417,6 +418,88 @@ int operf_record::prepareToRecord(int cp > } > > > +int operf_record::prepareToRecord(void) > +{ > + int op_ctr_idx = 0; > + int rc = 0; > + errno = 0; > + if (pid_started && (procs.size() > 1)) { > + /* Implies we're profiling a thread group, where we call perf_event_open > + * on each thread (process) in the group, passing cpu=-1. So we'll do > + * one mmap per thread (by way of the _prepare_to_record_one_fd function). > + * If more than one event has been specified to profile on, we just do an > + * ioctl PERF_EVENT_IOC_SET_OUTPUT to tie that perf_event fd with the fd > + * of the first event of the thread. > + */ > + > + // Sanity check > + if ((procs.size() * evts.size()) != perfCounters.size()) { > + cerr << "Internal error: Number of fds[] (" << perfCounters.size() > + << ") != number of processes x number of events (" > + << procs.size() << " x " << evts.size() << ")." 
<< endl; > + return -1; > + } > + for (unsigned int proc_idx = 0; proc_idx < procs.size(); proc_idx++) { > + int fd_for_set_output = perfCounters[op_ctr_idx].get_fd(); > + for (unsigned event = 0; event < evts.size(); event++) { > + int fd = perfCounters[op_ctr_idx].get_fd(); > + if (event == 0) { > + rc = _prepare_to_record_one_fd(proc_idx, fd); > + } else { > + if ((rc = ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, > + fd_for_set_output)) < 0) > + perror("prepareToRecord: ioctl #1 failed"); > + } > + > + if (rc < 0) > + return rc; > + > + if ((rc = ioctl(fd, PERF_EVENT_IOC_ENABLE)) < 0) { > + perror("prepareToRecord: ioctl #2 failed"); > + return rc; > + } > + op_ctr_idx++; > + } > + } > + } else { > + /* We're either doing a system-wide profile or a profile of a single process. > + * We'll do one mmap per cpu. If more than one event has been specified > + * to profile on, we just do an ioctl PERF_EVENT_IOC_SET_OUTPUT to tie > + * that perf_event fd with the fd of the first event of the cpu. > + */ > + if ((num_cpus * evts.size()) != perfCounters.size()) { > + cerr << "Internal error: Number of fds[] (" << perfCounters.size() > + << ") != number of cpus x number of events (" > + << num_cpus << " x " << evts.size() << ")." 
<< endl; > + return -1; > + } > + for (unsigned int cpu = 0; cpu < num_cpus; cpu++) { > + int fd_for_set_output = perfCounters[op_ctr_idx].get_fd(); > + for (unsigned event = 0; event < evts.size(); event++) { > + int fd = perfCounters[op_ctr_idx].get_fd(); > + if (event == 0) { > + rc = _prepare_to_record_one_fd(cpu, fd); > + } else { > + if ((rc = ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, > + fd_for_set_output)) < 0) > + perror("prepareToRecord: ioctl #3 failed"); > + } > + > + if (rc < 0) > + return rc; > + > + if ((rc = ioctl(fd, PERF_EVENT_IOC_ENABLE)) < 0) { > + perror("prepareToRecord: ioctl #4 failed"); > + return rc; > + } > + op_ctr_idx++; > + } > + } > + } > + return rc; > +} > + > + > void operf_record::setup() > { > bool all_cpus_avail = true; > @@ -425,7 +508,7 @@ void operf_record::setup() > DIR *dir = NULL; > string err_msg; > char cpus_online[257]; > - bool need_IOC_enable = (system_wide || pid_started); > + bool profile_process_group = false; > > > if (system_wide) > @@ -433,32 +516,31 @@ void operf_record::setup() > else > cverb << vrecord << "operf_record::setup() with pid_started = " << pid_started << endl; > > - if (!system_wide && pid_started) { > - /* We need to verify the existence of the passed PID before trying > - * perf_event_open or all hell will break loose. > - */ > - char fname[PATH_MAX]; > - FILE *fp; > - snprintf(fname, sizeof(fname), "/proc/%d/status", pid); > - fp = fopen(fname, "r"); > - if (fp == NULL) { > - // Process must have finished or invalid PID passed into us. > - // We'll bail out now. > - cerr << "Unable to find process information for PID " << pid << "." 
<< endl; > - cverb << vrecord << "couldn't open " << fname << endl; > - return; > - } > - fclose(fp); > + if (pid_started || system_wide) { > + if ((rc = op_get_process_info(system_wide, pid_to_profile, this)) < 0) { > + if (rc == OP_PERF_HANDLED_ERROR) > + return; > + else > + throw runtime_error("Unexpected error in operf_record setup"); > + } > + // 'pid_started && (procs.size() > 1)' implies the process that the user > + // has requested us to profile has cloned one or more children. > + profile_process_group = pid_started && (procs.size() > 1); > } > + > pagesize = sysconf(_SC_PAGE_SIZE); > - num_mmap_pages = (512 * 1024)/pagesize; > - num_cpus = try_cpu_minus_one ? 1 : sysconf(_SC_NPROCESSORS_ONLN); > + // If profiling a process group, use a smaller mmap length to avoid EINVAL. > + num_mmap_pages = profile_process_group ? 1 : (512 * 1024)/pagesize; > + > + /* To set up to profile an existing thread group, we need call perf_event_open > + * for each thread, and we need to pass cpu=-1 on the syscall. > + */ > + use_cpu_minus_one = use_cpu_minus_one ? true : profile_process_group; > + num_cpus = use_cpu_minus_one ? 
1 : sysconf(_SC_NPROCESSORS_ONLN); > if (!num_cpus) > throw runtime_error("Number of online CPUs is zero; cannot continue");; > > - poll_data = new struct pollfd [num_cpus]; > - > - cverb << vrecord << "calling perf_event_open for pid " << pid << " on " > + cverb << vrecord << "calling perf_event_open for pid " << pid_to_profile << " on " > << num_cpus << " cpus" << endl; > FILE * online_cpus = fopen("/sys/devices/system/cpu/online", "r"); > if (!online_cpus) { > @@ -489,8 +571,7 @@ void operf_record::setup() > for (int cpu = 0; cpu < num_cpus; cpu++) { > int real_cpu; > int mmap_fd; > - bool mmap_done_for_cpu = false; > - if (try_cpu_minus_one) { > + if (use_cpu_minus_one) { > real_cpu = -1; > } else if (all_cpus_avail) { > real_cpu = cpu; > @@ -502,40 +583,47 @@ void operf_record::setup() > goto error; > } > } > - > - // Create new row to hold operf_counter objects since we need one > - // row for each cpu. Do the same for samples_array. > - vector<operf_counter> tmp_pcvec; > - > - perfCounters.push_back(tmp_pcvec); > - for (unsigned event = 0; event < evts.size(); event++) { > - evts[event].counter = event; > - perfCounters[cpu].push_back(operf_counter(evts[event], > - (!pid_started && !system_wide), > - callgraph, separate_cpu)); > - if ((rc = perfCounters[cpu][event].perf_event_open(pid, real_cpu, event, this)) < 0) { > - err_msg = "Internal Error. Perf event setup failed."; > - goto error; > - } > - if (!mmap_done_for_cpu) { > - if (((rc = prepareToRecord(cpu, perfCounters[cpu][event].get_fd()))) < 0) { > + size_t num_procs = profile_process_group ? procs.size() : 1; > + /* To profile a parent and its children, the perf_events kernel subsystem > + * requires us to use cpu=-1 on the perf_event_open call for each of the > + * processes in the group. But perf_events also prevents us from specifying > + * "inherit" on the perf_event_attr we pass to perf_event_open when cpu is '-1'. 
> + */ > + bool inherit = !profile_process_group; > + for (unsigned proc_idx = 0; proc_idx < num_procs; proc_idx++) { > + for (unsigned event = 0; event < evts.size(); event++) { > + /* For a parent process, comm.tid==comm.pid, but for child > + * processes in a process group, comm.pid is the parent, so > + * we must use comm.tid for the perf_event_open call. So > + * we can use comm.tid for all cases. > + */ > + pid_t pid_for_open = profile_process_group ? procs[proc_idx].tid > + : pid_to_profile; > + operf_counter op_ctr(operf_counter(evts[event], > + (!pid_started && !system_wide), > + callgraph, separate_cpu, > + inherit, event)); > + if ((rc = op_ctr.perf_event_open(pid_for_open, > + real_cpu, this)) < 0) { > err_msg = "Internal Error. Perf event setup failed."; > goto error; > } > - mmap_fd = perfCounters[cpu][event].get_fd(); > - mmap_done_for_cpu = true; > - } else { > - if (ioctl(perfCounters[cpu][event].get_fd(), > - PERF_EVENT_IOC_SET_OUTPUT, mmap_fd) < 0) > - goto error; > + perfCounters.push_back(op_ctr); > } > - if (need_IOC_enable) > - if (ioctl(perfCounters[cpu][event].get_fd(), PERF_EVENT_IOC_ENABLE) < 0) > - goto error; > } > } > if (dir) > closedir(dir); > + int num_mmaps; > + if (pid_started && (procs.size() > 1)) > + num_mmaps = procs.size(); > + else > + num_mmaps = num_cpus; > + poll_data = new struct pollfd [num_mmaps]; > + if ((rc = prepareToRecord()) < 0) { > + err_msg = "Internal Error. Perf event setup failed."; > + goto error; > + } > write_op_header_info(); > > // Set bit to indicate we're set to go. 
> @@ -557,20 +645,36 @@ error: > throw runtime_error(err_msg); > } > > +void operf_record::record_process_info(void) > +{ > + map<unsigned int, unsigned int> pids_mapped; > + pid_t last_tgid = -1; > + for (unsigned int proc_idx = 0; proc_idx < procs.size(); proc_idx++) > + { > + int num = OP_perf_utils::op_write_output(output_fd, &procs[proc_idx], > + procs[proc_idx].header.size); > + add_to_total(num); > + if (cverb << vrecord) > + cout << "Created COMM event for " << procs[proc_idx].comm << endl; > + > + if ((procs[proc_idx].pid == last_tgid) || > + (pids_mapped.find(procs[proc_idx].pid) != pids_mapped.end())) > + continue; > + OP_perf_utils::op_record_process_exec_mmaps(procs[proc_idx].tid, > + procs[proc_idx].pid, > + output_fd, this); > + pids_mapped[procs[proc_idx].pid] = last_tgid = procs[proc_idx].pid; > + } > +} > + > void operf_record::recordPerfData(void) > { > bool disabled = false; > - if (pid_started || system_wide) { > - if (op_record_process_info(system_wide, pid, this, output_fd) < 0) { > - for (int i = 0; i < num_cpus; i++) { > - for (unsigned int evt = 0; evt < evts.size(); evt++) > - ioctl(perfCounters[i][evt].get_fd(), PERF_EVENT_IOC_DISABLE); > - } > - throw runtime_error("operf_record: error recording process info"); > - } > - } > - op_record_kernel_info(vmlinux_file, kernel_start, kernel_end, output_fd, this); > + if (pid_started || system_wide) > + record_process_info(); > > + op_record_kernel_info(vmlinux_file, kernel_start, kernel_end, output_fd, this); > + cerr << "operf: Profiler started" << endl; > while (1) { > int prev = sample_reads; > > @@ -587,10 +691,8 @@ void operf_record::recordPerfData(void) > } > > if (quit) { > - for (int i = 0; i < num_cpus; i++) { > - for (unsigned int evt = 0; evt < evts.size(); evt++) > - ioctl(perfCounters[i][evt].get_fd(), PERF_EVENT_IOC_DISABLE); > - } > + for (unsigned int i = 0; i < perfCounters.size(); i++) > + ioctl(perfCounters[i].get_fd(), PERF_EVENT_IOC_DISABLE); > disabled = true; > cverb << 
vrecord << "operf_record::recordPerfData received signal to quit." << endl; > } > Index: op-master/libperf_events/operf_counter.h > =================================================================== > --- op-master.orig/libperf_events/operf_counter.h > +++ op-master/libperf_events/operf_counter.h > @@ -56,18 +56,20 @@ op_perf_event_open(struct perf_event_att > class operf_counter { > public: > operf_counter(operf_event_t & evt, bool enable_on_exec, bool callgraph, > - bool separate_by_cpu); > + bool separate_by_cpu, bool inherit, int event_number); > ~operf_counter(); > - int perf_event_open(pid_t ppid, int cpu, unsigned counter, operf_record * pr); > + int perf_event_open(pid_t pid, int cpu, operf_record * pr); > const struct perf_event_attr * the_attr(void) const { return &attr; } > int get_fd(void) const { return fd; } > int get_id(void) const { return id; } > + int get_evt_num(void) const { return evt_num; } > const std::string get_event_name(void) const { return event_name; } > > private: > struct perf_event_attr attr; > int fd; > int id; > + int evt_num; > std::string event_name; > }; > > @@ -85,6 +87,7 @@ public: > void recordPerfData(void); > int out_fd(void) const { return output_fd; } > void add_to_total(int n) { total_bytes_recorded += n; } > + void add_process(struct comm_event proc) { procs.push_back(proc); } > int get_total_bytes_recorded(void) const { return total_bytes_recorded; } > void register_perf_event_id(unsigned counter, u64 id, perf_event_attr evt_attr); > bool get_valid(void) { return valid; } > @@ -92,21 +95,30 @@ public: > private: > void create(std::string outfile, std::vector<operf_event_t> & evts); > void setup(void); > - int prepareToRecord(int cpu, int fd); > + int prepareToRecord(void); > + int _prepare_to_record_one_fd(int idx, int fd); > + void record_process_info(void); > void write_op_header_info(void); > int _write_header_to_file(void); > int _write_header_to_pipe(void); > int output_fd; > bool write_to_file; > + // Array of 
size 'num_cpus_used_for_perf_event_open * num_pids * num_events' > struct pollfd * poll_data; > std::vector<struct mmap_data> samples_array; > int num_cpus; > - pid_t pid; > + pid_t pid_to_profile; > + /* When doing --pid or --system-wide profiling, we'll obtain process information > + * for all processes to be profiled (including forked/cloned processes) and store > + * that information in a collection of type 'comm_event'. We'll use this collection > + * for synthesizing PERF_RECORD_COMM events into the profile data stream. > + */ > + std::vector<struct comm_event> procs; > bool pid_started; > bool system_wide; > bool callgraph; > bool separate_cpu; > - std::vector< std::vector<operf_counter> > perfCounters; > + std::vector<operf_counter> perfCounters; > int total_bytes_recorded; > int poll_count; > struct OP_header opHeader; > Index: op-master/libperf_events/operf_event.h > =================================================================== > --- op-master.orig/libperf_events/operf_event.h > +++ op-master/libperf_events/operf_event.h > @@ -130,7 +130,6 @@ typedef struct operf_event { > unsigned long evt_um; > char um_name[OP_MAX_UM_NAME_LEN]; > unsigned long count; > - u32 counter; > bool no_kernel; > bool no_user; > bool no_hv; > Index: op-master/libperf_events/operf_stats.cpp > =================================================================== > --- op-master.orig/libperf_events/operf_stats.cpp > +++ op-master/libperf_events/operf_stats.cpp > @@ -176,7 +176,7 @@ std::string operf_stats_recorder::creat > } > > > -void operf_stats_recorder::check_for_multiplexing(std::vector< std::vector<operf_counter> > const & perfCounters, > +void operf_stats_recorder::check_for_multiplexing(std::vector<operf_counter> const & perfCounters, > int num_cpus, > int system_wide, int evt) > { > @@ -199,11 +199,15 @@ void operf_stats_recorder::check_for_mul > > u64 cumulative_time_running = 0; > u64 max_enabled_time = 0, min_enabled_time = 0xFFFFFFFFFFFFFFFFULL; > - int fd; > + int 
fd, idx_for_event_name; > bool event_multiplexed = false; > > - for (int cpu = 0; cpu < num_cpus; cpu++) { > - fd = perfCounters[cpu][evt].get_fd(); > + for (unsigned int i = 0; i < perfCounters.size(); i++) { > + if (perfCounters[i].get_evt_num() != evt) > + continue; > + // Any index can be used, since event name is the same > + idx_for_event_name = i; > + fd = perfCounters[i].get_fd(); > > if (read(fd, &read_data, sizeof(read_data)) == -1) { > return; > @@ -270,10 +274,7 @@ void operf_stats_recorder::check_for_mul > return; > } > > - /* The event is listed in all of the CPUs, just use CPU 0 > - * to get the name of the event. > - */ > - event_name = perfCounters[0][evt].get_event_name(); > + event_name = perfCounters[idx_for_event_name].get_event_name(); > outputfile = multiplexed_dir + "/" + event_name; > > outfile.open(outputfile.c_str()); > Index: op-master/libperf_events/operf_stats.h > =================================================================== > --- op-master.orig/libperf_events/operf_stats.h > +++ op-master/libperf_events/operf_stats.h > @@ -64,7 +64,7 @@ public: > */ > static void mv_multiplexed_data_dir(std::string const & session_dir, > std::string const & sample_dir); > - static void check_for_multiplexing(std::vector< std::vector<operf_counter> > const & perfCounters, > + static void check_for_multiplexing(std::vector<operf_counter> const & perfCounters, > int num_cpus, int system_wide, > int evt); > > Index: op-master/libperf_events/operf_utils.cpp > =================================================================== > --- op-master.orig/libperf_events/operf_utils.cpp > +++ op-master/libperf_events/operf_utils.cpp > @@ -1033,7 +1033,7 @@ int OP_perf_utils::op_write_output(int o > } > > > -static void op_record_process_exec_mmaps(pid_t pid, pid_t tgid, int output_fd, operf_record * pr) > +void OP_perf_utils::op_record_process_exec_mmaps(pid_t pid, pid_t tgid, int output_fd, operf_record * pr) > { > char fname[PATH_MAX]; > FILE *fp; > @@ 
-1051,6 +1051,8 @@ static void op_record_process_exec_mmaps > char line_buffer[BUFSIZ]; > char perms[5], pathname[PATH_MAX], dev[16]; > unsigned long long start_addr, end_addr, offset; > + const char * anon_mem = "//anon"; > + > u_int32_t inode; > > memset(pathname, '\0', sizeof(pathname)); > @@ -1072,6 +1074,9 @@ static void op_record_process_exec_mmaps > if (imagename == NULL) > imagename = strstr(pathname, "[vdso]"); > > + if ((imagename == NULL) && !strstr(pathname, "[")) > + imagename = (char *)anon_mem; > + > if (imagename == NULL) > continue; > > @@ -1095,8 +1100,7 @@ static void op_record_process_exec_mmaps > return; > } > > -static int _record_one_process_info(pid_t pid, bool sys_wide, operf_record * pr, > - int output_fd) > +static int _get_one_process_info(bool sys_wide, pid_t pid, operf_record * pr) > { > struct comm_event comm; > char fname[PATH_MAX]; > @@ -1119,7 +1123,7 @@ static int _record_one_process_info(pid_ > if (!sys_wide) { > cerr << "Unable to find process information for process " << pid << "." << endl; > cverb << vrecord << "couldn't open " << fname << endl; > - return -1; > + return OP_PERF_HANDLED_ERROR; > } else { > return 0; > } > @@ -1154,10 +1158,10 @@ static int _record_one_process_info(pid_ > size = align_64bit(size); > comm.header.size = sizeof(comm) - (sizeof(comm.comm) - size); > if (tgid != pid) { > - // passed pid must have been a secondary thread > + // passed pid must have been a secondary thread, and we > + // don't go looking at the /proc/<pid>/task of such processes. 
> comm.tid = pid; > - int num = OP_perf_utils::op_write_output(output_fd, &comm, comm.header.size); > - pr->add_to_total(num); > + pr->add_process(comm); > goto out; > } > > @@ -1166,7 +1170,8 @@ static int _record_one_process_info(pid_ > if (tids == NULL) { > // process must have exited > ret = -1; > - cverb << vrecord << "opendir returned NULL" << endl; > + cverb << vrecord << "Process " << pid << " apparently exited while " > + << "process info was being collected"<< endl; > goto out; > } > > @@ -1177,39 +1182,30 @@ static int _record_one_process_info(pid_ > continue; > > comm.tid = pid; > - > - int num = OP_perf_utils::op_write_output(output_fd, &comm, comm.header.size); > - pr->add_to_total(num); > + pr->add_process(comm); > } > closedir(tids); > - if (cverb << vrecord) > - cout << "Created COMM event for " << comm.comm << endl; > > out: > - op_record_process_exec_mmaps(pid, tgid, output_fd, pr); > - > fclose(fp); > if (ret) { > cverb << vrecord << "couldn't get app name and tgid for pid " > << dec << pid << " from /proc fs." << endl; > } > return ret; > - > } > > /* Obtain process information for an active process (where the user has > * passed in a process ID via the --pid option) or all active processes > - * (where system_wide==true). Then generate the necessary PERF_RECORD_COMM > - * and PERF_RECORD_MMAP entries into the profile data stream. > + * (where system_wide==true). 
> */ > -int OP_perf_utils::op_record_process_info(bool system_wide, pid_t pid, operf_record * pr, > - int output_fd) > +int OP_perf_utils::op_get_process_info(bool system_wide, pid_t pid, operf_record * pr) > { > int ret = 0; > if (cverb << vrecord) > - cout << "op_record_process_info" << endl; > + cout << "op_get_process_info" << endl; > if (!system_wide) { > - ret = _record_one_process_info(pid, system_wide, pr, output_fd); > + ret = _get_one_process_info(system_wide, pid, pr); > } else { > char buff[BUFSIZ]; > pid_t tgid = 0; > @@ -1231,7 +1227,7 @@ int OP_perf_utils::op_record_process_inf > cverb << vmisc << "/proc entry " << dirent.d_name << " is not a PID" << endl; > continue; > } > - if ((ret = _record_one_process_info(pid, system_wide, pr, output_fd)) < 0) > + if ((ret = _get_one_process_info(system_wide, pid, pr)) < 0) > break; > } > closedir(pids); > Index: op-master/libperf_events/operf_utils.h > =================================================================== > --- op-master.orig/libperf_events/operf_utils.h > +++ op-master/libperf_events/operf_utils.h > @@ -65,7 +65,8 @@ void op_get_kernel_event_data(struct mma > void op_perfrecord_sigusr1_handler(int sig __attribute__((unused)), > siginfo_t * siginfo __attribute__((unused)), > void *u_context __attribute__((unused))); > -int op_record_process_info(bool system_wide, pid_t pid, operf_record * pr, int output_fd); > +int op_get_process_info(bool system_wide, pid_t pid, operf_record * pr); > +void op_record_process_exec_mmaps(pid_t pid, pid_t tgid, int output_fd, operf_record * pr); > int op_write_output(int output, void *buf, size_t size); > void op_write_event(event_t * event, u64 sample_type); > int op_read_from_stream(std::ifstream & is, char * buf, std::streamsize sz); > Index: op-master/pe_profiling/operf.cpp > =================================================================== > --- op-master.orig/pe_profiling/operf.cpp > +++ op-master/pe_profiling/operf.cpp > @@ -60,7 +60,7 @@ typedef enum 
END_CODE { > > // Globals > char * app_name = NULL; > -bool try_cpu_minus_one = false; > +bool use_cpu_minus_one = false; > pid_t app_PID = -1; > uint64_t kernel_start, kernel_end; > operf_read operfRead; > @@ -613,7 +613,6 @@ static end_code_t _run(void) > } > > set_signals_for_parent(); > - cerr << "operf: Profiler started" << endl; > if (startApp) { > /* The user passed in a command or program name to start, so we'll need to do waitpid on that > * process. However, while that user-requested process is running, it's possible we > @@ -1834,7 +1833,7 @@ error: > } > > > -static int _check_perf_events_cap(bool try_cpu_minus_one) > +static int _check_perf_events_cap(bool use_cpu_minus_one) > { > /* If perf_events syscall is not implemented, the syscall below will fail > * with ENOSYS (38). If implemented, but the processor type on which this > @@ -1843,7 +1842,7 @@ static int _check_perf_events_cap(bool t > */ > struct perf_event_attr attr; > pid_t pid ; > - int cpu_to_try = try_cpu_minus_one ? -1 : _get_cpu_for_perf_events_cap(); > + int cpu_to_try = use_cpu_minus_one ? -1 : _get_cpu_for_perf_events_cap(); > errno = 0; > memset(&attr, 0, sizeof(attr)); > attr.size = sizeof(attr); > @@ -1906,7 +1905,7 @@ int main(int argc, char * const argv[]) > > my_uid = geteuid(); > throttled = false; > - rc = _check_perf_events_cap(try_cpu_minus_one); > + rc = _check_perf_events_cap(use_cpu_minus_one); > if (rc == EACCES) { > /* Early perf_events kernels required the cpu argument to perf_event_open > * to be '-1' when setting up to profile a single process if 1) the user is > @@ -1921,8 +1920,8 @@ int main(int argc, char * const argv[]) > * what works. > */ > if (my_uid != 0 && perf_event_paranoid > 0) { > - try_cpu_minus_one = true; > - rc = _check_perf_events_cap(try_cpu_minus_one); > + use_cpu_minus_one = true; > + rc = _check_perf_events_cap(use_cpu_minus_one); > } > } > if (rc == EBUSY) { > |