[PATCH] Add a new option to operf to do conversion to oprofile format after profiling is done

The profile data read from the perf_events kernel has to be converted to OProfile
format for the post-processing tools (e.g., opreport).  By default, this conversion
is done on-the-fly during profiling using a pipe.  This method involves starting an
additional operf process that reads the data fed into the pipe by the operf-record
process.  This additional process interprets the perf_events data and does the
conversion to OProfile format, persisting the usual oprofile sample data files in
<session-dir>/samples/current.  When operf is run in single application mode, the
extra overhead of the conversion process running during profiling time is not even
recorded, and thus is not easily noticeable.  However, when doing a system-wide
profile, operf overhead can be substantial (easily over 6% on a busy system), and
the samples for operf *are* noticeable in the system-wide report.

This patch introduces the "--lazy-conversion" option which directs operf to write
the perf_events profile data to a temporary file during profiling.  Then, after
profiling is done, the data in the temporary file is converted to OProfile format.

This patch has been pushed upstream. Review comments still welcome.

Signed-off-by: Maynard Johnson <maynardj@us.ibm.com>
---
 doc/operf.1.in                   |   15 ++
 doc/oprofile.xml                 |   13 ++
 libperf_events/operf_counter.cpp |  359 +++++++++++++++++++++++++++++++-------
 libperf_events/operf_counter.h   |   18 ++-
 libperf_events/operf_event.h     |   12 ++
 libperf_events/operf_stats.cpp   |    2 +-
 libperf_events/operf_utils.cpp   |   59 ++++++
 libperf_events/operf_utils.h     |    7 +
 pe_profiling/operf.cpp           |  136 +++++++++++----
 9 files changed, 510 insertions(+), 111 deletions(-)

diff --git a/doc/operf.1.in b/doc/operf.1.in
index 45adf6b..4ed2f52 100644
--- a/doc/operf.1.in
+++ b/doc/operf.1.in
@@ -160,6 +160,21 @@ the data is saved in the
 directory on the current path.
 .br
 .TP
+.BI "--lazy-conversion / -l"
+Use this option to reduce the overhead of
+.BI operf
+during profiling. Normally, profile data received from the kernel is converted
+to OProfile format during profiling time. This is typically not an issue when
+profiling a single application. But when using the
+.I --system-wide
+option, this on-the-fly conversion process can cause noticeable overhead,
+particularly on busy multi-processor systems. The
+.I --lazy-conversion
+option directs
+.BI operf
+to wait until profiling is completed to do the conversion of profile data.
+.br
+.TP
 .BI "--verbose / -V " level
 A comma-separated list of debugging control values, used to increase the verbosity of the output.
 Valid values are:  debug, perf_events, misc, sfile, arcs, or the special value, 'all'.
diff --git a/doc/oprofile.xml b/doc/oprofile.xml
index 1cf2aa7..48adf97 100644
--- a/doc/oprofile.xml
+++ b/doc/oprofile.xml
@@ -716,6 +716,19 @@ Following is a description of the <command>operf</command> options.
 		</para></listitem>
 	</varlistentry>
 	<varlistentry>
+	   <term><option>--lazy-conversion / -l</option></term>
+		<listitem><para>
+		Use this option to reduce the overhead of <command>operf</command> during profiling.
+		Normally, profile data received from the kernel is converted to OProfile format
+		during profiling time. This is typically not an issue when profiling a single
+		application. But when using the <code>--system-wide</code> option, this on-the-fly
+		conversion process can cause noticeable overhead, particularly on busy
+		multi-processor systems. The <code>--lazy-conversion</code> option directs
+		<command>operf</command> to wait until profiling is completed to do the conversion
+		of profile data.
+		</para></listitem>
+	</varlistentry>
+	<varlistentry>
 		<term><option>--verbose / -V [level]</option></term>
 		<listitem><para>
 		A comma-separated list of debugging control values used to increase the verbosity of the
diff --git a/libperf_events/operf_counter.cpp b/libperf_events/operf_counter.cpp
index 9dde224..8a21eba 100644
--- a/libperf_events/operf_counter.cpp
+++ b/libperf_events/operf_counter.cpp
@@ -45,6 +45,8 @@ verbose vperf("perf_events");
 
 extern bool first_time_processing;
 extern bool throttled;
+extern size_t mmap_size;
+extern size_t pg_sz;
 
 namespace {
 
@@ -54,6 +56,101 @@ static const char *__op_magic = "OPFILE";
 
 #define OP_MAGIC	(*(u64 *)__op_magic)
 
+
+int _get_perf_event_from_pipe(event_t * event, int sample_data_fd)  // fills 'event'; returns 0 on success, -1 on EOF, error or bad size
+{
+	static size_t pe_header_size = sizeof(perf_event_header);
+	char * evt = (char *)event;
+	ssize_t num_read;
+	perf_event_header * header = (perf_event_header *)event;
+
+	/* A signal handler was setup for the operf_read process to handle interrupts
+	 * (i.e., from ctrl-C), so the read syscalls below may get interrupted.  But the
+	 * operf_read process should ignore the interrupt and continue processing
+	 * until there's no more data to read or until the parent operf process
+	 * forces us to stop.  So we must try the read operation again if it was
+	 * interrupted.
+	 */
+again:
+	errno = 0;
+	if ((num_read = read(sample_data_fd, header, pe_header_size)) < 0) {
+		cverb << vdebug << "Read 1 of sample data pipe returned with " << strerror(errno) << endl;
+		if (errno == EINTR)
+			goto again;
+		else
+			return -1;
+	} else if (num_read == 0) {  // read() returned 0: nothing more to read
+		return -1;
+	}
+	evt += pe_header_size;
+	if (header->size < pe_header_size)  // also rejects 0; a smaller size would make the subtraction below wrap around
+		return -1;
+	// Read the remainder of the record; header->size counts the header bytes already consumed.
+again2:
+	if ((num_read = read(sample_data_fd, evt, header->size - pe_header_size)) < 0) {
+		cverb << vdebug << "Read 2 of sample data pipe returned with " << strerror(errno) << endl;
+		if (errno == EINTR)
+			goto again2;
+		else
+			return -1;
+	} else if (num_read == 0) {
+		return -1;
+	}
+	return 0;
+}
+
+event_t * _get_perf_event_from_file(struct mmap_info & info)  // next record in the mapped data file, or NULL when done
+{
+	uint32_t size;
+	event_t * event;
+
+	if (info.offset + info.head >= info.file_data_offset + info.file_data_size)  // already at/past the end of the data section
+		return NULL;
+
+	if (!pg_sz)
+		pg_sz = sysconf(_SC_PAGESIZE);
+
+try_again:
+	event = (event_t *)(info.buf + info.head);
+	// Remap if the mapping size differs from the data size and the next header or record would cross the mapping's end.
+	if ((mmap_size != info.file_data_size) &&
+			(((info.head + sizeof(event->header)) > mmap_size) ||
+					(info.head + event->header.size > mmap_size))) {
+		int ret;
+		u64 shift = pg_sz * (info.head / pg_sz);  // head rounded down to a page boundary
+		cverb << vperf << "Remapping perf data file" << endl;
+		ret = munmap(info.buf, mmap_size);
+		if (ret) {
+			string errmsg = "Internal error:  munmap of perf data file failed with errno: ";
+			errmsg += strerror(errno);
+			throw runtime_error(errmsg);
+		}
+
+		info.offset += shift;  // move the mapping start forward by whole pages ...
+		info.head -= shift;  // ... so that offset + head still names the same file position
+		ret = op_mmap_trace_file(info, false);
+		if (ret) {
+			string errmsg = "Internal error:  mmap of perf data file failed with errno: ";
+			errmsg += strerror(errno);
+			throw runtime_error(errmsg);
+		}
+		goto try_again;
+	}
+
+	size = event->header.size;
+
+	// The tail end of the operf data file may be zero'ed out, so we assume if we
+	// find size==0, we're now in that area of the file, so we're done.
+	if (size == 0)
+		return NULL;
+
+	info.head += size;  // advance past this record before the end-of-data check below
+	if (info.offset + info.head >= info.file_data_offset + info.file_data_size)  // NOTE(review): also NULL for a record ending exactly at the data end - confirm intended
+		return NULL;
+
+	return event;
+}
+
 }  // end anonymous namespace
 
 operf_counter::operf_counter(operf_event_t evt,  bool enable_on_exec, bool do_cg,
@@ -126,6 +223,10 @@ int operf_counter::perf_event_open(pid_t ppid, int cpu, unsigned event, operf_re
 operf_record::~operf_record()
 {
 	cverb << vperf << "operf_record::~operf_record()" << endl;
+	opHeader.data_size = total_bytes_recorded;
+	if (total_bytes_recorded)
+		write_op_header_info();
+
 	if (poll_data)
 		delete[] poll_data;
 	close(output_fd);
@@ -140,7 +241,7 @@ operf_record::~operf_record()
 
 operf_record::operf_record(int out_fd, bool sys_wide, pid_t the_pid, bool pid_running,
                            vector<operf_event_t> & events, vmlinux_info_t vi, bool do_cg,
-                           bool separate_by_cpu)
+bool separate_by_cpu, bool out_fd_is_file)
 {
 	int flags = O_CREAT|O_RDWR|O_TRUNC;
 	struct sigaction sa;
@@ -159,6 +260,8 @@ operf_record::operf_record(int out_fd, bool sys_wide, pid_t the_pid, bool pid_ru
 	valid = false;
 	poll_data = NULL;
 	output_fd = out_fd;
+	write_to_file = out_fd_is_file;
+	opHeader.data_size = 0;
 	num_cpus = -1;
 
 	if (system_wide && (pid != -1 || pid_started))
@@ -184,22 +287,50 @@ operf_record::operf_record(int out_fd, bool sys_wide, pid_t the_pid, bool pid_ru
 	setup();
 }
 
-
-void operf_record::register_perf_event_id(unsigned event, u64 id, perf_event_attr attr)
+int operf_record::_write_header_to_file(void)  // returns the sum of the op_write_output() results for the header sections
 {
-	// It's overkill to blindly do this assignment below every time, since this function
-	// is invoked once for each event for each cpu; but it's not worth the bother of trying
-	// to avoid it.
-	opHeader.h_attrs[event].attr = attr;
-	cverb << vperf << "Perf header: id = " << hex << (unsigned long long)id << " for event num "
-			<< event << ", code " << attr.config <<  endl;
-	opHeader.h_attrs[event].ids.push_back(id);
+	struct OP_file_header f_header;
+	struct op_file_attr f_attr;
+	int total = 0;
+	// Written in this order: per-event id arrays, then attrs; the file header at offset 0 is written last.
+	lseek(output_fd, sizeof(f_header), SEEK_SET);
+
+	for (unsigned i = 0; i < evts.size(); i++) {
+		opHeader.h_attrs[i].id_offset = lseek(output_fd, 0, SEEK_CUR);
+		total += op_write_output(output_fd, &opHeader.h_attrs[i].ids[0],
+		                         opHeader.h_attrs[i].ids.size() * sizeof(u64));
+	}
+
+	opHeader.attr_offset = lseek(output_fd, 0, SEEK_CUR);
+
+	for (unsigned i = 0; i < evts.size(); i++) {
+		struct op_header_evt_info attr = opHeader.h_attrs[i];
+		f_attr.attr = attr.attr;
+		f_attr.ids.offset = attr.id_offset;
+		f_attr.ids.size = attr.ids.size() * sizeof(u64);
+		total += op_write_output(output_fd, &f_attr, sizeof(f_attr));
+	}
+
+	opHeader.data_offset = lseek(output_fd, 0, SEEK_CUR);
+
+	f_header.magic = OP_MAGIC;
+	f_header.size = sizeof(f_header);
+	f_header.attr_size = sizeof(f_attr);
+	f_header.attrs.offset = opHeader.attr_offset;
+	f_header.attrs.size = evts.size() * sizeof(f_attr);
+	f_header.data.offset = opHeader.data_offset;
+	f_header.data.size = opHeader.data_size;
+	lseek(output_fd, 0, SEEK_SET);  // rewind to write the file header at offset 0
+	total += op_write_output(output_fd, &f_header, sizeof(f_header));
+	lseek(output_fd, opHeader.data_offset + opHeader.data_size, SEEK_SET);  // reposition at data_offset + data_size
+	return total;  // previously missing: the caller passes this value to add_to_total()
 }
 
-void operf_record::write_op_header_info()
+int operf_record::_write_header_to_pipe(void)
 {
 	struct OP_file_header f_header;
 	struct op_file_attr f_attr;
+	int total;
 
 	f_header.magic = OP_MAGIC;
 	f_header.size = sizeof(f_header);
@@ -207,19 +338,39 @@ void operf_record::write_op_header_info()
 	f_header.attrs.size = evts.size() * sizeof(f_attr);
 	f_header.data.size = 0;
 
-	add_to_total(op_write_output(output_fd, &f_header, sizeof(f_header)));
+	total = op_write_output(output_fd, &f_header, sizeof(f_header));
 
 	for (unsigned i = 0; i < evts.size(); i++) {
 		struct op_header_evt_info attr = opHeader.h_attrs[i];
 		f_attr.attr = attr.attr;
 		f_attr.ids.size = attr.ids.size() * sizeof(u64);
-		add_to_total(op_write_output(output_fd, &f_attr, sizeof(f_attr)));
+		total += op_write_output(output_fd, &f_attr, sizeof(f_attr));
 	}
 
 	for (unsigned i = 0; i < evts.size(); i++) {
-		add_to_total(op_write_output(output_fd, &opHeader.h_attrs[i].ids[0],
-		                             opHeader.h_attrs[i].ids.size() * sizeof(u64)));
+		total += op_write_output(output_fd, &opHeader.h_attrs[i].ids[0],
+		                         opHeader.h_attrs[i].ids.size() * sizeof(u64));
 	}
+	return total;
+}
+
+void operf_record::register_perf_event_id(unsigned event, u64 id, perf_event_attr attr)  // records attr and id in opHeader.h_attrs[event]
+{
+	// It's overkill to blindly do this assignment below every time, since this function
+	// is invoked once for each event for each cpu; but it's not worth the bother of trying
+	// to avoid it.
+	opHeader.h_attrs[event].attr = attr;
+	cverb << vperf << "Perf header: id = " << hex << (unsigned long long)id << " for event num "
+			<< event << ", code " << attr.config <<  endl;
+	opHeader.h_attrs[event].ids.push_back(id);  // ids accumulate: one entry is appended per call
+}
+
+void operf_record::write_op_header_info()  // writes the header via the file or pipe writer and adds its byte count to the total
+{
+	if (write_to_file)  // set from the constructor's out_fd_is_file argument
+		add_to_total(_write_header_to_file());
+	else
+		add_to_total(_write_header_to_pipe());
+}
 
 int operf_record::prepareToRecord(int cpu, int fd)
@@ -428,18 +579,20 @@ void operf_record::recordPerfData(void)
 			cverb << vperf << "operf_record::recordPerfData received signal to quit." << endl;
 		}
 	}
-	close(output_fd);
 	cverb << vdebug << "operf recording finished." << endl;
 }
 
-void operf_read::init(int sample_data_pipe_fd, string samples_loc,  op_cpu cputype, vector<operf_event_t> & events)
+void operf_read::init(int sample_data_pipe_fd, string input_filename, string samples_loc, op_cpu cputype,
+                      vector<operf_event_t> & events, bool systemwide)
 {
 	struct sigaction sa;
 	sigset_t ss;
 	sample_data_fd = sample_data_pipe_fd;
+	inputFname = input_filename;
 	sampledir = samples_loc;
 	evts = events;
 	cpu_type = cputype;
+	syswide = systemwide;
 	memset(&sa, 0, sizeof(struct sigaction));
 	sa.sa_sigaction = op_perfread_sigusr1_handler;
 	sigemptyset(&sa.sa_mask);
@@ -462,7 +615,78 @@ operf_read::~operf_read()
 	evts.clear();
 }
 
-int operf_read::readPerfHeader(void)
+
+void operf_read::_read_header_info_with_ifstream(void)  // loads opHeader from istrm; throws runtime_error on a short or invalid header
+{
+	struct OP_file_header fheader;
+	istrm.seekg(0, ios_base::beg);
+
+	if (op_read_from_stream(istrm, (char *)&fheader, sizeof(fheader)) != sizeof(fheader)) {
+		throw runtime_error("Error: input file " + inputFname + " does not have enough data for header");
+	}
+
+	if (memcmp(&fheader.magic, __op_magic, sizeof(fheader.magic)))
+		throw runtime_error("Error: input file " + inputFname + " does not have expected header data");
+
+	cverb << vperf << "operf magic number " << (char *)&fheader.magic << " matches expected __op_magic " << __op_magic << endl;
+	opHeader.attr_offset = fheader.attrs.offset;
+	opHeader.data_offset = fheader.data.offset;
+	opHeader.data_size = fheader.data.size;
+	size_t fattr_size = sizeof(struct op_file_attr);
+	if (fattr_size != fheader.attr_size) {  // the file's attr record size must match this build's struct op_file_attr
+		string msg = "Error: perf_events binary incompatibility. Event data collection was apparently "
+				"performed under a different kernel version than current.";
+		throw runtime_error(msg);
+	}
+	int num_fattrs = fheader.attrs.size/fheader.attr_size;
+	cverb << vperf << "num_fattrs  is " << num_fattrs << endl;
+	istrm.seekg(opHeader.attr_offset, ios_base::beg);
+	for (int i = 0; i < num_fattrs; i++) {
+		struct op_file_attr f_attr;
+		streamsize fattr_size = sizeof(f_attr);  // shadows the outer fattr_size (same value, different type)
+		if (op_read_from_stream(istrm, (char *)&f_attr, fattr_size) != fattr_size)
+			throw runtime_error("Error: Unexpected end of input file " + inputFname + ".");
+		opHeader.h_attrs[i].attr = f_attr.attr;  // NOTE(review): i is bounded only by the file's attrs.size, not by OP_MAX_NUM_EVENTS - confirm
+		streampos next_f_attr = istrm.tellg();  // remember where the next attr record starts
+		int num_ids = f_attr.ids.size/sizeof(u64);
+		istrm.seekg(f_attr.ids.offset, ios_base::beg);  // jump to this attr's id array
+		for (int id = 0; id < num_ids; id++) {
+			u64 perf_id;
+			streamsize perfid_size = sizeof(perf_id);
+			if (op_read_from_stream(istrm, (char *)& perf_id, perfid_size) != perfid_size)
+				throw runtime_error("Error: Unexpected end of input file " + inputFname + ".");
+			cverb << vperf << "Perf header: id = " << hex << (unsigned long long)perf_id << endl;
+			opHeader.h_attrs[i].ids.push_back(perf_id);
+		}
+		istrm.seekg(next_f_attr, ios_base::beg);  // return to the attr table for the next iteration
+	}
+	istrm.close();
+}
+
+int operf_read::_read_perf_header_from_file(void)  // 0 on success, -1 if the file cannot be opened, OP_PERF_HANDLED_ERROR if it is empty
+{
+	int ret = 0;
+
+	opHeader.data_size = 0;
+	istrm.open(inputFname.c_str(), ios_base::in);
+	if (!istrm.good()) {
+		return -1;
+	}
+	istrm.peek();  // peek so that eof() below reports an empty file
+	if (istrm.eof()) {
+		cverb << vperf << "operf_read::readPerfHeader:  Empty profile data file." << endl;
+		valid = false;
+		return OP_PERF_HANDLED_ERROR;
+	}
+	cverb << vperf << "operf_read: successfully opened input file " << inputFname << endl;
+	_read_header_info_with_ifstream();  // throws runtime_error on malformed header data; also closes istrm
+	valid = true;
+	cverb << vperf << "Successfully read perf header" << endl;
+
+	return ret;
+}
+
+int operf_read::_read_perf_header_from_pipe(void)
 {
 	struct OP_file_header fheader;
 	string errmsg;
@@ -526,6 +750,14 @@ fail:
 	return -1;
 }
 
+int operf_read::readPerfHeader(void)  // returns the status code of whichever header reader is used
+{
+	if (!inputFname.empty())  // a non-empty input filename selects the file reader; otherwise read from the pipe
+		return _read_perf_header_from_file();
+	else
+		return _read_perf_header_from_pipe();
+}
+
 int operf_read::get_eventnum_by_perf_event_id(u64 id) const
 {
 	for (unsigned i = 0; i < evts.size(); i++) {
@@ -538,54 +770,30 @@ int operf_read::get_eventnum_by_perf_event_id(u64 id) const
 	return -1;
 }
 
-int operf_read::_get_one_perf_event(event_t * event)
-{
-	static size_t pe_header_size = sizeof(perf_event_header);
-	char * evt = (char *)event;
-	ssize_t num_read;
-	perf_event_header * header = (perf_event_header *)event;
-
-	/* A signal handler was setup for the operf_read process to handle interrupts
-	 * (i.e., from ctrl-C), so the read syscalls below may get interrupted.  But the
-	 * operf_read process should ignore the interrupt and continue processing
-	 * until there's no more data to read or until the parent operf process
-	 * forces us to stop.  So we must try the read operation again if it was
-	 * interrupted.
-	 */
-again:
-	errno = 0;
-	if ((num_read = read(sample_data_fd, header, pe_header_size)) < 0) {
-		cverb << vdebug << "Read 1 of sample data pipe returned with " << strerror(errno) << endl;
-		if (errno == EINTR)
-			goto again;
-		else
-			return -1;
-	} else if (num_read == 0) {
-		return -1;
-	}
-	evt += pe_header_size;
-	if (!header->size)
-		return -1;
-
-again2:
-	if ((num_read = read(sample_data_fd, evt, header->size - pe_header_size)) < 0) {
-		cverb << vdebug << "Read 2 of sample data pipe returned with " << strerror(errno) << endl;
-		if (errno == EINTR)
-			goto again2;
-		else
-			return -1;
-	} else if (num_read == 0) {
-		return -1;
-	}
-	return 0;
-}
-
-
 int operf_read::convertPerfData(void)
 {
 	int num_bytes = 0;
-	// Allocate way more than enough space for a really big event with a long callchain
-	event_t * event = (event_t *)xmalloc(65536);
+	struct mmap_info info;
+	event_t * event;
+
+	if (!inputFname.empty()) {  // file mode: open and mmap the recorded data file
+		info.file_data_offset = opHeader.data_offset;
+		info.file_data_size = opHeader.data_size;
+		info.traceFD = open(inputFname.c_str(), O_RDONLY);
+		if (info.traceFD == -1) {
+			cerr << "Error: open failed with errno:\n\t" << strerror(errno) << endl;
+			throw runtime_error("Error: Unable to open operf data file");
+		}
+		cverb << vdebug << "operf_read opened " << inputFname << endl;
+		if (op_mmap_trace_file(info, true) < 0) {
+			close(info.traceFD);
+			throw runtime_error("Error: Unable to mmap operf data file");
+		}
+	} else {  // pipe mode: events are read into a heap buffer
+		// Allocate way more than enough space for a really big event with a long callchain
+		event = (event_t *)xmalloc(65536);
+		memset(event, '\0', 65536);
+	}
 
 	for (int i = 0; i < OPERF_MAX_STATS; i++)
 		operf_stats[i] = 0;
@@ -593,16 +801,30 @@ int operf_read::convertPerfData(void)
 	cverb << vdebug << "Converting operf data to oprofile sample data format" << endl;
 	cverb << vdebug << "sample type is " << hex <<  opHeader.h_attrs[0].attr.sample_type << endl;
 	first_time_processing = true;
-	memset(event, '\0', 65536);
+	int num_recs = 0;
+	bool print_progress = !inputFname.empty() && syswide;  // progress output only for file-based, system-wide conversion
+	if (print_progress)
+		cerr << "Converting profile data to OProfile format" << endl;
 	while (1) {
 		streamsize rec_size = 0;
-		if (_get_one_perf_event(event) < 0) {
-			break;
+		if (!inputFname.empty()) {
+			event = _get_perf_event_from_file(info);  // points into the mapping; NULL ends the loop
+			if (event == NULL)
+				break;
+		} else {
+			if (_get_perf_event_from_pipe(event, sample_data_fd) < 0)
+				break;
 		}
 		rec_size = event->header.size;
 		op_write_event(event, opHeader.h_attrs[0].attr.sample_type);
 		num_bytes += rec_size;
+		num_recs++;
+		if ((num_recs % 1000000 == 0) && print_progress)  // one dot per million records
+			cerr << ".";
 	}
+	if (print_progress)
+		cerr << endl;
+
 	first_time_processing = false;
 	op_reprocess_unresolved_events(opHeader.h_attrs[0].attr.sample_type);
 
@@ -615,6 +837,9 @@ int operf_read::convertPerfData(void)
 	strcat(cbuf, "/abi");
 	op_write_abi_to_file(cbuf);
 	free(cbuf);
-	free(event);
+	if (!inputFname.empty())  // file mode opened traceFD; pipe mode malloc'ed event (condition was inverted)
+		close(info.traceFD);
+	else
+		free(event);
 	return num_bytes;
 }
diff --git a/libperf_events/operf_counter.h b/libperf_events/operf_counter.h
index 5d88c4e..87c2796 100644
--- a/libperf_events/operf_counter.h
+++ b/libperf_events/operf_counter.h
@@ -78,9 +78,9 @@ public:
 	 * For single app profiling, set sys_wide=false, the_pid=<processID-to-profile>,
 	 * and pid_running=true if profiling an already active process; otherwise false.
 	 */
-	operf_record(int output_pipe_fd, bool sys_wide, pid_t the_pid, bool pid_running,
+	operf_record(int output_fd, bool sys_wide, pid_t the_pid, bool pid_running,
 	             std::vector<operf_event_t> & evts, OP_perf_utils::vmlinux_info_t vi,
-	             bool callgraph, bool separate_by_cpu);
+	             bool callgraph, bool separate_by_cpu, bool output_fd_is_file);
 	~operf_record();
 	void recordPerfData(void);
 	int out_fd(void) const { return output_fd; }
@@ -94,7 +94,10 @@ private:
 	void setup(void);
 	int prepareToRecord(int cpu, int fd);
 	void write_op_header_info(void);
+	int _write_header_to_file(void);
+	int _write_header_to_pipe(void);
 	int output_fd;
+	bool write_to_file;
 	struct pollfd * poll_data;
 	std::vector<struct mmap_data> samples_array;
 	int num_cpus;
@@ -115,9 +118,9 @@ private:
 
 class operf_read {
 public:
-	operf_read(void) : sample_data_fd(-1), cpu_type(CPU_NO_GOOD) { valid = false; }
-	void init(int sample_data_pipe_fd, std::string samples_dir, op_cpu cputype,
-	          std::vector<operf_event_t> & evts);
+	operf_read(void) : sample_data_fd(-1), inputFname(""), cpu_type(CPU_NO_GOOD) { valid = false; }
+	void init(int sample_data_pipe_fd, std::string input_filename, std::string samples_dir, op_cpu cputype,
+	          std::vector<operf_event_t> & evts, bool systemwide);
 	~operf_read();
 	int readPerfHeader(void);
 	int convertPerfData(void);
@@ -127,13 +130,18 @@ public:
 
 private:
 	int sample_data_fd;
+	std::string inputFname;
 	std::string sampledir;
 	std::ifstream istrm;
 	struct OP_header opHeader;
 	std::vector<operf_event_t> evts;
 	bool valid;
+	bool syswide;
 	op_cpu cpu_type;
 	int _get_one_perf_event(event_t *);
+	void _read_header_info_with_ifstream(void);
+	int _read_perf_header_from_file(void);
+	int _read_perf_header_from_pipe(void);
 };
 
 
diff --git a/libperf_events/operf_event.h b/libperf_events/operf_event.h
index aa94767..59f8390 100644
--- a/libperf_events/operf_event.h
+++ b/libperf_events/operf_event.h
@@ -123,8 +123,16 @@ typedef struct operf_event {
 	bool no_hv;
 } operf_event_t;
 
+struct mmap_info {  // state of the mmap'ed window over the operf data file
+	u64 offset, file_data_size, file_data_offset, head;  // offset: file offset passed to mmap; head: read position relative to buf
+	char * buf;  // address returned by mmap
+	int traceFD;  // file descriptor passed to mmap
+};
+
+
 struct op_file_section {
 	u64 size;
+	u64 offset;
 };
 
 struct op_file_attr {
@@ -135,6 +143,7 @@ struct op_file_attr {
 struct op_header_evt_info {
 	struct perf_event_attr attr;
 	std::vector<u64> ids;
+	off_t id_offset;
 };
 
 struct OP_file_header {
@@ -147,6 +156,9 @@ struct OP_file_header {
 
 struct OP_header {
 	struct op_header_evt_info h_attrs[OP_MAX_NUM_EVENTS];
+	off_t			attr_offset;
+	off_t			data_offset;
+	u64			data_size;
 };
 /* Some of the above definitions were borrowed from the perf tool's util/event.h file. */
 
diff --git a/libperf_events/operf_stats.cpp b/libperf_events/operf_stats.cpp
index af2b5f9..1d93f89 100644
--- a/libperf_events/operf_stats.cpp
+++ b/libperf_events/operf_stats.cpp
@@ -67,7 +67,7 @@ void operf_print_stats(string sessiondir, char * starttime, bool throttled)
 		fprintf(stderr, "* * * * WARNING: Profiling rate was throttled back by the kernel * * * *\n");
 		fprintf(stderr, "The number of samples actually recorded is less than expected, but is\n");
 		fprintf(stderr, "probably still statistically valid.  Decreasing the sampling rate is the\n");
-		fprintf(stderr, "best option if you want to avoid throttling.");
+		fprintf(stderr, "best option if you want to avoid throttling.\n");
 	}
 
 	// TODO: handle extended stats
diff --git a/libperf_events/operf_utils.cpp b/libperf_events/operf_utils.cpp
index d635913..6c9a366 100644
--- a/libperf_events/operf_utils.cpp
+++ b/libperf_events/operf_utils.cpp
@@ -55,6 +55,8 @@ map<u64, struct operf_mmap *> kernel_modules;
 struct operf_mmap * kernel_mmap;
 bool first_time_processing;
 bool throttled;
+size_t mmap_size;
+size_t pg_sz;
 
 static list<event_t *> unresolved_events;
 static struct operf_transient trans;
@@ -950,6 +952,63 @@ void OP_perf_utils::op_perfread_sigusr1_handler(int sig __attribute__((unused)),
 	read_quit = true;
 }
 
+int OP_perf_utils::op_read_from_stream(ifstream & is, char * buf, streamsize sz)  // returns bytes read, or -1 on a non-EOF failure
+{
+	int rc = 0;
+	is.read(buf, sz);
+	if (!is.eof() && is.fail()) {  // a failure that is not due to reaching end-of-file
+		cerr << "Internal error:  Failed to read from input file." << endl;
+		rc = -1;
+	} else {
+		rc = is.gcount();  // can be less than sz when end-of-file was reached
+	}
+	return rc;
+}
+
+
+static int __mmap_trace_file(struct mmap_info & info)  // maps mmap_size bytes of info.traceFD at info.offset into info.buf; 0 or -1
+{
+	int mmap_prot  = PROT_READ;
+	int mmap_flags = MAP_SHARED;
+
+	info.buf = (char *) mmap(NULL, mmap_size, mmap_prot,
+	                         mmap_flags, info.traceFD, info.offset);
+	if (info.buf == MAP_FAILED) {
+		cerr << "Error: mmap failed with errno:\n\t" << strerror(errno) << endl;
+		return -1;
+	}
+	else {
+		cverb << vperf << hex << "mmap with the following parameters" << endl
+		      << "\tinfo.head: " << info.head << endl
+		      << "\tinfo.offset: " << info.offset << endl;
+		return 0;
+	}
+}
+
+
+int OP_perf_utils::op_mmap_trace_file(struct mmap_info & info, bool init)  // returns 0 on success, -1 if mmap fails
+{
+	u64 shift;
+	if (init) {  // first mapping: set page size, mapping size and the starting offset/head
+		if (!pg_sz)
+			pg_sz = sysconf(_SC_PAGESIZE);
+		if (!mmap_size) {  // mmap_size is a global and is only computed while it is still zero
+			if (MMAP_WINDOW_SZ > info.file_data_size) {
+				mmap_size = info.file_data_size;
+			} else {
+				mmap_size = MMAP_WINDOW_SZ;
+			}
+		}
+		info.offset = 0;
+		info.head = info.file_data_offset;
+		shift = pg_sz * (info.head / pg_sz);  // data offset rounded down to a page boundary
+		info.offset += shift;  // the mapping starts at that page boundary
+		info.head -= shift;  // head becomes the first record's position within the mapping
+	}
+	return __mmap_trace_file(info);
+}
+
+
 int OP_perf_utils::op_write_output(int output, void *buf, size_t size)
 {
 	int sum = 0;
diff --git a/libperf_events/operf_utils.h b/libperf_events/operf_utils.h
index 70dabb8..c14942d 100644
--- a/libperf_events/operf_utils.h
+++ b/libperf_events/operf_utils.h
@@ -39,6 +39,11 @@ extern uid_t my_uid;
 extern bool throttled;
 
 #define OP_APPNAME_LEN 1024
+#if BITS_PER_LONG == 64
+#define MMAP_WINDOW_SZ ULLONG_MAX
+#else
+#define MMAP_WINDOW_SZ (32 * 1024 * 1024ULL)
+#endif
 
 extern unsigned int op_nr_counters;
 
@@ -66,6 +71,8 @@ void op_perfread_sigusr1_handler(int sig __attribute__((unused)),
 int op_record_process_info(bool system_wide, pid_t pid, operf_record * pr, int output_fd);
 int op_write_output(int output, void *buf, size_t size);
 void op_write_event(event_t * event, u64 sample_type);
+int op_read_from_stream(std::ifstream & is, char * buf, std::streamsize sz);
+int op_mmap_trace_file(struct mmap_info & info, bool init);
 int op_get_next_online_cpu(DIR * dir, struct dirent *entry);
 bool op_convert_event_vals(std::vector<operf_event_t> * evt_vec);
 void op_reprocess_unresolved_events(u64 sample_type);
diff --git a/pe_profiling/operf.cpp b/pe_profiling/operf.cpp
index 39a7faa..1bebde0 100644
--- a/pe_profiling/operf.cpp
+++ b/pe_profiling/operf.cpp
@@ -29,6 +29,7 @@
 #include <unistd.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <fcntl.h>
 #include <sys/wait.h>
 #include <ftw.h>
 #include <getopt.h>
@@ -71,6 +72,7 @@ bool no_vmlinux;
 int kptr_restrict;
 char * start_time_human_readable;
 
+#define DEFAULT_OPERF_OUTFILE "operf.data"
 #define CALLGRAPH_MIN_COUNT_SCALE 15
 
 static char full_pathname[PATH_MAX];
@@ -82,6 +84,7 @@ static pid_t operf_record_pid;
 static pid_t operf_read_pid;
 static string samples_dir;
 static bool startApp;
+static string outputfile;
 static char start_time_str[32];
 static vector<operf_event_t> events;
 static bool jit_conversion_running;
@@ -100,6 +103,7 @@ string session_dir;
 string vmlinux;
 bool separate_cpu;
 bool separate_thread;
+bool post_conversion;
 vector<string> evts;
 }
 
@@ -118,13 +122,14 @@ struct option long_options [] =
  {"events", required_argument, NULL, 'e'},
  {"separate-cpu", no_argument, NULL, 'c'},
  {"separate-thread", no_argument, NULL, 't'},
+ {"lazy-conversion", no_argument, NULL, 'l'},
  {"help", no_argument, NULL, 'h'},
  {"version", no_argument, NULL, 'v'},
  {"usage", no_argument, NULL, 'u'},
  {NULL, 9, NULL, 0}
 };
 
-const char * short_options = "V:d:k:gsap:e:cthuv";
+const char * short_options = "V:d:k:gsap:e:ctlhuv";
 
 vector<string> verbose_string;
 
@@ -288,8 +293,10 @@ int start_profiling(void)
 			perror("Internal error: fork failed");
 			_exit(EXIT_FAILURE);
 		} else if (app_PID == 0) { // child process for exec'ing app
-			close(sample_data_pipe[0]);
-			close(sample_data_pipe[1]);
+			if (!operf_options::post_conversion) {
+				close(sample_data_pipe[0]);
+				close(sample_data_pipe[1]);
+			}
 			run_app();
 		}
 	}
@@ -307,7 +314,8 @@ int start_profiling(void)
 		int exit_code = EXIT_SUCCESS;
 		_set_signals_for_record();
 		close(operf_record_ready_pipe[0]);
-		close(sample_data_pipe[0]);
+		if (!operf_options::post_conversion)
+			close(sample_data_pipe[0]);
 		/*
 		 * Since an informative message will be displayed to the user if
 		 * an error occurs, we don't want to blow chunks here; instead, we'll
@@ -316,13 +324,25 @@ int start_profiling(void)
 		 */
 		try {
 			OP_perf_utils::vmlinux_info_t vi;
+			int outfd;
+			int flags = O_WRONLY | O_CREAT | O_TRUNC;
 			vi.image_name = operf_options::vmlinux;
 			vi.start = kernel_start;
 			vi.end = kernel_end;
-			operf_record operfRecord(sample_data_pipe[1], operf_options::system_wide, app_PID,
+			if (operf_options::post_conversion) {
+				outfd = open(outputfile.c_str(), flags, S_IRUSR|S_IWUSR);
+				if (outfd < 0) {
+					string errmsg = "Internal error: Could not create temporary output file. errno is ";
+					errmsg += strerror(errno);
+					throw runtime_error(errmsg);
+				}
+			} else {
+				outfd = sample_data_pipe[1];
+			}
+			operf_record operfRecord(outfd, operf_options::system_wide, app_PID,
 			                         (operf_options::pid == app_PID), events, vi,
 			                         operf_options::callgraph,
-			                         operf_options::separate_cpu);
+			                         operf_options::separate_cpu, operf_options::post_conversion);
 			if (operfRecord.get_valid() == false) {
 				/* If valid is false, it means that one of the "known" errors has
 				 * occurred:
@@ -549,9 +569,10 @@ static end_code_t _run(void)
 	sigfillset(&ss);
 	sigprocmask(SIG_BLOCK, &ss, NULL);
 
-	// Create pipe to which operf-record process writes sample data and
-	// from which the operf-read process reads.
-	if (pipe(sample_data_pipe) < 0) {
+	/* By default (unless the user specifies --lazy-conversion), the operf-record process
+	 * writes the sample data to a pipe, from which the operf-read process reads.
+	 */
+	if (!operf_options::post_conversion && pipe(sample_data_pipe) < 0) {
 		perror("Internal error: operf-record could not create pipe");
 		_exit(EXIT_FAILURE);
 	}
@@ -566,19 +587,25 @@ static end_code_t _run(void)
 	/* If we're not doing system wide profiling and no app is started, then
 	 * there's no profile data to convert. So if this condition is NOT true,
 	 * then we'll do the convert.
+	 * Note that if --lazy-conversion is passed, then operf_options::post_conversion
+	 * will be set, and we will defer conversion until after the operf-record
+	 * process is done.
 	 */
-	if (!(!app_started && !operf_options::system_wide)) {
-		operf_read_pid = fork();
-		if (operf_read_pid < 0) {
-			perror("Internal error: fork failed");
-			_exit(EXIT_FAILURE);
-		} else if (operf_read_pid == 0) { // child process
+	if (!operf_options::post_conversion) {
+		if (!(!app_started && !operf_options::system_wide)) {
+			cverb << vdebug << "Forking read pid" << endl;
+			operf_read_pid = fork();
+			if (operf_read_pid < 0) {
+				perror("Internal error: fork failed");
+				_exit(EXIT_FAILURE);
+			} else if (operf_read_pid == 0) { // child process
+				close(sample_data_pipe[1]);
+				convert_sample_data();
+			}
+			// parent
+			close(sample_data_pipe[0]);
 			close(sample_data_pipe[1]);
-			convert_sample_data();
 		}
-		// parent
-		close(sample_data_pipe[0]);
-		close(sample_data_pipe[1]);
 	}
 
 	set_signals();
@@ -691,10 +718,16 @@ again:
 			}
 		}
 	}
-	if (kill_record)
-		rc = _kill_operf_read_pid(_kill_operf_record_pid());
-	else
-		rc = _kill_operf_read_pid(rc);
+	if (kill_record) {
+		if (operf_options::post_conversion)
+			rc = _kill_operf_record_pid();
+		else
+			rc = _kill_operf_read_pid(_kill_operf_record_pid());
+	} else {
+		if (!operf_options::post_conversion)
+			rc = _kill_operf_read_pid(rc);
+	}
+
 	return rc;
 }
 
@@ -704,6 +737,10 @@ static void cleanup(void)
 	free(app_args);
 	events.clear();
 	verbose_string.clear();
+	if (operf_options::post_conversion) {
+		string cmd = "rm -f " + outputfile;
+		system(cmd.c_str());
+	}
 }
 
 static void _jitconv_complete(int val __attribute__((unused)))
@@ -818,12 +855,16 @@ static int __delete_old_previous_sample_data(const char *fpath,
  */
 static void convert_sample_data(void)
 {
-	int rc;
+	int inputfd;
+	string inputfname;
+	int rc = EXIT_SUCCESS;
+	int keep_waiting = 0;
 	string current_sampledir = samples_dir + "/current/";
 	string previous_sampledir = samples_dir + "/previous";
 	current_sampledir.copy(op_samples_current_dir, current_sampledir.length(), 0);
 
-	_set_signals_for_convert();
+	if (!app_started && !operf_options::system_wide)
+		return;
 
 	if (!operf_options::append) {
                 int flags = FTW_DEPTH | FTW_ACTIONRETVAL;
@@ -833,15 +874,15 @@ static void convert_sample_data(void)
 			cerr << "Unable to remove old sample data at " << previous_sampledir << "." << endl;
 			if (errno)
 				cerr << strerror(errno) << endl;
-			cleanup();
-			_exit(EXIT_FAILURE);
+			rc = EXIT_FAILURE;
+			goto out;
 		}
 		if (rename(current_sampledir.c_str(), previous_sampledir.c_str()) < 0) {
 			if (errno && (errno != ENOENT)) {
 				cerr << "Unable to move old profile data to " << previous_sampledir << endl;
 				cerr << strerror(errno) << endl;
-				cleanup();
-				_exit(EXIT_FAILURE);
+				rc = EXIT_FAILURE;
+				goto out;
 			}
 		}
 	}
@@ -849,15 +890,23 @@ static void convert_sample_data(void)
 	if (rc && (errno != EEXIST)) {
 		cerr << "Error trying to create " << current_sampledir << " dir." << endl;
 		perror("mkdir failed with");
-		_exit(EXIT_FAILURE);
+		rc = EXIT_FAILURE;
+		goto out;
 	}
 
-	operfRead.init(sample_data_pipe[0], current_sampledir, cpu_type, events);
+	if (operf_options::post_conversion) {
+		inputfd = -1;
+		inputfname = outputfile;
+	} else {
+		inputfd = sample_data_pipe[0];
+		inputfname = "";
+	}
+	operfRead.init(inputfd, inputfname, current_sampledir, cpu_type, events, operf_options::system_wide);
 	if ((rc = operfRead.readPerfHeader()) < 0) {
 		if (rc != OP_PERF_HANDLED_ERROR)
 			cerr << "Error: Cannot create read header info for sample data " << endl;
-		cleanup();
-		_exit(EXIT_FAILURE);
+		rc = EXIT_FAILURE;
+		goto out;
 	}
 	cverb << vdebug << "Successfully read header info for sample data " << endl;
 	if (operfRead.is_valid()) {
@@ -867,12 +916,13 @@ static void convert_sample_data(void)
 		} catch (runtime_error e) {
 			cerr << "Caught runtime error from operf_read::convertPerfData" << endl;
 			cerr << e.what() << endl;
-			cleanup();
-			_exit(EXIT_FAILURE);
+			rc = EXIT_FAILURE;
+			goto out;
 		}
 	}
+	_set_signals_for_convert();
+	cverb << vdebug << "Calling _do_jitdump_convert" << endl;
 	_do_jitdump_convert();
-	int keep_waiting = 0;
 	while (jit_conversion_running && (keep_waiting < 2)) {
 		sleep(1);
 		keep_waiting++;
@@ -880,8 +930,9 @@ static void convert_sample_data(void)
 	if (jit_conversion_running) {
 		kill(jitconv_pid, SIGKILL);
 	}
-	_exit(EXIT_SUCCESS);
-
+out:
+	if (!operf_options::post_conversion)
+		_exit(rc);
 }
 
 
@@ -1388,6 +1439,9 @@ static int _process_operf_and_app_args(int argc, char * const argv[])
 		case 't':
 			operf_options::separate_thread = true;
 			break;
+		case 'l':
+			operf_options::post_conversion = true;
+			break;
 		case 'h':
 			__print_usage_and_exit(NULL);
 			break;
@@ -1460,6 +1514,8 @@ static void process_args(int argc, char * const argv[])
 	}
 
 	_process_session_dir();
+	if (operf_options::post_conversion)
+		outputfile = samples_dir + "/" + DEFAULT_OPERF_OUTFILE;
 
 	if (operf_options::evts.empty()) {
 		// Use default event
@@ -1620,6 +1676,10 @@ int main(int argc, char * const argv[])
 	} else {
 		cerr << endl << "Profiling done." << endl;
 	}
+	if (operf_options::post_conversion) {
+		if (!(!app_started && !operf_options::system_wide))
+			convert_sample_data();
+	}
 	cleanup();
 	return run_result;;
 }
-- 
1.6.2.rc2