OProfile bugz #264 ([bugs:#264] opjitconv gets "record past end of file" error)
describes a scenario where the JIT agent library (used by a JVM) and the pjitconv
process could be accessing a JIT dump file at the same time, resulting in
a incoherent dump file being used for conversion to the <PID>.jo ELF file.
I developed a patch to address this problem, and oprofile's JIT support
sub-maintainer, Daniel Hansel, reviewed it and acked it.  I've pushed the
patch (see below) upstream already, but comments are still welcome.

----------------------------------------------------------------------------

Plug timing hole between JIT agent and opjitconv that can corrupt dump file

There's a window of time where oprofile's Java JIT agent library may be
writing to a JIT dump file at a time when the opjitconv is simultaneously
making a copy of it for purposes of processing its records to create an
ELF file.  It's possible that opjitconv may make a copy of the file when
it's not in a coherent state -- i.e., the JIT agent may be in the process
of performing multiple writes for a record. There is no OS-level locking
of the file, so neither of the processes involved (the JIT agent and
opjitconv) are aware the other one is accessing the file.

No actual bugs have been reported due to this problem, so this patch is
a pre-emptive fix. Symptoms of this problem occurring would include
opjitconv not processing all JIT dump files, and operf displaying the message
    JIT dump processing exited abnormally: 1
Passing "--verbose=debug" to operf to collect detailed information on the JIT
conversion process would show the following error message:

    record past end of file
    JIT convert error -1

Signed-off-by: Maynard Johnson <maynardj@us.ibm.com>
---
 libopagent/opagent.c  |  139 +++++++++++++++++++++++++++++++++++++++++++++---
 opjitconv/opjitconv.c |   31 ++++++++++-
 2 files changed, 159 insertions(+), 11 deletions(-)

diff --git a/libopagent/opagent.c b/libopagent/opagent.c
index 5ec6138..0d97265 100644
--- a/libopagent/opagent.c
+++ b/libopagent/opagent.c
@@ -126,6 +126,8 @@ static int define_bfd_vars(void)
 
 op_agent_t op_open_agent(void)
 {
+#define OP_JITCONV_USECS_TO_WAIT 1000
+	unsigned int usecs_waited = 0;
 	char pad_bytes[7] = {0, 0, 0, 0, 0, 0, 0};
 	int pad_cnt;
 	char dump_path[PATH_MAX];
@@ -196,6 +198,23 @@ op_agent_t op_open_agent(void)
 		close(fd);
 		return NULL;
 	}
+
+again:
+	/* We need OS-level file locking here because the opjitconv process may need to
+	 * copy the dumpfile while the JIT agent is still writing to it. */
+	rc = flock(fd, LOCK_EX | LOCK_NB);
+	if (rc) {
+		if (usecs_waited < OP_JITCONV_USECS_TO_WAIT) {
+			usleep(100);
+			usecs_waited += 100;
+			goto again;
+		} else {
+			printf("opagent: Unable to obtain lock on JIT dumpfile\n");
+			return NULL;
+		}
+	}
+
+
 	if (define_bfd_vars()) {
 		fclose(dumpfile);
 		return NULL;
@@ -216,30 +235,35 @@ op_agent_t op_open_agent(void)
 
 	header.timestamp = tv.tv_sec;
 	snprintf(err_msg, PATH_MAX + 16, "Error writing to %s", dump_path);
-	if (!fwrite(&header, sizeof(header), 1, dumpfile)) {
+	if (!fwrite_unlocked(&header, sizeof(header), 1, dumpfile)) {
 		fclose(dumpfile);
 		fprintf(stderr, "%s\n", err_msg);
 		return NULL;
 	}
-	if (!fwrite(_bfd_target_name, strlen(_bfd_target_name) + 1, 1,
+	if (!fwrite_unlocked(_bfd_target_name, strlen(_bfd_target_name) + 1, 1,
 		    dumpfile)) {
 		fclose(dumpfile);
 		fprintf(stderr, "%s\n", err_msg);
 		return NULL;
 	}
 	/* write padding '\0' if necessary */
-	if (pad_cnt && !fwrite(pad_bytes, pad_cnt, 1, dumpfile)) {
+	if (pad_cnt && !fwrite_unlocked(pad_bytes, pad_cnt, 1, dumpfile)) {
 		fclose(dumpfile);
 		fprintf(stderr, "%s\n", err_msg);
 		return NULL;
 	}
-	fflush(dumpfile);
+	fflush_unlocked(dumpfile);
+	flock(fd, LOCK_UN);
+#undef OP_JITCONV_USECS_TO_WAIT
 	return (op_agent_t)dumpfile;
 }
 
 
 int op_close_agent(op_agent_t hdl)
 {
+#define OP_JITCONV_USECS_TO_WAIT 1000
+	unsigned int usecs_waited = 0;
+	int dumpfd, rc;
 	struct jr_code_close rec;
 	struct timeval tv;
 	FILE * dumpfile = (FILE *) hdl;
@@ -255,9 +279,26 @@ int op_close_agent(op_agent_t hdl)
 	}
 	rec.timestamp = tv.tv_sec;
 
-	if (!fwrite(&rec, sizeof(rec), 1, dumpfile))
+again:
+	/* We need OS-level file locking here because the opjitconv process may need to
+	 * copy the dumpfile while the JIT agent is still writing to it. */
+	rc = flock(dumpfd, LOCK_EX | LOCK_NB);
+	if (rc) {
+		if (usecs_waited < OP_JITCONV_USECS_TO_WAIT) {
+			usleep(100);
+			usecs_waited += 100;
+			goto again;
+		} else {
+			printf("opagent: Unable to obtain lock on JIT dumpfile\n");
+			return -1;
+		}
+	}
+
+	if (!fwrite_unlocked(&rec, sizeof(rec), 1, dumpfile))
 		return -1;
 	fclose(dumpfile);
+	flock(dumpfd, LOCK_UN);
+#undef OP_JITCONV_USECS_TO_WAIT
 	dumpfile = NULL;
 	return 0;
 }
@@ -266,6 +307,9 @@ int op_close_agent(op_agent_t hdl)
 int op_write_native_code(op_agent_t hdl, char const * symbol_name,
 	uint64_t vma, void const * code, unsigned int const size)
 {
+#define OP_JITCONV_USECS_TO_WAIT 1000
+	unsigned int usecs_waited = 0;
+	int dumpfd, rc;
 	struct jr_code_load rec;
 	struct timeval tv;
 	size_t sz_symb_name;
@@ -296,7 +340,27 @@ int op_write_native_code(op_agent_t hdl, char const * symbol_name,
 
 	rec.timestamp = tv.tv_sec;
 
-	/* locking makes sure that we continuously write this record, if
+	if ((dumpfd = fileno(dumpfile)) < 0) {
+		fprintf(stderr, "opagent: Unable to get file descriptor for JIT dumpfile\n");
+		return -1;
+	}
+again:
+	/* We need OS-level file locking here because the opjitconv process may need to
+	 * copy the dumpfile while the JIT agent is still writing to it.
+	 */
+	rc = flock(dumpfd, LOCK_EX | LOCK_NB);
+	if (rc) {
+		if (usecs_waited < OP_JITCONV_USECS_TO_WAIT) {
+			usleep(100);
+			usecs_waited += 100;
+			goto again;
+		} else {
+			printf("opagent: Unable to obtain lock on JIT dumpfile\n");
+			return -1;
+		}
+	}
+
+	/* This locking makes sure that we continuously write this record if
 	 * we are called within a multi-threaded context */
 	flockfile(dumpfile);
 	/* Write record, symbol name, code (optionally), and (if necessary)
@@ -312,10 +376,13 @@ int op_write_native_code(op_agent_t hdl, char const * symbol_name,
 		 * data as soon as possible */
 		fflush_unlocked(dumpfile);
 		funlockfile(dumpfile);
+		flock(dumpfd, LOCK_UN);
 		return 0;
 	}
 	fflush_unlocked(dumpfile);
 	funlockfile(dumpfile);
+	flock(dumpfd, LOCK_UN);
+#undef OP_JITCONV_USECS_TO_WAIT
 	return -1;
 }
 
@@ -324,13 +391,15 @@ int op_write_debug_line_info(op_agent_t hdl, void const * code,
 			     size_t nr_entry,
 			     struct debug_line_info const * compile_map)
 {
+#define OP_JITCONV_USECS_TO_WAIT 1000
+	unsigned int usecs_waited = 0;
 	struct jr_code_debug_info rec;
 	long cur_pos, last_pos;
 	struct timeval tv;
 	size_t i;
 	size_t padding_count;
 	char padd_bytes[7] = {0, 0, 0, 0, 0, 0, 0};
-	int rc = -1;
+	int dumpfd, rc = -1;
 	FILE * dumpfile = (FILE *) hdl;
 
 	if (!dumpfile) {
@@ -355,6 +424,27 @@ int op_write_debug_line_info(op_agent_t hdl, void const * code,
 
 	rec.timestamp = tv.tv_sec;
 
+	if ((dumpfd = fileno(dumpfile)) < 0) {
+		fprintf(stderr, "opagent: Unable to get file descriptor for JIT dumpfile\n");
+		return -1;
+	}
+again:
+	/* We need OS-level file locking here because the opjitconv process may need to
+	 * copy the dumpfile while the JIT agent is still writing to it. */
+	rc = flock(dumpfd, LOCK_EX | LOCK_NB);
+	if (rc) {
+		if (usecs_waited < OP_JITCONV_USECS_TO_WAIT) {
+			usleep(100);
+			usecs_waited += 100;
+			goto again;
+		} else {
+			printf("opagent: Unable to obtain lock on JIT dumpfile\n");
+			return -1;
+		}
+	}
+
+	/* This locking makes sure that we continuously write this record if
+	 * we are called within a multi-threaded context. */
 	flockfile(dumpfile);
 
 	if ((cur_pos = ftell(dumpfile)) == -1l)
@@ -392,12 +482,17 @@ int op_write_debug_line_info(op_agent_t hdl, void const * code,
 error:
 	fflush_unlocked(dumpfile);
 	funlockfile(dumpfile);
+	flock(dumpfd, LOCK_UN);
+#undef OP_JITCONV_USECS_TO_WAIT
 	return rc;
 }
 
 
 int op_unload_native_code(op_agent_t hdl, uint64_t vma)
 {
+#define OP_JITCONV_USECS_TO_WAIT 1000
+	int dumpfd, rc;
+	unsigned int usecs_waited = 0;
 	struct jr_code_unload rec;
 	struct timeval tv;
 	FILE * dumpfile = (FILE *) hdl;
@@ -417,9 +512,35 @@ int op_unload_native_code(op_agent_t hdl, uint64_t vma)
 	}
 	rec.timestamp = tv.tv_sec;
 
-	if (!fwrite(&rec, sizeof(rec), 1, dumpfile))
+	if ((dumpfd = fileno(dumpfile)) < 0) {
+		fprintf(stderr, "opagent: Unable to get file descriptor for JIT dumpfile\n");
+		return -1;
+	}
+again:
+	/* We need OS-level file locking here because the opjitconv process may need to
+	 * copy the dumpfile while the JIT agent is still writing to it. */
+	rc = flock(dumpfd, LOCK_EX | LOCK_NB);
+	if (rc) {
+		if (usecs_waited < OP_JITCONV_USECS_TO_WAIT) {
+			usleep(100);
+			usecs_waited += 100;
+			goto again;
+		} else {
+			printf("opagent: Unable to obtain lock on JIT dumpfile\n");
+			return -1;
+		}
+	}
+
+	/* This locking makes sure that we continuously write this record if
+	 * we are called within a multi-threaded context. */
+	flockfile(dumpfile);
+
+	if (!fwrite_unlocked(&rec, sizeof(rec), 1, dumpfile))
 		return -1;
-	fflush(dumpfile);
+	fflush_unlocked(dumpfile);
+	funlockfile(dumpfile);
+	flock(dumpfd, LOCK_UN);
+#undef OP_JITCONV_USECS_TO_WAIT
 	return 0;
 }
 
diff --git a/opjitconv/opjitconv.c b/opjitconv/opjitconv.c
index a9dfa91..712bb4a 100644
--- a/opjitconv/opjitconv.c
+++ b/opjitconv/opjitconv.c
@@ -34,6 +34,7 @@
 #include <sys/types.h>
 #include <unistd.h>
 #include <wait.h>
+#include <sys/file.h>
 
 /*
  * list head.  The linked list is used during parsing (parse_all) to
@@ -198,10 +199,31 @@ out:
  */
 int copy_dumpfile(char const * dumpfile, char * tmp_dumpfile)
 {
+#define OP_JITCONV_USECS_TO_WAIT 1000
+	int file_locked = 0;
+	unsigned int usecs_waited = 0;
 	int rc = OP_JIT_CONV_OK;
-
+	int fd = open(dumpfile, S_IRUSR);
+	if (fd < 0) {
+		perror("opjitconv failed to open JIT dumpfile");
+		return OP_JIT_CONV_FAIL;
+	}
+again:
+	// Need OS-level file locking here since opagent may still be writing to the file.
+	rc = flock(fd, LOCK_EX | LOCK_NB);
+	if (rc) {
+		if (usecs_waited < OP_JITCONV_USECS_TO_WAIT) {
+			usleep(100);
+			usecs_waited += 100;
+			goto again;
+		} else {
+			printf("opjitconv: Unable to obtain lock on %s.\n", dumpfile);
+			rc = OP_JIT_CONV_FAIL;
+			goto out;
+		}
+	}
+	file_locked = 1;
 	sprintf(sys_cmd_buffer, "/bin/cp -p %s %s", dumpfile, tmp_dumpfile);
-
 	if (system(sys_cmd_buffer) != 0) {
 		printf("opjitconv: Calling system() to copy files failed.\n");
 		rc = OP_JIT_CONV_FAIL;
@@ -215,6 +237,11 @@ int copy_dumpfile(char const * dumpfile, char * tmp_dumpfile)
 	}
 	
 out:
+#undef OP_JITCONV_USECS_TO_WAIT
+	close(fd);
+	if (file_locked)
+		flock(fd, LOCK_UN);
+
 	return rc;
 }
 
-- 
1.7.1