From: <ke...@us...> - 2008-09-24 16:17:36
|
Revision: 7628 http://bacula.svn.sourceforge.net/bacula/?rev=7628&view=rev Author: kerns Date: 2008-09-24 16:13:15 +0000 (Wed, 24 Sep 2008) Log Message: ----------- This code should fix the race condition that leads to a Director crash at job end time when the job list is updated. This was reported in bug #1162. Modified Paths: -------------- branches/Branch-2.4/bacula/src/lib/jcr.c branches/Branch-2.4/bacula/technotes-2.4 Added Paths: ----------- branches/Branch-2.4/bacula/patches/2.4.2-jobend-crash.patch Added: branches/Branch-2.4/bacula/patches/2.4.2-jobend-crash.patch =================================================================== --- branches/Branch-2.4/bacula/patches/2.4.2-jobend-crash.patch (rev 0) +++ branches/Branch-2.4/bacula/patches/2.4.2-jobend-crash.patch 2008-09-24 16:13:15 UTC (rev 7628) @@ -0,0 +1,119 @@ + + This patch should fix the race condition that leads to a Director + crash at job end time when the job list is updated. This was reported + in bug #1162. + + Apply this patch to Bacula version 2.4.2 (and earlier) with: + + cd <bacula-source> + patch -p0 <2.4.2-jobend-crash.patch + ./configure <your-options> + make + ... 
+ make install + + +Index: src/lib/jcr.c +=================================================================== +--- src/lib/jcr.c (revision 7566) ++++ src/lib/jcr.c (working copy) +@@ -110,6 +110,7 @@ + void term_last_jobs_list() + { + if (last_jobs) { ++ lock_last_jobs_list(); + while (!last_jobs->empty()) { + void *je = last_jobs->first(); + last_jobs->remove(je); +@@ -117,6 +118,7 @@ + } + delete last_jobs; + last_jobs = NULL; ++ unlock_last_jobs_list(); + } + if (jcrs) { + delete jcrs; +@@ -128,6 +130,7 @@ + { + struct s_last_job *je, job; + uint32_t num; ++ bool ok = true; + + Dmsg1(100, "read_last_jobs seek to %d\n", (int)addr); + if (addr == 0 || lseek(fd, (off_t)addr, SEEK_SET) < 0) { +@@ -140,11 +143,13 @@ + if (num > 4 * max_last_jobs) { /* sanity check */ + return false; + } ++ lock_last_jobs_list(); + for ( ; num; num--) { + if (read(fd, &job, sizeof(job)) != sizeof(job)) { + berrno be; + Pmsg1(000, "Read job entry. ERR=%s\n", be.bstrerror()); +- return false; ++ ok = false; ++ break; + } + if (job.JobId > 0) { + je = (struct s_last_job *)malloc(sizeof(struct s_last_job)); +@@ -160,7 +165,8 @@ + } + } + } +- return true; ++ unlock_last_jobs_list(); ++ return ok; + } + + uint64_t write_last_jobs_list(int fd, uint64_t addr) +@@ -173,20 +179,22 @@ + return 0; + } + if (last_jobs) { ++ lock_last_jobs_list(); + /* First record is number of entires */ + num = last_jobs->size(); + if (write(fd, &num, sizeof(num)) != sizeof(num)) { + berrno be; + Pmsg1(000, "Error writing num_items: ERR=%s\n", be.bstrerror()); +- return 0; ++ goto bail_out; + } + foreach_dlist(je, last_jobs) { + if (write(fd, je, sizeof(struct s_last_job)) != sizeof(struct s_last_job)) { + berrno be; + Pmsg1(000, "Error writing job: ERR=%s\n", be.bstrerror()); +- return 0; ++ goto bail_out; + } + } ++ unlock_last_jobs_list(); + } + /* Return current address */ + ssize_t stat = lseek(fd, 0, SEEK_CUR); +@@ -195,6 +203,9 @@ + } + return stat; + ++bail_out: ++ unlock_last_jobs_list(); ++ return 0; + 
} + + void lock_last_jobs_list() +@@ -331,6 +342,7 @@ + last_job.end_time = time(NULL); + /* Keep list of last jobs, but not Console where JobId==0 */ + if (last_job.JobId > 0) { ++ lock_last_jobs_list(); + je = (struct s_last_job *)malloc(sizeof(struct s_last_job)); + memcpy((char *)je, (char *)&last_job, sizeof(last_job)); + if (!last_jobs) { +@@ -342,6 +354,7 @@ + last_jobs->remove(je); + free(je); + } ++ unlock_last_jobs_list(); + } + break; + default: Modified: branches/Branch-2.4/bacula/src/lib/jcr.c =================================================================== --- branches/Branch-2.4/bacula/src/lib/jcr.c 2008-09-24 13:23:01 UTC (rev 7627) +++ branches/Branch-2.4/bacula/src/lib/jcr.c 2008-09-24 16:13:15 UTC (rev 7628) @@ -110,6 +110,7 @@ void term_last_jobs_list() { if (last_jobs) { + lock_last_jobs_list(); while (!last_jobs->empty()) { void *je = last_jobs->first(); last_jobs->remove(je); @@ -117,6 +118,7 @@ } delete last_jobs; last_jobs = NULL; + unlock_last_jobs_list(); } if (jcrs) { delete jcrs; @@ -128,6 +130,7 @@ { struct s_last_job *je, job; uint32_t num; + bool ok = true; Dmsg1(100, "read_last_jobs seek to %d\n", (int)addr); if (addr == 0 || lseek(fd, (off_t)addr, SEEK_SET) < 0) { @@ -140,11 +143,13 @@ if (num > 4 * max_last_jobs) { /* sanity check */ return false; } + lock_last_jobs_list(); for ( ; num; num--) { if (read(fd, &job, sizeof(job)) != sizeof(job)) { berrno be; Pmsg1(000, "Read job entry. 
ERR=%s\n", be.bstrerror()); - return false; + ok = false; + break; } if (job.JobId > 0) { je = (struct s_last_job *)malloc(sizeof(struct s_last_job)); @@ -160,7 +165,8 @@ } } } - return true; + unlock_last_jobs_list(); + return ok; } uint64_t write_last_jobs_list(int fd, uint64_t addr) @@ -173,20 +179,22 @@ return 0; } if (last_jobs) { + lock_last_jobs_list(); /* First record is number of entires */ num = last_jobs->size(); if (write(fd, &num, sizeof(num)) != sizeof(num)) { berrno be; Pmsg1(000, "Error writing num_items: ERR=%s\n", be.bstrerror()); - return 0; + goto bail_out; } foreach_dlist(je, last_jobs) { if (write(fd, je, sizeof(struct s_last_job)) != sizeof(struct s_last_job)) { berrno be; Pmsg1(000, "Error writing job: ERR=%s\n", be.bstrerror()); - return 0; + goto bail_out; } } + unlock_last_jobs_list(); } /* Return current address */ ssize_t stat = lseek(fd, 0, SEEK_CUR); @@ -195,6 +203,9 @@ } return stat; +bail_out: + unlock_last_jobs_list(); + return 0; } void lock_last_jobs_list() @@ -331,6 +342,7 @@ last_job.end_time = time(NULL); /* Keep list of last jobs, but not Console where JobId==0 */ if (last_job.JobId > 0) { + lock_last_jobs_list(); je = (struct s_last_job *)malloc(sizeof(struct s_last_job)); memcpy((char *)je, (char *)&last_job, sizeof(last_job)); if (!last_jobs) { @@ -342,6 +354,7 @@ last_jobs->remove(je); free(je); } + unlock_last_jobs_list(); } break; default: Modified: branches/Branch-2.4/bacula/technotes-2.4 =================================================================== --- branches/Branch-2.4/bacula/technotes-2.4 2008-09-24 13:23:01 UTC (rev 7627) +++ branches/Branch-2.4/bacula/technotes-2.4 2008-09-24 16:13:15 UTC (rev 7628) @@ -1,6 +1,10 @@ Technical notes on version 2.4 General: +28Sep08 +kes This code should fix the race condition that leads to a Director + crash at job end time when the job list is updated. This was reported + in bug #1162. 
20Sep08 kes Remove all double quotes from SQLite creating script and replace by single quotes as suggested by John Huttley. This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |