From: <tho...@us...> - 2013-11-19 14:10:13
|
Revision: 7563 http://bigdata.svn.sourceforge.net/bigdata/?rev=7563&view=rev Author: thompsonbry Date: 2013-11-19 14:10:04 +0000 (Tue, 19 Nov 2013) Log Message: ----------- I have added unit tests for a physically empty HALog and a corrupt HALog parallel to the existing unit test for a logically empty HALog. All three conditions are correctly handled when the HALog with the problem is the successor of the last commit point on the journal. This closes out the bug identified above. The other aspects of this ticket remain open. See #775 (HAJournal start()) Modified Paths: -------------- branches/BIGDATA_RELEASE_1_3_0/bigdata-jini/src/java/com/bigdata/journal/jini/ha/HALogNexus.java branches/BIGDATA_RELEASE_1_3_0/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestAll.java branches/BIGDATA_RELEASE_1_3_0/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServerWithHALogs.java branches/BIGDATA_RELEASE_1_3_0/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3WORMJournalServer.java Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-jini/src/java/com/bigdata/journal/jini/ha/HALogNexus.java =================================================================== --- branches/BIGDATA_RELEASE_1_3_0/bigdata-jini/src/java/com/bigdata/journal/jini/ha/HALogNexus.java 2013-11-18 22:45:25 UTC (rev 7562) +++ branches/BIGDATA_RELEASE_1_3_0/bigdata-jini/src/java/com/bigdata/journal/jini/ha/HALogNexus.java 2013-11-19 14:10:04 UTC (rev 7563) @@ -70,7 +70,7 @@ */ public class HALogNexus implements IHALogWriter { - private static final Logger log = Logger.getLogger(SnapshotManager.class); + private static final Logger log = Logger.getLogger(HALogNexus.class); /** * Logger for HA events. 
@@ -249,37 +249,125 @@ * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/679" > * HAJournalServer can not restart due to logically empty log files * </a> + * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/775" > + * HAJournal start() </a> */ { /* - * Used to detect a logically empty HALog (iff it is the last one in - * commit order). + * Data structure used to detect a bad HALog and identify whether or + * not it is the last one in commit order. */ final HALogScanState tmp = new HALogScanState(); // Scan the HALog directory, populating the in-memory index. populateIndexRecursive(haLogDir, IHALogReader.HALOG_FILTER, tmp); - if (tmp.emptyHALogFile != null) { + final long commitCounterOnJournal = journal.getRootBlockView().getCommitCounter(); - /* - * The last HALog file is logically empty. It WAS NOT added to - * the in-memory index. We try to remove it now. + if (tmp.firstBadHALogFile != null) { + + /** + * If only the last HALog file is bad (physically empty, + * logically empty, bad MAGIC, etc), then it WAS NOT added to + * the in-memory index. * + * We try to remove it now and then start up normally. While we + * are short one HALog file, we can obtain it during + * resynchronization from the other nodes in the cluster. + * * Note: It is not critical that we succeed in removing this * HALog file so long as it does not interfere with the correct * startup of the HAJournalServer. */ - final File f = tmp.emptyHALogFile; + final File f = tmp.firstBadHALogFile; - if (!f.delete()) { + /* + * Parse out the closing commit counter for that HALog. This is + * the commit counter that would be assigned to the root block + * if this transaction had been applied to the Journal. 
+ */ + final long closingCommitCounter = CommitCounterUtility + .parseCommitCounterFile(f.getName(), + IHALogReader.HA_LOG_EXT); - log.warn("Could not remove empty HALog: " + f); + if (commitCounterOnJournal + 1 == closingCommitCounter) { + + /* + * This HALog file was for the next commit point to be + * recorded on the Journal. We can safely delete it and + * continue the normal startup. + */ + if (haLog.isInfoEnabled()) + haLog.info("Removing bad/empty HALog file: commitCounterOnJournal=" + + commitCounterOnJournal); + + if (!f.delete()) { + + log.warn("Could not remove empty HALog: " + f); + + } + + } else { + + /* + * This HALog file is bad. The service can not start until + * it has been replaced. + * + * FIXME Automate the replacement of the bad/missing HALog + * file from the quorum leader. + */ + throw new HALogException(tmp.firstBadHALogFile, + tmp.firstCause); + } } + + // Get the most recent HALog record from the index. + final IHALogRecord r = haLogIndex.getNewestEntry(); + + if (r != null) { + + /** + * Note: The logic above makes sure that we have each HALog in + * sequence from some unspecified starting point, but it does + * not verify that the last HALog file corresponds to the last + * durable commit point on the Journal, does not verify the + * number of local HALog files against some target (e.g., as + * specified by the restore policy), and does not verify that + * there are no HALog files for commit points beyond the last + * commit point on the journal (which could happen if someone + * did a point in time restore of the journal from a snapshot + * and failed to remove the HALog files after that point in + * time). + * + * TODO This should be refactored when we address #775. + */ + + if (r.getCommitCounter() < commitCounterOnJournal) { + /* + * Reject start if we are missing the HALog for the most + * recent commit point on the journal. 
+ */ + throw new RuntimeException( + "Missing HALog(s) for committed state on journal: journal@=" + + commitCounterOnJournal + ", lastHALog@" + + r.getCommitCounter()); + } + + /* + * Note: If there are HALog files for commit points beyond the + * most recent commit point on the journal, then those HALog + * files will be applied to roll forward the journal. This is + * done by HAJournalServer in its RESTORE state. Thus is + * necessary to remove any HALog files beyond the desired commit + * point before restarting the service when rolling back to a + * specific point in time. + */ + + } } @@ -309,13 +397,19 @@ */ private static class HALogScanState { /** - * Flag is set the first time an empty HALog file is identified. + * Flag is set the first time bad HALog file is identified. * <p> - * Note: We scan the HALog files in commit counter order. If the last - * file is (logically) empty, then we will silently remove it. However, - * if any other HALog file is logically empty, then this is an error. + * Note: We scan the HALog files in commit counter order. If only the + * last file in the scan is bad, then we will silently remove it - the + * HALog will be replaced when this service attempts to . + * However, if there is more than one bad HALog file, then this is an + * error. */ - File emptyHALogFile = null; + File firstBadHALogFile = null; + /** + * The exception when we first encountered a bad HALog file. + */ + Throwable firstCause = null; } /** @@ -328,11 +422,11 @@ * side-effect using the {@link HALogScanState} and will NOT be added to the * index. The caller SHOULD then remove the logically empty HALog file * - * TODO If an HALog is discovered to have bad checksums or otherwise corrupt - * root blocks and there is a met quorum, then we should re-replicate that - * HALog from the quourm leader. 
+ * FIXME If an HALog is discovered to have bad checksums or otherwise + * corrupt root blocks and there is a met quorum, then we should + * re-replicate that HALog from the quourm leader. * - * TODO For HALog files other than the last HALog file (in commit counter + * FIXME For HALog files other than the last HALog file (in commit counter * sequence) if there are any missing HALog files in the sequence, if any if * the files in the sequence other than the last HALog file is logically * empty, or if any of those HALog files has a bad root bloxks) then we @@ -342,6 +436,27 @@ * we allow the service to start, then it will have limited rollback * capability. All of this could be checked in an index scan once we have * identified all of the HALog files in the file system. + * + * TODO This could be rewritten to generate the filenames by running the + * commit counter from the first discovered HALog file's commit counter up + * through the current commit point on the journal. Alternatively, we could + * just start with the current commit point on the journal and the substract + * one and move backward until we find the first HALog file that is not + * locally available. We could then cross check this with the + * {@link IRestorePolicy} and decide whether we needed to back fill either + * HALog files or snapshots on this service in order to satisify the + * {@link IRestorePolicy}. This has the advantage that we start with the + * most recent HALog file first, so we can immediately diagnose any problems + * with the last commit point on restart. It removes the recursive logic and + * makes it easier to write code that decides whether or not a given HALog + * file being bad poses a problem and what kind of a problem and how to + * resolve that problem. There will be more GC associated with the + * generation of the file names from the commit counters, but we could get + * rid of that GC overhead entirely by supplying a reusable + * {@link StringBuilder}. 
+ * + * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/775" > + * HAJournal start() </a> */ private void populateIndexRecursive(final File f, final FileFilter fileFilter, final HALogScanState state) @@ -370,7 +485,7 @@ } else { - if (state.emptyHALogFile != null) { + if (state.firstBadHALogFile != null) { /* * We already have an empty HALog file. If there are any more @@ -380,8 +495,9 @@ * order). */ - throw new LogicallyEmptyHALogException(state.emptyHALogFile); - + throw new HALogException(state.firstBadHALogFile, + state.firstCause); + } try { @@ -389,16 +505,22 @@ // Attempt to add to the index. addHALog(f); - } catch (LogicallyEmptyHALogException ex) { + } catch (Throwable t) { + if (InnerCause.isInnerCause(t, InterruptedException.class)) { + // propagate interrupt. + throw new RuntimeException(t); + } + // Should be null since we checked this above. - assert state.emptyHALogFile == null; + assert state.firstBadHALogFile == null; /* * The first empty HALog file. There is at most one allowed and * it must be the last HALog file in commit counter order. */ - state.emptyHALogFile = f; + state.firstBadHALogFile = f; + state.firstCause = t; } @@ -430,6 +552,13 @@ final byte[] b0 = new byte[RootBlockView.SIZEOF_ROOT_BLOCK]; final byte[] b1 = new byte[RootBlockView.SIZEOF_ROOT_BLOCK]; + + if (file.length() == 0L) { + /* + * The file is physically empty (zero length). + */ + throw new EmptyHALogException(file); + } final DataInputStream is = new DataInputStream( new FileInputStream(file)); @@ -455,8 +584,8 @@ } catch(IOException ex) { // Wrap exception with the file name. - throw new IOException(ex.getMessage() + ", file=" + file, ex); - + throw new HALogException(file, ex); + } finally { is.close(); @@ -489,18 +618,16 @@ * @throws ChecksumError * if there is a checksum problem with the root blocks. * - * TODO If the root blocks are the same then this is an empty - * HALog. Right now that is an error. 
[We might want to simply - * remove any such HALog file.] - * <p> - * Likewise, it is an error if any HALog has bad root blocks - * (checksum or other errors). - * * TODO A similar problem exists if any of the HALog files GTE * the earliest snapshot are missing, have bad root blocks, etc. * We will not be able to restore the commit point associated * with that HALog file unless it also happens to correspond to - * a snapshot. + * a snapshot. Such bad/missing HALog files should be + * re-replicated from the quorum leader. This process should be + * automated. + * + * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/775" > + * HAJournal start() </a> */ private void addHALog(final File file) throws IOException, LogicallyEmptyHALogException { @@ -554,28 +681,66 @@ } /** + * Base class for exceptions when we are unable to read an HALog file. + * + * @author <a href="mailto:tho...@us...">Bryan Thompson</a> + */ + private static class HALogException extends IOException { + + private static final long serialVersionUID = 1L; + + public HALogException(final File file) { + + super(file.getAbsolutePath()); + + } + + public HALogException(final File file,final Throwable cause) { + + super(file.getAbsolutePath(), cause); + + } + + } + + /** * Exception raise when an HALog file is logically empty (the opening and * closing root blocks are identicial). * * @author <a href="mailto:tho...@us...">Bryan * Thompson</a> */ - private static class LogicallyEmptyHALogException extends IOException { + private static class LogicallyEmptyHALogException extends HALogException { - /** - * - */ private static final long serialVersionUID = 1L; public LogicallyEmptyHALogException(final File file) { - super(file.getAbsolutePath()); + super(file); } } /** + * Exception raise when an HALog file is physically empty (zero length). 
+ * + * @author <a href="mailto:tho...@us...">Bryan + * Thompson</a> + */ + private static class EmptyHALogException extends HALogException { + + private static final long serialVersionUID = 1L; + + public EmptyHALogException(final File file) { + + super(file); + + } + + } + + /** * Remove an snapshot from the file system and the {@link #haLogIndex}. * * @param file Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestAll.java =================================================================== --- branches/BIGDATA_RELEASE_1_3_0/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestAll.java 2013-11-18 22:45:25 UTC (rev 7562) +++ branches/BIGDATA_RELEASE_1_3_0/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestAll.java 2013-11-19 14:10:04 UTC (rev 7563) @@ -32,8 +32,6 @@ import junit.framework.TestSuite; import com.bigdata.journal.Journal; -import com.bigdata.journal.WORMStrategy; -import com.bigdata.rwstore.RWStore; /** * Test suite for highly available configurations of the standalone @@ -60,21 +58,6 @@ /** * Returns a test that will run each of the implementation specific test * suites in turn. - * - * FIXME (*) Test {@link WORMStrategy} and {@link RWStore} (through an override?) - * - * FIXME The NSS should transparently proxy mutation requests to the quorum - * leader (and to a global leader if offsite is supported, or maybe that is - * handled at a layer above). The tests need to be modified (A) to NOT only - * write on the leader; and (B) to verify that we can send a write request - * to ANY service that is joined with the met quorum. (And verify for POST, - * DELETE, and PUT since those are all different method.) - * <p> - * Note: We could have services that are not joined with the met quorum - * simply forward read requests to services that ARE joined with the met - * quorum. That way they can begin "accepting" reads and writes immediately. 
- * This could also be done one level down, using failover reads to reach a - * service joined with the met quorum. */ public static Test suite() { Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServerWithHALogs.java =================================================================== --- branches/BIGDATA_RELEASE_1_3_0/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServerWithHALogs.java 2013-11-18 22:45:25 UTC (rev 7562) +++ branches/BIGDATA_RELEASE_1_3_0/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServerWithHALogs.java 2013-11-19 14:10:04 UTC (rev 7563) @@ -27,6 +27,9 @@ package com.bigdata.journal.jini.ha; import java.io.File; +import java.io.FileOutputStream; +import java.io.OutputStream; +import java.nio.ByteBuffer; import net.jini.config.Configuration; @@ -37,7 +40,9 @@ import com.bigdata.ha.halog.IHALogReader; import com.bigdata.ha.msg.IHA2PhasePrepareMessage; import com.bigdata.journal.CommitCounterUtility; +import com.bigdata.journal.FileMetadata; import com.bigdata.journal.jini.ha.HAJournalTest.HAGlueTest; +import com.bigdata.util.InnerCause; /** * Test suite when we are using the {@link DefaultSnapshotPolicy} and @@ -104,7 +109,7 @@ * HAJournalServer can not restart due to logically empty log files * </a> */ - public void test_startABC_emptyLogFileDeletedOnRestartC() throws Exception { + public void test_startABC_logicallyEmptyLogFileDeletedOnRestartC() throws Exception { final ABC abc = new ABC(true/* sequential */); @@ -275,6 +280,620 @@ } /** + * This is a unit test for the ability to silently remove a physically empty + * HALog file. Three services are started in sequence (A,B,C). A series of + * small commits are applied to the quorum. (C) is then shutdown. A + * logically empty HALog file should exist on each service for the next + * commit point. We now overwrite that file with a physically empty HALog + * file (zero length). We then do one more update. 
C is then restarted. We + * verify that C restarts and that the physically empty HALog file has been + * replaced by an HALog file that has the same digest as the HALog file for + * that commit point on (A,B). + * <p> + * Note: We can not reliably observe that the physically empty HALog file was + * removed during startup. However, this is not critical. What is critical + * is that the physically empty HALog file (a) does not prevent (C) from + * starting; (b) is replaced by the correct HALog data from the quorum + * leader; and (c) that (C) resynchronizes with the met quorum and joins + * causing a fully met quorum. + * + * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/679" > + * HAJournalServer can not restart due to logically empty log files + * </a> + * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/775" > + * HAJournal start() </a> + */ + public void test_startABC_physicallyEmptyLogFileDeletedOnRestartC() throws Exception { + + final ABC abc = new ABC(true/* sequential */); + + final HAGlue serverA = abc.serverA, serverB = abc.serverB; + HAGlue serverC = abc.serverC; + + // Verify quorum is FULLY met. + awaitFullyMetQuorum(); + + // await the KB create commit point to become visible on each service. + awaitCommitCounter(1L, new HAGlue[] { serverA, serverB, serverC }); + + // Verify binary equality of ALL journals. + assertDigestsEquals(new HAGlue[] { serverA, serverB, serverC }); + + // Verify binary equality of ALL HALog files. + assertHALogDigestsEquals(1L/* firstCommitCounter */, + 1/* lastCommitCounter */, new HAGlue[] { serverA, serverB, + serverC }); + + /* + * Do a series of small commits. + */ + + final int NSMALL = 5; + + for (int i = 1/* createKB */; i <= NSMALL; i++) { + + simpleTransaction(); + + } + + final long commitCounter1 = 1 + NSMALL; // AKA (6) + + // await the commit points to become visible. 
+ awaitCommitCounter(commitCounter1, + new HAGlue[] { serverA, serverB, serverC }); + + // Verify binary equality of ALL journals. + assertDigestsEquals(new HAGlue[] { serverA, serverB, serverC }); + + // Verify binary equality of ALL HALog files. + assertHALogDigestsEquals(1L/* firstCommitCounter */, commitCounter1, + new HAGlue[] { serverA, serverB, serverC }); + + /* + * Verify the expected #of HALogs on each service. + * + * Note: This is (lastCommitCounter+1) since an empty HALog was created + * for the next commit point. + */ + awaitLogCount(getHALogDirA(), commitCounter1 + 1); + awaitLogCount(getHALogDirB(), commitCounter1 + 1); + awaitLogCount(getHALogDirC(), commitCounter1 + 1); + + /* + * Shutdown C. + * + * Note: This might cause the empty HALog file on (C) to be deleted. + * That is Ok, since we will copy the desired empty HALOg from (A) to + * (C), thus enforcing the desired test condition. + */ + shutdownC(); + + /* + * Verify that there is an empty HALog file on (A) for the next + * commit point. + */ + + // The next commit point. + final long commitCounter2 = commitCounter1 + 1; // AKA (7) + + // The HALog for that next commit point. + final File fileA = CommitCounterUtility.getCommitCounterFile( + getHALogDirA(), commitCounter2, IHALogReader.HA_LOG_EXT); + + // Verify HALog file for next commit point on A is logically empty. + { + assertTrue(fileA.exists()); + final IHALogReader r = new HALogReader(fileA); + assertTrue(r.isEmpty()); + assertFalse(r.isLive()); + r.close(); + assertTrue(fileA.exists()); + } + + // The name of that HALog file on (C). + final File fileC = CommitCounterUtility.getCommitCounterFile( + getHALogDirC(), commitCounter2, IHALogReader.HA_LOG_EXT); + +// // Copy that empty HALog file to (C). +// copyFile(fileA, fileC, false/* append */); + + // delete the logically empty file (if it exists). + if (fileC.exists() && !fileC.delete()) + fail("Could not delete: fileC=" + fileC); + + // create the physically empty file. 
+ if (!fileC.createNewFile()) + fail("Could not create: fileC=" + fileC); + + /* + * Do another transaction. This will cause the HALog file for that + * commit point to be non-empty on A. + */ + simpleTransaction(); + + /* + * Await the commit points to become visible. + * + * Note: This is (lastCommitCounter+1) since an empty HALog was created + * for the next commit point. + */ + awaitCommitCounter(commitCounter2, new HAGlue[] { serverA, serverB }); + + // Verify the expected #of HALogs on each service. + awaitLogCount(getHALogDirA(), commitCounter2 + 1); + awaitLogCount(getHALogDirB(), commitCounter2 + 1); + awaitLogCount(getHALogDirC(), commitCounter2); + + // Verify HALog file for next commit point on A is NOT empty. + { + assertTrue(fileA.exists()); + final IHALogReader r = new HALogReader(fileA); + assertFalse(r.isEmpty()); + assertFalse(r.isLive()); + r.close(); + assertTrue(fileA.exists()); + } + + // Verify HALog file for next commit point on C is phsyically empty. + { + assertTrue(fileC.exists()); + assertEquals(0L, fileC.length()); + } + + /* + * Restart (C). It should start without complaint. The logically empty + * HALog file should be replaced by the corresponding file from (A) by + * the time the quorum fully meets. At this point all services will have + * the same digests for all HALog files. + */ + + // Restart C. + serverC = startC(); + + // Wait until the quorum is fully met. + awaitFullyMetQuorum(); + + // await the commit points to become visible. + awaitCommitCounter(commitCounter2, + new HAGlue[] { serverA, serverB, serverC }); + + // Verify binary equality of ALL journals. + assertDigestsEquals(new HAGlue[] { serverA, serverB, serverC }); + + // Verify binary equality of ALL HALog files. + assertHALogDigestsEquals(1L/* firstCommitCounter */, + commitCounter2 /* lastCommitCounter */, new HAGlue[] { serverA, + serverB, serverC }); + + /* + * Verify the expected #of HALogs on each service. 
+ * + * Note: Each service will have an empty HALog for the next commit + * point. + */ + awaitLogCount(getHALogDirA(), commitCounter2+1); + awaitLogCount(getHALogDirB(), commitCounter2+1); + awaitLogCount(getHALogDirC(), commitCounter2+1); + + } + + /** + * This is a variant test for the ability to silently remove a corrupt HALog + * file on restart when it is the HALog file for the first write set not yet + * committed on the journal. Three services are started in sequence (A,B,C). + * A series of small commits are applied to the quorum. (C) is then + * shutdown. A logically empty HALog file should exist on each service for + * the next commit point. However, since this might have been removed on C + * when it was shutdown, we copy the logically empty HALog file from (A) to + * (C). We then overwrite the root blocks of that logically empty HALog file + * with junk. We then do one more update. C is then restarted. We verify + * that C restarts and that the corrupt HALog file has been replaced + * by an HALog file that has the same digest as the HALog file for that + * commit point on (A,B). + * <p> + * Note: We can not reliably observe that the logically HALog file was + * removed during startup. However, this is not critical. What is critical + * is that the logically empty HALog file (a) does not prevent (C) from + * starting; (b) is replaced by the correct HALog data from the quorum + * leader; and (c) that (C) resynchronizes with the met quorum and joins + * causing a fully met quorum. + * + * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/679" > + * HAJournalServer can not restart due to logically empty log files + * </a> + */ + public void test_startABC_corruptLogFileDeletedOnRestartC() throws Exception { + + final ABC abc = new ABC(true/* sequential */); + + final HAGlue serverA = abc.serverA, serverB = abc.serverB; + HAGlue serverC = abc.serverC; + + // Verify quorum is FULLY met. 
+ awaitFullyMetQuorum(); + + // await the KB create commit point to become visible on each service. + awaitCommitCounter(1L, new HAGlue[] { serverA, serverB, serverC }); + + // Verify binary equality of ALL journals. + assertDigestsEquals(new HAGlue[] { serverA, serverB, serverC }); + + // Verify binary equality of ALL HALog files. + assertHALogDigestsEquals(1L/* firstCommitCounter */, + 1/* lastCommitCounter */, new HAGlue[] { serverA, serverB, + serverC }); + + /* + * Do a series of small commits. + */ + + final int NSMALL = 5; + + for (int i = 1/* createKB */; i <= NSMALL; i++) { + + simpleTransaction(); + + } + + final long commitCounter1 = 1 + NSMALL; // AKA (6) + + // await the commit points to become visible. + awaitCommitCounter(commitCounter1, + new HAGlue[] { serverA, serverB, serverC }); + + // Verify binary equality of ALL journals. + assertDigestsEquals(new HAGlue[] { serverA, serverB, serverC }); + + // Verify binary equality of ALL HALog files. + assertHALogDigestsEquals(1L/* firstCommitCounter */, commitCounter1, + new HAGlue[] { serverA, serverB, serverC }); + + /* + * Verify the expected #of HALogs on each service. + * + * Note: This is (lastCommitCounter+1) since an empty HALog was created + * for the next commit point. + */ + awaitLogCount(getHALogDirA(), commitCounter1 + 1); + awaitLogCount(getHALogDirB(), commitCounter1 + 1); + awaitLogCount(getHALogDirC(), commitCounter1 + 1); + + /* + * Shutdown C. + * + * Note: This might cause the empty HALog file on (C) to be deleted. + * That is Ok, since we will copy the desired empty HALOg from (A) to + * (C), thus enforcing the desired test condition. + */ + shutdownC(); + + /* + * Verify that there is an empty HALog file on (A) for the next + * commit point. + */ + + // The next commit point. + final long commitCounter2 = commitCounter1 + 1; // AKA (7) + + // The HALog for that next commit point. 
+ final File fileA = CommitCounterUtility.getCommitCounterFile( + getHALogDirA(), commitCounter2, IHALogReader.HA_LOG_EXT); + + // Verify HALog file for next commit point on A is logically empty. + { + assertTrue(fileA.exists()); + final IHALogReader r = new HALogReader(fileA); + assertTrue(r.isEmpty()); + assertFalse(r.isLive()); + r.close(); + assertTrue(fileA.exists()); + } + + // The name of that HALog file on (C). + final File fileC = CommitCounterUtility.getCommitCounterFile( + getHALogDirC(), commitCounter2, IHALogReader.HA_LOG_EXT); + + // Copy that empty HALog file to (C). + copyFile(fileA, fileC, false/* append */); + /* + * Overwrite the root blocks of the HALog on (C). + */ + { + final OutputStream os = new FileOutputStream(fileC); + try { + final ByteBuffer buf = getRandomData(FileMetadata.headerSize0); + final byte[] b = getBytes(buf); + os.write(b); + os.flush(); + } finally { + os.close(); + } + } + + /* + * Do another transaction. This will cause the HALog file for that + * commit point to be non-empty on A. + */ + simpleTransaction(); + + /* + * Await the commit points to become visible. + * + * Note: This is (lastCommitCounter+1) since an empty HALog was created + * for the next commit point. + */ + awaitCommitCounter(commitCounter2, new HAGlue[] { serverA, serverB }); + + // Verify the expected #of HALogs on each service. + awaitLogCount(getHALogDirA(), commitCounter2 + 1); + awaitLogCount(getHALogDirB(), commitCounter2 + 1); + awaitLogCount(getHALogDirC(), commitCounter2); + + // Verify HALog file for next commit point on A is NOT empty. + { + assertTrue(fileA.exists()); + final IHALogReader r = new HALogReader(fileA); + assertFalse(r.isEmpty()); + assertFalse(r.isLive()); + r.close(); + assertTrue(fileA.exists()); + } + + // Verify HALog file for next commit point on C is corrupt. 
+ { + boolean ok = false; + try { + new HALogReader(fileC); + ok = true; + } catch(Throwable t) { + // Note: Could be IOException, ChecksumError, or + // RuntimeException. + } + if (ok) + fail("HALog is not corrupt: " + fileC); + } + + /* + * Restart (C). It should start without complaint. The logically empty + * HALog file should be replaced by the corresponding file from (A) by + * the time the quorum fully meets. At this point all services will have + * the same digests for all HALog files. + */ + + // Restart C. + serverC = startC(); + + // Wait until the quorum is fully met. + awaitFullyMetQuorum(); + + // await the commit points to become visible. + awaitCommitCounter(commitCounter2, + new HAGlue[] { serverA, serverB, serverC }); + + // Verify binary equality of ALL journals. + assertDigestsEquals(new HAGlue[] { serverA, serverB, serverC }); + + // Verify binary equality of ALL HALog files. + assertHALogDigestsEquals(1L/* firstCommitCounter */, + commitCounter2 /* lastCommitCounter */, new HAGlue[] { serverA, + serverB, serverC }); + + /* + * Verify the expected #of HALogs on each service. + * + * Note: Each service will have an empty HALog for the next commit + * point. + */ + awaitLogCount(getHALogDirA(), commitCounter2+1); + awaitLogCount(getHALogDirB(), commitCounter2+1); + awaitLogCount(getHALogDirC(), commitCounter2+1); + + } + + /** + * This is a unit test for the ability to correctly NOT remove a logically + * empty HALog file when that HALog file is for the last commit point on the + * Journal. Three services are started in sequence (A,B,C). A series of + * small commits are applied to the quorum. (C) is then shutdown. A + * logically empty HALog file should exist on each service for the next + * commit point. We remove the HALog for the next commit point from (C) if + * it exists. We then remove the HALog for the last durable commit point on + * (C) and replace it with a physically empty HALog file. We then do one + * more update. 
C is then restarted. We verify that C DOES NOT restart and + * that the physically empty HALog file for the last durable commit point on + * C has not been removed or updated. + * + * TODO This is the starting place for adding the capability to automatically + * replicate bad or missing historical HALog files from the quorum leader. + * The test exists now to ensure that the logic to remove a bad HALog on + * startup will refuse to remove an HALog corresponding to the most recent + * commit point on the Journal. + * + * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/679" > + * HAJournalServer can not restart due to logically empty log files + * </a> + * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/775" > + * HAJournal start() </a> + */ + public void test_startABC_missingHALogFileForLastCommitBlocksRestartC() throws Exception { + + final ABC abc = new ABC(true/* sequential */); + + final HAGlue serverA = abc.serverA, serverB = abc.serverB; + HAGlue serverC = abc.serverC; + + // Verify quorum is FULLY met. + awaitFullyMetQuorum(); + + // await the KB create commit point to become visible on each service. + awaitCommitCounter(1L, new HAGlue[] { serverA, serverB, serverC }); + + // Verify binary equality of ALL journals. + assertDigestsEquals(new HAGlue[] { serverA, serverB, serverC }); + + // Verify binary equality of ALL HALog files. + assertHALogDigestsEquals(1L/* firstCommitCounter */, + 1/* lastCommitCounter */, new HAGlue[] { serverA, serverB, + serverC }); + + /* + * Do a series of small commits. + */ + + final int NSMALL = 5; + + for (int i = 1/* createKB */; i <= NSMALL; i++) { + + simpleTransaction(); + + } + + final long commitCounter1 = 1 + NSMALL; // AKA (6) + + // await the commit points to become visible. + awaitCommitCounter(commitCounter1, + new HAGlue[] { serverA, serverB, serverC }); + + // Verify binary equality of ALL journals. 
+ assertDigestsEquals(new HAGlue[] { serverA, serverB, serverC }); + + // Verify binary equality of ALL HALog files. + assertHALogDigestsEquals(1L/* firstCommitCounter */, commitCounter1, + new HAGlue[] { serverA, serverB, serverC }); + + /* + * Verify the expected #of HALogs on each service. + * + * Note: This is (lastCommitCounter+1) since an empty HALog was created + * for the next commit point. + */ + awaitLogCount(getHALogDirA(), commitCounter1 + 1); + awaitLogCount(getHALogDirB(), commitCounter1 + 1); + awaitLogCount(getHALogDirC(), commitCounter1 + 1); + + /* + * Shutdown C. + * + * Note: This might cause the empty HALog file on (C) to be deleted. + * That is Ok, since we remove that HALog file from (C) below if it + * still exists, thus enforcing the desired test condition. + */ + shutdownC(); + + /* + * Verify that there is an empty HALog file on (A) for the next + * commit point. + */ + + // The next commit point. + final long commitCounter2 = commitCounter1 + 1; // AKA (7) + + // The HALog for that next commit point. + final File fileA = CommitCounterUtility.getCommitCounterFile( + getHALogDirA(), commitCounter2, IHALogReader.HA_LOG_EXT); + + // Verify HALog file for next commit point on A is logically empty. + { + assertTrue(fileA.exists()); + final IHALogReader r = new HALogReader(fileA); + assertTrue(r.isEmpty()); + assertFalse(r.isLive()); + r.close(); + assertTrue(fileA.exists()); + } + + // The name of that HALog file on (C). + final File fileC = CommitCounterUtility.getCommitCounterFile( + getHALogDirC(), commitCounter2, IHALogReader.HA_LOG_EXT); + +// // Copy that empty HALog file to (C). +// copyFile(fileA, fileC, false/* append */); + if (fileC.exists()) + if (!fileC.delete()) + fail("Could not remove HALog for open write set: " + fileC); + + // The HALog file on (C) for the last durable commit point on (C). 
+ final File fileCLastCommit = CommitCounterUtility.getCommitCounterFile( + getHALogDirC(), commitCounter1, IHALogReader.HA_LOG_EXT); + + if (!fileCLastCommit.exists()) + fail("HALog for last commit not found: " + fileCLastCommit); + + if (!fileCLastCommit.delete()) + fail("Could not remove HALog for last commit: " + fileCLastCommit); + + /* + * Do another transaction. This will cause the HALog file for that + * commit point to be non-empty on A. + */ + simpleTransaction(); + + /* + * Await the commit point to become visible on (A) and (B). + * + * Note: (C) was shutdown above, so only (A) and (B) are checked. + */ + awaitCommitCounter(commitCounter2, new HAGlue[] { serverA, serverB }); + + /* + * Verify the expected #of HALogs on each service. + * + * Note: This is (lastCommitCounter+1) on (A) and (B) since an empty + * HALog was created for the next commit point. The HALogs for the last + * and next commit points are missing on (C). + */ + awaitLogCount(getHALogDirA(), commitCounter2 + 1); + awaitLogCount(getHALogDirB(), commitCounter2 + 1); + awaitLogCount(getHALogDirC(), commitCounter1 - 1); + + // Verify HALog file for next commit point on A is NOT empty. + { + assertTrue(fileA.exists()); + final IHALogReader r = new HALogReader(fileA); + assertFalse(r.isEmpty()); + assertFalse(r.isLive()); + r.close(); + assertTrue(fileA.exists()); + } + + // Verify HALog files for last and next commit point on C are missing. + { + assertFalse(fileC.exists()); + assertFalse(fileCLastCommit.exists()); + } + + /* + * Restart (C). + * + * Note: This restart should fail. The number of HALog files on (C) + * should be unchanged. + */ + + // Restart C. + { + boolean ok = false; + try { + serverC = startC(); + ok = true; + } catch (Throwable t) { + if (InnerCause.isInnerCause(t, InterruptedException.class)) + // test interrupted? propagate interrupt. + throw new RuntimeException(t); + // log message. refused start is expected. + log.warn("C refused to start: " + t, t); + } + if (ok) + fail("C should not have restarted."); + } + + /* + * Verify the expected #of HALogs on each service. 
+ * + * Note: (A) and (B) will each have an empty HALog for the next commit + * point. The #of HALogs on (C) is unchanged since (C) refused to start. + */ + awaitLogCount(getHALogDirA(), commitCounter2 + 1); + awaitLogCount(getHALogDirB(), commitCounter2 + 1); + awaitLogCount(getHALogDirC(), commitCounter1 - 1); + + } + + /** * Unit test for a situation in which A B and C start. A quorum mets and the * third service resyncs with the met quorum. The quorum then fully meets. * Once the fully met quorum is stable, C is then restarted. This test Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3WORMJournalServer.java =================================================================== --- branches/BIGDATA_RELEASE_1_3_0/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3WORMJournalServer.java 2013-11-18 22:45:25 UTC (rev 7562) +++ branches/BIGDATA_RELEASE_1_3_0/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3WORMJournalServer.java 2013-11-19 14:10:04 UTC (rev 7563) @@ -1,3 +1,26 @@ +/** + +Copyright (C) SYSTAP, LLC 2006-2010. All rights reserved. + +Contact: + SYSTAP, LLC + 4501 Tower Road + Greensboro, NC 27410 + lic...@bi... + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ package com.bigdata.journal.jini.ha; import com.bigdata.journal.BufferMode; @@ -2,13 +25,19 @@ +/** + * FIXME HAWORM: This test suite is not implemented. It needs to override the + * {@link BufferMode}. 
+ * + * @author <a href="mailto:tho...@us...">Bryan Thompson</a> + */ public class TestHA3WORMJournalServer extends TestHA3JournalServer { - - - public TestHA3WORMJournalServer() {} - - public TestHA3WORMJournalServer(String nme) { - super(nme); - } - + + public TestHA3WORMJournalServer() { + } + + public TestHA3WORMJournalServer(String nme) { + super(nme); + } + protected BufferMode getDiskMode() { - return BufferMode.DiskWORM; + return BufferMode.DiskWORM; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |