From: <tho...@us...> - 2013-10-25 13:20:22
|
Revision: 7482 http://bigdata.svn.sourceforge.net/bigdata/?rev=7482&view=rev Author: thompsonbry Date: 2013-10-25 13:20:15 +0000 (Fri, 25 Oct 2013) Log Message: ----------- Cleaning up test suite. - assertLogCount() => awaitLogCount() since there is non-determinism. Modified Paths: -------------- branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/AbstractHAJournalServerTestCase.java branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServer.java branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServerWithHALogs.java branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHAJournalServerOverride.java Modified: branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/AbstractHAJournalServerTestCase.java =================================================================== --- branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/AbstractHAJournalServerTestCase.java 2013-10-24 22:40:05 UTC (rev 7481) +++ branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/AbstractHAJournalServerTestCase.java 2013-10-25 13:20:15 UTC (rev 7482) @@ -1176,29 +1176,52 @@ } private Iterator<File> getLogs(final File f, final FileFilter fileFilter) { - ArrayList<File> files = new ArrayList<File>(); - - recursiveAdd(files, f, fileFilter); - - return files.iterator(); + + final ArrayList<File> files = new ArrayList<File>(); + + recursiveAdd(files, f, fileFilter); + + return files.iterator(); + } - protected void assertLogCount(final File logDir, final long count) { + protected void awaitLogCount(final File logDir, final long count) { + assertCondition(new Runnable() { + public void run() { + assertLogCount(logDir, count); + } + }, 5000, TimeUnit.MILLISECONDS); + + } + + /** + * Note: There is typically some non-determinism around when an HALog file + * is closed and a new one is opened in a 2-phase commit. Therefore you + * should generally use {@link #awaitLogCount(File, long)} rather than this + * method. + * + * @param logDir + * @param count + */ + private void assertLogCount(final File logDir, final long count) { + final long actual = recursiveCount(logDir, IHALogReader.HALOG_FILTER); - + if (actual != count) { - - final Iterator<File> logs = getLogs(logDir, IHALogReader.HALOG_FILTER); - StringBuilder fnmes = new StringBuilder(); - while (logs.hasNext()) { - fnmes.append("\n" + logs.next().getName()); - } - - fail("Actual log files: " + actual + ", expected: " + count + ", files: " + fnmes); - + + final Iterator<File> logs = getLogs(logDir, + IHALogReader.HALOG_FILTER); + StringBuilder fnmes = new StringBuilder(); + while (logs.hasNext()) { + fnmes.append("\n" + logs.next().getName()); + } + + fail("Actual log files: " + actual + ", expected: " + count + + ", files: " + fnmes); + } } - + } Modified: branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServer.java =================================================================== --- branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServer.java 2013-10-24 22:40:05 UTC (rev 7481) +++ branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServer.java 2013-10-25 13:20:15 UTC (rev 7482) @@ -1688,42 +1688,66 @@ */ public void testStartABC_halog() throws Exception { - // Start 3 services - startA(); - startB(); - startC(); - - awaitFullyMetQuorum(); - // setup log directories final File logsA = getHALogDirA(); final File logsB = getHALogDirB(); final File logsC = getHALogDirC(); + /* + * Start 2 services and wait for first commit point. The HALogs will not + * be purged because the quorum is not fully met. + */ + HAGlue serverA = startA(); + HAGlue serverB = startB(); + final long token = awaitMetQuorum(); + // initial token value. + assertEquals(0L, token); + awaitCommitCounter(1L, serverA, serverB); + awaitLogCount(logsA, 2); + awaitLogCount(logsB, 2); + + /* + * Start next service. The service will resync and join with the met + * quorum. The HALog files will not be purged - they do not get purged + * until the fully met quorum goes through a commit point. + */ + HAGlue serverC = startC(); + final long token2 = awaitFullyMetQuorum(); + assertEquals(token2,token); + awaitCommitCounter(1L, serverA, serverB, + serverC); + // committed log files not purged prior to fully met commit - assertLogCount(logsA, 2); - assertLogCount(logsB, 2); - assertLogCount(logsC, 2); + awaitLogCount(logsA, 2); + awaitLogCount(logsB, 2); + awaitLogCount(logsC, 2); + + // + // Above here the code is shared with halogRestart() + // // Run through transaction simpleTransaction(); // again check that only open log files remaining - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); - assertLogCount(logsC, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); + awaitLogCount(logsC, 1); assertEquals(1L, recursiveCount(getHALogDirA(),IHALogReader.HALOG_FILTER)); assertEquals(1L, recursiveCount(getHALogDirB(),IHALogReader.HALOG_FILTER)); assertEquals(1L, recursiveCount(getHALogDirC(),IHALogReader.HALOG_FILTER)); // Now run several transactions - for (int i = 0; i < 5; i++) - simpleTransaction(); + for (int i = 0; i < 5; i++) { + + simpleTransaction(); + + } // again check that only open log files remaining - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); - assertLogCount(logsC, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); + awaitLogCount(logsC, 1); } /** @@ -1733,37 +1757,52 @@ * @throws Exception */ public void testStartABC_halogRestart() throws Exception { - // Start 3 services in strict order. -// startA(); -// Thread.sleep(1000); // ensure A will be leader -// startB(); -// startC(); - new ABC(true/*sequential*/); - awaitFullyMetQuorum(); - // setup log directories final File logsA = getHALogDirA(); final File logsB = getHALogDirB(); final File logsC = getHALogDirC(); - final long token = awaitFullyMetQuorum(); - + /* + * Start 2 services and wait for first commit point. The HALogs will not + * be purged because the quorum is not fully met. + */ + HAGlue serverA = startA(); + HAGlue serverB = startB(); + final long token = awaitMetQuorum(); // initial token value. assertEquals(0L, token); + awaitCommitCounter(1L, serverA, serverB); + awaitLogCount(logsA, 2); + awaitLogCount(logsB, 2); + /* + * Start next service. The service will resync and join with the met + * quorum. The HALog files will not be purged - they do not get purged + * until the fully met quorum goes through a commit point. + */ + HAGlue serverC = startC(); + final long token2 = awaitFullyMetQuorum(); + assertEquals(token2,token); + awaitCommitCounter(1L, serverA, serverB, + serverC); + // committed log files not purged prior to fully met commit - assertLogCount(logsA, 2); - assertLogCount(logsB, 2); - assertLogCount(logsC, 2); + awaitLogCount(logsA, 2); + awaitLogCount(logsB, 2); + awaitLogCount(logsC, 2); + + // + // Above here the code is shared with halogRestart2() + // // Run through transaction simpleTransaction(); // again check that only open log files remaining - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); - assertLogCount(logsC, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); + awaitLogCount(logsC, 1); // Now shutdown all servers shutdownB(); @@ -1771,13 +1810,13 @@ shutdownA(); // and check that there are no logs - assertLogCount(logsA, 0); - assertLogCount(logsB, 0); - assertLogCount(logsC, 0); + awaitLogCount(logsA, 0); + awaitLogCount(logsB, 0); + awaitLogCount(logsC, 0); // startup AB - final HAGlue serverA = startA(); - final HAGlue serverB = startB(); + serverA = startA(); + serverB = startB(); final long token1 = awaitMetQuorum(); awaitHAStatus(serverA, HAStatusEnum.Leader); @@ -1785,63 +1824,74 @@ // Verify new quorum token. assertEquals(token + 1, token1); -((HAGlueTest)serverA).log("MARK"); -((HAGlueTest)serverB).log("MARK"); -Thread.sleep(1000/*ms*/); // FIXME Why this delay? // and check that there are open logs - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); // add C - final HAGlue serverC = startC(); + serverC = startC(); assertEquals(token1, awaitFullyMetQuorum()); awaitHAStatus(serverA, HAStatusEnum.Leader); awaitHAStatus(serverB, HAStatusEnum.Follower); awaitHAStatus(serverC, HAStatusEnum.Follower); -((HAGlueTest)serverA).log("MARK"); -((HAGlueTest)serverB).log("MARK"); -((HAGlueTest)serverC).log("MARK"); -Thread.sleep(1000/*ms*/); // FIXME Why this delay? // and check again for ABC - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); - assertLogCount(logsC, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); + awaitLogCount(logsC, 1); } /** * Variant where A is shutdown first. */ public void testStartABC_halogRestart2() throws Exception { - // Start 3 services in strict order. -// startA(); -// Thread.sleep(1000); // ensure A will be leader -// startB(); -// startC(); - new ABC(true/* sequential */); - - final long token = awaitFullyMetQuorum(); - - assertEquals(0L, token); - + // setup log directories final File logsA = getHALogDirA(); final File logsB = getHALogDirB(); final File logsC = getHALogDirC(); + /* + * Start 2 services and wait for first commit point. The HALogs will not + * be purged because the quorum is not fully met. + */ + HAGlue serverA = startA(); + HAGlue serverB = startB(); + final long token = awaitMetQuorum(); + // initial token value. + assertEquals(0L, token); + awaitCommitCounter(1L, serverA, serverB); + awaitLogCount(logsA, 2); + awaitLogCount(logsB, 2); + + /* + * Start next service. The service will resync and join with the met + * quorum. The HALog files will not be purged - they do not get purged + * until the fully met quorum goes through a commit point. + */ + HAGlue serverC = startC(); + final long token2 = awaitFullyMetQuorum(); + assertEquals(token2,token); + awaitCommitCounter(1L, serverA, serverB, + serverC); + // committed log files not purged prior to fully met commit - assertLogCount(logsA, 2); - assertLogCount(logsB, 2); - assertLogCount(logsC, 2); + awaitLogCount(logsA, 2); + awaitLogCount(logsB, 2); + awaitLogCount(logsC, 2); + + // + // Above here the code is shared with halogRestart() + // // Run through transaction simpleTransaction(); // again check that only open log files remaining - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); - assertLogCount(logsC, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); + awaitLogCount(logsC, 1); // Verify token unchanged. assertEquals(token, awaitFullyMetQuorum()); @@ -1852,18 +1902,15 @@ shutdownC(); // and check that there are no logs - assertLogCount(logsA, 0); - assertLogCount(logsB, 0); - assertLogCount(logsC, 0); + awaitLogCount(logsA, 0); + awaitLogCount(logsB, 0); + awaitLogCount(logsC, 0); // startup AB - final HAGlue serverA = startA(); - final HAGlue serverB = startB(); + serverA = startA(); + serverB = startB(); final long token1 = awaitMetQuorum(); -((HAGlueTest)serverA).log("MARK"); -((HAGlueTest)serverB).log("MARK"); -Thread.sleep(1000/*ms*/); // FIXME Why this delay? /* * Verify new quorum token (could be a quorum meet when the leader @@ -1872,23 +1919,19 @@ assertTrue(token1 >= token + 1); // and check that there are open logs - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); // add C - final HAGlue serverC = startC(); + serverC = startC(); // Verify quorum token is unchanged. assertEquals(token1, awaitFullyMetQuorum()); -((HAGlueTest)serverA).log("MARK"); -((HAGlueTest)serverB).log("MARK"); -((HAGlueTest)serverC).log("MARK"); -Thread.sleep(1000/*ms*/); // FIXME Why this delay? // and check again for ABC - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); - assertLogCount(logsC, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); + awaitLogCount(logsC, 1); } public void testABCMultiTransactionFollowerReads() throws Exception { @@ -2336,18 +2379,18 @@ simpleTransaction(); // 5 committed files + 1 open == 6 - assertLogCount(logsA, 6); - assertLogCount(logsB, 6); + awaitLogCount(logsA, 6); + awaitLogCount(logsB, 6); // and restart C with empty journal, forces Rebuild startC(); awaitFullyMetQuorum(); - assertLogCount(logsA, 6); - assertLogCount(logsB, 6); + awaitLogCount(logsA, 6); + awaitLogCount(logsB, 6); // Log count on C is 1 after quiescent rebuild - assertLogCount(logsC, 1); + awaitLogCount(logsC, 1); log.warn("CHECK: Committed log files not copied on Rebuild"); @@ -2356,9 +2399,9 @@ simpleTransaction(); // and check that only open log files remaining - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); - assertLogCount(logsC, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); + awaitLogCount(logsC, 1); } Modified: branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServerWithHALogs.java =================================================================== --- branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServerWithHALogs.java 2013-10-24 22:40:05 UTC (rev 7481) +++ branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServerWithHALogs.java 2013-10-25 13:20:15 UTC (rev 7482) @@ -152,9 +152,9 @@ * Note: This is (lastCommitCounter+1) since an empty HALog was created * for the next commit point. */ - assertLogCount(getHALogDirA(), commitCounter1 + 1); - assertLogCount(getHALogDirB(), commitCounter1 + 1); - assertLogCount(getHALogDirC(), commitCounter1 + 1); + awaitLogCount(getHALogDirA(), commitCounter1 + 1); + awaitLogCount(getHALogDirB(), commitCounter1 + 1); + awaitLogCount(getHALogDirC(), commitCounter1 + 1); /* * Shutdown C. @@ -209,9 +209,9 @@ awaitCommitCounter(commitCounter2, new HAGlue[] { serverA, serverB }); // Verify the expected #of HALogs on each service. - assertLogCount(getHALogDirA(), commitCounter2 + 1); - assertLogCount(getHALogDirB(), commitCounter2 + 1); - assertLogCount(getHALogDirC(), commitCounter2); + awaitLogCount(getHALogDirA(), commitCounter2 + 1); + awaitLogCount(getHALogDirB(), commitCounter2 + 1); + awaitLogCount(getHALogDirC(), commitCounter2); // Verify HALog file for next commit point on A is NOT empty. { @@ -264,9 +264,9 @@ * Note: Each service will have an empty HALog for the next commit * point. */ - assertLogCount(getHALogDirA(), commitCounter2+1); - assertLogCount(getHALogDirB(), commitCounter2+1); - assertLogCount(getHALogDirC(), commitCounter2+1); + awaitLogCount(getHALogDirA(), commitCounter2+1); + awaitLogCount(getHALogDirB(), commitCounter2+1); + awaitLogCount(getHALogDirC(), commitCounter2+1); } Modified: branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHAJournalServerOverride.java =================================================================== --- branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHAJournalServerOverride.java 2013-10-24 22:40:05 UTC (rev 7481) +++ branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHAJournalServerOverride.java 2013-10-25 13:20:15 UTC (rev 7482) @@ -492,10 +492,13 @@ * might have a consensus around the new commit point.) * * TODO Consider leader failure scenarios in this test suite, not just - * scenarios where B fails. Probably we should also cover failures of C (the - * 2nd follower). We should also cover scenariors where the quorum is barely - * met and a single failure causes a rejected commit (local decision) or - * 2-phase abort (joined services in joint agreement). + * scenarios where B fails. We MUST also cover failures of C (the 2nd + * follower). We should also cover scenariors where the quorum is barely met + * and a single failure causes a rejected commit (local decision) or 2-phase + * abort (joined services in joint agreement). + * + * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/760" > + * Review commit2Phase semantics when a follower fails </a> */ public void testStartABC_commit2Phase_B_fails() throws Exception { @@ -517,61 +520,92 @@ new Class[] { IHA2PhaseCommitMessage.class }, 0/* nwait */, 1/* nfail */); - // Simple transaction. - simpleTransaction(); + /** + * FIXME We need to resolve the correct behavior when B fails the commit + * after having prepared. Two code paths are outlined below. The + * implementation currently does an abort2Phase() when the + * commit2Phase() observe an error for B. That causes the commit point + * to NOT advance. + * + * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/760" > + * Review commit2Phase semantics when a follower fails </a> + */ -// try { -// // Simple transaction. -// simpleTransaction(); -// fail("Expecting failed transaction"); -// } catch(HttpException ex) { -// if (!ex.getMessage().contains( -// SpuriousTestException.class.getName())) { -// /* -// * Wrong inner cause. -// * -// * Note: The stack trace of the local exception does not include -// * the remote stack trace. The cause is formatted into the HTTP -// * response body. -// */ -// fail("Expecting " + ClocksNotSynchronizedException.class, t); -// } -// } - - // Verify quorum is unchanged. - assertEquals(token, quorum.token()); - - // Should be two commit points on {A,C]. - awaitCommitCounter(2L, startup.serverA, startup.serverC); - - /* - * B should go into an ERROR state and then into SeekConsensus and from - * there to RESYNC and finally back to RunMet. We can not reliably - * observe the intervening states. So what we really need to do is watch - * for B to move to the end of the pipeline and catch up to the same - * commit point. - */ + if(true) { - /* - * The pipeline should be reordered. B will do a service leave, then - * enter seek consensus, and then re-enter the pipeline. - */ - awaitPipeline(new HAGlue[] { startup.serverA, startup.serverC, - startup.serverB }); + // Simple transaction. + simpleTransaction(); - /* - * There should be two commit points on {A,C,B} (note that this assert - * does not pay attention to the pipeline order). - */ - awaitCommitCounter(2L, startup.serverA, startup.serverC, - startup.serverB); + // Verify quorum is unchanged. + assertEquals(token, quorum.token()); - // B should be a follower again. - awaitHAStatus(startup.serverB, HAStatusEnum.Follower); + // Should be two commit points on {A,C]. + awaitCommitCounter(2L, startup.serverA, startup.serverC); - // quorum token is unchanged. - assertEquals(token, quorum.token()); + /* + * B should go into an ERROR state and then into SeekConsensus and + * from there to RESYNC and finally back to RunMet. We can not + * reliably observe the intervening states. So what we really need + * to do is watch for B to move to the end of the pipeline and catch + * up to the same commit point. + */ + /* + * The pipeline should be reordered. B will do a service leave, then + * enter seek consensus, and then re-enter the pipeline. + */ + awaitPipeline(new HAGlue[] { startup.serverA, startup.serverC, + startup.serverB }); + + /* + * There should be two commit points on {A,C,B} (note that this + * assert does not pay attention to the pipeline order). + */ + awaitCommitCounter(2L, startup.serverA, startup.serverC, + startup.serverB); + + // B should be a follower again. + awaitHAStatus(startup.serverB, HAStatusEnum.Follower); + + // quorum token is unchanged. + assertEquals(token, quorum.token()); + + } else { + + try { + + // Simple transaction. + simpleTransaction(); + + fail("Expecting failed transaction"); + + } catch (Exception t) { + + if (!t.getMessage().contains( + SpuriousTestException.class.getName())) { + /* + * Wrong inner cause. + * + * Note: The stack trace of the local exception does not + * include the remote stack trace. The cause is formatted + * into the HTTP response body. + */ + fail("Expecting " + SpuriousTestException.class, t); + } + + } + + // Verify quorum is unchanged. + assertEquals(token, quorum.token()); + + // Should be ONE commit point on {A,B, C]. + awaitCommitCounter(1L, startup.serverA, startup.serverB, + startup.serverC); + + fail("finish test under these assumptions"); + + } + } /** This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |