From: <tho...@us...> - 2013-10-24 14:58:25
|
Revision: 7476 http://bigdata.svn.sourceforge.net/bigdata/?rev=7476&view=rev Author: thompsonbry Date: 2013-10-24 14:58:13 +0000 (Thu, 24 Oct 2013) Log Message: ----------- Test suite fix. restart leader and restart follower do not imply a guaranteed order in which the services will join. After a quorum break, everyone leaves and then comes back in. The first service to notice (or restart) will wind up as the new leader. Modified Paths: -------------- branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/AbstractHA3JournalServerTestCase.java branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA2JournalServer.java Modified: branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/AbstractHA3JournalServerTestCase.java =================================================================== --- branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/AbstractHA3JournalServerTestCase.java 2013-10-24 14:36:40 UTC (rev 7475) +++ branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/AbstractHA3JournalServerTestCase.java 2013-10-24 14:58:13 UTC (rev 7476) @@ -531,11 +531,18 @@ } protected void assertReady(final HAGlue[] members) throws IOException { - for (HAGlue member : members) { - final HAStatusEnum status = member.getHAStatus(); - if(log.isInfoEnabled())log.info(member.getServiceName() + ": " + status); - assertFalse(HAStatusEnum.NotReady == status); - } + + for (HAGlue member : members) { + + final HAStatusEnum status = member.getHAStatus(); + + if (log.isInfoEnabled()) + log.info(member.getServiceName() + ": " + status); + + assertFalse(HAStatusEnum.NotReady == status); + + } + } /** Modified: branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA2JournalServer.java =================================================================== --- branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA2JournalServer.java 2013-10-24 14:36:40 UTC (rev 7475) +++ branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA2JournalServer.java 2013-10-24 14:58:13 UTC (rev 7476) @@ -226,17 +226,23 @@ */ assertEquals(token1 + 1, token2); - // The leader should not have changed. - final HAGlue leader2 = quorum.getClient().getLeader(token2); - - final UUID leaderId2 = leader2.getServiceId(); - - if (!leaderId1.equals(leaderId2)) { - - fail("Expected leaderId=" + leaderId1 + ", but was " - + leaderId2); - - } + /* + * Note: The services will recast their votes. The pipeline order + * depends on whether the leader restarts more quickly than the + * latency required for the follower to notice that the leader is + * dead (this depends on the negotiated zookeeper session timeout). + */ +// // The leader should not have changed. +// final HAGlue leader2 = quorum.getClient().getLeader(token2); +// +// final UUID leaderId2 = leader2.getServiceId(); +// +// if (!leaderId1.equals(leaderId2)) { +// +// fail("Expected leaderId=" + leaderId1 + ", but was " +// + leaderId2); +// +// } /* * Verify that the votes were recast for the then current @@ -252,6 +258,8 @@ */ for (HAGlue service : new HAGlue[] { serverA, serverB }) { + awaitNSSAndHAReady(service); + final RemoteRepository repo = getRemoteRepository(service); // Should be empty. @@ -323,22 +331,28 @@ */ assertEquals(token1 + 1, token2); - // The leader should have changed. - final HAGlue leader2 = quorum.getClient().getLeader(token2); + /* + * Note: The services will recast their votes. The pipeline order + * depends on whether the leader restarts more quickly than the + * latency required for the follower to notice that the leader is + * dead (this depends on the negotiated zookeeper session timeout). + */ +// // The leader should have changed. +// final HAGlue leader2 = quorum.getClient().getLeader(token2); +// +// final UUID leaderId2 = leader2.getServiceId(); +// +// if (leaderId1.equals(leaderId2)) { +// /* +// * FIXME This fail message is not useful. +// * leaderId1.equals(leaderId2). it should report what the leader +// * *should* have been, but reports two identical values instead. +// */ +// fail("Expected leaderId=" + leaderId1 + ", but was " +// + leaderId2); +// +// } - final UUID leaderId2 = leader2.getServiceId(); - - if (leaderId1.equals(leaderId2)) { - /* - * FIXME This fail message is not useful. - * leaderId1.equals(leaderId2). it should report what the leader - * *should* have been, but reports two identical values instead. - */ - fail("Expected leaderId=" + leaderId1 + ", but was " - + leaderId2); - - } - /* * Verify that the votes were recast for the then current * lastCommitTime. @@ -352,6 +366,8 @@ * both the leader and the follower. */ for (HAGlue service : new HAGlue[] { serverA, serverB }) { + + awaitNSSAndHAReady(service); final RemoteRepository repo = getRemoteRepository(service); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <tho...@us...> - 2013-10-24 20:35:08
|
Revision: 7480 http://bigdata.svn.sourceforge.net/bigdata/?rev=7480&view=rev Author: thompsonbry Date: 2013-10-24 20:34:58 +0000 (Thu, 24 Oct 2013) Log Message: ----------- I have disabled the unit tests that stop and restart the zookeeper service. We can investigate re-enabling these tests based on our updated understanding of the zookeeper semantics. However, another way to approach this (and one which may make more sense) is to run a highly available zookeeper ensemble and then investigate how the bigdata HA replication cluster handles the transient zk client changes as we stop and start various zk servers, forcing the zk clients to rollover to different zk servers in the ensemble. See #718 (zk disconnect handling) See #724 (wire pull and sudden kill testing). Modified Paths: -------------- branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestAll.java branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHAJournalServerOverride.java Modified: branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestAll.java =================================================================== --- branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestAll.java 2013-10-24 20:25:13 UTC (rev 7479) +++ branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestAll.java 2013-10-24 20:34:58 UTC (rev 7480) @@ -114,7 +114,7 @@ suite.addTestSuite(TestHA3DumpLogs.class); // Verify ability to override the HAJournal implementation class. -// suite.addTestSuite(TestHAJournalServerOverride.class); FIXME RESTORE OVERRIDE TEST SUITE! + suite.addTestSuite(TestHAJournalServerOverride.class); // Test suite of longer running stress tests for an HA3 cluster. suite.addTestSuite(StressTestHA3JournalServer.class); Modified: branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHAJournalServerOverride.java =================================================================== --- branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHAJournalServerOverride.java 2013-10-24 20:25:13 UTC (rev 7479) +++ branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHAJournalServerOverride.java 2013-10-24 20:34:58 UTC (rev 7480) @@ -893,47 +893,54 @@ // // } - public void testStartAB_StopStartZookeeper() throws Exception { - - doStartAB_StopStartZookeeper(); - } - - public void testStartAB_StopStartZookeeperA() throws Exception { - - doStartAB_StopStartZookeeper(); - } - - public void testStartAB_StopStartZookeeperB() throws Exception { - - doStartAB_StopStartZookeeper(); - } - - public void testStartAB_StopStartZookeeperC() throws Exception { - - doStartAB_StopStartZookeeper(); - } - - public void testStartAB_StopStartZookeeperD() throws Exception { - - doStartAB_StopStartZookeeper(); - } - - public void testStartAB_StopStartZookeeperE() throws Exception { - - doStartAB_StopStartZookeeper(); - } - - public void testStartAB_StopStartZookeeperF() throws Exception { - - doStartAB_StopStartZookeeper(); - } - - public void testStartAB_StopStartZookeeperG() throws Exception { - - doStartAB_StopStartZookeeper(); - } + /* + * FIXME Consider re-enabling these tests. Do we want to attempt to + * programmatically stop and start the zookeeper ensemble under test + * control? For true HA deployments, what would be much more interesting + * is to run an ensemble with 3 zk servers and then do failovers of the + * zk servers and see how that effects the HAJournalServer instances. + */ +// public void testStartAB_StopStartZookeeper() throws Exception { +// +// doStartAB_StopStartZookeeper(); +// } +// +// public void testStartAB_StopStartZookeeperA() throws Exception { +// +// doStartAB_StopStartZookeeper(); +// } +// +// public void testStartAB_StopStartZookeeperB() throws Exception { +// +// doStartAB_StopStartZookeeper(); +// } +// +// public void testStartAB_StopStartZookeeperC() throws Exception { +// +// doStartAB_StopStartZookeeper(); +// } +// +// public void testStartAB_StopStartZookeeperD() throws Exception { +// +// doStartAB_StopStartZookeeper(); +// } +// +// public void testStartAB_StopStartZookeeperE() throws Exception { +// +// doStartAB_StopStartZookeeper(); +// } +// +// public void testStartAB_StopStartZookeeperF() throws Exception { +// +// doStartAB_StopStartZookeeper(); +// } +// +// public void testStartAB_StopStartZookeeperG() throws Exception { +// +// doStartAB_StopStartZookeeper(); +// } - public void doStartAB_StopStartZookeeper() throws Exception { + private void doStartAB_StopStartZookeeper() throws Exception { final HAGlue serverA = startA(); final HAGlue serverB = startB(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <tho...@us...> - 2013-10-25 13:20:22
|
Revision: 7482 http://bigdata.svn.sourceforge.net/bigdata/?rev=7482&view=rev Author: thompsonbry Date: 2013-10-25 13:20:15 +0000 (Fri, 25 Oct 2013) Log Message: ----------- Cleaning up test suite. - assertLogCount() => awaitLogCount() since there is non-determinism. Modified Paths: -------------- branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/AbstractHAJournalServerTestCase.java branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServer.java branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServerWithHALogs.java branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHAJournalServerOverride.java Modified: branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/AbstractHAJournalServerTestCase.java =================================================================== --- branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/AbstractHAJournalServerTestCase.java 2013-10-24 22:40:05 UTC (rev 7481) +++ branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/AbstractHAJournalServerTestCase.java 2013-10-25 13:20:15 UTC (rev 7482) @@ -1176,29 +1176,52 @@ } private Iterator<File> getLogs(final File f, final FileFilter fileFilter) { - ArrayList<File> files = new ArrayList<File>(); - - recursiveAdd(files, f, fileFilter); - - return files.iterator(); + + final ArrayList<File> files = new ArrayList<File>(); + + recursiveAdd(files, f, fileFilter); + + return files.iterator(); + } - protected void assertLogCount(final File logDir, final long count) { + protected void awaitLogCount(final File logDir, final long count) { + assertCondition(new Runnable() { + public void run() { + assertLogCount(logDir, count); + } + }, 5000, TimeUnit.MILLISECONDS); + + } + + /** + * Note: There is typically some non-determinism around when an HALog file + * is closed and a new one is opened in a 2-phase commit. Therefore you + * should generally use {@link #awaitLogCount(File, long)} rather than this + * method. + * + * @param logDir + * @param count + */ + private void assertLogCount(final File logDir, final long count) { + final long actual = recursiveCount(logDir, IHALogReader.HALOG_FILTER); - + if (actual != count) { - - final Iterator<File> logs = getLogs(logDir, IHALogReader.HALOG_FILTER); - StringBuilder fnmes = new StringBuilder(); - while (logs.hasNext()) { - fnmes.append("\n" + logs.next().getName()); - } - - fail("Actual log files: " + actual + ", expected: " + count + ", files: " + fnmes); - + + final Iterator<File> logs = getLogs(logDir, + IHALogReader.HALOG_FILTER); + StringBuilder fnmes = new StringBuilder(); + while (logs.hasNext()) { + fnmes.append("\n" + logs.next().getName()); + } + + fail("Actual log files: " + actual + ", expected: " + count + + ", files: " + fnmes); + } } - + } Modified: branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServer.java =================================================================== --- branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServer.java 2013-10-24 22:40:05 UTC (rev 7481) +++ branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServer.java 2013-10-25 13:20:15 UTC (rev 7482) @@ -1688,42 +1688,66 @@ */ public void testStartABC_halog() throws Exception { - // Start 3 services - startA(); - startB(); - startC(); - - awaitFullyMetQuorum(); - // setup log directories final File logsA = getHALogDirA(); final File logsB = getHALogDirB(); final File logsC = getHALogDirC(); + /* + * Start 2 services and wait for first commit point. The HALogs will not + * be purged because the quorum is not fully met. + */ + HAGlue serverA = startA(); + HAGlue serverB = startB(); + final long token = awaitMetQuorum(); + // initial token value. + assertEquals(0L, token); + awaitCommitCounter(1L, serverA, serverB); + awaitLogCount(logsA, 2); + awaitLogCount(logsB, 2); + + /* + * Start next service. The service will resync and join with the met + * quorum. The HALog files will not be purged - they do not get purged + * until the fully met quorum goes through a commit point. + */ + HAGlue serverC = startC(); + final long token2 = awaitFullyMetQuorum(); + assertEquals(token2,token); + awaitCommitCounter(1L, serverA, serverB, + serverC); + // committed log files not purged prior to fully met commit - assertLogCount(logsA, 2); - assertLogCount(logsB, 2); - assertLogCount(logsC, 2); + awaitLogCount(logsA, 2); + awaitLogCount(logsB, 2); + awaitLogCount(logsC, 2); + + // + // Above here the code is shared with halogRestart() + // // Run through transaction simpleTransaction(); // again check that only open log files remaining - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); - assertLogCount(logsC, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); + awaitLogCount(logsC, 1); assertEquals(1L, recursiveCount(getHALogDirA(),IHALogReader.HALOG_FILTER)); assertEquals(1L, recursiveCount(getHALogDirB(),IHALogReader.HALOG_FILTER)); assertEquals(1L, recursiveCount(getHALogDirC(),IHALogReader.HALOG_FILTER)); // Now run several transactions - for (int i = 0; i < 5; i++) - simpleTransaction(); + for (int i = 0; i < 5; i++) { + + simpleTransaction(); + + } // again check that only open log files remaining - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); - assertLogCount(logsC, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); + awaitLogCount(logsC, 1); } /** @@ -1733,37 +1757,52 @@ * @throws Exception */ public void testStartABC_halogRestart() throws Exception { - // Start 3 services in strict order. -// startA(); -// Thread.sleep(1000); // ensure A will be leader -// startB(); -// startC(); - new ABC(true/*sequential*/); - awaitFullyMetQuorum(); - // setup log directories final File logsA = getHALogDirA(); final File logsB = getHALogDirB(); final File logsC = getHALogDirC(); - final long token = awaitFullyMetQuorum(); - + /* + * Start 2 services and wait for first commit point. The HALogs will not + * be purged because the quorum is not fully met. + */ + HAGlue serverA = startA(); + HAGlue serverB = startB(); + final long token = awaitMetQuorum(); // initial token value. assertEquals(0L, token); + awaitCommitCounter(1L, serverA, serverB); + awaitLogCount(logsA, 2); + awaitLogCount(logsB, 2); + /* + * Start next service. The service will resync and join with the met + * quorum. The HALog files will not be purged - they do not get purged + * until the fully met quorum goes through a commit point. + */ + HAGlue serverC = startC(); + final long token2 = awaitFullyMetQuorum(); + assertEquals(token2,token); + awaitCommitCounter(1L, serverA, serverB, + serverC); + // committed log files not purged prior to fully met commit - assertLogCount(logsA, 2); - assertLogCount(logsB, 2); - assertLogCount(logsC, 2); + awaitLogCount(logsA, 2); + awaitLogCount(logsB, 2); + awaitLogCount(logsC, 2); + + // + // Above here the code is shared with halogRestart2() + // // Run through transaction simpleTransaction(); // again check that only open log files remaining - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); - assertLogCount(logsC, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); + awaitLogCount(logsC, 1); // Now shutdown all servers shutdownB(); @@ -1771,13 +1810,13 @@ shutdownA(); // and check that there are no logs - assertLogCount(logsA, 0); - assertLogCount(logsB, 0); - assertLogCount(logsC, 0); + awaitLogCount(logsA, 0); + awaitLogCount(logsB, 0); + awaitLogCount(logsC, 0); // startup AB - final HAGlue serverA = startA(); - final HAGlue serverB = startB(); + serverA = startA(); + serverB = startB(); final long token1 = awaitMetQuorum(); awaitHAStatus(serverA, HAStatusEnum.Leader); @@ -1785,63 +1824,74 @@ // Verify new quorum token. assertEquals(token + 1, token1); -((HAGlueTest)serverA).log("MARK"); -((HAGlueTest)serverB).log("MARK"); -Thread.sleep(1000/*ms*/); // FIXME Why this delay? // and check that there are open logs - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); // add C - final HAGlue serverC = startC(); + serverC = startC(); assertEquals(token1, awaitFullyMetQuorum()); awaitHAStatus(serverA, HAStatusEnum.Leader); awaitHAStatus(serverB, HAStatusEnum.Follower); awaitHAStatus(serverC, HAStatusEnum.Follower); -((HAGlueTest)serverA).log("MARK"); -((HAGlueTest)serverB).log("MARK"); -((HAGlueTest)serverC).log("MARK"); -Thread.sleep(1000/*ms*/); // FIXME Why this delay? // and check again for ABC - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); - assertLogCount(logsC, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); + awaitLogCount(logsC, 1); } /** * Variant where A is shutdown first. */ public void testStartABC_halogRestart2() throws Exception { - // Start 3 services in strict order. -// startA(); -// Thread.sleep(1000); // ensure A will be leader -// startB(); -// startC(); - new ABC(true/* sequential */); - - final long token = awaitFullyMetQuorum(); - - assertEquals(0L, token); - + // setup log directories final File logsA = getHALogDirA(); final File logsB = getHALogDirB(); final File logsC = getHALogDirC(); + /* + * Start 2 services and wait for first commit point. The HALogs will not + * be purged because the quorum is not fully met. + */ + HAGlue serverA = startA(); + HAGlue serverB = startB(); + final long token = awaitMetQuorum(); + // initial token value. + assertEquals(0L, token); + awaitCommitCounter(1L, serverA, serverB); + awaitLogCount(logsA, 2); + awaitLogCount(logsB, 2); + + /* + * Start next service. The service will resync and join with the met + * quorum. The HALog files will not be purged - they do not get purged + * until the fully met quorum goes through a commit point. + */ + HAGlue serverC = startC(); + final long token2 = awaitFullyMetQuorum(); + assertEquals(token2,token); + awaitCommitCounter(1L, serverA, serverB, + serverC); + // committed log files not purged prior to fully met commit - assertLogCount(logsA, 2); - assertLogCount(logsB, 2); - assertLogCount(logsC, 2); + awaitLogCount(logsA, 2); + awaitLogCount(logsB, 2); + awaitLogCount(logsC, 2); + + // + // Above here the code is shared with halogRestart() + // // Run through transaction simpleTransaction(); // again check that only open log files remaining - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); - assertLogCount(logsC, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); + awaitLogCount(logsC, 1); // Verify token unchanged. assertEquals(token, awaitFullyMetQuorum()); @@ -1852,18 +1902,15 @@ shutdownC(); // and check that there are no logs - assertLogCount(logsA, 0); - assertLogCount(logsB, 0); - assertLogCount(logsC, 0); + awaitLogCount(logsA, 0); + awaitLogCount(logsB, 0); + awaitLogCount(logsC, 0); // startup AB - final HAGlue serverA = startA(); - final HAGlue serverB = startB(); + serverA = startA(); + serverB = startB(); final long token1 = awaitMetQuorum(); -((HAGlueTest)serverA).log("MARK"); -((HAGlueTest)serverB).log("MARK"); -Thread.sleep(1000/*ms*/); // FIXME Why this delay? /* * Verify new quorum token (could be a quorum meet when the leader @@ -1872,23 +1919,19 @@ assertTrue(token1 >= token + 1); // and check that there are open logs - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); // add C - final HAGlue serverC = startC(); + serverC = startC(); // Verify quorum token is unchanged. assertEquals(token1, awaitFullyMetQuorum()); -((HAGlueTest)serverA).log("MARK"); -((HAGlueTest)serverB).log("MARK"); -((HAGlueTest)serverC).log("MARK"); -Thread.sleep(1000/*ms*/); // FIXME Why this delay? // and check again for ABC - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); - assertLogCount(logsC, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); + awaitLogCount(logsC, 1); } public void testABCMultiTransactionFollowerReads() throws Exception { @@ -2336,18 +2379,18 @@ simpleTransaction(); // 5 committed files + 1 open == 6 - assertLogCount(logsA, 6); - assertLogCount(logsB, 6); + awaitLogCount(logsA, 6); + awaitLogCount(logsB, 6); // and restart C with empty journal, forces Rebuild startC(); awaitFullyMetQuorum(); - assertLogCount(logsA, 6); - assertLogCount(logsB, 6); + awaitLogCount(logsA, 6); + awaitLogCount(logsB, 6); // Log count on C is 1 after quiescent rebuild - assertLogCount(logsC, 1); + awaitLogCount(logsC, 1); log.warn("CHECK: Committed log files not copied on Rebuild"); @@ -2356,9 +2399,9 @@ simpleTransaction(); // and check that only open log files remaining - assertLogCount(logsA, 1); - assertLogCount(logsB, 1); - assertLogCount(logsC, 1); + awaitLogCount(logsA, 1); + awaitLogCount(logsB, 1); + awaitLogCount(logsC, 1); } Modified: branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServerWithHALogs.java =================================================================== --- branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServerWithHALogs.java 2013-10-24 22:40:05 UTC (rev 7481) +++ branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHA3JournalServerWithHALogs.java 2013-10-25 13:20:15 UTC (rev 7482) @@ -152,9 +152,9 @@ * Note: This is (lastCommitCounter+1) since an empty HALog was created * for the next commit point. */ - assertLogCount(getHALogDirA(), commitCounter1 + 1); - assertLogCount(getHALogDirB(), commitCounter1 + 1); - assertLogCount(getHALogDirC(), commitCounter1 + 1); + awaitLogCount(getHALogDirA(), commitCounter1 + 1); + awaitLogCount(getHALogDirB(), commitCounter1 + 1); + awaitLogCount(getHALogDirC(), commitCounter1 + 1); /* * Shutdown C. @@ -209,9 +209,9 @@ awaitCommitCounter(commitCounter2, new HAGlue[] { serverA, serverB }); // Verify the expected #of HALogs on each service. - assertLogCount(getHALogDirA(), commitCounter2 + 1); - assertLogCount(getHALogDirB(), commitCounter2 + 1); - assertLogCount(getHALogDirC(), commitCounter2); + awaitLogCount(getHALogDirA(), commitCounter2 + 1); + awaitLogCount(getHALogDirB(), commitCounter2 + 1); + awaitLogCount(getHALogDirC(), commitCounter2); // Verify HALog file for next commit point on A is NOT empty. { @@ -264,9 +264,9 @@ * Note: Each service will have an empty HALog for the next commit * point. */ - assertLogCount(getHALogDirA(), commitCounter2+1); - assertLogCount(getHALogDirB(), commitCounter2+1); - assertLogCount(getHALogDirC(), commitCounter2+1); + awaitLogCount(getHALogDirA(), commitCounter2+1); + awaitLogCount(getHALogDirB(), commitCounter2+1); + awaitLogCount(getHALogDirC(), commitCounter2+1); } Modified: branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHAJournalServerOverride.java =================================================================== --- branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHAJournalServerOverride.java 2013-10-24 22:40:05 UTC (rev 7481) +++ branches/ZK_DISCONNECT_HANDLING/bigdata-jini/src/test/com/bigdata/journal/jini/ha/TestHAJournalServerOverride.java 2013-10-25 13:20:15 UTC (rev 7482) @@ -492,10 +492,13 @@ * might have a consensus around the new commit point.) * * TODO Consider leader failure scenarios in this test suite, not just - * scenarios where B fails. Probably we should also cover failures of C (the - * 2nd follower). We should also cover scenariors where the quorum is barely - * met and a single failure causes a rejected commit (local decision) or - * 2-phase abort (joined services in joint agreement). + * scenarios where B fails. We MUST also cover failures of C (the 2nd + * follower). We should also cover scenariors where the quorum is barely met + * and a single failure causes a rejected commit (local decision) or 2-phase + * abort (joined services in joint agreement). + * + * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/760" > + * Review commit2Phase semantics when a follower fails </a> */ public void testStartABC_commit2Phase_B_fails() throws Exception { @@ -517,61 +520,92 @@ new Class[] { IHA2PhaseCommitMessage.class }, 0/* nwait */, 1/* nfail */); - // Simple transaction. - simpleTransaction(); + /** + * FIXME We need to resolve the correct behavior when B fails the commit + * after having prepared. Two code paths are outlined below. The + * implementation currently does an abort2Phase() when the + * commit2Phase() observe an error for B. That causes the commit point + * to NOT advance. + * + * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/760" > + * Review commit2Phase semantics when a follower fails </a> + */ -// try { -// // Simple transaction. -// simpleTransaction(); -// fail("Expecting failed transaction"); -// } catch(HttpException ex) { -// if (!ex.getMessage().contains( -// SpuriousTestException.class.getName())) { -// /* -// * Wrong inner cause. -// * -// * Note: The stack trace of the local exception does not include -// * the remote stack trace. The cause is formatted into the HTTP -// * response body. -// */ -// fail("Expecting " + ClocksNotSynchronizedException.class, t); -// } -// } - - // Verify quorum is unchanged. - assertEquals(token, quorum.token()); - - // Should be two commit points on {A,C]. - awaitCommitCounter(2L, startup.serverA, startup.serverC); - - /* - * B should go into an ERROR state and then into SeekConsensus and from - * there to RESYNC and finally back to RunMet. We can not reliably - * observe the intervening states. So what we really need to do is watch - * for B to move to the end of the pipeline and catch up to the same - * commit point. - */ + if(true) { - /* - * The pipeline should be reordered. B will do a service leave, then - * enter seek consensus, and then re-enter the pipeline. - */ - awaitPipeline(new HAGlue[] { startup.serverA, startup.serverC, - startup.serverB }); + // Simple transaction. + simpleTransaction(); - /* - * There should be two commit points on {A,C,B} (note that this assert - * does not pay attention to the pipeline order). - */ - awaitCommitCounter(2L, startup.serverA, startup.serverC, - startup.serverB); + // Verify quorum is unchanged. + assertEquals(token, quorum.token()); - // B should be a follower again. - awaitHAStatus(startup.serverB, HAStatusEnum.Follower); + // Should be two commit points on {A,C]. + awaitCommitCounter(2L, startup.serverA, startup.serverC); - // quorum token is unchanged. - assertEquals(token, quorum.token()); + /* + * B should go into an ERROR state and then into SeekConsensus and + * from there to RESYNC and finally back to RunMet. We can not + * reliably observe the intervening states. So what we really need + * to do is watch for B to move to the end of the pipeline and catch + * up to the same commit point. + */ + /* + * The pipeline should be reordered. B will do a service leave, then + * enter seek consensus, and then re-enter the pipeline. + */ + awaitPipeline(new HAGlue[] { startup.serverA, startup.serverC, + startup.serverB }); + + /* + * There should be two commit points on {A,C,B} (note that this + * assert does not pay attention to the pipeline order). + */ + awaitCommitCounter(2L, startup.serverA, startup.serverC, + startup.serverB); + + // B should be a follower again. + awaitHAStatus(startup.serverB, HAStatusEnum.Follower); + + // quorum token is unchanged. + assertEquals(token, quorum.token()); + + } else { + + try { + + // Simple transaction. + simpleTransaction(); + + fail("Expecting failed transaction"); + + } catch (Exception t) { + + if (!t.getMessage().contains( + SpuriousTestException.class.getName())) { + /* + * Wrong inner cause. + * + * Note: The stack trace of the local exception does not + * include the remote stack trace. The cause is formatted + * into the HTTP response body. + */ + fail("Expecting " + SpuriousTestException.class, t); + } + + } + + // Verify quorum is unchanged. + assertEquals(token, quorum.token()); + + // Should be ONE commit point on {A,B, C]. + awaitCommitCounter(1L, startup.serverA, startup.serverB, + startup.serverC); + + fail("finish test under these assumptions"); + + } + } /** This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |