From: <jer...@us...> - 2014-05-09 19:07:05
|
Revision: 8255 http://sourceforge.net/p/bigdata/code/8255 Author: jeremy_carroll Date: 2014-05-09 19:07:02 +0000 (Fri, 09 May 2014) Log Message: ----------- minor polishing, a few more tests Modified Paths: -------------- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 18:10:14 UTC (rev 8254) +++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 19:07:02 UTC (rev 8255) @@ -144,11 +144,10 @@ * * - the subword boundaries are identified in {@link #next()} * We then set up {@link #found} to contain the most - * recently found subword, with afterDiscard containing - * the same word as found with the {@link #discard} pattern - * applied. {@link #afterDiscard} is not equal to found; if there - * is nothing to discard then it is null. + * recently found subword. * + * - the soft hyphen discarding is processed in {@link #maybeDiscardHyphens()} + * * - if we are not {@link #alwaysDiscard}ing then {@link #afterDiscard} * can be set to null to return the non-discarded version on the next cycle. * @@ -216,14 +215,14 @@ afterDiscard = null; if (charPos + 1 < currentWord.length && softMatcher.find(charPos+1)) { charPos = softMatcher.end(); - considerMatch(); + maybeDiscardHyphens(); return true; } else { return nextWord(); } } - void considerMatch() { + void maybeDiscardHyphens() { found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos); Matcher discarding = discard.matcher(found); if (discarding.find()) { @@ -240,7 +239,7 @@ termAtt.resizeTermBuffer(currentWord.length); charPos = 0; softMatcher = subWordBoundary.matcher(words[currentWordIx]); - considerMatch(); + maybeDiscardHyphens(); return true; } Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 18:10:14 UTC (rev 8254) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 19:07:02 UTC (rev 8255) @@ -102,13 +102,13 @@ return getNdx().getAnalyzer(lang, filterStopWords); } - protected void comparisonTest(String lang, boolean stopWordsSignificant, String text, String spaceSeparated) + protected void comparisonTest(String lang, boolean filterStopWords, String text, String spaceSeparated) throws IOException { if (spaceSeparated == null) { - String rslt = getTokenStream(getAnalyzer(lang, stopWordsSignificant), text); + String rslt = getTokenStream(getAnalyzer(lang, filterStopWords), text); throw new RuntimeException("Got \"" + rslt+ "\""); } - compareTokenStream(getAnalyzer(lang, stopWordsSignificant), text, + compareTokenStream(getAnalyzer(lang, filterStopWords), text, split(spaceSeparated)); //$NON-NLS-1$ } Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 18:10:14 UTC (rev 8254) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 19:07:02 UTC (rev 8255) @@ -35,6 +35,16 @@ import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions; +/** + * Unit tests for {@link ConfigurableAnalyzerFactory}. + * We use the same setup, as defined in {@link #getExtraProperties()} + * for all the tests. Some of the tests check whether bad combinations + * of options are detected and reported correctly. + * Others check that some input, in a particular language is + * tokenized as expected. + * @author jeremycarroll + * + */ public class TestConfigurableAnalyzerFactory extends AbstractSearchTest { public TestConfigurableAnalyzerFactory() { @@ -68,8 +78,8 @@ analyzer+"x-hyphen2."+AnalyzerOptions.WORD_BOUNDARY, " ", analyzer+"x-hyphen2."+AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS, "true", analyzer+"x-keywords."+AnalyzerOptions.ANALYZER_CLASS, KeywordAnalyzer.class.getName(), - analyzer+"ru-x-de."+AnalyzerOptions.ANALYZER_CLASS, RussianAnalyzer.class.getName(), - analyzer+"ru-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(), + analyzer+"en-x-de."+AnalyzerOptions.ANALYZER_CLASS, StandardAnalyzer.class.getName(), + analyzer+"en-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(), }; } @@ -142,6 +152,25 @@ ); } + + public void testStopWordSwitch() throws IOException { + // en-x-de is an English Analyzer using german stopwords! + comparisonTest("en-x-de", + true, + "The fast car arrived slowly.", + "the fast car arrived slowly" + ); + comparisonTest("en-x-de", + true, + "The fast car die arrived slowly.", + "the fast car arrived slowly" + ); + comparisonTest("en-x-de", + false, + "The fast car die arrived slowly.", + "the fast car die arrived slowly" + ); + } public void testSyapseExample1() throws IOException { comparisonTest("x-splits", true, This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |