|
From: <jer...@us...> - 2014-05-09 19:07:05
|
Revision: 8255
http://sourceforge.net/p/bigdata/code/8255
Author: jeremy_carroll
Date: 2014-05-09 19:07:02 +0000 (Fri, 09 May 2014)
Log Message:
-----------
minor polishing, a few more tests
Modified Paths:
--------------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 18:10:14 UTC (rev 8254)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 19:07:02 UTC (rev 8255)
@@ -144,11 +144,10 @@
*
* - the subword boundaries are identified in {@link #next()}
* We then set up {@link #found} to contain the most
- * recently found subword, with afterDiscard containing
- * the same word as found with the {@link #discard} pattern
- * applied. {@link #afterDiscard} is not equal to found; if there
- * is nothing to discard then it is null.
+ * recently found subword.
*
+ * - the soft hyphen discarding is processed in {@link #maybeDiscardHyphens()}
+ *
* - if we are not {@link #alwaysDiscard}ing then {@link #afterDiscard}
* can be set to null to return the non-discarded version on the next cycle.
*
@@ -216,14 +215,14 @@
afterDiscard = null;
if (charPos + 1 < currentWord.length && softMatcher.find(charPos+1)) {
charPos = softMatcher.end();
- considerMatch();
+ maybeDiscardHyphens();
return true;
} else {
return nextWord();
}
}
- void considerMatch() {
+ void maybeDiscardHyphens() {
found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos);
Matcher discarding = discard.matcher(found);
if (discarding.find()) {
@@ -240,7 +239,7 @@
termAtt.resizeTermBuffer(currentWord.length);
charPos = 0;
softMatcher = subWordBoundary.matcher(words[currentWordIx]);
- considerMatch();
+ maybeDiscardHyphens();
return true;
}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 18:10:14 UTC (rev 8254)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 19:07:02 UTC (rev 8255)
@@ -102,13 +102,13 @@
return getNdx().getAnalyzer(lang, filterStopWords);
}
- protected void comparisonTest(String lang, boolean stopWordsSignificant, String text, String spaceSeparated)
+ protected void comparisonTest(String lang, boolean filterStopWords, String text, String spaceSeparated)
throws IOException {
if (spaceSeparated == null) {
- String rslt = getTokenStream(getAnalyzer(lang, stopWordsSignificant), text);
+ String rslt = getTokenStream(getAnalyzer(lang, filterStopWords), text);
throw new RuntimeException("Got \"" + rslt+ "\"");
}
- compareTokenStream(getAnalyzer(lang, stopWordsSignificant), text,
+ compareTokenStream(getAnalyzer(lang, filterStopWords), text,
split(spaceSeparated)); //$NON-NLS-1$
}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 18:10:14 UTC (rev 8254)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 19:07:02 UTC (rev 8255)
@@ -35,6 +35,16 @@
import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions;
+/**
+ * Unit tests for {@link ConfigurableAnalyzerFactory}.
+ * We use the same setup, as defined in {@link #getExtraProperties()},
+ * for all the tests. Some of the tests check whether bad combinations
+ * of options are detected and reported correctly.
+ * Others check that some input, in a particular language, is
+ * tokenized as expected.
+ * @author jeremycarroll
+ *
+ */
public class TestConfigurableAnalyzerFactory extends AbstractSearchTest {
public TestConfigurableAnalyzerFactory() {
@@ -68,8 +78,8 @@
analyzer+"x-hyphen2."+AnalyzerOptions.WORD_BOUNDARY, " ",
analyzer+"x-hyphen2."+AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS, "true",
analyzer+"x-keywords."+AnalyzerOptions.ANALYZER_CLASS, KeywordAnalyzer.class.getName(),
- analyzer+"ru-x-de."+AnalyzerOptions.ANALYZER_CLASS, RussianAnalyzer.class.getName(),
- analyzer+"ru-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(),
+ analyzer+"en-x-de."+AnalyzerOptions.ANALYZER_CLASS, StandardAnalyzer.class.getName(),
+ analyzer+"en-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(),
};
}
@@ -142,6 +152,25 @@
);
}
+
+ public void testStopWordSwitch() throws IOException {
+ // en-x-de is an English Analyzer using German stopwords!
+ comparisonTest("en-x-de",
+ true,
+ "The fast car arrived slowly.",
+ "the fast car arrived slowly"
+ );
+ comparisonTest("en-x-de",
+ true,
+ "The fast car die arrived slowly.",
+ "the fast car arrived slowly"
+ );
+ comparisonTest("en-x-de",
+ false,
+ "The fast car die arrived slowly.",
+ "the fast car die arrived slowly"
+ );
+ }
public void testSyapseExample1() throws IOException {
comparisonTest("x-splits",
true,
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|