[Bigdata-commit] SF.net SVN: bigdata:[8255] branches/TEXT_ANALYZERS/bigdata/src

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 8255
          http://sourceforge.net/p/bigdata/code/8255
Author:   jeremy_carroll
Date:     2014-05-09 19:07:02 +0000 (Fri, 09 May 2014)
Log Message:
-----------
minor polishing, a few more tests

Modified Paths:
--------------
    branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
    branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
    branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java

Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
===================================================================

--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java	2014-05-09 18:10:14 UTC (rev 8254)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java	2014-05-09 19:07:02 UTC (rev 8255)
@@ -144,11 +144,10 @@
 	 *   
 	 * - the subword boundaries are identified in {@link #next()}
 	 *   We then set up {@link #found} to contain the most
-	 *   recently found subword, with afterDiscard containing
-	 *   the same word as found with the {@link #discard} pattern
-	 *   applied. {@link #afterDiscard} is not equal to found; if there
-	 *   is nothing to discard then it is null.
+	 *   recently found subword.
 	 *   
+	 * - soft-hyphen discarding is handled in {@link #maybeDiscardHyphens()}
+	 *   
 	 *   - if we are not {@link #alwaysDiscard}ing then {@link #afterDiscard}
 	 *   can be set to null to return the non-discarded version on the next cycle.
 	 *   
@@ -216,14 +215,14 @@
 			afterDiscard = null;
 			if (charPos + 1 < currentWord.length && softMatcher.find(charPos+1)) {
 				charPos = softMatcher.end();
-				considerMatch();
+				maybeDiscardHyphens();
 				return true;
 			} else {
 				return nextWord();
 			}
 		}
 
-		void considerMatch() {
+		void maybeDiscardHyphens() {
 			found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos);
 			Matcher discarding = discard.matcher(found);
 			if (discarding.find()) {
@@ -240,7 +239,7 @@
 			termAtt.resizeTermBuffer(currentWord.length);
 			charPos = 0;
 			softMatcher = subWordBoundary.matcher(words[currentWordIx]);
-			considerMatch();
+			maybeDiscardHyphens();
 			return true;
 		}
 

Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java	2014-05-09 18:10:14 UTC (rev 8254)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java	2014-05-09 19:07:02 UTC (rev 8255)
@@ -102,13 +102,13 @@
 		return getNdx().getAnalyzer(lang, filterStopWords);
 	}
 
-	protected void comparisonTest(String lang, boolean stopWordsSignificant, String text, String spaceSeparated)
+	protected void comparisonTest(String lang, boolean filterStopWords, String text, String spaceSeparated)
 			throws IOException {
 		if (spaceSeparated == null) {
-			String rslt = getTokenStream(getAnalyzer(lang, stopWordsSignificant), text);
+			String rslt = getTokenStream(getAnalyzer(lang, filterStopWords), text);
 			throw new RuntimeException("Got \"" + rslt+ "\"");
 		}
-			compareTokenStream(getAnalyzer(lang, stopWordsSignificant), text,
+			compareTokenStream(getAnalyzer(lang, filterStopWords), text,
 					split(spaceSeparated)); //$NON-NLS-1$
 			}
 

Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java	2014-05-09 18:10:14 UTC (rev 8254)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java	2014-05-09 19:07:02 UTC (rev 8255)
@@ -35,6 +35,16 @@
 
 import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions;
 
+/**
+ * Unit tests for {@link ConfigurableAnalyzerFactory}.
+ * We use the same setup, as defined in {@link #getExtraProperties()},
+ * for all the tests. Some of the tests check whether bad combinations
+ * of options are detected and reported correctly.
+ * Others check that some input in a particular language is
+ * tokenized as expected.
+ * @author jeremycarroll
+ *
+ */
 public class TestConfigurableAnalyzerFactory extends AbstractSearchTest {
 
 	public TestConfigurableAnalyzerFactory() {
@@ -68,8 +78,8 @@
 		analyzer+"x-hyphen2."+AnalyzerOptions.WORD_BOUNDARY, " ",
 		analyzer+"x-hyphen2."+AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS, "true",
 		analyzer+"x-keywords."+AnalyzerOptions.ANALYZER_CLASS, KeywordAnalyzer.class.getName(),
-		analyzer+"ru-x-de."+AnalyzerOptions.ANALYZER_CLASS, RussianAnalyzer.class.getName(),
-		analyzer+"ru-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(),
+		analyzer+"en-x-de."+AnalyzerOptions.ANALYZER_CLASS, StandardAnalyzer.class.getName(),
+		analyzer+"en-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(),
 		};
 	}
 	
@@ -142,6 +152,25 @@
     			);
     	
     }
+    
+    public void testStopWordSwitch() throws IOException {
+    	// en-x-de is an English analyzer using German stopwords!
+    	comparisonTest("en-x-de",
+    			true,
+    			"The fast car arrived slowly.",
+    			"the fast car arrived slowly"
+    			);
+    	comparisonTest("en-x-de",
+    			true,
+    			"The fast car die arrived slowly.",
+    			"the fast car arrived slowly"
+    			);
+    	comparisonTest("en-x-de",
+    			false,
+    			"The fast car die arrived slowly.",
+    			"the fast car die arrived slowly"
+    			);
+    }
     public void testSyapseExample1() throws IOException {
     	comparisonTest("x-splits", 
     			true,

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





[Bigdata-commit] SF.net SVN: bigdata:[8255] branches/TEXT_ANALYZERS/bigdata/src

Fast, scalable, robust graph database platform

[Bigdata-commit] SF.net SVN: bigdata:[8255] branches/TEXT_ANALYZERS/bigdata/src