|
From: <jer...@us...> - 2014-05-09 17:42:49
|
Revision: 8248
http://sourceforge.net/p/bigdata/code/8248
Author: jeremy_carroll
Date: 2014-05-09 17:42:44 +0000 (Fri, 09 May 2014)
Log Message:
-----------
First version of TermCompletionAnalyzer, and also tests now passing
Modified Paths:
--------------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java
Added Paths:
-----------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 17:07:05 UTC (rev 8247)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 17:42:44 UTC (rev 8248)
@@ -331,7 +331,7 @@
* (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled).
* It is an error if a different analyzer class is specified.
*/
- String PATTERN = ".pattern";
+ String PATTERN = "pattern";
}
@@ -474,7 +474,7 @@
*/
public Set<?> getStopWords() {
- if (AnalyzerOptions.STOPWORDS_VALUE_NONE.equals(stopwords))
+ if (doNotUseStopWords())
return Collections.EMPTY_SET;
if (useDefaultStopWords()) {
@@ -484,6 +484,10 @@
return getStopWordsForClass(stopwords);
}
+ boolean doNotUseStopWords() {
+ return AnalyzerOptions.STOPWORDS_VALUE_NONE.equals(stopwords) || (stopwords == null && pattern != null);
+ }
+
protected Set<?> getStopWordsForClass(String clazzName) {
Class<? extends Analyzer> analyzerClass = getAnalyzerClass(clazzName);
try {
@@ -500,7 +504,7 @@
}
protected boolean useDefaultStopWords() {
- return stopwords == null || AnalyzerOptions.STOPWORDS_VALUE_DEFAULT.equals(stopwords);
+ return ( stopwords == null && pattern == null ) || AnalyzerOptions.STOPWORDS_VALUE_DEFAULT.equals(stopwords);
}
public boolean setProperty(String shortProperty, String value) {
@@ -550,8 +554,13 @@
if (hasConstructor(cls, Version.class, Set.class)) {
// RussianAnalyzer is missing any way to access stop words.
- if (RussianAnalyzer.class.equals(cls) && useDefaultStopWords()) {
- return new AnalyzerPair(languageRange, new RussianAnalyzer(Version.LUCENE_CURRENT), new RussianAnalyzer(Version.LUCENE_CURRENT, Collections.EMPTY_SET));
+ if (RussianAnalyzer.class.equals(cls)) {
+ if (useDefaultStopWords()) {
+ return new AnalyzerPair(languageRange, new RussianAnalyzer(Version.LUCENE_CURRENT), new RussianAnalyzer(Version.LUCENE_CURRENT, Collections.EMPTY_SET));
+ }
+ if (doNotUseStopWords()) {
+ return new AnalyzerPair(languageRange, new RussianAnalyzer(Version.LUCENE_CURRENT, Collections.EMPTY_SET));
+ }
}
return new VersionSetAnalyzerPair(this, cls);
}
@@ -719,7 +728,7 @@
String prop = (String)en.nextElement();
if (prop.equals(Options.INCLUDE_DEFAULTS)) continue;
if (prop.startsWith(Options.ANALYZER)) {
- String languageRangeAndProperty[] = prop.substring(Options.ANALYZER.length()).split("[.]");
+ String languageRangeAndProperty[] = prop.substring(Options.ANALYZER.length()).replaceAll("_","*").split("[.]");
if (languageRangeAndProperty.length == 2) {
String languageRange = languageRangeAndProperty[0].toLowerCase(Locale.US); // Turkish "I" could create a problem
Added: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java (rev 0)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:42:44 UTC (rev 8248)
@@ -0,0 +1,88 @@
+package com.bigdata.search;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.nio.CharBuffer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Attribute;
+
+
+public class TermCompletionAnalyzer extends Analyzer {
+
+ Pattern hard = Pattern.compile(" ", Pattern.UNICODE_CHARACTER_CLASS);
+ Pattern soft = Pattern.compile("(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})", Pattern.UNICODE_CHARACTER_CLASS);
+
+ public TermCompletionAnalyzer() {
+ // TODO Auto-generated constructor stub
+ }
+
+ private class TermCompletionTokenStream extends TokenStream {
+
+ final int length;
+ final String[] words;
+ char currentWord[] = new char[]{};
+ Matcher softMatcher;
+ int currentWordIx = -1;
+ int charPos = 0;
+ final TermAttribute termAtt;
+ public TermCompletionTokenStream(StringReader reader) {
+ termAtt = addAttribute(TermAttribute.class);
+ try {
+ reader.mark(Integer.MAX_VALUE);
+ length = (int) reader.skip(Integer.MAX_VALUE);
+ reader.reset();
+ char fileContent[] = new char[length];
+ reader.read(fileContent);
+ words = hard.split(new String(fileContent));
+ } catch (IOException e) {
+ throw new RuntimeException("Impossible",e);
+ }
+ }
+ @Override
+ public boolean incrementToken() throws IOException {
+ if ( next() ) {
+ int lg = currentWord.length - charPos;
+ System.arraycopy(currentWord, charPos, termAtt.termBuffer(), 0, lg );
+ termAtt.setTermLength(lg);
+ return true;
+ } else {
+ return false;
+ }
+ }
+ private boolean next() {
+ if (currentWordIx >= words.length) {
+ return false;
+ }
+ if (charPos +1 < currentWord.length && softMatcher.find(charPos+1)) {
+ charPos = softMatcher.end();
+ return true;
+ } else {
+ return nextWord();
+ }
+ }
+ private boolean nextWord() {
+ currentWordIx++;
+ if (currentWordIx >= words.length) {
+ return false;
+ }
+ currentWord = words[currentWordIx].toCharArray();
+ termAtt.resizeTermBuffer(currentWord.length);
+ charPos = 0;
+ softMatcher = soft.matcher(words[currentWordIx]);
+ return true;
+ }
+
+ }
+
+
+ @Override
+ public TokenStream tokenStream(String ignoredFieldName, Reader reader) {
+ return new TermCompletionTokenStream((StringReader)reader);
+ }
+}
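A minimal usage sketch (not part of this commit) of how the tokens emitted by this first version can be inspected. It uses the same Lucene 3.x TokenStream/TermAttribute API as the test helpers in this branch; the class name TermCompletionDemo is hypothetical, and the expected tokens are taken from testSyapseExample1 below.

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    import com.bigdata.search.TermCompletionAnalyzer;

    public class TermCompletionDemo {
        public static void main(String[] args) throws IOException {
            // The r8248 constructor takes no arguments; the word and sub-word patterns are hard-coded.
            TermCompletionAnalyzer a = new TermCompletionAnalyzer();
            TokenStream ts = a.tokenStream(null, new StringReader("[ERBB2, INS/DUP"));
            TermAttribute term = ts.getAttribute(TermAttribute.class);
            while (ts.incrementToken()) {
                // Prints "[ERBB2," "ERBB2," "INS/DUP" "DUP": each word, then each suffix
                // starting after a sub-word boundary.
                System.out.println(term.term());
            }
        }
    }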
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 17:07:05 UTC (rev 8247)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 17:42:44 UTC (rev 8248)
@@ -27,11 +27,7 @@
package com.bigdata.search;
import java.io.IOException;
-import java.io.StringReader;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
public abstract class AbstractAnalyzerFactoryTest extends AbstractSearchTest {
@@ -42,37 +38,16 @@
super(arg0);
}
+ @Override
public void setUp() throws Exception {
super.setUp();
- init(getExtraProperties());
+ init(getExtraProperties());
}
+
+
abstract String[] getExtraProperties();
- private Analyzer getAnalyzer(String lang, boolean filterStopWords) {
- return getNdx().getAnalyzer(lang, filterStopWords);
- }
-
- private void comparisonTest(String lang,
- boolean stopWordsSignificant,
- String text,
- String spaceSeparated) throws IOException {
- compareTokenStream(getAnalyzer(lang, stopWordsSignificant), text,
- spaceSeparated.split(" ")); //$NON-NLS-1$
- }
- private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException {
- TokenStream s = a.tokenStream(null, new StringReader(text));
- int ix = 0;
- while (s.incrementToken()) {
- final TermAttribute term = s.getAttribute(TermAttribute.class);
- final String word = term.term();
- assertTrue(ix < expected.length);
- assertEquals(word, expected[ix++]);
- }
- assertEquals(ix, expected.length);
- }
-
-
- public void testEnglishFilterStopWords() throws IOException {
+ public void testEnglishFilterStopWords() throws IOException {
for (String lang: new String[]{ "eng", null, "" }) { //$NON-NLS-1$ //$NON-NLS-2$
comparisonTest(lang,
true,
@@ -159,14 +134,20 @@
}
private void checkConfig(String classname, String ...langs) {
+ checkConfig(isBroken(), classname, langs);
+
+ }
+ protected void checkConfig(boolean threeLetterOnly, String classname, String ...langs) {
for (String lang:langs) {
// The DefaultAnalyzerFactory only works for language tags of length exactly three.
-// if (lang != null && lang.length()==3)
+ if ((!threeLetterOnly) || (lang != null && lang.length()==3))
{
assertEquals(classname, getAnalyzer(lang,true).getClass().getSimpleName());
- assertEquals(classname, getAnalyzer(lang+NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.0"),true).getClass().getSimpleName()); //$NON-NLS-1$
+ if (!threeLetterOnly) assertEquals(classname, getAnalyzer(lang+"-x-foobar",true).getClass().getSimpleName()); //$NON-NLS-1$
}
}
}
+
+ abstract boolean isBroken() ;
}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:07:05 UTC (rev 8247)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:42:44 UTC (rev 8248)
@@ -26,8 +26,14 @@
*/
package com.bigdata.search;
+import java.io.IOException;
+import java.io.StringReader;
import java.util.Properties;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
import com.bigdata.journal.IIndexManager;
import com.bigdata.journal.ITx;
import com.bigdata.journal.ProxyTestCase;
@@ -62,7 +68,7 @@
}
FullTextIndex<Long> createFullTextIndex(String namespace, String ...propertyValuePairs) {
- return createFullTextIndex(namespace, getProperties(), propertyValuePairs);
+ return createFullTextIndex(namespace, (Properties)getProperties().clone(), propertyValuePairs);
}
public void tearDown() throws Exception {
@@ -92,4 +98,51 @@
return properties;
}
+ protected Analyzer getAnalyzer(String lang, boolean filterStopWords) {
+ return getNdx().getAnalyzer(lang, filterStopWords);
+ }
+
+ protected void comparisonTest(String lang, boolean stopWordsSignificant, String text, String spaceSeparated)
+ throws IOException {
+ if (spaceSeparated == null) {
+ String rslt = getTokenStream(getAnalyzer(lang, stopWordsSignificant), text);
+ throw new RuntimeException("Got \"" + rslt+ "\"");
+ }
+ compareTokenStream(getAnalyzer(lang, stopWordsSignificant), text,
+ split(spaceSeparated)); //$NON-NLS-1$
+ }
+
+ private String[] split(String spaceSeparated) {
+ if (spaceSeparated.length()==0) {
+ return new String[0];
+ }
+ return spaceSeparated.split(" ");
+ }
+
+ protected String getTokenStream(Analyzer a, String text) throws IOException {
+ StringBuffer sb = new StringBuffer();
+ TokenStream s = a.tokenStream(null, new StringReader(text));
+ int ix = 0;
+ while (s.incrementToken()) {
+ final TermAttribute term = s.getAttribute(TermAttribute.class);
+ if (sb.length()!=0) {
+ sb.append(" ");
+ }
+ sb.append(term.term());
+ }
+ return sb.toString();
+ }
+
+ private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException {
+ TokenStream s = a.tokenStream(null, new StringReader(text));
+ int ix = 0;
+ while (s.incrementToken()) {
+ final TermAttribute term = s.getAttribute(TermAttribute.class);
+ final String word = term.term();
+ assertTrue(ix < expected.length);
+ assertEquals(word, expected[ix++]);
+ }
+ assertEquals(ix, expected.length);
+ }
+
}
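One detail worth noting in the new comparisonTest() helper: passing null as the expected, space-separated token string makes it throw a RuntimeException whose message contains the analyzer's actual output. A sketch of how a test author might use that while drafting a new case (the test method name is hypothetical; it would live in a subclass of AbstractSearchTest such as TestConfigurableAnalyzerFactory, where the "x-splits" range is configured):

    public void testShowTokensForNewCase() throws IOException {
        // Deliberately pass null so the exception message reports the actual tokens,
        // which can then be pasted in as the expected value.
        comparisonTest("x-splits", true, "2,2,3-trimethylbutane", null);
    }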
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java 2014-05-09 17:07:05 UTC (rev 8247)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java 2014-05-09 17:42:44 UTC (rev 8248)
@@ -114,6 +114,7 @@
// which is intended to be the same as the intended
// behavior of DefaultAnalyzerFactory
suite.addTestSuite(TestConfigurableAsDefaultAnalyzerFactory.class);
+ suite.addTestSuite(TestConfigurableAnalyzerFactory.class);
return suite;
}
Added: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java (rev 0)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:42:44 UTC (rev 8248)
@@ -0,0 +1,195 @@
+/**
+
+Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved.
+
+Contact:
+ SYSTAP, LLC
+ 4501 Tower Road
+ Greensboro, NC 27410
+ lic...@bi...
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+/*
+ * Created on May 7, 2014
+ */
+package com.bigdata.search;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.analysis.cjk.CJKAnalyzer;
+import org.apache.lucene.analysis.de.GermanAnalyzer;
+import org.apache.lucene.analysis.ru.RussianAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.util.Version;
+
+import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions;
+
+public class TestConfigurableAnalyzerFactory extends AbstractSearchTest {
+
+ public TestConfigurableAnalyzerFactory() {
+ }
+
+ public TestConfigurableAnalyzerFactory(String arg0) {
+ super(arg0);
+ }
+
+ public void setUp() throws Exception {
+ super.setUp();
+ init(getExtraProperties());
+ }
+
+ private String[] getExtraProperties() {
+ String analyzer = ConfigurableAnalyzerFactory.Options.ANALYZER;
+ return new String[]{
+ FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(),
+ analyzer+"*."+AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(),
+ analyzer+"x-terms."+AnalyzerOptions.PATTERN, "\\W+",
+ analyzer+"x-splits."+AnalyzerOptions.ANALYZER_CLASS, TermCompletionAnalyzer.class.getName(),
+ analyzer+"x-splits."+AnalyzerOptions.STOPWORDS, AnalyzerOptions.STOPWORDS_VALUE_NONE,
+ analyzer+"x-keywords."+AnalyzerOptions.ANALYZER_CLASS, KeywordAnalyzer.class.getName(),
+ analyzer+"ru-x-de."+AnalyzerOptions.ANALYZER_CLASS, RussianAnalyzer.class.getName(),
+ analyzer+"ru-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(),
+ };
+ }
+
+ private void badCombo(String errorMessage, String ... props) {
+ // Check that some combination of properties on a language create an error
+ String myProps[] = new String[props.length+4];
+ int i=0;
+ for (; i<props.length;i+=2) {
+ myProps[i] = ConfigurableAnalyzerFactory.Options.ANALYZER + "x-testme." + props[i];
+ myProps[i+1] = props[i+1];
+ }
+ myProps[i] = ConfigurableAnalyzerFactory.Options.ANALYZER + "_." + AnalyzerOptions.ANALYZER_CLASS;
+ myProps[i+1] = EmptyAnalyzer.class.getName();
+ myProps[i+2] = FullTextIndex.Options.ANALYZER_FACTORY_CLASS;
+ myProps[i+3] = ConfigurableAnalyzerFactory.class.getName();
+ try {
+ this.createFullTextIndex("test-in-error"+getName(), myProps);
+ }
+ catch (RuntimeException e) {
+ Throwable t = e;
+ while (t.getCause() != null) {
+ t = t.getCause();
+ }
+ assertTrue(t.getMessage(),t.getMessage().contains(errorMessage));
+ return;
+ }
+ fail("No error detected");
+ }
+ public void testBadLike() {
+ badCombo("en-us-x-banana",AnalyzerOptions.LIKE,"en-us-x-banana");
+ }
+ public void testMissingClass() {
+ badCombo("exactly one",AnalyzerOptions.STOPWORDS,AnalyzerOptions.STOPWORDS_VALUE_DEFAULT);
+
+ }
+ public void testLikeAndClass() {
+ badCombo("exactly one",AnalyzerOptions.LIKE,"*", AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName());
+ }
+ public void testLikeAndStopwords() {
+ badCombo("stopwords",AnalyzerOptions.LIKE,"*", AnalyzerOptions.STOPWORDS,AnalyzerOptions.STOPWORDS_VALUE_DEFAULT);
+ }
+ public void testCantAlwaysHaveStopWords() {
+ badCombo("not supported",
+ AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(),
+ AnalyzerOptions.STOPWORDS,StandardAnalyzer.class.getName()
+ );
+
+ }
+ public void testCantAlwaysHaveDefaultStopWords() {
+ badCombo("not supported",
+ AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(),
+ AnalyzerOptions.STOPWORDS,AnalyzerOptions.STOPWORDS_VALUE_DEFAULT
+ );
+
+ }
+ public void testCantFindRussianStopWords() {
+ badCombo("find",
+ AnalyzerOptions.ANALYZER_CLASS, GermanAnalyzer.class.getName(),
+ AnalyzerOptions.STOPWORDS,RussianAnalyzer.class.getName()
+ );
+
+ }
+
+
+ public void testEmptyAnalyzer() throws IOException {
+ comparisonTest("en",
+ false,
+ "The fast car arrived slowly.",
+ ""
+ );
+
+ }
+ public void testSyapseExample1() throws IOException {
+ comparisonTest("x-splits",
+ true,
+ "ADENOCARCINOMA OF LUNG, SOMATIC [ERBB2, INS/DUP, NT2322]",
+ "ADENOCARCINOMA OF LUNG, SOMATIC [ERBB2, ERBB2, INS/DUP, DUP, NT2322]"
+ );
+
+ }
+ public void testSyapseExample2() throws IOException {
+ comparisonTest("x-splits",
+ true,
+ "\u2265\u2265\u22653-11.13-11.1",
+ "\u2265\u2265\u22653-11.13-11.1 3-11.13-11.1 11.13-11.1 13-11.1 11.1 1"
+ );
+
+ }
+ public void testSyapseExample4() throws IOException {
+ comparisonTest("x-splits",
+ true,
+ "\u00b1-ACE3.1.1",
+ "\u00b1-ACE3.1.1 ACE3.1.1 1.1 1"
+ );
+
+ }
+ public void testSyapseExample3() throws IOException {
+ comparisonTest("x-splits",
+ true,
+ "2,2,3-trimethylbutane",
+ "2,2,3-trimethylbutane 2,3-trimethylbutane 3-trimethylbutane trimethylbutane"
+ );
+
+ }
+ public void testSyapseExample5() throws IOException {
+ comparisonTest("x-splits",
+ true,
+ "CD8_alpha-low Langerhans cell",
+ "CD8_alpha-low alpha-low low Langerhans cell"
+ );
+
+ }
+ public void testSyapseExample6() throws IOException {
+ comparisonTest("x-splits",
+ true,
+ "6-Monoacetylmorphine:Mass Content:Point in time:Meconium:Quantitative",
+ "6-Monoacetylmorphine:Mass Monoacetylmorphine:Mass Mass Content:Point Point in time:Meconium:Quantitative Meconium:Quantitative Quantitative"
+ );
+
+ }
+ public void testSyapseExample7() throws IOException {
+ comparisonTest("x-splits",
+ true,
+ "N,N-dimethyl",
+ "N,N-dimethyl N-dimethyl dimethyl"
+ );
+
+ }
+
+}
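A sketch (not part of this commit) of how the short option names used above expand into full property keys. It assumes Options.ANALYZER is the "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer." prefix used by the factory's built-in defaults, which appears verbatim in a later revision in this thread:

    String[] exampleConfig = new String[] {
        // Wildcard range: "_" is accepted as a stand-in for "*" because the factory
        // applies replaceAll("_","*") to the key before splitting on ".".
        "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer._.analyzerClass",
        "org.apache.lucene.analysis.standard.StandardAnalyzer",
        // Supplying a pattern implies the PatternAnalyzer, so no analyzerClass is needed.
        "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.x-terms.pattern",
        "\\W+",
    };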
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java 2014-05-09 17:07:05 UTC (rev 8247)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java 2014-05-09 17:42:44 UTC (rev 8248)
@@ -40,4 +40,9 @@
return new String[]{FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName()};
}
+ @Override
+ boolean isBroken() {
+ return false;
+ }
+
}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java 2014-05-09 17:07:05 UTC (rev 8247)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java 2014-05-09 17:42:44 UTC (rev 8248)
@@ -40,4 +40,27 @@
return new String[0];
}
+ /**
+ * The DefaultAnalyzerFactory has bizarre behavior concerning
+ * language-specific settings.
+ * The three-letter ISO 639-2 language tags for the languages
+ * for which Lucene has Analyzers use those Analyzers; whereas the two-letter ISO 639-1
+ * language tags, which are the ones recommended by the IETF and the W3C,
+ * all use the StandardAnalyzer (English). Also, a language tag with a subtag
+ * uses the StandardAnalyzer, even if its primary tag is a recognized three-letter ISO code.
+ */
+ @Override
+ boolean isBroken() {
+ return true;
+ }
+
+ /**
+ * Given legacy concerns, we should preserve the incorrect behavior!
+ */
+ public void testIsBroken() {
+ checkConfig(false, "StandardAnalyzer",
+ "en", "eng", "", null, "ru",
+ "pt", "zh", "por-br", "cs", "dut-za", "nl", "de", "gre-at", "el", "th");
+ }
+
}
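An illustration (not in the commit) of the quirk documented above, written against the getAnalyzer() helper that now lives in AbstractSearchTest: with DefaultAnalyzerFactory only the three-letter form of a language tag selects the language-specific analyzer.

    assertEquals("RussianAnalyzer", getAnalyzer("rus", true).getClass().getSimpleName());
    assertEquals("StandardAnalyzer", getAnalyzer("ru", true).getClass().getSimpleName());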
|
|
From: <jer...@us...> - 2014-05-09 17:43:00
|
Revision: 8249
http://sourceforge.net/p/bigdata/code/8249
Author: jeremy_carroll
Date: 2014-05-09 17:42:56 +0000 (Fri, 09 May 2014)
Log Message:
-----------
copyright and tidying up
Modified Paths:
--------------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/NonEnglishExamples.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:42:44 UTC (rev 8248)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:42:56 UTC (rev 8249)
@@ -1,3 +1,29 @@
+/**
+
+Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved.
+
+Contact:
+ SYSTAP, LLC
+ 4501 Tower Road
+ Greensboro, NC 27410
+ lic...@bi...
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+/*
+ * Created on May 8, 2014 by Jeremy J. Carroll, Syapse Inc.
+ */
package com.bigdata.search;
import java.io.IOException;
@@ -3,5 +29,4 @@
import java.io.Reader;
import java.io.StringReader;
-import java.nio.CharBuffer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -10,7 +35,6 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.util.Attribute;
public class TermCompletionAnalyzer extends Analyzer {
@@ -19,7 +43,6 @@
Pattern soft = Pattern.compile("(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})", Pattern.UNICODE_CHARACTER_CLASS);
public TermCompletionAnalyzer() {
- // TODO Auto-generated constructor stub
}
private class TermCompletionTokenStream extends TokenStream {
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:42:44 UTC (rev 8248)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:42:56 UTC (rev 8249)
@@ -122,7 +122,6 @@
protected String getTokenStream(Analyzer a, String text) throws IOException {
StringBuffer sb = new StringBuffer();
TokenStream s = a.tokenStream(null, new StringReader(text));
- int ix = 0;
while (s.incrementToken()) {
final TermAttribute term = s.getAttribute(TermAttribute.class);
if (sb.length()!=0) {
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/NonEnglishExamples.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/NonEnglishExamples.java 2014-05-09 17:42:44 UTC (rev 8248)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/NonEnglishExamples.java 2014-05-09 17:42:56 UTC (rev 8249)
@@ -1,3 +1,29 @@
+/**
+
+Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved.
+
+Contact:
+ SYSTAP, LLC
+ 4501 Tower Road
+ Greensboro, NC 27410
+ lic...@bi...
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+/*
+ * Created on May 7, 2014 by Jeremy J. Carroll, Syapse Inc.
+ */
package com.bigdata.search;
import java.util.MissingResourceException;
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:42:44 UTC (rev 8248)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:42:56 UTC (rev 8249)
@@ -28,13 +28,10 @@
import java.io.IOException;
-import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
-import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.util.Version;
import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions;
|
|
From: <jer...@us...> - 2014-05-09 17:43:20
|
Revision: 8251
http://sourceforge.net/p/bigdata/code/8251
Author: jeremy_carroll
Date: 2014-05-09 17:43:16 +0000 (Fri, 09 May 2014)
Log Message:
-----------
Got tests working again, and cleaned up somewhat
Modified Paths:
--------------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 17:43:05 UTC (rev 8250)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 17:43:16 UTC (rev 8251)
@@ -326,12 +326,48 @@
String STOPWORDS_VALUE_NONE = "none";
/**
- * If this property is present then the analyzer being used is a
- * {@link PatternAnalyzer} and the value is the pattern to use.
+ * The value of the pattern parameter to
+ * {@link PatternAnalyzer#PatternAnalyzer(Version, Pattern, boolean, Set)}
* (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled).
* It is an error if a different analyzer class is specified.
*/
String PATTERN = "pattern";
+ /**
+ * The value of the wordBoundary parameter to
+ * {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)}
+ * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled).
+ * It is an error if a different analyzer class is specified.
+ */
+ String WORD_BOUNDARY = "wordBoundary";
+ /**
+ * The value of the subWordBoundary parameter to
+ * {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)}
+ * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled).
+ * It is an error if a different analyzer class is specified.
+ */
+ String SUB_WORD_BOUNDARY = "subWordBoundary";
+ /**
+ * The value of the softHyphens parameter to
+ * {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)}
+ * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled).
+ * It is an error if a different analyzer class is specified.
+ */
+ String SOFT_HYPHENS = "softHypens";
+ /**
+ * The value of the alwaysRemoveSoftHypens parameter to
+ * {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)}
+ * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled).
+ * It is an error if a different analyzer class is specified.
+ */
+ String ALWAYS_REMOVE_SOFT_HYPHENS = "alwaysRemoveSoftHypens";
+
+ boolean DEFAULT_ALWAYS_REMOVE_SOFT_HYPHENS = false;
+
+ /**
+ * The default sub-word boundary is a pattern that never matches,
+ * i.e. there are no sub-word boundaries.
+ */
+ Pattern DEFAULT_SUB_WORD_BOUNDARY = Pattern.compile("(?!)");
}
@@ -382,16 +418,7 @@
this.withoutStopWords = copyMe.withoutStopWords;
}
-
- public Analyzer getAnalyzer(boolean filterStopwords) {
- return filterStopwords ? withStopWords : withoutStopWords;
- }
- @Override
- public String toString() {
- return range.full + "=(" + withStopWords.getClass().getSimpleName() +")";
- }
-
AnalyzerPair(String range, Constructor<? extends Analyzer> cons, Object ... params) throws Exception {
this(range, cons.newInstance(params), cons.newInstance(useEmptyStopWordSet(params)));
}
@@ -409,7 +436,16 @@
}
return rslt;
}
+
+ public Analyzer getAnalyzer(boolean filterStopwords) {
+ return filterStopwords ? withStopWords : withoutStopWords;
+ }
@Override
+ public String toString() {
+ return range.full + "=(" + withStopWords.getClass().getSimpleName() +")";
+ }
+
+ @Override
public int compareTo(AnalyzerPair o) {
return range.compareTo(o.range);
}
@@ -437,10 +473,10 @@
private static class PatternAnalyzerPair extends AnalyzerPair {
- public PatternAnalyzerPair(ConfigOptionsToAnalyzer lro, String pattern) throws Exception {
+ public PatternAnalyzerPair(ConfigOptionsToAnalyzer lro, Pattern pattern) throws Exception {
super(lro.languageRange, getConstructor(PatternAnalyzer.class,Version.class,Pattern.class,Boolean.TYPE,Set.class),
Version.LUCENE_CURRENT,
- Pattern.compile(pattern, Pattern.UNICODE_CHARACTER_CLASS),
+ pattern,
true,
lro.getStopWords());
}
@@ -459,9 +495,13 @@
String like;
String className;
String stopwords;
- String pattern;
+ Pattern pattern;
final String languageRange;
AnalyzerPair result;
+ Pattern wordBoundary;
+ Pattern subWordBoundary;
+ Pattern softHyphens;
+ Boolean alwaysRemoveSoftHyphens;
public ConfigOptionsToAnalyzer(String languageRange) {
this.languageRange = languageRange;
@@ -515,7 +555,15 @@
} else if (shortProperty.equals(AnalyzerOptions.STOPWORDS) ) {
stopwords = value;
} else if (shortProperty.equals(AnalyzerOptions.PATTERN) ) {
- pattern = value;
+ pattern = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
+ } else if (shortProperty.equals(AnalyzerOptions.WORD_BOUNDARY) ) {
+ wordBoundary = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
+ } else if (shortProperty.equals(AnalyzerOptions.SUB_WORD_BOUNDARY) ) {
+ subWordBoundary = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
+ } else if (shortProperty.equals(AnalyzerOptions.SOFT_HYPHENS) ) {
+ softHyphens = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
+ } else if (shortProperty.equals(AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS) ) {
+ alwaysRemoveSoftHyphens = Boolean.valueOf(value);
} else {
return false;
}
@@ -529,6 +577,27 @@
}
className = PatternAnalyzer.class.getName();
}
+ if (this.wordBoundary != null ) {
+ if ( className != null && className != TermCompletionAnalyzer.class.getName()) {
+ throw new RuntimeException("Bad Option: Language range "+languageRange + " with pattern propety for class "+ className);
+ }
+ className = TermCompletionAnalyzer.class.getName();
+
+ if ( subWordBoundary == null ) {
+ subWordBoundary = AnalyzerOptions.DEFAULT_SUB_WORD_BOUNDARY;
+ }
+ if ( alwaysRemoveSoftHyphens != null && softHyphens == null ) {
+ throw new RuntimeException("Bad option: Language range "+languageRange + ": must specify softHypens when setting alwaysRemoveSoftHyphens");
+ }
+ if (softHyphens != null && alwaysRemoveSoftHyphens == null) {
+ alwaysRemoveSoftHyphens = AnalyzerOptions.DEFAULT_ALWAYS_REMOVE_SOFT_HYPHENS;
+ }
+
+ } else if ( subWordBoundary != null || softHyphens != null || alwaysRemoveSoftHyphens != null ||
+ TermCompletionAnalyzer.class.getName().equals(className) ) {
+ throw new RuntimeException("Bad option: Language range "+languageRange + ": must specify wordBoundary for TermCompletionAnalyzer");
+ }
+
if (PatternAnalyzer.class.getName().equals(className) && pattern == null ) {
throw new RuntimeException("Bad Option: Language range "+languageRange + " must specify pattern for PatternAnalyzer.");
}
@@ -547,8 +616,23 @@
}
if (pattern != null) {
return new PatternAnalyzerPair(this, pattern);
-
- }
+ }
+ if (softHyphens != null) {
+ return new AnalyzerPair(
+ languageRange,
+ new TermCompletionAnalyzer(
+ wordBoundary,
+ subWordBoundary,
+ softHyphens,
+ alwaysRemoveSoftHyphens));
+ }
+ if (wordBoundary != null) {
+ return new AnalyzerPair(
+ languageRange,
+ new TermCompletionAnalyzer(
+ wordBoundary,
+ subWordBoundary));
+ }
final Class<? extends Analyzer> cls = getAnalyzerClass();
if (hasConstructor(cls, Version.class, Set.class)) {
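For reference, a sketch (not part of this commit) of a property set that would select the TermCompletionAnalyzer for a private-use language range via the new options. It mirrors the "x-hyphen" configuration added to TestConfigurableAnalyzerFactory later in this thread; such name/value pairs are what the test harness hands to init()/createFullTextIndex():

    String analyzer = ConfigurableAnalyzerFactory.Options.ANALYZER;
    String[] termCompletionConfig = new String[] {
        FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(),
        analyzer + "x-hyphen." + AnalyzerOptions.WORD_BOUNDARY, " ",        // split words on spaces
        analyzer + "x-hyphen." + AnalyzerOptions.SUB_WORD_BOUNDARY, "[-.]", // also index after '-' and '.'
        analyzer + "x-hyphen." + AnalyzerOptions.SOFT_HYPHENS, "-",         // optionally strip '-' from terms
        analyzer + "x-hyphen." + AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS, "false",
    };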
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:43:05 UTC (rev 8250)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:43:16 UTC (rev 8251)
@@ -81,8 +81,8 @@
*/
public class TermCompletionAnalyzer extends Analyzer {
- private final Pattern wordBoundary; // = Pattern.compile(" ", Pattern.UNICODE_CHARACTER_CLASS);
- private final Pattern subWordBoundary; // = Pattern.compile("(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})", Pattern.UNICODE_CHARACTER_CLASS);
+ private final Pattern wordBoundary;
+ private final Pattern subWordBoundary;
private final Pattern discard;
private final boolean alwaysDiscard;
@@ -90,24 +90,25 @@
/**
* Divide the input into words and short tokens
* as with {@link #TermCompletionAnalyzer(Pattern, Pattern)}.
- * If alsoWithSoftHypens is true then output each token,
- * and in any case output each token with every
- * match to softHyphenEtc deleted.
+ * Each term is generated, and then an additional term
+ * is generated with the soft hyphens (defined by the pattern)
+ * removed. If the alwaysRemoveSoftHypens flag is true,
+ * then the first term (before the removal) is suppressed.
*
* @param wordBoundary The definition of space (e.g. " ")
* @param subWordBoundary Also index after matches to this (e.g. "-")
- * @param softHyphenEtc Discard these characters from matches
- * @param alsoWithSoftHyphens If true the discard step is optional.
+ * @param softHyphens Discard these characters from matches
+ * @param alwaysRemoveSoftHypens If false the discard step is optional.
*/
public TermCompletionAnalyzer(Pattern wordBoundary,
Pattern subWordBoundary,
- Pattern softHyphenEtc,
- boolean alsoWithSoftHyphens) {
+ Pattern softHyphens,
+ boolean alwaysRemoveSoftHypens) {
this.wordBoundary = wordBoundary;
this.subWordBoundary = subWordBoundary;
- if (softHyphenEtc != null) {
- discard = softHyphenEtc;
- alwaysDiscard = !alsoWithSoftHyphens;
+ if (softHyphens != null) {
+ discard = softHyphens;
+ alwaysDiscard = alwaysRemoveSoftHypens;
} else {
discard = Pattern.compile("(?!)"); // never matches
alwaysDiscard = true;
@@ -115,9 +116,10 @@
}
/**
* Divide the input into words, separated by the wordBoundary,
- * and return a token for the whole word, and then for the
- * remainder of the word after each successive match of the
- * subWordBoundary.
+ * and return a token for each whole word, and then
+ * generate further tokens for each word by removing prefixes
+ * up to and including each successive match of
+ * subWordBoundary
* @param wordBoundary
* @param subWordBoundary
*/
@@ -189,8 +191,9 @@
afterDiscard.getChars(0, lg, termAtt.termBuffer(), 0);
termAtt.setTermLength(lg);
} else {
- found.get(termAtt.termBuffer());
- termAtt.setTermLength(found.length());
+ int lg = found.length();
+ found.get(termAtt.termBuffer(), 0, lg);
+ termAtt.setTermLength(lg);
}
return true;
} else {
@@ -211,7 +214,7 @@
}
}
afterDiscard = null;
- if (charPos +1 < currentWord.length && softMatcher.find(charPos+1)) {
+ if (charPos + 1 < currentWord.length && softMatcher.find(charPos+1)) {
charPos = softMatcher.end();
found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos);
Matcher discarding = discard.matcher(found);
@@ -232,6 +235,7 @@
currentWord = words[currentWordIx].toCharArray();
termAtt.resizeTermBuffer(currentWord.length);
charPos = 0;
+ found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos);
softMatcher = subWordBoundary.matcher(words[currentWordIx]);
return true;
}
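A worked example (not part of this commit) of the behaviour described in the javadoc above, using the test data that appears later in this thread (testSyapseExample8/9). The patterns match the "x-hyphen" configuration: word boundary " ", sub-word boundary "[-.]", soft hyphen "-".

    TermCompletionAnalyzer keepBoth = new TermCompletionAnalyzer(
            Pattern.compile(" "), Pattern.compile("[-.]"), Pattern.compile("-"), false);
    // For the word "ab-bc.cd-de" this is expected to emit
    //   abbc.cdde ab-bc.cd-de bc.cdde bc.cd-de cdde cd-de de
    // i.e. each suffix after a sub-word boundary, once with '-' removed and once verbatim
    // (the two forms coincide for "de", so it is emitted only once).

    TermCompletionAnalyzer alwaysStrip = new TermCompletionAnalyzer(
            Pattern.compile(" "), Pattern.compile("[-.]"), Pattern.compile("-"), true);
    // With alwaysRemoveSoftHypens=true the verbatim forms are suppressed:
    //   abbc.cdde bc.cdde cdde de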
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:43:05 UTC (rev 8250)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:43:16 UTC (rev 8251)
@@ -139,7 +139,7 @@
final TermAttribute term = s.getAttribute(TermAttribute.class);
final String word = term.term();
assertTrue(ix < expected.length);
- assertEquals(word, expected[ix++]);
+ assertEquals(expected[ix++], word);
}
assertEquals(ix, expected.length);
}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:43:05 UTC (rev 8250)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:43:16 UTC (rev 8251)
@@ -57,6 +57,8 @@
analyzer+"x-terms."+AnalyzerOptions.PATTERN, "\\W+",
analyzer+"x-splits."+AnalyzerOptions.ANALYZER_CLASS, TermCompletionAnalyzer.class.getName(),
analyzer+"x-splits."+AnalyzerOptions.STOPWORDS, AnalyzerOptions.STOPWORDS_VALUE_NONE,
+ analyzer+"x-splits."+AnalyzerOptions.WORD_BOUNDARY, " ",
+ analyzer+"x-splits."+AnalyzerOptions.SUB_WORD_BOUNDARY, "(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})",
analyzer+"x-keywords."+AnalyzerOptions.ANALYZER_CLASS, KeywordAnalyzer.class.getName(),
analyzer+"ru-x-de."+AnalyzerOptions.ANALYZER_CLASS, RussianAnalyzer.class.getName(),
analyzer+"ru-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(),
|
|
From: <jer...@us...> - 2014-05-09 18:10:17
|
Revision: 8254
http://sourceforge.net/p/bigdata/code/8254
Author: jeremy_carroll
Date: 2014-05-09 18:10:14 +0000 (Fri, 09 May 2014)
Log Message:
-----------
Added test for term completion, with bug fix!
Modified Paths:
--------------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:44:11 UTC (rev 8253)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 18:10:14 UTC (rev 8254)
@@ -216,16 +216,20 @@
afterDiscard = null;
if (charPos + 1 < currentWord.length && softMatcher.find(charPos+1)) {
charPos = softMatcher.end();
- found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos);
- Matcher discarding = discard.matcher(found);
- if (discarding.find()) {
- afterDiscard = discarding.replaceAll("");
- }
+ considerMatch();
return true;
} else {
return nextWord();
}
}
+
+ void considerMatch() {
+ found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos);
+ Matcher discarding = discard.matcher(found);
+ if (discarding.find()) {
+ afterDiscard = discarding.replaceAll("");
+ }
+ }
private boolean nextWord() {
currentWordIx++;
@@ -235,8 +239,8 @@
currentWord = words[currentWordIx].toCharArray();
termAtt.resizeTermBuffer(currentWord.length);
charPos = 0;
- found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos);
softMatcher = subWordBoundary.matcher(words[currentWordIx]);
+ considerMatch();
return true;
}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:44:11 UTC (rev 8253)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 18:10:14 UTC (rev 8254)
@@ -59,6 +59,14 @@
analyzer+"x-splits."+AnalyzerOptions.STOPWORDS, AnalyzerOptions.STOPWORDS_VALUE_NONE,
analyzer+"x-splits."+AnalyzerOptions.WORD_BOUNDARY, " ",
analyzer+"x-splits."+AnalyzerOptions.SUB_WORD_BOUNDARY, "(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})",
+ analyzer+"x-hyphen."+AnalyzerOptions.SUB_WORD_BOUNDARY, "[-.]",
+ analyzer+"x-hyphen."+AnalyzerOptions.SOFT_HYPHENS, "-",
+ analyzer+"x-hyphen."+AnalyzerOptions.WORD_BOUNDARY, " ",
+ analyzer+"x-hyphen."+AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS, "false",
+ analyzer+"x-hyphen2."+AnalyzerOptions.SUB_WORD_BOUNDARY, "[-.]",
+ analyzer+"x-hyphen2."+AnalyzerOptions.SOFT_HYPHENS, "-",
+ analyzer+"x-hyphen2."+AnalyzerOptions.WORD_BOUNDARY, " ",
+ analyzer+"x-hyphen2."+AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS, "true",
analyzer+"x-keywords."+AnalyzerOptions.ANALYZER_CLASS, KeywordAnalyzer.class.getName(),
analyzer+"ru-x-de."+AnalyzerOptions.ANALYZER_CLASS, RussianAnalyzer.class.getName(),
analyzer+"ru-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(),
@@ -190,5 +198,21 @@
);
}
+ public void testSyapseExample8() throws IOException {
+ comparisonTest("x-hyphen",
+ true,
+ "\u00b1-ACE3.1.1 ab-bc.cd-de",
+ "\u00b1ACE3.1.1 \u00b1-ACE3.1.1 ACE3.1.1 1.1 1 abbc.cdde ab-bc.cd-de bc.cdde bc.cd-de cdde cd-de de"
+ );
+
+ }
+ public void testSyapseExample9() throws IOException {
+ comparisonTest("x-hyphen2",
+ true,
+ "\u00b1-ACE3.1.1 ab-bc.cd-de",
+ "\u00b1ACE3.1.1 ACE3.1.1 1.1 1 abbc.cdde bc.cdde cdde de"
+ );
+
+ }
}
|
|
From: <jer...@us...> - 2014-05-09 19:07:05
|
Revision: 8255
http://sourceforge.net/p/bigdata/code/8255
Author: jeremy_carroll
Date: 2014-05-09 19:07:02 +0000 (Fri, 09 May 2014)
Log Message:
-----------
minor polishing, a few more tests
Modified Paths:
--------------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 18:10:14 UTC (rev 8254)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 19:07:02 UTC (rev 8255)
@@ -144,11 +144,10 @@
*
* - the subword boundaries are identified in {@link #next()}
* We then set up {@link #found} to contain the most
- * recently found subword, with afterDiscard containing
- * the same word as found with the {@link #discard} pattern
- * applied. {@link #afterDiscard} is not equal to found; if there
- * is nothing to discard then it is null.
+ * recently found subword.
*
+ * - the soft hyphen discarding is processed in {@link #maybeDiscardHyphens()}
+ *
* - if we are not {@link #alwaysDiscard}ing then {@link #afterDiscard}
* can be set to null to return the non-discarded version on the next cycle.
*
@@ -216,14 +215,14 @@
afterDiscard = null;
if (charPos + 1 < currentWord.length && softMatcher.find(charPos+1)) {
charPos = softMatcher.end();
- considerMatch();
+ maybeDiscardHyphens();
return true;
} else {
return nextWord();
}
}
- void considerMatch() {
+ void maybeDiscardHyphens() {
found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos);
Matcher discarding = discard.matcher(found);
if (discarding.find()) {
@@ -240,7 +239,7 @@
termAtt.resizeTermBuffer(currentWord.length);
charPos = 0;
softMatcher = subWordBoundary.matcher(words[currentWordIx]);
- considerMatch();
+ maybeDiscardHyphens();
return true;
}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 18:10:14 UTC (rev 8254)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 19:07:02 UTC (rev 8255)
@@ -102,13 +102,13 @@
return getNdx().getAnalyzer(lang, filterStopWords);
}
- protected void comparisonTest(String lang, boolean stopWordsSignificant, String text, String spaceSeparated)
+ protected void comparisonTest(String lang, boolean filterStopWords, String text, String spaceSeparated)
throws IOException {
if (spaceSeparated == null) {
- String rslt = getTokenStream(getAnalyzer(lang, stopWordsSignificant), text);
+ String rslt = getTokenStream(getAnalyzer(lang, filterStopWords), text);
throw new RuntimeException("Got \"" + rslt+ "\"");
}
- compareTokenStream(getAnalyzer(lang, stopWordsSignificant), text,
+ compareTokenStream(getAnalyzer(lang, filterStopWords), text,
split(spaceSeparated)); //$NON-NLS-1$
}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 18:10:14 UTC (rev 8254)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 19:07:02 UTC (rev 8255)
@@ -35,6 +35,16 @@
import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions;
+/**
+ * Unit tests for {@link ConfigurableAnalyzerFactory}.
+ * We use the same setup, defined in {@link #getExtraProperties()},
+ * for all the tests. Some of the tests check whether bad combinations
+ * of options are detected and reported correctly.
+ * Others check that some input, in a particular language, is
+ * tokenized as expected.
+ * @author jeremycarroll
+ *
+ */
public class TestConfigurableAnalyzerFactory extends AbstractSearchTest {
public TestConfigurableAnalyzerFactory() {
@@ -68,8 +78,8 @@
analyzer+"x-hyphen2."+AnalyzerOptions.WORD_BOUNDARY, " ",
analyzer+"x-hyphen2."+AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS, "true",
analyzer+"x-keywords."+AnalyzerOptions.ANALYZER_CLASS, KeywordAnalyzer.class.getName(),
- analyzer+"ru-x-de."+AnalyzerOptions.ANALYZER_CLASS, RussianAnalyzer.class.getName(),
- analyzer+"ru-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(),
+ analyzer+"en-x-de."+AnalyzerOptions.ANALYZER_CLASS, StandardAnalyzer.class.getName(),
+ analyzer+"en-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(),
};
}
@@ -142,6 +152,25 @@
);
}
+
+ public void testStopWordSwitch() throws IOException {
+ // en-x-de is an English Analyzer using german stopwords!
+ comparisonTest("en-x-de",
+ true,
+ "The fast car arrived slowly.",
+ "the fast car arrived slowly"
+ );
+ comparisonTest("en-x-de",
+ true,
+ "The fast car die arrived slowly.",
+ "the fast car arrived slowly"
+ );
+ comparisonTest("en-x-de",
+ false,
+ "The fast car die arrived slowly.",
+ "the fast car die arrived slowly"
+ );
+ }
public void testSyapseExample1() throws IOException {
comparisonTest("x-splits",
true,
|
|
From: <jer...@us...> - 2014-05-09 22:39:13
|
Revision: 8257
http://sourceforge.net/p/bigdata/code/8257
Author: jeremy_carroll
Date: 2014-05-09 22:39:10 +0000 (Fri, 09 May 2014)
Log Message:
-----------
Added extra test to check that by default we use StandardAnalyzer for everything; refactored a bit
Modified Paths:
--------------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java
Added Paths:
-----------
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractDefaultAnalyzerFactoryTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 19:07:09 UTC (rev 8256)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -366,7 +366,7 @@
}
- private static final String DEFAULT_PROPERTIES =
+ private static final String ALL_LUCENE_NATURAL_LANGUAGES =
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.*.like=eng\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.por.analyzerClass=org.apache.lucene.analysis.br.BrazilianAnalyzer\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.pt.like=por\n" +
@@ -396,6 +396,9 @@
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.eng.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.en.like=eng\n";
+ private static final String LUCENE_STANDARD_ANALYZER =
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.*.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer\n";
+
private static class AnalyzerPair implements Comparable<AnalyzerPair>{
private final LanguageRange range;
private final Analyzer withStopWords;
@@ -703,6 +706,7 @@
* strategy so the code will still work on the {@link #MAX_LANG_CACHE_SIZE}+1 th entry.
*/
private static final int MAX_LANG_CACHE_SIZE = 500;
+
private String defaultLanguage;
private final FullTextIndex<?> fullTextIndex;
@@ -833,25 +837,20 @@
protected Properties initProperties() {
final Properties parentProperties = fullTextIndex.getProperties();
Properties myProps;
- if (Boolean.getBoolean(parentProperties.getProperty(Options.NATURAL_LANGUAGE_SUPPORT, Options.DEFAULT_NATURAL_LAMGUAGE_SUPPORT))) {
- myProps = defaultProperties();
+ if (Boolean.valueOf(parentProperties.getProperty(Options.NATURAL_LANGUAGE_SUPPORT, Options.DEFAULT_NATURAL_LAMGUAGE_SUPPORT))) {
+ myProps = loadPropertyString(ALL_LUCENE_NATURAL_LANGUAGES);
} else {
- myProps = new Properties();
+ myProps = loadPropertyString(LUCENE_STANDARD_ANALYZER);
}
copyRelevantProperties(fullTextIndex.getProperties(), myProps);
-
- if (myProps.isEmpty()) {
- return defaultProperties();
- } else {
- return myProps;
- }
+ return myProps;
}
- protected Properties defaultProperties() {
+ Properties loadPropertyString(String props) {
Properties rslt = new Properties();
try {
- rslt.load(new StringReader(DEFAULT_PROPERTIES));
+ rslt.load(new StringReader(props));
} catch (IOException e) {
throw new RuntimeException("Impossible - well clearly not!", e);
}
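A sketch only (not part of this commit) of how a caller would opt in to the full per-language analyzer set now that the default is a single StandardAnalyzer mapping. NATURAL_LANGUAGE_SUPPORT is referenced through the Options constant because its literal property name is not shown in this diff; the Properties are the ones the enclosing FullTextIndex exposes via getProperties():

    Properties props = new Properties();
    props.setProperty(FullTextIndex.Options.ANALYZER_FACTORY_CLASS,
            ConfigurableAnalyzerFactory.class.getName());
    // Without this, every language range now falls back to the StandardAnalyzer
    // (LUCENE_STANDARD_ANALYZER above).
    props.setProperty(ConfigurableAnalyzerFactory.Options.NATURAL_LANGUAGE_SUPPORT, "true");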
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 19:07:09 UTC (rev 8256)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -1,153 +1,20 @@
-/**
-
-Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved.
-
-Contact:
- SYSTAP, LLC
- 4501 Tower Road
- Greensboro, NC 27410
- lic...@bi...
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-*/
-/*
- * Created on May 7, 2014
- */
package com.bigdata.search;
-import java.io.IOException;
-
-
public abstract class AbstractAnalyzerFactoryTest extends AbstractSearchTest {
- public AbstractAnalyzerFactoryTest() {
+ public AbstractAnalyzerFactoryTest() {
}
-
- public AbstractAnalyzerFactoryTest(String arg0) {
- super(arg0);
+
+ public AbstractAnalyzerFactoryTest(String arg0) {
+ super(arg0);
}
-
- @Override
- public void setUp() throws Exception {
- super.setUp();
- init(getExtraProperties());
- }
-
-
- abstract String[] getExtraProperties();
-
- public void testEnglishFilterStopWords() throws IOException {
- for (String lang: new String[]{ "eng", null, "" }) { //$NON-NLS-1$ //$NON-NLS-2$
- comparisonTest(lang,
- true,
- "The test to end all tests! Forever.", //$NON-NLS-1$
- "test end all tests forever" //$NON-NLS-1$
- );
- }
- }
- public void testEnglishNoFilter() throws IOException {
- for (String lang: new String[]{ "eng", null, "" }) { //$NON-NLS-1$ //$NON-NLS-2$
- comparisonTest(lang,
- false,
- "The test to end all tests! Forever.", //$NON-NLS-1$
- "the test to end all tests forever" //$NON-NLS-1$
- );
- }
- }
-
- // Note we careful use a three letter language code for german.
- // 'de' is more standard, but the DefaultAnalyzerFactory does not
- // implement 'de' correctly.
- public void testGermanFilterStopWords() throws IOException {
- comparisonTest("ger", //$NON-NLS-1$
- true,
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.10") + //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.11"), //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.12") //$NON-NLS-1$
- );
-
- }
- // Note we carefully use a three-letter language code for Russian.
- // 'ru' is more standard, but the DefaultAnalyzerFactory does not
- // implement 'ru' correctly.
- public void testRussianFilterStopWords() throws IOException {
- comparisonTest("rus", //$NON-NLS-1$
- true,
- // I hope this is not offensive text.
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.14") + //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.15"), //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.16") //$NON-NLS-1$
- );
-
- }
- public void testGermanNoStopWords() throws IOException {
- comparisonTest("ger", //$NON-NLS-1$
- false,
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.18") + //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.19"), //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.20") //$NON-NLS-1$
- );
-
- }
- public void testRussianNoStopWords() throws IOException {
- comparisonTest("rus", //$NON-NLS-1$
- false,
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.22") + //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.23"), //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.24") //$NON-NLS-1$
- );
-
- }
- public void testJapanese() throws IOException {
- for (boolean filterStopWords: new Boolean[]{true, false}) {
- comparisonTest("jpn", //$NON-NLS-1$
- filterStopWords,
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.26"), //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.27") + //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.28") + //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.29")); //$NON-NLS-1$
- }
- }
- public void testConfiguredLanguages() {
- checkConfig("BrazilianAnalyzer", "por", "pt"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
- checkConfig("ChineseAnalyzer", "zho", "chi", "zh"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
- checkConfig("CJKAnalyzer", "jpn", "ja", "kor", "ko"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$
- checkConfig("CzechAnalyzer", "ces", "cze", "cs"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
- checkConfig("DutchAnalyzer", "dut", "nld", "nl"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
- checkConfig("GermanAnalyzer", "deu", "ger", "de"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
- checkConfig("GreekAnalyzer", "gre", "ell", "el"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
- checkConfig("RussianAnalyzer", "rus", "ru"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
- checkConfig("ThaiAnalyzer", "th", "tha"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
- checkConfig("StandardAnalyzer", "en", "eng", "", null); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
- }
-
- private void checkConfig(String classname, String ...langs) {
- checkConfig(isBroken(), classname, langs);
-
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ init(getExtraProperties());
}
- protected void checkConfig(boolean threeLetterOnly, String classname, String ...langs) {
- for (String lang:langs) {
- // The DefaultAnalyzerFactory only works for language tags of length exactly three.
- if ((!threeLetterOnly) || (lang != null && lang.length()==3))
- {
- assertEquals(classname, getAnalyzer(lang,true).getClass().getSimpleName());
- if (!threeLetterOnly) assertEquals(classname, getAnalyzer(lang+"-x-foobar",true).getClass().getSimpleName()); //$NON-NLS-1$
- }
- }
-
- }
- abstract boolean isBroken() ;
+ abstract String[] getExtraProperties();
+
}
Copied: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractDefaultAnalyzerFactoryTest.java (from rev 8256, branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java)
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractDefaultAnalyzerFactoryTest.java (rev 0)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractDefaultAnalyzerFactoryTest.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -0,0 +1,133 @@
+/**
+
+Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved.
+
+Contact:
+ SYSTAP, LLC
+ 4501 Tower Road
+ Greensboro, NC 27410
+ lic...@bi...
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+/*
+ * Created on May 7, 2014
+ */
+package com.bigdata.search;
+
+import java.io.IOException;
+
+
+public abstract class AbstractDefaultAnalyzerFactoryTest extends AbstractAnalyzerFactoryTest {
+
+ public AbstractDefaultAnalyzerFactoryTest() {
+ }
+
+ public AbstractDefaultAnalyzerFactoryTest(String arg0) {
+ super(arg0);
+ }
+
+ public void testEnglishFilterStopWords() throws IOException {
+ for (String lang: new String[]{ "eng", null, "" }) { //$NON-NLS-1$ //$NON-NLS-2$
+ comparisonTest(lang,
+ true,
+ "The test to end all tests! Forever.", //$NON-NLS-1$
+ "test end all tests forever" //$NON-NLS-1$
+ );
+ }
+ }
+ public void testEnglishNoFilter() throws IOException {
+ for (String lang: new String[]{ "eng", null, "" }) { //$NON-NLS-1$ //$NON-NLS-2$
+ comparisonTest(lang,
+ false,
+ "The test to end all tests! Forever.", //$NON-NLS-1$
+ "the test to end all tests forever" //$NON-NLS-1$
+ );
+ }
+ }
+
+ // Note we carefully use a three-letter language code for German.
+ // 'de' is more standard, but the DefaultAnalyzerFactory does not
+ // implement 'de' correctly.
+ public void testGermanFilterStopWords() throws IOException {
+ comparisonTest("ger", //$NON-NLS-1$
+ true,
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.10") + //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.11"), //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.12") //$NON-NLS-1$
+ );
+
+ }
+
+ // Note we carefully use a three-letter language code for Russian.
+ // 'ru' is more standard, but the DefaultAnalyzerFactory does not
+ // implement 'ru' correctly.
+ public void testRussianFilterStopWords() throws IOException {
+ comparisonTest("rus", //$NON-NLS-1$
+ true,
+ // I hope this is not offensive text.
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.14") + //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.15"), //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.16") //$NON-NLS-1$
+ );
+
+ }
+ public void testGermanNoStopWords() throws IOException {
+ comparisonTest("ger", //$NON-NLS-1$
+ false,
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.18") + //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.19"), //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.20") //$NON-NLS-1$
+ );
+
+ }
+ public void testRussianNoStopWords() throws IOException {
+ comparisonTest("rus", //$NON-NLS-1$
+ false,
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.22") + //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.23"), //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.24") //$NON-NLS-1$
+ );
+
+ }
+ public void testJapanese() throws IOException {
+ for (boolean filterStopWords: new Boolean[]{true, false}) {
+ comparisonTest("jpn", //$NON-NLS-1$
+ filterStopWords,
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.26"), //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.27") + //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.28") + //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.29")); //$NON-NLS-1$
+ }
+ }
+ public void testConfiguredLanguages() {
+ checkConfig("BrazilianAnalyzer", "por", "pt"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
+ checkConfig("ChineseAnalyzer", "zho", "chi", "zh"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+ checkConfig("CJKAnalyzer", "jpn", "ja", "kor", "ko"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$
+ checkConfig("CzechAnalyzer", "ces", "cze", "cs"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+ checkConfig("DutchAnalyzer", "dut", "nld", "nl"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+ checkConfig("GermanAnalyzer", "deu", "ger", "de"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+ checkConfig("GreekAnalyzer", "gre", "ell", "el"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+ checkConfig("RussianAnalyzer", "rus", "ru"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
+ checkConfig("ThaiAnalyzer", "th", "tha"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
+ checkConfig("StandardAnalyzer", "en", "eng", "", null); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+ }
+
+ @Override
+ protected void checkConfig(String classname, String ...langs) {
+ checkConfig(isBroken(), classname, langs);
+
+ }
+ abstract boolean isBroken() ;
+}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 19:07:09 UTC (rev 8256)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -135,13 +135,28 @@
private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException {
TokenStream s = a.tokenStream(null, new StringReader(text));
int ix = 0;
- while (s.incrementToken()) {
- final TermAttribute term = s.getAttribute(TermAttribute.class);
- final String word = term.term();
- assertTrue(ix < expected.length);
- assertEquals(expected[ix++], word);
- }
- assertEquals(ix, expected.length);
+ while (s.incrementToken()) {
+ final TermAttribute term = s.getAttribute(TermAttribute.class);
+ final String word = term.term();
+ assertTrue(ix < expected.length);
+ assertEquals(expected[ix++], word);
+ }
+ assertEquals(ix, expected.length);
}
+ protected void checkConfig(boolean threeLetterOnly, String classname, String ...langs) {
+ for (String lang:langs) {
+ // The DefaultAnalyzerFactory only works for language tags of length exactly three.
+ if ((!threeLetterOnly) || (lang != null && lang.length()==3)) {
+ assertEquals(classname, getAnalyzer(lang,true).getClass().getSimpleName());
+ if (!threeLetterOnly) {
+ assertEquals(classname, getAnalyzer(lang+"-x-foobar",true).getClass().getSimpleName()); //$NON-NLS-1$
+ }
+ }
+ }
+ }
+ protected void checkConfig(String classname, String ...langs) {
+ checkConfig(false, classname, langs);
+ }
+
}
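
The compareTokenStream helper above uses the pre-Lucene-4 attribute API (incrementToken plus TermAttribute, which later Lucene versions removed). A self-contained sketch of the same pattern, assuming the Lucene version these tests build against; with StandardAnalyzer it should print roughly the tokens expected by testEnglishFilterStopWords:

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    public class TokenStreamSketch {

        // Collect the terms an Analyzer produces for a piece of text, using the
        // same incrementToken()/TermAttribute pattern as compareTokenStream.
        static List<String> tokens(Analyzer a, String text) throws IOException {
            final TokenStream s = a.tokenStream(null, new StringReader(text));
            final List<String> words = new ArrayList<String>();
            while (s.incrementToken()) {
                words.add(s.getAttribute(TermAttribute.class).term());
            }
            return words;
        }

        public static void main(String[] args) throws IOException {
            // Expected output (stop words removed, lower-cased):
            // [test, end, all, tests, forever]
            System.out.println(tokens(new StandardAnalyzer(Version.LUCENE_CURRENT),
                    "The test to end all tests! Forever."));
        }
    }
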
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java 2014-05-09 19:07:09 UTC (rev 8256)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -115,6 +115,7 @@
// behavior of DefaultAnalyzerFactory
suite.addTestSuite(TestConfigurableAsDefaultAnalyzerFactory.class);
suite.addTestSuite(TestConfigurableAnalyzerFactory.class);
+ suite.addTestSuite(TestUnconfiguredAnalyzerFactory.class);
return suite;
}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 19:07:09 UTC (rev 8256)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -45,7 +45,7 @@
* @author jeremycarroll
*
*/
-public class TestConfigurableAnalyzerFactory extends AbstractSearchTest {
+public class TestConfigurableAnalyzerFactory extends AbstractAnalyzerFactoryTest {
public TestConfigurableAnalyzerFactory() {
}
@@ -54,12 +54,8 @@
super(arg0);
}
- public void setUp() throws Exception {
- super.setUp();
- init(getExtraProperties());
- }
-
- private String[] getExtraProperties() {
+ @Override
+ String[] getExtraProperties() {
String analyzer = ConfigurableAnalyzerFactory.Options.ANALYZER;
return new String[]{
FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(),
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java 2014-05-09 19:07:09 UTC (rev 8256)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -26,7 +26,7 @@
*/
package com.bigdata.search;
-public class TestConfigurableAsDefaultAnalyzerFactory extends AbstractAnalyzerFactoryTest {
+public class TestConfigurableAsDefaultAnalyzerFactory extends AbstractDefaultAnalyzerFactoryTest {
public TestConfigurableAsDefaultAnalyzerFactory() {
}
@@ -37,7 +37,9 @@
@Override
String[] getExtraProperties() {
- return new String[]{FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName()};
+ return new String[]{FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(),
+ ConfigurableAnalyzerFactory.Options.NATURAL_LANGUAGE_SUPPORT, "true"
+ };
}
@Override
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java 2014-05-09 19:07:09 UTC (rev 8256)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -26,7 +26,7 @@
*/
package com.bigdata.search;
-public class TestDefaultAnalyzerFactory extends AbstractAnalyzerFactoryTest {
+public class TestDefaultAnalyzerFactory extends AbstractDefaultAnalyzerFactoryTest {
public TestDefaultAnalyzerFactory() {
}
Added: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java (rev 0)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -0,0 +1,24 @@
+package com.bigdata.search;
+
+public class TestUnconfiguredAnalyzerFactory extends AbstractAnalyzerFactoryTest {
+
+ public TestUnconfiguredAnalyzerFactory() {
+ }
+
+ public TestUnconfiguredAnalyzerFactory(String arg0) {
+ super(arg0);
+ }
+
+ @Override
+ String[] getExtraProperties() {
+ return new String[]{
+ FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(),
+ };
+ }
+
+ public void testConfiguredLanguages() {
+ checkConfig("StandardAnalyzer", "por", "pt", "zho", "chi", "zh", "jpn", "ja", "kor", "ko", "ces", "cze", "cs", "dut", "nld", "nl",
+ "deu", "ger", "de", "gre", "ell", "el", "rus", "ru", "th", "tha", "en", "eng", "", null);
+ }
+
+}
|
|
From: <jer...@us...> - 2014-05-09 22:39:23
|
Revision: 8258
http://sourceforge.net/p/bigdata/code/8258
Author: jeremy_carroll
Date: 2014-05-09 22:39:19 +0000 (Fri, 09 May 2014)
Log Message:
-----------
Documentation and formatting etc.
Modified Paths:
--------------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 22:39:19 UTC (rev 8258)
@@ -95,8 +95,10 @@
* <p>
* Other properties, from {@link AnalyzerOptions} start with
* <code>c.b.s.C.analyzer.<em>language-range</em></code> where <code><em>language-range</em></code> conforms
- * with the extended language range construct from RFC 4647, section 2.2. These are used to specify
- * an analyzer for the given language range.
+ * with the extended language range construct from RFC 4647, section 2.2.
+ * Note that bigdata does not allow '*' in property names, so the character '_' is used in place of '*'
+ * when an extended language range appears in a property name.
+ * These are used to specify an analyzer for the given language range.
* <p>
* If no analyzer is specified for the language range <code>*</code> then the {@link StandardAnalyzer} is used.
* <p>
@@ -113,6 +115,8 @@
* <dd>This uses whitespace to tokenize</dd>
* <dt>{@link PatternAnalyzer}</dt>
* <dd>This uses a regular expression to tokenize</dd>
+ * <dt>{@link TermCompletionAnalyzer}</dt>
+ * <dd>This uses up to three regular expressions to specify multiple tokens for each word, to address term completion use cases.</dd>
* <dt>{@link EmptyAnalyzer}</dt>
* <dd>This suppresses the functionality, by treating every expression as a stop word.</dd>
* </dl>
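
Concretely, an analyzer is chosen per language range through ordinary properties, and because bigdata property names cannot contain '*', the character '_' stands in for the catch-all range. The sketch below mirrors the ranges exercised by TestConfigurableAnalyzerFactory later in this message; x-empty, x-terms and x-splits are private-use tags invented for the tests, and EmptyAnalyzer is assumed to live in com.bigdata.search:

    import java.util.Properties;

    public class ExampleAnalyzerConfig {

        // The prefix every ConfigurableAnalyzerFactory analyzer option starts with.
        private static final String PREFIX =
                "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.";

        static Properties exampleConfig() {
            final Properties p = new Properties();
            // '_' stands in for the '*' language range: anything not matched below
            // is treated like the x-empty configuration.
            p.setProperty(PREFIX + "_.like", "x-empty");
            p.setProperty(PREFIX + "x-empty.analyzerClass",
                    "com.bigdata.search.EmptyAnalyzer");
            // Giving a pattern implicitly selects the PatternAnalyzer for this range.
            p.setProperty(PREFIX + "x-terms.pattern", "\\W+");
            p.setProperty(PREFIX + "x-splits.analyzerClass",
                    "com.bigdata.search.TermCompletionAnalyzer");
            return p;
        }

        public static void main(String[] args) {
            exampleConfig().list(System.out);
        }
    }
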
@@ -126,11 +130,26 @@
public class ConfigurableAnalyzerFactory implements IAnalyzerFactory {
final private static transient Logger log = Logger.getLogger(ConfigurableAnalyzerFactory.class);
- static class LanguageRange implements Comparable<LanguageRange> {
+ /**
+ * This is an implementation of an RFC 4647 language range,
+ * targeted at the context of bigdata, and only
+ * supporting the extended filtering specified in section 3.3.2.
+ * <p>
+ * Language ranges are comparable so that
+ * sorting an array and then matching a language tag against each
+ * member of the array in sequence will give the longest match,
+ * i.e. the longer ranges come first.
+ * @author jeremycarroll
+ *
+ */
+ public static class LanguageRange implements Comparable<LanguageRange> {
private final String range[];
private final String full;
-
+ /**
+ * Note: the range must be in lower case; this is not verified.
+ * @param range
+ */
public LanguageRange(String range) {
this.range = range.split("-");
full = range;
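
The ordering property claimed here can be illustrated with a toy comparator; this is not LanguageRange#compareTo itself (which also has to break ties), it only shows why putting longer ranges first lets a sequential scan return the longest match:

    import java.util.Arrays;
    import java.util.Comparator;

    public class RangeOrderingSketch {
        public static void main(String[] args) {
            final String[] ranges = { "en", "*", "en-us", "zh-*-hant" };
            // Order by number of subtags, descending, so more specific ranges
            // are examined before shorter, more general ones.
            Arrays.sort(ranges, new Comparator<String>() {
                public int compare(String a, String b) {
                    return b.split("-").length - a.split("-").length;
                }
            });
            // Prints: [zh-*-hant, en-us, en, *]
            System.out.println(Arrays.toString(ranges));
        }
    }
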
@@ -174,12 +193,22 @@
return full.hashCode();
}
+ /**
+ * This implements the algorithm of section 3.3.2 of RFC 4647
+ * as modified with the observation about private use tags
+ * in <a href="http://lists.w3.org/Archives/Public/www-international/2014AprJun/0084">
+ * this message</a>.
+ *
+ *
+ * @param langTag The RFC 5646 Language tag in lower case
+ * @return The result of the algorithm
+ */
public boolean extendedFilterMatch(String langTag) {
return extendedFilterMatch(langTag.toLowerCase(Locale.ROOT).split("-"));
}
// See RFC 4647, 3.3.2
- public boolean extendedFilterMatch(String[] language) {
+ boolean extendedFilterMatch(String[] language) {
// RFC 4647 step 2
if (!matchSubTag(language[0], range[0])) {
return false;
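
For readers unfamiliar with RFC 4647, a self-contained sketch of the plain section 3.3.2 algorithm follows (without the private-use-tag modification referenced above; both range and tag are assumed to be lower case and '-'-separated):

    public class ExtendedFilterSketch {

        // RFC 4647, section 3.3.2: does the extended language range match the tag?
        static boolean extendedFilterMatch(String range, String langTag) {
            final String[] r = range.split("-");
            final String[] t = langTag.split("-");
            // Step 2: the first subtags must match, unless the range starts with '*'.
            if (!r[0].equals("*") && !r[0].equals(t[0])) {
                return false;
            }
            int ri = 1, ti = 1;
            // Step 3: try to account for every remaining range subtag.
            while (ri < r.length) {
                if (r[ri].equals("*")) {
                    ri++;              // a wildcard matches zero or more tag subtags
                } else if (ti >= t.length) {
                    return false;      // range subtags left over, nothing to match them
                } else if (r[ri].equals(t[ti])) {
                    ri++; ti++;        // subtags match, advance both
                } else if (t[ti].length() == 1) {
                    return false;      // reached a singleton (e.g. 'x') without a match
                } else {
                    ti++;              // skip this tag subtag and retry
                }
            }
            return true;               // step 4: all range subtags accounted for
        }

        public static void main(String[] args) {
            System.out.println(extendedFilterMatch("de-*-de", "de-latn-de")); // true
            System.out.println(extendedFilterMatch("de-*-de", "de-deva"));    // false
            System.out.println(extendedFilterMatch("*", "ja"));               // true
        }
    }
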
@@ -227,13 +256,14 @@
*/
public interface Options {
/**
- * By setting this option to true, then the behavior of the legacy {@link DefaultAnalyzerFactory}
- * is added, and may be overridden by the settings of the user.
+ * When this option is set to true, all the known Lucene analyzers for natural
+ * languages are used for a range of language tags.
+ * These settings may then be overridden by the user's own settings.
* Specifically the following properties are loaded, prior to loading the
* user's specification (with <code>c.b.s.C</code> expanding to
* <code>com.bigdata.search.ConfigurableAnalyzerFactory</code>)
<pre>
-c.b.s.C.analyzer.*.like=eng
+c.b.s.C.analyzer._.like=eng
c.b.s.C.analyzer.por.analyzerClass=org.apache.lucene.analysis.br.BrazilianAnalyzer
c.b.s.C.analyzer.pt.like=por
c.b.s.C.analyzer.zho.analyzerClass=org.apache.lucene.analysis.cn.ChineseAnalyzer
@@ -281,7 +311,9 @@
/**
* If specified this is the fully qualified name of a subclass of {@link Analyzer}
* that has appropriate constructors.
- * Either this or {@link #LIKE} or {@link #PATTERN} must be specified for each language range.
+ * This is set implicitly if some of the options below are selected (for example {@link #PATTERN}).
+ * For each configured language range, if it is not set, either explicitly or implicitly, then
+ * {@link #LIKE} must be specified.
*/
String ANALYZER_CLASS = "analyzerClass";
@@ -399,24 +431,64 @@
private static final String LUCENE_STANDARD_ANALYZER =
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.*.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer\n";
+ /**
+ * This comment describes the implementation of {@link ConfigurableAnalyzerFactory}.
+ * The only method in the interface is {@link ConfigurableAnalyzerFactory#getAnalyzer(String, boolean)};
+ * a map from language tag to {@link AnalyzerPair} is used, where the pair contains
+ * an {@link Analyzer} both with and without stopwords configured (sometimes these two analyzers are identical,
+ * if, for example, stop words are not supported or not required).
+ * <p>
+ * If there is no entry for the language tag in the map {@link ConfigurableAnalyzerFactory#langTag2AnalyzerPair},
+ * then one is created by walking down the array {@link ConfigurableAnalyzerFactory#config} of AnalyzerPairs
+ * until a matching one is found.
+ * <p>
+ * The bulk of the code in this class is invoked from the constructor in order to set up this
+ * {@link ConfigurableAnalyzerFactory#config} array. For example, the subclasses of {@link AnalyzerPair}
+ * exist simply to call the appropriate constructor in the appropriate way: the difficulty is that many subclasses
+ * of {@link Analyzer} have constructors with different signatures, and our code needs to handle each variant.
+ * @author jeremycarroll
+ *
+ */
private static class AnalyzerPair implements Comparable<AnalyzerPair>{
- private final LanguageRange range;
+ final LanguageRange range;
private final Analyzer withStopWords;
private final Analyzer withoutStopWords;
+ public Analyzer getAnalyzer(boolean filterStopwords) {
+ return filterStopwords ? withStopWords : withoutStopWords;
+ }
+
+ public boolean extendedFilterMatch(String[] language) {
+ return range.extendedFilterMatch(language);
+ }
+
AnalyzerPair(String range, Analyzer withStopWords, Analyzer withOutStopWords) {
this.range = new LanguageRange(range);
this.withStopWords = withStopWords;
this.withoutStopWords = withOutStopWords;
}
+ /**
+ * This clone constructor implements {@link AnalyzerOptions#LIKE}.
+ * @param range
+ * @param copyMe
+ */
AnalyzerPair(String range, AnalyzerPair copyMe) {
this.range = new LanguageRange(range);
this.withStopWords = copyMe.withStopWords;
this.withoutStopWords = copyMe.withoutStopWords;
-
}
+ /**
+ * If we have a constructor with arguments including a populated
+ * stop word set, then we can use it to make both the withStopWords
+ * analyzer and the withoutStopWords analyzer.
+ * @param range
+ * @param cons A Constructor including a {@link java.util.Set} argument
+ * for the stop words.
+ * @param params The arguments to pass to the constructor including a populated stopword set.
+ * @throws Exception
+ */
AnalyzerPair(String range, Constructor<? extends Analyzer> cons, Object ... params) throws Exception {
this(range, cons.newInstance(params), cons.newInstance(useEmptyStopWordSet(params)));
}
@@ -435,9 +507,6 @@
return rslt;
}
- public Analyzer getAnalyzer(boolean filterStopwords) {
- return filterStopwords ? withStopWords : withoutStopWords;
- }
@Override
public String toString() {
return range.full + "=(" + withStopWords.getClass().getSimpleName() +")";
@@ -447,30 +516,38 @@
public int compareTo(AnalyzerPair o) {
return range.compareTo(o.range);
}
-
- public boolean extendedFilterMatch(String[] language) {
- return range.extendedFilterMatch(language);
- }
}
+ /**
+ * Used for Analyzer classes with a constructor with signature (Version, Set).
+ * @author jeremycarroll
+ *
+ */
private static class VersionSetAnalyzerPair extends AnalyzerPair {
public VersionSetAnalyzerPair(ConfigOptionsToAnalyzer lro,
Class<? extends Analyzer> cls) throws Exception {
super(lro.languageRange, getConstructor(cls, Version.class, Set.class), Version.LUCENE_CURRENT, lro.getStopWords());
}
}
-
+
+ /**
+ * Used for Analyzer classes which do not support stopwords and have a constructor with signature (Version).
+ * @author jeremycarroll
+ *
+ */
private static class VersionAnalyzerPair extends AnalyzerPair {
-
public VersionAnalyzerPair(String range, Class<? extends Analyzer> cls) throws Exception {
super(range, getConstructor(cls, Version.class).newInstance(Version.LUCENE_CURRENT));
}
}
-
+ /**
+ * Special case code for {@link PatternAnalyzer}
+ * @author jeremycarroll
+ *
+ */
private static class PatternAnalyzerPair extends AnalyzerPair {
-
public PatternAnalyzerPair(ConfigOptionsToAnalyzer lro, Pattern pattern) throws Exception {
super(lro.languageRange, getConstructor(PatternAnalyzer.class,Version.class,Pattern.class,Boolean.TYPE,Set.class),
Version.LUCENE_CURRENT,
@@ -485,6 +562,16 @@
* This class is initialized with the config options, using the {@link #setProperty(String, String)}
* method, for a particular language range and works out which pair of {@link Analyzer}s
* to use for that language range.
+ * <p>
+ * Instances of this class are only alive during the execution of
+ * {@link ConfigurableAnalyzerFactory#ConfigurableAnalyzerFactory(FullTextIndex)};
+ * the life-cycle is:
+ * <ol>
+ * <li>The relevant config properties are applied, and are used to populate the fields.
+ * <li>The fields are validated.
+ * <li>An {@link AnalyzerPair} is constructed.
+ * </ol>
+ *
* @author jeremycarroll
*
*/
@@ -545,6 +632,10 @@
return ( stopwords == null && pattern == null ) || AnalyzerOptions.STOPWORDS_VALUE_DEFAULT.equals(stopwords);
}
+ /**
+ * The first step in the life-cycle, used to initialize the fields.
+ * @return true if the property was recognized.
+ */
public boolean setProperty(String shortProperty, String value) {
if (shortProperty.equals(AnalyzerOptions.LIKE) ) {
like = value;
@@ -568,6 +659,9 @@
return true;
}
+ /**
+ * The second phase of the life-cycle, used for sanity checking.
+ */
public void validate() {
if (pattern != null ) {
if ( className != null && className != PatternAnalyzer.class.getName()) {
@@ -608,6 +702,10 @@
}
+ /**
+ * The third and final phase of the life-cycle, used for identifying
+ * the AnalyzerPair.
+ */
private AnalyzerPair construct() throws Exception {
if (className == null) {
return null;
@@ -660,6 +758,29 @@
throw new RuntimeException("Bad option: cannot find constructor for class " + className + " for language range " + languageRange);
}
+ /**
+ * Also part of the third phase of the life-cycle, following the {@link AnalyzerOptions#LIKE}
+ * properties.
+ * @param depth
+ * @param max
+ * @param analyzers
+ * @return
+ */
+ AnalyzerPair followLikesToAnalyzerPair(int depth, int max,
+ Map<String, ConfigOptionsToAnalyzer> analyzers) {
+ if (result == null) {
+ if (depth == max) {
+ throw new RuntimeException("Bad configuration: - 'like' loop for language range " + languageRange);
+ }
+ ConfigOptionsToAnalyzer next = analyzers.get(like);
+ if (next == null) {
+ throw new RuntimeException("Bad option: - 'like' not found for language range " + languageRange+ " (not found: '"+ like +"')");
+ }
+ result = new AnalyzerPair(languageRange, next.followLikesToAnalyzerPair(depth+1, max, analyzers));
+ }
+ return result;
+ }
+
protected Class<? extends Analyzer> getAnalyzerClass() {
return getAnalyzerClass(className);
}
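
The 'like' resolution above can also be illustrated standalone: each range either names an analyzer class directly or is 'like' another range, and the depth bound turns a cycle into a configuration error instead of unbounded recursion. The maps and the 'br' range below are hypothetical examples; only the error messages echo the code above:

    import java.util.HashMap;
    import java.util.Map;

    public class LikeResolutionSketch {

        // Hypothetical configuration: 'por' names a class, 'pt' is like 'por',
        // and 'br' is like 'pt'.
        static final Map<String, String> analyzerClass = new HashMap<String, String>();
        static final Map<String, String> likes = new HashMap<String, String>();

        // Follow 'like' links until a range with a directly configured class is
        // reached; the depth bound (number of configured ranges) catches loops.
        static String resolve(String range, int depth, int max) {
            final String direct = analyzerClass.get(range);
            if (direct != null) {
                return direct;
            }
            if (depth == max) {
                throw new RuntimeException(
                        "Bad configuration: - 'like' loop for language range " + range);
            }
            final String like = likes.get(range);
            if (like == null) {
                throw new RuntimeException(
                        "Bad option: - 'like' not found for language range " + range);
            }
            return resolve(like, depth + 1, max);
        }

        public static void main(String[] args) {
            analyzerClass.put("por", "org.apache.lucene.analysis.br.BrazilianAnalyzer");
            likes.put("pt", "por");
            likes.put("br", "pt");
            // Prints the BrazilianAnalyzer class name, resolved via br -> pt -> por.
            System.out.println(resolve("br", 0, 3));
        }
    }
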
@@ -678,22 +799,6 @@
void setAnalyzerPair(AnalyzerPair ap) {
result = ap;
}
-
- AnalyzerPair followLikesToAnalyzerPair(int depth, int max,
- Map<String, ConfigOptionsToAnalyzer> analyzers) {
- if (result == null) {
- if (depth == max) {
- throw new RuntimeException("Bad configuration: - 'like' loop for language range " + languageRange);
- }
- ConfigOptionsToAnalyzer next = analyzers.get(like);
- if (next == null) {
- throw new RuntimeException("Bad option: - 'like' not found for language range " + languageRange+ " (not found: '"+ like +"')");
- }
- result = new AnalyzerPair(languageRange, next.followLikesToAnalyzerPair(depth+1, max, analyzers));
- }
- return result;
- }
-
}
private final AnalyzerPair config[];
@@ -712,7 +817,13 @@
private final FullTextIndex<?> fullTextIndex;
+ /**
+ * Builds a new ConfigurableAnalyzerFactory.
+ * @param fullTextIndex
+ */
public ConfigurableAnalyzerFactory(final FullTextIndex<?> fullTextIndex) {
+ // A description of the operation of this method is found on AnalyzerPair and
+ // ConfigOptionsToAnalyzer.
// despite our name, we actually make all the analyzers now, and getAnalyzer method is merely a lookup.
if (fullTextIndex == null)
@@ -837,9 +948,18 @@
protected Properties initProperties() {
final Properties parentProperties = fullTextIndex.getProperties();
Properties myProps;
- if (Boolean.valueOf(parentProperties.getProperty(Options.NATURAL_LANGUAGE_SUPPORT, Options.DEFAULT_NATURAL_LAMGUAGE_SUPPORT))) {
+ if (Boolean.valueOf(parentProperties.getProperty(
+ Options.NATURAL_LANGUAGE_SUPPORT,
+ Options.DEFAULT_NATURAL_LAMGUAGE_SUPPORT))) {
+
myProps = loadPropertyString(ALL_LUCENE_NATURAL_LANGUAGES);
+
+ } else if (hasPropertiesForStarLanguageRange(parentProperties)){
+
+ myProps = new Properties();
+
} else {
+
myProps = loadPropertyString(LUCENE_STANDARD_ANALYZER);
}
@@ -867,6 +987,17 @@
}
}
+ private boolean hasPropertiesForStarLanguageRange(Properties from) {
+ Enumeration<?> en = from.propertyNames();
+ while (en.hasMoreElements()) {
+ String prop = (String)en.nextElement();
+ if (prop.startsWith(Options.ANALYZER+"_.")
+ || prop.startsWith(Options.ANALYZER+"*.")) {
+ return true;
+ }
+ }
+ return false;
+ }
@Override
public Analyzer getAnalyzer(String languageCode, boolean filterStopwords) {
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 22:39:10 UTC (rev 8257)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 22:39:19 UTC (rev 8258)
@@ -1,3 +1,29 @@
+/**
+
+Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved.
+
+Contact:
+ SYSTAP, LLC
+ 4501 Tower Road
+ Greensboro, NC 27410
+ lic...@bi...
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+/*
+ * Created on May 9, 2014
+ */
package com.bigdata.search;
public abstract class AbstractAnalyzerFactoryTest extends AbstractSearchTest {
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 22:39:19 UTC (rev 8258)
@@ -59,7 +59,8 @@
String analyzer = ConfigurableAnalyzerFactory.Options.ANALYZER;
return new String[]{
FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(),
- analyzer+"*."+AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(),
+ analyzer+"_."+AnalyzerOptions.LIKE, "x-empty",
+ analyzer+"x-empty."+AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(),
analyzer+"x-terms."+AnalyzerOptions.PATTERN, "\\W+",
analyzer+"x-splits."+AnalyzerOptions.ANALYZER_CLASS, TermCompletionAnalyzer.class.getName(),
analyzer+"x-splits."+AnalyzerOptions.STOPWORDS, AnalyzerOptions.STOPWORDS_VALUE_NONE,
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java 2014-05-09 22:39:19 UTC (rev 8258)
@@ -1,3 +1,29 @@
+/**
+
+Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved.
+
+Contact:
+ SYSTAP, LLC
+ 4501 Tower Road
+ Greensboro, NC 27410
+ lic...@bi...
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+/*
+ * Created on May 7, 2014
+ */
package com.bigdata.search;
public class TestUnconfiguredAnalyzerFactory extends AbstractAnalyzerFactoryTest {
|