From: <jer...@us...> - 2014-05-09 17:42:49
Revision: 8248 http://sourceforge.net/p/bigdata/code/8248 Author: jeremy_carroll Date: 2014-05-09 17:42:44 +0000 (Fri, 09 May 2014) Log Message: ----------- First version of TermCompletionAnalyzer, and also tests now passing Modified Paths: -------------- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java Added Paths: ----------- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 17:07:05 UTC (rev 8247) +++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 17:42:44 UTC (rev 8248) @@ -331,7 +331,7 @@ * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled). * It is an error if a different analyzer class is specified. */ - String PATTERN = ".pattern"; + String PATTERN = "pattern"; } @@ -474,7 +474,7 @@ */ public Set<?> getStopWords() { - if (AnalyzerOptions.STOPWORDS_VALUE_NONE.equals(stopwords)) + if (doNotUseStopWords()) return Collections.EMPTY_SET; if (useDefaultStopWords()) { @@ -484,6 +484,10 @@ return getStopWordsForClass(stopwords); } + boolean doNotUseStopWords() { + return AnalyzerOptions.STOPWORDS_VALUE_NONE.equals(stopwords) || (stopwords == null && pattern != null); + } + protected Set<?> getStopWordsForClass(String clazzName) { Class<? extends Analyzer> analyzerClass = getAnalyzerClass(clazzName); try { @@ -500,7 +504,7 @@ } protected boolean useDefaultStopWords() { - return stopwords == null || AnalyzerOptions.STOPWORDS_VALUE_DEFAULT.equals(stopwords); + return ( stopwords == null && pattern == null ) || AnalyzerOptions.STOPWORDS_VALUE_DEFAULT.equals(stopwords); } public boolean setProperty(String shortProperty, String value) { @@ -550,8 +554,13 @@ if (hasConstructor(cls, Version.class, Set.class)) { // RussianAnalyzer is missing any way to access stop words. 
- if (RussianAnalyzer.class.equals(cls) && useDefaultStopWords()) { - return new AnalyzerPair(languageRange, new RussianAnalyzer(Version.LUCENE_CURRENT), new RussianAnalyzer(Version.LUCENE_CURRENT, Collections.EMPTY_SET)); + if (RussianAnalyzer.class.equals(cls)) { + if (useDefaultStopWords()) { + return new AnalyzerPair(languageRange, new RussianAnalyzer(Version.LUCENE_CURRENT), new RussianAnalyzer(Version.LUCENE_CURRENT, Collections.EMPTY_SET)); + } + if (doNotUseStopWords()) { + return new AnalyzerPair(languageRange, new RussianAnalyzer(Version.LUCENE_CURRENT, Collections.EMPTY_SET)); + } } return new VersionSetAnalyzerPair(this, cls); } @@ -719,7 +728,7 @@ String prop = (String)en.nextElement(); if (prop.equals(Options.INCLUDE_DEFAULTS)) continue; if (prop.startsWith(Options.ANALYZER)) { - String languageRangeAndProperty[] = prop.substring(Options.ANALYZER.length()).split("[.]"); + String languageRangeAndProperty[] = prop.substring(Options.ANALYZER.length()).replaceAll("_","*").split("[.]"); if (languageRangeAndProperty.length == 2) { String languageRange = languageRangeAndProperty[0].toLowerCase(Locale.US); // Turkish "I" could create a problem Added: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java (rev 0) +++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:42:44 UTC (rev 8248) @@ -0,0 +1,88 @@ +package com.bigdata.search; + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.nio.CharBuffer; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.util.Attribute; + + +public class TermCompletionAnalyzer extends Analyzer { + + Pattern hard = Pattern.compile(" ", Pattern.UNICODE_CHARACTER_CLASS); + Pattern soft = Pattern.compile("(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})", Pattern.UNICODE_CHARACTER_CLASS); + + public TermCompletionAnalyzer() { + // TODO Auto-generated constructor stub + } + + private class TermCompletionTokenStream extends TokenStream { + + final int length; + final String[] words; + char currentWord[] = new char[]{}; + Matcher softMatcher; + int currentWordIx = -1; + int charPos = 0; + final TermAttribute termAtt; + public TermCompletionTokenStream(StringReader reader) { + termAtt = addAttribute(TermAttribute.class); + try { + reader.mark(Integer.MAX_VALUE); + length = (int) reader.skip(Integer.MAX_VALUE); + reader.reset(); + char fileContent[] = new char[length]; + reader.read(fileContent); + words = hard.split(new String(fileContent)); + } catch (IOException e) { + throw new RuntimeException("Impossible",e); + } + } + @Override + public boolean incrementToken() throws IOException { + if ( next() ) { + int lg = currentWord.length - charPos; + System.arraycopy(currentWord, charPos, termAtt.termBuffer(), 0, lg ); + termAtt.setTermLength(lg); + return true; + } else { + return false; + } + } + private boolean next() { + if (currentWordIx >= words.length) { + return false; + } + if (charPos +1 < currentWord.length && softMatcher.find(charPos+1)) { + charPos = softMatcher.end(); + return true; + } else { + return nextWord(); + } + } + private 
boolean nextWord() { + currentWordIx++; + if (currentWordIx >= words.length) { + return false; + } + currentWord = words[currentWordIx].toCharArray(); + termAtt.resizeTermBuffer(currentWord.length); + charPos = 0; + softMatcher = soft.matcher(words[currentWordIx]); + return true; + } + + } + + + @Override + public TokenStream tokenStream(String ignoredFieldName, Reader reader) { + return new TermCompletionTokenStream((StringReader)reader); + } +} Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 17:07:05 UTC (rev 8247) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 17:42:44 UTC (rev 8248) @@ -27,11 +27,7 @@ package com.bigdata.search; import java.io.IOException; -import java.io.StringReader; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; public abstract class AbstractAnalyzerFactoryTest extends AbstractSearchTest { @@ -42,37 +38,16 @@ super(arg0); } + @Override public void setUp() throws Exception { super.setUp(); - init(getExtraProperties()); + init(getExtraProperties()); } + + abstract String[] getExtraProperties(); - private Analyzer getAnalyzer(String lang, boolean filterStopWords) { - return getNdx().getAnalyzer(lang, filterStopWords); - } - - private void comparisonTest(String lang, - boolean stopWordsSignificant, - String text, - String spaceSeparated) throws IOException { - compareTokenStream(getAnalyzer(lang, stopWordsSignificant), text, - spaceSeparated.split(" ")); //$NON-NLS-1$ - } - private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException { - TokenStream s = a.tokenStream(null, new StringReader(text)); - int ix = 0; - while (s.incrementToken()) { - final TermAttribute term = s.getAttribute(TermAttribute.class); - final String word = term.term(); - assertTrue(ix < expected.length); - assertEquals(word, expected[ix++]); - } - assertEquals(ix, expected.length); - } - - - public void testEnglishFilterStopWords() throws IOException { + public void testEnglishFilterStopWords() throws IOException { for (String lang: new String[]{ "eng", null, "" }) { //$NON-NLS-1$ //$NON-NLS-2$ comparisonTest(lang, true, @@ -159,14 +134,20 @@ } private void checkConfig(String classname, String ...langs) { + checkConfig(isBroken(), classname, langs); + + } + protected void checkConfig(boolean threeLetterOnly, String classname, String ...langs) { for (String lang:langs) { // The DefaultAnalyzerFactory only works for language tags of length exactly three. 
-// if (lang != null && lang.length()==3) + if ((!threeLetterOnly) || (lang != null && lang.length()==3)) { assertEquals(classname, getAnalyzer(lang,true).getClass().getSimpleName()); - assertEquals(classname, getAnalyzer(lang+NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.0"),true).getClass().getSimpleName()); //$NON-NLS-1$ + if (!threeLetterOnly) assertEquals(classname, getAnalyzer(lang+"-x-foobar",true).getClass().getSimpleName()); //$NON-NLS-1$ } } } + + abstract boolean isBroken() ; } Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:07:05 UTC (rev 8247) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:42:44 UTC (rev 8248) @@ -26,8 +26,14 @@ */ package com.bigdata.search; +import java.io.IOException; +import java.io.StringReader; import java.util.Properties; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + import com.bigdata.journal.IIndexManager; import com.bigdata.journal.ITx; import com.bigdata.journal.ProxyTestCase; @@ -62,7 +68,7 @@ } FullTextIndex<Long> createFullTextIndex(String namespace, String ...propertyValuePairs) { - return createFullTextIndex(namespace, getProperties(), propertyValuePairs); + return createFullTextIndex(namespace, (Properties)getProperties().clone(), propertyValuePairs); } public void tearDown() throws Exception { @@ -92,4 +98,51 @@ return properties; } + protected Analyzer getAnalyzer(String lang, boolean filterStopWords) { + return getNdx().getAnalyzer(lang, filterStopWords); + } + + protected void comparisonTest(String lang, boolean stopWordsSignificant, String text, String spaceSeparated) + throws IOException { + if (spaceSeparated == null) { + String rslt = getTokenStream(getAnalyzer(lang, stopWordsSignificant), text); + throw new RuntimeException("Got \"" + rslt+ "\""); + } + compareTokenStream(getAnalyzer(lang, stopWordsSignificant), text, + split(spaceSeparated)); //$NON-NLS-1$ + } + + private String[] split(String spaceSeparated) { + if (spaceSeparated.length()==0) { + return new String[0]; + } + return spaceSeparated.split(" "); + } + + protected String getTokenStream(Analyzer a, String text) throws IOException { + StringBuffer sb = new StringBuffer(); + TokenStream s = a.tokenStream(null, new StringReader(text)); + int ix = 0; + while (s.incrementToken()) { + final TermAttribute term = s.getAttribute(TermAttribute.class); + if (sb.length()!=0) { + sb.append(" "); + } + sb.append(term.term()); + } + return sb.toString(); + } + + private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException { + TokenStream s = a.tokenStream(null, new StringReader(text)); + int ix = 0; + while (s.incrementToken()) { + final TermAttribute term = s.getAttribute(TermAttribute.class); + final String word = term.term(); + assertTrue(ix < expected.length); + assertEquals(word, expected[ix++]); + } + assertEquals(ix, expected.length); + } + } Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java 2014-05-09 17:07:05 UTC (rev 8247) +++ 
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java 2014-05-09 17:42:44 UTC (rev 8248) @@ -114,6 +114,7 @@ // which is intended to be the same as the intended // behavior of DefaultAnalyzerFactory suite.addTestSuite(TestConfigurableAsDefaultAnalyzerFactory.class); + suite.addTestSuite(TestConfigurableAnalyzerFactory.class); return suite; } Added: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java (rev 0) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:42:44 UTC (rev 8248) @@ -0,0 +1,195 @@ +/** + +Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved. + +Contact: + SYSTAP, LLC + 4501 Tower Road + Greensboro, NC 27410 + lic...@bi... + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +/* + * Created on May 7, 2014 + */ +package com.bigdata.search; + +import java.io.IOException; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.KeywordAnalyzer; +import org.apache.lucene.analysis.cjk.CJKAnalyzer; +import org.apache.lucene.analysis.de.GermanAnalyzer; +import org.apache.lucene.analysis.ru.RussianAnalyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.util.Version; + +import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions; + +public class TestConfigurableAnalyzerFactory extends AbstractSearchTest { + + public TestConfigurableAnalyzerFactory() { + } + + public TestConfigurableAnalyzerFactory(String arg0) { + super(arg0); + } + + public void setUp() throws Exception { + super.setUp(); + init(getExtraProperties()); + } + + private String[] getExtraProperties() { + String analyzer = ConfigurableAnalyzerFactory.Options.ANALYZER; + return new String[]{ + FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(), + analyzer+"*."+AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(), + analyzer+"x-terms."+AnalyzerOptions.PATTERN, "\\W+", + analyzer+"x-splits."+AnalyzerOptions.ANALYZER_CLASS, TermCompletionAnalyzer.class.getName(), + analyzer+"x-splits."+AnalyzerOptions.STOPWORDS, AnalyzerOptions.STOPWORDS_VALUE_NONE, + analyzer+"x-keywords."+AnalyzerOptions.ANALYZER_CLASS, KeywordAnalyzer.class.getName(), + analyzer+"ru-x-de."+AnalyzerOptions.ANALYZER_CLASS, RussianAnalyzer.class.getName(), + analyzer+"ru-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(), + }; + } + + private void badCombo(String errorMessage, String ... props) { + // Check that some combination of properties on a language create an error + String myProps[] = new String[props.length+4]; + int i=0; + for (; i<props.length;i+=2) { + myProps[i] = ConfigurableAnalyzerFactory.Options.ANALYZER + "x-testme." 
+ props[i]; + myProps[i+1] = props[i+1]; + } + myProps[i] = ConfigurableAnalyzerFactory.Options.ANALYZER + "_." + AnalyzerOptions.ANALYZER_CLASS; + myProps[i+1] = EmptyAnalyzer.class.getName(); + myProps[i+2] = FullTextIndex.Options.ANALYZER_FACTORY_CLASS; + myProps[i+3] = ConfigurableAnalyzerFactory.class.getName(); + try { + this.createFullTextIndex("test-in-error"+getName(), myProps); + } + catch (RuntimeException e) { + Throwable t = e; + while (t.getCause() != null) { + t = t.getCause(); + } + assertTrue(t.getMessage(),t.getMessage().contains(errorMessage)); + return; + } + fail("No error detected"); + } + public void testBadLike() { + badCombo("en-us-x-banana",AnalyzerOptions.LIKE,"en-us-x-banana"); + } + public void testMissingClass() { + badCombo("exactly one",AnalyzerOptions.STOPWORDS,AnalyzerOptions.STOPWORDS_VALUE_DEFAULT); + + } + public void testLikeAndClass() { + badCombo("exactly one",AnalyzerOptions.LIKE,"*", AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName()); + } + public void testLikeAndStopwords() { + badCombo("stopwords",AnalyzerOptions.LIKE,"*", AnalyzerOptions.STOPWORDS,AnalyzerOptions.STOPWORDS_VALUE_DEFAULT); + } + public void testCantAlwaysHaveStopWords() { + badCombo("not supported", + AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(), + AnalyzerOptions.STOPWORDS,StandardAnalyzer.class.getName() + ); + + } + public void testCantAlwaysHaveDefaultStopWords() { + badCombo("not supported", + AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(), + AnalyzerOptions.STOPWORDS,AnalyzerOptions.STOPWORDS_VALUE_DEFAULT + ); + + } + public void testCantFindRussianStopWords() { + badCombo("find", + AnalyzerOptions.ANALYZER_CLASS, GermanAnalyzer.class.getName(), + AnalyzerOptions.STOPWORDS,RussianAnalyzer.class.getName() + ); + + } + + + public void testEmptyAnalyzer() throws IOException { + comparisonTest("en", + false, + "The fast car arrived slowly.", + "" + ); + + } + public void testSyapseExample1() throws IOException { + comparisonTest("x-splits", + true, + "ADENOCARCINOMA OF LUNG, SOMATIC [ERBB2, INS/DUP, NT2322]", + "ADENOCARCINOMA OF LUNG, SOMATIC [ERBB2, ERBB2, INS/DUP, DUP, NT2322]" + ); + + } + public void testSyapseExample2() throws IOException { + comparisonTest("x-splits", + true, + "\u2265\u2265\u22653-11.13-11.1", + "\u2265\u2265\u22653-11.13-11.1 3-11.13-11.1 11.13-11.1 13-11.1 11.1 1" + ); + + } + public void testSyapseExample4() throws IOException { + comparisonTest("x-splits", + true, + "\u00b1-ACE3.1.1", + "\u00b1-ACE3.1.1 ACE3.1.1 1.1 1" + ); + + } + public void testSyapseExample3() throws IOException { + comparisonTest("x-splits", + true, + "2,2,3-trimethylbutane", + "2,2,3-trimethylbutane 2,3-trimethylbutane 3-trimethylbutane trimethylbutane" + ); + + } + public void testSyapseExample5() throws IOException { + comparisonTest("x-splits", + true, + "CD8_alpha-low Langerhans cell", + "CD8_alpha-low alpha-low low Langerhans cell" + ); + + } + public void testSyapseExample6() throws IOException { + comparisonTest("x-splits", + true, + "6-Monoacetylmorphine:Mass Content:Point in time:Meconium:Quantitative", + "6-Monoacetylmorphine:Mass Monoacetylmorphine:Mass Mass Content:Point Point in time:Meconium:Quantitative Meconium:Quantitative Quantitative" + ); + + } + public void testSyapseExample7() throws IOException { + comparisonTest("x-splits", + true, + "N,N-dimethyl", + "N,N-dimethyl N-dimethyl dimethyl" + ); + + } + +} Modified: 
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java 2014-05-09 17:07:05 UTC (rev 8247) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java 2014-05-09 17:42:44 UTC (rev 8248) @@ -40,4 +40,9 @@ return new String[]{FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName()}; } + @Override + boolean isBroken() { + return false; + } + } Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java 2014-05-09 17:07:05 UTC (rev 8247) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java 2014-05-09 17:42:44 UTC (rev 8248) @@ -40,4 +40,27 @@ return new String[0]; } + /** + * The DefaultAnalyzerFactory has bizarre behavior concerning + * language specific settings. + * The three letter ISO 639-1 language tags for the languages + * for which Lucene has Analyzers use those Analyzers; whereas the two digit ISO + * language tags, which are the ones recommended by the IETF and the W3C, + * all use the StandardAnalyzer (English). Also a language tag with a subtag + * uses the StandardAnalyzer, even if it is a recognized three letter ISO code. + */ + @Override + boolean isBroken() { + return true; + } + + /** + * Given legacy concerns, we should preserve the incorrect behavior! + */ + public void testIsBroken() { + checkConfig(false, "StandardAnalyzer", + "en", "eng", "", null, "ru", + "pt", "zh", "por-br", "cs", "dut-za", "nl", "de", "gre-at", "el", "th"); + } + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
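A note on the TermCompletionAnalyzer introduced in r8248: for each whitespace-delimited word it emits the whole word, then the suffix that starts after every successive match of the "soft" sub-word boundary. The standalone sketch below is an illustration only, not part of the commit; it reproduces that suffix generation with plain java.util.regex, reusing the two patterns hard-coded in this revision, and its expected output corresponds to testSyapseExample3.

    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class TermCompletionSketch {

        // The same "hard" (word) and "soft" (sub-word) boundaries as in r8248.
        private static final Pattern WORD_BOUNDARY =
                Pattern.compile(" ", Pattern.UNICODE_CHARACTER_CLASS);
        private static final Pattern SUB_WORD_BOUNDARY = Pattern.compile(
                "(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})",
                Pattern.UNICODE_CHARACTER_CLASS);

        // Emit each word, then the suffix remaining after each sub-word boundary.
        static List<String> completionTerms(String text) {
            List<String> terms = new ArrayList<String>();
            for (String word : WORD_BOUNDARY.split(text)) {
                if (word.length() == 0) continue;
                terms.add(word);
                Matcher m = SUB_WORD_BOUNDARY.matcher(word);
                int pos = 0;
                while (pos + 1 < word.length() && m.find(pos + 1)) {
                    pos = m.end();
                    terms.add(word.substring(pos));
                }
            }
            return terms;
        }

        public static void main(String[] args) {
            // Prints: [2,2,3-trimethylbutane, 2,3-trimethylbutane, 3-trimethylbutane, trimethylbutane]
            // matching the expectation in testSyapseExample3.
            System.out.println(completionTerms("2,2,3-trimethylbutane"));
        }
    }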
From: <jer...@us...> - 2014-05-09 17:43:00
Revision: 8249 http://sourceforge.net/p/bigdata/code/8249 Author: jeremy_carroll Date: 2014-05-09 17:42:56 +0000 (Fri, 09 May 2014) Log Message: ----------- copyright and tidying up Modified Paths: -------------- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/NonEnglishExamples.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:42:44 UTC (rev 8248) +++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:42:56 UTC (rev 8249) @@ -1,3 +1,29 @@ +/** + +Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved. + +Contact: + SYSTAP, LLC + 4501 Tower Road + Greensboro, NC 27410 + lic...@bi... + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +/* + * Created on May 8, 2014 by Jeremy J. Carroll, Syapse Inc. 
+ */ package com.bigdata.search; import java.io.IOException; @@ -3,5 +29,4 @@ import java.io.Reader; import java.io.StringReader; -import java.nio.CharBuffer; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -10,7 +35,6 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.apache.lucene.util.Attribute; public class TermCompletionAnalyzer extends Analyzer { @@ -19,7 +43,6 @@ Pattern soft = Pattern.compile("(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})", Pattern.UNICODE_CHARACTER_CLASS); public TermCompletionAnalyzer() { - // TODO Auto-generated constructor stub } private class TermCompletionTokenStream extends TokenStream { Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:42:44 UTC (rev 8248) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:42:56 UTC (rev 8249) @@ -122,7 +122,6 @@ protected String getTokenStream(Analyzer a, String text) throws IOException { StringBuffer sb = new StringBuffer(); TokenStream s = a.tokenStream(null, new StringReader(text)); - int ix = 0; while (s.incrementToken()) { final TermAttribute term = s.getAttribute(TermAttribute.class); if (sb.length()!=0) { Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/NonEnglishExamples.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/NonEnglishExamples.java 2014-05-09 17:42:44 UTC (rev 8248) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/NonEnglishExamples.java 2014-05-09 17:42:56 UTC (rev 8249) @@ -1,3 +1,29 @@ +/** + +Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved. + +Contact: + SYSTAP, LLC + 4501 Tower Road + Greensboro, NC 27410 + lic...@bi... + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +/* + * Created on May 7, 2014 by Jeremy J. Carroll, Syapse Inc. 
+ */ package com.bigdata.search; import java.util.MissingResourceException; Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:42:44 UTC (rev 8248) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:42:56 UTC (rev 8249) @@ -28,13 +28,10 @@ import java.io.IOException; -import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.KeywordAnalyzer; -import org.apache.lucene.analysis.cjk.CJKAnalyzer; import org.apache.lucene.analysis.de.GermanAnalyzer; import org.apache.lucene.analysis.ru.RussianAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.util.Version; import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <jer...@us...> - 2014-05-09 17:43:20
Revision: 8251 http://sourceforge.net/p/bigdata/code/8251 Author: jeremy_carroll Date: 2014-05-09 17:43:16 +0000 (Fri, 09 May 2014) Log Message: ----------- Got tests working again, and cleaned up somewhat Modified Paths: -------------- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 17:43:05 UTC (rev 8250) +++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 17:43:16 UTC (rev 8251) @@ -326,12 +326,48 @@ String STOPWORDS_VALUE_NONE = "none"; /** - * If this property is present then the analyzer being used is a - * {@link PatternAnalyzer} and the value is the pattern to use. + * The value of the pattern parameter to + * {@link PatternAnalyzer#PatternAnalyzer(Version, Pattern, boolean, Set)} * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled). * It is an error if a different analyzer class is specified. */ String PATTERN = "pattern"; + /** + * The value of the wordBoundary parameter to + * {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)} + * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled). + * It is an error if a different analyzer class is specified. + */ + String WORD_BOUNDARY = "wordBoundary"; + /** + * The value of the subWordBoundary parameter to + * {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)} + * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled). + * It is an error if a different analyzer class is specified. + */ + String SUB_WORD_BOUNDARY = "subWordBoundary"; + /** + * The value of the softHyphens parameter to + * {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)} + * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled). + * It is an error if a different analyzer class is specified. + */ + String SOFT_HYPHENS = "softHypens"; + /** + * The value of the alwaysRemoveSoftHypens parameter to + * {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)} + * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled). + * It is an error if a different analyzer class is specified. + */ + String ALWAYS_REMOVE_SOFT_HYPHENS = "alwaysRemoveSoftHypens"; + + boolean DEFAULT_ALWAYS_REMOVE_SOFT_HYPHENS = false; + + /** + * The default sub-word boundary is a pattern that never matches, + * i.e. there are no sub-word boundaries. + */ + Pattern DEFAULT_SUB_WORD_BOUNDARY = Pattern.compile("(?!)"); } @@ -382,16 +418,7 @@ this.withoutStopWords = copyMe.withoutStopWords; } - - public Analyzer getAnalyzer(boolean filterStopwords) { - return filterStopwords ? withStopWords : withoutStopWords; - } - @Override - public String toString() { - return range.full + "=(" + withStopWords.getClass().getSimpleName() +")"; - } - AnalyzerPair(String range, Constructor<? extends Analyzer> cons, Object ... 
params) throws Exception { this(range, cons.newInstance(params), cons.newInstance(useEmptyStopWordSet(params))); } @@ -409,7 +436,16 @@ } return rslt; } + + public Analyzer getAnalyzer(boolean filterStopwords) { + return filterStopwords ? withStopWords : withoutStopWords; + } @Override + public String toString() { + return range.full + "=(" + withStopWords.getClass().getSimpleName() +")"; + } + + @Override public int compareTo(AnalyzerPair o) { return range.compareTo(o.range); } @@ -437,10 +473,10 @@ private static class PatternAnalyzerPair extends AnalyzerPair { - public PatternAnalyzerPair(ConfigOptionsToAnalyzer lro, String pattern) throws Exception { + public PatternAnalyzerPair(ConfigOptionsToAnalyzer lro, Pattern pattern) throws Exception { super(lro.languageRange, getConstructor(PatternAnalyzer.class,Version.class,Pattern.class,Boolean.TYPE,Set.class), Version.LUCENE_CURRENT, - Pattern.compile(pattern, Pattern.UNICODE_CHARACTER_CLASS), + pattern, true, lro.getStopWords()); } @@ -459,9 +495,13 @@ String like; String className; String stopwords; - String pattern; + Pattern pattern; final String languageRange; AnalyzerPair result; + Pattern wordBoundary; + Pattern subWordBoundary; + Pattern softHyphens; + Boolean alwaysRemoveSoftHyphens; public ConfigOptionsToAnalyzer(String languageRange) { this.languageRange = languageRange; @@ -515,7 +555,15 @@ } else if (shortProperty.equals(AnalyzerOptions.STOPWORDS) ) { stopwords = value; } else if (shortProperty.equals(AnalyzerOptions.PATTERN) ) { - pattern = value; + pattern = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS); + } else if (shortProperty.equals(AnalyzerOptions.WORD_BOUNDARY) ) { + wordBoundary = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS); + } else if (shortProperty.equals(AnalyzerOptions.SUB_WORD_BOUNDARY) ) { + subWordBoundary = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS); + } else if (shortProperty.equals(AnalyzerOptions.SOFT_HYPHENS) ) { + softHyphens = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS); + } else if (shortProperty.equals(AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS) ) { + alwaysRemoveSoftHyphens = Boolean.valueOf(value); } else { return false; } @@ -529,6 +577,27 @@ } className = PatternAnalyzer.class.getName(); } + if (this.wordBoundary != null ) { + if ( className != null && className != TermCompletionAnalyzer.class.getName()) { + throw new RuntimeException("Bad Option: Language range "+languageRange + " with pattern propety for class "+ className); + } + className = TermCompletionAnalyzer.class.getName(); + + if ( subWordBoundary == null ) { + subWordBoundary = AnalyzerOptions.DEFAULT_SUB_WORD_BOUNDARY; + } + if ( alwaysRemoveSoftHyphens != null && softHyphens == null ) { + throw new RuntimeException("Bad option: Language range "+languageRange + ": must specify softHypens when setting alwaysRemoveSoftHyphens"); + } + if (softHyphens != null && alwaysRemoveSoftHyphens == null) { + alwaysRemoveSoftHyphens = AnalyzerOptions.DEFAULT_ALWAYS_REMOVE_SOFT_HYPHENS; + } + + } else if ( subWordBoundary != null || softHyphens != null || alwaysRemoveSoftHyphens != null || + TermCompletionAnalyzer.class.getName().equals(className) ) { + throw new RuntimeException("Bad option: Language range "+languageRange + ": must specify wordBoundary for TermCompletionAnalyzer"); + } + if (PatternAnalyzer.class.getName().equals(className) && pattern == null ) { throw new RuntimeException("Bad Option: Language range "+languageRange + " must specify pattern for PatternAnalyzer."); } @@ -547,8 +616,23 @@ } 
if (pattern != null) { return new PatternAnalyzerPair(this, pattern); - - } + } + if (softHyphens != null) { + return new AnalyzerPair( + languageRange, + new TermCompletionAnalyzer( + wordBoundary, + subWordBoundary, + softHyphens, + alwaysRemoveSoftHyphens)); + } + if (wordBoundary != null) { + return new AnalyzerPair( + languageRange, + new TermCompletionAnalyzer( + wordBoundary, + subWordBoundary)); + } final Class<? extends Analyzer> cls = getAnalyzerClass(); if (hasConstructor(cls, Version.class, Set.class)) { Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:43:05 UTC (rev 8250) +++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:43:16 UTC (rev 8251) @@ -81,8 +81,8 @@ */ public class TermCompletionAnalyzer extends Analyzer { - private final Pattern wordBoundary; // = Pattern.compile(" ", Pattern.UNICODE_CHARACTER_CLASS); - private final Pattern subWordBoundary; // = Pattern.compile("(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})", Pattern.UNICODE_CHARACTER_CLASS); + private final Pattern wordBoundary; + private final Pattern subWordBoundary; private final Pattern discard; private final boolean alwaysDiscard; @@ -90,24 +90,25 @@ /** * Divide the input into words and short tokens * as with {@link #TermCompletionAnalyzer(Pattern, Pattern)}. - * If alsoWithSoftHypens is true then output each token, - * and in any case output each token with every - * match to softHyphenEtc deleted. + * Each term is generated, and then an additional term + * is generated with softHypens (defined by the pattern), + * removed. If the alwaysRemoveSoftHypens flag is true, + * then the first term (before the removal) is suppressed. * * @param wordBoundary The definition of space (e.g. " ") * @param subWordBoundary Also index after matches to this (e.g. "-") - * @param softHyphenEtc Discard these characters from matches - * @param alsoWithSoftHyphens If true the discard step is optional. + * @param softHyphens Discard these characters from matches + * @param alwaysRemoveSoftHypens If false the discard step is optional. */ public TermCompletionAnalyzer(Pattern wordBoundary, Pattern subWordBoundary, - Pattern softHyphenEtc, - boolean alsoWithSoftHyphens) { + Pattern softHyphens, + boolean alwaysRemoveSoftHypens) { this.wordBoundary = wordBoundary; this.subWordBoundary = subWordBoundary; - if (softHyphenEtc != null) { - discard = softHyphenEtc; - alwaysDiscard = !alsoWithSoftHyphens; + if (softHyphens != null) { + discard = softHyphens; + alwaysDiscard = alwaysRemoveSoftHypens; } else { discard = Pattern.compile("(?!)"); // never matches alwaysDiscard = true; @@ -115,9 +116,10 @@ } /** * Divide the input into words, separated by the wordBoundary, - * and return a token for the whole word, and then for the - * remainder of the word after each successive match of the - * subWordBoundary. 
+ * and return a token for each whole word, and then + * generate further tokens for each word by removing prefixes + * up to and including each successive match of + * subWordBoundary * @param wordBoundary * @param subWordBoundary */ @@ -189,8 +191,9 @@ afterDiscard.getChars(0, lg, termAtt.termBuffer(), 0); termAtt.setTermLength(lg); } else { - found.get(termAtt.termBuffer()); - termAtt.setTermLength(found.length()); + int lg = found.length(); + found.get(termAtt.termBuffer(), 0, lg); + termAtt.setTermLength(lg); } return true; } else { @@ -211,7 +214,7 @@ } } afterDiscard = null; - if (charPos +1 < currentWord.length && softMatcher.find(charPos+1)) { + if (charPos + 1 < currentWord.length && softMatcher.find(charPos+1)) { charPos = softMatcher.end(); found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos); Matcher discarding = discard.matcher(found); @@ -232,6 +235,7 @@ currentWord = words[currentWordIx].toCharArray(); termAtt.resizeTermBuffer(currentWord.length); charPos = 0; + found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos); softMatcher = subWordBoundary.matcher(words[currentWordIx]); return true; } Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:43:05 UTC (rev 8250) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:43:16 UTC (rev 8251) @@ -139,7 +139,7 @@ final TermAttribute term = s.getAttribute(TermAttribute.class); final String word = term.term(); assertTrue(ix < expected.length); - assertEquals(word, expected[ix++]); + assertEquals(expected[ix++], word); } assertEquals(ix, expected.length); } Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:43:05 UTC (rev 8250) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:43:16 UTC (rev 8251) @@ -57,6 +57,8 @@ analyzer+"x-terms."+AnalyzerOptions.PATTERN, "\\W+", analyzer+"x-splits."+AnalyzerOptions.ANALYZER_CLASS, TermCompletionAnalyzer.class.getName(), analyzer+"x-splits."+AnalyzerOptions.STOPWORDS, AnalyzerOptions.STOPWORDS_VALUE_NONE, + analyzer+"x-splits."+AnalyzerOptions.WORD_BOUNDARY, " ", + analyzer+"x-splits."+AnalyzerOptions.SUB_WORD_BOUNDARY, "(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})", analyzer+"x-keywords."+AnalyzerOptions.ANALYZER_CLASS, KeywordAnalyzer.class.getName(), analyzer+"ru-x-de."+AnalyzerOptions.ANALYZER_CLASS, RussianAnalyzer.class.getName(), analyzer+"ru-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(), This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
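To make the new options in r8251 concrete: a language range is handled by TermCompletionAnalyzer as soon as it is given a wordBoundary (an explicit analyzerClass is then optional), subWordBoundary falls back to the never-matching default "(?!)", and the soft-hyphen flag may only be set together with the soft-hyphen pattern, defaulting to false when the pattern is given alone. The fragment below is only a configuration sketch, not part of the commit; it mirrors the x-splits entries added to the test here, uses the property prefix as it appears in the factory's built-in defaults, and in the tests such pairs are passed together with FullTextIndex.Options.ANALYZER_FACTORY_CLASS.

    import java.util.Properties;

    public class TermCompletionConfigSketch {
        public static void main(String[] args) {
            // Prefix as it appears in the factory's built-in property defaults.
            final String analyzer =
                    "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.";

            Properties p = new Properties();
            // wordBoundary alone selects TermCompletionAnalyzer for the range "x-splits".
            p.setProperty(analyzer + "x-splits.wordBoundary", " ");
            // Split again inside a word at letter/digit transitions and case changes.
            p.setProperty(analyzer + "x-splits.subWordBoundary",
                    "(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})");
            p.list(System.out);
        }
    }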
From: <jer...@us...> - 2014-05-09 18:10:17
Revision: 8254 http://sourceforge.net/p/bigdata/code/8254 Author: jeremy_carroll Date: 2014-05-09 18:10:14 +0000 (Fri, 09 May 2014) Log Message: ----------- Added test for term completion, with bug fix! Modified Paths: -------------- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:44:11 UTC (rev 8253) +++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 18:10:14 UTC (rev 8254) @@ -216,16 +216,20 @@ afterDiscard = null; if (charPos + 1 < currentWord.length && softMatcher.find(charPos+1)) { charPos = softMatcher.end(); - found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos); - Matcher discarding = discard.matcher(found); - if (discarding.find()) { - afterDiscard = discarding.replaceAll(""); - } + considerMatch(); return true; } else { return nextWord(); } } + + void considerMatch() { + found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos); + Matcher discarding = discard.matcher(found); + if (discarding.find()) { + afterDiscard = discarding.replaceAll(""); + } + } private boolean nextWord() { currentWordIx++; @@ -235,8 +239,8 @@ currentWord = words[currentWordIx].toCharArray(); termAtt.resizeTermBuffer(currentWord.length); charPos = 0; - found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos); softMatcher = subWordBoundary.matcher(words[currentWordIx]); + considerMatch(); return true; } Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:44:11 UTC (rev 8253) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 18:10:14 UTC (rev 8254) @@ -59,6 +59,14 @@ analyzer+"x-splits."+AnalyzerOptions.STOPWORDS, AnalyzerOptions.STOPWORDS_VALUE_NONE, analyzer+"x-splits."+AnalyzerOptions.WORD_BOUNDARY, " ", analyzer+"x-splits."+AnalyzerOptions.SUB_WORD_BOUNDARY, "(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})", + analyzer+"x-hyphen."+AnalyzerOptions.SUB_WORD_BOUNDARY, "[-.]", + analyzer+"x-hyphen."+AnalyzerOptions.SOFT_HYPHENS, "-", + analyzer+"x-hyphen."+AnalyzerOptions.WORD_BOUNDARY, " ", + analyzer+"x-hyphen."+AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS, "false", + analyzer+"x-hyphen2."+AnalyzerOptions.SUB_WORD_BOUNDARY, "[-.]", + analyzer+"x-hyphen2."+AnalyzerOptions.SOFT_HYPHENS, "-", + analyzer+"x-hyphen2."+AnalyzerOptions.WORD_BOUNDARY, " ", + analyzer+"x-hyphen2."+AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS, "true", analyzer+"x-keywords."+AnalyzerOptions.ANALYZER_CLASS, KeywordAnalyzer.class.getName(), analyzer+"ru-x-de."+AnalyzerOptions.ANALYZER_CLASS, RussianAnalyzer.class.getName(), analyzer+"ru-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(), @@ -190,5 +198,21 @@ ); } + public void testSyapseExample8() throws IOException { + comparisonTest("x-hyphen", + true, + "\u00b1-ACE3.1.1 ab-bc.cd-de", + "\u00b1ACE3.1.1 \u00b1-ACE3.1.1 ACE3.1.1 1.1 1 
abbc.cdde ab-bc.cd-de bc.cdde bc.cd-de cdde cd-de de" + ); + + } + public void testSyapseExample9() throws IOException { + comparisonTest("x-hyphen2", + true, + "\u00b1-ACE3.1.1 ab-bc.cd-de", + "\u00b1ACE3.1.1 ACE3.1.1 1.1 1 abbc.cdde bc.cdde cdde de" + ); + + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
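The x-hyphen ranges added in r8254 exercise the four-argument constructor directly. Below is a minimal usage sketch, not part of the commit, built on the same Lucene 3.x TokenStream/TermAttribute API the test helpers use: with the always-remove flag false, each sub-term containing a hyphen is emitted first with the soft hyphens stripped and then verbatim, which is what testSyapseExample8 asserts.

    import java.io.StringReader;
    import java.util.regex.Pattern;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    import com.bigdata.search.TermCompletionAnalyzer;

    public class SoftHyphenSketch {
        public static void main(String[] args) throws Exception {
            final int flags = Pattern.UNICODE_CHARACTER_CLASS;
            // Same settings as the "x-hyphen" language range in the test properties.
            TermCompletionAnalyzer a = new TermCompletionAnalyzer(
                    Pattern.compile(" ", flags),    // wordBoundary
                    Pattern.compile("[-.]", flags), // subWordBoundary
                    Pattern.compile("-", flags),    // softHyphens
                    false);                         // keep the unstripped form as well

            TokenStream ts = a.tokenStream(null, new StringReader("ab-bc.cd-de"));
            while (ts.incrementToken()) {
                System.out.println(ts.getAttribute(TermAttribute.class).term());
            }
            // Expected, as in the tail of testSyapseExample8's expectation:
            // abbc.cdde  ab-bc.cd-de  bc.cdde  bc.cd-de  cdde  cd-de  de
        }
    }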
From: <jer...@us...> - 2014-05-09 19:07:05
Revision: 8255 http://sourceforge.net/p/bigdata/code/8255 Author: jeremy_carroll Date: 2014-05-09 19:07:02 +0000 (Fri, 09 May 2014) Log Message: ----------- minor polishing, a few more tests Modified Paths: -------------- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 18:10:14 UTC (rev 8254) +++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 19:07:02 UTC (rev 8255) @@ -144,11 +144,10 @@ * * - the subword boundaries are identified in {@link #next()} * We then set up {@link #found} to contain the most - * recently found subword, with afterDiscard containing - * the same word as found with the {@link #discard} pattern - * applied. {@link #afterDiscard} is not equal to found; if there - * is nothing to discard then it is null. + * recently found subword. * + * - the soft hyphen discarding is processed in {@link #maybeDiscardHyphens()} + * * - if we are not {@link #alwaysDiscard}ing then {@link #afterDiscard} * can be set to null to return the non-discarded version on the next cycle. * @@ -216,14 +215,14 @@ afterDiscard = null; if (charPos + 1 < currentWord.length && softMatcher.find(charPos+1)) { charPos = softMatcher.end(); - considerMatch(); + maybeDiscardHyphens(); return true; } else { return nextWord(); } } - void considerMatch() { + void maybeDiscardHyphens() { found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos); Matcher discarding = discard.matcher(found); if (discarding.find()) { @@ -240,7 +239,7 @@ termAtt.resizeTermBuffer(currentWord.length); charPos = 0; softMatcher = subWordBoundary.matcher(words[currentWordIx]); - considerMatch(); + maybeDiscardHyphens(); return true; } Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 18:10:14 UTC (rev 8254) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 19:07:02 UTC (rev 8255) @@ -102,13 +102,13 @@ return getNdx().getAnalyzer(lang, filterStopWords); } - protected void comparisonTest(String lang, boolean stopWordsSignificant, String text, String spaceSeparated) + protected void comparisonTest(String lang, boolean filterStopWords, String text, String spaceSeparated) throws IOException { if (spaceSeparated == null) { - String rslt = getTokenStream(getAnalyzer(lang, stopWordsSignificant), text); + String rslt = getTokenStream(getAnalyzer(lang, filterStopWords), text); throw new RuntimeException("Got \"" + rslt+ "\""); } - compareTokenStream(getAnalyzer(lang, stopWordsSignificant), text, + compareTokenStream(getAnalyzer(lang, filterStopWords), text, split(spaceSeparated)); //$NON-NLS-1$ } Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java =================================================================== --- 
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 18:10:14 UTC (rev 8254) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 19:07:02 UTC (rev 8255) @@ -35,6 +35,16 @@ import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions; +/** + * Unit tests for {@link ConfigurableAnalyzerFactory}. + * We use the same setup, as defined in {@link #getExtraProperties()} + * for all the tests. Some of the tests check whether bad combinations + * of options are detected and reported correctly. + * Others check that some input, in a particular language is + * tokenized as expected. + * @author jeremycarroll + * + */ public class TestConfigurableAnalyzerFactory extends AbstractSearchTest { public TestConfigurableAnalyzerFactory() { @@ -68,8 +78,8 @@ analyzer+"x-hyphen2."+AnalyzerOptions.WORD_BOUNDARY, " ", analyzer+"x-hyphen2."+AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS, "true", analyzer+"x-keywords."+AnalyzerOptions.ANALYZER_CLASS, KeywordAnalyzer.class.getName(), - analyzer+"ru-x-de."+AnalyzerOptions.ANALYZER_CLASS, RussianAnalyzer.class.getName(), - analyzer+"ru-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(), + analyzer+"en-x-de."+AnalyzerOptions.ANALYZER_CLASS, StandardAnalyzer.class.getName(), + analyzer+"en-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(), }; } @@ -142,6 +152,25 @@ ); } + + public void testStopWordSwitch() throws IOException { + // en-x-de is an English Analyzer using german stopwords! + comparisonTest("en-x-de", + true, + "The fast car arrived slowly.", + "the fast car arrived slowly" + ); + comparisonTest("en-x-de", + true, + "The fast car die arrived slowly.", + "the fast car arrived slowly" + ); + comparisonTest("en-x-de", + false, + "The fast car die arrived slowly.", + "the fast car die arrived slowly" + ); + } public void testSyapseExample1() throws IOException { comparisonTest("x-splits", true, This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
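The en-x-de range added to the test setup in r8255 pairs an English StandardAnalyzer with the German stop-word list, which is why the German article "die" is filtered while English "the" survives in testStopWordSwitch. The sketch below is an approximation using stock Lucene 3.x classes, not the factory's actual reflection-based construction, but it shows the equivalent combination.

    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.de.GermanAnalyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    public class StopWordSwitchSketch {
        public static void main(String[] args) throws Exception {
            // An English (Standard) analyzer wired up with GermanAnalyzer's stop words.
            Analyzer a = new StandardAnalyzer(Version.LUCENE_CURRENT,
                    GermanAnalyzer.getDefaultStopSet());

            TokenStream ts = a.tokenStream(null,
                    new StringReader("The fast car die arrived slowly."));
            StringBuilder sb = new StringBuilder();
            while (ts.incrementToken()) {
                sb.append(ts.getAttribute(TermAttribute.class).term()).append(' ');
            }
            // "die" (a German stop word) is dropped, "the" is kept:
            // the fast car arrived slowly
            System.out.println(sb.toString().trim());
        }
    }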
From: <jer...@us...> - 2014-05-09 22:39:13
Revision: 8257 http://sourceforge.net/p/bigdata/code/8257 Author: jeremy_carroll Date: 2014-05-09 22:39:10 +0000 (Fri, 09 May 2014) Log Message: ----------- Added extra test to check that by default we use StandardAnalyzer for everything; refactored a bit Modified Paths: -------------- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java Added Paths: ----------- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractDefaultAnalyzerFactoryTest.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 19:07:09 UTC (rev 8256) +++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257) @@ -366,7 +366,7 @@ } - private static final String DEFAULT_PROPERTIES = + private static final String ALL_LUCENE_NATURAL_LANGUAGES = "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.*.like=eng\n" + "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.por.analyzerClass=org.apache.lucene.analysis.br.BrazilianAnalyzer\n" + "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.pt.like=por\n" + @@ -396,6 +396,9 @@ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.eng.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer\n" + "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.en.like=eng\n"; + private static final String LUCENE_STANDARD_ANALYZER = + "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.*.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer\n"; + private static class AnalyzerPair implements Comparable<AnalyzerPair>{ private final LanguageRange range; private final Analyzer withStopWords; @@ -703,6 +706,7 @@ * strategy so the code will still work on the {@link #MAX_LANG_CACHE_SIZE}+1 th entry. 
*/ private static final int MAX_LANG_CACHE_SIZE = 500; + private String defaultLanguage; private final FullTextIndex<?> fullTextIndex; @@ -833,25 +837,20 @@ protected Properties initProperties() { final Properties parentProperties = fullTextIndex.getProperties(); Properties myProps; - if (Boolean.getBoolean(parentProperties.getProperty(Options.NATURAL_LANGUAGE_SUPPORT, Options.DEFAULT_NATURAL_LAMGUAGE_SUPPORT))) { - myProps = defaultProperties(); + if (Boolean.valueOf(parentProperties.getProperty(Options.NATURAL_LANGUAGE_SUPPORT, Options.DEFAULT_NATURAL_LAMGUAGE_SUPPORT))) { + myProps = loadPropertyString(ALL_LUCENE_NATURAL_LANGUAGES); } else { - myProps = new Properties(); + myProps = loadPropertyString(LUCENE_STANDARD_ANALYZER); } copyRelevantProperties(fullTextIndex.getProperties(), myProps); - - if (myProps.isEmpty()) { - return defaultProperties(); - } else { - return myProps; - } + return myProps; } - protected Properties defaultProperties() { + Properties loadPropertyString(String props) { Properties rslt = new Properties(); try { - rslt.load(new StringReader(DEFAULT_PROPERTIES)); + rslt.load(new StringReader(props)); } catch (IOException e) { throw new RuntimeException("Impossible - well clearly not!", e); } Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 19:07:09 UTC (rev 8256) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 22:39:10 UTC (rev 8257) @@ -1,153 +1,20 @@ -/** - -Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved. - -Contact: - SYSTAP, LLC - 4501 Tower Road - Greensboro, NC 27410 - lic...@bi... - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ -/* - * Created on May 7, 2014 - */ package com.bigdata.search; -import java.io.IOException; - - public abstract class AbstractAnalyzerFactoryTest extends AbstractSearchTest { - public AbstractAnalyzerFactoryTest() { + public AbstractAnalyzerFactoryTest() { } - - public AbstractAnalyzerFactoryTest(String arg0) { - super(arg0); + + public AbstractAnalyzerFactoryTest(String arg0) { + super(arg0); } - - @Override - public void setUp() throws Exception { - super.setUp(); - init(getExtraProperties()); - } - - - abstract String[] getExtraProperties(); - - public void testEnglishFilterStopWords() throws IOException { - for (String lang: new String[]{ "eng", null, "" }) { //$NON-NLS-1$ //$NON-NLS-2$ - comparisonTest(lang, - true, - "The test to end all tests! Forever.", //$NON-NLS-1$ - "test end all tests forever" //$NON-NLS-1$ - ); - } - } - public void testEnglishNoFilter() throws IOException { - for (String lang: new String[]{ "eng", null, "" }) { //$NON-NLS-1$ //$NON-NLS-2$ - comparisonTest(lang, - false, - "The test to end all tests! 
Forever.", //$NON-NLS-1$ - "the test to end all tests forever" //$NON-NLS-1$ - ); - } - } - - // Note we careful use a three letter language code for german. - // 'de' is more standard, but the DefaultAnalyzerFactory does not - // implement 'de' correctly. - public void testGermanFilterStopWords() throws IOException { - comparisonTest("ger", //$NON-NLS-1$ - true, - NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.10") + //$NON-NLS-1$ - NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.11"), //$NON-NLS-1$ - NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.12") //$NON-NLS-1$ - ); - - } - // Note we careful use a three letter language code for Russian. - // 'ru' is more standard, but the DefaultAnalyzerFactory does not - // implement 'ru' correctly. - public void testRussianFilterStopWords() throws IOException { - comparisonTest("rus", //$NON-NLS-1$ - true, - // I hope this is not offensive text. - NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.14") + //$NON-NLS-1$ - NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.15"), //$NON-NLS-1$ - NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.16") //$NON-NLS-1$ - ); - - } - public void testGermanNoStopWords() throws IOException { - comparisonTest("ger", //$NON-NLS-1$ - false, - NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.18") + //$NON-NLS-1$ - NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.19"), //$NON-NLS-1$ - NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.20") //$NON-NLS-1$ - ); - - } - public void testRussianNoStopWords() throws IOException { - comparisonTest("rus", //$NON-NLS-1$ - false, - NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.22") + //$NON-NLS-1$ - NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.23"), //$NON-NLS-1$ - NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.24") //$NON-NLS-1$ - ); - - } - public void testJapanese() throws IOException { - for (boolean filterStopWords: new Boolean[]{true, false}) { - comparisonTest("jpn", //$NON-NLS-1$ - filterStopWords, - NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.26"), //$NON-NLS-1$ - NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.27") + //$NON-NLS-1$ - NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.28") + //$NON-NLS-1$ - NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.29")); //$NON-NLS-1$ - } - } - public void testConfiguredLanguages() { - checkConfig("BrazilianAnalyzer", "por", "pt"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ - checkConfig("ChineseAnalyzer", "zho", "chi", "zh"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ - checkConfig("CJKAnalyzer", "jpn", "ja", "kor", "ko"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ - checkConfig("CzechAnalyzer", "ces", "cze", "cs"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ - checkConfig("DutchAnalyzer", "dut", "nld", "nl"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ - checkConfig("GermanAnalyzer", "deu", "ger", "de"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ - checkConfig("GreekAnalyzer", "gre", "ell", "el"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ - checkConfig("RussianAnalyzer", "rus", "ru"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ - checkConfig("ThaiAnalyzer", "th", "tha"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ - checkConfig("StandardAnalyzer", "en", "eng", "", null); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ - } - - private void checkConfig(String classname, String 
...langs) { - checkConfig(isBroken(), classname, langs); - + @Override + public void setUp() throws Exception { + super.setUp(); + init(getExtraProperties()); } - protected void checkConfig(boolean threeLetterOnly, String classname, String ...langs) { - for (String lang:langs) { - // The DefaultAnalyzerFactory only works for language tags of length exactly three. - if ((!threeLetterOnly) || (lang != null && lang.length()==3)) - { - assertEquals(classname, getAnalyzer(lang,true).getClass().getSimpleName()); - if (!threeLetterOnly) assertEquals(classname, getAnalyzer(lang+"-x-foobar",true).getClass().getSimpleName()); //$NON-NLS-1$ - } - } - - } - abstract boolean isBroken() ; + abstract String[] getExtraProperties(); + } Copied: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractDefaultAnalyzerFactoryTest.java (from rev 8256, branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java) =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractDefaultAnalyzerFactoryTest.java (rev 0) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractDefaultAnalyzerFactoryTest.java 2014-05-09 22:39:10 UTC (rev 8257) @@ -0,0 +1,133 @@ +/** + +Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved. + +Contact: + SYSTAP, LLC + 4501 Tower Road + Greensboro, NC 27410 + lic...@bi... + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +/* + * Created on May 7, 2014 + */ +package com.bigdata.search; + +import java.io.IOException; + + +public abstract class AbstractDefaultAnalyzerFactoryTest extends AbstractAnalyzerFactoryTest { + + public AbstractDefaultAnalyzerFactoryTest() { + } + + public AbstractDefaultAnalyzerFactoryTest(String arg0) { + super(arg0); + } + + public void testEnglishFilterStopWords() throws IOException { + for (String lang: new String[]{ "eng", null, "" }) { //$NON-NLS-1$ //$NON-NLS-2$ + comparisonTest(lang, + true, + "The test to end all tests! Forever.", //$NON-NLS-1$ + "test end all tests forever" //$NON-NLS-1$ + ); + } + } + public void testEnglishNoFilter() throws IOException { + for (String lang: new String[]{ "eng", null, "" }) { //$NON-NLS-1$ //$NON-NLS-2$ + comparisonTest(lang, + false, + "The test to end all tests! Forever.", //$NON-NLS-1$ + "the test to end all tests forever" //$NON-NLS-1$ + ); + } + } + + // Note we careful use a three letter language code for german. + // 'de' is more standard, but the DefaultAnalyzerFactory does not + // implement 'de' correctly. 
+ public void testGermanFilterStopWords() throws IOException { + comparisonTest("ger", //$NON-NLS-1$ + true, + NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.10") + //$NON-NLS-1$ + NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.11"), //$NON-NLS-1$ + NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.12") //$NON-NLS-1$ + ); + + } + + // Note we careful use a three letter language code for Russian. + // 'ru' is more standard, but the DefaultAnalyzerFactory does not + // implement 'ru' correctly. + public void testRussianFilterStopWords() throws IOException { + comparisonTest("rus", //$NON-NLS-1$ + true, + // I hope this is not offensive text. + NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.14") + //$NON-NLS-1$ + NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.15"), //$NON-NLS-1$ + NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.16") //$NON-NLS-1$ + ); + + } + public void testGermanNoStopWords() throws IOException { + comparisonTest("ger", //$NON-NLS-1$ + false, + NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.18") + //$NON-NLS-1$ + NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.19"), //$NON-NLS-1$ + NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.20") //$NON-NLS-1$ + ); + + } + public void testRussianNoStopWords() throws IOException { + comparisonTest("rus", //$NON-NLS-1$ + false, + NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.22") + //$NON-NLS-1$ + NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.23"), //$NON-NLS-1$ + NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.24") //$NON-NLS-1$ + ); + + } + public void testJapanese() throws IOException { + for (boolean filterStopWords: new Boolean[]{true, false}) { + comparisonTest("jpn", //$NON-NLS-1$ + filterStopWords, + NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.26"), //$NON-NLS-1$ + NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.27") + //$NON-NLS-1$ + NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.28") + //$NON-NLS-1$ + NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.29")); //$NON-NLS-1$ + } + } + public void testConfiguredLanguages() { + checkConfig("BrazilianAnalyzer", "por", "pt"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ + checkConfig("ChineseAnalyzer", "zho", "chi", "zh"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ + checkConfig("CJKAnalyzer", "jpn", "ja", "kor", "ko"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ + checkConfig("CzechAnalyzer", "ces", "cze", "cs"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ + checkConfig("DutchAnalyzer", "dut", "nld", "nl"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ + checkConfig("GermanAnalyzer", "deu", "ger", "de"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ + checkConfig("GreekAnalyzer", "gre", "ell", "el"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ + checkConfig("RussianAnalyzer", "rus", "ru"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ + checkConfig("ThaiAnalyzer", "th", "tha"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ + checkConfig("StandardAnalyzer", "en", "eng", "", null); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ + } + + @Override + protected void checkConfig(String classname, String ...langs) { + checkConfig(isBroken(), classname, langs); + + } + abstract boolean isBroken() ; +} Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 
=================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 19:07:09 UTC (rev 8256) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 22:39:10 UTC (rev 8257) @@ -135,13 +135,28 @@ private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException { TokenStream s = a.tokenStream(null, new StringReader(text)); int ix = 0; - while (s.incrementToken()) { - final TermAttribute term = s.getAttribute(TermAttribute.class); - final String word = term.term(); - assertTrue(ix < expected.length); - assertEquals(expected[ix++], word); - } - assertEquals(ix, expected.length); + while (s.incrementToken()) { + final TermAttribute term = s.getAttribute(TermAttribute.class); + final String word = term.term(); + assertTrue(ix < expected.length); + assertEquals(expected[ix++], word); + } + assertEquals(ix, expected.length); } + protected void checkConfig(boolean threeLetterOnly, String classname, String ...langs) { + for (String lang:langs) { + // The DefaultAnalyzerFactory only works for language tags of length exactly three. + if ((!threeLetterOnly) || (lang != null && lang.length()==3)) { + assertEquals(classname, getAnalyzer(lang,true).getClass().getSimpleName()); + if (!threeLetterOnly) { + assertEquals(classname, getAnalyzer(lang+"-x-foobar",true).getClass().getSimpleName()); //$NON-NLS-1$ + } + } + } + } + protected void checkConfig(String classname, String ...langs) { + checkConfig(false, classname, langs); + } + } Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java 2014-05-09 19:07:09 UTC (rev 8256) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java 2014-05-09 22:39:10 UTC (rev 8257) @@ -115,6 +115,7 @@ // behavior of DefaultAnalyzerFactory suite.addTestSuite(TestConfigurableAsDefaultAnalyzerFactory.class); suite.addTestSuite(TestConfigurableAnalyzerFactory.class); + suite.addTestSuite(TestUnconfiguredAnalyzerFactory.class); return suite; } Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 19:07:09 UTC (rev 8256) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257) @@ -45,7 +45,7 @@ * @author jeremycarroll * */ -public class TestConfigurableAnalyzerFactory extends AbstractSearchTest { +public class TestConfigurableAnalyzerFactory extends AbstractAnalyzerFactoryTest { public TestConfigurableAnalyzerFactory() { } @@ -54,12 +54,8 @@ super(arg0); } - public void setUp() throws Exception { - super.setUp(); - init(getExtraProperties()); - } - - private String[] getExtraProperties() { + @Override + String[] getExtraProperties() { String analyzer = ConfigurableAnalyzerFactory.Options.ANALYZER; return new String[]{ FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(), Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java =================================================================== --- 
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java 2014-05-09 19:07:09 UTC (rev 8256) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257) @@ -26,7 +26,7 @@ */ package com.bigdata.search; -public class TestConfigurableAsDefaultAnalyzerFactory extends AbstractAnalyzerFactoryTest { +public class TestConfigurableAsDefaultAnalyzerFactory extends AbstractDefaultAnalyzerFactoryTest { public TestConfigurableAsDefaultAnalyzerFactory() { } @@ -37,7 +37,9 @@ @Override String[] getExtraProperties() { - return new String[]{FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName()}; + return new String[]{FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(), + ConfigurableAnalyzerFactory.Options.NATURAL_LANGUAGE_SUPPORT, "true" + }; } @Override Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java 2014-05-09 19:07:09 UTC (rev 8256) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257) @@ -26,7 +26,7 @@ */ package com.bigdata.search; -public class TestDefaultAnalyzerFactory extends AbstractAnalyzerFactoryTest { +public class TestDefaultAnalyzerFactory extends AbstractDefaultAnalyzerFactoryTest { public TestDefaultAnalyzerFactory() { } Added: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java (rev 0) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257) @@ -0,0 +1,24 @@ +package com.bigdata.search; + +public class TestUnconfiguredAnalyzerFactory extends AbstractAnalyzerFactoryTest { + + public TestUnconfiguredAnalyzerFactory() { + } + + public TestUnconfiguredAnalyzerFactory(String arg0) { + super(arg0); + } + + @Override + String[] getExtraProperties() { + return new String[]{ + FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(), + }; + } + + public void testConfiguredLanguages() { + checkConfig("StandardAnalyzer", "por", "pt", "zho", "chi", "zh", "jpn", "ja", "kor", "ko", "ces", "cze", "cs", "dut", "nld", "nl", + "deu", "ger", "de", "gre", "ell", "el", "rus", "ru", "th", "tha", "en", "eng", "", null); + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
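The rev 8257 refactoring above concentrates all per-factory test configuration in getExtraProperties(), which AbstractSearchTest.init(...) consumes as alternating property key/value pairs. As a hedged sketch only, and not part of the commit, a further subclass wanting the ConfigurableAnalyzerFactory together with the natural-language defaults could supply the same two pairs that TestConfigurableAsDefaultAnalyzerFactory supplies; the class name and test method below are hypothetical.

package com.bigdata.search;

import java.io.IOException;

// Hypothetical illustration only: it mirrors the key/value pairs supplied by
// TestConfigurableAsDefaultAnalyzerFactory in the diff above.
public class TestNaturalLanguageDefaultsExample extends AbstractAnalyzerFactoryTest {

    public TestNaturalLanguageDefaultsExample() {
    }

    public TestNaturalLanguageDefaultsExample(String arg0) {
        super(arg0);
    }

    @Override
    String[] getExtraProperties() {
        // Alternating key/value pairs: select the ConfigurableAnalyzerFactory and
        // ask it to preload the Lucene natural-language analyzers.
        return new String[]{
                FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(),
                ConfigurableAnalyzerFactory.Options.NATURAL_LANGUAGE_SUPPORT, "true",
        };
    }

    public void testEnglishStopWords() throws IOException {
        // With the natural-language defaults loaded, "eng" filters English stop words;
        // comparisonTest(lang, filterStopwords, input, expectedTokens) is the helper
        // used by the tests above.
        comparisonTest("eng", true,
                "The test to end all tests! Forever.",
                "test end all tests forever");
    }
}

Such a subclass would exercise the same code path as TestConfigurableAsDefaultAnalyzerFactory and is shown purely to make the refactored test hierarchy concrete.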
From: <jer...@us...> - 2014-05-09 22:39:23
|
Revision: 8258 http://sourceforge.net/p/bigdata/code/8258 Author: jeremy_carroll Date: 2014-05-09 22:39:19 +0000 (Fri, 09 May 2014) Log Message: ----------- Documentation and formatting etc. Modified Paths: -------------- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257) +++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 22:39:19 UTC (rev 8258) @@ -95,8 +95,10 @@ * <p> * Other properties, from {@link AnalyzerOptions} start with * <code>c.b.s.C.analyzer.<em>language-range</em></code> where <code><em>language-range</em></code> conforms - * with the extended language range construct from RFC 4647, section 2.2. These are used to specify - * an analyzer for the given language range. + * with the extended language range construct from RFC 4647, section 2.2. + * There is an issue that bigdata does not allow '*' in property names, and we use the character '_' to + * substitute for '*' in extended language ranges in property names. + * These are used to specify an analyzer for the given language range. * <p> * If no analyzer is specified for the language range <code>*</code> then the {@link StandardAnalyzer} is used. * <p> @@ -113,6 +115,8 @@ * <dd>This uses whitespace to tokenize</dd> * <dt>{@link PatternAnalyzer}</dt> * <dd>This uses a regular expression to tokenize</dd> + * <dt>{@link TermCompletionAnalyzer}</dt> + * <dd>This uses up to three regular expressions to specify multiple tokens for each word, to address term completion use cases.</dd> * <dt>{@link EmptyAnalyzer}</dt> * <dd>This suppresses the functionality, by treating every expression as a stop word.</dd> * </dl> @@ -126,11 +130,26 @@ public class ConfigurableAnalyzerFactory implements IAnalyzerFactory { final private static transient Logger log = Logger.getLogger(ConfigurableAnalyzerFactory.class); - static class LanguageRange implements Comparable<LanguageRange> { + /** + * This is an implementation of RFC 4647 language range, + * targetted at some of the context of bigdata, and only + * supporting the extended filtering specified in section 3.3.2 + * <p> + * Language ranges are comparable so that + * sorting an array and then matching a language tage against each + * member of the array in sequence will give the longest match. + * i.e. the longer ranges come first. + * @author jeremycarroll + * + */ + public static class LanguageRange implements Comparable<LanguageRange> { private final String range[]; private final String full; - + /** + * Note range must be in lower case, this is not verified. 
+ * @param range + */ public LanguageRange(String range) { this.range = range.split("-"); full = range; @@ -174,12 +193,22 @@ return full.hashCode(); } + /** + * This implements the algoirthm of section 3.3.2 of RFC 4647 + * as modified with the observation about private use tags + * in <a href="http://lists.w3.org/Archives/Public/www-international/2014AprJun/0084"> + * this message</a>. + * + * + * @param langTag The RFC 5646 Language tag in lower case + * @return The result of the algorithm + */ public boolean extendedFilterMatch(String langTag) { return extendedFilterMatch(langTag.toLowerCase(Locale.ROOT).split("-")); } // See RFC 4647, 3.3.2 - public boolean extendedFilterMatch(String[] language) { + boolean extendedFilterMatch(String[] language) { // RFC 4647 step 2 if (!matchSubTag(language[0], range[0])) { return false; @@ -227,13 +256,14 @@ */ public interface Options { /** - * By setting this option to true, then the behavior of the legacy {@link DefaultAnalyzerFactory} - * is added, and may be overridden by the settings of the user. + * By setting this option to true, then all the known Lucene Analyzers for natural + * languages are used for a range of language tags. + * These settings may then be overridden by the settings of the user. * Specifically the following properties are loaded, prior to loading the * user's specification (with <code>c.b.s.C</code> expanding to * <code>com.bigdata.search.ConfigurableAnalyzerFactory</code>) <pre> -c.b.s.C.analyzer.*.like=eng +c.b.s.C.analyzer._.like=eng c.b.s.C.analyzer.por.analyzerClass=org.apache.lucene.analysis.br.BrazilianAnalyzer c.b.s.C.analyzer.pt.like=por c.b.s.C.analyzer.zho.analyzerClass=org.apache.lucene.analysis.cn.ChineseAnalyzer @@ -281,7 +311,9 @@ /** * If specified this is the fully qualified name of a subclass of {@link Analyzer} * that has appropriate constructors. - * Either this or {@link #LIKE} or {@link #PATTERN} must be specified for each language range. + * This is set implicitly if some of the options below are selected (for example {@link #PATTERN}). + * For each configured language range, if it is not set, either explicitly or implicitly, then + * {@link #LIKE} must be specified. */ String ANALYZER_CLASS = "analyzerClass"; @@ -399,24 +431,64 @@ private static final String LUCENE_STANDARD_ANALYZER = "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.*.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer\n"; + /** + * This comment describes the implementation of {@link ConfigurableAnalyzerFactory}. + * The only method in the interface is {@link ConfigurableAnalyzerFactory#getAnalyzer(String, boolean)}, + * a map is used from language tag to {@link AnalyzerPair}, where the pair contains + * an {@link Analyzer} both with and without stopwords configured (some times these two analyzers are identical, + * if, for example, stop words are not supported or not required). + * <p> + * If there is no entry for the language tag in the map {@link ConfigurableAnalyzerFactory#langTag2AnalyzerPair}, + * then one is created, by walking down the array {@link ConfigurableAnalyzerFactory#config} of AnalyzerPairs + * until a matching one is found. + * <p> + * The bulk of the code in this class is invoked from the constructor in order to set up this + * {@link ConfigurableAnalyzerFactory#config} array. 
For example, all of the subclasses of {@link AnalyzerPair}s, + * are simply to call the appropriate constructor in the appropriate way: the difficulty is that many subclasses + * of {@link Analyzer} have constructors with different signatures, and our code needs to navigate each sort. + * @author jeremycarroll + * + */ private static class AnalyzerPair implements Comparable<AnalyzerPair>{ - private final LanguageRange range; + final LanguageRange range; private final Analyzer withStopWords; private final Analyzer withoutStopWords; + public Analyzer getAnalyzer(boolean filterStopwords) { + return filterStopwords ? withStopWords : withoutStopWords; + } + + public boolean extendedFilterMatch(String[] language) { + return range.extendedFilterMatch(language); + } + AnalyzerPair(String range, Analyzer withStopWords, Analyzer withOutStopWords) { this.range = new LanguageRange(range); this.withStopWords = withStopWords; this.withoutStopWords = withOutStopWords; } + /** + * This clone constructor implements {@link AnalyzerOptions#LIKE}. + * @param range + * @param copyMe + */ AnalyzerPair(String range, AnalyzerPair copyMe) { this.range = new LanguageRange(range); this.withStopWords = copyMe.withStopWords; this.withoutStopWords = copyMe.withoutStopWords; - } + /** + * If we have a constructor, with arguments including a populated + * stop word set, then we can use it to make both the withStopWords + * analyzer, and the withoutStopWords analyzer. + * @param range + * @param cons A Constructor including a {@link java.util.Set} argument + * for the stop words. + * @param params The arguments to pass to the constructor including a populated stopword set. + * @throws Exception + */ AnalyzerPair(String range, Constructor<? extends Analyzer> cons, Object ... params) throws Exception { this(range, cons.newInstance(params), cons.newInstance(useEmptyStopWordSet(params))); } @@ -435,9 +507,6 @@ return rslt; } - public Analyzer getAnalyzer(boolean filterStopwords) { - return filterStopwords ? withStopWords : withoutStopWords; - } @Override public String toString() { return range.full + "=(" + withStopWords.getClass().getSimpleName() +")"; @@ -447,30 +516,38 @@ public int compareTo(AnalyzerPair o) { return range.compareTo(o.range); } - - public boolean extendedFilterMatch(String[] language) { - return range.extendedFilterMatch(language); - } } + /** + * Used for Analyzer classes with a constructor with signature (Version, Set). + * @author jeremycarroll + * + */ private static class VersionSetAnalyzerPair extends AnalyzerPair { public VersionSetAnalyzerPair(ConfigOptionsToAnalyzer lro, Class<? extends Analyzer> cls) throws Exception { super(lro.languageRange, getConstructor(cls, Version.class, Set.class), Version.LUCENE_CURRENT, lro.getStopWords()); } } - + + /** + * Used for Analyzer classes which do not support stopwords and have a constructor with signature (Version). + * @author jeremycarroll + * + */ private static class VersionAnalyzerPair extends AnalyzerPair { - public VersionAnalyzerPair(String range, Class<? 
extends Analyzer> cls) throws Exception { super(range, getConstructor(cls, Version.class).newInstance(Version.LUCENE_CURRENT)); } } - + /** + * Special case code for {@link PatternAnalyzer} + * @author jeremycarroll + * + */ private static class PatternAnalyzerPair extends AnalyzerPair { - public PatternAnalyzerPair(ConfigOptionsToAnalyzer lro, Pattern pattern) throws Exception { super(lro.languageRange, getConstructor(PatternAnalyzer.class,Version.class,Pattern.class,Boolean.TYPE,Set.class), Version.LUCENE_CURRENT, @@ -485,6 +562,16 @@ * This class is initialized with the config options, using the {@link #setProperty(String, String)} * method, for a particular language range and works out which pair of {@link Analyzer}s * to use for that language range. + * <p> + * Instances of this class are only alive during the execution of + * {@link ConfigurableAnalyzerFactory#ConfigurableAnalyzerFactory(FullTextIndex)}, + * the life-cycle is: + * <ol> + * <li>The relveant config properties are applied, and are used to populate the fields. + * <li>The fields are validated + * <li>An {@link AnalyzerPair} is constructed + * </ol> + * * @author jeremycarroll * */ @@ -545,6 +632,10 @@ return ( stopwords == null && pattern == null ) || AnalyzerOptions.STOPWORDS_VALUE_DEFAULT.equals(stopwords); } + /** + * The first step in the life-cycle, used to initialize the fields. + * @return true if the property was recognized. + */ public boolean setProperty(String shortProperty, String value) { if (shortProperty.equals(AnalyzerOptions.LIKE) ) { like = value; @@ -568,6 +659,9 @@ return true; } + /** + * The second phase of the life-cycle, used for sanity checking. + */ public void validate() { if (pattern != null ) { if ( className != null && className != PatternAnalyzer.class.getName()) { @@ -608,6 +702,10 @@ } + /** + * The third and final phase of the life-cyle used for identifying + * the AnalyzerPair. + */ private AnalyzerPair construct() throws Exception { if (className == null) { return null; @@ -660,6 +758,29 @@ throw new RuntimeException("Bad option: cannot find constructor for class " + className + " for language range " + languageRange); } + /** + * Also part of the third phase of the life-cycle, following the {@link AnalyzerOptions#LIKE} + * properties. + * @param depth + * @param max + * @param analyzers + * @return + */ + AnalyzerPair followLikesToAnalyzerPair(int depth, int max, + Map<String, ConfigOptionsToAnalyzer> analyzers) { + if (result == null) { + if (depth == max) { + throw new RuntimeException("Bad configuration: - 'like' loop for language range " + languageRange); + } + ConfigOptionsToAnalyzer next = analyzers.get(like); + if (next == null) { + throw new RuntimeException("Bad option: - 'like' not found for language range " + languageRange+ " (not found: '"+ like +"')"); + } + result = new AnalyzerPair(languageRange, next.followLikesToAnalyzerPair(depth+1, max, analyzers)); + } + return result; + } + protected Class<? 
extends Analyzer> getAnalyzerClass() { return getAnalyzerClass(className); } @@ -678,22 +799,6 @@ void setAnalyzerPair(AnalyzerPair ap) { result = ap; } - - AnalyzerPair followLikesToAnalyzerPair(int depth, int max, - Map<String, ConfigOptionsToAnalyzer> analyzers) { - if (result == null) { - if (depth == max) { - throw new RuntimeException("Bad configuration: - 'like' loop for language range " + languageRange); - } - ConfigOptionsToAnalyzer next = analyzers.get(like); - if (next == null) { - throw new RuntimeException("Bad option: - 'like' not found for language range " + languageRange+ " (not found: '"+ like +"')"); - } - result = new AnalyzerPair(languageRange, next.followLikesToAnalyzerPair(depth+1, max, analyzers)); - } - return result; - } - } private final AnalyzerPair config[]; @@ -712,7 +817,13 @@ private final FullTextIndex<?> fullTextIndex; + /** + * Builds a new ConfigurableAnalyzerFactory. + * @param fullTextIndex + */ public ConfigurableAnalyzerFactory(final FullTextIndex<?> fullTextIndex) { + // A description of the operation of this method is found on AnalyzerPair and + // ConfigOptionsToAnalyzer. // despite our name, we actually make all the analyzers now, and getAnalyzer method is merely a lookup. if (fullTextIndex == null) @@ -837,9 +948,18 @@ protected Properties initProperties() { final Properties parentProperties = fullTextIndex.getProperties(); Properties myProps; - if (Boolean.valueOf(parentProperties.getProperty(Options.NATURAL_LANGUAGE_SUPPORT, Options.DEFAULT_NATURAL_LAMGUAGE_SUPPORT))) { + if (Boolean.valueOf(parentProperties.getProperty( + Options.NATURAL_LANGUAGE_SUPPORT, + Options.DEFAULT_NATURAL_LAMGUAGE_SUPPORT))) { + myProps = loadPropertyString(ALL_LUCENE_NATURAL_LANGUAGES); + + } else if (hasPropertiesForStarLanguageRange(parentProperties)){ + + myProps = new Properties(); + } else { + myProps = loadPropertyString(LUCENE_STANDARD_ANALYZER); } @@ -867,6 +987,17 @@ } } + private boolean hasPropertiesForStarLanguageRange(Properties from) { + Enumeration<?> en = from.propertyNames(); + while (en.hasMoreElements()) { + String prop = (String)en.nextElement(); + if (prop.startsWith(Options.ANALYZER+"_.") + || prop.startsWith(Options.ANALYZER+"*.")) { + return true; + } + } + return false; + } @Override public Analyzer getAnalyzer(String languageCode, boolean filterStopwords) { Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 22:39:10 UTC (rev 8257) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 22:39:19 UTC (rev 8258) @@ -1,3 +1,29 @@ +/** + +Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved. + +Contact: + SYSTAP, LLC + 4501 Tower Road + Greensboro, NC 27410 + lic...@bi... + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +/* + * Created on May 9, 2014 + */ package com.bigdata.search; public abstract class AbstractAnalyzerFactoryTest extends AbstractSearchTest { Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 22:39:19 UTC (rev 8258) @@ -59,7 +59,8 @@ String analyzer = ConfigurableAnalyzerFactory.Options.ANALYZER; return new String[]{ FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(), - analyzer+"*."+AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(), + analyzer+"_."+AnalyzerOptions.LIKE, "x-empty", + analyzer+"x-empty."+AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(), analyzer+"x-terms."+AnalyzerOptions.PATTERN, "\\W+", analyzer+"x-splits."+AnalyzerOptions.ANALYZER_CLASS, TermCompletionAnalyzer.class.getName(), analyzer+"x-splits."+AnalyzerOptions.STOPWORDS, AnalyzerOptions.STOPWORDS_VALUE_NONE, Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257) +++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java 2014-05-09 22:39:19 UTC (rev 8258) @@ -1,3 +1,29 @@ +/** + +Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved. + +Contact: + SYSTAP, LLC + 4501 Tower Road + Greensboro, NC 27410 + lic...@bi... + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +/* + * Created on May 7, 2014 + */ package com.bigdata.search; public class TestUnconfiguredAnalyzerFactory extends AbstractAnalyzerFactoryTest { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
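To make the rev 8258 property-naming documentation concrete: the keys follow the com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.<language-range>.<option> pattern, with '_' standing in for '*' because bigdata property names may not contain '*'. The following minimal sketch is not part of either commit; the x-example language range and the main() harness are invented for illustration, and handing the resulting Properties to the FullTextIndex that owns the factory is elided.

import java.util.Properties;

// Hedged configuration sketch: only the key-naming convention is taken from the
// javadoc above; the x-example range and this harness are invented.
public class ConfigurableAnalyzerFactoryConfigExample {

    private static final String PREFIX =
            "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.";

    public static void main(String[] args) {
        final Properties props = new Properties();

        // '_' substitutes for '*': every language range not matched by a more
        // specific entry falls back, via 'like', to the x-example settings below.
        props.setProperty(PREFIX + "_.like", "x-example");

        // An explicit analyzer class for the invented x-example range.
        props.setProperty(PREFIX + "x-example.analyzerClass",
                "org.apache.lucene.analysis.standard.StandardAnalyzer");

        // In real use these entries would be merged into the properties of the
        // FullTextIndex that constructs the ConfigurableAnalyzerFactory
        // (that wiring is omitted here).
        props.list(System.out);
    }
}

This mirrors the shape of the configuration used by TestConfigurableAnalyzerFactory in the diff above, which routes the '*' range to an x-empty range via 'like' and registers a pattern-based analyzer and the TermCompletionAnalyzer under further private-use ranges.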