|
From: <jer...@us...> - 2014-05-09 17:42:49
|
Revision: 8248
http://sourceforge.net/p/bigdata/code/8248
Author: jeremy_carroll
Date: 2014-05-09 17:42:44 +0000 (Fri, 09 May 2014)
Log Message:
-----------
First version of TermCompletionAnalyzer, and also tests now passing
Modified Paths:
--------------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java
Added Paths:
-----------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 17:07:05 UTC (rev 8247)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 17:42:44 UTC (rev 8248)
@@ -331,7 +331,7 @@
* (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled).
* It is an error if a different analyzer class is specified.
*/
- String PATTERN = ".pattern";
+ String PATTERN = "pattern";
}
@@ -474,7 +474,7 @@
*/
public Set<?> getStopWords() {
- if (AnalyzerOptions.STOPWORDS_VALUE_NONE.equals(stopwords))
+ if (doNotUseStopWords())
return Collections.EMPTY_SET;
if (useDefaultStopWords()) {
@@ -484,6 +484,10 @@
return getStopWordsForClass(stopwords);
}
+ boolean doNotUseStopWords() {
+ return AnalyzerOptions.STOPWORDS_VALUE_NONE.equals(stopwords) || (stopwords == null && pattern != null);
+ }
+
protected Set<?> getStopWordsForClass(String clazzName) {
Class<? extends Analyzer> analyzerClass = getAnalyzerClass(clazzName);
try {
@@ -500,7 +504,7 @@
}
protected boolean useDefaultStopWords() {
- return stopwords == null || AnalyzerOptions.STOPWORDS_VALUE_DEFAULT.equals(stopwords);
+ return ( stopwords == null && pattern == null ) || AnalyzerOptions.STOPWORDS_VALUE_DEFAULT.equals(stopwords);
}
public boolean setProperty(String shortProperty, String value) {
@@ -550,8 +554,13 @@
if (hasConstructor(cls, Version.class, Set.class)) {
// RussianAnalyzer is missing any way to access stop words.
- if (RussianAnalyzer.class.equals(cls) && useDefaultStopWords()) {
- return new AnalyzerPair(languageRange, new RussianAnalyzer(Version.LUCENE_CURRENT), new RussianAnalyzer(Version.LUCENE_CURRENT, Collections.EMPTY_SET));
+ if (RussianAnalyzer.class.equals(cls)) {
+ if (useDefaultStopWords()) {
+ return new AnalyzerPair(languageRange, new RussianAnalyzer(Version.LUCENE_CURRENT), new RussianAnalyzer(Version.LUCENE_CURRENT, Collections.EMPTY_SET));
+ }
+ if (doNotUseStopWords()) {
+ return new AnalyzerPair(languageRange, new RussianAnalyzer(Version.LUCENE_CURRENT, Collections.EMPTY_SET));
+ }
}
return new VersionSetAnalyzerPair(this, cls);
}
@@ -719,7 +728,7 @@
String prop = (String)en.nextElement();
if (prop.equals(Options.INCLUDE_DEFAULTS)) continue;
if (prop.startsWith(Options.ANALYZER)) {
- String languageRangeAndProperty[] = prop.substring(Options.ANALYZER.length()).split("[.]");
+ String languageRangeAndProperty[] = prop.substring(Options.ANALYZER.length()).replaceAll("_","*").split("[.]");
if (languageRangeAndProperty.length == 2) {
String languageRange = languageRangeAndProperty[0].toLowerCase(Locale.US); // Turkish "I" could create a problem
Added: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java (rev 0)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:42:44 UTC (rev 8248)
@@ -0,0 +1,88 @@
+package com.bigdata.search;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.nio.CharBuffer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Attribute;
+
+
+public class TermCompletionAnalyzer extends Analyzer {
+
+ Pattern hard = Pattern.compile(" ", Pattern.UNICODE_CHARACTER_CLASS);
+ Pattern soft = Pattern.compile("(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})", Pattern.UNICODE_CHARACTER_CLASS);
+
+ public TermCompletionAnalyzer() {
+ // TODO Auto-generated constructor stub
+ }
+
+ private class TermCompletionTokenStream extends TokenStream {
+
+ final int length;
+ final String[] words;
+ char currentWord[] = new char[]{};
+ Matcher softMatcher;
+ int currentWordIx = -1;
+ int charPos = 0;
+ final TermAttribute termAtt;
+ public TermCompletionTokenStream(StringReader reader) {
+ termAtt = addAttribute(TermAttribute.class);
+ try {
+ reader.mark(Integer.MAX_VALUE);
+ length = (int) reader.skip(Integer.MAX_VALUE);
+ reader.reset();
+ char fileContent[] = new char[length];
+ reader.read(fileContent);
+ words = hard.split(new String(fileContent));
+ } catch (IOException e) {
+ throw new RuntimeException("Impossible",e);
+ }
+ }
+ @Override
+ public boolean incrementToken() throws IOException {
+ if ( next() ) {
+ int lg = currentWord.length - charPos;
+ System.arraycopy(currentWord, charPos, termAtt.termBuffer(), 0, lg );
+ termAtt.setTermLength(lg);
+ return true;
+ } else {
+ return false;
+ }
+ }
+ private boolean next() {
+ if (currentWordIx >= words.length) {
+ return false;
+ }
+ if (charPos +1 < currentWord.length && softMatcher.find(charPos+1)) {
+ charPos = softMatcher.end();
+ return true;
+ } else {
+ return nextWord();
+ }
+ }
+ private boolean nextWord() {
+ currentWordIx++;
+ if (currentWordIx >= words.length) {
+ return false;
+ }
+ currentWord = words[currentWordIx].toCharArray();
+ termAtt.resizeTermBuffer(currentWord.length);
+ charPos = 0;
+ softMatcher = soft.matcher(words[currentWordIx]);
+ return true;
+ }
+
+ }
+
+
+ @Override
+ public TokenStream tokenStream(String ignoredFieldName, Reader reader) {
+ return new TermCompletionTokenStream((StringReader)reader);
+ }
+}
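A minimal usage sketch (not part of this commit) of how the tokens emitted by this first version can be inspected. It uses the same Lucene 3.x TokenStream/TermAttribute API as the test helpers in this branch; the class name TermCompletionDemo is hypothetical, and the expected tokens are taken from testSyapseExample1 below.

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    import com.bigdata.search.TermCompletionAnalyzer;

    public class TermCompletionDemo {
        public static void main(String[] args) throws IOException {
            // The r8248 constructor takes no arguments; the word and sub-word patterns are hard-coded.
            TermCompletionAnalyzer a = new TermCompletionAnalyzer();
            TokenStream ts = a.tokenStream(null, new StringReader("[ERBB2, INS/DUP"));
            TermAttribute term = ts.getAttribute(TermAttribute.class);
            while (ts.incrementToken()) {
                // Prints "[ERBB2," "ERBB2," "INS/DUP" "DUP": each word, then each suffix
                // starting after a sub-word boundary.
                System.out.println(term.term());
            }
        }
    }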
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 17:07:05 UTC (rev 8247)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 17:42:44 UTC (rev 8248)
@@ -27,11 +27,7 @@
package com.bigdata.search;
import java.io.IOException;
-import java.io.StringReader;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
public abstract class AbstractAnalyzerFactoryTest extends AbstractSearchTest {
@@ -42,37 +38,16 @@
super(arg0);
}
+ @Override
public void setUp() throws Exception {
super.setUp();
- init(getExtraProperties());
+ init(getExtraProperties());
}
+
+
abstract String[] getExtraProperties();
- private Analyzer getAnalyzer(String lang, boolean filterStopWords) {
- return getNdx().getAnalyzer(lang, filterStopWords);
- }
-
- private void comparisonTest(String lang,
- boolean stopWordsSignificant,
- String text,
- String spaceSeparated) throws IOException {
- compareTokenStream(getAnalyzer(lang, stopWordsSignificant), text,
- spaceSeparated.split(" ")); //$NON-NLS-1$
- }
- private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException {
- TokenStream s = a.tokenStream(null, new StringReader(text));
- int ix = 0;
- while (s.incrementToken()) {
- final TermAttribute term = s.getAttribute(TermAttribute.class);
- final String word = term.term();
- assertTrue(ix < expected.length);
- assertEquals(word, expected[ix++]);
- }
- assertEquals(ix, expected.length);
- }
-
-
- public void testEnglishFilterStopWords() throws IOException {
+ public void testEnglishFilterStopWords() throws IOException {
for (String lang: new String[]{ "eng", null, "" }) { //$NON-NLS-1$ //$NON-NLS-2$
comparisonTest(lang,
true,
@@ -159,14 +134,20 @@
}
private void checkConfig(String classname, String ...langs) {
+ checkConfig(isBroken(), classname, langs);
+
+ }
+ protected void checkConfig(boolean threeLetterOnly, String classname, String ...langs) {
for (String lang:langs) {
// The DefaultAnalyzerFactory only works for language tags of length exactly three.
-// if (lang != null && lang.length()==3)
+ if ((!threeLetterOnly) || (lang != null && lang.length()==3))
{
assertEquals(classname, getAnalyzer(lang,true).getClass().getSimpleName());
- assertEquals(classname, getAnalyzer(lang+NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.0"),true).getClass().getSimpleName()); //$NON-NLS-1$
+ if (!threeLetterOnly) assertEquals(classname, getAnalyzer(lang+"-x-foobar",true).getClass().getSimpleName()); //$NON-NLS-1$
}
}
}
+
+ abstract boolean isBroken() ;
}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:07:05 UTC (rev 8247)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:42:44 UTC (rev 8248)
@@ -26,8 +26,14 @@
*/
package com.bigdata.search;
+import java.io.IOException;
+import java.io.StringReader;
import java.util.Properties;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
import com.bigdata.journal.IIndexManager;
import com.bigdata.journal.ITx;
import com.bigdata.journal.ProxyTestCase;
@@ -62,7 +68,7 @@
}
FullTextIndex<Long> createFullTextIndex(String namespace, String ...propertyValuePairs) {
- return createFullTextIndex(namespace, getProperties(), propertyValuePairs);
+ return createFullTextIndex(namespace, (Properties)getProperties().clone(), propertyValuePairs);
}
public void tearDown() throws Exception {
@@ -92,4 +98,51 @@
return properties;
}
+ protected Analyzer getAnalyzer(String lang, boolean filterStopWords) {
+ return getNdx().getAnalyzer(lang, filterStopWords);
+ }
+
+ protected void comparisonTest(String lang, boolean stopWordsSignificant, String text, String spaceSeparated)
+ throws IOException {
+ if (spaceSeparated == null) {
+ String rslt = getTokenStream(getAnalyzer(lang, stopWordsSignificant), text);
+ throw new RuntimeException("Got \"" + rslt+ "\"");
+ }
+ compareTokenStream(getAnalyzer(lang, stopWordsSignificant), text,
+ split(spaceSeparated)); //$NON-NLS-1$
+ }
+
+ private String[] split(String spaceSeparated) {
+ if (spaceSeparated.length()==0) {
+ return new String[0];
+ }
+ return spaceSeparated.split(" ");
+ }
+
+ protected String getTokenStream(Analyzer a, String text) throws IOException {
+ StringBuffer sb = new StringBuffer();
+ TokenStream s = a.tokenStream(null, new StringReader(text));
+ int ix = 0;
+ while (s.incrementToken()) {
+ final TermAttribute term = s.getAttribute(TermAttribute.class);
+ if (sb.length()!=0) {
+ sb.append(" ");
+ }
+ sb.append(term.term());
+ }
+ return sb.toString();
+ }
+
+ private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException {
+ TokenStream s = a.tokenStream(null, new StringReader(text));
+ int ix = 0;
+ while (s.incrementToken()) {
+ final TermAttribute term = s.getAttribute(TermAttribute.class);
+ final String word = term.term();
+ assertTrue(ix < expected.length);
+ assertEquals(word, expected[ix++]);
+ }
+ assertEquals(ix, expected.length);
+ }
+
}
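One detail worth noting in the new comparisonTest() helper: passing null as the expected, space-separated token string makes it throw a RuntimeException whose message contains the analyzer's actual output. A sketch of how a test author might use that while drafting a new case (the test method name is hypothetical; it would live in a subclass of AbstractSearchTest such as TestConfigurableAnalyzerFactory, where the "x-splits" range is configured):

    public void testShowTokensForNewCase() throws IOException {
        // Deliberately pass null so the exception message reports the actual tokens,
        // which can then be pasted in as the expected value.
        comparisonTest("x-splits", true, "2,2,3-trimethylbutane", null);
    }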
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java 2014-05-09 17:07:05 UTC (rev 8247)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java 2014-05-09 17:42:44 UTC (rev 8248)
@@ -114,6 +114,7 @@
// which is intended to be the same as the intended
// behavior of DefaultAnalyzerFactory
suite.addTestSuite(TestConfigurableAsDefaultAnalyzerFactory.class);
+ suite.addTestSuite(TestConfigurableAnalyzerFactory.class);
return suite;
}
Added: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java (rev 0)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:42:44 UTC (rev 8248)
@@ -0,0 +1,195 @@
+/**
+
+Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved.
+
+Contact:
+ SYSTAP, LLC
+ 4501 Tower Road
+ Greensboro, NC 27410
+ lic...@bi...
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+/*
+ * Created on May 7, 2014
+ */
+package com.bigdata.search;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.analysis.cjk.CJKAnalyzer;
+import org.apache.lucene.analysis.de.GermanAnalyzer;
+import org.apache.lucene.analysis.ru.RussianAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.util.Version;
+
+import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions;
+
+public class TestConfigurableAnalyzerFactory extends AbstractSearchTest {
+
+ public TestConfigurableAnalyzerFactory() {
+ }
+
+ public TestConfigurableAnalyzerFactory(String arg0) {
+ super(arg0);
+ }
+
+ public void setUp() throws Exception {
+ super.setUp();
+ init(getExtraProperties());
+ }
+
+ private String[] getExtraProperties() {
+ String analyzer = ConfigurableAnalyzerFactory.Options.ANALYZER;
+ return new String[]{
+ FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(),
+ analyzer+"*."+AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(),
+ analyzer+"x-terms."+AnalyzerOptions.PATTERN, "\\W+",
+ analyzer+"x-splits."+AnalyzerOptions.ANALYZER_CLASS, TermCompletionAnalyzer.class.getName(),
+ analyzer+"x-splits."+AnalyzerOptions.STOPWORDS, AnalyzerOptions.STOPWORDS_VALUE_NONE,
+ analyzer+"x-keywords."+AnalyzerOptions.ANALYZER_CLASS, KeywordAnalyzer.class.getName(),
+ analyzer+"ru-x-de."+AnalyzerOptions.ANALYZER_CLASS, RussianAnalyzer.class.getName(),
+ analyzer+"ru-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(),
+ };
+ }
+
+ private void badCombo(String errorMessage, String ... props) {
+ // Check that some combination of properties on a language create an error
+ String myProps[] = new String[props.length+4];
+ int i=0;
+ for (; i<props.length;i+=2) {
+ myProps[i] = ConfigurableAnalyzerFactory.Options.ANALYZER + "x-testme." + props[i];
+ myProps[i+1] = props[i+1];
+ }
+ myProps[i] = ConfigurableAnalyzerFactory.Options.ANALYZER + "_." + AnalyzerOptions.ANALYZER_CLASS;
+ myProps[i+1] = EmptyAnalyzer.class.getName();
+ myProps[i+2] = FullTextIndex.Options.ANALYZER_FACTORY_CLASS;
+ myProps[i+3] = ConfigurableAnalyzerFactory.class.getName();
+ try {
+ this.createFullTextIndex("test-in-error"+getName(), myProps);
+ }
+ catch (RuntimeException e) {
+ Throwable t = e;
+ while (t.getCause() != null) {
+ t = t.getCause();
+ }
+ assertTrue(t.getMessage(),t.getMessage().contains(errorMessage));
+ return;
+ }
+ fail("No error detected");
+ }
+ public void testBadLike() {
+ badCombo("en-us-x-banana",AnalyzerOptions.LIKE,"en-us-x-banana");
+ }
+ public void testMissingClass() {
+ badCombo("exactly one",AnalyzerOptions.STOPWORDS,AnalyzerOptions.STOPWORDS_VALUE_DEFAULT);
+
+ }
+ public void testLikeAndClass() {
+ badCombo("exactly one",AnalyzerOptions.LIKE,"*", AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName());
+ }
+ public void testLikeAndStopwords() {
+ badCombo("stopwords",AnalyzerOptions.LIKE,"*", AnalyzerOptions.STOPWORDS,AnalyzerOptions.STOPWORDS_VALUE_DEFAULT);
+ }
+ public void testCantAlwaysHaveStopWords() {
+ badCombo("not supported",
+ AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(),
+ AnalyzerOptions.STOPWORDS,StandardAnalyzer.class.getName()
+ );
+
+ }
+ public void testCantAlwaysHaveDefaultStopWords() {
+ badCombo("not supported",
+ AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(),
+ AnalyzerOptions.STOPWORDS,AnalyzerOptions.STOPWORDS_VALUE_DEFAULT
+ );
+
+ }
+ public void testCantFindRussianStopWords() {
+ badCombo("find",
+ AnalyzerOptions.ANALYZER_CLASS, GermanAnalyzer.class.getName(),
+ AnalyzerOptions.STOPWORDS,RussianAnalyzer.class.getName()
+ );
+
+ }
+
+
+ public void testEmptyAnalyzer() throws IOException {
+ comparisonTest("en",
+ false,
+ "The fast car arrived slowly.",
+ ""
+ );
+
+ }
+ public void testSyapseExample1() throws IOException {
+ comparisonTest("x-splits",
+ true,
+ "ADENOCARCINOMA OF LUNG, SOMATIC [ERBB2, INS/DUP, NT2322]",
+ "ADENOCARCINOMA OF LUNG, SOMATIC [ERBB2, ERBB2, INS/DUP, DUP, NT2322]"
+ );
+
+ }
+ public void testSyapseExample2() throws IOException {
+ comparisonTest("x-splits",
+ true,
+ "\u2265\u2265\u22653-11.13-11.1",
+ "\u2265\u2265\u22653-11.13-11.1 3-11.13-11.1 11.13-11.1 13-11.1 11.1 1"
+ );
+
+ }
+ public void testSyapseExample4() throws IOException {
+ comparisonTest("x-splits",
+ true,
+ "\u00b1-ACE3.1.1",
+ "\u00b1-ACE3.1.1 ACE3.1.1 1.1 1"
+ );
+
+ }
+ public void testSyapseExample3() throws IOException {
+ comparisonTest("x-splits",
+ true,
+ "2,2,3-trimethylbutane",
+ "2,2,3-trimethylbutane 2,3-trimethylbutane 3-trimethylbutane trimethylbutane"
+ );
+
+ }
+ public void testSyapseExample5() throws IOException {
+ comparisonTest("x-splits",
+ true,
+ "CD8_alpha-low Langerhans cell",
+ "CD8_alpha-low alpha-low low Langerhans cell"
+ );
+
+ }
+ public void testSyapseExample6() throws IOException {
+ comparisonTest("x-splits",
+ true,
+ "6-Monoacetylmorphine:Mass Content:Point in time:Meconium:Quantitative",
+ "6-Monoacetylmorphine:Mass Monoacetylmorphine:Mass Mass Content:Point Point in time:Meconium:Quantitative Meconium:Quantitative Quantitative"
+ );
+
+ }
+ public void testSyapseExample7() throws IOException {
+ comparisonTest("x-splits",
+ true,
+ "N,N-dimethyl",
+ "N,N-dimethyl N-dimethyl dimethyl"
+ );
+
+ }
+
+}
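A sketch (not part of this commit) of how the short option names used above expand into full property keys. It assumes Options.ANALYZER is the "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer." prefix used by the factory's built-in defaults, which appears verbatim in a later revision in this thread:

    String[] exampleConfig = new String[] {
        // Wildcard range: "_" is accepted as a stand-in for "*" because the factory
        // applies replaceAll("_","*") to the key before splitting on ".".
        "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer._.analyzerClass",
        "org.apache.lucene.analysis.standard.StandardAnalyzer",
        // Supplying a pattern implies the PatternAnalyzer, so no analyzerClass is needed.
        "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.x-terms.pattern",
        "\\W+",
    };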
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java 2014-05-09 17:07:05 UTC (rev 8247)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java 2014-05-09 17:42:44 UTC (rev 8248)
@@ -40,4 +40,9 @@
return new String[]{FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName()};
}
+ @Override
+ boolean isBroken() {
+ return false;
+ }
+
}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java 2014-05-09 17:07:05 UTC (rev 8247)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java 2014-05-09 17:42:44 UTC (rev 8248)
@@ -40,4 +40,27 @@
return new String[0];
}
+ /**
+ * The DefaultAnalyzerFactory has bizarre behavior concerning
+ * language-specific settings.
+ * The three-letter ISO 639-2 language tags for the languages
+ * for which Lucene has Analyzers use those Analyzers; whereas the two-letter ISO 639-1
+ * language tags, which are the ones recommended by the IETF and the W3C,
+ * all use the StandardAnalyzer (English). Also, a language tag with a subtag
+ * uses the StandardAnalyzer, even if its primary tag is a recognized three-letter ISO code.
+ */
+ @Override
+ boolean isBroken() {
+ return true;
+ }
+
+ /**
+ * Given legacy concerns, we should preserve the incorrect behavior!
+ */
+ public void testIsBroken() {
+ checkConfig(false, "StandardAnalyzer",
+ "en", "eng", "", null, "ru",
+ "pt", "zh", "por-br", "cs", "dut-za", "nl", "de", "gre-at", "el", "th");
+ }
+
}
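An illustration (not in the commit) of the quirk documented above, written against the getAnalyzer() helper that now lives in AbstractSearchTest: with DefaultAnalyzerFactory only the three-letter form of a language tag selects the language-specific analyzer.

    assertEquals("RussianAnalyzer", getAnalyzer("rus", true).getClass().getSimpleName());
    assertEquals("StandardAnalyzer", getAnalyzer("ru", true).getClass().getSimpleName());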
|
|
From: <jer...@us...> - 2014-05-09 17:43:00
|
Revision: 8249
http://sourceforge.net/p/bigdata/code/8249
Author: jeremy_carroll
Date: 2014-05-09 17:42:56 +0000 (Fri, 09 May 2014)
Log Message:
-----------
copyright and tidying up
Modified Paths:
--------------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/NonEnglishExamples.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:42:44 UTC (rev 8248)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:42:56 UTC (rev 8249)
@@ -1,3 +1,29 @@
+/**
+
+Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved.
+
+Contact:
+ SYSTAP, LLC
+ 4501 Tower Road
+ Greensboro, NC 27410
+ lic...@bi...
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+/*
+ * Created on May 8, 2014 by Jeremy J. Carroll, Syapse Inc.
+ */
package com.bigdata.search;
import java.io.IOException;
@@ -3,5 +29,4 @@
import java.io.Reader;
import java.io.StringReader;
-import java.nio.CharBuffer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -10,7 +35,6 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.util.Attribute;
public class TermCompletionAnalyzer extends Analyzer {
@@ -19,7 +43,6 @@
Pattern soft = Pattern.compile("(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})", Pattern.UNICODE_CHARACTER_CLASS);
public TermCompletionAnalyzer() {
- // TODO Auto-generated constructor stub
}
private class TermCompletionTokenStream extends TokenStream {
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:42:44 UTC (rev 8248)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:42:56 UTC (rev 8249)
@@ -122,7 +122,6 @@
protected String getTokenStream(Analyzer a, String text) throws IOException {
StringBuffer sb = new StringBuffer();
TokenStream s = a.tokenStream(null, new StringReader(text));
- int ix = 0;
while (s.incrementToken()) {
final TermAttribute term = s.getAttribute(TermAttribute.class);
if (sb.length()!=0) {
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/NonEnglishExamples.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/NonEnglishExamples.java 2014-05-09 17:42:44 UTC (rev 8248)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/NonEnglishExamples.java 2014-05-09 17:42:56 UTC (rev 8249)
@@ -1,3 +1,29 @@
+/**
+
+Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved.
+
+Contact:
+ SYSTAP, LLC
+ 4501 Tower Road
+ Greensboro, NC 27410
+ lic...@bi...
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+/*
+ * Created on May 7, 2014 by Jeremy J. Carroll, Syapse Inc.
+ */
package com.bigdata.search;
import java.util.MissingResourceException;
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:42:44 UTC (rev 8248)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:42:56 UTC (rev 8249)
@@ -28,13 +28,10 @@
import java.io.IOException;
-import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
-import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.util.Version;
import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions;
|
|
From: <jer...@us...> - 2014-05-09 17:43:20
|
Revision: 8251
http://sourceforge.net/p/bigdata/code/8251
Author: jeremy_carroll
Date: 2014-05-09 17:43:16 +0000 (Fri, 09 May 2014)
Log Message:
-----------
Got tests working again, and cleaned up somewhat
Modified Paths:
--------------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 17:43:05 UTC (rev 8250)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 17:43:16 UTC (rev 8251)
@@ -326,12 +326,48 @@
String STOPWORDS_VALUE_NONE = "none";
/**
- * If this property is present then the analyzer being used is a
- * {@link PatternAnalyzer} and the value is the pattern to use.
+ * The value of the pattern parameter to
+ * {@link PatternAnalyzer#PatternAnalyzer(Version, Pattern, boolean, Set)}
* (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled).
* It is an error if a different analyzer class is specified.
*/
String PATTERN = "pattern";
+ /**
+ * The value of the wordBoundary parameter to
+ * {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)}
+ * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled).
+ * It is an error if a different analyzer class is specified.
+ */
+ String WORD_BOUNDARY = "wordBoundary";
+ /**
+ * The value of the subWordBoundary parameter to
+ * {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)}
+ * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled).
+ * It is an error if a different analyzer class is specified.
+ */
+ String SUB_WORD_BOUNDARY = "subWordBoundary";
+ /**
+ * The value of the softHyphens parameter to
+ * {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)}
+ * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled).
+ * It is an error if a different analyzer class is specified.
+ */
+ String SOFT_HYPHENS = "softHypens";
+ /**
+ * The value of the alwaysRemoveSoftHypens parameter to
+ * {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)}
+ * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled).
+ * It is an error if a different analyzer class is specified.
+ */
+ String ALWAYS_REMOVE_SOFT_HYPHENS = "alwaysRemoveSoftHypens";
+
+ boolean DEFAULT_ALWAYS_REMOVE_SOFT_HYPHENS = false;
+
+ /**
+ * The default sub-word boundary is a pattern that never matches,
+ * i.e. there are no sub-word boundaries.
+ */
+ Pattern DEFAULT_SUB_WORD_BOUNDARY = Pattern.compile("(?!)");
}
@@ -382,16 +418,7 @@
this.withoutStopWords = copyMe.withoutStopWords;
}
-
- public Analyzer getAnalyzer(boolean filterStopwords) {
- return filterStopwords ? withStopWords : withoutStopWords;
- }
- @Override
- public String toString() {
- return range.full + "=(" + withStopWords.getClass().getSimpleName() +")";
- }
-
AnalyzerPair(String range, Constructor<? extends Analyzer> cons, Object ... params) throws Exception {
this(range, cons.newInstance(params), cons.newInstance(useEmptyStopWordSet(params)));
}
@@ -409,7 +436,16 @@
}
return rslt;
}
+
+ public Analyzer getAnalyzer(boolean filterStopwords) {
+ return filterStopwords ? withStopWords : withoutStopWords;
+ }
@Override
+ public String toString() {
+ return range.full + "=(" + withStopWords.getClass().getSimpleName() +")";
+ }
+
+ @Override
public int compareTo(AnalyzerPair o) {
return range.compareTo(o.range);
}
@@ -437,10 +473,10 @@
private static class PatternAnalyzerPair extends AnalyzerPair {
- public PatternAnalyzerPair(ConfigOptionsToAnalyzer lro, String pattern) throws Exception {
+ public PatternAnalyzerPair(ConfigOptionsToAnalyzer lro, Pattern pattern) throws Exception {
super(lro.languageRange, getConstructor(PatternAnalyzer.class,Version.class,Pattern.class,Boolean.TYPE,Set.class),
Version.LUCENE_CURRENT,
- Pattern.compile(pattern, Pattern.UNICODE_CHARACTER_CLASS),
+ pattern,
true,
lro.getStopWords());
}
@@ -459,9 +495,13 @@
String like;
String className;
String stopwords;
- String pattern;
+ Pattern pattern;
final String languageRange;
AnalyzerPair result;
+ Pattern wordBoundary;
+ Pattern subWordBoundary;
+ Pattern softHyphens;
+ Boolean alwaysRemoveSoftHyphens;
public ConfigOptionsToAnalyzer(String languageRange) {
this.languageRange = languageRange;
@@ -515,7 +555,15 @@
} else if (shortProperty.equals(AnalyzerOptions.STOPWORDS) ) {
stopwords = value;
} else if (shortProperty.equals(AnalyzerOptions.PATTERN) ) {
- pattern = value;
+ pattern = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
+ } else if (shortProperty.equals(AnalyzerOptions.WORD_BOUNDARY) ) {
+ wordBoundary = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
+ } else if (shortProperty.equals(AnalyzerOptions.SUB_WORD_BOUNDARY) ) {
+ subWordBoundary = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
+ } else if (shortProperty.equals(AnalyzerOptions.SOFT_HYPHENS) ) {
+ softHyphens = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
+ } else if (shortProperty.equals(AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS) ) {
+ alwaysRemoveSoftHyphens = Boolean.valueOf(value);
} else {
return false;
}
@@ -529,6 +577,27 @@
}
className = PatternAnalyzer.class.getName();
}
+ if (this.wordBoundary != null ) {
+ if ( className != null && className != TermCompletionAnalyzer.class.getName()) {
+ throw new RuntimeException("Bad Option: Language range "+languageRange + " with pattern propety for class "+ className);
+ }
+ className = TermCompletionAnalyzer.class.getName();
+
+ if ( subWordBoundary == null ) {
+ subWordBoundary = AnalyzerOptions.DEFAULT_SUB_WORD_BOUNDARY;
+ }
+ if ( alwaysRemoveSoftHyphens != null && softHyphens == null ) {
+ throw new RuntimeException("Bad option: Language range "+languageRange + ": must specify softHypens when setting alwaysRemoveSoftHyphens");
+ }
+ if (softHyphens != null && alwaysRemoveSoftHyphens == null) {
+ alwaysRemoveSoftHyphens = AnalyzerOptions.DEFAULT_ALWAYS_REMOVE_SOFT_HYPHENS;
+ }
+
+ } else if ( subWordBoundary != null || softHyphens != null || alwaysRemoveSoftHyphens != null ||
+ TermCompletionAnalyzer.class.getName().equals(className) ) {
+ throw new RuntimeException("Bad option: Language range "+languageRange + ": must specify wordBoundary for TermCompletionAnalyzer");
+ }
+
if (PatternAnalyzer.class.getName().equals(className) && pattern == null ) {
throw new RuntimeException("Bad Option: Language range "+languageRange + " must specify pattern for PatternAnalyzer.");
}
@@ -547,8 +616,23 @@
}
if (pattern != null) {
return new PatternAnalyzerPair(this, pattern);
-
- }
+ }
+ if (softHyphens != null) {
+ return new AnalyzerPair(
+ languageRange,
+ new TermCompletionAnalyzer(
+ wordBoundary,
+ subWordBoundary,
+ softHyphens,
+ alwaysRemoveSoftHyphens));
+ }
+ if (wordBoundary != null) {
+ return new AnalyzerPair(
+ languageRange,
+ new TermCompletionAnalyzer(
+ wordBoundary,
+ subWordBoundary));
+ }
final Class<? extends Analyzer> cls = getAnalyzerClass();
if (hasConstructor(cls, Version.class, Set.class)) {
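For reference, a sketch (not part of this commit) of a property set that would select the TermCompletionAnalyzer for a private-use language range via the new options. It mirrors the "x-hyphen" configuration added to TestConfigurableAnalyzerFactory later in this thread; such name/value pairs are what the test harness hands to init()/createFullTextIndex():

    String analyzer = ConfigurableAnalyzerFactory.Options.ANALYZER;
    String[] termCompletionConfig = new String[] {
        FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(),
        analyzer + "x-hyphen." + AnalyzerOptions.WORD_BOUNDARY, " ",        // split words on spaces
        analyzer + "x-hyphen." + AnalyzerOptions.SUB_WORD_BOUNDARY, "[-.]", // also index after '-' and '.'
        analyzer + "x-hyphen." + AnalyzerOptions.SOFT_HYPHENS, "-",         // optionally strip '-' from terms
        analyzer + "x-hyphen." + AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS, "false",
    };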
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:43:05 UTC (rev 8250)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:43:16 UTC (rev 8251)
@@ -81,8 +81,8 @@
*/
public class TermCompletionAnalyzer extends Analyzer {
- private final Pattern wordBoundary; // = Pattern.compile(" ", Pattern.UNICODE_CHARACTER_CLASS);
- private final Pattern subWordBoundary; // = Pattern.compile("(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})", Pattern.UNICODE_CHARACTER_CLASS);
+ private final Pattern wordBoundary;
+ private final Pattern subWordBoundary;
private final Pattern discard;
private final boolean alwaysDiscard;
@@ -90,24 +90,25 @@
/**
* Divide the input into words and short tokens
* as with {@link #TermCompletionAnalyzer(Pattern, Pattern)}.
- * If alsoWithSoftHypens is true then output each token,
- * and in any case output each token with every
- * match to softHyphenEtc deleted.
+ * Each term is generated, and then an additional term
+ * is generated with the soft hyphens (defined by the pattern)
+ * removed. If the alwaysRemoveSoftHypens flag is true,
+ * then the first term (before the removal) is suppressed.
*
* @param wordBoundary The definition of space (e.g. " ")
* @param subWordBoundary Also index after matches to this (e.g. "-")
- * @param softHyphenEtc Discard these characters from matches
- * @param alsoWithSoftHyphens If true the discard step is optional.
+ * @param softHyphens Discard these characters from matches
+ * @param alwaysRemoveSoftHypens If false the discard step is optional.
*/
public TermCompletionAnalyzer(Pattern wordBoundary,
Pattern subWordBoundary,
- Pattern softHyphenEtc,
- boolean alsoWithSoftHyphens) {
+ Pattern softHyphens,
+ boolean alwaysRemoveSoftHypens) {
this.wordBoundary = wordBoundary;
this.subWordBoundary = subWordBoundary;
- if (softHyphenEtc != null) {
- discard = softHyphenEtc;
- alwaysDiscard = !alsoWithSoftHyphens;
+ if (softHyphens != null) {
+ discard = softHyphens;
+ alwaysDiscard = alwaysRemoveSoftHypens;
} else {
discard = Pattern.compile("(?!)"); // never matches
alwaysDiscard = true;
@@ -115,9 +116,10 @@
}
/**
* Divide the input into words, separated by the wordBoundary,
- * and return a token for the whole word, and then for the
- * remainder of the word after each successive match of the
- * subWordBoundary.
+ * and return a token for each whole word, and then
+ * generate further tokens for each word by removing prefixes
+ * up to and including each successive match of
+ * subWordBoundary
* @param wordBoundary
* @param subWordBoundary
*/
@@ -189,8 +191,9 @@
afterDiscard.getChars(0, lg, termAtt.termBuffer(), 0);
termAtt.setTermLength(lg);
} else {
- found.get(termAtt.termBuffer());
- termAtt.setTermLength(found.length());
+ int lg = found.length();
+ found.get(termAtt.termBuffer(), 0, lg);
+ termAtt.setTermLength(lg);
}
return true;
} else {
@@ -211,7 +214,7 @@
}
}
afterDiscard = null;
- if (charPos +1 < currentWord.length && softMatcher.find(charPos+1)) {
+ if (charPos + 1 < currentWord.length && softMatcher.find(charPos+1)) {
charPos = softMatcher.end();
found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos);
Matcher discarding = discard.matcher(found);
@@ -232,6 +235,7 @@
currentWord = words[currentWordIx].toCharArray();
termAtt.resizeTermBuffer(currentWord.length);
charPos = 0;
+ found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos);
softMatcher = subWordBoundary.matcher(words[currentWordIx]);
return true;
}
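A worked example (not part of this commit) of the behaviour described in the javadoc above, using the test data that appears later in this thread (testSyapseExample8/9). The patterns match the "x-hyphen" configuration: word boundary " ", sub-word boundary "[-.]", soft hyphen "-".

    TermCompletionAnalyzer keepBoth = new TermCompletionAnalyzer(
            Pattern.compile(" "), Pattern.compile("[-.]"), Pattern.compile("-"), false);
    // For the word "ab-bc.cd-de" this is expected to emit
    //   abbc.cdde ab-bc.cd-de bc.cdde bc.cd-de cdde cd-de de
    // i.e. each suffix after a sub-word boundary, once with '-' removed and once verbatim
    // (the two forms coincide for "de", so it is emitted only once).

    TermCompletionAnalyzer alwaysStrip = new TermCompletionAnalyzer(
            Pattern.compile(" "), Pattern.compile("[-.]"), Pattern.compile("-"), true);
    // With alwaysRemoveSoftHypens=true the verbatim forms are suppressed:
    //   abbc.cdde bc.cdde cdde de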
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:43:05 UTC (rev 8250)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 17:43:16 UTC (rev 8251)
@@ -139,7 +139,7 @@
final TermAttribute term = s.getAttribute(TermAttribute.class);
final String word = term.term();
assertTrue(ix < expected.length);
- assertEquals(word, expected[ix++]);
+ assertEquals(expected[ix++], word);
}
assertEquals(ix, expected.length);
}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:43:05 UTC (rev 8250)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:43:16 UTC (rev 8251)
@@ -57,6 +57,8 @@
analyzer+"x-terms."+AnalyzerOptions.PATTERN, "\\W+",
analyzer+"x-splits."+AnalyzerOptions.ANALYZER_CLASS, TermCompletionAnalyzer.class.getName(),
analyzer+"x-splits."+AnalyzerOptions.STOPWORDS, AnalyzerOptions.STOPWORDS_VALUE_NONE,
+ analyzer+"x-splits."+AnalyzerOptions.WORD_BOUNDARY, " ",
+ analyzer+"x-splits."+AnalyzerOptions.SUB_WORD_BOUNDARY, "(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})",
analyzer+"x-keywords."+AnalyzerOptions.ANALYZER_CLASS, KeywordAnalyzer.class.getName(),
analyzer+"ru-x-de."+AnalyzerOptions.ANALYZER_CLASS, RussianAnalyzer.class.getName(),
analyzer+"ru-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(),
|
|
From: <jer...@us...> - 2014-05-09 18:10:17
|
Revision: 8254
http://sourceforge.net/p/bigdata/code/8254
Author: jeremy_carroll
Date: 2014-05-09 18:10:14 +0000 (Fri, 09 May 2014)
Log Message:
-----------
Added test for term completion, with bug fix!
Modified Paths:
--------------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 17:44:11 UTC (rev 8253)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 18:10:14 UTC (rev 8254)
@@ -216,16 +216,20 @@
afterDiscard = null;
if (charPos + 1 < currentWord.length && softMatcher.find(charPos+1)) {
charPos = softMatcher.end();
- found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos);
- Matcher discarding = discard.matcher(found);
- if (discarding.find()) {
- afterDiscard = discarding.replaceAll("");
- }
+ considerMatch();
return true;
} else {
return nextWord();
}
}
+
+ void considerMatch() {
+ found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos);
+ Matcher discarding = discard.matcher(found);
+ if (discarding.find()) {
+ afterDiscard = discarding.replaceAll("");
+ }
+ }
private boolean nextWord() {
currentWordIx++;
@@ -235,8 +239,8 @@
currentWord = words[currentWordIx].toCharArray();
termAtt.resizeTermBuffer(currentWord.length);
charPos = 0;
- found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos);
softMatcher = subWordBoundary.matcher(words[currentWordIx]);
+ considerMatch();
return true;
}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 17:44:11 UTC (rev 8253)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 18:10:14 UTC (rev 8254)
@@ -59,6 +59,14 @@
analyzer+"x-splits."+AnalyzerOptions.STOPWORDS, AnalyzerOptions.STOPWORDS_VALUE_NONE,
analyzer+"x-splits."+AnalyzerOptions.WORD_BOUNDARY, " ",
analyzer+"x-splits."+AnalyzerOptions.SUB_WORD_BOUNDARY, "(?<!\\p{L}|\\p{N})(?=\\p{L}|\\p{N})|(?<!\\p{Lu})(?=\\p{Lu})|(?<=\\p{N})(?=\\p{L})",
+ analyzer+"x-hyphen."+AnalyzerOptions.SUB_WORD_BOUNDARY, "[-.]",
+ analyzer+"x-hyphen."+AnalyzerOptions.SOFT_HYPHENS, "-",
+ analyzer+"x-hyphen."+AnalyzerOptions.WORD_BOUNDARY, " ",
+ analyzer+"x-hyphen."+AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS, "false",
+ analyzer+"x-hyphen2."+AnalyzerOptions.SUB_WORD_BOUNDARY, "[-.]",
+ analyzer+"x-hyphen2."+AnalyzerOptions.SOFT_HYPHENS, "-",
+ analyzer+"x-hyphen2."+AnalyzerOptions.WORD_BOUNDARY, " ",
+ analyzer+"x-hyphen2."+AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS, "true",
analyzer+"x-keywords."+AnalyzerOptions.ANALYZER_CLASS, KeywordAnalyzer.class.getName(),
analyzer+"ru-x-de."+AnalyzerOptions.ANALYZER_CLASS, RussianAnalyzer.class.getName(),
analyzer+"ru-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(),
@@ -190,5 +198,21 @@
);
}
+ public void testSyapseExample8() throws IOException {
+ comparisonTest("x-hyphen",
+ true,
+ "\u00b1-ACE3.1.1 ab-bc.cd-de",
+ "\u00b1ACE3.1.1 \u00b1-ACE3.1.1 ACE3.1.1 1.1 1 abbc.cdde ab-bc.cd-de bc.cdde bc.cd-de cdde cd-de de"
+ );
+
+ }
+ public void testSyapseExample9() throws IOException {
+ comparisonTest("x-hyphen2",
+ true,
+ "\u00b1-ACE3.1.1 ab-bc.cd-de",
+ "\u00b1ACE3.1.1 ACE3.1.1 1.1 1 abbc.cdde bc.cdde cdde de"
+ );
+
+ }
}
|
|
From: <jer...@us...> - 2014-05-09 19:07:05
|
Revision: 8255
http://sourceforge.net/p/bigdata/code/8255
Author: jeremy_carroll
Date: 2014-05-09 19:07:02 +0000 (Fri, 09 May 2014)
Log Message:
-----------
minor polishing, a few more tests
Modified Paths:
--------------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 18:10:14 UTC (rev 8254)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java 2014-05-09 19:07:02 UTC (rev 8255)
@@ -144,11 +144,10 @@
*
* - the subword boundaries are identified in {@link #next()}
* We then set up {@link #found} to contain the most
- * recently found subword, with afterDiscard containing
- * the same word as found with the {@link #discard} pattern
- * applied. {@link #afterDiscard} is not equal to found; if there
- * is nothing to discard then it is null.
+ * recently found subword.
*
+ * - the soft hyphen discarding is processed in {@link #maybeDiscardHyphens()}
+ *
* - if we are not {@link #alwaysDiscard}ing then {@link #afterDiscard}
* can be set to null to return the non-discarded version on the next cycle.
*
@@ -216,14 +215,14 @@
afterDiscard = null;
if (charPos + 1 < currentWord.length && softMatcher.find(charPos+1)) {
charPos = softMatcher.end();
- considerMatch();
+ maybeDiscardHyphens();
return true;
} else {
return nextWord();
}
}
- void considerMatch() {
+ void maybeDiscardHyphens() {
found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos);
Matcher discarding = discard.matcher(found);
if (discarding.find()) {
@@ -240,7 +239,7 @@
termAtt.resizeTermBuffer(currentWord.length);
charPos = 0;
softMatcher = subWordBoundary.matcher(words[currentWordIx]);
- considerMatch();
+ maybeDiscardHyphens();
return true;
}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 18:10:14 UTC (rev 8254)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 19:07:02 UTC (rev 8255)
@@ -102,13 +102,13 @@
return getNdx().getAnalyzer(lang, filterStopWords);
}
- protected void comparisonTest(String lang, boolean stopWordsSignificant, String text, String spaceSeparated)
+ protected void comparisonTest(String lang, boolean filterStopWords, String text, String spaceSeparated)
throws IOException {
if (spaceSeparated == null) {
- String rslt = getTokenStream(getAnalyzer(lang, stopWordsSignificant), text);
+ String rslt = getTokenStream(getAnalyzer(lang, filterStopWords), text);
throw new RuntimeException("Got \"" + rslt+ "\"");
}
- compareTokenStream(getAnalyzer(lang, stopWordsSignificant), text,
+ compareTokenStream(getAnalyzer(lang, filterStopWords), text,
split(spaceSeparated)); //$NON-NLS-1$
}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 18:10:14 UTC (rev 8254)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 19:07:02 UTC (rev 8255)
@@ -35,6 +35,16 @@
import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions;
+/**
+ * Unit tests for {@link ConfigurableAnalyzerFactory}.
+ * We use the same setup, defined in {@link #getExtraProperties()},
+ * for all the tests. Some of the tests check whether bad combinations
+ * of options are detected and reported correctly.
+ * Others check that some input, in a particular language, is
+ * tokenized as expected.
+ * @author jeremycarroll
+ *
+ */
public class TestConfigurableAnalyzerFactory extends AbstractSearchTest {
public TestConfigurableAnalyzerFactory() {
@@ -68,8 +78,8 @@
analyzer+"x-hyphen2."+AnalyzerOptions.WORD_BOUNDARY, " ",
analyzer+"x-hyphen2."+AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS, "true",
analyzer+"x-keywords."+AnalyzerOptions.ANALYZER_CLASS, KeywordAnalyzer.class.getName(),
- analyzer+"ru-x-de."+AnalyzerOptions.ANALYZER_CLASS, RussianAnalyzer.class.getName(),
- analyzer+"ru-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(),
+ analyzer+"en-x-de."+AnalyzerOptions.ANALYZER_CLASS, StandardAnalyzer.class.getName(),
+ analyzer+"en-x-de."+AnalyzerOptions.STOPWORDS, GermanAnalyzer.class.getName(),
};
}
@@ -142,6 +152,25 @@
);
}
+
+ public void testStopWordSwitch() throws IOException {
+ // en-x-de is an English Analyzer using german stopwords!
+ comparisonTest("en-x-de",
+ true,
+ "The fast car arrived slowly.",
+ "the fast car arrived slowly"
+ );
+ comparisonTest("en-x-de",
+ true,
+ "The fast car die arrived slowly.",
+ "the fast car arrived slowly"
+ );
+ comparisonTest("en-x-de",
+ false,
+ "The fast car die arrived slowly.",
+ "the fast car die arrived slowly"
+ );
+ }
public void testSyapseExample1() throws IOException {
comparisonTest("x-splits",
true,
|
|
From: <jer...@us...> - 2014-05-09 22:39:13
|
Revision: 8257
http://sourceforge.net/p/bigdata/code/8257
Author: jeremy_carroll
Date: 2014-05-09 22:39:10 +0000 (Fri, 09 May 2014)
Log Message:
-----------
Added extra test to check that by default we use StandardAnalyzer for everything; refactored a bit
Modified Paths:
--------------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java
Added Paths:
-----------
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractDefaultAnalyzerFactoryTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 19:07:09 UTC (rev 8256)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -366,7 +366,7 @@
}
- private static final String DEFAULT_PROPERTIES =
+ private static final String ALL_LUCENE_NATURAL_LANGUAGES =
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.*.like=eng\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.por.analyzerClass=org.apache.lucene.analysis.br.BrazilianAnalyzer\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.pt.like=por\n" +
@@ -396,6 +396,9 @@
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.eng.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.en.like=eng\n";
+ private static final String LUCENE_STANDARD_ANALYZER =
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.*.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer\n";
+
private static class AnalyzerPair implements Comparable<AnalyzerPair>{
private final LanguageRange range;
private final Analyzer withStopWords;
@@ -703,6 +706,7 @@
* strategy so the code will still work on the {@link #MAX_LANG_CACHE_SIZE}+1 th entry.
*/
private static final int MAX_LANG_CACHE_SIZE = 500;
+
private String defaultLanguage;
private final FullTextIndex<?> fullTextIndex;
@@ -833,25 +837,20 @@
protected Properties initProperties() {
final Properties parentProperties = fullTextIndex.getProperties();
Properties myProps;
- if (Boolean.getBoolean(parentProperties.getProperty(Options.NATURAL_LANGUAGE_SUPPORT, Options.DEFAULT_NATURAL_LAMGUAGE_SUPPORT))) {
- myProps = defaultProperties();
+ if (Boolean.valueOf(parentProperties.getProperty(Options.NATURAL_LANGUAGE_SUPPORT, Options.DEFAULT_NATURAL_LAMGUAGE_SUPPORT))) {
+ myProps = loadPropertyString(ALL_LUCENE_NATURAL_LANGUAGES);
} else {
- myProps = new Properties();
+ myProps = loadPropertyString(LUCENE_STANDARD_ANALYZER);
}
copyRelevantProperties(fullTextIndex.getProperties(), myProps);
-
- if (myProps.isEmpty()) {
- return defaultProperties();
- } else {
- return myProps;
- }
+ return myProps;
}
- protected Properties defaultProperties() {
+ Properties loadPropertyString(String props) {
Properties rslt = new Properties();
try {
- rslt.load(new StringReader(DEFAULT_PROPERTIES));
+ rslt.load(new StringReader(props));
} catch (IOException e) {
throw new RuntimeException("Impossible - well clearly not!", e);
}
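A sketch only (not part of this commit) of how a caller would opt in to the full per-language analyzer set now that the default is a single StandardAnalyzer mapping. NATURAL_LANGUAGE_SUPPORT is referenced through the Options constant because its literal property name is not shown in this diff; the Properties are the ones the enclosing FullTextIndex exposes via getProperties():

    Properties props = new Properties();
    props.setProperty(FullTextIndex.Options.ANALYZER_FACTORY_CLASS,
            ConfigurableAnalyzerFactory.class.getName());
    // Without this, every language range now falls back to the StandardAnalyzer
    // (LUCENE_STANDARD_ANALYZER above).
    props.setProperty(ConfigurableAnalyzerFactory.Options.NATURAL_LANGUAGE_SUPPORT, "true");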
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 19:07:09 UTC (rev 8256)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -1,153 +1,20 @@
-/**
-
-Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved.
-
-Contact:
- SYSTAP, LLC
- 4501 Tower Road
- Greensboro, NC 27410
- lic...@bi...
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-*/
-/*
- * Created on May 7, 2014
- */
package com.bigdata.search;
-import java.io.IOException;
-
-
public abstract class AbstractAnalyzerFactoryTest extends AbstractSearchTest {
- public AbstractAnalyzerFactoryTest() {
+ public AbstractAnalyzerFactoryTest() {
}
-
- public AbstractAnalyzerFactoryTest(String arg0) {
- super(arg0);
+
+ public AbstractAnalyzerFactoryTest(String arg0) {
+ super(arg0);
}
-
- @Override
- public void setUp() throws Exception {
- super.setUp();
- init(getExtraProperties());
- }
-
-
- abstract String[] getExtraProperties();
-
- public void testEnglishFilterStopWords() throws IOException {
- for (String lang: new String[]{ "eng", null, "" }) { //$NON-NLS-1$ //$NON-NLS-2$
- comparisonTest(lang,
- true,
- "The test to end all tests! Forever.", //$NON-NLS-1$
- "test end all tests forever" //$NON-NLS-1$
- );
- }
- }
- public void testEnglishNoFilter() throws IOException {
- for (String lang: new String[]{ "eng", null, "" }) { //$NON-NLS-1$ //$NON-NLS-2$
- comparisonTest(lang,
- false,
- "The test to end all tests! Forever.", //$NON-NLS-1$
- "the test to end all tests forever" //$NON-NLS-1$
- );
- }
- }
-
- // Note we careful use a three letter language code for german.
- // 'de' is more standard, but the DefaultAnalyzerFactory does not
- // implement 'de' correctly.
- public void testGermanFilterStopWords() throws IOException {
- comparisonTest("ger", //$NON-NLS-1$
- true,
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.10") + //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.11"), //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.12") //$NON-NLS-1$
- );
-
- }
- // Note we carefully use a three-letter language code for Russian.
- // 'ru' is more standard, but the DefaultAnalyzerFactory does not
- // implement 'ru' correctly.
- public void testRussianFilterStopWords() throws IOException {
- comparisonTest("rus", //$NON-NLS-1$
- true,
- // I hope this is not offensive text.
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.14") + //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.15"), //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.16") //$NON-NLS-1$
- );
-
- }
- public void testGermanNoStopWords() throws IOException {
- comparisonTest("ger", //$NON-NLS-1$
- false,
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.18") + //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.19"), //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.20") //$NON-NLS-1$
- );
-
- }
- public void testRussianNoStopWords() throws IOException {
- comparisonTest("rus", //$NON-NLS-1$
- false,
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.22") + //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.23"), //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.24") //$NON-NLS-1$
- );
-
- }
- public void testJapanese() throws IOException {
- for (boolean filterStopWords: new Boolean[]{true, false}) {
- comparisonTest("jpn", //$NON-NLS-1$
- filterStopWords,
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.26"), //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.27") + //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.28") + //$NON-NLS-1$
- NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.29")); //$NON-NLS-1$
- }
- }
- public void testConfiguredLanguages() {
- checkConfig("BrazilianAnalyzer", "por", "pt"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
- checkConfig("ChineseAnalyzer", "zho", "chi", "zh"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
- checkConfig("CJKAnalyzer", "jpn", "ja", "kor", "ko"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$
- checkConfig("CzechAnalyzer", "ces", "cze", "cs"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
- checkConfig("DutchAnalyzer", "dut", "nld", "nl"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
- checkConfig("GermanAnalyzer", "deu", "ger", "de"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
- checkConfig("GreekAnalyzer", "gre", "ell", "el"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
- checkConfig("RussianAnalyzer", "rus", "ru"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
- checkConfig("ThaiAnalyzer", "th", "tha"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
- checkConfig("StandardAnalyzer", "en", "eng", "", null); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
- }
-
- private void checkConfig(String classname, String ...langs) {
- checkConfig(isBroken(), classname, langs);
-
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ init(getExtraProperties());
}
- protected void checkConfig(boolean threeLetterOnly, String classname, String ...langs) {
- for (String lang:langs) {
- // The DefaultAnalyzerFactory only works for language tags of length exactly three.
- if ((!threeLetterOnly) || (lang != null && lang.length()==3))
- {
- assertEquals(classname, getAnalyzer(lang,true).getClass().getSimpleName());
- if (!threeLetterOnly) assertEquals(classname, getAnalyzer(lang+"-x-foobar",true).getClass().getSimpleName()); //$NON-NLS-1$
- }
- }
-
- }
- abstract boolean isBroken() ;
+ abstract String[] getExtraProperties();
+
}
Copied: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractDefaultAnalyzerFactoryTest.java (from rev 8256, branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java)
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractDefaultAnalyzerFactoryTest.java (rev 0)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractDefaultAnalyzerFactoryTest.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -0,0 +1,133 @@
+/**
+
+Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved.
+
+Contact:
+ SYSTAP, LLC
+ 4501 Tower Road
+ Greensboro, NC 27410
+ lic...@bi...
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+/*
+ * Created on May 7, 2014
+ */
+package com.bigdata.search;
+
+import java.io.IOException;
+
+
+public abstract class AbstractDefaultAnalyzerFactoryTest extends AbstractAnalyzerFactoryTest {
+
+ public AbstractDefaultAnalyzerFactoryTest() {
+ }
+
+ public AbstractDefaultAnalyzerFactoryTest(String arg0) {
+ super(arg0);
+ }
+
+ public void testEnglishFilterStopWords() throws IOException {
+ for (String lang: new String[]{ "eng", null, "" }) { //$NON-NLS-1$ //$NON-NLS-2$
+ comparisonTest(lang,
+ true,
+ "The test to end all tests! Forever.", //$NON-NLS-1$
+ "test end all tests forever" //$NON-NLS-1$
+ );
+ }
+ }
+ public void testEnglishNoFilter() throws IOException {
+ for (String lang: new String[]{ "eng", null, "" }) { //$NON-NLS-1$ //$NON-NLS-2$
+ comparisonTest(lang,
+ false,
+ "The test to end all tests! Forever.", //$NON-NLS-1$
+ "the test to end all tests forever" //$NON-NLS-1$
+ );
+ }
+ }
+
+ // Note we carefully use a three-letter language code for German.
+ // 'de' is more standard, but the DefaultAnalyzerFactory does not
+ // implement 'de' correctly.
+ public void testGermanFilterStopWords() throws IOException {
+ comparisonTest("ger", //$NON-NLS-1$
+ true,
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.10") + //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.11"), //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.12") //$NON-NLS-1$
+ );
+
+ }
+
+ // Note we carefully use a three-letter language code for Russian.
+ // 'ru' is more standard, but the DefaultAnalyzerFactory does not
+ // implement 'ru' correctly.
+ public void testRussianFilterStopWords() throws IOException {
+ comparisonTest("rus", //$NON-NLS-1$
+ true,
+ // I hope this is not offensive text.
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.14") + //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.15"), //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.16") //$NON-NLS-1$
+ );
+
+ }
+ public void testGermanNoStopWords() throws IOException {
+ comparisonTest("ger", //$NON-NLS-1$
+ false,
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.18") + //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.19"), //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.20") //$NON-NLS-1$
+ );
+
+ }
+ public void testRussianNoStopWords() throws IOException {
+ comparisonTest("rus", //$NON-NLS-1$
+ false,
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.22") + //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.23"), //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.24") //$NON-NLS-1$
+ );
+
+ }
+ public void testJapanese() throws IOException {
+ for (boolean filterStopWords: new Boolean[]{true, false}) {
+ comparisonTest("jpn", //$NON-NLS-1$
+ filterStopWords,
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.26"), //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.27") + //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.28") + //$NON-NLS-1$
+ NonEnglishExamples.getString("AbstractAnalyzerFactoryTest.29")); //$NON-NLS-1$
+ }
+ }
+ public void testConfiguredLanguages() {
+ checkConfig("BrazilianAnalyzer", "por", "pt"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
+ checkConfig("ChineseAnalyzer", "zho", "chi", "zh"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+ checkConfig("CJKAnalyzer", "jpn", "ja", "kor", "ko"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$
+ checkConfig("CzechAnalyzer", "ces", "cze", "cs"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+ checkConfig("DutchAnalyzer", "dut", "nld", "nl"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+ checkConfig("GermanAnalyzer", "deu", "ger", "de"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+ checkConfig("GreekAnalyzer", "gre", "ell", "el"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+ checkConfig("RussianAnalyzer", "rus", "ru"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
+ checkConfig("ThaiAnalyzer", "th", "tha"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
+ checkConfig("StandardAnalyzer", "en", "eng", "", null); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+ }
+
+ @Override
+ protected void checkConfig(String classname, String ...langs) {
+ checkConfig(isBroken(), classname, langs);
+
+ }
+ abstract boolean isBroken() ;
+}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 19:07:09 UTC (rev 8256)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractSearchTest.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -135,13 +135,28 @@
private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException {
TokenStream s = a.tokenStream(null, new StringReader(text));
int ix = 0;
- while (s.incrementToken()) {
- final TermAttribute term = s.getAttribute(TermAttribute.class);
- final String word = term.term();
- assertTrue(ix < expected.length);
- assertEquals(expected[ix++], word);
- }
- assertEquals(ix, expected.length);
+ while (s.incrementToken()) {
+ final TermAttribute term = s.getAttribute(TermAttribute.class);
+ final String word = term.term();
+ assertTrue(ix < expected.length);
+ assertEquals(expected[ix++], word);
+ }
+ assertEquals(ix, expected.length);
}
+ protected void checkConfig(boolean threeLetterOnly, String classname, String ...langs) {
+ for (String lang:langs) {
+ // The DefaultAnalyzerFactory only works for language tags of length exactly three.
+ if ((!threeLetterOnly) || (lang != null && lang.length()==3)) {
+ assertEquals(classname, getAnalyzer(lang,true).getClass().getSimpleName());
+ if (!threeLetterOnly) {
+ assertEquals(classname, getAnalyzer(lang+"-x-foobar",true).getClass().getSimpleName()); //$NON-NLS-1$
+ }
+ }
+ }
+ }
+ protected void checkConfig(String classname, String ...langs) {
+ checkConfig(false, classname, langs);
+ }
+
}
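
The compareTokenStream helper above uses the pre-Lucene-4 attribute API (incrementToken plus TermAttribute, which later Lucene versions removed). A self-contained sketch of the same pattern, assuming the Lucene version these tests build against; with StandardAnalyzer it should print roughly the tokens expected by testEnglishFilterStopWords:

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    public class TokenStreamSketch {

        // Collect the terms an Analyzer produces for a piece of text, using the
        // same incrementToken()/TermAttribute pattern as compareTokenStream.
        static List<String> tokens(Analyzer a, String text) throws IOException {
            final TokenStream s = a.tokenStream(null, new StringReader(text));
            final List<String> words = new ArrayList<String>();
            while (s.incrementToken()) {
                words.add(s.getAttribute(TermAttribute.class).term());
            }
            return words;
        }

        public static void main(String[] args) throws IOException {
            // Expected output (stop words removed, lower-cased):
            // [test, end, all, tests, forever]
            System.out.println(tokens(new StandardAnalyzer(Version.LUCENE_CURRENT),
                    "The test to end all tests! Forever."));
        }
    }
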
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java 2014-05-09 19:07:09 UTC (rev 8256)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestAll.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -115,6 +115,7 @@
// behavior of DefaultAnalyzerFactory
suite.addTestSuite(TestConfigurableAsDefaultAnalyzerFactory.class);
suite.addTestSuite(TestConfigurableAnalyzerFactory.class);
+ suite.addTestSuite(TestUnconfiguredAnalyzerFactory.class);
return suite;
}
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 19:07:09 UTC (rev 8256)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -45,7 +45,7 @@
* @author jeremycarroll
*
*/
-public class TestConfigurableAnalyzerFactory extends AbstractSearchTest {
+public class TestConfigurableAnalyzerFactory extends AbstractAnalyzerFactoryTest {
public TestConfigurableAnalyzerFactory() {
}
@@ -54,12 +54,8 @@
super(arg0);
}
- public void setUp() throws Exception {
- super.setUp();
- init(getExtraProperties());
- }
-
- private String[] getExtraProperties() {
+ @Override
+ String[] getExtraProperties() {
String analyzer = ConfigurableAnalyzerFactory.Options.ANALYZER;
return new String[]{
FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(),
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java 2014-05-09 19:07:09 UTC (rev 8256)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAsDefaultAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -26,7 +26,7 @@
*/
package com.bigdata.search;
-public class TestConfigurableAsDefaultAnalyzerFactory extends AbstractAnalyzerFactoryTest {
+public class TestConfigurableAsDefaultAnalyzerFactory extends AbstractDefaultAnalyzerFactoryTest {
public TestConfigurableAsDefaultAnalyzerFactory() {
}
@@ -37,7 +37,9 @@
@Override
String[] getExtraProperties() {
- return new String[]{FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName()};
+ return new String[]{FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(),
+ ConfigurableAnalyzerFactory.Options.NATURAL_LANGUAGE_SUPPORT, "true"
+ };
}
@Override
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java 2014-05-09 19:07:09 UTC (rev 8256)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestDefaultAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -26,7 +26,7 @@
*/
package com.bigdata.search;
-public class TestDefaultAnalyzerFactory extends AbstractAnalyzerFactoryTest {
+public class TestDefaultAnalyzerFactory extends AbstractDefaultAnalyzerFactoryTest {
public TestDefaultAnalyzerFactory() {
}
Added: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java (rev 0)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257)
@@ -0,0 +1,24 @@
+package com.bigdata.search;
+
+public class TestUnconfiguredAnalyzerFactory extends AbstractAnalyzerFactoryTest {
+
+ public TestUnconfiguredAnalyzerFactory() {
+ }
+
+ public TestUnconfiguredAnalyzerFactory(String arg0) {
+ super(arg0);
+ }
+
+ @Override
+ String[] getExtraProperties() {
+ return new String[]{
+ FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(),
+ };
+ }
+
+ public void testConfiguredLanguages() {
+ checkConfig("StandardAnalyzer", "por", "pt", "zho", "chi", "zh", "jpn", "ja", "kor", "ko", "ces", "cze", "cs", "dut", "nld", "nl",
+ "deu", "ger", "de", "gre", "ell", "el", "rus", "ru", "th", "tha", "en", "eng", "", null);
+ }
+
+}
|
|
From: <jer...@us...> - 2014-05-09 22:39:23
|
Revision: 8258
http://sourceforge.net/p/bigdata/code/8258
Author: jeremy_carroll
Date: 2014-05-09 22:39:19 +0000 (Fri, 09 May 2014)
Log Message:
-----------
Documentation and formatting etc.
Modified Paths:
--------------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 22:39:19 UTC (rev 8258)
@@ -95,8 +95,10 @@
* <p>
* Other properties, from {@link AnalyzerOptions} start with
* <code>c.b.s.C.analyzer.<em>language-range</em></code> where <code><em>language-range</em></code> conforms
- * with the extended language range construct from RFC 4647, section 2.2. These are used to specify
- * an analyzer for the given language range.
+ * with the extended language range construct from RFC 4647, section 2.2.
+ * Note that bigdata does not allow '*' in property names, so the character '_' is used in place of '*'
+ * when an extended language range appears in a property name.
+ * These are used to specify an analyzer for the given language range.
* <p>
* If no analyzer is specified for the language range <code>*</code> then the {@link StandardAnalyzer} is used.
* <p>
@@ -113,6 +115,8 @@
* <dd>This uses whitespace to tokenize</dd>
* <dt>{@link PatternAnalyzer}</dt>
* <dd>This uses a regular expression to tokenize</dd>
+ * <dt>{@link TermCompletionAnalyzer}</dt>
+ * <dd>This uses up to three regular expressions to specify multiple tokens for each word, to address term completion use cases.</dd>
* <dt>{@link EmptyAnalyzer}</dt>
* <dd>This suppresses the functionality, by treating every expression as a stop word.</dd>
* </dl>
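
Concretely, an analyzer is chosen per language range through ordinary properties, and because bigdata property names cannot contain '*', the character '_' stands in for the catch-all range. The sketch below mirrors the ranges exercised by TestConfigurableAnalyzerFactory later in this message; x-empty, x-terms and x-splits are private-use tags invented for the tests, and EmptyAnalyzer is assumed to live in com.bigdata.search:

    import java.util.Properties;

    public class ExampleAnalyzerConfig {

        // The prefix every ConfigurableAnalyzerFactory analyzer option starts with.
        private static final String PREFIX =
                "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.";

        static Properties exampleConfig() {
            final Properties p = new Properties();
            // '_' stands in for the '*' language range: anything not matched below
            // is treated like the x-empty configuration.
            p.setProperty(PREFIX + "_.like", "x-empty");
            p.setProperty(PREFIX + "x-empty.analyzerClass",
                    "com.bigdata.search.EmptyAnalyzer");
            // Giving a pattern implicitly selects the PatternAnalyzer for this range.
            p.setProperty(PREFIX + "x-terms.pattern", "\\W+");
            p.setProperty(PREFIX + "x-splits.analyzerClass",
                    "com.bigdata.search.TermCompletionAnalyzer");
            return p;
        }

        public static void main(String[] args) {
            exampleConfig().list(System.out);
        }
    }
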
@@ -126,11 +130,26 @@
public class ConfigurableAnalyzerFactory implements IAnalyzerFactory {
final private static transient Logger log = Logger.getLogger(ConfigurableAnalyzerFactory.class);
- static class LanguageRange implements Comparable<LanguageRange> {
+ /**
+ * This is an implementation of an RFC 4647 language range,
+ * targeted at the context of bigdata, and only
+ * supporting the extended filtering specified in section 3.3.2.
+ * <p>
+ * Language ranges are comparable so that
+ * sorting an array and then matching a language tag against each
+ * member of the array in sequence will give the longest match,
+ * i.e. the longer ranges come first.
+ * @author jeremycarroll
+ *
+ */
+ public static class LanguageRange implements Comparable<LanguageRange> {
private final String range[];
private final String full;
-
+ /**
+ * Note: the range must be in lower case; this is not verified.
+ * @param range
+ */
public LanguageRange(String range) {
this.range = range.split("-");
full = range;
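
The ordering property claimed here can be illustrated with a toy comparator; this is not LanguageRange#compareTo itself (which also has to break ties), it only shows why putting longer ranges first lets a sequential scan return the longest match:

    import java.util.Arrays;
    import java.util.Comparator;

    public class RangeOrderingSketch {
        public static void main(String[] args) {
            final String[] ranges = { "en", "*", "en-us", "zh-*-hant" };
            // Order by number of subtags, descending, so more specific ranges
            // are examined before shorter, more general ones.
            Arrays.sort(ranges, new Comparator<String>() {
                public int compare(String a, String b) {
                    return b.split("-").length - a.split("-").length;
                }
            });
            // Prints: [zh-*-hant, en-us, en, *]
            System.out.println(Arrays.toString(ranges));
        }
    }
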
@@ -174,12 +193,22 @@
return full.hashCode();
}
+ /**
+ * This implements the algorithm of section 3.3.2 of RFC 4647
+ * as modified with the observation about private use tags
+ * in <a href="http://lists.w3.org/Archives/Public/www-international/2014AprJun/0084">
+ * this message</a>.
+ *
+ *
+ * @param langTag The RFC 5646 Language tag in lower case
+ * @return The result of the algorithm
+ */
public boolean extendedFilterMatch(String langTag) {
return extendedFilterMatch(langTag.toLowerCase(Locale.ROOT).split("-"));
}
// See RFC 4647, 3.3.2
- public boolean extendedFilterMatch(String[] language) {
+ boolean extendedFilterMatch(String[] language) {
// RFC 4647 step 2
if (!matchSubTag(language[0], range[0])) {
return false;
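
For readers unfamiliar with RFC 4647, a self-contained sketch of the plain section 3.3.2 algorithm follows (without the private-use-tag modification referenced above; both range and tag are assumed to be lower case and '-'-separated):

    public class ExtendedFilterSketch {

        // RFC 4647, section 3.3.2: does the extended language range match the tag?
        static boolean extendedFilterMatch(String range, String langTag) {
            final String[] r = range.split("-");
            final String[] t = langTag.split("-");
            // Step 2: the first subtags must match, unless the range starts with '*'.
            if (!r[0].equals("*") && !r[0].equals(t[0])) {
                return false;
            }
            int ri = 1, ti = 1;
            // Step 3: try to account for every remaining range subtag.
            while (ri < r.length) {
                if (r[ri].equals("*")) {
                    ri++;              // a wildcard matches zero or more tag subtags
                } else if (ti >= t.length) {
                    return false;      // range subtags left over, nothing to match them
                } else if (r[ri].equals(t[ti])) {
                    ri++; ti++;        // subtags match, advance both
                } else if (t[ti].length() == 1) {
                    return false;      // reached a singleton (e.g. 'x') without a match
                } else {
                    ti++;              // skip this tag subtag and retry
                }
            }
            return true;               // step 4: all range subtags accounted for
        }

        public static void main(String[] args) {
            System.out.println(extendedFilterMatch("de-*-de", "de-latn-de")); // true
            System.out.println(extendedFilterMatch("de-*-de", "de-deva"));    // false
            System.out.println(extendedFilterMatch("*", "ja"));               // true
        }
    }
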
@@ -227,13 +256,14 @@
*/
public interface Options {
/**
- * By setting this option to true, then the behavior of the legacy {@link DefaultAnalyzerFactory}
- * is added, and may be overridden by the settings of the user.
+ * When this option is set to true, all the known Lucene analyzers for natural
+ * languages are used for a range of language tags.
+ * These settings may then be overridden by the user's own settings.
* Specifically the following properties are loaded, prior to loading the
* user's specification (with <code>c.b.s.C</code> expanding to
* <code>com.bigdata.search.ConfigurableAnalyzerFactory</code>)
<pre>
-c.b.s.C.analyzer.*.like=eng
+c.b.s.C.analyzer._.like=eng
c.b.s.C.analyzer.por.analyzerClass=org.apache.lucene.analysis.br.BrazilianAnalyzer
c.b.s.C.analyzer.pt.like=por
c.b.s.C.analyzer.zho.analyzerClass=org.apache.lucene.analysis.cn.ChineseAnalyzer
@@ -281,7 +311,9 @@
/**
* If specified this is the fully qualified name of a subclass of {@link Analyzer}
* that has appropriate constructors.
- * Either this or {@link #LIKE} or {@link #PATTERN} must be specified for each language range.
+ * This is set implicitly if some of the options below are selected (for example {@link #PATTERN}).
+ * For each configured language range, if it is not set, either explicitly or implicitly, then
+ * {@link #LIKE} must be specified.
*/
String ANALYZER_CLASS = "analyzerClass";
@@ -399,24 +431,64 @@
private static final String LUCENE_STANDARD_ANALYZER =
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.*.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer\n";
+ /**
+ * This comment describes the implementation of {@link ConfigurableAnalyzerFactory}.
+ * The only method in the interface is {@link ConfigurableAnalyzerFactory#getAnalyzer(String, boolean)};
+ * a map from language tag to {@link AnalyzerPair} is used, where the pair contains
+ * an {@link Analyzer} both with and without stopwords configured (sometimes these two analyzers are identical,
+ * if, for example, stop words are not supported or not required).
+ * <p>
+ * If there is no entry for the language tag in the map {@link ConfigurableAnalyzerFactory#langTag2AnalyzerPair},
+ * then one is created by walking down the array {@link ConfigurableAnalyzerFactory#config} of AnalyzerPairs
+ * until a matching one is found.
+ * <p>
+ * The bulk of the code in this class is invoked from the constructor in order to set up this
+ * {@link ConfigurableAnalyzerFactory#config} array. For example, the subclasses of {@link AnalyzerPair}
+ * exist simply to call the appropriate constructor in the appropriate way: the difficulty is that many subclasses
+ * of {@link Analyzer} have constructors with different signatures, and our code needs to handle each variant.
+ * @author jeremycarroll
+ *
+ */
private static class AnalyzerPair implements Comparable<AnalyzerPair>{
- private final LanguageRange range;
+ final LanguageRange range;
private final Analyzer withStopWords;
private final Analyzer withoutStopWords;
+ public Analyzer getAnalyzer(boolean filterStopwords) {
+ return filterStopwords ? withStopWords : withoutStopWords;
+ }
+
+ public boolean extendedFilterMatch(String[] language) {
+ return range.extendedFilterMatch(language);
+ }
+
AnalyzerPair(String range, Analyzer withStopWords, Analyzer withOutStopWords) {
this.range = new LanguageRange(range);
this.withStopWords = withStopWords;
this.withoutStopWords = withOutStopWords;
}
+ /**
+ * This clone constructor implements {@link AnalyzerOptions#LIKE}.
+ * @param range
+ * @param copyMe
+ */
AnalyzerPair(String range, AnalyzerPair copyMe) {
this.range = new LanguageRange(range);
this.withStopWords = copyMe.withStopWords;
this.withoutStopWords = copyMe.withoutStopWords;
-
}
+ /**
+ * If we have a constructor with arguments including a populated
+ * stop word set, then we can use it to make both the withStopWords
+ * analyzer and the withoutStopWords analyzer.
+ * @param range
+ * @param cons A Constructor including a {@link java.util.Set} argument
+ * for the stop words.
+ * @param params The arguments to pass to the constructor including a populated stopword set.
+ * @throws Exception
+ */
AnalyzerPair(String range, Constructor<? extends Analyzer> cons, Object ... params) throws Exception {
this(range, cons.newInstance(params), cons.newInstance(useEmptyStopWordSet(params)));
}
@@ -435,9 +507,6 @@
return rslt;
}
- public Analyzer getAnalyzer(boolean filterStopwords) {
- return filterStopwords ? withStopWords : withoutStopWords;
- }
@Override
public String toString() {
return range.full + "=(" + withStopWords.getClass().getSimpleName() +")";
@@ -447,30 +516,38 @@
public int compareTo(AnalyzerPair o) {
return range.compareTo(o.range);
}
-
- public boolean extendedFilterMatch(String[] language) {
- return range.extendedFilterMatch(language);
- }
}
+ /**
+ * Used for Analyzer classes with a constructor with signature (Version, Set).
+ * @author jeremycarroll
+ *
+ */
private static class VersionSetAnalyzerPair extends AnalyzerPair {
public VersionSetAnalyzerPair(ConfigOptionsToAnalyzer lro,
Class<? extends Analyzer> cls) throws Exception {
super(lro.languageRange, getConstructor(cls, Version.class, Set.class), Version.LUCENE_CURRENT, lro.getStopWords());
}
}
-
+
+ /**
+ * Used for Analyzer classes which do not support stopwords and have a constructor with signature (Version).
+ * @author jeremycarroll
+ *
+ */
private static class VersionAnalyzerPair extends AnalyzerPair {
-
public VersionAnalyzerPair(String range, Class<? extends Analyzer> cls) throws Exception {
super(range, getConstructor(cls, Version.class).newInstance(Version.LUCENE_CURRENT));
}
}
-
+ /**
+ * Special case code for {@link PatternAnalyzer}
+ * @author jeremycarroll
+ *
+ */
private static class PatternAnalyzerPair extends AnalyzerPair {
-
public PatternAnalyzerPair(ConfigOptionsToAnalyzer lro, Pattern pattern) throws Exception {
super(lro.languageRange, getConstructor(PatternAnalyzer.class,Version.class,Pattern.class,Boolean.TYPE,Set.class),
Version.LUCENE_CURRENT,
@@ -485,6 +562,16 @@
* This class is initialized with the config options, using the {@link #setProperty(String, String)}
* method, for a particular language range and works out which pair of {@link Analyzer}s
* to use for that language range.
+ * <p>
+ * Instances of this class are only alive during the execution of
+ * {@link ConfigurableAnalyzerFactory#ConfigurableAnalyzerFactory(FullTextIndex)};
+ * the life-cycle is:
+ * <ol>
+ * <li>The relevant config properties are applied, and are used to populate the fields.
+ * <li>The fields are validated.
+ * <li>An {@link AnalyzerPair} is constructed.
+ * </ol>
+ *
* @author jeremycarroll
*
*/
@@ -545,6 +632,10 @@
return ( stopwords == null && pattern == null ) || AnalyzerOptions.STOPWORDS_VALUE_DEFAULT.equals(stopwords);
}
+ /**
+ * The first step in the life-cycle, used to initialize the fields.
+ * @return true if the property was recognized.
+ */
public boolean setProperty(String shortProperty, String value) {
if (shortProperty.equals(AnalyzerOptions.LIKE) ) {
like = value;
@@ -568,6 +659,9 @@
return true;
}
+ /**
+ * The second phase of the life-cycle, used for sanity checking.
+ */
public void validate() {
if (pattern != null ) {
if ( className != null && className != PatternAnalyzer.class.getName()) {
@@ -608,6 +702,10 @@
}
+ /**
+ * The third and final phase of the life-cycle, used for identifying
+ * the AnalyzerPair.
+ */
private AnalyzerPair construct() throws Exception {
if (className == null) {
return null;
@@ -660,6 +758,29 @@
throw new RuntimeException("Bad option: cannot find constructor for class " + className + " for language range " + languageRange);
}
+ /**
+ * Also part of the third phase of the life-cycle, following the {@link AnalyzerOptions#LIKE}
+ * properties.
+ * @param depth
+ * @param max
+ * @param analyzers
+ * @return
+ */
+ AnalyzerPair followLikesToAnalyzerPair(int depth, int max,
+ Map<String, ConfigOptionsToAnalyzer> analyzers) {
+ if (result == null) {
+ if (depth == max) {
+ throw new RuntimeException("Bad configuration: - 'like' loop for language range " + languageRange);
+ }
+ ConfigOptionsToAnalyzer next = analyzers.get(like);
+ if (next == null) {
+ throw new RuntimeException("Bad option: - 'like' not found for language range " + languageRange+ " (not found: '"+ like +"')");
+ }
+ result = new AnalyzerPair(languageRange, next.followLikesToAnalyzerPair(depth+1, max, analyzers));
+ }
+ return result;
+ }
+
protected Class<? extends Analyzer> getAnalyzerClass() {
return getAnalyzerClass(className);
}
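
The 'like' resolution above can also be illustrated standalone: each range either names an analyzer class directly or is 'like' another range, and the depth bound turns a cycle into a configuration error instead of unbounded recursion. The maps and the 'br' range below are hypothetical examples; only the error messages echo the code above:

    import java.util.HashMap;
    import java.util.Map;

    public class LikeResolutionSketch {

        // Hypothetical configuration: 'por' names a class, 'pt' is like 'por',
        // and 'br' is like 'pt'.
        static final Map<String, String> analyzerClass = new HashMap<String, String>();
        static final Map<String, String> likes = new HashMap<String, String>();

        // Follow 'like' links until a range with a directly configured class is
        // reached; the depth bound (number of configured ranges) catches loops.
        static String resolve(String range, int depth, int max) {
            final String direct = analyzerClass.get(range);
            if (direct != null) {
                return direct;
            }
            if (depth == max) {
                throw new RuntimeException(
                        "Bad configuration: - 'like' loop for language range " + range);
            }
            final String like = likes.get(range);
            if (like == null) {
                throw new RuntimeException(
                        "Bad option: - 'like' not found for language range " + range);
            }
            return resolve(like, depth + 1, max);
        }

        public static void main(String[] args) {
            analyzerClass.put("por", "org.apache.lucene.analysis.br.BrazilianAnalyzer");
            likes.put("pt", "por");
            likes.put("br", "pt");
            // Prints the BrazilianAnalyzer class name, resolved via br -> pt -> por.
            System.out.println(resolve("br", 0, 3));
        }
    }
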
@@ -678,22 +799,6 @@
void setAnalyzerPair(AnalyzerPair ap) {
result = ap;
}
-
- AnalyzerPair followLikesToAnalyzerPair(int depth, int max,
- Map<String, ConfigOptionsToAnalyzer> analyzers) {
- if (result == null) {
- if (depth == max) {
- throw new RuntimeException("Bad configuration: - 'like' loop for language range " + languageRange);
- }
- ConfigOptionsToAnalyzer next = analyzers.get(like);
- if (next == null) {
- throw new RuntimeException("Bad option: - 'like' not found for language range " + languageRange+ " (not found: '"+ like +"')");
- }
- result = new AnalyzerPair(languageRange, next.followLikesToAnalyzerPair(depth+1, max, analyzers));
- }
- return result;
- }
-
}
private final AnalyzerPair config[];
@@ -712,7 +817,13 @@
private final FullTextIndex<?> fullTextIndex;
+ /**
+ * Builds a new ConfigurableAnalyzerFactory.
+ * @param fullTextIndex
+ */
public ConfigurableAnalyzerFactory(final FullTextIndex<?> fullTextIndex) {
+ // A description of the operation of this method is found on AnalyzerPair and
+ // ConfigOptionsToAnalyzer.
// despite our name, we actually make all the analyzers now, and getAnalyzer method is merely a lookup.
if (fullTextIndex == null)
@@ -837,9 +948,18 @@
protected Properties initProperties() {
final Properties parentProperties = fullTextIndex.getProperties();
Properties myProps;
- if (Boolean.valueOf(parentProperties.getProperty(Options.NATURAL_LANGUAGE_SUPPORT, Options.DEFAULT_NATURAL_LAMGUAGE_SUPPORT))) {
+ if (Boolean.valueOf(parentProperties.getProperty(
+ Options.NATURAL_LANGUAGE_SUPPORT,
+ Options.DEFAULT_NATURAL_LAMGUAGE_SUPPORT))) {
+
myProps = loadPropertyString(ALL_LUCENE_NATURAL_LANGUAGES);
+
+ } else if (hasPropertiesForStarLanguageRange(parentProperties)){
+
+ myProps = new Properties();
+
} else {
+
myProps = loadPropertyString(LUCENE_STANDARD_ANALYZER);
}
@@ -867,6 +987,17 @@
}
}
+ private boolean hasPropertiesForStarLanguageRange(Properties from) {
+ Enumeration<?> en = from.propertyNames();
+ while (en.hasMoreElements()) {
+ String prop = (String)en.nextElement();
+ if (prop.startsWith(Options.ANALYZER+"_.")
+ || prop.startsWith(Options.ANALYZER+"*.")) {
+ return true;
+ }
+ }
+ return false;
+ }
@Override
public Analyzer getAnalyzer(String languageCode, boolean filterStopwords) {
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 22:39:10 UTC (rev 8257)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/AbstractAnalyzerFactoryTest.java 2014-05-09 22:39:19 UTC (rev 8258)
@@ -1,3 +1,29 @@
+/**
+
+Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved.
+
+Contact:
+ SYSTAP, LLC
+ 4501 Tower Road
+ Greensboro, NC 27410
+ lic...@bi...
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+/*
+ * Created on May 9, 2014
+ */
package com.bigdata.search;
public abstract class AbstractAnalyzerFactoryTest extends AbstractSearchTest {
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java 2014-05-09 22:39:19 UTC (rev 8258)
@@ -59,7 +59,8 @@
String analyzer = ConfigurableAnalyzerFactory.Options.ANALYZER;
return new String[]{
FullTextIndex.Options.ANALYZER_FACTORY_CLASS, ConfigurableAnalyzerFactory.class.getName(),
- analyzer+"*."+AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(),
+ analyzer+"_."+AnalyzerOptions.LIKE, "x-empty",
+ analyzer+"x-empty."+AnalyzerOptions.ANALYZER_CLASS, EmptyAnalyzer.class.getName(),
analyzer+"x-terms."+AnalyzerOptions.PATTERN, "\\W+",
analyzer+"x-splits."+AnalyzerOptions.ANALYZER_CLASS, TermCompletionAnalyzer.class.getName(),
analyzer+"x-splits."+AnalyzerOptions.STOPWORDS, AnalyzerOptions.STOPWORDS_VALUE_NONE,
Modified: branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java 2014-05-09 22:39:10 UTC (rev 8257)
+++ branches/TEXT_ANALYZERS/bigdata/src/test/com/bigdata/search/TestUnconfiguredAnalyzerFactory.java 2014-05-09 22:39:19 UTC (rev 8258)
@@ -1,3 +1,29 @@
+/**
+
+Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved.
+
+Contact:
+ SYSTAP, LLC
+ 4501 Tower Road
+ Greensboro, NC 27410
+ lic...@bi...
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+/*
+ * Created on May 7, 2014
+ */
package com.bigdata.search;
public class TestUnconfiguredAnalyzerFactory extends AbstractAnalyzerFactoryTest {
|