|
From: <jer...@us...> - 2014-05-11 15:45:20
|
Revision: 8270
http://sourceforge.net/p/bigdata/code/8270
Author: jeremy_carroll
Date: 2014-05-11 15:45:17 +0000 (Sun, 11 May 2014)
Log Message:
-----------
Restructured ConfigurableAnalyzerFactory to allow lazy one-time initialization with weak caching by UUID of namespace, giving clearer lifecycle management
Modified Paths:
--------------
branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/TermCompletionAnalyzer.java
branches/BIGDATA_RELEASE_1_3_0/bigdata/src/test/com/bigdata/search/TestConfigurableAnalyzerFactory.java
branches/BIGDATA_RELEASE_1_3_0/bigdata/src/test/com/bigdata/search/TestLanguageRange.java
Added Paths:
-----------
branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/ConfiguredAnalyzerFactory.java
branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/LanguageRange.java
branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/NeedsConfiguringAnalyzerFactory.java
Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
===================================================================
--- branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-11 15:08:38 UTC (rev 8269)
+++ branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-11 15:45:17 UTC (rev 8270)
@@ -27,17 +27,9 @@
package com.bigdata.search;
import java.io.IOException;
+import java.io.Reader;
import java.io.StringReader;
-import java.lang.reflect.Constructor;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Enumeration;
-import java.util.HashMap;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Properties;
import java.util.Set;
-import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
@@ -45,15 +37,14 @@
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
-import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
-import com.bigdata.btree.keys.IKeyBuilder;
-import com.bigdata.btree.keys.KeyBuilder;
-
/**
* This class can be used with the bigdata properties file to specify
* which {@link Analyzer}s are used for which languages.
@@ -127,127 +118,6 @@
final private static transient Logger log = Logger.getLogger(ConfigurableAnalyzerFactory.class);
/**
- * This is an implementation of RFC 4647 language range,
- * targetted at the specific needs within bigdata, and only
- * supporting the extended filtering specified in section 3.3.2
- * <p>
- * Language ranges are comparable so that
- * sorting an array and then matching a language tag against each
- * member of the array in sequence will give the longest match.
- * i.e. the longer ranges come first.
- * @author jeremycarroll
- *
- */
- public static class LanguageRange implements Comparable<LanguageRange> {
-
- private final String range[];
- private final String full;
- /**
- * Note range must be in lower case, this is not verified.
- * @param range
- */
- public LanguageRange(String range) {
- this.range = range.split("-");
- full = range;
- }
-
- @Override
- public int compareTo(LanguageRange o) {
- if (equals(o)) {
- return 0;
- }
- int diff = o.range.length - range.length;
- if (diff != 0) {
- // longest first
- return diff;
- }
- if (range.length == 1) {
- // * last
- if (range[0].equals("*")) {
- return 1;
- }
- if (o.range[0].equals("*")) {
- return -1;
- }
- }
- // alphabetically
- for (int i=0; i<range.length; i++) {
- diff = range[i].compareTo(o.range[i]);
- if (diff != 0) {
- return diff;
- }
- }
- throw new RuntimeException("Impossible - supposedly");
- }
-
- @Override
- public boolean equals(Object o) {
- return (o instanceof LanguageRange) && ((LanguageRange)o).full.equals(full);
- }
- @Override
- public int hashCode() {
- return full.hashCode();
- }
-
- /**
- * This implements the algoirthm of section 3.3.2 of RFC 4647
- * as modified with the observation about private use tags
- * in <a href="http://lists.w3.org/Archives/Public/www-international/2014AprJun/0084">
- * this message</a>.
- *
- *
- * @param langTag The RFC 5646 Language tag in lower case
- * @return The result of the algorithm
- */
- public boolean extendedFilterMatch(String langTag) {
- return extendedFilterMatch(langTag.toLowerCase(Locale.ROOT).split("-"));
- }
-
- // See RFC 4647, 3.3.2
- boolean extendedFilterMatch(String[] language) {
- // RFC 4647 step 2
- if (!matchSubTag(language[0], range[0])) {
- return false;
- }
- int rPos = 1;
- int lPos = 1;
- // variant step - for private use flags
- if (language[0].equals("x") && range[0].equals("*")) {
- lPos = 0;
- }
- // RFC 4647 step 3
- while (rPos < range.length) {
- // step 3A
- if (range[rPos].equals("*")) {
- rPos ++;
- continue;
- }
- // step 3B
- if (lPos >= language.length) {
- return false;
- }
- // step 3C
- if (matchSubTag(language[lPos], range[rPos])) {
- lPos++;
- rPos++;
- continue;
- }
- if (language[lPos].length()==1) {
- return false;
- }
- lPos++;
- }
- // RFC 4647 step 4
- return true;
- }
-
- // RFC 4647, 3.3.2, step 1
- private boolean matchSubTag(String langSubTag, String rangeSubTag) {
- return langSubTag.equals(rangeSubTag) || "*".equals(rangeSubTag);
- }
-
- }
- /**
* Options understood by the {@link ConfigurableAnalyzerFactory}.
*/
public interface Options {
@@ -394,638 +264,55 @@
}
- private static final String ALL_LUCENE_NATURAL_LANGUAGES =
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.*.like=eng\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.por.analyzerClass=org.apache.lucene.analysis.br.BrazilianAnalyzer\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.pt.like=por\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.zho.analyzerClass=org.apache.lucene.analysis.cn.ChineseAnalyzer\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.chi.like=zho\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.zh.like=zho\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.jpn.analyzerClass=org.apache.lucene.analysis.cjk.CJKAnalyzer\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ja.like=jpn\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.kor.like=jpn\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ko.like=kor\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ces.analyzerClass=org.apache.lucene.analysis.cz.CzechAnalyzer\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.cze.like=ces\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.cs.like=ces\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.dut.analyzerClass=org.apache.lucene.analysis.nl.DutchAnalyzer\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.nld.like=dut\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.nl.like=dut\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.deu.analyzerClass=org.apache.lucene.analysis.de.GermanAnalyzer\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ger.like=deu\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.de.like=deu\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.gre.analyzerClass=org.apache.lucene.analysis.el.GreekAnalyzer\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ell.like=gre\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.el.like=gre\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.rus.analyzerClass=org.apache.lucene.analysis.ru.RussianAnalyzer\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ru.like=rus\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.tha.analyzerClass=org.apache.lucene.analysis.th.ThaiAnalyzer\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.th.like=tha\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.eng.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer\n" +
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.en.like=eng\n";
-
- private static final String LUCENE_STANDARD_ANALYZER =
- "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.*.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer\n";
-
- /**
- * This comment describes the implementation of {@link ConfigurableAnalyzerFactory}.
- * The only method in the interface is {@link ConfigurableAnalyzerFactory#getAnalyzer(String, boolean)},
- * a map is used from language tag to {@link AnalyzerPair}, where the pair contains
- * an {@link Analyzer} both with and without stopwords configured (some times these two analyzers are identical,
- * if, for example, stop words are not supported or not required).
- * <p>
- * If there is no entry for the language tag in the map {@link ConfigurableAnalyzerFactory#langTag2AnalyzerPair},
- * then one is created, by walking down the array {@link ConfigurableAnalyzerFactory#config} of AnalyzerPairs
- * until a matching one is found.
- * <p>
- * The bulk of the code in this class is invoked from the constructor in order to set up this
- * {@link ConfigurableAnalyzerFactory#config} array. For example, all of the subclasses of {@link AnalyzerPair}s,
- * are simply to call the appropriate constructor in the appropriate way: the difficulty is that many subclasses
- * of {@link Analyzer} have constructors with different signatures, and our code needs to navigate each sort.
- * @author jeremycarroll
- *
- */
- private static class AnalyzerPair implements Comparable<AnalyzerPair>{
- final LanguageRange range;
- private final Analyzer withStopWords;
- private final Analyzer withoutStopWords;
-
- public Analyzer getAnalyzer(boolean filterStopwords) {
- return filterStopwords ? withStopWords : withoutStopWords;
- }
-
- public boolean extendedFilterMatch(String[] language) {
- return range.extendedFilterMatch(language);
- }
-
- AnalyzerPair(String range, Analyzer withStopWords, Analyzer withOutStopWords) {
- this.range = new LanguageRange(range);
- this.withStopWords = withStopWords;
- this.withoutStopWords = withOutStopWords;
- }
-
- /**
- * This clone constructor implements {@link AnalyzerOptions#LIKE}.
- * @param range
- * @param copyMe
- */
- AnalyzerPair(String range, AnalyzerPair copyMe) {
- this.range = new LanguageRange(range);
- this.withStopWords = copyMe.withStopWords;
- this.withoutStopWords = copyMe.withoutStopWords;
- }
-
- /**
- * If we have a constructor, with arguments including a populated
- * stop word set, then we can use it to make both the withStopWords
- * analyzer, and the withoutStopWords analyzer.
- * @param range
- * @param cons A Constructor including a {@link java.util.Set} argument
- * for the stop words.
- * @param params The arguments to pass to the constructor including a populated stopword set.
- * @throws Exception
- */
- AnalyzerPair(String range, Constructor<? extends Analyzer> cons, Object ... params) throws Exception {
- this(range, cons.newInstance(params), cons.newInstance(useEmptyStopWordSet(params)));
- }
- AnalyzerPair(String range, Analyzer stopWordsNotSupported) {
- this(range, stopWordsNotSupported, stopWordsNotSupported);
- }
- private static Object[] useEmptyStopWordSet(Object[] params) {
- Object rslt[] = new Object[params.length];
- for (int i=0; i<params.length; i++) {
- if (params[i] instanceof Set) {
- rslt[i] = Collections.EMPTY_SET;
- } else {
- rslt[i] = params[i];
- }
- }
- return rslt;
- }
-
- @Override
- public String toString() {
- return range.full + "=(" + withStopWords.getClass().getSimpleName() +")";
- }
-
- @Override
- public int compareTo(AnalyzerPair o) {
- return range.compareTo(o.range);
- }
- }
-
-
- /**
- * Used for Analyzer classes with a constructor with signature (Version, Set).
- * @author jeremycarroll
- *
- */
- private static class VersionSetAnalyzerPair extends AnalyzerPair {
- public VersionSetAnalyzerPair(ConfigOptionsToAnalyzer lro,
- Class<? extends Analyzer> cls) throws Exception {
- super(lro.languageRange, getConstructor(cls, Version.class, Set.class), Version.LUCENE_CURRENT, lro.getStopWords());
- }
- }
-
- /**
- * Used for Analyzer classes which do not support stopwords and have a constructor with signature (Version).
- * @author jeremycarroll
- *
- */
- private static class VersionAnalyzerPair extends AnalyzerPair {
- public VersionAnalyzerPair(String range, Class<? extends Analyzer> cls) throws Exception {
- super(range, getConstructor(cls, Version.class).newInstance(Version.LUCENE_CURRENT));
- }
- }
-
- /**
- * Special case code for {@link PatternAnalyzer}
- * @author jeremycarroll
- *
- */
- private static class PatternAnalyzerPair extends AnalyzerPair {
- public PatternAnalyzerPair(ConfigOptionsToAnalyzer lro, Pattern pattern) throws Exception {
- super(lro.languageRange, getConstructor(PatternAnalyzer.class,Version.class,Pattern.class,Boolean.TYPE,Set.class),
- Version.LUCENE_CURRENT,
- pattern,
- true,
- lro.getStopWords());
- }
- }
-
-
- /**
- * This class is initialized with the config options, using the {@link #setProperty(String, String)}
- * method, for a particular language range and works out which pair of {@link Analyzer}s
- * to use for that language range.
- * <p>
- * Instances of this class are only alive during the execution of
- * {@link ConfigurableAnalyzerFactory#ConfigurableAnalyzerFactory(FullTextIndex)},
- * the life-cycle is:
- * <ol>
- * <li>The relveant config properties are applied, and are used to populate the fields.
- * <li>The fields are validated
- * <li>An {@link AnalyzerPair} is constructed
- * </ol>
- *
- * @author jeremycarroll
- *
- */
- private static class ConfigOptionsToAnalyzer {
-
- String like;
- String className;
- String stopwords;
- Pattern pattern;
- final String languageRange;
- AnalyzerPair result;
- Pattern wordBoundary;
- Pattern subWordBoundary;
- Pattern softHyphens;
- Boolean alwaysRemoveSoftHyphens;
-
- public ConfigOptionsToAnalyzer(String languageRange) {
- this.languageRange = languageRange;
- }
-
- /**
- * This is called only when we have already identified that
- * the class does support stopwords.
- * @return
- */
- public Set<?> getStopWords() {
-
- if (doNotUseStopWords())
- return Collections.EMPTY_SET;
-
- if (useDefaultStopWords()) {
- return getStopWordsForClass(className);
- }
-
- return getStopWordsForClass(stopwords);
- }
-
- boolean doNotUseStopWords() {
- return AnalyzerOptions.STOPWORDS_VALUE_NONE.equals(stopwords) || (stopwords == null && pattern != null);
- }
-
- protected Set<?> getStopWordsForClass(String clazzName) {
- Class<? extends Analyzer> analyzerClass = getAnalyzerClass(clazzName);
- try {
- return (Set<?>) analyzerClass.getMethod("getDefaultStopSet").invoke(null);
- } catch (Exception e) {
- if (StandardAnalyzer.class.equals(analyzerClass)) {
- return StandardAnalyzer.STOP_WORDS_SET;
- }
- if (StopAnalyzer.class.equals(analyzerClass)) {
- return StopAnalyzer.ENGLISH_STOP_WORDS_SET;
- }
- throw new RuntimeException("Failed to find stop words from " + clazzName + " for language range "+languageRange);
- }
- }
-
- protected boolean useDefaultStopWords() {
- return ( stopwords == null && pattern == null ) || AnalyzerOptions.STOPWORDS_VALUE_DEFAULT.equals(stopwords);
- }
-
- /**
- * The first step in the life-cycle, used to initialize the fields.
- * @return true if the property was recognized.
- */
- public boolean setProperty(String shortProperty, String value) {
- if (shortProperty.equals(AnalyzerOptions.LIKE) ) {
- like = value;
- } else if (shortProperty.equals(AnalyzerOptions.ANALYZER_CLASS) ) {
- className = value;
- } else if (shortProperty.equals(AnalyzerOptions.STOPWORDS) ) {
- stopwords = value;
- } else if (shortProperty.equals(AnalyzerOptions.PATTERN) ) {
- pattern = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
- } else if (shortProperty.equals(AnalyzerOptions.WORD_BOUNDARY) ) {
- wordBoundary = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
- } else if (shortProperty.equals(AnalyzerOptions.SUB_WORD_BOUNDARY) ) {
- subWordBoundary = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
- } else if (shortProperty.equals(AnalyzerOptions.SOFT_HYPHENS) ) {
- softHyphens = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
- } else if (shortProperty.equals(AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS) ) {
- alwaysRemoveSoftHyphens = Boolean.valueOf(value);
- } else {
- return false;
- }
- return true;
- }
-
- /**
- * The second phase of the life-cycle, used for sanity checking.
- */
- public void validate() {
- if (pattern != null ) {
- if ( className != null && className != PatternAnalyzer.class.getName()) {
- throw new RuntimeException("Bad Option: Language range "+languageRange + " with pattern propety for class "+ className);
- }
- className = PatternAnalyzer.class.getName();
- }
- if (this.wordBoundary != null ) {
- if ( className != null && className != TermCompletionAnalyzer.class.getName()) {
- throw new RuntimeException("Bad Option: Language range "+languageRange + " with pattern propety for class "+ className);
- }
- className = TermCompletionAnalyzer.class.getName();
-
- if ( subWordBoundary == null ) {
- subWordBoundary = AnalyzerOptions.DEFAULT_SUB_WORD_BOUNDARY;
- }
- if ( alwaysRemoveSoftHyphens != null && softHyphens == null ) {
- throw new RuntimeException("Bad option: Language range "+languageRange + ": must specify softHypens when setting alwaysRemoveSoftHyphens");
- }
- if (softHyphens != null && alwaysRemoveSoftHyphens == null) {
- alwaysRemoveSoftHyphens = AnalyzerOptions.DEFAULT_ALWAYS_REMOVE_SOFT_HYPHENS;
- }
-
- } else if ( subWordBoundary != null || softHyphens != null || alwaysRemoveSoftHyphens != null ||
- TermCompletionAnalyzer.class.getName().equals(className) ) {
- throw new RuntimeException("Bad option: Language range "+languageRange + ": must specify wordBoundary for TermCompletionAnalyzer");
- }
-
- if (PatternAnalyzer.class.getName().equals(className) && pattern == null ) {
- throw new RuntimeException("Bad Option: Language range "+languageRange + " must specify pattern for PatternAnalyzer.");
- }
- if ( (like != null) == (className != null) ) {
- throw new RuntimeException("Bad Option: Language range "+languageRange + " must specify exactly one of implementation class or like.");
- }
- if (stopwords != null && like != null) {
- throw new RuntimeException("Bad Option: Language range "+languageRange + " must not specify stopwords with like.");
- }
-
- }
-
- /**
- * The third and final phase of the life-cyle used for identifying
- * the AnalyzerPair.
- */
- private AnalyzerPair construct() throws Exception {
- if (className == null) {
- return null;
- }
- if (pattern != null) {
- return new PatternAnalyzerPair(this, pattern);
- }
- if (softHyphens != null) {
- return new AnalyzerPair(
- languageRange,
- new TermCompletionAnalyzer(
- wordBoundary,
- subWordBoundary,
- softHyphens,
- alwaysRemoveSoftHyphens));
- }
- if (wordBoundary != null) {
- return new AnalyzerPair(
- languageRange,
- new TermCompletionAnalyzer(
- wordBoundary,
- subWordBoundary));
- }
- final Class<? extends Analyzer> cls = getAnalyzerClass();
-
- if (hasConstructor(cls, Version.class, Set.class)) {
-
- // RussianAnalyzer is missing any way to access stop words.
- if (RussianAnalyzer.class.equals(cls)) {
- if (useDefaultStopWords()) {
- return new AnalyzerPair(languageRange, new RussianAnalyzer(Version.LUCENE_CURRENT), new RussianAnalyzer(Version.LUCENE_CURRENT, Collections.EMPTY_SET));
- }
- if (doNotUseStopWords()) {
- return new AnalyzerPair(languageRange, new RussianAnalyzer(Version.LUCENE_CURRENT, Collections.EMPTY_SET));
- }
- }
- return new VersionSetAnalyzerPair(this, cls);
- }
-
- if (stopwords != null && !stopwords.equals(AnalyzerOptions.STOPWORDS_VALUE_NONE)) {
- throw new RuntimeException("Bad option: language range: " + languageRange + " stopwords are not supported by " + className);
- }
- if (hasConstructor(cls, Version.class)) {
- return new VersionAnalyzerPair(languageRange, cls);
- }
-
- if (hasConstructor(cls)) {
- return new AnalyzerPair(languageRange, cls.newInstance());
- }
- throw new RuntimeException("Bad option: cannot find constructor for class " + className + " for language range " + languageRange);
- }
-
- /**
- * Also part of the third phase of the life-cycle, following the {@link AnalyzerOptions#LIKE}
- * properties.
- * @param depth
- * @param max
- * @param analyzers
- * @return
- */
- AnalyzerPair followLikesToAnalyzerPair(int depth, int max,
- Map<String, ConfigOptionsToAnalyzer> analyzers) {
- if (result == null) {
- if (depth == max) {
- throw new RuntimeException("Bad configuration: - 'like' loop for language range " + languageRange);
- }
- ConfigOptionsToAnalyzer next = analyzers.get(like);
- if (next == null) {
- throw new RuntimeException("Bad option: - 'like' not found for language range " + languageRange+ " (not found: '"+ like +"')");
- }
- result = new AnalyzerPair(languageRange, next.followLikesToAnalyzerPair(depth+1, max, analyzers));
- }
- return result;
- }
-
- protected Class<? extends Analyzer> getAnalyzerClass() {
- return getAnalyzerClass(className);
- }
-
- @SuppressWarnings("unchecked")
- protected Class<? extends Analyzer> getAnalyzerClass(String className2) {
- final Class<? extends Analyzer> cls;
- try {
- cls = (Class<? extends Analyzer>) Class.forName(className2);
- } catch (ClassNotFoundException e) {
- throw new RuntimeException("Bad option: cannot find class " + className2 + " for language range " + languageRange, e);
- }
- return cls;
- }
-
- void setAnalyzerPair(AnalyzerPair ap) {
- result = ap;
- }
- }
-
- private final AnalyzerPair config[];
-
- private final Map<String, AnalyzerPair> langTag2AnalyzerPair = new ConcurrentHashMap<String, AnalyzerPair>();
-
/**
- * While it would be very unusual to have more than 500 different language tags in a store
- * it is possible - we use a max size to prevent a memory explosion, and a naive caching
- * strategy so the code will still work on the {@link #MAX_LANG_CACHE_SIZE}+1 th entry.
+ * Initialization is a little tricky, because on the very first
+ * call to the constructor with a new namespace or a new journal
+ * the fullTextIndex is not ready for use.
+ * Therefore we delegate to an unconfigured object
+ * which, on the first call to {@link NeedsConfiguringAnalyzerFactory#getAnalyzer(String, boolean)},
+ * does the configuration and replaces itself here with a
+ * {@link ConfiguredAnalyzerFactory}.
*/
- private static final int MAX_LANG_CACHE_SIZE = 500;
+ IAnalyzerFactory delegate;
-
- private String defaultLanguage;
- private final FullTextIndex<?> fullTextIndex;
-
-
/**
* Builds a new ConfigurableAnalyzerFactory.
* @param fullTextIndex
*/
public ConfigurableAnalyzerFactory(final FullTextIndex<?> fullTextIndex) {
- // A description of the operation of this method is found on AnalyzerPair and
- // ConfigOptionsToAnalyzer.
- // despite our name, we actually make all the analyzers now, and getAnalyzer method is merely a lookup.
-
- if (fullTextIndex == null)
- throw new IllegalArgumentException();
-
- this.fullTextIndex = fullTextIndex;
-
- final Properties properties = initProperties();
-
- final Map<String, ConfigOptionsToAnalyzer> analyzers = new HashMap<String, ConfigOptionsToAnalyzer>();
-
- properties2analyzers(properties, analyzers);
-
- if (!analyzers.containsKey("*")) {
- throw new RuntimeException("Bad config: must specify behavior on language range '*'");
- }
-
- for (ConfigOptionsToAnalyzer a: analyzers.values()) {
- a.validate();
- }
-
- try {
- for (ConfigOptionsToAnalyzer a: analyzers.values()) {
- a.setAnalyzerPair(a.construct());
- }
- } catch (Exception e) {
- throw new RuntimeException("Cannot construct ConfigurableAnalyzerFactory", e);
- }
- int sz = analyzers.size();
- for (ConfigOptionsToAnalyzer a: analyzers.values()) {
- a.followLikesToAnalyzerPair(0, sz, analyzers);
- }
-
- config = new AnalyzerPair[sz];
- int i = 0;
- for (ConfigOptionsToAnalyzer a: analyzers.values()) {
- config[i++] = a.result;
- }
- Arrays.sort(config);
- if (log.isInfoEnabled()) {
- StringBuilder sb = new StringBuilder();
- sb.append("Installed text Analyzer's: ");
- for (AnalyzerPair ap: config) {
- sb.append(ap.toString());
- sb.append(", ");
- }
- log.info(sb.toString());
- }
+ delegate = new NeedsConfiguringAnalyzerFactory(this, fullTextIndex);
}
- private String getDefaultLanguage(final FullTextIndex<?> fullTextIndex) {
-
- final IKeyBuilder keyBuilder = fullTextIndex.getKeyBuilder();
+ static int loggerIdCounter = 0;
+ @Override
+ public Analyzer getAnalyzer(final String languageCode, boolean filterStopwords) {
- if (keyBuilder.isUnicodeSupported()) {
-
- // The configured local for the database.
- final Locale locale = ((KeyBuilder) keyBuilder)
- .getSortKeyGenerator().getLocale();
-
- // The analyzer for that locale.
- return locale.getLanguage();
-
+ final Analyzer unlogged = delegate.getAnalyzer(languageCode, filterStopwords);
+ if (log.isDebugEnabled()) {
+ return new Analyzer() {
+ @Override
+ public TokenStream tokenStream(final String fieldName, final Reader reader) {
+ final int id = loggerIdCounter++;
+ final String term = TermCompletionAnalyzer.getStringReaderContents((StringReader)reader);
+ log.debug(id + " " + languageCode +" **"+term+"**");
+ return new TokenFilter(unlogged.tokenStream(fieldName, reader)){
+
+ TermAttribute attr = addAttribute(TermAttribute.class);
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ log.debug(id + " |"+attr.term()+"|");
+ return true;
+ }
+ return false;
+ }};
+ }
+ };
} else {
- // Rule, Britannia!
- return "en";
-
+ return unlogged;
}
- }
- private String getDefaultLanguage() {
- if (defaultLanguage == null) {
- defaultLanguage = getDefaultLanguage(fullTextIndex);
- }
- return defaultLanguage;
- }
-
- private static boolean hasConstructor(Class<? extends Analyzer> cls, Class<?> ... parameterTypes) {
- return getConstructor(cls, parameterTypes) != null;
- }
-
- protected static Constructor<? extends Analyzer> getConstructor(Class<? extends Analyzer> cls,
- Class<?>... parameterTypes) {
- try {
- return cls.getConstructor(parameterTypes);
- } catch (NoSuchMethodException | SecurityException e) {
- return null;
- }
- }
-
- private void properties2analyzers(Properties props, Map<String, ConfigOptionsToAnalyzer> analyzers) {
- Enumeration<?> en = props.propertyNames();
- while (en.hasMoreElements()) {
-
- String prop = (String)en.nextElement();
- if (prop.equals(Options.NATURAL_LANGUAGE_SUPPORT)) continue;
- if (prop.startsWith(Options.ANALYZER)) {
- String languageRangeAndProperty[] = prop.substring(Options.ANALYZER.length()).replaceAll("_","*").split("[.]");
- if (languageRangeAndProperty.length == 2) {
-
- String languageRange = languageRangeAndProperty[0].toLowerCase(Locale.US); // Turkish "I" could create a problem
- String shortProperty = languageRangeAndProperty[1];
- String value = props.getProperty(prop);
- log.info("Setting language range: " + languageRange + "/" + shortProperty + " = " + value);
- ConfigOptionsToAnalyzer cons = analyzers.get(languageRange);
- if (cons == null) {
- cons = new ConfigOptionsToAnalyzer(languageRange);
- analyzers.put(languageRange, cons);
- }
- if (cons.setProperty(shortProperty, value)) {
- continue;
- }
- }
- }
-
- log.warn("Failed to process configuration property: " + prop);
- }
-
}
- protected Properties initProperties() {
- final Properties parentProperties = fullTextIndex.getProperties();
- Properties myProps;
- if (Boolean.valueOf(parentProperties.getProperty(
- Options.NATURAL_LANGUAGE_SUPPORT,
- Options.DEFAULT_NATURAL_LANGUAGE_SUPPORT))) {
-
- myProps = loadPropertyString(ALL_LUCENE_NATURAL_LANGUAGES);
-
- } else if (hasPropertiesForStarLanguageRange(parentProperties)){
-
- myProps = new Properties();
-
- } else {
-
- myProps = loadPropertyString(LUCENE_STANDARD_ANALYZER);
- }
-
- copyRelevantProperties(fullTextIndex.getProperties(), myProps);
- return myProps;
- }
-
- Properties loadPropertyString(String props) {
- Properties rslt = new Properties();
- try {
- rslt.load(new StringReader(props));
- } catch (IOException e) {
- throw new RuntimeException("Impossible - well clearly not!", e);
- }
- return rslt;
- }
-
- private void copyRelevantProperties(Properties from, Properties to) {
- Enumeration<?> en = from.propertyNames();
- while (en.hasMoreElements()) {
- String prop = (String)en.nextElement();
- if (prop.startsWith(ConfigurableAnalyzerFactory.class.getName())) {
- to.setProperty(prop, from.getProperty(prop));
- }
- }
- }
-
- private boolean hasPropertiesForStarLanguageRange(Properties from) {
- Enumeration<?> en = from.propertyNames();
- while (en.hasMoreElements()) {
- String prop = (String)en.nextElement();
- if (prop.startsWith(Options.ANALYZER+"_.")
- || prop.startsWith(Options.ANALYZER+"*.")) {
- return true;
- }
- }
- return false;
- }
- @Override
- public Analyzer getAnalyzer(String languageCode, boolean filterStopwords) {
-
- if (languageCode == null || languageCode.equals("")) {
-
- languageCode = getDefaultLanguage();
- }
-
- AnalyzerPair pair = langTag2AnalyzerPair.get(languageCode);
-
- if (pair == null) {
- pair = lookupPair(languageCode);
-
- // naive cache - clear everything if cache is full
- if (langTag2AnalyzerPair.size() == MAX_LANG_CACHE_SIZE) {
- langTag2AnalyzerPair.clear();
- }
- // there is a race condition below, but we don't care who wins.
- langTag2AnalyzerPair.put(languageCode, pair);
- }
-
- return pair.getAnalyzer(filterStopwords);
-
- }
-
- private AnalyzerPair lookupPair(String languageCode) {
- String language[] = languageCode.split("-");
- for (AnalyzerPair p: config) {
- if (p.extendedFilterMatch(language)) {
- return p;
- }
- }
- throw new RuntimeException("Impossible - supposedly - did not match '*'");
- }
}
Added: branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/ConfiguredAnalyzerFactory.java
===================================================================
--- branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/ConfiguredAnalyzerFactory.java (rev 0)
+++ branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/ConfiguredAnalyzerFactory.java 2014-05-11 15:45:17 UTC (rev 8270)
@@ -0,0 +1,161 @@
+/**
+
+Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved.
+
+Contact:
+ SYSTAP, LLC
+ 4501 Tower Road
+ Greensboro, NC 27410
+ lic...@bi...
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+/*
+ * Created on May 6, 2014 by Jeremy J. Carroll, Syapse Inc.
+ */
+package com.bigdata.search;
+
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.lucene.analysis.Analyzer;
+
+import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions;
+/**
+ * This comment describes the implementation of {@link ConfiguredAnalyzerFactory}.
+ * The only method in the interface is {@link ConfiguredAnalyzerFactory#getAnalyzer(String, boolean)},
+ * a map is used from language tag to {@link AnalyzerPair}, where the pair contains
+ * an {@link Analyzer} both with and without stopwords configured (sometimes these two analyzers are identical,
+ * if, for example, stop words are not supported or not required).
+ * <p>
+ * If there is no entry for the language tag in the map {@link ConfiguredAnalyzerFactory#langTag2AnalyzerPair},
+ * then one is created, by walking down the array {@link ConfiguredAnalyzerFactory#config} of AnalyzerPairs
+ * until a matching one is found.
+ * @author jeremycarroll
+ *
+ */
+class ConfiguredAnalyzerFactory implements IAnalyzerFactory {
+
+
+ /**
+ * These provide a mapping from a language range to a pair of Analyzers
+ * and sort with the best-match (i.e. longest match) first.
+ * @author jeremycarroll
+ *
+ */
+ protected static class AnalyzerPair implements Comparable<AnalyzerPair>{
+ final LanguageRange range;
+ private final Analyzer withStopWords;
+ private final Analyzer withoutStopWords;
+
+ public Analyzer getAnalyzer(boolean filterStopwords) {
+ return filterStopwords ? withStopWords : withoutStopWords;
+ }
+
+ public boolean extendedFilterMatch(String[] language) {
+ return range.extendedFilterMatch(language);
+ }
+
+ AnalyzerPair(String range, Analyzer withStopWords, Analyzer withOutStopWords) {
+ this.range = new LanguageRange(range);
+ this.withStopWords = withStopWords;
+ this.withoutStopWords = withOutStopWords;
+ }
+
+ /**
+ * This clone constructor implements {@link AnalyzerOptions#LIKE}.
+ * @param range
+ * @param copyMe
+ */
+ AnalyzerPair(String range, AnalyzerPair copyMe) {
+ this(range, copyMe.withStopWords, copyMe.withoutStopWords);
+ }
+
+ @Override
+ public String toString() {
+ return range.full + "=(" + withStopWords.getClass().getSimpleName() +")";
+ }
+
+ @Override
+ public int compareTo(AnalyzerPair o) {
+ return range.compareTo(o.range);
+ }
+ }
+
+
+ private final AnalyzerPair config[];
+
+ /**
+ * This caches the result of looking up a lang tag in the
+ * config of language ranges.
+ */
+ private final Map<String, AnalyzerPair> langTag2AnalyzerPair = new ConcurrentHashMap<String, AnalyzerPair>();;
+
+ /**
+ * While it would be very unusual to have more than 500 different language tags in a store
+ * it is possible - we use a max size to prevent a memory explosion, and a naive caching
+ * strategy so the code will still work on the {@link #MAX_LANG_CACHE_SIZE}+1 th entry.
+ */
+ private static final int MAX_LANG_CACHE_SIZE = 500;
+
+
+ private final String defaultLanguage;
+ /**
+ * Builds a new ConfiguredAnalyzerFactory.
+ * @param config the analyzer pairs, searched in array order (the first matching pair is used)
+ */
+ public ConfiguredAnalyzerFactory(AnalyzerPair config[], String defaultLanguage) {
+ this.config = config;
+ this.defaultLanguage = defaultLanguage;
+ }
+
+ private String getDefaultLanguage() {
+ return defaultLanguage;
+ }
+
+ @Override
+ public Analyzer getAnalyzer(String languageCode, boolean filterStopwords) {
+
+ if (languageCode == null || languageCode.equals("")) {
+
+ languageCode = getDefaultLanguage();
+ }
+
+ AnalyzerPair pair = langTag2AnalyzerPair.get(languageCode);
+
+ if (pair == null) {
+ pair = lookupPair(languageCode);
+
+ // naive cache - clear everything if cache is full
+ if (langTag2AnalyzerPair.size() == MAX_LANG_CACHE_SIZE) {
+ langTag2AnalyzerPair.clear();
+ }
+ // there is a race condition below, but we don't care who wins.
+ langTag2AnalyzerPair.put(languageCode, pair);
+ }
+
+ return pair.getAnalyzer(filterStopwords);
+
+ }
+
+ private AnalyzerPair lookupPair(String languageCode) {
+ String language[] = languageCode.split("-");
+ for (AnalyzerPair p: config) {
+ if (p.extendedFilterMatch(language)) {
+ return p;
+ }
+ }
+ throw new RuntimeException("Impossible - supposedly - did not match '*'");
+ }
+}
Added: branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/LanguageRange.java
===================================================================
--- branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/LanguageRange.java (rev 0)
+++ branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/LanguageRange.java 2014-05-11 15:45:17 UTC (rev 8270)
@@ -0,0 +1,126 @@
+package com.bigdata.search;
+
+import java.util.Locale;
+
+
+/**
+ * This is an implementation of RFC 4647 language range,
+ * targeted at the specific needs within bigdata, and only
+ * supporting the extended filtering specified in section 3.3.2
+ * <p>
+ * Language ranges are comparable so that
+ * sorting an array and then matching a language tag against each
+ * member of the array in sequence will give the longest match.
+ * i.e. the longer ranges come first.
+ * @author jeremycarroll
+ *
+ */
+public class LanguageRange implements Comparable<LanguageRange> {
+
+ private final String range[];
+ final String full;
+ /**
+ * Note range must be in lower case, this is not verified.
+ * @param range
+ */
+ public LanguageRange(String range) {
+ this.range = range.split("-");
+ full = range;
+ }
+
+ @Override
+ public int compareTo(LanguageRange o) {
+ if (equals(o)) {
+ return 0;
+ }
+ int diff = o.range.length - range.length;
+ if (diff != 0) {
+ // longest first
+ return diff;
+ }
+ if (range.length == 1) {
+ // * last
+ if (range[0].equals("*")) {
+ return 1;
+ }
+ if (o.range[0].equals("*")) {
+ return -1;
+ }
+ }
+ // alphabetically
+ for (int i=0; i<range.length; i++) {
+ diff = range[i].compareTo(o.range[i]);
+ if (diff != 0) {
+ return diff;
+ }
+ }
+ throw new RuntimeException("Impossible - supposedly");
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ return (o instanceof LanguageRange) && ((LanguageRange)o).full.equals(full);
+ }
+ @Override
+ public int hashCode() {
+ return full.hashCode();
+ }
+
+ /**
+ * This implements the algorithm of section 3.3.2 of RFC 4647
+ * as modified with the observation about private use tags
+ * in <a href="http://lists.w3.org/Archives/Public/www-international/2014AprJun/0084">
+ * this message</a>.
+ *
+ *
+ * @param langTag The RFC 5646 language tag (it is converted to lower case before matching)
+ * @return The result of the algorithm
+ */
+ public boolean extendedFilterMatch(String langTag) {
+ return extendedFilterMatch(langTag.toLowerCase(Locale.ROOT).split("-"));
+ }
+
+ // See RFC 4647, 3.3.2
+ boolean extendedFilterMatch(String[] language) {
+ // RFC 4647 step 2
+ if (!matchSubTag(language[0], range[0])) {
+ return false;
+ }
+ int rPos = 1;
+ int lPos = 1;
+ // variant step - for private use flags
+ if (language[0].equals("x") && range[0].equals("*")) {
+ lPos = 0;
+ }
+ // RFC 4647 step 3
+ while (rPos < range.length) {
+ // step 3A
+ if (range[rPos].equals("*")) {
+ rPos ++;
+ continue;
+ }
+ // step 3B
+ if (lPos >= language.length) {
+ return false;
+ }
+ // step 3C
+ if (matchSubTag(language[lPos], range[rPos])) {
+ lPos++;
+ rPos++;
+ continue;
+ }
+ if (language[lPos].length()==1) {
+ return false;
+ }
+ lPos++;
+ }
+ // RFC 4647 step 4
+ return true;
+ }
+
+ // RFC 4647, 3.3.2, step 1
+ private boolean matchSubTag(String langSubTag, String rangeSubTag) {
+ return langSubTag.equals(rangeSubTag) || "*".equals(rangeSubTag);
+ }
+
+}
\ No newline at end of file
Copied: branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/NeedsConfiguringAnalyzerFactory.java (from rev 8263, branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java)
===================================================================
--- branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/NeedsConfiguringAnalyzerFactory.java (rev 0)
+++ branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/NeedsConfiguringAnalyzerFactory.java 2014-05-11 15:45:17 UTC (rev 8270)
@@ -0,0 +1,649 @@
+/**
+
+Copyright (C) SYSTAP, LLC 2006-2014. All rights reserved.
+
+Contact:
+ SYSTAP, LLC
+ 4501 Tower Road
+ Greensboro, NC 27410
+ lic...@bi...
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+/*
+ * Created on May 6, 2014 by Jeremy J. Carroll, Syapse Inc.
+ */
+package com.bigdata.search;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.lang.reflect.Constructor;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+import java.util.UUID;
+import java.util.WeakHashMap;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
+import org.apache.lucene.analysis.ru.RussianAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.util.Version;
+
+import com.bigdata.btree.keys.IKeyBuilder;
+import com.bigdata.btree.keys.KeyBuilder;
+import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions;
+import com.bigdata.search.ConfigurableAnalyzerFactory.Options;
+
+
+/**
+ * <p>
+ * The bulk of the code in this class is invoked from {@link #init()} to set up the array of
+ * {@link ConfiguredAnalyzerFactory.AnalyzerPair}s. For example, all of the subclasses of {@link AnalyzerPair}
+ * exist simply to call the appropriate constructor in the appropriate way: the difficulty is that many subclasses
+ * of {@link Analyzer} have constructors with different signatures, and our code needs to navigate each sort.
+ * @author jeremycarroll
+ *
+ */
+class NeedsConfiguringAnalyzerFactory implements IAnalyzerFactory {
+ final private static transient Logger log = Logger.getLogger(NeedsConfiguringAnalyzerFactory.class);
+
+ /**
+ * We create only one {@link ConfiguredAnalyzerFactory} per namespace
+ * and store it here. The UUID is stable and allows us to side-step lifecycle
+ * issues such as creation and destruction of namespaces, potentially with different properties.
+ * We use a WeakHashMap to ensure that after the destruction of a namespace we clean up.
+ * We have to synchronize this for thread safety.
+ */
+ private static final Map<UUID, ConfiguredAnalyzerFactory> allConfigs =
+ Collections.synchronizedMap(new WeakHashMap<UUID, ConfiguredAnalyzerFactory>());
+
+
+ private static final String ALL_LUCENE_NATURAL_LANGUAGES =
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.*.like=eng\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.por.analyzerClass=org.apache.lucene.analysis.br.BrazilianAnalyzer\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.pt.like=por\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.zho.analyzerClass=org.apache.lucene.analysis.cn.ChineseAnalyzer\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.chi.like=zho\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.zh.like=zho\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.jpn.analyzerClass=org.apache.lucene.analysis.cjk.CJKAnalyzer\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ja.like=jpn\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.kor.like=jpn\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ko.like=kor\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ces.analyzerClass=org.apache.lucene.analysis.cz.CzechAnalyzer\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.cze.like=ces\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.cs.like=ces\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.dut.analyzerClass=org.apache.lucene.analysis.nl.DutchAnalyzer\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.nld.like=dut\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.nl.like=dut\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.deu.analyzerClass=org.apache.lucene.analysis.de.GermanAnalyzer\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ger.like=deu\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.de.like=deu\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.gre.analyzerClass=org.apache.lucene.analysis.el.GreekAnalyzer\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ell.like=gre\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.el.like=gre\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.rus.analyzerClass=org.apache.lucene.analysis.ru.RussianAnalyzer\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ru.like=rus\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.tha.analyzerClass=org.apache.lucene.analysis.th.ThaiAnalyzer\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.th.like=tha\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.eng.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer\n" +
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.en.like=eng\n";
+
+ private static final String LUCENE_STANDARD_ANALYZER =
+ "com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.*.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer\n";
+
+ static int loggerIdCounter = 0;
+
+ /**
+ * This class and all its subclasses provide a variety of patterns
+ * for mapping from the various constructor patterns of subclasses
+ * of {@link Analyzer} to {@link ConfiguredAnalyzerFactory.AnalyzerPair}.
+ * @author jeremycarroll
+ *
+ */
+ private static class AnalyzerPair extends ConfiguredAnalyzerFactory.AnalyzerPair {
+
+ AnalyzerPair(String range, Analyzer withStopWords, Analyzer withOutStopWords) {
+ super(range, withStopWords, withOutStopWords);
+ }
+
+ /**
+ * This clone constructor implements {@link AnalyzerOptions#LIKE}.
+ * @param range
+ * @param copyMe
+ */
+ AnalyzerPair(String range, AnalyzerPair copyMe) {
+ super(range, copyMe);
+ }
+
+ /**
+ * If we have a constructor, with arguments including a populated
+ * stop word set, then we can use it to make both the withStopWords
+ * analyzer, and the withoutStopWords analyzer.
+ * @param range
+ * @param cons A Constructor including a {@link java.util.Set} argument
+ * for the stop words.
+ * @param params The arguments to pass to the constructor including a populated stopword set.
+ * @throws Exception
+ */
+ AnalyzerPair(String range, Constructor<? extends Analyzer> cons, Object ... params) throws Exception {
+ this(range, cons.newInstance(params), cons.newInstance(useEmptyStopWordSet(params)));
+ }
+ AnalyzerPair(String range, Analyzer stopWordsNotSupported) {
+ this(range, stopWordsNotSupported, stopWordsNotSupported);
+ }
+ private static Object[] useEmptyStopWordSet(Object[] params) {
+ Object rslt[] = new Object[params.length];
+ for (int i=0; i<params.length; i++) {
+ if (params[i] instanceof Set) {
+ rslt[i] = Collections.EMPTY_SET;
+ } else {
+ rslt[i] = params[i];
+ }
+ }
+ return rslt;
+ }
+
+ }
+
+
+ /**
+ * Used for Analyzer classes with a constructor with signature (Version, Set).
+ * @author jeremycarroll
+ *
+ */
+ private static class VersionSetAnalyzerPair extends AnalyzerPair {
+ public VersionSetAnalyzerPair(ConfigOptionsToAnalyzer lro,
+ Class<? extends Analyzer> cls) throws Exception {
+ super(lro.languageRange, getConstructor(cls, Version.class, Set.class), Version.LUCENE_CURRENT, lro.getStopWords());
+ }
+ }
+
+ /**
+ * Used for Analyzer classes which do not support stopwords and have a constructor with signature (Version).
+ * @author jeremycarroll
+ *
+ */
+ private static class VersionAnalyzerPair extends AnalyzerPair {
+ public VersionAnalyzerPair(String range, Class<? extends Analyzer> cls) throws Exception {
+ super(range, getConstructor(cls, Version.class).newInstance(Version.LUCENE_CURRENT));
+ }
+ }
+
+ /**
+ * Special case code for {@link PatternAnalyzer}
+ * @author jeremycarroll
+ *
+ */
+ private static class PatternAnalyzerPair extends AnalyzerPair {
+ public PatternAnalyzerPair(ConfigOptionsToAnalyzer lro, Pattern pattern) throws Exception {
+ super(lro.languageRange, getConstructor(PatternAnalyzer.class,Version.class,Pattern.class,Boolean.TYPE,Set.class),
+ Version.LUCENE_CURRENT,
+ pattern,
+ true,
+ lro.getStopWords());
+ }
+ }
+
+
+
+ /**
+ * This class is initialized with the config options, using the {@link #setProperty(String, String)}
+ * method, for a particular language range and works out which pair of {@link Analyzer}s
+ * to use for that language range.
+ * <p>
+ * Instances of this class are only alive during the execution of
+ * {@link NeedsConfiguringAnalyzerFactory#ConfigurableAnalyzerFactory(FullTextIndex)},
+ * the life-cycle is:
+ * <ol>
+ * <li>The relevant config properties are applied, and are used to populate the fields.
+ * <li>The fields are validated
+ * <li>An {@link AnalyzerPair} is constructed
+ * </ol>
+ *
+ * @author jeremycarroll
+ *
+ */
+ private static class ConfigOptionsToAnalyzer {
+
+ String like;
+ String className;
+ String stopwords;
+ Pattern pattern;
+ final String languageRange;
+ AnalyzerPair result;
+ Pattern wordBoundary;
+ Pattern subWordBoundary;
+ Pattern softHyphens;
+ Boolean alwaysRemoveSoftHyphens;
+
+ public ConfigOptionsToAnalyzer(String languageRange) {
+ this.languageRange = languageRange;
+ }
+
+ /**
+ * This is called only when we have already identified that
+ * the class does support stopwords.
+ * @return
+ */
+ public Set<?> getStopWords() {
+
+ if (doNotUseStopWords())
+ return Collections.EMPTY_SET;
+
+ if (useDefaultStopWords()) {
+ return getStopWordsForClass(className);
+ }
+
+ return getStopWordsForClass(stopwords);
+ }
+
+ boolean doNotUseStopWords() {
+ return AnalyzerOptions.STOPWORDS_VALUE_NONE.equals(stopwords) || (stopwords == null && pattern != null);
+ }
+
+ protected Set<?> getStopWordsForClass(String clazzName) {
+ Class<? extends Analyzer> analyzerClass = getAnalyzerClass(clazzName);
+ try {
+ return (Set<?>) analyzerClass.getMethod("getDefaultStopSet").invoke(null);
+ } catch (Exception e) {
+ if (StandardAnalyzer.class.equals(analyzerClass)) {
+ return StandardAnalyzer.STOP_WORDS_SET;
+ }
+ if (StopAnalyzer.class.equals(analyzerClass)) {
+ return StopAnalyzer.ENGLISH_STOP_WORDS_SET;
+ }
+ throw new RuntimeException("Failed to find stop words from " + clazzName + " for language range "+languageRange);
+ }
+ }
+
+ protected boolean useDefaultStopWords() {
+ return ( stopwords == null && pattern == null ) || AnalyzerOptions.STOPWORDS_VALUE_DEFAULT.equals(stopwords);
+ }
+
+ /**
+ * The first step in the life-cycle, used to initialize the fields.
+ * @return true if the property was recognized.
+ */
+ public boolean setProperty(String shortProperty, String value) {
+ if (shortProperty.equals(AnalyzerOptions.LIKE) ) {
+ like = value;
+ } else if (shortProperty.equals(AnalyzerOptions.ANALYZER_CLASS) ) {
+ className = value;
+ } else if (shortProperty.equals(AnalyzerOptions.STOPWORDS) ) {
+ stopwords = value;
+ } else if (shortProperty.equals(AnalyzerOptions.PATTERN) ) {
+ pattern = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
+ } else if (shortProperty.equals(AnalyzerOptions.WORD_BOUNDARY) ) {
+ wordBoundary = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
+ } else if (shortProperty.equals(AnalyzerOptions.SUB_WORD_BOUNDARY) ) {
+ subWordBoundary = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
+ } else if (shortProperty.equals(AnalyzerOptions.SOFT_HYPHENS) ) {
+ softHyphens = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
+ } else if (shortProperty.equals(AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS) ) {
+ alwaysRemoveSoftHyphens = Boolean.valueOf(value);
+ } else {
+ return false;
+ }
+ return true;
+ }
+
+ /**
+ * The second phase of the life-cycle, used for sanity checking.
+ * <p>
+ * Besides rejecting inconsistent combinations of options, this fills in
+ * implied values: {@link #className} when only a pattern or a word boundary
+ * was configured, and the defaults for {@link #subWordBoundary} and
+ * {@link #alwaysRemoveSoftHyphens}.
+ *
+ * @throws RuntimeException if the options for this language range are inconsistent
+ */
+ public void validate() {
+ if (pattern != null ) {
+ // Compare class names by value, not by reference: className is taken from a
+ // property value and need not be the same String instance as Class.getName().
+ if ( className != null && !className.equals(PatternAnalyzer.class.getName())) {
+ throw new RuntimeException("Bad Option: Language range "+languageRange + " with pattern propety for class "+ className);
+ }
+ // a pattern implies the PatternAnalyzer
+ className = PatternAnalyzer.class.getName();
+ }
+ if (this.wordBoundary != null ) {
+ // Same value comparison as above, for the TermCompletionAnalyzer.
+ if ( className != null && !className.equals(TermCompletionAnalyzer.class.getName())) {
+ throw new RuntimeException("Bad Option: Language range "+languageRange + " with wordBoundary property for class "+ className);
+ }
+ // a word boundary implies the TermCompletionAnalyzer
+ className = TermCompletionAnalyzer.class.getName();
+
+ if ( subWordBoundary == null ) {
+ subWordBoundary = AnalyzerOptions.DEFAULT_SUB_WORD_BOUNDARY;
+ }
+ if ( alwaysRemoveSoftHyphens != null && softHyphens == null ) {
+ throw new RuntimeException("Bad option: Language range "+languageRange + ": must specify softHypens when setting alwaysRemoveSoftHyphens");
+ }
+ if (softHyphens != null && alwaysRemoveSoftHyphens == null) {
+ alwaysRemoveSoftHyphens = AnalyzerOptions.DEFAULT_ALWAYS_REMOVE_SOFT_HYPHENS;
+ }
+
+ } else if ( subWordBoundary != null || softHyphens != null || alwaysRemoveSoftHyphens != null ||
+ TermCompletionAnalyzer.class.getName().equals(className) ) {
+ // the TermCompletionAnalyzer options are meaningless without a word boundary
+ throw new RuntimeException("Bad option: Language range "+languageRange + ": must specify wordBoundary for TermCompletionAnalyzer");
+ }
+
+ if (PatternAnalyzer.class.getName().equals(className) && pattern == null ) {
+ throw new RuntimeException("Bad Option: Language range "+languageRange + " must specify pattern for PatternAnalyzer.");
+ }
+ // exactly one of 'like' and an implementation class must be present
+ if ( (like != null) == (className != null) ) {
+ throw new RuntimeException("Bad Option: Language range "+languageRange + " must specify exactly one of implementation class or like.");
+ }
+ if (stopwords != null && like != null) {
+ throw new RuntimeException("Bad Option: Language range "+languageRange + " must not specify stopwords with like.");
+ }
+
+ }
+
+ /**
+ * The third and final phase of the life-cycle used for identifying
+ * the AnalyzerPair.
+ */
+ private AnalyzerPair construct() throws Exception {
+ if (className == null) {
+ return null;
+ }
+ if (pattern != null) {
+ return new PatternAnalyzerPair(this, pattern);
+ }
+ if (softHyphens != null) {
+ return new AnalyzerPair(
+ languageRange,
+ new TermCompletionAnalyzer(
+ wordBoundary,
+ subWordBoundary,
+ softHyphens,
+ alwaysRemoveSoftHyphens));
+ }
+ if (wordBoundary != null) {
+ return new AnalyzerPair(
+ languageRange,
+ new TermCompletionAnalyzer(
+ wordBoundary,
+ subWordBoundary));
+ }
+ final Class<? extends Analyzer> cls = getAnalyzerClass();
+
+ if (hasConstructor(cls, Version.class, Set.class)) {
+
+ // RussianAnalyzer is missing any way to access stop words.
+ if (RussianAnalyzer.class.equals(cls)) {
+ if (useDefaultStopWords()) {
+ return new AnalyzerPair(languageRange, new RussianAnalyzer(Version.LUCENE_CURRENT), new RussianAnalyzer(Version.LUCENE_CURRENT, Collections.EMPTY_SET));
+ }
+ if (doNotUseStopWords()) {
+ return new AnalyzerPair(languageRange, new RussianAnalyzer(Version.LUCENE_CURRENT, Collections.EMPTY_SET));
+ }
+ }
+ return new VersionSetAnalyzerPair(this, cls);
+ }
+
+ if (stopwords != null && !stopwords.equals(AnalyzerOptions.STOPWORDS_VALUE_NONE)) {
+ throw new RuntimeException("Bad option: language range: " + languageRange + " stopwords are not supported by " + className);
+ }
+ if (hasConstructor(cls, Version.class)) {
+ return new VersionAnalyzerPair(languageRange, cls);
+ }
+
+ if (hasConstructor(cls)) {
+ return new AnalyzerPair(languageRange, cls.newInstance());
+ }
+ throw new RuntimeException("Bad option: cannot find constructor for class " + className + " for language range " + languageRange);
+ }
+
+ /**
+ * Also part of the third phase of the life-cycle, following the {@link AnalyzerOptions#LIKE}
+ * properties.
+ * @param depth
+ * @param max
+ * @param analyzers
+ * @return
+ */
+ AnalyzerPair followLikesToAnalyzerPair(int depth, int max,
+ Map<String, ConfigOptionsToAnalyzer> analyzers) {
+ if (result == null) {
+ if (depth == max) {
+ throw new RuntimeException("Bad configuration: - 'like' loop for language range " + languageRange);
+ }
+ ConfigOptionsToAnalyzer next = analyzers.get(like);
+ if (next == null) {
+ throw new RuntimeException("Bad ...
[truncated message content] |