From: <jer...@us...> - 2014-05-09 19:07:12
|
Revision: 8256 http://sourceforge.net/p/bigdata/code/8256 Author: jeremy_carroll Date: 2014-05-09 19:07:09 +0000 (Fri, 09 May 2014) Log Message: ----------- Addressing trac 915 by documenting the current behavior and deprecating DefaultAnalyzerFactory and suggesting the use of ConfigurableAnalyzerFactory instead Modified Paths: -------------- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 19:07:02 UTC (rev 8255) +++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 19:07:09 UTC (rev 8256) @@ -90,7 +90,7 @@ * Properties from {@link Options} apply to the factory. * <p> * - * If there are no such properties at all then the property {@link Options#INCLUDE_DEFAULTS} is set to true, + * If there are no such properties at all then the property {@link Options#NATURAL_LANGUAGE_SUPPORT} is set to true, * and the behavior of this class is the same as the legacy {@link DefaultAnalyzerFactory}. 
* <p> * Other properties, from {@link AnalyzerOptions} start with @@ -117,7 +117,7 @@ * <dd>This suppresses the functionality, by treating every expression as a stop word.</dd> * </dl> * there are in addition the language specific analyzers that are included - * by using the option {@link Options#INCLUDE_DEFAULTS} + * by using the option {@link Options#NATURAL_LANGUAGE_SUPPORT} * * * @author jeremycarroll @@ -265,18 +265,13 @@ * * */ - String INCLUDE_DEFAULTS = ConfigurableAnalyzerFactory.class.getName() + ".includeDefaults"; + String NATURAL_LANGUAGE_SUPPORT = ConfigurableAnalyzerFactory.class.getName() + ".includeDefaults"; /** * This is the prefix to all properties configuring the individual analyzers. */ String ANALYZER = ConfigurableAnalyzerFactory.class.getName() + ".analyzer."; -/** - * If there is no configuration at all, then the defaults are included, - * but any configuration at all totally replaces the defaults, unless - * {@link #INCLUDE_DEFAULTS} - * is explicitly set to true. - */ - String DEFAULT_INCLUDE_DEFAULTS = "false"; + + String DEFAULT_NATURAL_LAMGUAGE_SUPPORT = "false"; } /** * Options understood by analyzers created by {@link ConfigurableAnalyzerFactory}. 
@@ -810,7 +805,7 @@ while (en.hasMoreElements()) { String prop = (String)en.nextElement(); - if (prop.equals(Options.INCLUDE_DEFAULTS)) continue; + if (prop.equals(Options.NATURAL_LANGUAGE_SUPPORT)) continue; if (prop.startsWith(Options.ANALYZER)) { String languageRangeAndProperty[] = prop.substring(Options.ANALYZER.length()).replaceAll("_","*").split("[.]"); if (languageRangeAndProperty.length == 2) { @@ -838,7 +833,7 @@ protected Properties initProperties() { final Properties parentProperties = fullTextIndex.getProperties(); Properties myProps; - if (Boolean.getBoolean(parentProperties.getProperty(Options.INCLUDE_DEFAULTS, Options.DEFAULT_INCLUDE_DEFAULTS))) { + if (Boolean.getBoolean(parentProperties.getProperty(Options.NATURAL_LANGUAGE_SUPPORT, Options.DEFAULT_NATURAL_LAMGUAGE_SUPPORT))) { myProps = defaultProperties(); } else { myProps = new Properties(); Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java 2014-05-09 19:07:02 UTC (rev 8255) +++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java 2014-05-09 19:07:09 UTC (rev 8256) @@ -29,7 +29,6 @@ import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.Locale; import java.util.Map; import java.util.Set; @@ -52,11 +51,24 @@ import com.bigdata.btree.keys.KeyBuilder; /** + * This is the default implementation but could be regarded as legacy since + * it fails to use the correct {@link Analyzer} for almost all languages (other than + * English). It uses the correct natural language analyzer for literals tagged with + * "por", "deu", "ger", "zho", "chi", "jpn", "kor", "ces", "cze", "dut", "nld", "gre", "ell", + * "fra", "fre", "rus" and "tha". + * This codes do not work if they are used with subtags, e.g. 
"ger-AT" is treated as English. + * No two letter code works correctly: note that the W3C and + * IETF recommend the use of the two letter forms instead of the three letter forms. + * <p> * Default implementation registers a bunch of {@link Analyzer}s for various * language codes and then serves the appropriate {@link Analyzer} based on * the specified language code. * * @author <a href="mailto:tho...@us...">Bryan Thompson</a> + * @deprecated Using {@link ConfigurableAnalyzerFactory} with + * the {@link ConfigurableAnalyzerFactory.Options#NATURAL_LANGUAGE_SUPPORT} + * uses the appropriate natural language analyzers for the two letter codes + * and for tags which include sub-tags. * @version $Id$ */ public class DefaultAnalyzerFactory implements IAnalyzerFactory { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <jer...@us...> - 2014-05-09 23:08:38
|
Revision: 8259 http://sourceforge.net/p/bigdata/code/8259 Author: jeremy_carroll Date: 2014-05-09 23:08:34 +0000 (Fri, 09 May 2014) Log Message: ----------- javadoc changes Modified Paths: -------------- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 22:39:19 UTC (rev 8258) +++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 23:08:34 UTC (rev 8259) @@ -66,6 +66,7 @@ * Supported classes included all the natural language specific classes from Lucene, and also: * <ul> * <li>{@link PatternAnalyzer} + * <li>{@link TermCompletionAnalyzer} * <li>{@link KeywordAnalyzer} * <li>{@link SimpleAnalyzer} * <li>{@link StopAnalyzer} @@ -76,7 +77,6 @@ * <ul> * <li>no arguments * <li>{@link Version} - * <li>{@link Set} (of strings, the stop words) * <li>{@link Version}, {@link Set} * </ul> * is usable. If the class has a static method named <code>getDefaultStopSet()</code> then this is assumed @@ -89,10 +89,6 @@ * abbreviate to <code>c.b.s.C</code> in this documentation. * Properties from {@link Options} apply to the factory. * <p> - * - * If there are no such properties at all then the property {@link Options#NATURAL_LANGUAGE_SUPPORT} is set to true, - * and the behavior of this class is the same as the legacy {@link DefaultAnalyzerFactory}. - * <p> * Other properties, from {@link AnalyzerOptions} start with * <code>c.b.s.C.analyzer.<em>language-range</em></code> where <code><em>language-range</em></code> conforms * with the extended language range construct from RFC 4647, section 2.2. 
@@ -103,7 +99,7 @@ * If no analyzer is specified for the language range <code>*</code> then the {@link StandardAnalyzer} is used. * <p> * Given any specific language, then the analyzer matching the longest configured language range, - * measured in number of subtags is used {@link #getAnalyzer(String, boolean)} + * measured in number of subtags is returned by {@link #getAnalyzer(String, boolean)} * In the event of a tie, the alphabetically first language range is used. * The algorithm to find a match is "Extended Filtering" as defined in section 3.3.2 of RFC 4647. * <p> @@ -132,11 +128,11 @@ /** * This is an implementation of RFC 4647 language range, - * targetted at some of the context of bigdata, and only + * targetted at the specific needs within bigdata, and only * supporting the extended filtering specified in section 3.3.2 * <p> * Language ranges are comparable so that - * sorting an array and then matching a language tage against each + * sorting an array and then matching a language tag against each * member of the array in sequence will give the longest match. * i.e. the longer ranges come first. * @author jeremycarroll Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java =================================================================== --- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java 2014-05-09 22:39:19 UTC (rev 8258) +++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java 2014-05-09 23:08:34 UTC (rev 8259) @@ -51,18 +51,15 @@ import com.bigdata.btree.keys.KeyBuilder; /** - * This is the default implementation but could be regarded as legacy since + * This is the default implementation but should be regarded as legacy since * it fails to use the correct {@link Analyzer} for almost all languages (other than - * English). It uses the correct natural language analyzer for literals tagged with + * English). 
It uses the correct natural language analyzer only for literals tagged with + * certain three letter ISO 639 codes: * "por", "deu", "ger", "zho", "chi", "jpn", "kor", "ces", "cze", "dut", "nld", "gre", "ell", - * "fra", "fre", "rus" and "tha". - * This codes do not work if they are used with subtags, e.g. "ger-AT" is treated as English. - * No two letter code works correctly: note that the W3C and + * "fra", "fre", "rus" and "tha". All other tags are treated as English. + * These codes do not work if they are used with subtags, e.g. "ger-AT" is treated as English. + * No two letter code, other than "en" works correctly: note that the W3C and * IETF recommend the use of the two letter forms instead of the three letter forms. - * <p> - * Default implementation registers a bunch of {@link Analyzer}s for various - * language codes and then serves the appropriate {@link Analyzer} based on - * the specified language code. * * @author <a href="mailto:tho...@us...">Bryan Thompson</a> * @deprecated Using {@link ConfigurableAnalyzerFactory} with This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |