|
From: <jer...@us...> - 2014-05-09 19:07:12
|
Revision: 8256
http://sourceforge.net/p/bigdata/code/8256
Author: jeremy_carroll
Date: 2014-05-09 19:07:09 +0000 (Fri, 09 May 2014)
Log Message:
-----------
Addressing trac 915 by documenting the current behavior and deprecating DefaultAnalyzerFactory and suggesting the use of ConfigurableAnalyzerFactory instead
Modified Paths:
--------------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 19:07:02 UTC (rev 8255)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 19:07:09 UTC (rev 8256)
@@ -90,7 +90,7 @@
* Properties from {@link Options} apply to the factory.
* <p>
*
- * If there are no such properties at all then the property {@link Options#INCLUDE_DEFAULTS} is set to true,
+ * If there are no such properties at all then the property {@link Options#NATURAL_LANGUAGE_SUPPORT} is set to true,
* and the behavior of this class is the same as the legacy {@link DefaultAnalyzerFactory}.
* <p>
* Other properties, from {@link AnalyzerOptions} start with
@@ -117,7 +117,7 @@
* <dd>This suppresses the functionality, by treating every expression as a stop word.</dd>
* </dl>
* there are in addition the language specific analyzers that are included
- * by using the option {@link Options#INCLUDE_DEFAULTS}
+ * by using the option {@link Options#NATURAL_LANGUAGE_SUPPORT}
*
*
* @author jeremycarroll
@@ -265,18 +265,13 @@
*
*
*/
- String INCLUDE_DEFAULTS = ConfigurableAnalyzerFactory.class.getName() + ".includeDefaults";
+ String NATURAL_LANGUAGE_SUPPORT = ConfigurableAnalyzerFactory.class.getName() + ".includeDefaults";
/**
* This is the prefix to all properties configuring the individual analyzers.
*/
String ANALYZER = ConfigurableAnalyzerFactory.class.getName() + ".analyzer.";
-/**
- * If there is no configuration at all, then the defaults are included,
- * but any configuration at all totally replaces the defaults, unless
- * {@link #INCLUDE_DEFAULTS}
- * is explicitly set to true.
- */
- String DEFAULT_INCLUDE_DEFAULTS = "false";
+
+ String DEFAULT_NATURAL_LAMGUAGE_SUPPORT = "false";
}
/**
* Options understood by analyzers created by {@link ConfigurableAnalyzerFactory}.
@@ -810,7 +805,7 @@
while (en.hasMoreElements()) {
String prop = (String)en.nextElement();
- if (prop.equals(Options.INCLUDE_DEFAULTS)) continue;
+ if (prop.equals(Options.NATURAL_LANGUAGE_SUPPORT)) continue;
if (prop.startsWith(Options.ANALYZER)) {
String languageRangeAndProperty[] = prop.substring(Options.ANALYZER.length()).replaceAll("_","*").split("[.]");
if (languageRangeAndProperty.length == 2) {
@@ -838,7 +833,7 @@
protected Properties initProperties() {
final Properties parentProperties = fullTextIndex.getProperties();
Properties myProps;
- if (Boolean.getBoolean(parentProperties.getProperty(Options.INCLUDE_DEFAULTS, Options.DEFAULT_INCLUDE_DEFAULTS))) {
+ if (Boolean.getBoolean(parentProperties.getProperty(Options.NATURAL_LANGUAGE_SUPPORT, Options.DEFAULT_NATURAL_LAMGUAGE_SUPPORT))) {
myProps = defaultProperties();
} else {
myProps = new Properties();
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java 2014-05-09 19:07:02 UTC (rev 8255)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java 2014-05-09 19:07:09 UTC (rev 8256)
@@ -29,7 +29,6 @@
import java.util.Collections;
import java.util.HashMap;
-import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
@@ -52,11 +51,24 @@
import com.bigdata.btree.keys.KeyBuilder;
/**
+ * This is the default implementation but could be regarded as legacy since
+ * it fails to use the correct {@link Analyzer} for almost all languages (other than
+ * English). It uses the correct natural language analyzer for literals tagged with
+ * "por", "deu", "ger", "zho", "chi", "jpn", "kor", "ces", "cze", "dut", "nld", "gre", "ell",
+ * "fra", "fre", "rus" and "tha".
+ * This codes do not work if they are used with subtags, e.g. "ger-AT" is treated as English.
+ * No two letter code works correctly: note that the W3C and
+ * IETF recommend the use of the two letter forms instead of the three letter forms.
+ * <p>
* Default implementation registers a bunch of {@link Analyzer}s for various
* language codes and then serves the appropriate {@link Analyzer} based on
* the specified language code.
*
* @author <a href="mailto:tho...@us...">Bryan Thompson</a>
+ * @deprecated Using {@link ConfigurableAnalyzerFactory} with
+ * the {@link ConfigurableAnalyzerFactory.Options#NATURAL_LANGUAGE_SUPPORT}
+ * uses the appropriate natural language analyzers for the two letter codes
+ * and for tags which include sub-tags.
* @version $Id$
*/
public class DefaultAnalyzerFactory implements IAnalyzerFactory {
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <jer...@us...> - 2014-05-09 23:08:38
|
Revision: 8259
http://sourceforge.net/p/bigdata/code/8259
Author: jeremy_carroll
Date: 2014-05-09 23:08:34 +0000 (Fri, 09 May 2014)
Log Message:
-----------
javadoc changes
Modified Paths:
--------------
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 22:39:19 UTC (rev 8258)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java 2014-05-09 23:08:34 UTC (rev 8259)
@@ -66,6 +66,7 @@
* Supported classes included all the natural language specific classes from Lucene, and also:
* <ul>
* <li>{@link PatternAnalyzer}
+ * <li>{@link TermCompletionAnalyzer}
* <li>{@link KeywordAnalyzer}
* <li>{@link SimpleAnalyzer}
* <li>{@link StopAnalyzer}
@@ -76,7 +77,6 @@
* <ul>
* <li>no arguments
* <li>{@link Version}
- * <li>{@link Set} (of strings, the stop words)
* <li>{@link Version}, {@link Set}
* </ul>
* is usable. If the class has a static method named <code>getDefaultStopSet()</code> then this is assumed
@@ -89,10 +89,6 @@
* abbreviate to <code>c.b.s.C</code> in this documentation.
* Properties from {@link Options} apply to the factory.
* <p>
- *
- * If there are no such properties at all then the property {@link Options#NATURAL_LANGUAGE_SUPPORT} is set to true,
- * and the behavior of this class is the same as the legacy {@link DefaultAnalyzerFactory}.
- * <p>
* Other properties, from {@link AnalyzerOptions} start with
* <code>c.b.s.C.analyzer.<em>language-range</em></code> where <code><em>language-range</em></code> conforms
* with the extended language range construct from RFC 4647, section 2.2.
@@ -103,7 +99,7 @@
* If no analyzer is specified for the language range <code>*</code> then the {@link StandardAnalyzer} is used.
* <p>
* Given any specific language, then the analyzer matching the longest configured language range,
- * measured in number of subtags is used {@link #getAnalyzer(String, boolean)}
+ * measured in number of subtags is returned by {@link #getAnalyzer(String, boolean)}
* In the event of a tie, the alphabetically first language range is used.
* The algorithm to find a match is "Extended Filtering" as defined in section 3.3.2 of RFC 4647.
* <p>
@@ -132,11 +128,11 @@
/**
* This is an implementation of RFC 4647 language range,
- * targetted at some of the context of bigdata, and only
+ * targeted at the specific needs within bigdata, and only
* supporting the extended filtering specified in section 3.3.2
* <p>
* Language ranges are comparable so that
- * sorting an array and then matching a language tage against each
+ * sorting an array and then matching a language tag against each
* member of the array in sequence will give the longest match.
* i.e. the longer ranges come first.
* @author jeremycarroll
Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java 2014-05-09 22:39:19 UTC (rev 8258)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java 2014-05-09 23:08:34 UTC (rev 8259)
@@ -51,18 +51,15 @@
import com.bigdata.btree.keys.KeyBuilder;
/**
- * This is the default implementation but could be regarded as legacy since
+ * This is the default implementation but should be regarded as legacy since
* it fails to use the correct {@link Analyzer} for almost all languages (other than
- * English). It uses the correct natural language analyzer for literals tagged with
+ * English). It uses the correct natural language analyzer only for literals tagged with
+ * certain three letter ISO 639 codes:
* "por", "deu", "ger", "zho", "chi", "jpn", "kor", "ces", "cze", "dut", "nld", "gre", "ell",
- * "fra", "fre", "rus" and "tha".
- * This codes do not work if they are used with subtags, e.g. "ger-AT" is treated as English.
- * No two letter code works correctly: note that the W3C and
+ * "fra", "fre", "rus" and "tha". All other tags are treated as English.
+ * These codes do not work if they are used with subtags, e.g. "ger-AT" is treated as English.
+ * No two letter code, other than "en", works correctly: note that the W3C and
* IETF recommend the use of the two letter forms instead of the three letter forms.
- * <p>
- * Default implementation registers a bunch of {@link Analyzer}s for various
- * language codes and then serves the appropriate {@link Analyzer} based on
- * the specified language code.
*
* @author <a href="mailto:tho...@us...">Bryan Thompson</a>
* @deprecated Using {@link ConfigurableAnalyzerFactory} with
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|