Thread: [Bigdata-commit] SF.net SVN: bigdata:[8256] branches/TEXT_ANALYZERS/bigdata/src/java/com/ bigdata/s

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 8256
          http://sourceforge.net/p/bigdata/code/8256
Author:   jeremy_carroll
Date:     2014-05-09 19:07:09 +0000 (Fri, 09 May 2014)
Log Message:
-----------
Addressing trac 915 by documenting the current behavior, deprecating DefaultAnalyzerFactory, and suggesting the use of ConfigurableAnalyzerFactory instead

Modified Paths:
--------------
    branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
    branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java

Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java
===================================================================

--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java	2014-05-09 19:07:02 UTC (rev 8255)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/ConfigurableAnalyzerFactory.java	2014-05-09 19:07:09 UTC (rev 8256)
@@ -90,7 +90,7 @@
  * Properties from {@link Options} apply to the factory.
  * <p>
  * 
- * If there are no such properties at all then the property {@link Options#INCLUDE_DEFAULTS} is set to true,
+ * If there are no such properties at all then the property {@link Options#NATURAL_LANGUAGE_SUPPORT} is set to true,
  * and the behavior of this class is the same as the legacy {@link DefaultAnalyzerFactory}.
  * <p>
  * Other properties, from {@link AnalyzerOptions} start with
@@ -117,7 +117,7 @@
  * <dd>This suppresses the functionality, by treating every expression as a stop word.</dd>
  * </dl>
  * there are in addition the language specific analyzers that are included
- * by using the option {@link Options#INCLUDE_DEFAULTS}
+ * by using the option {@link Options#NATURAL_LANGUAGE_SUPPORT}
  * 
  * 
  * @author jeremycarroll
@@ -265,18 +265,13 @@
     	 * 
     	 * 
     	 */
-        String INCLUDE_DEFAULTS = ConfigurableAnalyzerFactory.class.getName() + ".includeDefaults";
+        String NATURAL_LANGUAGE_SUPPORT = ConfigurableAnalyzerFactory.class.getName() + ".includeDefaults";
         /**
          * This is the prefix to all properties configuring the individual analyzers.
          */
         String ANALYZER = ConfigurableAnalyzerFactory.class.getName() + ".analyzer.";
-/**
- * If there is no configuration at all, then the defaults are included,
- * but any configuration at all totally replaces the defaults, unless 
- * {@link #INCLUDE_DEFAULTS}
- * is explicitly set to true.
- */
-        String DEFAULT_INCLUDE_DEFAULTS = "false";
+
+        String DEFAULT_NATURAL_LAMGUAGE_SUPPORT = "false";
     }
     /**
      * Options understood by analyzers created by {@link ConfigurableAnalyzerFactory}.
@@ -810,7 +805,7 @@
 		while (en.hasMoreElements()) {
 			
 			String prop = (String)en.nextElement();
-			if (prop.equals(Options.INCLUDE_DEFAULTS)) continue;
+			if (prop.equals(Options.NATURAL_LANGUAGE_SUPPORT)) continue;
 			if (prop.startsWith(Options.ANALYZER)) {
 				String languageRangeAndProperty[] = prop.substring(Options.ANALYZER.length()).replaceAll("_","*").split("[.]");
 				if (languageRangeAndProperty.length == 2) {
@@ -838,7 +833,7 @@
 	protected Properties initProperties() {
 		final Properties parentProperties = fullTextIndex.getProperties();
         Properties myProps;
-        if (Boolean.getBoolean(parentProperties.getProperty(Options.INCLUDE_DEFAULTS, Options.DEFAULT_INCLUDE_DEFAULTS))) {
+        if (Boolean.getBoolean(parentProperties.getProperty(Options.NATURAL_LANGUAGE_SUPPORT, Options.DEFAULT_NATURAL_LAMGUAGE_SUPPORT))) {
         	myProps = defaultProperties();
         } else {
         	myProps = new Properties();

Modified: branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java
===================================================================
--- branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java	2014-05-09 19:07:02 UTC (rev 8255)
+++ branches/TEXT_ANALYZERS/bigdata/src/java/com/bigdata/search/DefaultAnalyzerFactory.java	2014-05-09 19:07:09 UTC (rev 8256)
@@ -29,7 +29,6 @@
 
 import java.util.Collections;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
@@ -52,11 +51,24 @@
 import com.bigdata.btree.keys.KeyBuilder;
 
 /**
+ * This is the default implementation but could be regarded as legacy since
+ * it fails to use the correct {@link Analyzer} for almost all languages (other than
+ * English). It uses the correct natural language analyzer for literals tagged with
+ * "por", "deu", "ger", "zho", "chi", "jpn", "kor", "ces", "cze", "dut", "nld", "gre", "ell",
+ * "fra", "fre", "rus" and "tha". 
+ * These codes do not work if they are used with subtags, e.g. "ger-AT" is treated as English.
+ * No two letter code works correctly: note that the W3C and 
+ * IETF recommend the use of the two letter forms instead of the three letter forms.
+ * <p>
  * Default implementation registers a bunch of {@link Analyzer}s for various
  * language codes and then serves the appropriate {@link Analyzer} based on
  * the specified language code.
  * 
  * @author <a href="mailto:tho...@us...">Bryan Thompson</a>
+ * @deprecated Using {@link ConfigurableAnalyzerFactory} with 
+ *    the {@link ConfigurableAnalyzerFactory.Options#NATURAL_LANGUAGE_SUPPORT} option
+ *    uses the appropriate natural language analyzers for the two letter codes
+ *    and for tags which include sub-tags.
  * @version $Id$
  */
 public class DefaultAnalyzerFactory implements IAnalyzerFactory {

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





Thread: [Bigdata-commit] SF.net SVN: bigdata:[8256] branches/TEXT_ANALYZERS/bigdata/src/java/com/ bigdata/s

Fast, scalable, robust graph database platform

bigdata-commit