[Archive-access-cvs] SF.net SVN: archive-access:[2743] tags/nutchwax-0_12_5/archive/src

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 2743
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2743&view=rev
Author:   binzino
Date:     2009-06-23 21:35:00 +0000 (Tue, 23 Jun 2009)

Log Message:
-----------
Fix WAX-45 and WAX-48.  ConfigurableIndexingFilter can handle all the fields relevant to Nutch(WAX).  Update the nutch-site.xml accordingly.  Also, remove the site and url query filters from nutch-site.xml and configure NutchWAX query filter to take over for them.

Modified Paths:
--------------
    tags/nutchwax-0_12_5/archive/src/nutch/conf/nutch-site.xml
    tags/nutchwax-0_12_5/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java
    tags/nutchwax-0_12_5/archive/src/plugin/query-nutchwax/plugin.xml

Modified: tags/nutchwax-0_12_5/archive/src/nutch/conf/nutch-site.xml
===================================================================

--- tags/nutchwax-0_12_5/archive/src/nutch/conf/nutch-site.xml	2009-06-23 21:17:31 UTC (rev 2742)
+++ tags/nutchwax-0_12_5/archive/src/nutch/conf/nutch-site.xml	2009-06-23 21:35:00 UTC (rev 2743)
@@ -10,19 +10,18 @@
   <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. -->
   <!-- Also, add 'parse-pdf' -->
   <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' -->
-  <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value>
+  <value>protocol-http|parse-(text|html|js|pdf)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value>
 </property>
 
-<!-- The indexing filter order *must* be specified in order for
-     NutchWAX's ConfigurableIndexingFilter to be called *after* the
-     BasicIndexingFilter.  This is necessary so that the
-     ConfigurableIndexingFilter can over-write some of the values put
-     into the Lucene document by the BasicIndexingFilter.
-     
-     The over-written values are the 'url' and 'digest' fields, which
-     NutchWAX needs to handle specially in order for de-duplication to
-     work properly.
-  -->
+<!-- 
+  When using *only* the 'index-nutchwax' plugin in 'plugin.includes' above, 
+  we don't need to specify an order since there is only one plugin.
+
+  However, if you choose to use the Nutch 'index-basic', then you have
+  to specify the order such that the NutchWAX ConfigurableIndexingFilter
+  is after it.  Whichever plugin comes last over-writes the values
+  of those that come before it.
+
 <property>
   <name>indexingfilter.order</name>
   <value>
@@ -30,29 +29,31 @@
     org.archive.nutchwax.index.ConfigurableIndexingFilter
   </value>
 </property>
+  -->
 
 <property>
   <!-- Configure the 'index-nutchwax' plugin.  Specify how the metadata fields added by the Importer are mapped to the Lucene documents during indexing.
-       The specifications here are of the form "src-key:lowercase:store:tokenize:dest-key"
+       The specifications here are of the form "src-key:lowercase:store:index:exclusive:dest-key"
        Where the only required part is the "src-key", the rest will assume the following defaults:
           lowercase = true
           store     = true
-          tokenize  = false
+          index     = tokenized
           exclusive = true
           dest-key  = src-key
     -->
   <name>nutchwax.filter.index</name>
   <value>
-    url:false:true:true
-    url:false:true:false:true:exacturl
-    orig:false
-    digest:false
-    filename:false
-    fileoffset:false
-    collection
-    date
-    type
-    length
+    title:false:true:tokenized
+    content:false:false:tokenized
+    site:false:false:untokenized
+
+    url:false:true:no
+    digest:false:true:no
+
+    collection:true:true:no_norms
+    date:true:true:no_norms
+    type:true:true:no_norms
+    length:false:true:no
   </value>
 </property>
 
@@ -70,15 +71,10 @@
   <!-- We do *not* use this filter for handling "date" queries, there is a specific filter for that: DateQueryFilter -->
   <name>nutchwax.filter.query</name>
   <value>
-    raw:digest:false
-    raw:filename:false
-    raw:fileoffset:false
-    raw:exacturl:false
     group:collection
+    group:site:false
     group:type
-    field:anchor
     field:content
-    field:host
     field:title
   </value>
 </property>

Modified: tags/nutchwax-0_12_5/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java
===================================================================
--- tags/nutchwax-0_12_5/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java	2009-06-23 21:17:31 UTC (rev 2742)
+++ tags/nutchwax-0_12_5/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java	2009-06-23 21:35:00 UTC (rev 2743)
@@ -20,6 +20,8 @@
  */
 package org.archive.nutchwax.index;
 
+import java.net.MalformedURLException;
+import java.net.URL;
 import java.util.List;
 import java.util.ArrayList;
 
@@ -27,6 +29,7 @@
 import org.apache.commons.logging.LogFactory;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -46,10 +49,14 @@
   private Configuration conf;
   private List<FieldSpecification> fieldSpecs;
 
+  private int MAX_TITLE_LENGTH;
+
   public void setConf( Configuration conf )
   {
     this.conf = conf;
-    
+
+    this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
+
     String filterSpecs = conf.get( "nutchwax.filter.index" );
     
     if ( null == filterSpecs )
@@ -65,12 +72,12 @@
       {
         String spec[] = filterSpec.split("[:]");
 
-        String  srcKey    = spec[0];
-        boolean lowerCase = true;
-        boolean store     = true;
-        boolean tokenize  = false;
-        boolean exclusive = true;
-        String  destKey   = srcKey;
+        String  srcKey     = spec[0];
+        boolean lowerCase  = true;
+        boolean store      = true;
+        Index   index      = Index.TOKENIZED;
+        boolean exclusive  = true;
+        String  destKey    = srcKey;
         switch ( spec.length )
           {
           default:
@@ -79,7 +86,10 @@
           case 5:
             exclusive = Boolean.parseBoolean( spec[4] );
           case 4:
-            tokenize  = Boolean.parseBoolean( spec[3] );
+            index     = "tokenized".  equals(spec[3]) ? Index.TOKENIZED : 
+                        "untokenized".equals(spec[3]) ? Index.UN_TOKENIZED : 
+                        "no_norms".   equals(spec[3]) ? Index.NO_NORMS :
+                        Index.NO;
           case 3:
             store     = Boolean.parseBoolean( spec[2] );
           case 2:
@@ -89,9 +99,9 @@
             ;
           }
 
-        LOG.info( "Add field specification: " + srcKey + ":" + lowerCase + ":" + store + ":" + tokenize + ":" + exclusive + ":" + destKey );
+        LOG.info( "Add field specification: " + srcKey + ":" + lowerCase + ":" + store + ":" + index + ":" + exclusive + ":" + destKey );
 
-        this.fieldSpecs.add( new FieldSpecification( srcKey, lowerCase, store, tokenize, exclusive, destKey ) );
+        this.fieldSpecs.add( new FieldSpecification( srcKey, lowerCase, store, index, exclusive, destKey ) );
       }
   }
 
@@ -100,16 +110,16 @@
     String  srcKey;
     boolean lowerCase;
     boolean store;
-    boolean tokenize;
+    Index   index;
     boolean exclusive;
     String  destKey;
 
-    public FieldSpecification( String srcKey, boolean lowerCase, boolean store, boolean tokenize, boolean exclusive, String destKey )
+    public FieldSpecification( String srcKey, boolean lowerCase, boolean store, Index index, boolean exclusive, String destKey )
     {
       this.srcKey    = srcKey;
       this.lowerCase = lowerCase;
       this.store     = store;
-      this.tokenize  = tokenize;
+      this.index     = index;
       this.exclusive = exclusive;
       this.destKey   = destKey;
     }
@@ -124,14 +134,47 @@
    * Transfer NutchWAX field values stored in the parsed content to
    * the Lucene document.
    */
-  public Document filter( Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks )
+  public Document filter( Document doc, Parse parse, Text key, CrawlDatum datum, Inlinks inlinks )
     throws IndexingException
   {
     Metadata meta = parse.getData().getContentMeta();
 
     for ( FieldSpecification spec : this.fieldSpecs )
       {
-        String value = meta.get( spec.srcKey );
+        String value = null;
+        if ( "site".equals( spec.srcKey ) || "host".equals( spec.srcKey ) )
+          {
+            try
+              {
+                value = (new URL( meta.get( "url" ) ) ).getHost( );
+              }
+            catch ( MalformedURLException mue ) { /* Eat it */ }
+          }
+        else if ( "content".equals( spec.srcKey ) ) 
+          {
+            value = parse.getText( );
+          }
+        else if ( "title".equals( spec.srcKey ) )
+          {
+            value = parse.getData().getTitle();
+            if ( value.length() > MAX_TITLE_LENGTH )      // truncate title if needed
+              {
+                value = value.substring( 0, MAX_TITLE_LENGTH );
+              }
+          }
+        else if ( "type".equals( spec.srcKey ) )
+          {
+            value = meta.get( spec.srcKey );
+
+            if ( value == null ) continue ;
+
+            int p = value.indexOf( ';' );
+            if ( p >= 0 ) value = value.substring( 0, p );
+          }
+        else
+          {
+            value = meta.get( spec.srcKey );
+          }
         
         if ( value == null ) continue;
 
@@ -144,11 +187,14 @@
           {
             doc.removeFields( spec.destKey );
           }
-        
-        doc.add( new Field( spec.destKey, 
-                            value, 
-                            spec.store    ? Field.Store.YES : Field.Store.NO, 
-                            spec.tokenize ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED ) );
+
+        if ( spec.store || spec.index != Index.NO )
+          {
+            doc.add( new Field( spec.destKey, 
+                                value, 
+                                spec.store ? Field.Store.YES : Field.Store.NO, 
+                                spec.index ) );
+          }
       }
 
     return doc;

Modified: tags/nutchwax-0_12_5/archive/src/plugin/query-nutchwax/plugin.xml
===================================================================
--- tags/nutchwax-0_12_5/archive/src/plugin/query-nutchwax/plugin.xml	2009-06-23 21:17:31 UTC (rev 2742)
+++ tags/nutchwax-0_12_5/archive/src/plugin/query-nutchwax/plugin.xml	2009-06-23 21:35:00 UTC (rev 2743)
@@ -40,8 +40,8 @@
               point="org.apache.nutch.searcher.QueryFilter">
       <implementation id="ConfigurableQueryFilter"
                       class="org.archive.nutchwax.query.ConfigurableQueryFilter">
-        <parameter name="raw-fields" value="collection,date,digest,exacturl,filename,fileoffset,type" />
-        <parameter name="fields"     value="anchor,content,host,title" />
+        <parameter name="raw-fields" value="collection,site,type" />
+        <parameter name="fields"     value="content,title" />
       </implementation>
    </extension>
               


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.