| 
      
      
      From: <bi...@us...> - 2010-03-18 22:40:45
      
     | 
| Revision: 2979
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2979&view=rev
Author:   binzino
Date:     2010-03-18 22:40:39 +0000 (Thu, 18 Mar 2010)
Log Message:
-----------
WAX-74.  Add support for storing field value in compressed form.
Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml
    trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java
Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml	2010-03-18 22:11:53 UTC (rev 2978)
+++ trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml	2010-03-18 22:40:39 UTC (rev 2979)
@@ -44,11 +44,10 @@
   <name>nutchwax.filter.index</name>
   <value>
     title:false:true:tokenized
-    content:false:false:tokenized
+    content:false:compress:tokenized
     site:false:false:untokenized
 
     url:false:true:tokenized
-    digest:false:true:no
 
     collection:true:true:no_norms
     date:true:true:no_norms
Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java	2010-03-18 22:11:53 UTC (rev 2978)
+++ trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java	2010-03-18 22:40:39 UTC (rev 2979)
@@ -36,6 +36,7 @@
 import org.apache.nutch.indexer.NutchDocument;
 import org.apache.nutch.indexer.lucene.LuceneWriter;
 import org.apache.nutch.indexer.lucene.LuceneWriter.INDEX;
+import org.apache.nutch.indexer.lucene.LuceneWriter.STORE;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.Parse;
 
@@ -74,7 +75,7 @@
 
         String  srcKey     = spec[0];
         boolean lowerCase  = true;
-        boolean store      = true;
+        STORE   store      = STORE.YES;
         INDEX   index      = INDEX.TOKENIZED;
         boolean exclusive  = true;
         String  destKey    = srcKey;
@@ -91,7 +92,10 @@
                         "no_norms".   equals(spec[3]) ? INDEX.NO_NORMS :
                         INDEX.NO;
           case 3:
-            store     = Boolean.parseBoolean( spec[2] );
+            //store     = Boolean.parseBoolean( spec[2] );
+            store     = "true".    equals(spec[2]) ? STORE.YES :
+                        "compress".equals(spec[2]) ? STORE.COMPRESS :
+                        STORE.NO;
           case 2:
             lowerCase = Boolean.parseBoolean( spec[1] );
           case 1:
@@ -109,12 +113,12 @@
   {
     String  srcKey;
     boolean lowerCase;
-    boolean store;
+    STORE   store;
     INDEX   index;
     boolean exclusive;
     String  destKey;
 
-    public FieldSpecification( String srcKey, boolean lowerCase, boolean store, INDEX index, boolean exclusive, String destKey )
+    public FieldSpecification( String srcKey, boolean lowerCase, STORE store, INDEX index, boolean exclusive, String destKey )
     {
       this.srcKey    = srcKey;
       this.lowerCase = lowerCase;
@@ -147,6 +151,12 @@
             try
               {
                 value = (new URL( meta.get( "url" ) ) ).getHost( );
+
+                // Strip off any leading "www." prefix from the host name.
+                if ( value.startsWith( "www." ) )
+                  {
+                    value = value.substring( 4 );
+                  }
               }
             catch ( MalformedURLException mue ) { /* Eat it */ }
           }
@@ -171,6 +181,11 @@
             int p = value.indexOf( ';' );
             if ( p >= 0 ) value = value.substring( 0, p );
           }
+        else if ( "collection".equals( spec.srcKey ) )
+          {
+            // Use value given in config first, otherwise what's in the metadata object.
+            value = conf.get( "nutchwax.index.collection", meta.get( spec.srcKey ) );
+          }
         else
           {
             value = meta.get( spec.srcKey );
@@ -188,7 +203,7 @@
             doc.removeField( spec.destKey );
           }
 
-        if ( spec.store || spec.index != INDEX.NO )
+        if ( spec.store != STORE.NO || spec.index != INDEX.NO )
           {
             doc.add( spec.destKey, value );
           }
@@ -202,13 +217,13 @@
   {
     for ( FieldSpecification spec : this.fieldSpecs )
       {
-        if ( ! spec.store && spec.index == INDEX.NO )
+        if ( spec.store == STORE.NO && spec.index == INDEX.NO )
           {
             continue ;
           }
 
         LuceneWriter.addFieldOptions( spec.destKey, 
-                                      spec.store ? LuceneWriter.STORE.YES : LuceneWriter.STORE.NO,
+                                      spec.store,
                                       spec.index,
                                       conf );
       }
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
 |