Thread: [Archive-access-cvs] SF.net SVN: archive-access:[2494] trunk/archive-access/projects/nutchwax/ arch

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 2494
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2494&view=rev
Author:   binzino
Date:     2008-07-25 20:24:53 +0000 (Fri, 25 Jul 2008)

Log Message:
-----------
Added HTTPStatusCodeFilter and configuration thereof in nutch-site.xml.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java

Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml
===================================================================

--- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml	2008-07-25 02:46:16 UTC (rev 2493)
+++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml	2008-07-25 20:24:53 UTC (rev 2494)
@@ -32,7 +32,7 @@
 </property>
 
 <property>
-  <!-- Configure the 'index-nutchwax' plugin.  Specify how the metadata fields added by the ArcsToSegment are mapped to the Lucene documents during indexing.
+  <!-- Configure the 'index-nutchwax' plugin.  Specify how the metadata fields added by the Importer are mapped to the Lucene documents during indexing.
        The specifications here are of the form "src-key:lowercase:store:tokenize:dest-key"
        Where the only required part is the "src-key", the rest will assume the following defaults:
           lowercase = true
@@ -111,9 +111,16 @@
 <property>
   <name>nutchwax.urlfilter.wayback.canonicalizer</name>
   <value>org.archive.wayback.util.url.AggressiveUrlCanonicalizer</value>
-  <description></description>
+  <description>Implementation of URL canonicalizer to use.</description>
 </property>
 
+<property>
+  <name>nutchwax.filter.http.status</name>
+  <value>
+    200-299
+  </value>
+</property>
+
 <!-- Similar to Nutch's
        file.content.limit
        http.content.limit

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java	2008-07-25 02:46:16 UTC (rev 2493)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java	2008-07-25 20:24:53 UTC (rev 2494)
@@ -20,6 +20,8 @@
 import java.net.MalformedURLException;
 import java.util.Map.Entry;
 import java.util.Iterator;
+import java.util.List;
+import java.util.ArrayList;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -106,12 +108,8 @@
   private ParseUtil      parseUtil;
   private URLNormalizers normalizers;
   private int            interval;
+  private HTTPStatusCodeFilter httpStatusCodeFilter;
 
-  private long           numSkipped;
-  private long           numImported;
-  private long           bytesSkipped;
-  private long           bytesImported;
-
   /**
    * ?: Is this necessary?
    */
@@ -146,6 +144,8 @@
     this.parseUtil   = new ParseUtil     ( jobConf );
     this.normalizers = new URLNormalizers( jobConf, URLNormalizers.SCOPE_FETCHER );
     this.interval    = jobConf.getInt( "db.fetch.interval.default", 2592000      );
+
+    this.httpStatusCodeFilter = new HTTPStatusCodeFilter( jobConf.get( "nutchwax.filter.http.status" ) );
   }
 
   /**
@@ -233,6 +233,13 @@
     
     if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" );
 
+    if ( ! this.httpStatusCodeFilter.isAllowed( record.getStatusCode( ) ) )
+      {
+        if ( LOG.isInfoEnabled() ) LOG.info( "Skip     URL: " + meta.getUrl() + " HTTP status:" + record.getStatusCode() );
+
+        return false;
+      }
+
     try
       {
         // Skip the HTTP headers in the response body, so that the
@@ -313,6 +320,7 @@
         contentMetadata.set( NutchWax.DIGEST_KEY,         meta.getDigest()   );
         contentMetadata.set( NutchWax.CONTENT_TYPE_KEY,   meta.getMimetype() );
         contentMetadata.set( NutchWax.CONTENT_LENGTH_KEY, String.valueOf( meta.getLength() ) );
+        contentMetadata.set( NutchWax.HTTP_RESPONSE_KEY,  String.valueOf( record.getStatusCode() ) );
 
         Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() );
 
@@ -677,3 +685,96 @@
   }
 
 }
+
+
+/**
+ * Allows or rejects ARC records based on their HTTP status code.  The
+ * configuration is a whitespace-separated list of single codes
+ * ("200"), inclusive ranges ("200-299") and/or "none" (no status
+ * code).  A code is allowed when it matches ANY configured entry.
+ *
+ * This should all be moved into some sort of filtering plugin.
+ * Unfortunately the URLFilter plugin interface isn't adequate as it
+ * only looks at a URL string.  Rather than jamming a response code
+ * through that interface, we do a one-off filter class here.
+ *
+ * A long-term solution would be to create a new Nutch extension point
+ * interface that takes an ARCRecord rather than a URL string.  That
+ * way we can write filters that can operate on any part of an
+ * ARCRecord, not just the URL.
+ */
+class HTTPStatusCodeFilter
+{
+  List<Range> ranges = new ArrayList<Range>( );
+
+  /**
+   * @param configuration entries as described above; null allows
+   *        nothing.  Malformed entries are logged and skipped.
+   */
+  public HTTPStatusCodeFilter( String configuration )
+  {
+    if ( configuration == null ) return ;
+
+    for ( String value : configuration.trim( ).split( "\\s+" ) )
+      {
+        Range range = new Range( );
+
+        // Special handling for "none" where an ARCRecord doesn't have
+        // an HTTP status code.  The ARCRecord.getStatusCode() returns
+        // -1 in that case, so we make a range for it.
+        if ( "none".equalsIgnoreCase( value ) )
+          {
+            range.lower = -1;
+            range.upper = -1;
+            this.ranges.add( range );
+            continue;
+          }
+
+        String values[] = value.split( "[-]" );
+        try
+          {
+            switch ( values.length )
+              {
+              case 2:
+                // It's a range, N-M
+                range.lower = Integer.parseInt( values[0] );
+                range.upper = Integer.parseInt( values[1] );
+                break;
+
+              case 1:
+                // It's a single value, convert to a single-value range
+                range.lower = Integer.parseInt( values[0] );
+                range.upper = range.lower;
+                break;
+
+              default:
+                // Bad format; report the offending text, not the Range.
+                Importer.LOG.warn( "Illegal format for nutchwax.filter.http.status: " + value );
+                continue ;
+              }
+            this.ranges.add( range );
+          }
+        catch ( NumberFormatException nfe )
+          {
+            Importer.LOG.warn( "Illegal format for nutchwax.filter.http.status: " + value, nfe );
+          }
+      }
+  }
+
+  /** Returns true if code falls within ANY configured range, not just the first one. */
+  public boolean isAllowed( int code )
+  {
+    for ( Range r : this.ranges )
+      {
+        if ( r.lower <= code && code <= r.upper ) return true;
+      }
+
+    return false;
+  }
+
+  static class Range 
+  {
+    int lower;
+    int upper;
+  }
+}

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java	2008-07-25 02:46:16 UTC (rev 2493)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java	2008-07-25 20:24:53 UTC (rev 2494)
@@ -31,4 +31,5 @@
   public static final String DIGEST_KEY         = "digest";
   public static final String CONTENT_TYPE_KEY   = "type";
   public static final String CONTENT_LENGTH_KEY = "length";
+  public static final String HTTP_RESPONSE_KEY  = "http_response_code";
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




Thread: [Archive-access-cvs] SF.net SVN: archive-access:[2494] trunk/archive-access/projects/nutchwax/ arch

archive-access-cvs