|
From: <bi...@us...> - 2008-07-25 20:24:45
|
Revision: 2494
http://archive-access.svn.sourceforge.net/archive-access/?rev=2494&view=rev
Author: binzino
Date: 2008-07-25 20:24:53 +0000 (Fri, 25 Jul 2008)
Log Message:
-----------
Added HTTPStatusCodeFilter and configuration thereof in nutch-site.xml.
Modified Paths:
--------------
trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml
trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java
Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-07-25 02:46:16 UTC (rev 2493)
+++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-07-25 20:24:53 UTC (rev 2494)
@@ -32,7 +32,7 @@
</property>
<property>
- <!-- Configure the 'index-nutchwax' plugin. Specify how the metadata fields added by the ArcsToSegment are mapped to the Lucene documents during indexing.
+ <!-- Configure the 'index-nutchwax' plugin. Specify how the metadata fields added by the Importer are mapped to the Lucene documents during indexing.
The specifications here are of the form "src-key:lowercase:store:tokenize:dest-key"
Where the only required part is the "src-key", the rest will assume the following defaults:
lowercase = true
@@ -111,9 +111,16 @@
<property>
<name>nutchwax.urlfilter.wayback.canonicalizer</name>
<value>org.archive.wayback.util.url.AggressiveUrlCanonicalizer</value>
- <description></description>
+ <description>Implementation of URL canonicalizer to use.</description>
</property>
+<property>
+ <name>nutchwax.filter.http.status</name> <!-- Parsed by HTTPStatusCodeFilter as a whitespace-separated list of entries.
+      Each entry is a single HTTP status code (e.g. 200), an inclusive
+      range N-M (e.g. 200-299), or the word "none" for records that
+      have no HTTP status code.  The Importer skips records whose
+      status code the filter does not allow. -->
+ <value>
+ 200-299
+ </value>
+</property>
+
<!-- Similar to Nutch's
file.content.limit
http.content.limit
Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-07-25 02:46:16 UTC (rev 2493)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-07-25 20:24:53 UTC (rev 2494)
@@ -20,6 +20,8 @@
import java.net.MalformedURLException;
import java.util.Map.Entry;
import java.util.Iterator;
+import java.util.List;
+import java.util.ArrayList;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -106,12 +108,8 @@
private ParseUtil parseUtil;
private URLNormalizers normalizers;
private int interval;
+ private HTTPStatusCodeFilter httpStatusCodeFilter;
- private long numSkipped;
- private long numImported;
- private long bytesSkipped;
- private long bytesImported;
-
/**
* ?: Is this necessary?
*/
@@ -146,6 +144,8 @@
this.parseUtil = new ParseUtil ( jobConf );
this.normalizers = new URLNormalizers( jobConf, URLNormalizers.SCOPE_FETCHER );
this.interval = jobConf.getInt( "db.fetch.interval.default", 2592000 );
+
+ this.httpStatusCodeFilter = new HTTPStatusCodeFilter( jobConf.get( "nutchwax.filter.http.status" ) );
}
/**
@@ -233,6 +233,13 @@
if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" );
+ if ( ! this.httpStatusCodeFilter.isAllowed( record.getStatusCode( ) ) )
+ {
+ if ( LOG.isInfoEnabled() ) LOG.info( "Skip URL: " + meta.getUrl() + " HTTP status:" + record.getStatusCode() );
+
+ return false;
+ }
+
try
{
// Skip the HTTP headers in the response body, so that the
@@ -313,6 +320,7 @@
contentMetadata.set( NutchWax.DIGEST_KEY, meta.getDigest() );
contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() );
contentMetadata.set( NutchWax.CONTENT_LENGTH_KEY, String.valueOf( meta.getLength() ) );
+ contentMetadata.set( NutchWax.HTTP_RESPONSE_KEY, String.valueOf( record.getStatusCode() ) );
Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() );
@@ -677,3 +685,96 @@
}
}
+
+
+/**
+ * This should all be moved into some sort of filtering plugin.
+ * Unfortunately the URLFilter plugin interface isn't adequate as it
+ * only looks at a URL string. Rather than jamming a response code
+ * through that interface, we do a one-off filter class here.
+ *
+ * A long-term solution would be to create a new Nutch extension point
+ * interface that takes an ARCRecord rather than a URL string. That
+ * way we can write filters that can operate on any part of an
+ * ARCRecord, not just the URL.
+ */
+class HTTPStatusCodeFilter
+{
+ List<Range> ranges = new ArrayList<Range>( );
+
+ public HTTPStatusCodeFilter( String configuration )
+ {
+ if ( configuration == null )
+ {
+ return ;
+ }
+
+ configuration = configuration.trim( );
+
+ for ( String value : configuration.split( "\\s+" ) )
+ {
+ Range range = new Range( );
+
+ // Special handling for "none" where an ARCRecord doesn't have
+ // an HTTP status code. The ARCRecord.getStatusCode() returns
+ // -1 in that case, so we make a range for it.
+ if ( value.toLowerCase( ).equals( "none" ) )
+ {
+ range.lower = -1;
+ range.upper = -1;
+
+ this.ranges.add( range );
+
+ continue;
+ }
+
+ String values[] = value.split( "[-]" );
+
+ try
+ {
+ switch ( values.length )
+ {
+ case 2:
+ // It's a range, N-M
+ range.lower = Integer.parseInt( values[0] );
+ range.upper = Integer.parseInt( values[1] );
+ break;
+
+ case 1:
+ // It's a single value, convert to a single-value range
+ range.lower = Integer.parseInt( values[0] );
+ range.upper = range.lower;
+ break;
+
+ default:
+ // Bad format
+ Importer.LOG.warn( "Illegal format for nutchwax.filter.http.status: " + range );
+ continue ;
+ }
+
+ this.ranges.add( range );
+ }
+ catch ( NumberFormatException nfe )
+ {
+ Importer.LOG.warn( "Illegal format for nutchwax.filter.http.status: " + range, nfe );
+ }
+ }
+
+ }
+
+ public boolean isAllowed( int code )
+ {
+ for ( Range r : this.ranges )
+ {
+ return ( r.lower <= code && code <= r.upper );
+ }
+
+ return false;
+ }
+
+ static class Range
+ {
+ int lower;
+ int upper;
+ }
+}
Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-07-25 02:46:16 UTC (rev 2493)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-07-25 20:24:53 UTC (rev 2494)
@@ -31,4 +31,5 @@
public static final String DIGEST_KEY = "digest";
public static final String CONTENT_TYPE_KEY = "type";
public static final String CONTENT_LENGTH_KEY = "length";
+ public static final String HTTP_RESPONSE_KEY = "http_response_code";
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|