From: <bi...@us...> - 2008-07-25 02:46:07
|
Revision: 2493 http://archive-access.svn.sourceforge.net/archive-access/?rev=2493&view=rev Author: binzino Date: 2008-07-25 02:46:16 +0000 (Fri, 25 Jul 2008) Log Message: ----------- Integrated into Hadoop framework via Tool interface and Configured superclass. This enables us to read Nutch(WAX) configuration properties, in particular the url canonicalizer implementation to use. Fix JIRA: WAX-6. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-07-24 23:35:54 UTC (rev 2492) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-07-25 02:46:16 UTC (rev 2493) @@ -38,8 +38,14 @@ import org.apache.lucene.document.Field; import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +import org.apache.nutch.util.NutchConfiguration; + import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; import org.archive.nutchwax.NutchWax; @@ -48,10 +54,9 @@ * Reads series of (digest+URL,date) lines, finds corresponding * document in index, and adds the date to it. */ -public class DateAdder +public class DateAdder extends Configured implements Tool { - public static void main(String[] args) - throws Exception + public int run( String[] args ) throws Exception { if ( args.length < 4 ) { @@ -111,7 +116,7 @@ IndexWriter writer = new IndexWriter( destIndexDir, new WhitespaceAnalyzer( ), true ); - UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer( ); + UrlCanonicalizer canonicalizer = getCanonicalizer( this.getConf( ) ); for ( int i = 0 ; i < reader.numDocs( ) ; i++ ) { @@ -155,6 +160,47 @@ reader.close( ); writer.close( ); + + return 0; } + + /** + * Utility function to instantiate a UrlCanonicalizer based on an + * implementation specified in the configuration. + */ + public static UrlCanonicalizer getCanonicalizer( Configuration conf ) + { + // Which Wayback canonicalizer to use: Aggressive, Identity, etc. + String canonicalizerClassName = conf.get( "nutchwax.urlfilter.wayback.canonicalizer" ); + + if ( canonicalizerClassName == null || canonicalizerClassName.trim().length() == 0 ) + { + throw new RuntimeException( "Missing value for property: nutchwax.urlfilter.wayback.canonicalizer" ); + } + + try + { + UrlCanonicalizer canonicalizer = (UrlCanonicalizer) Class.forName( canonicalizerClassName ).newInstance( ); + + return canonicalizer; + } + catch ( Exception e ) + { + // If we can't instantiate it, there's not much else we can do + // other than just throw the Exception. + throw new RuntimeException( e ); + } + } + + /** + * Command-line driver. Runs the Importer as a Hadoop job. + */ + public static void main( String args[] ) throws Exception + { + int result = ToolRunner.run( NutchConfiguration.create(), new DateAdder(), args ); + + System.exit( result ); + } + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |