From: <bi...@us...> - 2008-07-25 02:46:07
|
Revision: 2493 http://archive-access.svn.sourceforge.net/archive-access/?rev=2493&view=rev Author: binzino Date: 2008-07-25 02:46:16 +0000 (Fri, 25 Jul 2008) Log Message: ----------- Integrated into Hadoop framework via Tool interface and Configured superclass. This enables us to read Nutch(WAX) configuration properties, in particular the url canonicalizer implementation to use. Fix JIRA: WAX-6. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-07-24 23:35:54 UTC (rev 2492) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-07-25 02:46:16 UTC (rev 2493) @@ -38,8 +38,14 @@ import org.apache.lucene.document.Field; import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +import org.apache.nutch.util.NutchConfiguration; + import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; import org.archive.nutchwax.NutchWax; @@ -48,10 +54,9 @@ * Reads series of (digest+URL,date) lines, finds corresponding * document in index, and adds the date to it. 
*/ -public class DateAdder +public class DateAdder extends Configured implements Tool { - public static void main(String[] args) - throws Exception + public int run( String[] args ) throws Exception { if ( args.length < 4 ) { @@ -111,7 +116,7 @@ IndexWriter writer = new IndexWriter( destIndexDir, new WhitespaceAnalyzer( ), true ); - UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer( ); + UrlCanonicalizer canonicalizer = getCanonicalizer( this.getConf( ) ); for ( int i = 0 ; i < reader.numDocs( ) ; i++ ) { @@ -155,6 +160,47 @@ reader.close( ); writer.close( ); + + return 0; } + + /** + * Utility function to instantiate a UrlCanonicalizer based on an + * implementation specified in the configuration. + */ + public static UrlCanonicalizer getCanonicalizer( Configuration conf ) + { + // Which Wayback canonicalizer to use: Aggressive, Identity, etc. + String canonicalizerClassName = conf.get( "nutchwax.urlfilter.wayback.canonicalizer" ); + + if ( canonicalizerClassName == null || canonicalizerClassName.trim().length() == 0 ) + { + throw new RuntimeException( "Missing value for property: nutchwax.urlfilter.wayback.canonicalizer" ); + } + + try + { + UrlCanonicalizer canonicalizer = (UrlCanonicalizer) Class.forName( canonicalizerClassName ).newInstance( ); + + return canonicalizer; + } + catch ( Exception e ) + { + // If we can't instantiate it, there's not much else we can do + // other than just throw the Exception. + throw new RuntimeException( e ); + } + } + + /** + * Command-line driver. Runs the DateAdder through Hadoop's ToolRunner. + */ + public static void main( String args[] ) throws Exception + { + int result = ToolRunner.run( NutchConfiguration.create(), new DateAdder(), args ); + + System.exit( result ); + } + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-09-25 21:13:31
|
Revision: 2595 http://archive-access.svn.sourceforge.net/archive-access/?rev=2595&view=rev Author: binzino Date: 2008-09-25 21:13:25 +0000 (Thu, 25 Sep 2008) Log Message: ----------- Added try/catch around use of UrlCanonicalizer so that we ignore URIs that are malformed. A warning is emitted to stderr. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-09-22 19:55:50 UTC (rev 2594) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-09-25 21:13:25 UTC (rev 2595) @@ -137,14 +137,27 @@ newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); } - // First, apply URL canonicalization from Wayback - String canonicalizedUrl = canonicalizer.urlStringToKey( oldDoc.get( NutchWax.URL_KEY ) ); + // Obtain the new dates for the document. + String newDates = null; + try + { + // First, apply URL canonicalization from Wayback + String canonicalizedUrl = canonicalizer.urlStringToKey( oldDoc.get( NutchWax.URL_KEY ) ); - // Now, get the digest+ URL of the document, look for it in - // the updateRecords and if found, add the date. - String key = canonicalizedUrl + oldDoc.get( NutchWax.DIGEST_KEY ); + // Now, get the digest+URL of the document, look for it in + // the updateRecords and if found, add the date. + String key = canonicalizedUrl + oldDoc.get( NutchWax.DIGEST_KEY ); + + newDates = dateRecords.get( key ); + } + catch ( Exception e ) + { + // The canonicalizer can throw various types of exceptions + // due to malformed URIs. 
+ System.err.println( "WARN: Not adding dates on malformed URI: " + oldDoc.get( NutchWax.URL_KEY ) ); + } - String newDates = dateRecords.get( key ); + // If there are any new dates, add them to the new document. if ( newDates != null ) { for ( String date : newDates.split("\\s+") ) @@ -153,6 +166,7 @@ } } + // Finally, add the new document to the new index. writer.addDocument( newDoc ); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |