From: <bi...@us...> - 2011-06-13 18:20:28
|
Revision: 3463
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3463&view=rev
Author:   binzino
Date:     2011-06-13 18:20:21 +0000 (Mon, 13 Jun 2011)

Log Message:
-----------
Added custom reducer to allow for multiple values for the same key.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2011-06-10 02:24:23 UTC (rev 3462)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2011-06-13 18:20:21 UTC (rev 3463)
@@ -36,6 +36,7 @@
 import org.apache.hadoop.mapred.JobStatus;
 import org.apache.hadoop.mapred.Mapper;
 import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.RunningJob;
 import org.apache.hadoop.mapred.TextInputFormat;
@@ -94,7 +95,7 @@
  * to the importing of ARC files. I've noted those details with
  * comments prefaced with "?:".
  */
-public class Importer extends Configured implements Tool, Mapper<WritableComparable, Writable, Text, NutchWritable>
+public class Importer extends Configured implements Tool, Mapper<WritableComparable, Writable, Text, NutchWritable>, Reducer<WritableComparable,Writable,WritableComparable,Writable>
 {
 
   public static final Log LOG = LogFactory.getLog( Importer.class );
@@ -154,6 +155,23 @@
   }
 
 
+  public void reduce( WritableComparable key,
+                      Iterator<Writable> values,
+                      OutputCollector<WritableComparable, Writable> output,
+                      Reporter reporter
+                      )
+    throws IOException
+  {
+
+    while ( values.hasNext( ) )
+      {
+        if ( LOG.isInfoEnabled() ) LOG.info( "Reduce: key = " + key.toString() );
+
+        output.collect( key, values.next( ) );
+      }
+  }
+
+
   /**
    * <p>Runs the Map job to import records from an archive file into a
    * Nutch segment.</p>
@@ -588,7 +606,11 @@
               if ( LOG.isWarnEnabled() ) LOG.warn( "Couldn't pass score, url = " + key, e );
             }
 
-          output.collect( key, new NutchWritable( new ParseImpl( new ParseText( parse.getText() ), parse.getData(), parse.isCanonical() ) ) );
+          String parsedText = parse.getText();
+
+          // TODO: Limit size of parsedText.
+
+          output.collect( key, new NutchWritable( new ParseImpl( new ParseText( parsedText ), parse.getData(), parse.isCanonical() ) ) );
         }
       }
   }
@@ -719,7 +741,8 @@
     FileInputFormat.addInputPath( job, manifestPath );
     job.setInputFormat( TextInputFormat.class );
 
-    job.setMapperClass( Importer.class );
+    job.setMapperClass ( Importer.class );
+    job.setReducerClass( Importer.class );
 
     //job.setOutputPath ( segmentPath );
     FileOutputFormat.setOutputPath( job, segmentPath );

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|