From: <bi...@us...> - 2010-08-09 23:52:38
|
Revision: 3213
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3213&view=rev
Author:   binzino
Date:     2010-08-09 23:52:32 +0000 (Mon, 09 Aug 2010)

Log Message:
-----------
Added date merging hacks.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java	2010-08-09 23:52:07 UTC (rev 3212)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java	2010-08-09 23:52:32 UTC (rev 3213)
@@ -73,7 +73,15 @@
       final Writable value = values.next().get(); // unwrap
 
       if (value instanceof ParseData) {
-        parseData = (ParseData)value;
+        if ( parseData != null )
+          {
+            // HACK: Merge dates
+            parseData = mergeDates( key, parseData, (ParseData) value );
+          }
+        else
+          {
+            parseData = (ParseData)value;
+          }
       } else if (value instanceof ParseText) {
         parseText = (ParseText)value;
       } else if (LOG.isWarnEnabled()) {
@@ -118,6 +126,25 @@
     output.collect(key, doc);
   }
 
+  private ParseData mergeDates( Text key, ParseData d, ParseData s )
+  {
+    Metadata dest = d.getContentMeta();
+    Metadata src  = s.getContentMeta();
+
+    if ( dest == null ) return s;
+    if ( src  == null ) return d;
+
+    // HACK: All we care about right now are the dates.
+    String[] sourceDates = src.getValues( "date" );
+    for ( String date : sourceDates )
+      {
+        LOG.warn( "Merging: " + key + " : " + date );
+        dest.add( "date", date );
+      }
+
+    return d;
+  }
+
   public void close() throws IOException { }
 
   public static void initMRJob(Collection<Path> segments,

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|