|
From: <bi...@us...> - 2010-08-09 23:52:38
|
Revision: 3213
http://archive-access.svn.sourceforge.net/archive-access/?rev=3213&view=rev
Author: binzino
Date: 2010-08-09 23:52:32 +0000 (Mon, 09 Aug 2010)
Log Message:
-----------
Added date merging hacks.
Modified Paths:
--------------
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-08-09 23:52:07 UTC (rev 3212)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-08-09 23:52:32 UTC (rev 3213)
@@ -73,7 +73,15 @@
final Writable value = values.next().get(); // unwrap
if (value instanceof ParseData) {
- parseData = (ParseData)value;
+ if ( parseData != null )
+ {
+ // HACK: Merge dates
+ parseData = mergeDates( key, parseData, (ParseData) value );
+ }
+ else
+ {
+ parseData = (ParseData)value;
+ }
} else if (value instanceof ParseText) {
parseText = (ParseText)value;
} else if (LOG.isWarnEnabled()) {
@@ -118,6 +126,25 @@
output.collect(key, doc);
}
+ private ParseData mergeDates( Text key, ParseData d, ParseData s ) // Copies every "date" content-metadata value of s into d; key is used only in the log message.
+ {
+ Metadata dest = d.getContentMeta(); // metadata that receives the merged dates
+ Metadata src = s.getContentMeta(); // metadata the dates are read from
+ // If either side has no content metadata there is nothing to merge: hand back the other ParseData untouched.
+ if ( dest == null ) return s;
+ if ( src == null ) return d;
+ // NOTE(review): only the "date" key is carried over; any other content metadata on s is dropped by the caller's merge.
+ // HACK: All we care about right now are the dates.
+ String[] sourceDates = src.getValues( "date" ); // NOTE(review): assumes getValues() returns an empty array rather than null when the key is absent - confirm against Metadata
+ for ( String date : sourceDates )
+ {
+ LOG.warn( "Merging: " + key + " : " + date ); // emitted at WARN level once per merged date value
+ dest.add( "date", date ); // no duplicate check here; presumably add() appends even if the value is already present - confirm
+ }
+ // NOTE(review): relies on getContentMeta() exposing d's own Metadata instance (not a copy) so the additions above are visible through d - confirm.
+ return d;
+ }
+
public void close() throws IOException { }
public static void initMRJob(Collection<Path> segments,
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|