|
From: <bi...@us...> - 2010-07-12 02:26:43
|
Revision: 3169
http://archive-access.svn.sourceforge.net/archive-access/?rev=3169&view=rev
Author: binzino
Date: 2010-07-12 02:26:34 +0000 (Mon, 12 Jul 2010)
Log Message:
-----------
Remove setting of segment and digest fields.
Modified Paths:
--------------
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-07-11 00:09:27 UTC (rev 3168)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-07-12 02:26:34 UTC (rev 3169)
@@ -96,10 +96,10 @@
}
// add segment, used to map from merged index back to segment files
- doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));
+ //doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));
// add digest, used by dedup
- doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));
+ //doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));
final Parse parse = new ParseImpl(parseText, parseData);
try {
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bi...@us...> - 2010-08-09 23:52:38
|
Revision: 3213
http://archive-access.svn.sourceforge.net/archive-access/?rev=3213&view=rev
Author: binzino
Date: 2010-08-09 23:52:32 +0000 (Mon, 09 Aug 2010)
Log Message:
-----------
Added date merging hacks.
Modified Paths:
--------------
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-08-09 23:52:07 UTC (rev 3212)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-08-09 23:52:32 UTC (rev 3213)
@@ -73,7 +73,15 @@
final Writable value = values.next().get(); // unwrap
if (value instanceof ParseData) {
- parseData = (ParseData)value;
+ if ( parseData != null )
+ {
+ // HACK: Merge dates
+ parseData = mergeDates( key, parseData, (ParseData) value );
+ }
+ else
+ {
+ parseData = (ParseData)value;
+ }
} else if (value instanceof ParseText) {
parseText = (ParseText)value;
} else if (LOG.isWarnEnabled()) {
@@ -118,6 +126,25 @@
output.collect(key, doc);
}
+ private ParseData mergeDates( Text key, ParseData d, ParseData s )
+ {
+ Metadata dest = d.getContentMeta();
+ Metadata src = s.getContentMeta();
+
+ if ( dest == null ) return s;
+ if ( src == null ) return d;
+
+ // HACK: All we care about right now are the dates.
+ String[] sourceDates = src.getValues( "date" );
+ for ( String date : sourceDates )
+ {
+ LOG.warn( "Merging: " + key + " : " + date );
+ dest.add( "date", date );
+ }
+
+ return d;
+ }
+
public void close() throws IOException { }
public static void initMRJob(Collection<Path> segments,
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bi...@us...> - 2010-10-27 07:08:15
|
Revision: 3311
http://archive-access.svn.sourceforge.net/archive-access/?rev=3311&view=rev
Author: binzino
Date: 2010-10-27 07:08:09 +0000 (Wed, 27 Oct 2010)
Log Message:
-----------
Removed log message about merging dates.
Modified Paths:
--------------
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-10-27 07:07:51 UTC (rev 3310)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-10-27 07:08:09 UTC (rev 3311)
@@ -138,7 +138,6 @@
String[] sourceDates = src.getValues( "date" );
for ( String date : sourceDates )
{
- LOG.warn( "Merging: " + key + " : " + date );
dest.add( "date", date );
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bi...@us...> - 2010-10-28 00:54:36
|
Revision: 3319
http://archive-access.svn.sourceforge.net/archive-access/?rev=3319&view=rev
Author: binzino
Date: 2010-10-28 00:54:30 +0000 (Thu, 28 Oct 2010)
Log Message:
-----------
Changed log message to debug.
Modified Paths:
--------------
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-10-28 00:54:00 UTC (rev 3318)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-10-28 00:54:30 UTC (rev 3319)
@@ -103,6 +103,8 @@
return ;
}
+ if ( LOG.isDebugEnabled( ) ) LOG.debug( "Indexing: " + metadata.get("type") + " " + key );
+
// add segment, used to map from merged index back to segment files
//doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|