From: <bi...@us...> - 2010-07-12 02:26:43
|
Revision: 3169
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3169&view=rev
Author:   binzino
Date:     2010-07-12 02:26:34 +0000 (Mon, 12 Jul 2010)

Log Message:
-----------
Remove setting of segment and digest fields.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-07-11 00:09:27 UTC (rev 3168)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-07-12 02:26:34 UTC (rev 3169)
@@ -96,10 +96,10 @@
     }
 
     // add segment, used to map from merged index back to segment files
-    doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));
+    //doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));
 
     // add digest, used by dedup
-    doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));
+    //doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));
 
     final Parse parse = new ParseImpl(parseText, parseData);
     try {

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-08-09 23:52:38
|
Revision: 3213
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3213&view=rev
Author:   binzino
Date:     2010-08-09 23:52:32 +0000 (Mon, 09 Aug 2010)

Log Message:
-----------
Added date merging hacks.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-08-09 23:52:07 UTC (rev 3212)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-08-09 23:52:32 UTC (rev 3213)
@@ -73,7 +73,15 @@
       final Writable value = values.next().get(); // unwrap
 
       if (value instanceof ParseData) {
-        parseData = (ParseData)value;
+        if ( parseData != null )
+          {
+            // HACK: Merge dates
+            parseData = mergeDates( key, parseData, (ParseData) value );
+          }
+        else
+          {
+            parseData = (ParseData)value;
+          }
       } else if (value instanceof ParseText) {
         parseText = (ParseText)value;
       } else if (LOG.isWarnEnabled()) {
@@ -118,6 +126,25 @@
     output.collect(key, doc);
   }
 
+  private ParseData mergeDates( Text key, ParseData d, ParseData s )
+  {
+    Metadata dest = d.getContentMeta();
+    Metadata src = s.getContentMeta();
+
+    if ( dest == null ) return s;
+    if ( src == null ) return d;
+
+    // HACK: All we care about right now are the dates.
+    String[] sourceDates = src.getValues( "date" );
+    for ( String date : sourceDates )
+      {
+        LOG.warn( "Merging: " + key + " : " + date );
+        dest.add( "date", date );
+      }
+
+    return d;
+  }
+
   public void close() throws IOException { }
 
   public static void initMRJob(Collection<Path> segments,

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-10-27 07:08:15
|
Revision: 3311
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3311&view=rev
Author:   binzino
Date:     2010-10-27 07:08:09 +0000 (Wed, 27 Oct 2010)

Log Message:
-----------
Removed log message about merging dates.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-10-27 07:07:51 UTC (rev 3310)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-10-27 07:08:09 UTC (rev 3311)
@@ -138,7 +138,6 @@
     String[] sourceDates = src.getValues( "date" );
     for ( String date : sourceDates )
       {
-        LOG.warn( "Merging: " + key + " : " + date );
         dest.add( "date", date );
       }
 

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-10-28 00:54:36
|
Revision: 3319
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3319&view=rev
Author:   binzino
Date:     2010-10-28 00:54:30 +0000 (Thu, 28 Oct 2010)

Log Message:
-----------
Changed log message to debug.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-10-28 00:54:00 UTC (rev 3318)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2010-10-28 00:54:30 UTC (rev 3319)
@@ -103,6 +103,8 @@
       return ;
     }
 
+    if ( LOG.isDebugEnabled( ) ) LOG.debug( "Indexing: " + metadata.get("type") + " " + key );
+
     // add segment, used to map from merged index back to segment files
     //doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));
 

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |