Revision: 3211 http://archive-access.svn.sourceforge.net/archive-access/?rev=3211&view=rev Author: binzino Date: 2010-08-09 23:51:25 +0000 (Mon, 09 Aug 2010) Log Message: ----------- Added some comments. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/DateIndexer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/DateIndexer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/DateIndexer.java 2010-08-09 16:04:53 UTC (rev 3210) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/DateIndexer.java 2010-08-09 23:51:25 UTC (rev 3211) @@ -36,7 +36,16 @@ import org.apache.nutch.parse.Parse; /** + * Add fields to NutchDocument related to dates. * + * All of the crawl dates are taken from the Parse object, reduced to + * a set of unique dates (in case of redundancies), then the dates + * are added to the NutchDocument in multiple fields: + * <ul> + * <li><code>date</code> stored for later retrieval.</li> + * <li><code>year</code> only the year (YYYY), indexed not stored.</li> + * <li><code>yearmonth</code> year and month (YYYYMM), indexed not stored.</li> + * </ul> */ public class DateIndexer implements IndexingFilter { @@ -55,16 +64,14 @@ } /** - * <p>Set Lucene document field to fixed value.</p> - * <p> - * Remove field if specified value is <code>null</code>. - * </p> + * */ public NutchDocument filter( NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks ) throws IndexingException { Metadata meta = parse.getData().getContentMeta(); + // Use a Set to reduce list of dates to just unique values. 
Set<String> dates = new HashSet<String>( Arrays.asList( meta.getValues( "date" ) ) ); for ( String date : dates ) This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |