Revision: 3210 http://archive-access.svn.sourceforge.net/archive-access/?rev=3210&view=rev Author: binzino Date: 2010-08-09 16:04:53 +0000 (Mon, 09 Aug 2010) Log Message: ----------- Initial revision. Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/DateIndexer.java Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/DateIndexer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/DateIndexer.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/DateIndexer.java 2010-08-09 16:04:53 UTC (rev 3210) @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2008 Internet Archive. + * + * This file is part of the archive-access tools project + * (http://sourceforge.net/projects/archive-access). + * + * The archive-access tools are free software; you can redistribute them and/or + * modify them under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or any + * later version. + * + * The archive-access tools are distributed in the hope that they will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + * Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License along with + * the archive-access tools; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.nutchwax.index; + +import java.util.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.lucene.LuceneWriter; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; + +/** + * + */ +public class DateIndexer implements IndexingFilter +{ + public static final Log LOG = LogFactory.getLog( DateIndexer.class ); + + private Configuration conf; + + public void setConf( Configuration conf ) + { + this.conf = conf; + } + + public Configuration getConf() + { + return this.conf; + } + + /** + * <p>Set Lucene document field to fixed value.</p> + * <p> + * Remove field if specified value is <code>null</code>. + * </p> + */ + public NutchDocument filter( NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks ) + throws IndexingException + { + Metadata meta = parse.getData().getContentMeta(); + + Set<String> dates = new HashSet<String>( Arrays.asList( meta.getValues( "date" ) ) ); + + for ( String date : dates ) + { + doc.add( "date", date ); + doc.add( "year", date.substring( 0, 4 ) ); + doc.add( "yearmonth", date.substring( 0, 6 ) ); + } + + return doc; + } + + public void addIndexBackendOptions( Configuration conf ) + { + LuceneWriter.addFieldOptions( "date", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, conf ); + LuceneWriter.addFieldOptions( "year", LuceneWriter.STORE.NO, LuceneWriter.INDEX.UNTOKENIZED, conf ); + LuceneWriter.addFieldOptions( "yearmonth", LuceneWriter.STORE.NO, LuceneWriter.INDEX.UNTOKENIZED, conf ); + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |