Revision: 3210
http://archive-access.svn.sourceforge.net/archive-access/?rev=3210&view=rev
Author: binzino
Date: 2010-08-09 16:04:53 +0000 (Mon, 09 Aug 2010)
Log Message:
-----------
Initial revision.
Added Paths:
-----------
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/DateIndexer.java
Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/DateIndexer.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/DateIndexer.java (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/DateIndexer.java 2010-08-09 16:04:53 UTC (rev 3210)
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of the archive-access tools project
+ * (http://sourceforge.net/projects/archive-access).
+ *
+ * The archive-access tools are free software; you can redistribute them and/or
+ * modify them under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or any
+ * later version.
+ *
+ * The archive-access tools are distributed in the hope that they will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
+ * Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License along with
+ * the archive-access tools; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.nutchwax.index;
+
+import java.util.*;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.lucene.LuceneWriter;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+
+/**
+ * Indexing filter that copies the "date" values found in a document's
+ * content metadata into the index document, and derives the coarser
+ * "year" and "yearmonth" fields from the leading characters of each value.
+ */
+public class DateIndexer implements IndexingFilter
+{
+  public static final Log LOG = LogFactory.getLog( DateIndexer.class );
+
+  /** Number of leading characters of a date value used as the "year" field. */
+  private static final int YEAR_LENGTH = 4;
+
+  /** Number of leading characters of a date value used as the "yearmonth" field. */
+  private static final int YEARMONTH_LENGTH = 6;
+
+  private Configuration conf;
+
+  /**
+   * Store the configuration for later retrieval via {@link #getConf()}.
+   *
+   * @param conf configuration to remember
+   */
+  public void setConf( Configuration conf )
+  {
+    this.conf = conf;
+  }
+
+  /**
+   * @return the configuration last passed to
+   *         {@link #setConf(Configuration)}, or <code>null</code> if none
+   *         has been set
+   */
+  public Configuration getConf()
+  {
+    return this.conf;
+  }
+
+  /**
+   * <p>Add date-related fields to the document.</p>
+   * <p>
+   * Each distinct, non-null "date" value in the parse's content metadata is
+   * added as a <code>date</code> field.  Its first 4 characters are added as
+   * <code>year</code> and its first 6 characters as <code>yearmonth</code>;
+   * presumably the values are timestamps beginning with yyyyMM, but that is
+   * not verified here.
+   * </p>
+   * <p>
+   * A value too short to supply a prefix is logged and the affected derived
+   * field(s) are skipped, rather than failing with a
+   * <code>StringIndexOutOfBoundsException</code>.
+   * </p>
+   *
+   * @param doc     document being built; fields are added to it in place
+   * @param parse   parse result whose content metadata supplies the dates
+   * @param url     URL of the document; used only in the log message
+   * @param datum   not used by this filter
+   * @param inlinks not used by this filter
+   * @return the same <code>doc</code> instance that was passed in
+   */
+  public NutchDocument filter( NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks )
+    throws IndexingException
+  {
+    Metadata meta = parse.getData().getContentMeta();
+
+    String[] values = meta.getValues( "date" );
+
+    // Defensive: nothing to add if the metadata hands back no array at all.
+    if ( values == null )
+      {
+        return doc;
+      }
+
+    // De-duplicate the values while keeping the order of the array, so the
+    // order in which fields are added is the same from run to run.
+    Set<String> dates = new LinkedHashSet<String>( Arrays.asList( values ) );
+
+    for ( String date : dates )
+      {
+        if ( date == null )
+          {
+            continue;
+          }
+
+        doc.add( "date", date );
+
+        if ( date.length() >= YEAR_LENGTH )
+          {
+            doc.add( "year", date.substring( 0, YEAR_LENGTH ) );
+          }
+
+        if ( date.length() >= YEARMONTH_LENGTH )
+          {
+            doc.add( "yearmonth", date.substring( 0, YEARMONTH_LENGTH ) );
+          }
+        else
+          {
+            // The raw value is still kept in "date"; only the derived
+            // field(s) that cannot be computed are omitted.
+            LOG.warn( "Date value too short to derive year/yearmonth: '" + date + "' url=" + url );
+          }
+      }
+
+    return doc;
+  }
+
+  /**
+   * Register the Lucene field options for the fields this filter adds:
+   * "date" is stored but not indexed, while "year" and "yearmonth" are
+   * indexed untokenized but not stored.
+   *
+   * @param conf configuration the field options are written into
+   */
+  public void addIndexBackendOptions( Configuration conf )
+  {
+    LuceneWriter.addFieldOptions( "date", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, conf );
+    LuceneWriter.addFieldOptions( "year", LuceneWriter.STORE.NO, LuceneWriter.INDEX.UNTOKENIZED, conf );
+    LuceneWriter.addFieldOptions( "yearmonth", LuceneWriter.STORE.NO, LuceneWriter.INDEX.UNTOKENIZED, conf );
+  }
+
+}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|