From: <bi...@us...> - 2010-10-27 07:07:03
|
Revision: 3308
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3308&view=rev
Author:   binzino
Date:     2010-10-27 07:06:57 +0000 (Wed, 27 Oct 2010)

Log Message:
-----------
Dates are stored as-is, but indexed in YYYY and YYYYMM format.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2010-10-27 07:06:04 UTC (rev 3307)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2010-10-27 07:06:57 UTC (rev 3308)
@@ -27,7 +27,7 @@
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.*;
 import org.apache.lucene.store.NIOFSDirectory;
 import org.apache.hadoop.conf.Configured;
@@ -106,7 +106,7 @@
 sourceReaders[i] = IndexReader.open( new NIOFSDirectory( new File( args[i+1] ) ), true );
 }
- IndexWriter writer = new IndexWriter( new NIOFSDirectory( new File( destIndexDir ) ), null, IndexWriter.MaxFieldLength.UNLIMITED );
+ IndexWriter writer = new IndexWriter( new NIOFSDirectory( new File( destIndexDir ) ), new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED );
 UrlCanonicalizer canonicalizer = getCanonicalizer( this.getConf( ) );
@@ -128,7 +128,9 @@
 }
 for ( String date : uniqueDates )
 {
- newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.NOT_ANALYZED ) );
+ newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.NO ) );
+ newDoc.add( new Field( NutchWax.DATE_KEY, date.substring( 0, 4 ), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS ) );
+ newDoc.add( new Field( NutchWax.DATE_KEY, date.substring( 0, 6 ), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS ) );
 }
 // Obtain the new dates for the document.
@@ -156,7 +158,9 @@
 {
 for ( String date : newDates.split("\\s+") )
 {
- newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.NOT_ANALYZED ) );
+ newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.NO ) );
+ newDoc.add( new Field( NutchWax.DATE_KEY, date.substring( 0, 4 ), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS ) );
+ newDoc.add( new Field( NutchWax.DATE_KEY, date.substring( 0, 6 ), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS ) );
 }
 }
@@ -207,6 +211,5 @@
 System.exit( result );
 }
-
 }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|