|
From: <bi...@us...> - 2010-10-27 07:07:03
|
Revision: 3308
http://archive-access.svn.sourceforge.net/archive-access/?rev=3308&view=rev
Author: binzino
Date: 2010-10-27 07:06:57 +0000 (Wed, 27 Oct 2010)
Log Message:
-----------
Dates are stored as-is, but indexed in both YYYY and YYYYMM formats.
Modified Paths:
--------------
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2010-10-27 07:06:04 UTC (rev 3307)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2010-10-27 07:06:57 UTC (rev 3308)
@@ -27,7 +27,7 @@
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.*;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.hadoop.conf.Configured;
@@ -106,7 +106,7 @@
sourceReaders[i] = IndexReader.open( new NIOFSDirectory( new File( args[i+1] ) ), true );
}
- IndexWriter writer = new IndexWriter( new NIOFSDirectory( new File( destIndexDir ) ), null, IndexWriter.MaxFieldLength.UNLIMITED );
+ IndexWriter writer = new IndexWriter( new NIOFSDirectory( new File( destIndexDir ) ), new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED );
UrlCanonicalizer canonicalizer = getCanonicalizer( this.getConf( ) );
@@ -128,7 +128,9 @@
}
for ( String date : uniqueDates )
{
- newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.NOT_ANALYZED ) );
+ newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.NO ) );
+ newDoc.add( new Field( NutchWax.DATE_KEY, date.substring( 0, 4 ), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS ) );
+ newDoc.add( new Field( NutchWax.DATE_KEY, date.substring( 0, 6 ), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS ) );
}
// Obtain the new dates for the document.
@@ -156,7 +158,9 @@
{
for ( String date : newDates.split("\\s+") )
{
- newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.NOT_ANALYZED ) );
+ newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.NO ) );
+ newDoc.add( new Field( NutchWax.DATE_KEY, date.substring( 0, 4 ), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS ) );
+ newDoc.add( new Field( NutchWax.DATE_KEY, date.substring( 0, 6 ), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS ) );
}
}
@@ -207,6 +211,5 @@
System.exit( result );
}
-
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bi...@us...> - 2010-10-28 22:47:11
|
Revision: 3324
http://archive-access.svn.sourceforge.net/archive-access/?rev=3324&view=rev
Author: binzino
Date: 2010-10-28 22:47:05 +0000 (Thu, 28 Oct 2010)
Log Message:
-----------
Do not use compound index format.
Modified Paths:
--------------
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2010-10-28 22:46:40 UTC (rev 3323)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2010-10-28 22:47:05 UTC (rev 3324)
@@ -107,7 +107,8 @@
}
IndexWriter writer = new IndexWriter( new NIOFSDirectory( new File( destIndexDir ) ), new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED );
-
+ writer.setUseCompoundFile( false );
+
UrlCanonicalizer canonicalizer = getCanonicalizer( this.getConf( ) );
for ( int i = 0 ; i < reader.numDocs( ) ; i++ )
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bi...@us...> - 2011-08-01 21:49:22
|
Revision: 3491
http://archive-access.svn.sourceforge.net/archive-access/?rev=3491&view=rev
Author: binzino
Date: 2011-08-01 21:49:16 +0000 (Mon, 01 Aug 2011)
Log Message:
-----------
Changed maxDate to be an exclusive upper bound (a date is allowed only if it is strictly less than maxDate).
Modified Paths:
--------------
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2011-08-01 21:16:54 UTC (rev 3490)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2011-08-01 21:49:16 UTC (rev 3491)
@@ -81,12 +81,8 @@
recordsStream = new FileInputStream( recordsFile );
}
- System.out.println( "this.conf: " + this.getConf() );
-
String filterSpecs = this.getConf().get( "nutchwax.filter.dates.allow" );
- System.out.println( "filterSpecs: " + filterSpecs );
-
if ( filterSpecs != null )
{
String spec = filterSpecs.trim();
@@ -104,16 +100,24 @@
break;
case 2:
minDate = Long.parseLong( values[0] + "00000000000000".substring( values[0].length() ) );
- maxDate = Long.parseLong( values[1] + "99999999999999".substring( values[1].length() ) );
+ maxDate = Long.parseLong( values[1] + "00000000000000".substring( values[1].length() ) );
break;
default:
- LOG.warn( "Illegal format for nutchwax.filter.dates.allow: " + values );
+ LOG.error( "Illegal format for nutchwax.filter.dates.allow: " + values );
+ return 1;
}
}
catch ( NumberFormatException nfe )
{
- LOG.warn( "Illegal format for nutchwax.filter.dates.allow: " + values, nfe );
+ LOG.error( "Illegal format for nutchwax.filter.dates.allow: " + values, nfe );
+ return 1;
}
+
+ if ( minDate >= maxDate )
+ {
+ LOG.error( "Min date must be before max date for nutchwax.filter.dates.allow: " + minDate + ", " + maxDate );
+ return 1;
+ }
}
LOG.info( "Allowing dates in range: " + minDate + "-" + maxDate );
@@ -233,14 +237,14 @@
{
long d = Long.parseLong( date );
- if ( minDate <= d && d <= maxDate )
+ if ( minDate <= d && d < maxDate )
{
- LOG.info( "Include date: " + date );
+ LOG.debug( "Include date: " + date );
return true;
}
else
{
- LOG.info( "Exclude date: " + date );
+ LOG.debug( "Exclude date: " + date );
return false;
}
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|