| 
     
      
      
      From: <bi...@us...> - 2010-10-27 07:07:03
       
   | 
Revision: 3308
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3308&view=rev
Author:   binzino
Date:     2010-10-27 07:06:57 +0000 (Wed, 27 Oct 2010)
Log Message:
-----------
Dates are stored as-is, but indexed in YYYY and YYYYMM format.
Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	2010-10-27 07:06:04 UTC (rev 3307)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	2010-10-27 07:06:57 UTC (rev 3308)
@@ -27,7 +27,7 @@
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.*;
 import org.apache.lucene.store.NIOFSDirectory;
 
 import org.apache.hadoop.conf.Configured;
@@ -106,7 +106,7 @@
         sourceReaders[i] = IndexReader.open( new NIOFSDirectory( new File( args[i+1] ) ), true );
       }
 
-    IndexWriter writer = new IndexWriter( new NIOFSDirectory( new File( destIndexDir ) ), null, IndexWriter.MaxFieldLength.UNLIMITED );
+    IndexWriter writer = new IndexWriter( new NIOFSDirectory( new File( destIndexDir ) ), new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED );
     
     UrlCanonicalizer canonicalizer = getCanonicalizer( this.getConf( ) );
 
@@ -128,7 +128,9 @@
           }
         for ( String date : uniqueDates )
           {
-            newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.NOT_ANALYZED ) );
+            newDoc.add( new Field( NutchWax.DATE_KEY, date,                   Field.Store.YES, Field.Index.NO ) );
+            newDoc.add( new Field( NutchWax.DATE_KEY, date.substring( 0, 4 ), Field.Store.NO,  Field.Index.NOT_ANALYZED_NO_NORMS ) );
+            newDoc.add( new Field( NutchWax.DATE_KEY, date.substring( 0, 6 ), Field.Store.NO,  Field.Index.NOT_ANALYZED_NO_NORMS ) );
           }
 
         // Obtain the new dates for the document.
@@ -156,7 +158,9 @@
           {
             for ( String date : newDates.split("\\s+") )
               {
-                newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.NOT_ANALYZED ) );
+                newDoc.add( new Field( NutchWax.DATE_KEY, date,                   Field.Store.YES, Field.Index.NO ) );
+                newDoc.add( new Field( NutchWax.DATE_KEY, date.substring( 0, 4 ), Field.Store.NO,  Field.Index.NOT_ANALYZED_NO_NORMS ) );
+                newDoc.add( new Field( NutchWax.DATE_KEY, date.substring( 0, 6 ), Field.Store.NO,  Field.Index.NOT_ANALYZED_NO_NORMS ) );
               }
           }
 
@@ -207,6 +211,5 @@
 
     System.exit( result );
   }
-
   
 }
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
 | 
| 
     
      
      
      From: <bi...@us...> - 2010-10-28 22:47:11
       
   | 
Revision: 3324
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3324&view=rev
Author:   binzino
Date:     2010-10-28 22:47:05 +0000 (Thu, 28 Oct 2010)
Log Message:
-----------
Do not use compound index format.
Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	2010-10-28 22:46:40 UTC (rev 3323)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	2010-10-28 22:47:05 UTC (rev 3324)
@@ -107,7 +107,8 @@
       }
 
     IndexWriter writer = new IndexWriter( new NIOFSDirectory( new File( destIndexDir ) ), new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED );
-    
+    writer.setUseCompoundFile( false );
+
     UrlCanonicalizer canonicalizer = getCanonicalizer( this.getConf( ) );
 
     for ( int i = 0 ; i < reader.numDocs( ) ; i++ )
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
 | 
| 
     
      
      
      From: <bi...@us...> - 2011-08-01 21:49:22
       
   | 
Revision: 3491
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3491&view=rev
Author:   binzino
Date:     2011-08-01 21:49:16 +0000 (Mon, 01 Aug 2011)
Log Message:
-----------
Changed maxDate to be exclusive.
Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	2011-08-01 21:16:54 UTC (rev 3490)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	2011-08-01 21:49:16 UTC (rev 3491)
@@ -81,12 +81,8 @@
         recordsStream = new FileInputStream( recordsFile );
       }
 
-    System.out.println( "this.conf: " + this.getConf() );
-    
     String filterSpecs = this.getConf().get( "nutchwax.filter.dates.allow" );
 
-    System.out.println( "filterSpecs: " + filterSpecs );
-
     if ( filterSpecs != null ) 
       {
         String spec = filterSpecs.trim();
@@ -104,16 +100,24 @@
                 break;
               case 2:
                 minDate = Long.parseLong( values[0] + "00000000000000".substring( values[0].length() ) );
-                maxDate = Long.parseLong( values[1] + "99999999999999".substring( values[1].length() ) );
+                maxDate = Long.parseLong( values[1] + "00000000000000".substring( values[1].length() ) );
                 break;
               default:
-                LOG.warn( "Illegal format for nutchwax.filter.dates.allow: " + values );
+                LOG.error( "Illegal format for nutchwax.filter.dates.allow: " + values );
+                return 1;
               }
           }
         catch ( NumberFormatException nfe )
           {
-            LOG.warn( "Illegal format for nutchwax.filter.dates.allow: " + values, nfe );
+            LOG.error( "Illegal format for nutchwax.filter.dates.allow: " + values, nfe );
+            return 1;
           }
+
+        if ( minDate >= maxDate )
+          {
+            LOG.error( "Min date must be before max date for nutchwax.filter.dates.allow: " + minDate + ", " + maxDate );
+            return 1;
+          }
       }
 
     LOG.info( "Allowing dates in range: " + minDate + "-" + maxDate );
@@ -233,14 +237,14 @@
       {
         long d = Long.parseLong( date );
         
-        if ( minDate <= d && d <= maxDate )
+        if ( minDate <= d && d < maxDate )
           {
-            LOG.info( "Include date: " + date );
+            LOG.debug( "Include date: " + date );
             return true;
           }
         else
           {
-            LOG.info( "Exclude date: " + date );
+            LOG.debug( "Exclude date: " + date );
             return false;
           }
       }
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
 |