|
From: <bi...@us...> - 2009-07-24 18:43:35
|
Revision: 2792
http://archive-access.svn.sourceforge.net/archive-access/?rev=2792&view=rev
Author: binzino
Date: 2009-07-24 18:43:16 +0000 (Fri, 24 Jul 2009)
Log Message:
-----------
WAX-56. Dates from all sources, including the text file, are put into a Set before being added to the output directory.
Modified Paths:
--------------
tags/nutchwax-0_12_7/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
Modified: tags/nutchwax-0_12_7/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
===================================================================
--- tags/nutchwax-0_12_7/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2009-07-22 21:36:23 UTC (rev 2791)
+++ tags/nutchwax-0_12_7/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2009-07-24 18:43:16 UTC (rev 2792)
@@ -76,7 +76,7 @@
recordsStream = new FileInputStream( recordsFile );
}
- // Read date-addition records from stdin.
+ // Read date-addition records.
Map<String,String> dateRecords = new HashMap<String,String>( );
BufferedReader br = new BufferedReader( new InputStreamReader( recordsStream, "UTF-8" ) );
String line;
@@ -89,7 +89,7 @@
continue;
}
- // Key is hash+url, value is String which is a " "-separated list of dates
+ // Key is url+hash, value is String which is a " "-separated list of dates
String key = fields[0] + fields[1];
String dates = dateRecords.get( key );
if ( dates != null )
@@ -113,6 +113,7 @@
}
IndexWriter writer = new IndexWriter( destIndexDir, new WhitespaceAnalyzer( ), true );
+ writer.setUseCompoundFile(false);
UrlCanonicalizer canonicalizer = getCanonicalizer( this.getConf( ) );
@@ -132,23 +133,20 @@
Collections.addAll( uniqueDates, dates );
}
- for ( String date : uniqueDates )
- {
- newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED ) );
- }
// Obtain the new dates for the document.
- String newDates = null;
try
{
// First, apply URL canonicalization from Wayback
String canonicalizedUrl = canonicalizer.urlStringToKey( oldDoc.get( NutchWax.URL_KEY ) );
- // Now, get the digest+URL of the document, look for it in
- // the updateRecords and if found, add the date.
+ // As above, the key is url+hash, value will be a String which is a " "-separated list of dates
String key = canonicalizedUrl + oldDoc.get( NutchWax.DIGEST_KEY );
- newDates = dateRecords.get( key );
+ String newDates = dateRecords.get( key );
+
+ // If there are any new dates, add them to the set.
+ if ( newDates != null ) Collections.addAll( uniqueDates, newDates.split( "\\s+" ) );
}
catch ( Exception e )
{
@@ -157,13 +155,10 @@
System.err.println( "WARN: Not adding dates on malformed URI: " + oldDoc.get( NutchWax.URL_KEY ) );
}
- // If there are any new dates, add them to the new document.
- if ( newDates != null )
+ // Add the updated set of uniqueDates, which includes the new (unique) ones.
+ for ( String date : uniqueDates )
{
- for ( String date : newDates.split("\\s+") )
- {
- newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED ) );
- }
+ newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.NO_NORMS ) );
}
// Finally, add the new document to the new index.
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|