[Archive-access-cvs] SF.net SVN: archive-access: [2333] trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 2333
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2333&view=rev
Author:   binzino
Date:     2008-06-26 15:36:45 -0700 (Thu, 26 Jun 2008)

Log Message:
-----------
Initial revision.

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java

Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
===================================================================

--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	2008-06-26 22:36:45 UTC (rev 2333)
@@ -0,0 +1,169 @@
+/*
+ * Copyright (C) 2008 Internet Archive.
+ * 
+ * This file is part of the archive-access tools project
+ * (http://sourceforge.net/projects/archive-access).
+ * 
+ * The archive-access tools are free software; you can redistribute them and/or
+ * modify them under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or any
+ * later version.
+ * 
+ * The archive-access tools are distributed in the hope that they will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
+ * Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser Public License along with
+ * the archive-access tools; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.nutchwax.tools;
+
+import java.io.File;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.Collections;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+
+import org.archive.wayback.UrlCanonicalizer;
+import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
+
+/**
+ * Reads series of (digest+URL,date) lines, finds corresponding
+ * document in index, and adds the date to it.
+ */
+public class DateAdder
+{
+
+
+  public static void main(String[] args)
+    throws Exception
+  {
+    if ( args.length < 4 )
+      {
+        System.out.println( "DateAdder <key-index> <source1> ... <sourceN> <dest> <records>" );
+        System.exit( 0 );
+      }
+
+    String mainIndexDir = args[0].trim();
+    String destIndexDir = args[args.length - 2].trim();
+    String recordsFile  = args[args.length - 1].trim();
+    
+    InputStream recordsStream;
+    if ( "-".equals( recordsFile ) )
+      {
+        recordsStream = System.in;
+      }
+    else
+      {
+        recordsStream = new FileInputStream( recordsFile );
+      }
+
+    // Read date-addition records from stdin.
+    Map<String,String> dateRecords = new HashMap<String,String>( );
+    BufferedReader br = new BufferedReader( new InputStreamReader( recordsStream, "UTF-8" ) );
+    String line;
+    while ( (line = br.readLine( )) != null )
+      {
+        String parts[] = line.split("\\s+");
+        if ( parts.length != 3 ) 
+          {
+            System.out.println( "Malformed line: " + line );
+            continue;
+          }
+
+        // Key is hash+url, value is String which is a " "-separated list of dates
+        String key   = parts[0] + parts[1];
+        String dates = dateRecords.get( key );
+        if ( dates != null )
+          {
+            dates += " " + parts[2];
+            dateRecords.put( key, dates );
+          }
+        else
+          {
+            dateRecords.put( key , parts[2] );
+          }
+
+      }
+
+    IndexReader reader = IndexReader.open( mainIndexDir );
+    
+    IndexReader sourceReaders[] = new IndexReader[args.length-3];
+    for ( int i = 0 ; i < sourceReaders.length ; i++ )
+      {
+        sourceReaders[i] = IndexReader.open( args[i+1] );
+      }
+
+    IndexWriter writer = new IndexWriter( destIndexDir, new WhitespaceAnalyzer( ), true );
+    
+    UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer( );
+
+    for ( int i = 0 ; i < reader.numDocs( ) ; i++ )
+      {
+        Document oldDoc = reader.document( i );
+        Document newDoc = new Document( );
+
+        // Copy the source values to the new document.
+        /*
+        String dates[] = oldDoc.getValues( "date" );
+
+        if ( dates != null )
+          {
+            for ( String date : dates )
+              {
+                newDoc.add( new Field( "date", date, Field.Store.YES, Field.Index.UN_TOKENIZED ) );
+              }
+          }
+        */
+        Set<String> uniqueDates = new HashSet<String>( );
+        for ( IndexReader source : sourceReaders )
+          {
+            Document sourceDoc = source.document( i );
+            
+            String dates[] = sourceDoc.getValues( "date" );
+
+            java.util.Collections.addAll( uniqueDates, dates );
+          }
+        for ( String date : uniqueDates )
+          {
+            newDoc.add( new Field( "date", date, Field.Store.YES, Field.Index.UN_TOKENIZED ) );
+          }
+
+        // First, apply URL canonicalization from Wayback
+        String canonicalizedUrl = canonicalizer.urlStringToKey( oldDoc.get( "url" ) );
+
+        // Now, get the digest+ URL of the document, look for it in
+        // the updateRecords and if found, add the date.
+        String key = canonicalizedUrl + oldDoc.get( "archive-digest" );
+
+        String newDates = dateRecords.get( key );
+        if ( newDates != null )
+          {
+            for ( String date : newDates.split("\\s+") )
+              {
+                newDoc.add( new Field( "date", date, Field.Store.YES, Field.Index.UN_TOKENIZED ) );
+              }
+          }
+
+        writer.addDocument( newDoc );
+      }
+
+    reader.close( );
+    writer.close( );
+  }
+  
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




[Archive-access-cvs] SF.net SVN: archive-access: [2333] trunk/archive-access/projects/nutchwax/ ar

[Archive-access-cvs] SF.net SVN: archive-access: [2333] trunk/archive-access/projects/nutchwax/ archive/src/java/org/archive/nutchwax/tools/DateAdder.java