From: <bi...@us...> - 2008-06-26 22:36:36
|
Revision: 2333 http://archive-access.svn.sourceforge.net/archive-access/?rev=2333&view=rev Author: binzino Date: 2008-06-26 15:36:45 -0700 (Thu, 26 Jun 2008) Log Message: ----------- Initial revision. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-06-26 22:36:45 UTC (rev 2333) @@ -0,0 +1,169 @@ +/* + * Copyright (C) 2008 Internet Archive. + * + * This file is part of the archive-access tools project + * (http://sourceforge.net/projects/archive-access). + * + * The archive-access tools are free software; you can redistribute them and/or + * modify them under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or any + * later version. + * + * The archive-access tools are distributed in the hope that they will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + * Public License for more details. 
*
* You should have received a copy of the GNU Lesser Public License along with
* the archive-access tools; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.archive.nutchwax.tools;

import java.io.File;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Map;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import java.util.Collections;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.analysis.WhitespaceAnalyzer;

import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;

/**
 * Builds a new Lucene index containing only "date" fields, one output
 * document per document of an existing key index.
 *
 * For document number <code>i</code> of the key index, the output
 * document receives the "date" values of document number
 * <code>i</code> of every source index, plus any dates listed for that
 * document in the records input.
 *
 * A records line has exactly three whitespace-separated columns.  It
 * applies to a document when column 1 immediately followed by column 2
 * equals the document's canonicalized "url" value immediately followed
 * by its "archive-digest" value; column 3 is the date to add.
 */
public class DateAdder
{
  /** Name of the field read from the source indexes and written to the destination. */
  private static final String DATE_FIELD = "date";

  /**
   * Command-line entry point.
   *
   * @param args <code>&lt;key-index&gt; &lt;source1&gt; ... &lt;sourceN&gt;
   *             &lt;dest&gt; &lt;records&gt;</code>; <code>records</code>
   *             may be "-" to read the records from standard input
   * @throws Exception on any I/O or index error
   */
  public static void main(String[] args)
    throws Exception
  {
    if ( args.length < 4 )
    {
      System.out.println( "DateAdder <key-index> <source1> ... <sourceN> <dest> <records>" );
      System.exit( 0 );
    }

    String mainIndexDir = args[0].trim();
    String destIndexDir = args[args.length - 2].trim();
    String recordsFile  = args[args.length - 1].trim();

    Map<String,String> dateRecords = readDateRecords( recordsFile );

    UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer( );

    // Nested try/finally blocks make sure every index that was opened
    // is closed again, even when processing fails part-way through.
    IndexReader reader = IndexReader.open( mainIndexDir );
    try
    {
      // Everything between the key index and the last two arguments is a source index.
      IndexReader sourceReaders[] = new IndexReader[args.length-3];
      try
      {
        for ( int i = 0 ; i < sourceReaders.length ; i++ )
        {
          sourceReaders[i] = IndexReader.open( args[i+1] );
        }

        IndexWriter writer = new IndexWriter( destIndexDir, new WhitespaceAnalyzer( ), true );
        try
        {
          // NOTE(review): iterating 0..numDocs()-1 and using the same
          // number in every source assumes the key index has no
          // deleted documents and that all indexes share document
          // numbering -- confirm against how the indexes are built.
          int numDocs = reader.numDocs( );
          for ( int i = 0 ; i < numDocs ; i++ )
          {
            Document oldDoc = reader.document( i );

            // Collect the dates in a set so that a date present in
            // several sources, or repeated in the records, is written
            // only once.
            Set<String> uniqueDates = new HashSet<String>( );
            for ( IndexReader source : sourceReaders )
            {
              String dates[] = source.document( i ).getValues( DATE_FIELD );

              // getValues can return null when the document has no such field.
              if ( dates != null )
              {
                Collections.addAll( uniqueDates, dates );
              }
            }

            // Apply URL canonicalization from Wayback, then build the
            // lookup key: canonicalized URL followed by the digest.
            String canonicalizedUrl = canonicalizer.urlStringToKey( oldDoc.get( "url" ) );
            String key = canonicalizedUrl + oldDoc.get( "archive-digest" );

            String newDates = dateRecords.get( key );
            if ( newDates != null )
            {
              Collections.addAll( uniqueDates, newDates.split( "\\s+" ) );
            }

            Document newDoc = new Document( );
            for ( String date : uniqueDates )
            {
              newDoc.add( new Field( DATE_FIELD, date, Field.Store.YES, Field.Index.UN_TOKENIZED ) );
            }

            // A document is written even when it has no dates, so the
            // output has one document per key-index document.
            writer.addDocument( newDoc );
          }
        }
        finally
        {
          writer.close( );
        }
      }
      finally
      {
        for ( IndexReader source : sourceReaders )
        {
          // Entries are null if opening an earlier source failed.
          if ( source != null )
          {
            source.close( );
          }
        }
      }
    }
    finally
    {
      reader.close( );
    }
  }

  /**
   * Reads the date records, grouping the dates by key.
   *
   * Lines that do not split into exactly three whitespace-separated
   * columns are reported on standard output and skipped.
   *
   * @param recordsFile path of the records file, or "-" for standard input
   * @return map from key (column 1 + column 2) to a " "-separated
   *         list of the dates (column 3) seen for that key
   * @throws IOException if the records cannot be opened or read
   */
  private static Map<String,String> readDateRecords( String recordsFile )
    throws IOException
  {
    InputStream recordsStream;
    if ( "-".equals( recordsFile ) )
    {
      recordsStream = System.in;
    }
    else
    {
      recordsStream = new FileInputStream( recordsFile );
    }

    Map<String,String> dateRecords = new HashMap<String,String>( );
    BufferedReader br = new BufferedReader( new InputStreamReader( recordsStream, "UTF-8" ) );
    try
    {
      String line;
      while ( (line = br.readLine( )) != null )
      {
        String parts[] = line.split("\\s+");
        if ( parts.length != 3 )
        {
          System.out.println( "Malformed line: " + line );
          continue;
        }

        // NOTE(review): the key is column 1 + column 2 and is looked up
        // as canonicalized-URL + digest, so the columns must be in
        // (URL, digest, date) order -- confirm that this matches the
        // tool that produces the records.
        String key = parts[0] + parts[1];
        String dates = dateRecords.get( key );
        dateRecords.put( key, dates == null ? parts[2] : dates + " " + parts[2] );
      }
    }
    finally
    {
      br.close( );
    }

    return dateRecords;
  }

}
// This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |