From: <bi...@us...> - 2008-06-29 00:17:39
|
Revision: 2342 http://archive-access.svn.sourceforge.net/archive-access/?rev=2342&view=rev Author: binzino Date: 2008-06-28 17:17:48 -0700 (Sat, 28 Jun 2008) Log Message: ----------- Changed "key" used to identify document from URL to URL+digest. Also, this value is stored in a metadata field named "orig" in order to work around a bad assumption in Nutch's FetchedSegments.getUrl(). Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-06-28 23:55:28 UTC (rev 2341) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-06-29 00:17:48 UTC (rev 2342) @@ -276,11 +276,38 @@ return false; } + // We create a key which combines the URL and digest values. + // This is necessary because Nutch stores all the data in + // MapFiles, which are basically just {key,value} pairs. + // + // If we use just the URL as the key (which is the way Nutch + // usually works) then we have problems with multiple, + // different copies of the same URL. If we try and store two + // different copies of the same URL (each having a different + // digest) and only use the URL as the key, when the MapFile + // is written, only *one* copy of the page will be stored. + // + // Think about it, we're basically doing: + // MapFile.put( url, value1 ); + // MapFile.put( url, value2 ); + // Only one of those url,value mappings will keep, the other + // is over-written. + // + // So, by using the url+digest as the key, we can have all the + // data stored. 
The only problem is all over in Nutch where + // the key==url is assumed :( + String key = url + " " + meta.getDigest( ); + Metadata contentMetadata = new Metadata(); // Set the segment name, just as is done by standard Nutch fetching. // Then, add the NutchWAX-specific metadata fields. contentMetadata.set( Nutch .SEGMENT_NAME_KEY, segmentName ); + // We store both the normal URL and the URL+digest key for + // later retrieval by the indexing plugin(s). + contentMetadata.set( NutchWax.URL_KEY, url ); + contentMetadata.set( NutchWax.ORIG_KEY, key ); + contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); contentMetadata.set( NutchWax.ARCNAME_KEY, meta.getArcFile().getName() ); contentMetadata.set( NutchWax.COLLECTION_KEY, collectionName ); @@ -289,7 +316,7 @@ Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() ); - output( output, new Text( url ), content ); + output( output, new Text( key ), content ); return true; } @@ -342,7 +369,9 @@ Text key, Content content ) { - // Create the datum + LOG.debug( "output( " + key + " )" ); + + // Create the crawl datum. This CrawlDatum datum = new CrawlDatum( CrawlDatum.STATUS_FETCH_SUCCESS, this.interval, 1.0f ); // ?: I have no idea why we need to store the ProtocolStatus in @@ -418,7 +447,7 @@ { for ( Entry<Text, Parse> entry : parseResult ) { - Text url = entry.getKey(); + Text url = entry.getKey(); Parse parse = entry.getValue(); ParseStatus parseStatus = parse.getData().getStatus(); @@ -440,10 +469,20 @@ parse.getData().getContentMeta().set( Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature) ); parse.getData().getContentMeta().set( Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime() ) ); + // ?: What is this all about? It was in the original ArcSegmentCreator.java that + // inspired this code. But I can't figure out why we need it. If anything + // this will always be false since our key is now URL+digest, not just URL. 
+ // Since it's always false, let's leave it out. + /* if ( url.equals( key ) ) { datum.setSignature( signature ); } + else + { + if ( LOG.isWarnEnabled() ) LOG.warn( "ParseResult entry key and url differ: key=" + key + " url=" + url ); + } + */ // ?: As above, we'll leave the scoring hooks in place. try @@ -455,7 +494,7 @@ if ( LOG.isWarnEnabled() ) LOG.warn( "Couldn't pass score, url = " + key, e ); } - output.collect( url, new NutchWritable( new ParseImpl( new ParseText( parse.getText() ), parse.getData(), parse.isCanonical() ) ) ); + output.collect( key, new NutchWritable( new ParseImpl( new ParseText( parse.getText() ), parse.getData(), parse.isCanonical() ) ) ); } } } Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-06-28 23:55:28 UTC (rev 2341) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-06-29 00:17:48 UTC (rev 2342) @@ -41,14 +41,15 @@ import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.nutchwax.NutchWax; + + /** * Reads series of (digest+URL,date) lines, finds corresponding * document in index, and adds the date to it. */ public class DateAdder { - - public static void main(String[] args) throws Exception { @@ -117,45 +118,35 @@ Document oldDoc = reader.document( i ); Document newDoc = new Document( ); - // Copy the source values to the new document. - /* - String dates[] = oldDoc.getValues( "date" ); - - if ( dates != null ) - { - for ( String date : dates ) - { - newDoc.add( new Field( "date", date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); - } - } - */ + // Copy the values from all the source indices to the new + // document. 
Set<String> uniqueDates = new HashSet<String>( ); for ( IndexReader source : sourceReaders ) { Document sourceDoc = source.document( i ); - String dates[] = sourceDoc.getValues( "date" ); + String dates[] = sourceDoc.getValues( NutchWax.DATE_KEY ); java.util.Collections.addAll( uniqueDates, dates ); } for ( String date : uniqueDates ) { - newDoc.add( new Field( "date", date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); + newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); } // First, apply URL canonicalization from Wayback - String canonicalizedUrl = canonicalizer.urlStringToKey( oldDoc.get( "url" ) ); + String canonicalizedUrl = canonicalizer.urlStringToKey( oldDoc.get( NutchWax.URL_KEY ) ); // Now, get the digest+ URL of the document, look for it in // the updateRecords and if found, add the date. - String key = canonicalizedUrl + oldDoc.get( "archive-digest" ); + String key = canonicalizedUrl + oldDoc.get( NutchWax.DIGEST_KEY ); String newDates = dateRecords.get( key ); if ( newDates != null ) { for ( String date : newDates.split("\\s+") ) { - newDoc.add( new Field( "date", date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); + newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |