From: <bi...@us...> - 2008-06-29 00:17:39
|
Revision: 2342 http://archive-access.svn.sourceforge.net/archive-access/?rev=2342&view=rev Author: binzino Date: 2008-06-28 17:17:48 -0700 (Sat, 28 Jun 2008) Log Message: ----------- Changed "key" used to identify document from URL to URL+digest. Also, this value is stored in a metadata field named "orig" in order to work around a bad assumption in Nutch's FetchedSegments.getUrl(). Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-06-28 23:55:28 UTC (rev 2341) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-06-29 00:17:48 UTC (rev 2342) @@ -276,11 +276,38 @@ return false; } + // We create a key which combines the URL and digest values. + // This is necessary because Nutch stores all the data in + // MapFiles, which are basically just {key,value} pairs. + // + // If we use just the URL as the key (which is the way Nutch + // usually works) then we have problems with multiple, + // different copies of the same URL. If we try and store two + // different copies of the same URL (each having a different + // digest) and only use the URL as the key, when the MapFile + // is written, only *one* copy of the page will be stored. + // + // Think about it, we're basically doing: + // MapFile.put( url, value1 ); + // MapFile.put( url, value2 ); + // Only one of those url,value mappings will keep, the other + // is over-written. + // + // So, by using the url+digest as the key, we can have all the + // data stored. 
The only problem is all over in Nutch where + // the key==url is assumed :( + String key = url + " " + meta.getDigest( ); + Metadata contentMetadata = new Metadata(); // Set the segment name, just as is done by standard Nutch fetching. // Then, add the NutchWAX-specific metadata fields. contentMetadata.set( Nutch .SEGMENT_NAME_KEY, segmentName ); + // We store both the normal URL and the URL+digest key for + // later retrieval by the indexing plugin(s). + contentMetadata.set( NutchWax.URL_KEY, url ); + contentMetadata.set( NutchWax.ORIG_KEY, key ); + contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); contentMetadata.set( NutchWax.ARCNAME_KEY, meta.getArcFile().getName() ); contentMetadata.set( NutchWax.COLLECTION_KEY, collectionName ); @@ -289,7 +316,7 @@ Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() ); - output( output, new Text( url ), content ); + output( output, new Text( key ), content ); return true; } @@ -342,7 +369,9 @@ Text key, Content content ) { - // Create the datum + LOG.debug( "output( " + key + " )" ); + + // Create the crawl datum. This CrawlDatum datum = new CrawlDatum( CrawlDatum.STATUS_FETCH_SUCCESS, this.interval, 1.0f ); // ?: I have no idea why we need to store the ProtocolStatus in @@ -418,7 +447,7 @@ { for ( Entry<Text, Parse> entry : parseResult ) { - Text url = entry.getKey(); + Text url = entry.getKey(); Parse parse = entry.getValue(); ParseStatus parseStatus = parse.getData().getStatus(); @@ -440,10 +469,20 @@ parse.getData().getContentMeta().set( Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature) ); parse.getData().getContentMeta().set( Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime() ) ); + // ?: What is this all about? It was in the original ArcSegmentCreator.java that + // inspired this code. But I can't figure out why we need it. If anything + // this will always be false since our key is now URL+digest, not just URL. 
+ // Since it's always false, let's leave it out. + /* if ( url.equals( key ) ) { datum.setSignature( signature ); } + else + { + if ( LOG.isWarnEnabled() ) LOG.warn( "ParseResult entry key and url differ: key=" + key + " url=" + url ); + } + */ // ?: As above, we'll leave the scoring hooks in place. try @@ -455,7 +494,7 @@ if ( LOG.isWarnEnabled() ) LOG.warn( "Couldn't pass score, url = " + key, e ); } - output.collect( url, new NutchWritable( new ParseImpl( new ParseText( parse.getText() ), parse.getData(), parse.isCanonical() ) ) ); + output.collect( key, new NutchWritable( new ParseImpl( new ParseText( parse.getText() ), parse.getData(), parse.isCanonical() ) ) ); } } } Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-06-28 23:55:28 UTC (rev 2341) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-06-29 00:17:48 UTC (rev 2342) @@ -41,14 +41,15 @@ import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.nutchwax.NutchWax; + + /** * Reads series of (digest+URL,date) lines, finds corresponding * document in index, and adds the date to it. */ public class DateAdder { - - public static void main(String[] args) throws Exception { @@ -117,45 +118,35 @@ Document oldDoc = reader.document( i ); Document newDoc = new Document( ); - // Copy the source values to the new document. - /* - String dates[] = oldDoc.getValues( "date" ); - - if ( dates != null ) - { - for ( String date : dates ) - { - newDoc.add( new Field( "date", date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); - } - } - */ + // Copy the values from all the source indices to the new + // document. 
Set<String> uniqueDates = new HashSet<String>( ); for ( IndexReader source : sourceReaders ) { Document sourceDoc = source.document( i ); - String dates[] = sourceDoc.getValues( "date" ); + String dates[] = sourceDoc.getValues( NutchWax.DATE_KEY ); java.util.Collections.addAll( uniqueDates, dates ); } for ( String date : uniqueDates ) { - newDoc.add( new Field( "date", date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); + newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); } // First, apply URL canonicalization from Wayback - String canonicalizedUrl = canonicalizer.urlStringToKey( oldDoc.get( "url" ) ); + String canonicalizedUrl = canonicalizer.urlStringToKey( oldDoc.get( NutchWax.URL_KEY ) ); // Now, get the digest+ URL of the document, look for it in // the updateRecords and if found, add the date. - String key = canonicalizedUrl + oldDoc.get( "archive-digest" ); + String key = canonicalizedUrl + oldDoc.get( NutchWax.DIGEST_KEY ); String newDates = dateRecords.get( key ); if ( newDates != null ) { for ( String date : newDates.split("\\s+") ) { - newDoc.add( new Field( "date", date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); + newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |