From: <bi...@us...> - 2008-09-02 23:26:04
|
Revision: 2589 http://archive-access.svn.sourceforge.net/archive-access/?rev=2589&view=rev Author: binzino Date: 2008-09-02 23:26:08 +0000 (Tue, 02 Sep 2008) Log Message: ----------- Changed parsing of dup/date file to allow for extra, unused fields. Also updated dedup-cdx script to add archive filename to output. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx =================================================================== --- trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx 2008-08-29 22:18:41 UTC (rev 2588) +++ trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx 2008-09-02 23:26:08 UTC (rev 2589) @@ -11,16 +11,26 @@ echo "Duplicate records are found by sorting all the CDX records, then" echo "comparing subsequent records by URL+digest." echo - echo "Output is in abbreviated form of \"URL digest date\", ex:" + echo "Output is in abbreviated form of \"URL digest date arcname\", ex:" echo - echo " example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20070208173443" - echo " example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20080626121505" + echo " example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20070208173443 foo.arc.gz" + echo " example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20080626121505 bar.arc.gz" echo echo "The output of this script can be used as an exclusions file for" echo "importing ARC files with NutchWAX, and also for adding dates" echo "to a parallel index." echo + echo "NOTE: This script uses Unix 'sort' binary. If you wish to use a different" + echo "implementation, specify it via the SORT shell variable, e.g.:" + echo + echo " SORT=my_cool_sort dedup-cdx file1.cdx" + echo exit 1; fi -cat $@ | awk '{ print $1 " sha1:" $6 " " $2 }' | sort -u | awk '{ if ( url == $1 && digest == $2 ) print $1 " " $2 " " $3 ; url = $1 ; digest = $2 }' +# Use Unix 'sort', unless over-ridden by caller. +if [ -z "$SORT" ]; then + SORT=sort +fi + +cat $@ | awk '{ print $1, "sha1:" $6, $2, $9 }' | $SORT -u | awk '{ if ( url == $1 && digest == $2 ) print $1, $2, $3, $4 ; url = $1 ; digest = $2 }' Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-08-29 22:18:41 UTC (rev 2588) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-09-02 23:26:08 UTC (rev 2589) @@ -84,24 +84,24 @@ String line; while ( (line = br.readLine( )) != null ) { - String parts[] = line.split("\\s+"); - if ( parts.length != 3 ) + String fields[] = line.split("\\s+"); + if ( fields.length < 3 ) { - System.out.println( "Malformed line: " + line ); + System.out.println( "Malformed line, not enough fields (" + fields.length +"): " + line ); continue; } // Key is hash+url, value is String which is a " "-separated list of dates - String key = parts[0] + parts[1]; + String key = fields[0] + fields[1]; String dates = dateRecords.get( key ); if ( dates != null ) { - dates += " " + parts[2]; + dates += " " + fields[2]; dateRecords.put( key, dates ); } else { - dateRecords.put( key , parts[2] ); + dateRecords.put( key , fields[2] ); } } Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java 2008-08-29 22:18:41 UTC (rev 2588) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java 2008-09-02 23:26:08 UTC (rev 2589) @@ -48,7 +48,6 @@ * same logic as the Wayback. By making Wayback canonicalization * available, we can use exclusion rules generated from CDX files. */ -// TODO: Add logging public class WaybackURLFilter implements URLFilter { public static final Log LOG = LogFactory.getLog( WaybackURLFilter.class ); @@ -75,7 +74,7 @@ if ( s.length != 3 ) { // Don't filter. - LOG.info( "Allowing: " + urlString ); + LOG.info( "Allowing : " + urlString ); return urlString; } @@ -94,7 +93,7 @@ // Then, build a key to be compared against the exclusion // list. - String key = url + " " + digest + " " + date; + String key = url + digest + date; exclude = this.exclusions.contains( key ); } @@ -192,6 +191,20 @@ String line; while ( (line = reader.readLine()) != null ) { + String fields[] = line.split( "\\s+" ); + + if ( fields.length < 3 ) + { + LOG.warn( "Malformed exclusion, not enough fields ("+fields.length+"): " + line ); + continue ; + } + + // We only want the first three fields. Chop-off anything extra. + if ( fields.length >= 3 ) + { + line = fields[0] + fields[1] + fields[2]; + } + exclusions.add( line ); } } @@ -222,5 +235,5 @@ return exclusions; } - + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |