From: <bi...@us...> - 2008-09-02 23:26:04
|
Revision: 2589 http://archive-access.svn.sourceforge.net/archive-access/?rev=2589&view=rev Author: binzino Date: 2008-09-02 23:26:08 +0000 (Tue, 02 Sep 2008) Log Message: ----------- Changed parsing of dup/date file to allow for extra, unused fields. Also updated dedup-cdx script to add archive filename to output. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx =================================================================== --- trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx 2008-08-29 22:18:41 UTC (rev 2588) +++ trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx 2008-09-02 23:26:08 UTC (rev 2589) @@ -11,16 +11,26 @@ echo "Duplicate records are found by sorting all the CDX records, then" echo "comparing subsequent records by URL+digest." echo - echo "Output is in abbreviated form of \"URL digest date\", ex:" + echo "Output is in abbreviated form of \"URL digest date arcname\", ex:" echo - echo " example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20070208173443" - echo " example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20080626121505" + echo " example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20070208173443 foo.arc.gz" + echo " example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20080626121505 bar.arc.gz" echo echo "The output of this script can be used as an exclusions file for" echo "importing ARC files with NutchWAX, and also for adding dates" echo "to a parallel index." echo + echo "NOTE: This script uses Unix 'sort' binary. If you wish to use a different" + echo "implementation, specify it via the SORT shell variable, e.g.:" + echo + echo " SORT=my_cool_sort dedup-cdx file1.cdx" + echo exit 1; fi -cat $@ | awk '{ print $1 " sha1:" $6 " " $2 }' | sort -u | awk '{ if ( url == $1 && digest == $2 ) print $1 " " $2 " " $3 ; url = $1 ; digest = $2 }' +# Use Unix 'sort', unless over-ridden by caller. +if [ -z "$SORT" ]; then + SORT=sort +fi + +cat $@ | awk '{ print $1, "sha1:" $6, $2, $9 }' | $SORT -u | awk '{ if ( url == $1 && digest == $2 ) print $1, $2, $3, $4 ; url = $1 ; digest = $2 }' Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-08-29 22:18:41 UTC (rev 2588) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-09-02 23:26:08 UTC (rev 2589) @@ -84,24 +84,24 @@ String line; while ( (line = br.readLine( )) != null ) { - String parts[] = line.split("\\s+"); - if ( parts.length != 3 ) + String fields[] = line.split("\\s+"); + if ( fields.length < 3 ) { - System.out.println( "Malformed line: " + line ); + System.out.println( "Malformed line, not enough fields (" + fields.length +"): " + line ); continue; } // Key is hash+url, value is String which is a " "-separated list of dates - String key = parts[0] + parts[1]; + String key = fields[0] + fields[1]; String dates = dateRecords.get( key ); if ( dates != null ) { - dates += " " + parts[2]; + dates += " " + fields[2]; dateRecords.put( key, dates ); } else { - dateRecords.put( key , parts[2] ); + dateRecords.put( key , fields[2] ); } } Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java 2008-08-29 22:18:41 UTC (rev 2588) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java 2008-09-02 23:26:08 UTC (rev 2589) @@ -48,7 +48,6 @@ * same logic as the Wayback. By making Wayback canonicalization * available, we can use exclusion rules generated from CDX files. */ -// TODO: Add logging public class WaybackURLFilter implements URLFilter { public static final Log LOG = LogFactory.getLog( WaybackURLFilter.class ); @@ -75,7 +74,7 @@ if ( s.length != 3 ) { // Don't filter. - LOG.info( "Allowing: " + urlString ); + LOG.info( "Allowing : " + urlString ); return urlString; } @@ -94,7 +93,7 @@ // Then, build a key to be compared against the exclusion // list. - String key = url + " " + digest + " " + date; + String key = url + digest + date; exclude = this.exclusions.contains( key ); } @@ -192,6 +191,20 @@ String line; while ( (line = reader.readLine()) != null ) { + String fields[] = line.split( "\\s+" ); + + if ( fields.length < 3 ) + { + LOG.warn( "Malformed exclusion, not enough fields ("+fields.length+"): " + line ); + continue ; + } + + // We only want the first three fields. Chop-off anything extra. + if ( fields.length >= 3 ) + { + line = fields[0] + fields[1] + fields[2]; + } + exclusions.add( line ); } } @@ -222,5 +235,5 @@ return exclusions; } - + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |