Revision: 2348
http://archive-access.svn.sourceforge.net/archive-access/?rev=2348&view=rev
Author: binzino
Date: 2008-07-01 16:06:32 -0700 (Tue, 01 Jul 2008)
Log Message:
-----------
Added "-u" to sort command. Fixed error in usage info.
Modified Paths:
--------------
trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx
Modified: trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx 2008-07-01 22:52:08 UTC (rev 2347)
+++ trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx 2008-07-01 23:06:32 UTC (rev 2348)
@@ -13,14 +13,14 @@
echo
echo "Output is in abbreviated form of \"URL digest date\", ex:"
echo
+ echo " example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20070208173443"
echo " example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20080626121505"
- echo " example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20070208173443"
echo
echo "The output of this script can be used as an exclusions file for"
- echo "importing (W)ARC files with NutchWAX, and also for adding dates"
+ echo "importing ARC files with NutchWAX, and also for adding dates"
echo "to a parallel index."
echo
exit 1;
fi
-cat $@ | awk '{ print $1 " sha1:" $6 " " $2 }' | sort | awk '{ if ( url == $1 && digest == $2 ) print $1 " " $2 " " $3 ; url = $1 ; digest = $2 }'
+cat $@ | awk '{ print $1 " sha1:" $6 " " $2 }' | sort -u | awk '{ if ( url == $1 && digest == $2 ) print $1 " " $2 " " $3 ; url = $1 ; digest = $2 }'
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.