From: <bi...@us...> - 2008-09-25 21:13:31
|
Revision: 2595
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2595&view=rev
Author:   binzino
Date:     2008-09-25 21:13:25 +0000 (Thu, 25 Sep 2008)

Log Message:
-----------
Added try/catch around use of UrlCanonicalizer so that we ignore URIs that are malformed. A warning is emitted to stderr.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	2008-09-22 19:55:50 UTC (rev 2594)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	2008-09-25 21:13:25 UTC (rev 2595)
@@ -137,14 +137,27 @@
           newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED ) );
         }
 
-        // First, apply URL canonicalization from Wayback
-        String canonicalizedUrl = canonicalizer.urlStringToKey( oldDoc.get( NutchWax.URL_KEY ) );
+        // Obtain the new dates for the document.
+        String newDates = null;
+        try
+        {
+          // First, apply URL canonicalization from Wayback
+          String canonicalizedUrl = canonicalizer.urlStringToKey( oldDoc.get( NutchWax.URL_KEY ) );
 
-        // Now, get the digest+ URL of the document, look for it in
-        // the updateRecords and if found, add the date.
-        String key = canonicalizedUrl + oldDoc.get( NutchWax.DIGEST_KEY );
+          // Now, get the digest+URL of the document, look for it in
+          // the updateRecords and if found, add the date.
+          String key = canonicalizedUrl + oldDoc.get( NutchWax.DIGEST_KEY );
+
+          newDates = dateRecords.get( key );
+        }
+        catch ( Exception e )
+        {
+          // The canonicalizer can throw various types of exceptions
+          // due to malformed URIs.
+          System.err.println( "WARN: Not adding dates on malformed URI: " + oldDoc.get( NutchWax.URL_KEY ) );
+        }
 
-        String newDates = dateRecords.get( key );
+        // If there are any new dates, add them to the new document.
         if ( newDates != null )
         {
           for ( String date : newDates.split("\\s+") )
@@ -153,6 +166,7 @@
           }
         }
 
+        // Finally, add the new document to the new index.
         writer.addDocument( newDoc );
       }
 

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |