From: <bi...@us...> - 2008-09-22 18:40:19
|
Revision: 2592 http://archive-access.svn.sourceforge.net/archive-access/?rev=2592&view=rev Author: binzino Date: 2008-09-22 18:40:08 +0000 (Mon, 22 Sep 2008) Log Message: ----------- WAX-21: Allow for blank lines and comment lines in manifest file. Comment lines start with '#'. Extra whitespace at the start/end of all lines is also eliminated. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-09-22 18:07:59 UTC (rev 2591) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-09-22 18:40:08 UTC (rev 2592) @@ -19,7 +19,6 @@ import java.io.IOException; import java.net.MalformedURLException; import java.util.Map.Entry; -import java.util.Iterator; import java.util.List; import java.util.ArrayList; @@ -37,8 +36,6 @@ import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.TextInputFormat; -import org.apache.hadoop.mapred.TextOutputFormat; -import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; @@ -59,17 +56,14 @@ import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.ProtocolStatus; import org.apache.nutch.scoring.ScoringFilters; -import org.apache.nutch.util.LogUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.StringUtil; import org.archive.io.ArchiveReader; import org.archive.io.ArchiveReaderFactory; -import org.archive.io.ArchiveRecordHeader; import org.archive.io.arc.ARCRecord; import org.archive.io.arc.ARCRecordMetaData; 
-import org.archive.io.warc.WARCConstants; /** @@ -175,14 +169,22 @@ String arcUrl = ""; String collection = ""; String segmentName = getConf().get( Nutch.SEGMENT_NAME_KEY ); - + + // First, ignore blank manifest lines, and those that are comments. + String line = value.toString().trim( ); + if ( line.length() == 0 || line.charAt( 0 ) == '#' ) + { + // Ignore it. + return ; + } + // Each line of the manifest is "<url> <collection>" where <collection> is optional - String[] line = value.toString().split( "\\s+" ); - arcUrl = line[0]; + String[] parts = line.split( "\\s+" ); + arcUrl = parts[0]; - if ( line.length > 1 ) + if ( parts.length > 1 ) { - collection = line[1]; + collection = parts[1]; } if ( LOG.isInfoEnabled() ) LOG.info( "Importing ARC: " + arcUrl ); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |