From: <bi...@us...> - 2008-07-25 20:33:50
Revision: 2495
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2495&view=rev
Author:   binzino
Date:     2008-07-25 20:33:59 +0000 (Fri, 25 Jul 2008)

Log Message:
-----------
Changed "none" to "unknown" for HTTPStatusCodeFilter to avoid confusion over
whether "none" means "nothing is allowed at all" vs. "no code for this record".

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2008-07-25 20:24:53 UTC (rev 2494)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2008-07-25 20:33:59 UTC (rev 2495)
@@ -715,10 +715,10 @@
     {
       Range range = new Range( );
 
-      // Special handling for "none" where an ARCRecord doesn't have
+      // Special handling for "unknown" where an ARCRecord doesn't have
       // an HTTP status code.  The ARCRecord.getStatusCode() returns
       // -1 in that case, so we make a range for it.
-      if ( value.toLowerCase( ).equals( "none" ) )
+      if ( value.toLowerCase( ).equals( "unknown" ) )
         {
           range.lower = -1;
           range.upper = -1;
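To illustrate the behavior this change enables, here is a small, self-contained sketch of a range-based status-code filter that maps the configured value "unknown" to the -1 sentinel returned by ARCRecord.getStatusCode(). The Range, parse, and contains names and the "200-299" range syntax are illustrative assumptions for this example, not the actual HTTPStatusCodeFilter API:

public class StatusCodeRangeSketch
{
  static class Range
  {
    int lower;
    int upper;

    boolean contains( int code )
    {
      return code >= lower && code <= upper;
    }
  }

  static Range parse( String value )
  {
    Range range = new Range( );

    // "unknown" covers records that have no HTTP status code at all;
    // ARCRecord.getStatusCode() reports those as -1.
    if ( value.toLowerCase( ).equals( "unknown" ) )
      {
        range.lower = -1;
        range.upper = -1;
        return range;
      }

    // Otherwise accept a single code ("200") or a range ("200-299").
    String[] parts = value.split( "-" );
    range.lower = Integer.parseInt( parts[0].trim( ) );
    range.upper = parts.length > 1 ? Integer.parseInt( parts[1].trim( ) ) : range.lower;

    return range;
  }

  public static void main( String[] args )
  {
    System.out.println( parse( "unknown" ).contains( -1 ) );   // true
    System.out.println( parse( "200-299" ).contains( 301 ) );  // false
  }
}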
From: <bi...@us...> - 2008-08-28 21:54:45
Revision: 2586
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2586&view=rev
Author:   binzino
Date:     2008-08-28 21:54:54 +0000 (Thu, 28 Aug 2008)

Log Message:
-----------
Nutch updated to Hadoop 0.17 and the Mapper interface added generics. So, this
class was updated accordingly.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2008-08-28 21:44:41 UTC (rev 2585)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2008-08-28 21:54:54 UTC (rev 2586)
@@ -97,7 +97,7 @@
  * to the importing of ARC files.  I've noted those details with
  * comments prefaced with "?:".
  */
-public class Importer extends Configured implements Tool, Mapper
+public class Importer extends Configured implements Tool, Mapper<WritableComparable, Writable, Text, NutchWritable>
 {
 
   public static final Log LOG = LogFactory.getLog( Importer.class );
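A minimal sketch of what the generics buy: once Mapper is parameterized, the map() signature is fixed by the type arguments. The class below is a stripped-down stand-in for Importer (the real class also extends Configured and implements Tool, and does the actual ARC processing in map()); only the Hadoop/Nutch types shown in the diff are assumed:

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.nutch.crawl.NutchWritable;

public class GenericMapperSketch
  implements Mapper<WritableComparable, Writable, Text, NutchWritable>
{
  public void configure( JobConf job ) { }

  public void close( ) throws IOException { }

  // The type parameters above pin these argument types; with the old raw
  // Mapper interface the method was declared against untyped Writables.
  public void map( WritableComparable key, Writable value,
                   OutputCollector<Text, NutchWritable> output, Reporter reporter )
    throws IOException
  {
    // Per-record work goes here; the real Importer reads a manifest line
    // and emits one or more <Text, NutchWritable> pairs per ARC record.
  }
}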
From: <bi...@us...> - 2008-09-22 18:40:19
Revision: 2592
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2592&view=rev
Author:   binzino
Date:     2008-09-22 18:40:08 +0000 (Mon, 22 Sep 2008)

Log Message:
-----------
WAX-21: Allow for blank lines and comment lines in the manifest file. Comment
lines start with '#'. Extra whitespace at the start/end of all lines is also
eliminated.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2008-09-22 18:07:59 UTC (rev 2591)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2008-09-22 18:40:08 UTC (rev 2592)
@@ -19,7 +19,6 @@
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Map.Entry;
-import java.util.Iterator;
 import java.util.List;
 import java.util.ArrayList;
 
@@ -37,8 +36,6 @@
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.TextInputFormat;
-import org.apache.hadoop.mapred.TextOutputFormat;
-import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -59,17 +56,14 @@
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ProtocolStatus;
 import org.apache.nutch.scoring.ScoringFilters;
-import org.apache.nutch.util.LogUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.StringUtil;
 
 import org.archive.io.ArchiveReader;
 import org.archive.io.ArchiveReaderFactory;
-import org.archive.io.ArchiveRecordHeader;
 import org.archive.io.arc.ARCRecord;
 import org.archive.io.arc.ARCRecordMetaData;
-import org.archive.io.warc.WARCConstants;
 
 
 /**
@@ -175,14 +169,22 @@
     String arcUrl      = "";
     String collection  = "";
     String segmentName = getConf().get( Nutch.SEGMENT_NAME_KEY );
-    
+
+    // First, ignore blank manifest lines, and those that are comments.
+    String line = value.toString().trim( );
+    if ( line.length() == 0 || line.charAt( 0 ) == '#' )
+      {
+        // Ignore it.
+        return ;
+      }
+
     // Each line of the manifest is "<url> <collection>" where <collection> is optional
-    String[] line = value.toString().split( "\\s+" );
-    arcUrl = line[0];
+    String[] parts = line.split( "\\s+" );
+    arcUrl = parts[0];
 
-    if ( line.length > 1 )
+    if ( parts.length > 1 )
       {
-        collection = line[1];
+        collection = parts[1];
       }
 
     if ( LOG.isInfoEnabled() ) LOG.info( "Importing ARC: " + arcUrl );
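The same parsing rules, pulled out into a standalone sketch for clarity: trim each line, skip blanks and '#' comments, then split the remainder into the ARC URL and an optional collection name. The class name and the file-reading harness are hypothetical; the real code receives each manifest line as the map() value:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

public class ManifestParseSketch
{
  public static void main( String[] args ) throws IOException
  {
    BufferedReader in = new BufferedReader( new FileReader( args[0] ) );
    String raw;
    while ( ( raw = in.readLine( ) ) != null )
      {
        String line = raw.trim( );

        // Skip blank lines and comment lines starting with '#'.
        if ( line.length( ) == 0 || line.charAt( 0 ) == '#' ) continue;

        // "<url> <collection>" where <collection> is optional.
        String[] parts = line.split( "\\s+" );
        String arcUrl     = parts[0];
        String collection = parts.length > 1 ? parts[1] : "";

        System.out.println( "ARC: " + arcUrl + "  collection: " + collection );
      }
    in.close( );
  }
}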
From: <bi...@us...> - 2008-12-10 04:58:28
Revision: 2655
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2655&view=rev
Author:   binzino
Date:     2008-12-10 04:58:24 +0000 (Wed, 10 Dec 2008)

Log Message:
-----------
Change output of messages from stderr to stdout. Add code to check the return
status of the job and pass it back to the command line via the System.exit() call.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2008-12-09 01:58:04 UTC (rev 2654)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2008-12-10 04:58:24 UTC (rev 2655)
@@ -36,6 +36,8 @@
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.TextInputFormat;
+import org.apache.hadoop.mapred.RunningJob;
+import org.apache.hadoop.mapred.JobStatus;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -601,7 +603,7 @@
         {
           if ( args.length < 2 )
             {
-              System.err.println( "ERROR: Missing filename for option \"-e\"\n" );
+              System.out.println( "ERROR: Missing filename for option \"-e\"\n" );
               usage( );
               return -1;
             }
@@ -613,7 +615,7 @@
 
     if ( args.length - pos < 1 )
       {
-        System.err.println( "ERROR: Missing manifest file.\n" );
+        System.out.println( "ERROR: Missing manifest file.\n" );
         usage( );
         return -1;
       }
@@ -645,17 +647,20 @@
       job.setOutputKeyClass  ( Text.class );
       job.setOutputValueClass( NutchWritable.class );
 
-      JobClient.runJob( job );
+      RunningJob rj = JobClient.runJob( job );
+
+      // Emit job id and status.
+      System.out.println( "JOB_STATUS: " + rj.getID( ) + ": " + (rj.isSuccessful( ) ? "SUCCESS" : "FAIL" ) );
+
+      return rj.isSuccessful( ) ? 0 : 1;
     }
     catch ( Exception e )
     {
       LOG.fatal( "Importer: ", e );
-      System.err.println( "Fatal error: " + e );
-      e.printStackTrace( System.err );
+      System.out.println( "Fatal error: " + e );
+      e.printStackTrace( System.out );
       return -1;
     }
-
-    return 0;
   }
 
   /**
@@ -673,7 +678,7 @@
       + "necessary.  This is to mirror the behavior of other Nutch actions.\n"
       ;
 
-    System.err.println( usage );
+    System.out.println( usage );
   }
 
   /**
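For the exit-status path, a sketch of the conventional Tool wiring that would carry run()'s new return value (0 when RunningJob.isSuccessful() is true, non-zero otherwise) back to the shell. Importer's actual main() is not part of this diff, so the harness below is an assumption about how it is launched:

import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;

public class ImporterExitSketch
{
  public static void main( String[] args ) throws Exception
  {
    // run() returns 0 on SUCCESS and a non-zero value on FAIL;
    // System.exit() hands that value back to the calling shell or script.
    int result = ToolRunner.run( NutchConfiguration.create( ),
                                 new org.archive.nutchwax.Importer( ),
                                 args );

    System.exit( result );
  }
}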
From: <bi...@us...> - 2009-05-05 20:24:28
Revision: 2699
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2699&view=rev
Author:   binzino
Date:     2009-05-05 20:24:22 +0000 (Tue, 05 May 2009)

Log Message:
-----------
WAX-42. Add option to continue/abort importing after read error on archive file.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2009-05-05 20:20:45 UTC (rev 2698)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2009-05-05 20:24:22 UTC (rev 2699)
@@ -210,6 +210,15 @@
             reporter.progress();
           }
       }
+    catch ( Exception e )
+      {
+        LOG.warn( "Error processing archive file: " + arcUrl, e );
+
+        if ( jobConf.getBoolean( "nutchwax.import.abortOnArchiveReadError", false ) )
+          {
+            throw new IOException( e );
+          }
+      }
     finally
       {
        r.close();
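A sketch of the continue-vs-abort decision in isolation: a read error is logged and the rest of the manifest keeps being processed unless nutchwax.import.abortOnArchiveReadError is set to true, in which case the exception is rethrown and fails the map task. The class name and the processRecords() helper are stand-ins for the real per-record loop over the ArchiveReader:

import java.io.IOException;

import org.apache.hadoop.mapred.JobConf;

public class AbortOnErrorSketch
{
  public static void importArchive( JobConf jobConf, String arcUrl ) throws IOException
  {
    try
      {
        processRecords( arcUrl );
      }
    catch ( Exception e )
      {
        System.out.println( "Error processing archive file: " + arcUrl + " : " + e );

        // Default is false: keep going and keep whatever was imported so far.
        if ( jobConf.getBoolean( "nutchwax.import.abortOnArchiveReadError", false ) )
          {
            throw new IOException( e );
          }
      }
  }

  private static void processRecords( String arcUrl ) throws Exception
  {
    // Stand-in for iterating the archive's records and importing each one.
  }
}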
From: <bi...@us...> - 2009-10-27 22:46:39
Revision: 2840
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2840&view=rev
Author:   binzino
Date:     2009-10-27 22:46:25 +0000 (Tue, 27 Oct 2009)

Log Message:
-----------
Minor edits to conform to Nutch 1.0 API.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2009-10-27 21:38:28 UTC (rev 2839)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2009-10-27 22:46:25 UTC (rev 2840)
@@ -30,14 +30,16 @@
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.JobStatus;
 import org.apache.hadoop.mapred.Mapper;
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.RunningJob;
 import org.apache.hadoop.mapred.TextInputFormat;
-import org.apache.hadoop.mapred.RunningJob;
-import org.apache.hadoop.mapred.JobStatus;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -46,8 +48,8 @@
 import org.apache.nutch.fetcher.FetcherOutputFormat;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilterException;
 import org.apache.nutch.net.URLFilters;
-import org.apache.nutch.net.URLFilterException;
 import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseImpl;
@@ -323,7 +325,7 @@
       // We store both the normal URL and the URL+digest key for
       // later retrieval by the indexing plugin(s).
       contentMetadata.set( NutchWax.URL_KEY, url );
-      contentMetadata.set( NutchWax.ORIG_KEY, key );
+      //contentMetadata.set( NutchWax.ORIG_KEY, key );
 
       contentMetadata.set( NutchWax.FILENAME_KEY,   meta.getArcFile().getName() );
       contentMetadata.set( NutchWax.FILEOFFSET_KEY, String.valueOf( record.getHeader().getOffset( ) ) );
@@ -650,12 +652,14 @@
       job.setJobName( "Importer " + manifestPath );
       job.set( Nutch.SEGMENT_NAME_KEY, segmentPath.getName() );
 
-      job.setInputPath  ( manifestPath);
+      //job.setInputPath  ( manifestPath);
+      FileInputFormat.addInputPath( job, manifestPath );
       job.setInputFormat( TextInputFormat.class );
 
       job.setMapperClass( Importer.class );
 
-      job.setOutputPath   ( segmentPath );
+      //job.setOutputPath   ( segmentPath );
+      FileOutputFormat.setOutputPath( job, segmentPath );
       job.setOutputFormat ( FetcherOutputFormat.class );
       job.setOutputKeyClass  ( Text.class );
       job.setOutputValueClass( NutchWritable.class );
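The job setup after the API change, as a self-contained sketch: the old JobConf.setInputPath()/setOutputPath() calls are replaced by the static helpers on FileInputFormat/FileOutputFormat, with everything else unchanged. The configure() wrapper and its parameters are illustrative; the classes and calls themselves are the ones the diff uses:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.fetcher.FetcherOutputFormat;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

public class JobSetupSketch
{
  public static JobConf configure( Path manifestPath, Path segmentPath )
  {
    JobConf job = new NutchJob( NutchConfiguration.create( ) );
    job.setJobName( "Importer " + manifestPath );
    job.set( Nutch.SEGMENT_NAME_KEY, segmentPath.getName( ) );

    // Old API: job.setInputPath( manifestPath );
    FileInputFormat.addInputPath( job, manifestPath );
    job.setInputFormat( TextInputFormat.class );

    job.setMapperClass( org.archive.nutchwax.Importer.class );

    // Old API: job.setOutputPath( segmentPath );
    FileOutputFormat.setOutputPath( job, segmentPath );
    job.setOutputFormat( FetcherOutputFormat.class );

    job.setOutputKeyClass( Text.class );
    job.setOutputValueClass( NutchWritable.class );

    return job;
  }
}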
From: <bi...@us...> - 2010-01-12 22:17:50
Revision: 2943
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2943&view=rev
Author:   binzino
Date:     2010-01-12 22:17:44 +0000 (Tue, 12 Jan 2010)

Log Message:
-----------
WAX-69. Comment out code that writes crawl_data.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2010-01-11 21:46:57 UTC (rev 2942)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2010-01-12 22:17:44 UTC (rev 2943)
@@ -467,7 +467,14 @@
 
     try
       {
-        output.collect( key, new NutchWritable( datum ) );
+        // Some weird problem with Hadoop 0.19.x - when the crawl_data
+        // is merged during the reduce step, the classloader cannot
+        // find the org.apache.nutch.protocol.ProtocolStatus class.
+        //
+        // We avoid the whole issue by omitting the crawl_data all
+        // together, which we don't use anyways.
+        //
+        // output.collect( key, new NutchWritable( datum ) );
 
         if ( jobConf.getBoolean( "nutchwax.import.store.content", false ) )
          {
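A sketch of what the map output step reduces to after this change: the CrawlDatum is no longer collected (sidestepping the Hadoop 0.19.x classloader problem with ProtocolStatus during the reduce-side merge of crawl_data), raw content is only emitted when nutchwax.import.store.content is true, and the parse records still flow through. The method shape and the parse-emitting branch are assumptions for illustration; only the omitted collect and the store.content guard come directly from the diff:

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.protocol.Content;

public class OutputSketch
{
  public static void emit( OutputCollector<Text, NutchWritable> output,
                           JobConf jobConf,
                           Text key,
                           CrawlDatum datum,
                           Content content,
                           Parse parse )
    throws IOException
  {
    // Deliberately omitted to dodge the classloader problem described above:
    // output.collect( key, new NutchWritable( datum ) );

    // Raw content is only stored when explicitly requested.
    if ( jobConf.getBoolean( "nutchwax.import.store.content", false ) )
      {
        output.collect( key, new NutchWritable( content ) );
      }

    // Parse text/data (what the indexer actually consumes) are still emitted.
    if ( parse != null )
      {
        output.collect( key, new NutchWritable( new ParseImpl( parse.getText( ), parse.getData( ) ) ) );
      }
  }
}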