From: <bi...@us...> - 2010-10-27 07:07:57
|
Revision: 3310 http://archive-access.svn.sourceforge.net/archive-access/?rev=3310&view=rev Author: binzino Date: 2010-10-27 07:07:51 +0000 (Wed, 27 Oct 2010) Log Message: ----------- Disabled the BoilerPipe stuff for now. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-10-27 07:07:20 UTC (rev 3309) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-10-27 07:07:51 UTC (rev 3310) @@ -338,6 +338,16 @@ contentMetadata.set( NutchWax.CONTENT_LENGTH_KEY, String.valueOf( meta.getLength() ) ); contentMetadata.set( NutchWax.HTTP_RESPONSE_KEY, String.valueOf( record.getStatusCode() ) ); + // BoilerPipe! + /* + if ( "text/html".equals( meta.getMimetype() ) ) + { + String boiledHTML = de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE.getText( new org.xml.sax.InputSource( new java.io.ByteArrayInputStream( bytes ) ) ); + + contentMetadata.set( "boiledHTML", boiledHTML ); + } + */ + Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() ); output( output, new Text( key ), content ); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-10-28 00:53:17
|
Revision: 3317 http://archive-access.svn.sourceforge.net/archive-access/?rev=3317&view=rev Author: binzino Date: 2010-10-28 00:53:11 +0000 (Thu, 28 Oct 2010) Log Message: ----------- Added digest to metadata. Added use of auto-content-type-detection. Disabled BoilerPipe. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-10-28 00:52:10 UTC (rev 3316) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-10-28 00:53:11 UTC (rev 3317) @@ -334,13 +334,29 @@ contentMetadata.set( NutchWax.COLLECTION_KEY, collectionName ); contentMetadata.set( NutchWax.DATE_KEY, meta.getDate() ); contentMetadata.set( NutchWax.DIGEST_KEY, meta.getDigest() ); - contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); contentMetadata.set( NutchWax.CONTENT_LENGTH_KEY, String.valueOf( meta.getLength() ) ); contentMetadata.set( NutchWax.HTTP_RESPONSE_KEY, String.valueOf( record.getStatusCode() ) ); + String type = (meta.getMimetype( ) == null ? "" : meta.getMimetype( )).split( "[;]" )[0].toLowerCase().trim(); + + // If the Content-Type from the HTTP response is "text/plain", + // set it to null to trigger full auto-detection via Tika. + if ( "text/plain".equals( type ) ) + { + type = null; + } + + Content content = new Content( url, url, bytes, type, contentMetadata, getConf() ); + + if ( LOG.isDebugEnabled() ) LOG.debug( "Auto-detect content-type: " + type + " " + content.getContentType( ) + " " + url ); + + // Store both the original and auto-detected content types. + contentMetadata.set( NutchWax.ORIGINAL_TYPE_KEY, meta.getMimetype( ) ); + contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, content.getContentType( ) ); + // BoilerPipe! /* - if ( "text/html".equals( meta.getMimetype() ) ) + if ( "text/html".equals( content.getContentType( ) ) ) { String boiledHTML = de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE.getText( new org.xml.sax.InputSource( new java.io.ByteArrayInputStream( bytes ) ) ); @@ -348,8 +364,6 @@ } */ - Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() ); - output( output, new Text( key ), content ); return true; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-11-12 23:54:43
|
Revision: 3333 http://archive-access.svn.sourceforge.net/archive-access/?rev=3333&view=rev Author: binzino Date: 2010-11-12 23:54:34 +0000 (Fri, 12 Nov 2010) Log Message: ----------- Added config property to enable/disable BoilerPipe. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-11-11 05:49:07 UTC (rev 3332) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-11-12 23:54:34 UTC (rev 3333) @@ -354,15 +354,16 @@ contentMetadata.set( NutchWax.ORIGINAL_TYPE_KEY, meta.getMimetype( ) ); contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, content.getContentType( ) ); - // BoilerPipe! - /* - if ( "text/html".equals( content.getContentType( ) ) ) + if ( "text/html" .equals( content.getContentType( ) ) || + "application/xhtml+xml".equals( content.getContentType( ) ) || + "application/xhtml" .equals( content.getContentType( ) ) ) { - String boiledHTML = de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE.getText( new org.xml.sax.InputSource( new java.io.ByteArrayInputStream( bytes ) ) ); - - contentMetadata.set( "boiledHTML", boiledHTML ); + if ( jobConf.getBoolean( "nutchwax.import.boilerpipe", false ) ) + { + // BoilerPipe! + contentMetadata.set( "boiled", de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE.getText( new org.xml.sax.InputSource( new java.io.ByteArrayInputStream( bytes ) ) ) ); + } } - */ output( output, new Text( key ), content ); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-11-15 20:32:40
|
Revision: 3334 http://archive-access.svn.sourceforge.net/archive-access/?rev=3334&view=rev Author: binzino Date: 2010-11-15 20:32:34 +0000 (Mon, 15 Nov 2010) Log Message: ----------- Added nutchwax.import.content.limit.html property. If html file is larger than this value, it is skipped. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-11-12 23:54:34 UTC (rev 3333) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-11-15 20:32:34 UTC (rev 3334) @@ -247,7 +247,7 @@ { ARCRecordMetaData meta = record.getMetaData(); - if ( LOG.isDebugEnabled() ) LOG.debug( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" ); + if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" ); if ( ! this.httpStatusCodeFilter.isAllowed( record.getStatusCode( ) ) ) { @@ -266,7 +266,8 @@ // We use record.available() rather than meta.getLength() // because the latter includes the size of the HTTP header, // which we just skipped. - byte[] bytes = readBytes( record, record.available( ) ); + long length = record.available(); + byte[] bytes = readBytes( record, length ); // If there is no digest, then we assume we're reading an // ARCRecord not a WARCRecord. In that case, we close the @@ -358,6 +359,13 @@ "application/xhtml+xml".equals( content.getContentType( ) ) || "application/xhtml" .equals( content.getContentType( ) ) ) { + long size = jobConf.getLong( "nutchwax.import.content.limit.html", -1 ); + if ( size > 0 && size < length ) + { + LOG.warn( "HTML file size exceeds threshold [" + size + "], skipping: " + meta.getUrl( ) + " [" + length + "]" ); + return false; + } + if ( jobConf.getBoolean( "nutchwax.import.boilerpipe", false ) ) { // BoilerPipe! @@ -365,7 +373,7 @@ } } - output( output, new Text( key ), content ); + output( output, new Text( key ), content ); return true; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-11-16 23:16:42
|
Revision: 3335 http://archive-access.svn.sourceforge.net/archive-access/?rev=3335&view=rev Author: binzino Date: 2010-11-16 23:16:35 +0000 (Tue, 16 Nov 2010) Log Message: ----------- Added config controls to trim input docs for text/plain and text/html to avoid performance problems with large (50+MB) input docs. Also added try/catch around boilerpipe. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-11-15 20:32:34 UTC (rev 3334) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-11-16 23:16:35 UTC (rev 3335) @@ -16,11 +16,10 @@ */ package org.archive.nutchwax; -import java.io.IOException; -import java.net.MalformedURLException; +import java.io.*; +import java.net.*; import java.util.Map.Entry; -import java.util.List; -import java.util.ArrayList; +import java.util.*; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -359,20 +358,43 @@ "application/xhtml+xml".equals( content.getContentType( ) ) || "application/xhtml" .equals( content.getContentType( ) ) ) { - long size = jobConf.getLong( "nutchwax.import.content.limit.html", -1 ); + int size = jobConf.getInt( "nutchwax.import.content.limit.html", -1 ); if ( size > 0 && size < length ) { - LOG.warn( "HTML file size exceeds threshold [" + size + "], skipping: " + meta.getUrl( ) + " [" + length + "]" ); - return false; + LOG.warn( "HTML file size exceeds threshold [" + size + "]: " + meta.getUrl( ) + " [" + length + "]" ); + + bytes = Arrays.copyOf( bytes, size ); + + content.setContent( bytes ); } - if ( jobConf.getBoolean( "nutchwax.import.boilerpipe", false ) ) + try { - // BoilerPipe! - contentMetadata.set( "boiled", de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE.getText( new org.xml.sax.InputSource( new java.io.ByteArrayInputStream( bytes ) ) ) ); + if ( jobConf.getBoolean( "nutchwax.import.boilerpipe", false ) ) + { + // BoilerPipe! + contentMetadata.set( "boiled", de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE.getText( new org.xml.sax.InputSource( new java.io.ByteArrayInputStream( bytes ) ) ) ); + } } + catch ( Exception e ) + { + LOG.warn( "Error boilerpiping: " + meta.getUrl( ) ); + } } + if ( "text/plain".equals( content.getContentType( ) ) ) + { + int size = jobConf.getInt( "nutchwax.import.content.limit.text", -1 ); + if ( size > 0 && size < length ) + { + LOG.warn( "Text file size exceeds threshold [" + size + "]: " + meta.getUrl( ) + " [" + length + "]" ); + + bytes = Arrays.copyOf( bytes, size ); + + content.setContent( bytes ); + } + } + output( output, new Text( key ), content ); return true; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2011-06-13 18:20:28
|
Revision: 3463 http://archive-access.svn.sourceforge.net/archive-access/?rev=3463&view=rev Author: binzino Date: 2011-06-13 18:20:21 +0000 (Mon, 13 Jun 2011) Log Message: ----------- Added custom reducer to allow for multiple values for the same key. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2011-06-10 02:24:23 UTC (rev 3462) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2011-06-13 18:20:21 UTC (rev 3463) @@ -36,6 +36,7 @@ import org.apache.hadoop.mapred.JobStatus; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.RunningJob; import org.apache.hadoop.mapred.TextInputFormat; @@ -94,7 +95,7 @@ * to the importing of ARC files. I've noted those details with * comments prefaced with "?:". */ -public class Importer extends Configured implements Tool, Mapper<WritableComparable, Writable, Text, NutchWritable> +public class Importer extends Configured implements Tool, Mapper<WritableComparable, Writable, Text, NutchWritable>, Reducer<WritableComparable,Writable,WritableComparable,Writable> { public static final Log LOG = LogFactory.getLog( Importer.class ); @@ -154,6 +155,23 @@ } + public void reduce( WritableComparable key, + Iterator<Writable> values, + OutputCollector<WritableComparable, Writable> output, + Reporter reporter + ) + throws IOException + { + + while ( values.hasNext( ) ) + { + if ( LOG.isInfoEnabled() ) LOG.info( "Reduce: key = " + key.toString() ); + + output.collect( key, values.next( ) ); + } + } + + /** * <p>Runs the Map job to import records from an archive file into a * Nutch segment.</p> @@ -588,7 +606,11 @@ if ( LOG.isWarnEnabled() ) LOG.warn( "Couldn't pass score, url = " + key, e ); } - output.collect( key, new NutchWritable( new ParseImpl( new ParseText( parse.getText() ), parse.getData(), parse.isCanonical() ) ) ); + String parsedText = parse.getText(); + + // TODO: Limit size of parsedText. + + output.collect( key, new NutchWritable( new ParseImpl( new ParseText( parsedText ), parse.getData(), parse.isCanonical() ) ) ); } } } @@ -719,7 +741,8 @@ FileInputFormat.addInputPath( job, manifestPath ); job.setInputFormat( TextInputFormat.class ); - job.setMapperClass( Importer.class ); + job.setMapperClass ( Importer.class ); + job.setReducerClass( Importer.class ); //job.setOutputPath ( segmentPath ); FileOutputFormat.setOutputPath( job, segmentPath ); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2011-08-29 22:05:19
|
Revision: 3508 http://archive-access.svn.sourceforge.net/archive-access/?rev=3508&view=rev Author: binzino Date: 2011-08-29 22:05:13 +0000 (Mon, 29 Aug 2011) Log Message: ----------- Removed some obsolete code that was commented-out long ago. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2011-08-18 22:52:50 UTC (rev 3507) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2011-08-29 22:05:13 UTC (rev 3508) @@ -344,8 +344,6 @@ // We store both the normal URL and the URL+digest key for // later retrieval by the indexing plugin(s). contentMetadata.set( NutchWax.URL_KEY, url ); - //contentMetadata.set( NutchWax.ORIG_KEY, key ); - contentMetadata.set( NutchWax.FILENAME_KEY, meta.getArcFile().getName() ); contentMetadata.set( NutchWax.FILEOFFSET_KEY, String.valueOf( record.getHeader().getOffset( ) ) ); @@ -737,14 +735,12 @@ job.setJobName( "Importer " + manifestPath ); job.set( Nutch.SEGMENT_NAME_KEY, segmentPath.getName() ); - //job.setInputPath ( manifestPath); FileInputFormat.addInputPath( job, manifestPath ); job.setInputFormat( TextInputFormat.class ); job.setMapperClass ( Importer.class ); job.setReducerClass( Importer.class ); - //job.setOutputPath ( segmentPath ); FileOutputFormat.setOutputPath( job, segmentPath ); job.setOutputFormat ( FetcherOutputFormat.class ); job.setOutputKeyClass ( Text.class ); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2012-01-19 21:44:49
|
Revision: 3597 http://archive-access.svn.sourceforge.net/archive-access/?rev=3597&view=rev Author: binzino Date: 2012-01-19 21:44:43 +0000 (Thu, 19 Jan 2012) Log Message: ----------- Fix splitting of line to allow for collection names with spaces in them. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2012-01-12 22:22:43 UTC (rev 3596) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2012-01-19 21:44:43 UTC (rev 3597) @@ -200,7 +200,7 @@ } // Each line of the manifest is "<url> <collection>" where <collection> is optional - String[] parts = line.split( "\\s+" ); + String[] parts = line.split( "\\s+", 2 ); arcUrl = parts[0]; if ( parts.length > 1 ) This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2012-01-24 18:28:20
|
Revision: 3602 http://archive-access.svn.sourceforge.net/archive-access/?rev=3602&view=rev Author: binzino Date: 2012-01-24 18:28:14 +0000 (Tue, 24 Jan 2012) Log Message: ----------- Change command-line option handling to allow for both manifests and naming (w)arc files directly. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2012-01-24 18:27:24 UTC (rev 3601) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2012-01-24 18:28:14 UTC (rev 3602) @@ -25,6 +25,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; @@ -693,62 +694,98 @@ return -1; } - JobConf job = new NutchJob( getConf() ); - - // Check for "-e <exclusions>" option. - int pos = 0; - if ( args[0].equals( "-e" ) ) + boolean isManifest = false; + boolean skipExisting = false; + String exclusions = null; + int i = 0; + for ( ; i < (args.length-1) ; i++ ) { - if ( args.length < 2 ) + if ( args[i].equals( "-e" ) ) { - System.out.println( "ERROR: Missing filename for option \"-e\"\n" ); - usage( ); - return -1; + i+=1; + if ( i >= (args.length-1) ) + { + usage(); + return 1; + } + + exclusions = args[i]; } + else if ( args[i].equals( "-m" ) ) + { + isManifest = true; + } + else if ( args[i].equals( "-s" ) ) + { + skipExisting = true; + } + else + { + break ; + } + } - job.set( "nutchwax.urlfilter.wayback.exclusions", args[1] ); - - pos = 2; - } - - if ( args.length - pos < 1 ) + if ( i > (args.length-2) ) { - System.out.println( "ERROR: Missing manifest file.\n" ); - usage( ); - return -1; + usage(); + return 1; } - Path manifestPath = new Path( args[pos++] ); + FileSystem fs = FileSystem.get( getConf() ); - Path segmentPath; - if ( args.length - pos < 1 ) + Path outputDir = new Path( args[args.length-1] ); + + if ( ! fs.getFileStatus( outputDir ).isDir() ) { - segmentPath = new Path( "segments", org.apache.nutch.crawl.Generator.generateSegmentName( ) ); + System.err.println( "ERROR: Output directory is not a directory: " + outputDir ); + return 2; } - else - { - segmentPath = new Path( args[pos] ); - } - + try { - job.setJobName( "Importer " + manifestPath ); - job.set( Nutch.SEGMENT_NAME_KEY, segmentPath.getName() ); + for ( ; i < (args.length-1) ; i++ ) + { + JobConf job = new NutchJob( getConf() ); + + if ( exclusions != null ) job.set( "nutchwax.urlfilter.wayback.exclusions", exclusions ); - FileInputFormat.addInputPath( job, manifestPath ); - job.setInputFormat( TextInputFormat.class ); + Path inputPath = new Path( args[i] ); + Path outputPath = new Path( outputDir, inputPath.getName() ); + + if ( fs.exists( outputPath ) ) + { + System.err.println( "ERROR: Output path already exists: " + outputPath ); + if ( ! skipExisting ) + { + return 3; + } + } + + job.setJobName( "Importer " + inputPath ); + job.set( Nutch.SEGMENT_NAME_KEY, outputPath.getName() ); - job.setMapperClass ( Importer.class ); - job.setReducerClass( Importer.class ); + FileInputFormat.setInputPaths( job, inputPath ); + if ( isManifest ) + { + job.setInputFormat( TextInputFormat.class ); + } + else + { + job.setInputFormat( FilenameInputFormat.class ); + } - FileOutputFormat.setOutputPath( job, segmentPath ); - job.setOutputFormat ( FetcherOutputFormat.class ); - job.setOutputKeyClass ( Text.class ); - job.setOutputValueClass( NutchWritable.class ); + job.setMapperClass ( Importer.class ); + job.setReducerClass( Importer.class ); + + FileOutputFormat.setOutputPath( job, outputPath ); + job.setOutputFormat ( FetcherOutputFormat.class ); + job.setOutputKeyClass ( Text.class ); + job.setOutputValueClass( NutchWritable.class ); + + RunningJob rj = JobClient.runJob( job ); + } - RunningJob rj = JobClient.runJob( job ); - - return rj.isSuccessful( ) ? 0 : 1; + return 0; } catch ( Exception e ) { @@ -765,13 +802,11 @@ public void usage( ) { String usage = - "Usage: Importer [opts] <manifest> [<segment>]\n" + "Usage: Importer [opts] <input> <output_dir>]\n" + "Options:\n" + " -e filename Exclusions file, over-rides configuration property.\n" + + " -m Inputs are manifest files\n" + "\n" - + "If <segment> not specified, a pathname will be automatically generated\n" - + "based on current time in sub-directory 'segments', which is created if\n" - + "necessary. This is to mirror the behavior of other Nutch actions.\n" ; System.out.println( usage ); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2012-01-24 18:58:18
|
Revision: 3603 http://archive-access.svn.sourceforge.net/archive-access/?rev=3603&view=rev Author: binzino Date: 2012-01-24 18:58:08 +0000 (Tue, 24 Jan 2012) Log Message: ----------- Employ -s option. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2012-01-24 18:28:14 UTC (rev 3602) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2012-01-24 18:58:08 UTC (rev 3603) @@ -737,7 +737,7 @@ if ( ! fs.getFileStatus( outputDir ).isDir() ) { - System.err.println( "ERROR: Output directory is not a directory: " + outputDir ); + LOG.fatal( "Output is not a directory: " + outputDir ); return 2; } @@ -754,11 +754,13 @@ if ( fs.exists( outputPath ) ) { - System.err.println( "ERROR: Output path already exists: " + outputPath ); - if ( ! skipExisting ) + if ( skipExisting ) { - return 3; + LOG.warn( "Skipping output path which already exists: " + outputPath ); + continue ; } + LOG.fatal( "Output path already exists: " + outputPath ); + return 3; } job.setJobName( "Importer " + inputPath ); @@ -790,9 +792,7 @@ catch ( Exception e ) { LOG.fatal( "Importer: ", e ); - System.out.println( "Fatal error: " + e ); - e.printStackTrace( System.out ); - return -1; + return 4; } } @@ -805,7 +805,9 @@ "Usage: Importer [opts] <input> <output_dir>]\n" + "Options:\n" + " -e filename Exclusions file, over-rides configuration property.\n" - + " -m Inputs are manifest files\n" + + " -m Inputs are manifest files\n" + + " -s Skip inputs where corresponding output directory exists.\n" + + " Without -s, processing reports error and stops.\n" + "\n" ; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |