From: <bi...@us...> - 2012-01-24 18:28:20
|
Revision: 3602 http://archive-access.svn.sourceforge.net/archive-access/?rev=3602&view=rev Author: binzino Date: 2012-01-24 18:28:14 +0000 (Tue, 24 Jan 2012) Log Message: ----------- Change command-line option handling to allow for both manifests and naming (w)arc files directly. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2012-01-24 18:27:24 UTC (rev 3601) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2012-01-24 18:28:14 UTC (rev 3602) @@ -25,6 +25,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; @@ -693,62 +694,98 @@ return -1; } - JobConf job = new NutchJob( getConf() ); - - // Check for "-e <exclusions>" option. - int pos = 0; - if ( args[0].equals( "-e" ) ) + boolean isManifest = false; + boolean skipExisting = false; + String exclusions = null; + int i = 0; + for ( ; i < (args.length-1) ; i++ ) { - if ( args.length < 2 ) + if ( args[i].equals( "-e" ) ) { - System.out.println( "ERROR: Missing filename for option \"-e\"\n" ); - usage( ); - return -1; + i+=1; + if ( i >= (args.length-1) ) + { + usage(); + return 1; + } + + exclusions = args[i]; } + else if ( args[i].equals( "-m" ) ) + { + isManifest = true; + } + else if ( args[i].equals( "-s" ) ) + { + skipExisting = true; + } + else + { + break ; + } + } - job.set( "nutchwax.urlfilter.wayback.exclusions", args[1] ); - - pos = 2; - } - - if ( args.length - pos < 1 ) + if ( i > (args.length-2) ) { - System.out.println( "ERROR: Missing manifest file.\n" ); - usage( ); - return -1; + usage(); + return 1; } - Path manifestPath = new Path( args[pos++] ); + FileSystem fs = FileSystem.get( getConf() ); - Path segmentPath; - if ( args.length - pos < 1 ) + Path outputDir = new Path( args[args.length-1] ); + + if ( ! fs.getFileStatus( outputDir ).isDir() ) { - segmentPath = new Path( "segments", org.apache.nutch.crawl.Generator.generateSegmentName( ) ); + System.err.println( "ERROR: Output directory is not a directory: " + outputDir ); + return 2; } - else - { - segmentPath = new Path( args[pos] ); - } - + try { - job.setJobName( "Importer " + manifestPath ); - job.set( Nutch.SEGMENT_NAME_KEY, segmentPath.getName() ); + for ( ; i < (args.length-1) ; i++ ) + { + JobConf job = new NutchJob( getConf() ); + + if ( exclusions != null ) job.set( "nutchwax.urlfilter.wayback.exclusions", exclusions ); - FileInputFormat.addInputPath( job, manifestPath ); - job.setInputFormat( TextInputFormat.class ); + Path inputPath = new Path( args[i] ); + Path outputPath = new Path( outputDir, inputPath.getName() ); + + if ( fs.exists( outputPath ) ) + { + System.err.println( "ERROR: Output path already exists: " + outputPath ); + if ( ! skipExisting ) + { + return 3; + } + } + + job.setJobName( "Importer " + inputPath ); + job.set( Nutch.SEGMENT_NAME_KEY, outputPath.getName() ); - job.setMapperClass ( Importer.class ); - job.setReducerClass( Importer.class ); + FileInputFormat.setInputPaths( job, inputPath ); + if ( isManifest ) + { + job.setInputFormat( TextInputFormat.class ); + } + else + { + job.setInputFormat( FilenameInputFormat.class ); + } - FileOutputFormat.setOutputPath( job, segmentPath ); - job.setOutputFormat ( FetcherOutputFormat.class ); - job.setOutputKeyClass ( Text.class ); - job.setOutputValueClass( NutchWritable.class ); + job.setMapperClass ( Importer.class ); + job.setReducerClass( Importer.class ); + + FileOutputFormat.setOutputPath( job, outputPath ); + job.setOutputFormat ( FetcherOutputFormat.class ); + job.setOutputKeyClass ( Text.class ); + job.setOutputValueClass( NutchWritable.class ); + + RunningJob rj = JobClient.runJob( job ); + } - RunningJob rj = JobClient.runJob( job ); - - return rj.isSuccessful( ) ? 0 : 1; + return 0; } catch ( Exception e ) { @@ -765,13 +802,11 @@ public void usage( ) { String usage = - "Usage: Importer [opts] <manifest> [<segment>]\n" + "Usage: Importer [opts] <input> <output_dir>]\n" + "Options:\n" + " -e filename Exclusions file, over-rides configuration property.\n" + + " -m Inputs are manifest files\n" + "\n" - + "If <segment> not specified, a pathname will be automatically generated\n" - + "based on current time in sub-directory 'segments', which is created if\n" - + "necessary. This is to mirror the behavior of other Nutch actions.\n" ; System.out.println( usage ); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |