archive-access-cvs Mailing List for Web Archive Access Utilities (Page 4)

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 3602
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3602&view=rev
Author:   binzino
Date:     2012-01-24 18:28:14 +0000 (Tue, 24 Jan 2012)
Log Message:
-----------
Change command-line option handling to accept inputs either as manifest files (-m) or as (w)arc files named directly on the command line.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================

--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java	2012-01-24 18:27:24 UTC (rev 3601)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java	2012-01-24 18:28:14 UTC (rev 3602)
@@ -25,6 +25,7 @@
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
@@ -693,62 +694,98 @@
         return -1;
       }
 
-    JobConf job = new NutchJob( getConf() );
-
-    // Check for "-e <exclusions>" option.
-    int pos = 0;
-    if ( args[0].equals( "-e" ) )
+    boolean isManifest   = false;
+    boolean skipExisting = false;
+    String  exclusions   = null;
+    int i = 0;
+    for ( ; i < (args.length-1) ; i++ )
       {
-        if ( args.length < 2 )
+        if ( args[i].equals( "-e" ) )
           {
-            System.out.println( "ERROR: Missing filename for option \"-e\"\n" );
-            usage( );
-            return -1;
+            i+=1;
+            if ( i >= (args.length-1) )
+              {
+                usage();
+                return 1;
+              }
+            
+            exclusions = args[i];
           }
+        else if ( args[i].equals( "-m" ) )
+          {
+            isManifest = true;
+          }
+        else if ( args[i].equals( "-s" ) )
+          {
+            skipExisting = true;
+          }
+        else
+          {
+            break ;
+          }
+      }
 
-        job.set( "nutchwax.urlfilter.wayback.exclusions", args[1] );
-
-        pos = 2;
-      }
-    
-    if ( args.length - pos < 1 )
+    if ( i > (args.length-2) )
       {
-        System.out.println( "ERROR: Missing manifest file.\n" );
-        usage( );
-        return -1;
+        usage();
+        return 1;
       }
 
-    Path manifestPath = new Path( args[pos++] );
+    FileSystem fs = FileSystem.get( getConf() );
 
-    Path segmentPath;
-    if ( args.length - pos < 1 )
+    Path outputDir = new Path( args[args.length-1] );
+
+    if ( ! fs.getFileStatus( outputDir ).isDir() )
       {
-        segmentPath = new Path( "segments", org.apache.nutch.crawl.Generator.generateSegmentName( ) );
+        System.err.println( "ERROR: Output directory is not a directory: " + outputDir );
+        return 2;
       }
-    else
-      {
-        segmentPath = new Path( args[pos] );
-      }
-
+    
     try
       {
-        job.setJobName( "Importer " + manifestPath );
-        job.set( Nutch.SEGMENT_NAME_KEY, segmentPath.getName() );
+        for ( ; i < (args.length-1) ; i++ )
+          {
+            JobConf job = new NutchJob( getConf() );
+            
+            if ( exclusions != null ) job.set( "nutchwax.urlfilter.wayback.exclusions", exclusions );
 
-        FileInputFormat.addInputPath( job, manifestPath );
-        job.setInputFormat( TextInputFormat.class );
+            Path inputPath  = new Path( args[i] );
+            Path outputPath = new Path( outputDir, inputPath.getName() );
+            
+            if ( fs.exists( outputPath ) )
+              {
+                System.err.println( "ERROR: Output path already exists: " + outputPath );
+                if ( ! skipExisting )
+                  {
+                    return 3;
+                  }
+              }
+            
+            job.setJobName( "Importer " + inputPath );
+            job.set( Nutch.SEGMENT_NAME_KEY, outputPath.getName() );
 
-        job.setMapperClass ( Importer.class   );
-        job.setReducerClass( Importer.class   );
+            FileInputFormat.setInputPaths( job, inputPath );
+            if ( isManifest )
+              {
+                job.setInputFormat( TextInputFormat.class );
+              }
+            else
+              {
+                job.setInputFormat( FilenameInputFormat.class );
+              }
 
-        FileOutputFormat.setOutputPath( job, segmentPath );
-        job.setOutputFormat    ( FetcherOutputFormat.class );
-        job.setOutputKeyClass  ( Text.class                );
-        job.setOutputValueClass( NutchWritable.class       );
+            job.setMapperClass ( Importer.class );
+            job.setReducerClass( Importer.class );
+            
+            FileOutputFormat.setOutputPath( job, outputPath );
+            job.setOutputFormat    ( FetcherOutputFormat.class );
+            job.setOutputKeyClass  ( Text.class                );
+            job.setOutputValueClass( NutchWritable.class       );
+            
+            RunningJob rj = JobClient.runJob( job );
+          }
 
-        RunningJob rj = JobClient.runJob( job );
-
-        return rj.isSuccessful( ) ? 0 : 1;
+        return 0;
       }
     catch ( Exception e )
       {
@@ -765,13 +802,11 @@
   public void usage( )
   {
     String usage = 
-        "Usage: Importer [opts] <manifest> [<segment>]\n" 
+        "Usage: Importer [opts] <input> <output_dir>]\n" 
       + "Options:\n" 
       + "  -e filename     Exclusions file, over-rides configuration property.\n" 
+      + "  -m              Inputs are manifest files\n"      
       + "\n" 
-      + "If <segment> not specified, a pathname will be automatically generated\n" 
-      + "based on current time in sub-directory 'segments', which is created if\n" 
-      + "necessary.  This is to mirror the behavior of other Nutch actions.\n"
       ;
     
     System.out.println( usage );

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





2005	Jan	Feb	Mar	Apr	May	Jun	Jul (1)	Aug (10)	Sep (36)	Oct (339)	Nov (103)	Dec (152)
2006	Jan (141)	Feb (102)	Mar (125)	Apr (203)	May (57)	Jun (30)	Jul (139)	Aug (46)	Sep (64)	Oct (105)	Nov (34)	Dec (162)
2007	Jan (81)	Feb (57)	Mar (141)	Apr (72)	May (9)	Jun (1)	Jul (144)	Aug (88)	Sep (40)	Oct (43)	Nov (34)	Dec (20)
2008	Jan (44)	Feb (45)	Mar (16)	Apr (36)	May (8)	Jun (77)	Jul (177)	Aug (66)	Sep (8)	Oct (33)	Nov (13)	Dec (37)
2009	Jan (2)	Feb (5)	Mar (8)	Apr	May (36)	Jun (19)	Jul (46)	Aug (8)	Sep (1)	Oct (66)	Nov (61)	Dec (10)
2010	Jan (13)	Feb (16)	Mar (38)	Apr (76)	May (47)	Jun (32)	Jul (35)	Aug (45)	Sep (20)	Oct (61)	Nov (24)	Dec (16)
2011	Jan (22)	Feb (34)	Mar (11)	Apr (8)	May (24)	Jun (23)	Jul (11)	Aug (42)	Sep (81)	Oct (48)	Nov (21)	Dec (20)
2012	Jan (30)	Feb (25)	Mar (4)	Apr (6)	May (1)	Jun (5)	Jul (5)	Aug (8)	Sep (6)	Oct (6)	Nov	Dec

archive-access-cvs Mailing List for Web Archive Access Utilities (Page 4)

archive-access-cvs — CVS commits