From: <bi...@us...> - 2008-07-25 20:33:50
Revision: 2495
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2495&view=rev
Author:   binzino
Date:     2008-07-25 20:33:59 +0000 (Fri, 25 Jul 2008)

Log Message:
-----------
Changed "none" to "unknown" for HTTPStatusCodeFilter to avoid confusion over
whether "none" means "nothing is allowed at all" vs. "no code for this record".

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2008-07-25 20:24:53 UTC (rev 2494)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2008-07-25 20:33:59 UTC (rev 2495)
@@ -715,10 +715,10 @@
     {
       Range range = new Range( );
 
-      // Special handling for "none" where an ARCRecord doesn't have
+      // Special handling for "unknown" where an ARCRecord doesn't have
       // an HTTP status code.  The ARCRecord.getStatusCode() returns
       // -1 in that case, so we make a range for it.
-      if ( value.toLowerCase( ).equals( "none" ) )
+      if ( value.toLowerCase( ).equals( "unknown" ) )
         {
           range.lower = -1;
           range.upper = -1;
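To illustrate the behavior this change enables, here is a small, self-contained sketch of a range-based status-code filter that maps the configured value "unknown" to the -1 sentinel returned by ARCRecord.getStatusCode(). The Range, parse, and contains names and the "200-299" range syntax are illustrative assumptions for this example, not the actual HTTPStatusCodeFilter API:

public class StatusCodeRangeSketch
{
  static class Range
  {
    int lower;
    int upper;

    boolean contains( int code )
    {
      return code >= lower && code <= upper;
    }
  }

  static Range parse( String value )
  {
    Range range = new Range( );

    // "unknown" covers records that have no HTTP status code at all;
    // ARCRecord.getStatusCode() reports those as -1.
    if ( value.toLowerCase( ).equals( "unknown" ) )
      {
        range.lower = -1;
        range.upper = -1;
        return range;
      }

    // Otherwise accept a single code ("200") or a range ("200-299").
    String[] parts = value.split( "-" );
    range.lower = Integer.parseInt( parts[0].trim( ) );
    range.upper = parts.length > 1 ? Integer.parseInt( parts[1].trim( ) ) : range.lower;

    return range;
  }

  public static void main( String[] args )
  {
    System.out.println( parse( "unknown" ).contains( -1 ) );   // true
    System.out.println( parse( "200-299" ).contains( 301 ) );  // false
  }
}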
From: <bi...@us...> - 2008-08-28 21:54:45
Revision: 2586
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2586&view=rev
Author:   binzino
Date:     2008-08-28 21:54:54 +0000 (Thu, 28 Aug 2008)

Log Message:
-----------
Nutch updated to Hadoop 0.17 and the Mapper interface added generics. So, this
class was updated accordingly.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2008-08-28 21:44:41 UTC (rev 2585)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2008-08-28 21:54:54 UTC (rev 2586)
@@ -97,7 +97,7 @@
  * to the importing of ARC files.  I've noted those details with
  * comments prefaced with "?:".
  */
-public class Importer extends Configured implements Tool, Mapper
+public class Importer extends Configured implements Tool, Mapper<WritableComparable, Writable, Text, NutchWritable>
 {
 
   public static final Log LOG = LogFactory.getLog( Importer.class );
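A minimal sketch of what the generics buy: once Mapper is parameterized, the map() signature is fixed by the type arguments. The class below is a stripped-down stand-in for Importer (the real class also extends Configured and implements Tool, and does the actual ARC processing in map()); only the Hadoop/Nutch types shown in the diff are assumed:

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.nutch.crawl.NutchWritable;

public class GenericMapperSketch
  implements Mapper<WritableComparable, Writable, Text, NutchWritable>
{
  public void configure( JobConf job ) { }

  public void close( ) throws IOException { }

  // The type parameters above pin these argument types; with the old raw
  // Mapper interface the method was declared against untyped Writables.
  public void map( WritableComparable key, Writable value,
                   OutputCollector<Text, NutchWritable> output, Reporter reporter )
    throws IOException
  {
    // Per-record work goes here; the real Importer reads a manifest line
    // and emits one or more <Text, NutchWritable> pairs per ARC record.
  }
}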
From: <bi...@us...> - 2008-09-22 18:40:19
Revision: 2592
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2592&view=rev
Author:   binzino
Date:     2008-09-22 18:40:08 +0000 (Mon, 22 Sep 2008)

Log Message:
-----------
WAX-21: Allow for blank lines and comment lines in the manifest file. Comment
lines start with '#'. Extra whitespace at the start/end of all lines is also
eliminated.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2008-09-22 18:07:59 UTC (rev 2591)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2008-09-22 18:40:08 UTC (rev 2592)
@@ -19,7 +19,6 @@
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Map.Entry;
-import java.util.Iterator;
 import java.util.List;
 import java.util.ArrayList;
 
@@ -37,8 +36,6 @@
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.TextInputFormat;
-import org.apache.hadoop.mapred.TextOutputFormat;
-import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -59,17 +56,14 @@
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ProtocolStatus;
 import org.apache.nutch.scoring.ScoringFilters;
-import org.apache.nutch.util.LogUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.StringUtil;
 
 import org.archive.io.ArchiveReader;
 import org.archive.io.ArchiveReaderFactory;
-import org.archive.io.ArchiveRecordHeader;
 import org.archive.io.arc.ARCRecord;
 import org.archive.io.arc.ARCRecordMetaData;
-import org.archive.io.warc.WARCConstants;
 
 
 /**
@@ -175,14 +169,22 @@
     String arcUrl      = "";
     String collection  = "";
     String segmentName = getConf().get( Nutch.SEGMENT_NAME_KEY );
-    
+
+    // First, ignore blank manifest lines, and those that are comments.
+    String line = value.toString().trim( );
+    if ( line.length() == 0 || line.charAt( 0 ) == '#' )
+      {
+        // Ignore it.
+        return ;
+      }
+
     // Each line of the manifest is "<url> <collection>" where <collection> is optional
-    String[] line = value.toString().split( "\\s+" );
-    arcUrl = line[0];
+    String[] parts = line.split( "\\s+" );
+    arcUrl = parts[0];
 
-    if ( line.length > 1 )
+    if ( parts.length > 1 )
       {
-        collection = line[1];
+        collection = parts[1];
       }
 
     if ( LOG.isInfoEnabled() ) LOG.info( "Importing ARC: " + arcUrl );
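The same parsing rules, pulled out into a standalone sketch for clarity: trim each line, skip blanks and '#' comments, then split the remainder into the ARC URL and an optional collection name. The class name and the file-reading harness are hypothetical; the real code receives each manifest line as the map() value:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

public class ManifestParseSketch
{
  public static void main( String[] args ) throws IOException
  {
    BufferedReader in = new BufferedReader( new FileReader( args[0] ) );
    String raw;
    while ( ( raw = in.readLine( ) ) != null )
      {
        String line = raw.trim( );

        // Skip blank lines and comment lines starting with '#'.
        if ( line.length( ) == 0 || line.charAt( 0 ) == '#' ) continue;

        // "<url> <collection>" where <collection> is optional.
        String[] parts = line.split( "\\s+" );
        String arcUrl     = parts[0];
        String collection = parts.length > 1 ? parts[1] : "";

        System.out.println( "ARC: " + arcUrl + "  collection: " + collection );
      }
    in.close( );
  }
}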
From: <bi...@us...> - 2008-12-10 04:58:28
Revision: 2655
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2655&view=rev
Author:   binzino
Date:     2008-12-10 04:58:24 +0000 (Wed, 10 Dec 2008)

Log Message:
-----------
Change output of messages from stderr to stdout. Add code to check the return
status of the job and pass it back to the command line via the System.exit() call.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2008-12-09 01:58:04 UTC (rev 2654)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2008-12-10 04:58:24 UTC (rev 2655)
@@ -36,6 +36,8 @@
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.TextInputFormat;
+import org.apache.hadoop.mapred.RunningJob;
+import org.apache.hadoop.mapred.JobStatus;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -601,7 +603,7 @@
         {
           if ( args.length < 2 )
             {
-              System.err.println( "ERROR: Missing filename for option \"-e\"\n" );
+              System.out.println( "ERROR: Missing filename for option \"-e\"\n" );
               usage( );
               return -1;
             }
@@ -613,7 +615,7 @@
 
     if ( args.length - pos < 1 )
       {
-        System.err.println( "ERROR: Missing manifest file.\n" );
+        System.out.println( "ERROR: Missing manifest file.\n" );
         usage( );
         return -1;
       }
@@ -645,17 +647,20 @@
       job.setOutputKeyClass  ( Text.class );
       job.setOutputValueClass( NutchWritable.class );
 
-      JobClient.runJob( job );
+      RunningJob rj = JobClient.runJob( job );
+
+      // Emit job id and status.
+      System.out.println( "JOB_STATUS: " + rj.getID( ) + ": " + (rj.isSuccessful( ) ? "SUCCESS" : "FAIL" ) );
+
+      return rj.isSuccessful( ) ? 0 : 1;
     }
     catch ( Exception e )
     {
       LOG.fatal( "Importer: ", e );
-      System.err.println( "Fatal error: " + e );
-      e.printStackTrace( System.err );
+      System.out.println( "Fatal error: " + e );
+      e.printStackTrace( System.out );
       return -1;
     }
-
-    return 0;
   }
 
   /**
@@ -673,7 +678,7 @@
       + "necessary.  This is to mirror the behavior of other Nutch actions.\n"
       ;
 
-    System.err.println( usage );
+    System.out.println( usage );
   }
 
   /**
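For the exit-status path, a sketch of the conventional Tool wiring that would carry run()'s new return value (0 when RunningJob.isSuccessful() is true, non-zero otherwise) back to the shell. Importer's actual main() is not part of this diff, so the harness below is an assumption about how it is launched:

import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;

public class ImporterExitSketch
{
  public static void main( String[] args ) throws Exception
  {
    // run() returns 0 on SUCCESS and a non-zero value on FAIL;
    // System.exit() hands that value back to the calling shell or script.
    int result = ToolRunner.run( NutchConfiguration.create( ),
                                 new org.archive.nutchwax.Importer( ),
                                 args );

    System.exit( result );
  }
}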
From: <bi...@us...> - 2009-05-05 20:24:28
Revision: 2699
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2699&view=rev
Author:   binzino
Date:     2009-05-05 20:24:22 +0000 (Tue, 05 May 2009)

Log Message:
-----------
WAX-42. Add option to continue/abort importing after read error on archive file.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2009-05-05 20:20:45 UTC (rev 2698)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2009-05-05 20:24:22 UTC (rev 2699)
@@ -210,6 +210,15 @@
             reporter.progress();
           }
       }
+    catch ( Exception e )
+      {
+        LOG.warn( "Error processing archive file: " + arcUrl, e );
+
+        if ( jobConf.getBoolean( "nutchwax.import.abortOnArchiveReadError", false ) )
+          {
+            throw new IOException( e );
+          }
+      }
     finally
       {
        r.close();
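A sketch of the continue-vs-abort decision in isolation: a read error is logged and the rest of the manifest keeps being processed unless nutchwax.import.abortOnArchiveReadError is set to true, in which case the exception is rethrown and fails the map task. The class name and the processRecords() helper are stand-ins for the real per-record loop over the ArchiveReader:

import java.io.IOException;

import org.apache.hadoop.mapred.JobConf;

public class AbortOnErrorSketch
{
  public static void importArchive( JobConf jobConf, String arcUrl ) throws IOException
  {
    try
      {
        processRecords( arcUrl );
      }
    catch ( Exception e )
      {
        System.out.println( "Error processing archive file: " + arcUrl + " : " + e );

        // Default is false: keep going and keep whatever was imported so far.
        if ( jobConf.getBoolean( "nutchwax.import.abortOnArchiveReadError", false ) )
          {
            throw new IOException( e );
          }
      }
  }

  private static void processRecords( String arcUrl ) throws Exception
  {
    // Stand-in for iterating the archive's records and importing each one.
  }
}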
From: <bi...@us...> - 2009-10-27 22:46:39
Revision: 2840
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2840&view=rev
Author:   binzino
Date:     2009-10-27 22:46:25 +0000 (Tue, 27 Oct 2009)

Log Message:
-----------
Minor edits to conform to Nutch 1.0 API.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2009-10-27 21:38:28 UTC (rev 2839)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2009-10-27 22:46:25 UTC (rev 2840)
@@ -30,14 +30,16 @@
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.JobStatus;
 import org.apache.hadoop.mapred.Mapper;
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.RunningJob;
 import org.apache.hadoop.mapred.TextInputFormat;
-import org.apache.hadoop.mapred.RunningJob;
-import org.apache.hadoop.mapred.JobStatus;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -46,8 +48,8 @@
 import org.apache.nutch.fetcher.FetcherOutputFormat;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilterException;
 import org.apache.nutch.net.URLFilters;
-import org.apache.nutch.net.URLFilterException;
 import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseImpl;
@@ -323,7 +325,7 @@
       // We store both the normal URL and the URL+digest key for
       // later retrieval by the indexing plugin(s).
       contentMetadata.set( NutchWax.URL_KEY, url );
-      contentMetadata.set( NutchWax.ORIG_KEY, key );
+      //contentMetadata.set( NutchWax.ORIG_KEY, key );
 
       contentMetadata.set( NutchWax.FILENAME_KEY,   meta.getArcFile().getName() );
       contentMetadata.set( NutchWax.FILEOFFSET_KEY, String.valueOf( record.getHeader().getOffset( ) ) );
@@ -650,12 +652,14 @@
       job.setJobName( "Importer " + manifestPath );
       job.set( Nutch.SEGMENT_NAME_KEY, segmentPath.getName() );
 
-      job.setInputPath  ( manifestPath);
+      //job.setInputPath  ( manifestPath);
+      FileInputFormat.addInputPath( job, manifestPath );
       job.setInputFormat( TextInputFormat.class );
 
       job.setMapperClass( Importer.class );
 
-      job.setOutputPath   ( segmentPath );
+      //job.setOutputPath   ( segmentPath );
+      FileOutputFormat.setOutputPath( job, segmentPath );
       job.setOutputFormat ( FetcherOutputFormat.class );
       job.setOutputKeyClass  ( Text.class );
       job.setOutputValueClass( NutchWritable.class );
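The job setup after the API change, as a self-contained sketch: the old JobConf.setInputPath()/setOutputPath() calls are replaced by the static helpers on FileInputFormat/FileOutputFormat, with everything else unchanged. The configure() wrapper and its parameters are illustrative; the classes and calls themselves are the ones the diff uses:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.fetcher.FetcherOutputFormat;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

public class JobSetupSketch
{
  public static JobConf configure( Path manifestPath, Path segmentPath )
  {
    JobConf job = new NutchJob( NutchConfiguration.create( ) );
    job.setJobName( "Importer " + manifestPath );
    job.set( Nutch.SEGMENT_NAME_KEY, segmentPath.getName( ) );

    // Old API: job.setInputPath( manifestPath );
    FileInputFormat.addInputPath( job, manifestPath );
    job.setInputFormat( TextInputFormat.class );

    job.setMapperClass( org.archive.nutchwax.Importer.class );

    // Old API: job.setOutputPath( segmentPath );
    FileOutputFormat.setOutputPath( job, segmentPath );
    job.setOutputFormat( FetcherOutputFormat.class );

    job.setOutputKeyClass( Text.class );
    job.setOutputValueClass( NutchWritable.class );

    return job;
  }
}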
From: <bi...@us...> - 2010-01-12 22:17:50
Revision: 2943
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2943&view=rev
Author:   binzino
Date:     2010-01-12 22:17:44 +0000 (Tue, 12 Jan 2010)

Log Message:
-----------
WAX-69. Comment out code that writes crawl_data.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2010-01-11 21:46:57 UTC (rev 2942)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java    2010-01-12 22:17:44 UTC (rev 2943)
@@ -467,7 +467,14 @@
 
     try
       {
-        output.collect( key, new NutchWritable( datum ) );
+        // Some weird problem with Hadoop 0.19.x - when the crawl_data
+        // is merged during the reduce step, the classloader cannot
+        // find the org.apache.nutch.protocol.ProtocolStatus class.
+        //
+        // We avoid the whole issue by omitting the crawl_data all
+        // together, which we don't use anyways.
+        //
+        // output.collect( key, new NutchWritable( datum ) );
 
         if ( jobConf.getBoolean( "nutchwax.import.store.content", false ) )
          {
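A sketch of what the map output step reduces to after this change: the CrawlDatum is no longer collected (sidestepping the Hadoop 0.19.x classloader problem with ProtocolStatus during the reduce-side merge of crawl_data), raw content is only emitted when nutchwax.import.store.content is true, and the parse records still flow through. The method shape and the parse-emitting branch are assumptions for illustration; only the omitted collect and the store.content guard come directly from the diff:

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.protocol.Content;

public class OutputSketch
{
  public static void emit( OutputCollector<Text, NutchWritable> output,
                           JobConf jobConf,
                           Text key,
                           CrawlDatum datum,
                           Content content,
                           Parse parse )
    throws IOException
  {
    // Deliberately omitted to dodge the classloader problem described above:
    // output.collect( key, new NutchWritable( datum ) );

    // Raw content is only stored when explicitly requested.
    if ( jobConf.getBoolean( "nutchwax.import.store.content", false ) )
      {
        output.collect( key, new NutchWritable( content ) );
      }

    // Parse text/data (what the indexer actually consumes) are still emitted.
    if ( parse != null )
      {
        output.collect( key, new NutchWritable( new ParseImpl( parse.getText( ), parse.getData( ) ) ) );
      }
  }
}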