[Archive-access-cvs] SF.net SVN: archive-access: [1642] trunk/archive-access/projects/nutchwax

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 1642
          http://archive-access.svn.sourceforge.net/archive-access/?rev=1642&view=rev
Author:   stack-sf
Date:     2007-03-26 18:22:31 -0700 (Mon, 26 Mar 2007)

Log Message:
-----------

M    nutchwax/src/java/org/archive/access/nutch/mapred/TaskLogMapRunner.java
    Use LineRecordReader to read task log lines; for now each line is only logged (TODO: Finish analysis).
D    nutchwax/src/java/org/archive/access/nutch/jobs/ImportLogsReporter.java
A    nutchwax/src/java/org/archive/access/nutch/jobs/LogsReporter.java
    Renamed as LogsReporter.
M    nutchwax/.classpath
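    Removed classpath entries for missing parse plugins (parse-msexcel, parse-mspowerpoint, parse-msword, parse-oo); added a source path to the hadoop-0.12.2-core.jar entry.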

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/.classpath
    trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/mapred/TaskLogMapRunner.java

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/LogsReporter.java

Removed Paths:
-------------
    trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportLogsReporter.java

Modified: trunk/archive-access/projects/nutchwax/.classpath
===================================================================

--- trunk/archive-access/projects/nutchwax/.classpath	2007-03-27 01:20:31 UTC (rev 1641)
+++ trunk/archive-access/projects/nutchwax/.classpath	2007-03-27 01:22:31 UTC (rev 1642)
@@ -71,10 +71,6 @@
 	<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-ext/parse-ext.jar"/>
 	<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-html/parse-html.jar"/>
 	<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-js/parse-js.jar"/>
-	<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-msexcel/parse-msexcel.jar"/>
-	<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-mspowerpoint/parse-mspowerpoint.jar"/>
-	<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-msword/parse-msword.jar"/>
-	<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-oo/parse-oo.jar"/>
 	<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-pdf/parse-pdf.jar"/>
 	<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-rss/parse-rss.jar"/>
 	<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-swf/parse-swf.jar"/>
@@ -183,7 +179,7 @@
 	<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-lang-2.1.jar"/>
 	<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-1.0.4.jar"/>
 	<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-api-1.0.4.jar"/>
-	<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/hadoop-0.12.2-core.jar" />
+	<classpathentry sourcepath="/home/stack/checkouts/hadoop/src/java" kind="lib" path="nutchwax-thirdparty/nutch/lib/hadoop-0.12.2-core.jar"/>
 	<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jakarta-oro-2.0.7.jar"/>
 	<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jets3t-0.5.0.jar"/>
 	<classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-5.1.4.jar"/>

Deleted: trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportLogsReporter.java
===================================================================
--- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportLogsReporter.java	2007-03-27 01:20:31 UTC (rev 1641)
+++ trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportLogsReporter.java	2007-03-27 01:22:31 UTC (rev 1642)
@@ -1,119 +0,0 @@
-package org.archive.access.nutch.jobs;
-
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.mapred.JobClient;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.Mapper;
-import org.apache.hadoop.mapred.OutputCollector;
-import org.apache.hadoop.mapred.Reporter;
-import org.apache.hadoop.mapred.TextInputFormat;
-import org.apache.hadoop.mapred.TextOutputFormat;
-import org.apache.hadoop.util.ToolBase;
-import org.archive.access.nutch.NutchwaxConfiguration;
-
-
-/**
- * Makes a report based off passed log inputs.
- * Inputs are logs of a NutchWAX import.  Report lists counts of errors,
- * problematic ARCs, etc.
- * @author stack
- * @see org.apache.hadoop.tool.Logalyzer
- */
-public class ImportLogsReporter extends ToolBase implements Mapper {
-    // private final Log LOG = LogFactory.getLog(this.getClass().getName());
-    // private long lineCount = 0;
-    
-    /**
-     * Parse first part of the log line.  Here are some sample log lines:
-     * <pre>
-     * 2007-01-24 15:33:24,954 WARN  regex.RegexURLNormalizer - can't find rules for scope 'outlink', using default
-     * 2007-01-24 15:33:24,570 INFO  nutch.ImportArcs - adding http://www.bbswitzerland.ch/images/sbb1.gif http://www.bbswitzerland.ch:80/images/sbb1.gif 1105 image/gif
-     * </pre>
-     * Group one of the below regex is WARN or INFO in above. Group two the
-     * name of the logger (nutch.importArcs in the above).  Group three is all
-     * the rest of the log string. 
-     */
-    private static final Pattern PREFIX =
-        Pattern.compile("\\S+\\s+\\S+\\s+(\\S+)\\s+(\\S+)\\s+-\\s+(.*)");
-
-    public void map(WritableComparable key, Writable value,
-            OutputCollector output, Reporter reporter)
-    throws IOException {
-        // lineCount++;
-        Matcher m = PREFIX.matcher(value.toString());
-        if (!m.matches() || isWARN(m.group(1)) || isERROR(m.group(1))) {
-            output.collect(key, new Text(value.toString()));
-        }
-    }
-    
-    protected boolean isWARN(final String level) {
-        return level.equals("WARN");
-    }
-    
-    protected boolean isERROR(final String level) {
-        return level.equals("ERROR");
-    }
-
-    public void configure(JobConf job) {
-        
-        // TODO Auto-generated method stub
-    }
-
-    public void close() throws IOException {
-        // System.out.println(lineCount);
-    }
-    
-    protected void report(final String input, final String output)
-    throws IOException {
-        Path inputDir = new Path(input);
-        if (!FileSystem.get(getConf()).exists(inputDir)) {
-            throw new FileNotFoundException(input);
-        }
-        Path outputDir = new Path(output);
-        
-        JobConf jc = new JobConf(getConf());
-        jc.setJobName("Import logs reporter");
-        
-        jc.setInputPath(inputDir);
-        jc.setInputFormat(TextInputFormat.class);
-        
-        jc.setMapperClass(this.getClass());
-        
-        jc.setOutputPath(outputDir);
-        jc.setOutputFormat(TextOutputFormat.class);
-        jc.setOutputKeyClass(LongWritable.class);
-        jc.setOutputValueClass(Text.class);
-        
-        // Write a single file
-        jc.setNumReduceTasks(1);
-        
-        JobClient.runJob(jc);
-    }
-    
-    public int run(String[] args) throws Exception {
-        final String usage = "Usage: ImportLogsReporter <input> <output>\n" +
-            " input   Directory of input files listing log file URIs\n" +
-            " output  Where we write resulting report.";
-        if (args.length != 2) {
-            System.err.print(usage);
-            return -1;
-        }
-        report(args[0], args[1]);
-        return 0;
-    }
-    
-    public static void main(String[] args) throws Exception {
-        System.exit(new ImportLogsReporter().
-            doMain(NutchwaxConfiguration.getConfiguration(), args));
-    }
-}

Added: trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/LogsReporter.java
===================================================================
--- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/LogsReporter.java	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/LogsReporter.java	2007-03-27 01:22:31 UTC (rev 1642)
@@ -0,0 +1,123 @@
+package org.archive.access.nutch.jobs;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.TextInputFormat;
+import org.apache.hadoop.mapred.TextOutputFormat;
+import org.apache.hadoop.util.ToolBase;
+import org.archive.access.nutch.NutchwaxConfiguration;
+import org.archive.access.nutch.mapred.TaskLogMapRunner;
+import org.archive.mapred.ARCMapRunner;
+
+
+/**
+ * Makes a report based off passed log inputs.
+ * Inputs are logs of a NutchWAX import.  Report lists counts of errors,
+ * problematic ARCs, etc.
+ * @author stack
+ * @see org.apache.hadoop.tool.Logalyzer
+ */
+public class LogsReporter extends ToolBase implements Mapper {
+    // private final Log LOG = LogFactory.getLog(this.getClass().getName());
+    // private long lineCount = 0;
+    
+    /**
+     * Parse first part of the log line.  Here are some sample log lines:
+     * <pre>
+     * 2007-01-24 15:33:24,954 WARN  regex.RegexURLNormalizer - can't find rules for scope 'outlink', using default
+     * 2007-01-24 15:33:24,570 INFO  nutch.ImportArcs - adding http://www.bbswitzerland.ch/images/sbb1.gif http://www.bbswitzerland.ch:80/images/sbb1.gif 1105 image/gif
+     * </pre>
+     * Group one of the below regex is WARN or INFO in above. Group two the
+     * name of the logger (nutch.importArcs in the above).  Group three is all
+     * the rest of the log string. 
+     */
+    private static final Pattern PREFIX =
+        Pattern.compile("\\S+\\s+\\S+\\s+(\\S+)\\s+(\\S+)\\s+-\\s+(.*)");
+
+    public void map(WritableComparable key, Writable value,
+            OutputCollector output, Reporter reporter)
+    throws IOException {
+        // lineCount++;
+        Matcher m = PREFIX.matcher(value.toString());
+        if (!m.matches() || isWARN(m.group(1)) || isERROR(m.group(1))) {
+            output.collect(key, new Text(value.toString()));
+        }
+    }
+    
+    protected boolean isWARN(final String level) {
+        return level.equals("WARN");
+    }
+    
+    protected boolean isERROR(final String level) {
+        return level.equals("ERROR");
+    }
+
+    public void configure(JobConf job) {
+        
+        // TODO Auto-generated method stub
+    }
+
+    public void close() throws IOException {
+        // System.out.println(lineCount);
+    }
+    
+    protected void report(final String input, final String output)
+    throws IOException {
+        Path inputDir = new Path(input);
+        if (!FileSystem.get(getConf()).exists(inputDir)) {
+            throw new FileNotFoundException(input);
+        }
+        Path outputDir = new Path(output);
+        
+        JobConf jc = new JobConf(getConf());
+        jc.setJobName("Import logs reporter");
+        
+        jc.setMapRunnerClass(TaskLogMapRunner.class);
+        
+        jc.setInputPath(inputDir);
+        jc.setInputFormat(TextInputFormat.class);
+        
+        jc.setMapperClass(this.getClass());
+        
+        jc.setOutputPath(outputDir);
+        jc.setOutputFormat(TextOutputFormat.class);
+        jc.setOutputKeyClass(LongWritable.class);
+        jc.setOutputValueClass(Text.class);
+        
+        // Write a single file
+        jc.setNumReduceTasks(1);
+        
+        JobClient.runJob(jc);
+    }
+    
+    public int run(String[] args) throws Exception {
+        final String usage = "Usage: ImportLogsReporter <input> <output>\n" +
+            " input   Directory of input files listing log file URIs\n" +
+            " output  Where we write resulting report.";
+        if (args.length != 2) {
+            System.err.print(usage);
+            return -1;
+        }
+        report(args[0], args[1]);
+        return 0;
+    }
+    
+    public static void main(String[] args) throws Exception {
+        System.exit(new LogsReporter().
+            doMain(NutchwaxConfiguration.getConfiguration(), args));
+    }
+}

Modified: trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/mapred/TaskLogMapRunner.java
===================================================================
--- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/mapred/TaskLogMapRunner.java	2007-03-27 01:20:31 UTC (rev 1641)
+++ trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/mapred/TaskLogMapRunner.java	2007-03-27 01:22:31 UTC (rev 1642)
@@ -27,9 +27,12 @@
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.LineRecordReader;
 import org.apache.hadoop.mapred.MapRunnable;
 import org.apache.hadoop.mapred.Mapper;
 import org.apache.hadoop.mapred.OutputCollector;
@@ -38,7 +41,7 @@
 import org.apache.hadoop.util.ReflectionUtils;
 
 /**
- * Calls a map for every line in a hadoop userlog directory.
+ * Calls a map for every line of a hadoop userlog directory.
  * @author stack
  */
 public class TaskLogMapRunner implements MapRunnable {
@@ -70,6 +73,12 @@
     throws IOException {
         URL u = new URL(logurl);
         TaskLogReader tlr = new TaskLogReader(u);
-        // TODO: Need to upgrade hadoop so can get new LineRecordReader.
+        LineRecordReader lrr = new LineRecordReader(tlr.getInputStream(), 0,
+            tlr.getTotalLogSize());
+        LongWritable lineKey = new LongWritable();
+        Text lineValue = new Text();
+        while(lrr.next(lineKey, lineValue)) {
+            LOG.info(lineKey.toString() + " " + lineValue.toString());
+        }
     }
 }
\ No newline at end of file


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.