From: <sta...@us...> - 2007-03-27 01:22:35
|
Revision: 1642 http://archive-access.svn.sourceforge.net/archive-access/?rev=1642&view=rev Author: stack-sf Date: 2007-03-26 18:22:31 -0700 (Mon, 26 Mar 2007) Log Message: ----------- M nutchwax/src/java/org/archive/access/nutch/mapred/TaskLogMapRunner.java Use LineRecordReader (TODO: Finish analysis). D nutchwax/src/java/org/archive/access/nutch/jobs/ImportLogsReporter.java A nutchwax/src/java/org/archive/access/nutch/jobs/LogsReporter.java Renamed as LogsReporter. M nutchwax/.classpath Removed missing plugins. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/.classpath trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/mapred/TaskLogMapRunner.java Added Paths: ----------- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/LogsReporter.java Removed Paths: ------------- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportLogsReporter.java Modified: trunk/archive-access/projects/nutchwax/.classpath =================================================================== --- trunk/archive-access/projects/nutchwax/.classpath 2007-03-27 01:20:31 UTC (rev 1641) +++ trunk/archive-access/projects/nutchwax/.classpath 2007-03-27 01:22:31 UTC (rev 1642) @@ -71,10 +71,6 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-ext/parse-ext.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-html/parse-html.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-js/parse-js.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-msexcel/parse-msexcel.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-mspowerpoint/parse-mspowerpoint.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-msword/parse-msword.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-oo/parse-oo.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-pdf/parse-pdf.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-rss/parse-rss.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/build/parse-swf/parse-swf.jar"/> @@ -183,7 +179,7 @@ <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-lang-2.1.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-1.0.4.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/commons-logging-api-1.0.4.jar"/> - <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/hadoop-0.12.2-core.jar" /> + <classpathentry sourcepath="/home/stack/checkouts/hadoop/src/java" kind="lib" path="nutchwax-thirdparty/nutch/lib/hadoop-0.12.2-core.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jakarta-oro-2.0.7.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jets3t-0.5.0.jar"/> <classpathentry kind="lib" path="nutchwax-thirdparty/nutch/lib/jetty-5.1.4.jar"/> Deleted: trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportLogsReporter.java =================================================================== --- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportLogsReporter.java 2007-03-27 01:20:31 UTC (rev 1641) +++ trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/ImportLogsReporter.java 2007-03-27 01:22:31 UTC (rev 1642) @@ -1,119 +0,0 @@ -package org.archive.access.nutch.jobs; - -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.WritableComparable; -import org.apache.hadoop.mapred.JobClient; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.Mapper; -import org.apache.hadoop.mapred.OutputCollector; -import org.apache.hadoop.mapred.Reporter; -import org.apache.hadoop.mapred.TextInputFormat; -import org.apache.hadoop.mapred.TextOutputFormat; -import org.apache.hadoop.util.ToolBase; -import org.archive.access.nutch.NutchwaxConfiguration; - - -/** - * Makes a report based off passed log inputs. - * Inputs are logs of a NutchWAX import. Report lists counts of errors, - * problematic ARCs, etc. - * @author stack - * @see org.apache.hadoop.tool.Logalyzer - */ -public class ImportLogsReporter extends ToolBase implements Mapper { - // private final Log LOG = LogFactory.getLog(this.getClass().getName()); - // private long lineCount = 0; - - /** - * Parse first part of the log line. Here are some sample log lines: - * <pre> - * 2007-01-24 15:33:24,954 WARN regex.RegexURLNormalizer - can't find rules for scope 'outlink', using default - * 2007-01-24 15:33:24,570 INFO nutch.ImportArcs - adding http://www.bbswitzerland.ch/images/sbb1.gif http://www.bbswitzerland.ch:80/images/sbb1.gif 1105 image/gif - * </pre> - * Group one of the below regex is WARN or INFO in above. Group two the - * name of the logger (nutch.importArcs in the above). Group three is all - * the rest of the log string. - */ - private static final Pattern PREFIX = - Pattern.compile("\\S+\\s+\\S+\\s+(\\S+)\\s+(\\S+)\\s+-\\s+(.*)"); - - public void map(WritableComparable key, Writable value, - OutputCollector output, Reporter reporter) - throws IOException { - // lineCount++; - Matcher m = PREFIX.matcher(value.toString()); - if (!m.matches() || isWARN(m.group(1)) || isERROR(m.group(1))) { - output.collect(key, new Text(value.toString())); - } - } - - protected boolean isWARN(final String level) { - return level.equals("WARN"); - } - - protected boolean isERROR(final String level) { - return level.equals("ERROR"); - } - - public void configure(JobConf job) { - - // TODO Auto-generated method stub - } - - public void close() throws IOException { - // System.out.println(lineCount); - } - - protected void report(final String input, final String output) - throws IOException { - Path inputDir = new Path(input); - if (!FileSystem.get(getConf()).exists(inputDir)) { - throw new FileNotFoundException(input); - } - Path outputDir = new Path(output); - - JobConf jc = new JobConf(getConf()); - jc.setJobName("Import logs reporter"); - - jc.setInputPath(inputDir); - jc.setInputFormat(TextInputFormat.class); - - jc.setMapperClass(this.getClass()); - - jc.setOutputPath(outputDir); - jc.setOutputFormat(TextOutputFormat.class); - jc.setOutputKeyClass(LongWritable.class); - jc.setOutputValueClass(Text.class); - - // Write a single file - jc.setNumReduceTasks(1); - - JobClient.runJob(jc); - } - - public int run(String[] args) throws Exception { - final String usage = "Usage: ImportLogsReporter <input> <output>\n" + - " input Directory of input files listing log file URIs\n" + - " output Where we write resulting report."; - if (args.length != 2) { - System.err.print(usage); - return -1; - } - report(args[0], args[1]); - return 0; - } - - public static void main(String[] args) throws Exception { - System.exit(new ImportLogsReporter(). - doMain(NutchwaxConfiguration.getConfiguration(), args)); - } -} Added: trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/LogsReporter.java =================================================================== --- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/LogsReporter.java (rev 0) +++ trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/jobs/LogsReporter.java 2007-03-27 01:22:31 UTC (rev 1642) @@ -0,0 +1,123 @@ +package org.archive.access.nutch.jobs; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; +import org.apache.hadoop.util.ToolBase; +import org.archive.access.nutch.NutchwaxConfiguration; +import org.archive.access.nutch.mapred.TaskLogMapRunner; +import org.archive.mapred.ARCMapRunner; + + +/** + * Makes a report based off passed log inputs. + * Inputs are logs of a NutchWAX import. Report lists counts of errors, + * problematic ARCs, etc. + * @author stack + * @see org.apache.hadoop.tool.Logalyzer + */ +public class LogsReporter extends ToolBase implements Mapper { + // private final Log LOG = LogFactory.getLog(this.getClass().getName()); + // private long lineCount = 0; + + /** + * Parse first part of the log line. Here are some sample log lines: + * <pre> + * 2007-01-24 15:33:24,954 WARN regex.RegexURLNormalizer - can't find rules for scope 'outlink', using default + * 2007-01-24 15:33:24,570 INFO nutch.ImportArcs - adding http://www.bbswitzerland.ch/images/sbb1.gif http://www.bbswitzerland.ch:80/images/sbb1.gif 1105 image/gif + * </pre> + * Group one of the below regex is WARN or INFO in above. Group two the + * name of the logger (nutch.importArcs in the above). Group three is all + * the rest of the log string. + */ + private static final Pattern PREFIX = + Pattern.compile("\\S+\\s+\\S+\\s+(\\S+)\\s+(\\S+)\\s+-\\s+(.*)"); + + public void map(WritableComparable key, Writable value, + OutputCollector output, Reporter reporter) + throws IOException { + // lineCount++; + Matcher m = PREFIX.matcher(value.toString()); + if (!m.matches() || isWARN(m.group(1)) || isERROR(m.group(1))) { + output.collect(key, new Text(value.toString())); + } + } + + protected boolean isWARN(final String level) { + return level.equals("WARN"); + } + + protected boolean isERROR(final String level) { + return level.equals("ERROR"); + } + + public void configure(JobConf job) { + + // TODO Auto-generated method stub + } + + public void close() throws IOException { + // System.out.println(lineCount); + } + + protected void report(final String input, final String output) + throws IOException { + Path inputDir = new Path(input); + if (!FileSystem.get(getConf()).exists(inputDir)) { + throw new FileNotFoundException(input); + } + Path outputDir = new Path(output); + + JobConf jc = new JobConf(getConf()); + jc.setJobName("Import logs reporter"); + + jc.setMapRunnerClass(TaskLogMapRunner.class); + + jc.setInputPath(inputDir); + jc.setInputFormat(TextInputFormat.class); + + jc.setMapperClass(this.getClass()); + + jc.setOutputPath(outputDir); + jc.setOutputFormat(TextOutputFormat.class); + jc.setOutputKeyClass(LongWritable.class); + jc.setOutputValueClass(Text.class); + + // Write a single file + jc.setNumReduceTasks(1); + + JobClient.runJob(jc); + } + + public int run(String[] args) throws Exception { + final String usage = "Usage: ImportLogsReporter <input> <output>\n" + + " input Directory of input files listing log file URIs\n" + + " output Where we write resulting report."; + if (args.length != 2) { + System.err.print(usage); + return -1; + } + report(args[0], args[1]); + return 0; + } + + public static void main(String[] args) throws Exception { + System.exit(new LogsReporter(). + doMain(NutchwaxConfiguration.getConfiguration(), args)); + } +} Modified: trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/mapred/TaskLogMapRunner.java =================================================================== --- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/mapred/TaskLogMapRunner.java 2007-03-27 01:20:31 UTC (rev 1641) +++ trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/mapred/TaskLogMapRunner.java 2007-03-27 01:22:31 UTC (rev 1642) @@ -27,9 +27,12 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.LineRecordReader; import org.apache.hadoop.mapred.MapRunnable; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; @@ -38,7 +41,7 @@ import org.apache.hadoop.util.ReflectionUtils; /** - * Calls a map for every line in a hadoop userlog directory. + * Calls a map for every line of a hadoop userlog directory. * @author stack */ public class TaskLogMapRunner implements MapRunnable { @@ -70,6 +73,12 @@ throws IOException { URL u = new URL(logurl); TaskLogReader tlr = new TaskLogReader(u); - // TODO: Need to upgrade hadoop so can get new LineRecordReader. + LineRecordReader lrr = new LineRecordReader(tlr.getInputStream(), 0, + tlr.getTotalLogSize()); + LongWritable lineKey = new LongWritable(); + Text lineValue = new Text(); + while(lrr.next(lineKey, lineValue)) { + LOG.info(lineKey.toString() + " " + lineValue.toString()); + } } } \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |