From: <sta...@us...> - 2007-03-26 21:08:11
Revision: 1635
          http://archive-access.svn.sourceforge.net/archive-access/?rev=1635&view=rev
Author:   stack-sf
Date:     2007-03-26 14:08:10 -0700 (Mon, 26 Mar 2007)

Log Message:
-----------
M nutchwax/src/java/overview.html
M nutchwax/README.txt
Update hadoop and nutch versions.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/README.txt
    trunk/archive-access/projects/nutchwax/src/java/overview.html

Modified: trunk/archive-access/projects/nutchwax/README.txt
===================================================================
--- trunk/archive-access/projects/nutchwax/README.txt	2007-03-26 20:57:26 UTC (rev 1634)
+++ trunk/archive-access/projects/nutchwax/README.txt	2007-03-26 21:08:10 UTC (rev 1635)
@@ -6,144 +6,4 @@
 
 HADOOP VERSION AND PATCHES
 
-Hadoop release version is 0.9.2. 0.9.1 fails when you try to use local
-filesystem. Turning of speculative reduce seems to fix things but the
-hadoop 0.9.1. has it set to true in bundled hadoop-default.xml. See
-HADOOP-827.
-
-
-NUTCH VERSION AND PATCHES
-
-Below are patches made against the nutch thats built into nutchwax.
-You may be able to do without them. Apply if you you are OOME'ing
-because too many links found building crawldb or merging segments.
-
-# This patch fixes SegmentMerger OOME'ing. It puts upper bound on links
-# we add to a page (Saw OOME in 1.8 Gig heap trying to add 500k links
-# to single key. Also includes part of NUTCH-333.
-Index: src/java/org/apache/nutch/segment/SegmentMerger.java
-===================================================================
---- src/java/org/apache/nutch/segment/SegmentMerger.java (revision 486923)
-+++ src/java/org/apache/nutch/segment/SegmentMerger.java (working copy)
-@@ -41,6 +41,7 @@
- import org.apache.nutch.parse.ParseText;
- import org.apache.nutch.protocol.Content;
- import org.apache.nutch.util.NutchConfiguration;
-+import org.apache.nutch.util.NutchJob;
- 
- /**
-  * This tool takes several segments and merges their data together. Only the
-@@ -98,6 +99,7 @@
-   private URLFilters filters = null;
-   private long sliceSize = -1;
-   private long curCount = 0;
-+  private int maxLinked;
- 
-   /**
-    * Wraps inputs in an {@link MetaWrapper}, to permit merging different
-@@ -257,6 +259,7 @@
-     if (sliceSize > 0) {
-       sliceSize = sliceSize / conf.getNumReduceTasks();
-     }
-+    this.maxLinked = conf.getInt("db.linked.max", 1000);
-   }
- 
-   private Text newKey = new Text();
-@@ -301,7 +304,7 @@
-     String lastPDname = null;
-     String lastPTname = null;
-     TreeMap linked = new TreeMap();
--    while (values.hasNext()) {
-+    VALUES_LOOP: while (values.hasNext()) {
-       MetaWrapper wrapper = (MetaWrapper)values.next();
-       Object o = wrapper.get();
-       String spString = wrapper.getMeta(SEGMENT_PART_KEY);
-@@ -355,6 +358,17 @@
-           linked.put(sp.segmentName, segLinked);
-         }
-         segLinked.add(val);
-+        if (segLinked.size() <= this.maxLinked) {
-+          segLinked.add(val);
-+        } else {
-+          LOG.info("SKIPPING SEGLINKED LARGE " +
-+            segLinked.size() + ", * linked size " + linked.size() +
-+            ", name " + sp.segmentName + ", key " + key);
-+          break VALUES_LOOP;
-+        }
-+        if ((segLinked.size() % 1000) == 0) {
-+          LOG.info("SEGLINKED SIZE " + segLinked.size() + ", key " + key);
-+        }
-       } else {
-         throw new IOException("Cannot determine segment part: " + sp.partName);
-       }
-@@ -460,7 +474,7 @@
-     if (LOG.isInfoEnabled()) {
-       LOG.info("Merging " + segs.length + " segments to " + out + "/" + segmentName);
-     }
--    JobConf job = new JobConf(getConf());
-+    JobConf job = new NutchJob(getConf());
-     job.setJobName("mergesegs " + out + "/" + segmentName);
-     job.setBoolean("segment.merger.filter", filter);
-     job.setLong("segment.merger.slice", slice);
-Index: src/java/org/apache/nutch/segment/SegmentReader.java
-===================================================================
---- src/java/org/apache/nutch/segment/SegmentReader.java (revision 486923)
-+++ src/java/org/apache/nutch/segment/SegmentReader.java (working copy)
-@@ -36,6 +36,7 @@
- import org.apache.nutch.protocol.Content;
- import org.apache.nutch.util.LogUtil;
- import org.apache.nutch.util.NutchConfiguration;
-+import org.apache.nutch.util.NutchJob;
- 
- /** Dump the content of a segment. */
- public class SegmentReader extends Configured implements Reducer {
-@@ -147,7 +148,7 @@
-   }
- 
-   private JobConf createJobConf() {
--    JobConf job = new JobConf(getConf());
-+    JobConf job = new NutchJob(getConf());
-     job.setBoolean("segment.reader.co", this.co);
-     job.setBoolean("segment.reader.fe", this.fe);
-     job.setBoolean("segment.reader.ge", this.ge);
-
-# NUTCH-311
-#
-Index: src/java/org/apache/nutch/crawl/CrawlDbReducer.java
-===================================================================
---- src/java/org/apache/nutch/crawl/CrawlDbReducer.java (revision 486923)
-+++ src/java/org/apache/nutch/crawl/CrawlDbReducer.java (working copy)
-@@ -38,11 +38,13 @@
-   private ArrayList linked = new ArrayList();
-   private ScoringFilters scfilters = null;
-   private boolean additionsAllowed;
-+  private int maxLinked;
- 
-   public void configure(JobConf job) {
-     retryMax = job.getInt("db.fetch.retry.max", 3);
-     scfilters = new ScoringFilters(job);
-     additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
-+    this.maxLinked = job.getInt("db.linked.max", 10000);
-   }
- 
-   public void close() {}
-@@ -56,7 +58,7 @@
-     byte[] signature = null;
-     linked.clear();
- 
--    while (values.hasNext()) {
-+    VALUES_LOOP: while (values.hasNext()) {
-       CrawlDatum datum = (CrawlDatum)values.next();
- 
-       if (highest == null || datum.getStatus() > highest.getStatus()) {
-@@ -71,6 +73,10 @@
-         break;
-       case CrawlDatum.STATUS_LINKED:
-         linked.add(datum);
-+        if (linked.size() > this.maxLinked) {
-+          LOG.info("Breaking. " + key + " has > than " + this.maxLinked);
-+          break VALUES_LOOP;
-+        }
-         break;
-       case CrawlDatum.STATUS_SIGNATURE:
-         signature = datum.getSignature();
+NutchWAX is built against nutch 0.9 which in turn uses hadoop 0.12.2.

Modified: trunk/archive-access/projects/nutchwax/src/java/overview.html
===================================================================
--- trunk/archive-access/projects/nutchwax/src/java/overview.html	2007-03-26 20:57:26 UTC (rev 1634)
+++ trunk/archive-access/projects/nutchwax/src/java/overview.html	2007-03-26 21:08:10 UTC (rev 1635)
@@ -51,8 +51,7 @@
 the platform we use to run indexing jobs atop. Hadoop is an open source
 implementation of <a href="http://labs.google.com/papers/mapreduce.html">Google
 mapreduce</a> and <a href="http://labs.google.com/papers/gfs.html">Google
-GFS</a>. NutchWAX 0.10.0 requires Hadoop 0.9.2. It will not work with later
-versions. Hadoop has its own set of requirements. See
+GFS</a>. NutchWAX requires Hadoop 0.12.2. Hadoop has its own set of requirements. See
 <i>Requirements</i> about midways down on the
 <a href="http://lucene.apache.org/hadoop/docs/api/overview-summary.html">Hadoop
 API</a> page. Hadoop binaries are available for download off the
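
A note on the "db.linked.max" property introduced by the patches quoted (and removed from the README) above: it is read with conf.getInt("db.linked.max", ...) and caps how many linked entries are accumulated for a single key while merging segments (default 1000) or updating the crawldb (default 10000), which is what guards against the OOMEs described in the old README text. The sketch below is illustrative only; the commit does not prescribe it, and the choice of conf/nutch-site.xml and the value 5000 are assumptions based on the usual Hadoop/Nutch practice of overriding defaults in a site-specific configuration file rather than editing hadoop-default.xml:

<?xml version="1.0"?>
<configuration>
  <!-- Hypothetical site-level override of the per-key link cap read by the
       patched SegmentMerger and CrawlDbReducer via conf.getInt("db.linked.max", ...). -->
  <property>
    <name>db.linked.max</name>
    <value>5000</value>
  </property>
</configuration>

Raising the value keeps more inlinks per page at the cost of reduce-side heap; lowering it trades completeness of link data for memory safety.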