From: <sta...@us...> - 2007-03-26 21:08:11
Revision: 1635
          http://archive-access.svn.sourceforge.net/archive-access/?rev=1635&view=rev
Author:   stack-sf
Date:     2007-03-26 14:08:10 -0700 (Mon, 26 Mar 2007)

Log Message:
-----------
M nutchwax/src/java/overview.html
M nutchwax/README.txt
Update hadoop and nutch versions.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/README.txt
    trunk/archive-access/projects/nutchwax/src/java/overview.html

Modified: trunk/archive-access/projects/nutchwax/README.txt
===================================================================
--- trunk/archive-access/projects/nutchwax/README.txt	2007-03-26 20:57:26 UTC (rev 1634)
+++ trunk/archive-access/projects/nutchwax/README.txt	2007-03-26 21:08:10 UTC (rev 1635)
@@ -6,144 +6,4 @@
 
 HADOOP VERSION AND PATCHES
 
-Hadoop release version is 0.9.2. 0.9.1 fails when you try to use local
-filesystem. Turning of speculative reduce seems to fix things but the
-hadoop 0.9.1. has it set to true in bundled hadoop-default.xml. See
-HADOOP-827.
-
-
-NUTCH VERSION AND PATCHES
-
-Below are patches made against the nutch thats built into nutchwax.
-You may be able to do without them. Apply if you you are OOME'ing
-because too many links found building crawldb or merging segments.
-
-# This patch fixes SegmentMerger OOME'ing. It puts upper bound on links
-# we add to a page (Saw OOME in 1.8 Gig heap trying to add 500k links
-# to single key. Also includes part of NUTCH-333.
-Index: src/java/org/apache/nutch/segment/SegmentMerger.java
-===================================================================
---- src/java/org/apache/nutch/segment/SegmentMerger.java (revision 486923)
-+++ src/java/org/apache/nutch/segment/SegmentMerger.java (working copy)
-@@ -41,6 +41,7 @@
- import org.apache.nutch.parse.ParseText;
- import org.apache.nutch.protocol.Content;
- import org.apache.nutch.util.NutchConfiguration;
-+import org.apache.nutch.util.NutchJob;
- 
- /**
-  * This tool takes several segments and merges their data together. Only the
-@@ -98,6 +99,7 @@
-   private URLFilters filters = null;
-   private long sliceSize = -1;
-   private long curCount = 0;
-+  private int maxLinked;
- 
-   /**
-    * Wraps inputs in an {@link MetaWrapper}, to permit merging different
-@@ -257,6 +259,7 @@
-     if (sliceSize > 0) {
-       sliceSize = sliceSize / conf.getNumReduceTasks();
-     }
-+    this.maxLinked = conf.getInt("db.linked.max", 1000);
-   }
- 
-   private Text newKey = new Text();
-@@ -301,7 +304,7 @@
-     String lastPDname = null;
-     String lastPTname = null;
-     TreeMap linked = new TreeMap();
--    while (values.hasNext()) {
-+    VALUES_LOOP: while (values.hasNext()) {
-       MetaWrapper wrapper = (MetaWrapper)values.next();
-       Object o = wrapper.get();
-       String spString = wrapper.getMeta(SEGMENT_PART_KEY);
-@@ -355,6 +358,17 @@
-           linked.put(sp.segmentName, segLinked);
-         }
-         segLinked.add(val);
-+        if (segLinked.size() <= this.maxLinked) {
-+          segLinked.add(val);
-+        } else {
-+          LOG.info("SKIPPING SEGLINKED LARGE " +
-+            segLinked.size() + ", * linked size " + linked.size() +
-+            ", name " + sp.segmentName + ", key " + key);
-+          break VALUES_LOOP;
-+        }
-+        if ((segLinked.size() % 1000) == 0) {
-+          LOG.info("SEGLINKED SIZE " + segLinked.size() + ", key " + key);
-+        }
-       } else {
-         throw new IOException("Cannot determine segment part: " + sp.partName);
-       }
-@@ -460,7 +474,7 @@
-     if (LOG.isInfoEnabled()) {
-       LOG.info("Merging " + segs.length + " segments to " + out + "/" + segmentName);
-     }
--    JobConf job = new JobConf(getConf());
-+    JobConf job = new NutchJob(getConf());
-     job.setJobName("mergesegs " + out + "/" + segmentName);
-     job.setBoolean("segment.merger.filter", filter);
-     job.setLong("segment.merger.slice", slice);
-Index: src/java/org/apache/nutch/segment/SegmentReader.java
-===================================================================
---- src/java/org/apache/nutch/segment/SegmentReader.java (revision 486923)
-+++ src/java/org/apache/nutch/segment/SegmentReader.java (working copy)
-@@ -36,6 +36,7 @@
- import org.apache.nutch.protocol.Content;
- import org.apache.nutch.util.LogUtil;
- import org.apache.nutch.util.NutchConfiguration;
-+import org.apache.nutch.util.NutchJob;
- 
- /** Dump the content of a segment. */
- public class SegmentReader extends Configured implements Reducer {
-@@ -147,7 +148,7 @@
-   }
- 
-   private JobConf createJobConf() {
--    JobConf job = new JobConf(getConf());
-+    JobConf job = new NutchJob(getConf());
-     job.setBoolean("segment.reader.co", this.co);
-     job.setBoolean("segment.reader.fe", this.fe);
-     job.setBoolean("segment.reader.ge", this.ge);
-
-# NUTCH-311
-#
-Index: src/java/org/apache/nutch/crawl/CrawlDbReducer.java
-===================================================================
---- src/java/org/apache/nutch/crawl/CrawlDbReducer.java (revision 486923)
-+++ src/java/org/apache/nutch/crawl/CrawlDbReducer.java (working copy)
-@@ -38,11 +38,13 @@
-   private ArrayList linked = new ArrayList();
-   private ScoringFilters scfilters = null;
-   private boolean additionsAllowed;
-+  private int maxLinked;
- 
-   public void configure(JobConf job) {
-     retryMax = job.getInt("db.fetch.retry.max", 3);
-     scfilters = new ScoringFilters(job);
-     additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
-+    this.maxLinked = job.getInt("db.linked.max", 10000);
-   }
- 
-   public void close() {}
-@@ -56,7 +58,7 @@
-     byte[] signature = null;
-     linked.clear();
- 
--    while (values.hasNext()) {
-+    VALUES_LOOP: while (values.hasNext()) {
-       CrawlDatum datum = (CrawlDatum)values.next();
- 
-       if (highest == null || datum.getStatus() > highest.getStatus()) {
-@@ -71,6 +73,10 @@
-         break;
-       case CrawlDatum.STATUS_LINKED:
-         linked.add(datum);
-+        if (linked.size() > this.maxLinked) {
-+          LOG.info("Breaking. " + key + " has > than " + this.maxLinked);
-+          break VALUES_LOOP;
-+        }
-         break;
-       case CrawlDatum.STATUS_SIGNATURE:
-         signature = datum.getSignature();
+NutchWAX is built against nutch 0.9 which in turn uses hadoop 0.12.2.

Modified: trunk/archive-access/projects/nutchwax/src/java/overview.html
===================================================================
--- trunk/archive-access/projects/nutchwax/src/java/overview.html	2007-03-26 20:57:26 UTC (rev 1634)
+++ trunk/archive-access/projects/nutchwax/src/java/overview.html	2007-03-26 21:08:10 UTC (rev 1635)
@@ -51,8 +51,7 @@
 the platform we use to run indexing jobs atop. Hadoop is an open source
 implementation of <a href="http://labs.google.com/papers/mapreduce.html">Google
 mapreduce</a> and <a href="http://labs.google.com/papers/gfs.html">Google
-GFS</a>. NutchWAX 0.10.0 requires Hadoop 0.9.2. It will not work with later
-versions. Hadoop has its own set of requirements. See
+GFS</a>. NutchWAX requires Hadoop 0.12.2. Hadoop has its own set of requirements. See
 <i>Requirements</i> about midways down on the
 <a href="http://lucene.apache.org/hadoop/docs/api/overview-summary.html">Hadoop
 API</a> page. Hadoop binaries are available for download off the
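
A note on the "db.linked.max" property introduced by the patches quoted (and removed from the README) above: it is read with conf.getInt("db.linked.max", ...) and caps how many linked entries are accumulated for a single key while merging segments (default 1000) or updating the crawldb (default 10000), which is what guards against the OOMEs described in the old README text. The sketch below is illustrative only; the commit does not prescribe it, and the choice of conf/nutch-site.xml and the value 5000 are assumptions based on the usual Hadoop/Nutch practice of overriding defaults in a site-specific configuration file rather than editing hadoop-default.xml:

<?xml version="1.0"?>
<configuration>
  <!-- Hypothetical site-level override of the per-key link cap read by the
       patched SegmentMerger and CrawlDbReducer via conf.getInt("db.linked.max", ...). -->
  <property>
    <name>db.linked.max</name>
    <value>5000</value>
  </property>
</configuration>

Raising the value keeps more inlinks per page at the cost of reduce-side heap; lowering it trades completeness of link data for memory safety.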