From: <jle...@us...> - 2007-08-01 21:44:29
Revision: 1896
          http://archive-access.svn.sourceforge.net/archive-access/?rev=1896&view=rev
Author:   jlee-archive
Date:     2007-08-01 14:44:31 -0700 (Wed, 01 Aug 2007)

Log Message:
-----------
Just cleaned up the code with some whitespace, consistent indenting, etc. No functional changes.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java
    trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java
    trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxBean.java
    trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxConfiguration.java
    trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxDistributedSearch.java
    trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxOpenSearchServlet.java
    trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxQuery.java
    trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxTest.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/index-wax/src/java/org/archive/access/nutch/indexer/WaxIndexingFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/parse-default/src/java/org/archive/access/nutch/parse/MetadataOnlyParser.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/parse-waxext/src/java/org/apache/nutch/parse/ext/WaxExtParser.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-anchor/src/java/org/apache/nutch/searcher/anchor/AnchorQueryFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-content/src/java/org/apache/nutch/searcher/content/ContentQueryFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-host/src/java/org/apache/nutch/searcher/host/HostQueryFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-title/src/java/org/apache/nutch/searcher/title/TitleQueryFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-wax/src/java/org/archive/access/nutch/searcher/WaxArcfileQueryFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-wax/src/java/org/archive/access/nutch/searcher/WaxCollectionQueryFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-wax/src/java/org/archive/access/nutch/searcher/WaxDateQueryFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-wax/src/java/org/archive/access/nutch/searcher/WaxExacturlQueryFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-wax/src/java/org/archive/access/nutch/searcher/WaxTypeQueryFilter.java

Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java
===================================================================
--- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java	2007-07-26 21:53:47 UTC (rev 1895)
+++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java	2007-08-01 21:44:31 UTC (rev 1896)
@@ -34,252 +34,329 @@
  * {@link org.apache.nutch.indexer.IndexMerger} or
  * {@link org.apache.nutch.indexer.IndexSorter}.
* - * Takes input that has per line the name of the class to run and the arguments - * to pass. Here is an example line for IndexMerger: - * <code>org.apache.nutch.indexer.IndexMerger -workingdir /tmp index-new indexes - * </code>. Here is one for IndexSorter: + * Takes input that has per line the name of the class to run and the + * arguments to pass. Here is an example line for IndexMerger: + * <code>org.apache.nutch.indexer.IndexMerger -workingdir /tmp index-new + * indexes</code>. Here is one for IndexSorter: * <code>org.apache.nutch.indexer.IndexSorter /home/stack/tmp/crawl</code> * (Note that IndexSorter wants to refer to the local system; the indexes to * sort must be on local disk). We run as many tasks as there are input lines. * * @author stack */ -public class Multiple extends ToolBase implements Mapper { - public final Log LOG = LogFactory.getLog(this.getClass()); - private JobConf job; +public class Multiple extends ToolBase implements Mapper +{ + public final Log LOG = LogFactory.getLog(this.getClass()); + private JobConf job; - public void map(WritableComparable key, Writable value, - OutputCollector output, final Reporter reporter) - throws IOException { - final String [] words = value.toString().split("\\s"); - if (words.length <= 0) { - return; - } - final String className = words[0]; - // Set a timer running that will update reporter on a period. - Timer t = new Timer(false); - t.scheduleAtFixedRate(new TimerTask() { - @Override - public void run() { - try { - reporter.setStatus("Running " + className); - } catch (IOException e) { - e.printStackTrace(); - } - }}, 0, 10000); - try { - int result = doMain(words); - reporter.setStatus("Done running " + className + ": " + result); - if (result != 0) { - throw new IOException(className + " returned non-null: " + - result + ", check logs."); - } - } finally { - t.cancel(); - } - } - - /** - * Call {@link ToolBase#doMain(org.apache.hadoop.conf.Configuration, String[])} - * on the passed classname. - * @param args - * @return Result from call to doMain. - */ - private int doMain(final String [] args) { - final String className = args[0]; - // Redo args so absent our 'class' command. - String [] newArgs = Nutchwax.rewriteArgs(args, 1); - int result = -1; - try { - Object obj = Class.forName(className).newInstance(); - result = ((ToolBase)obj).doMain(this.job, newArgs); - } catch (Exception e) { - LOG.error(className, e); + public void map(WritableComparable key, Writable value, + OutputCollector output, final Reporter reporter) + throws IOException + { + final String [] words = value.toString().split("\\s"); + + if (words.length <= 0) + { + return; + } + + final String className = words[0]; + + // Set a timer running that will update reporter on a period. + Timer t = new Timer(false); + + t.scheduleAtFixedRate(new TimerTask() + { + @Override + public void run() + { + try + { + reporter.setStatus("Running " + className); } - return result; + catch (IOException e) + { + e.printStackTrace(); + } + } + }, 0, 10000); + + try + { + int result = doMain(words); + + reporter.setStatus("Done running " + className + ": " + result); + + if (result != 0) + { + throw new IOException(className + " returned non-null: " + + result + ", check logs."); + } } + finally + { + t.cancel(); + } + } - public void configure(final JobConf j) { - this.job = j; - } + /** + * Call {@link ToolBase#doMain(org.apache.hadoop.conf.Configuration, String[])} + * on the passed classname. + * @param args + * @return Result from call to doMain. 
+ */ + private int doMain(final String [] args) + { + final String className = args[0]; + + // Redo args so absent our 'class' command. + String [] newArgs = Nutchwax.rewriteArgs(args, 1); + int result = -1; + + try + { + Object obj = Class.forName(className).newInstance(); + result = ((ToolBase)obj).doMain(this.job, newArgs); + } + catch (Exception e) + { + LOG.error(className, e); + } + + return result; + } - public void close() throws IOException { - // TODO Auto-generated method stub - } + public void configure(final JobConf j) + { + this.job = j; + } - public static class MultipleInputFormat implements InputFormat { - - public RecordReader getRecordReader(final InputSplit split, - final JobConf job, final Reporter reporter) - throws IOException { - // Only one record/line to read. - return new RecordReader() { - private final String line = ((LineInputSplit)split).line; - private boolean read = false; - - public void close() throws IOException { - // TODO Auto-generated method stub - } + public void close() throws IOException + { + // TODO Auto-generated method stub + } - public WritableComparable createKey() { - return new Text(""); - } + public static class MultipleInputFormat implements InputFormat + { + public RecordReader getRecordReader(final InputSplit split, + final JobConf job, final Reporter reporter) + throws IOException + { + // Only one record/line to read. + return new RecordReader() + { + private final String line = ((LineInputSplit)split).line; + private boolean read = false; + + public void close() throws IOException + { + // TODO Auto-generated method stub + } - public Writable createValue() { - return new Text(""); - } + public WritableComparable createKey() + { + return new Text(""); + } - public long getPos() throws IOException { - return 0; - } + public Writable createValue() { + return new Text(""); + } - public float getProgress() throws IOException { - return getPos(); - } + public long getPos() throws IOException + { + return 0; + } - public boolean next(Writable key, Writable value) - throws IOException { - if (read) { - return false; - } - read = true; - ((Text)value).set(this.line); - return true; - } - }; - } + public float getProgress() throws IOException + { + return getPos(); + } - public InputSplit[] getSplits(JobConf job, int numSplits) - throws IOException { - Path[] inputs = job.getInputPaths(); - List<String> lines = new ArrayList<String>(); - for (int i = 0; i < inputs.length; i++) { - Path p = inputs[i]; - FileSystem fs = p.getFileSystem(job); - Path [] ps = fs.listPaths(p); - for (int j = 0; j < ps.length; j++) { - if (fs.isDirectory(ps[j])) { - continue; - } - addFileLines(lines, fs, ps[j]); - } - } - List<LineInputSplit> splits = - new ArrayList<LineInputSplit>(lines.size()); - for (String line: lines) { - splits.add(new LineInputSplit(line)); - } - job.setNumMapTasks(lines.size()); - return splits.toArray(new LineInputSplit [0]); - } - - private void addFileLines(final List<String> lines, final FileSystem fs, - final Path p) - throws IOException { - InputStream is = (InputStream)fs.open(p); - LineNumberReader lnr = null; - try { - lnr = new LineNumberReader(new InputStreamReader(is)); - for (String l = null; (l = lnr.readLine()) != null;) { - if (l.length() > 0 && !l.trim().startsWith("#")) { - lines.add(l); - } - } - } finally { - if (lnr != null) { - lnr.close(); - } - is.close(); - } - } + public boolean next(Writable key, Writable value) + throws IOException + { + if (read) + { + return false; + } + + read = true; + + 
((Text)value).set(this.line); - public void validateInput(JobConf job) throws IOException { - // Nothing to validate. - } - } - - public static class LineInputSplit implements InputSplit { - private String line; - - protected LineInputSplit() { - super(); - } - - public LineInputSplit(final String l) { - line = l; - } - - public long getLength() throws IOException { - return line.length(); - } + return true; + } + }; + } - public String[] getLocations() throws IOException { - return new String[0]; - } + public InputSplit[] getSplits(JobConf job, int numSplits) + throws IOException + { + Path[] inputs = job.getInputPaths(); - public void readFields(DataInput in) throws IOException { - this.line = in.readLine(); - } + List<String> lines = new ArrayList<String>(); - public void write(DataOutput out) throws IOException { - out.writeBytes(this.line); - } - } - - public static void usage() { - System.out.println("Usage: multiple <input> <output>"); - System.out.println("Runs concurrently all commands listed in " + - "<inputs>."); - System.out.println("Arguments:"); - System.out.println(" <input> Directory of input files with " + - "each line describing task to run"); - System.out.println(" <output> Output directory."); - System.out.println("Example input lines:"); - System.out.println(); - System.out.println(" An input line to specify a merge would look " + - "like:"); - System.out.println(); - System.out.println(" org.apache.nutch.indexer.IndexMerger " + - "-workingdir /3/hadoop-tmp index-monday indexes-monday"); - System.out.println(); - System.out.println(" Note that named class must implement " + - "org.apache.hadoop.util.ToolBase"); - System.out.println(); - System.out.println(" To copy from " + - "hdfs://HOST:PORT/user/stack/index-monday to"); - System.out.println( " file:///0/searcher.dir/index:"); - System.out.println(); - System.out.println(" org.apache.hadoop.fs.FsShell " + - "/user/stack/index-monday /0/searcher.dir/index"); - System.out.println(); - System.out.println(" org.apache.nutch.indexer.IndexSorter " + - "/home/stack/tmp/crawl"); - System.out.println(); - System.out.println(" Note that IndexSorter refers to local " + - "filesystem and not to hdfs and is RAM-bound. 
Set"); - System.out.println(" task child RAM with the mapred.child.java.opts " + - "property in your hadoop-site.xml."); + for (int i = 0; i < inputs.length; i++) + { + Path p = inputs[i]; + FileSystem fs = p.getFileSystem(job); + Path [] ps = fs.listPaths(p); + + for (int j = 0; j < ps.length; j++) + { + if (fs.isDirectory(ps[j])) + { + continue; + } + + addFileLines(lines, fs, ps[j]); + } + } + + List<LineInputSplit> splits = + new ArrayList<LineInputSplit>(lines.size()); - } - - public int run(String[] args) throws Exception { - if (args.length != 2 || - (args.length == 1 && - (args[0].equals("-h") || args[0].equals("--help")))) { - usage(); - return -1; - } - JobConf job = new JobConf(MultipleInputFormat.class); - job.setInputFormat(MultipleInputFormat.class); - job.setInputPath(new Path(args[0])); - job.setMapperClass(Multiple.class); - job.setOutputPath(new Path(args[1])); - JobClient.runJob(job); - return 0; - } - - public static void main(String[] args) throws Exception { - int res = new Multiple().doMain(NutchConfiguration.create(), args); - System.exit(res); - } + for (String line: lines) + { + splits.add(new LineInputSplit(line)); + } + + job.setNumMapTasks(lines.size()); + + return splits.toArray(new LineInputSplit [0]); + } + + private void addFileLines(final List<String> lines, final FileSystem fs, + final Path p) + throws IOException + { + InputStream is = (InputStream)fs.open(p); + LineNumberReader lnr = null; + + try + { + lnr = new LineNumberReader(new InputStreamReader(is)); + + for (String l = null; (l = lnr.readLine()) != null;) + { + if (l.length() > 0 && !l.trim().startsWith("#")) + { + lines.add(l); + } + } + } + finally + { + if (lnr != null) + { + lnr.close(); + } + + is.close(); + } + } + + public void validateInput(JobConf job) throws IOException + { + // Nothing to validate. 
+ } + } + + public static class LineInputSplit implements InputSplit + { + private String line; + + protected LineInputSplit() + { + super(); + } + + public LineInputSplit(final String l) + { + line = l; + } + + public long getLength() throws IOException + { + return line.length(); + } + + public String[] getLocations() throws IOException + { + return new String[0]; + } + + public void readFields(DataInput in) throws IOException + { + this.line = in.readLine(); + } + + public void write(DataOutput out) throws IOException + { + out.writeBytes(this.line); + } + } + + public static void usage() + { + System.out.println("Usage: multiple <input> <output>"); + System.out.println("Runs concurrently all commands listed in " + + "<inputs>."); + System.out.println("Arguments:"); + System.out.println(" <input> Directory of input files with " + + "each line describing task to run"); + System.out.println(" <output> Output directory."); + System.out.println("Example input lines:"); + System.out.println(); + System.out.println(" An input line to specify a merge would look like:"); + System.out.println(); + System.out.println(" org.apache.nutch.indexer.IndexMerger " + + "-workingdir /3/hadoop-tmp index-monday indexes-monday"); + System.out.println(); + System.out.println(" Note that named class must implement " + + "org.apache.hadoop.util.ToolBase"); + System.out.println(); + System.out.println(" To copy from " + + "hdfs://HOST:PORT/user/stack/index-monday to"); + System.out.println( " file:///0/searcher.dir/index:"); + System.out.println(); + System.out.println(" org.apache.hadoop.fs.FsShell " + + "/user/stack/index-monday /0/searcher.dir/index"); + System.out.println(); + System.out.println(" org.apache.nutch.indexer.IndexSorter " + + "/home/stack/tmp/crawl"); + System.out.println(); + System.out.println(" Note that IndexSorter refers to local " + + "filesystem and not to hdfs and is RAM-bound. Set"); + System.out.println(" task child RAM with the mapred.child.java.opts " + + "property in your hadoop-site.xml."); + } + + public int run(String[] args) throws Exception + { + if (args.length != 2 || + (args.length == 1 && + (args[0].equals("-h") || args[0].equals("--help")))) + { + usage(); + return -1; + } + + JobConf job = new JobConf(MultipleInputFormat.class); + job.setInputFormat(MultipleInputFormat.class); + job.setInputPath(new Path(args[0])); + job.setMapperClass(Multiple.class); + job.setOutputPath(new Path(args[1])); + + JobClient.runJob(job); + + return 0; + } + + public static void main(String[] args) throws Exception + { + int res = new Multiple().doMain(NutchConfiguration.create(), args); + + System.exit(res); + } } \ No newline at end of file Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java 2007-07-26 21:53:47 UTC (rev 1895) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java 2007-08-01 21:44:31 UTC (rev 1896) @@ -53,677 +53,916 @@ /** * Script to run all indexing jobs from index through merge of final index. 
*/ -public class Nutchwax { - public static final Log LOG = - LogFactory.getLog(Nutchwax.class.getName()); +public class Nutchwax +{ + public static final Log LOG = + LogFactory.getLog(Nutchwax.class.getName()); - private static final String KEY_COLLECTION_PREFIX = "c="; - private static final String KEY_COLLECTION_SUFFIX = ",u="; - private static final Pattern COLLECTION = - Pattern.compile("^\\s*c=([^,]+),u=(.*)\\s*", Pattern.DOTALL); + private static final String KEY_COLLECTION_PREFIX = "c="; + private static final String KEY_COLLECTION_SUFFIX = ",u="; + private static final Pattern COLLECTION = + Pattern.compile("^\\s*c=([^,]+),u=(.*)\\s*", Pattern.DOTALL); - private final static List JOBS = Arrays.asList(new String[] { - "import", "update", "invert", "index", "dedup", "merge", "all", - "class", "search", "multiple"}); + private final static List JOBS = Arrays.asList(new String[] { + "import", "update", "invert", "index", "dedup", "merge", "all", + "class", "search", "multiple"}); - // Lazy initialize these two variables to delay complaint about hadoop not - // being present -- if its not. Meantime I get command-line processing - // done. - private FileSystem fs = null; - private JobConf conf = null; + // Lazy initialize these two variables to delay complaint about hadoop not + // being present -- if its not. Meantime I get command-line processing + // done. + private FileSystem fs = null; + private JobConf conf = null; - /** - * Default constructor. - * @throws IOException - */ - public Nutchwax() throws IOException { - super(); - } + /** + * Default constructor. + * @throws IOException + */ + public Nutchwax() throws IOException + { + super(); + } - public synchronized JobConf getJobConf() { - if (this.conf == null) { - this.conf = new JobConf(NutchwaxConfiguration.getConfiguration()); - } - return this.conf; + public synchronized JobConf getJobConf() + { + if (this.conf == null) + { + this.conf = new JobConf(NutchwaxConfiguration.getConfiguration()); } + + return this.conf; + } - public synchronized FileSystem getFS() throws IOException { - if (this.fs == null) { - this.fs = FileSystem.get(getJobConf()); - } - return this.fs; + public synchronized FileSystem getFS() throws IOException + { + if (this.fs == null) + { + this.fs = FileSystem.get(getJobConf()); } + + return this.fs; + } - protected class OutputDirectories { - private final Path output; - private final Path crawlDb; - private final Path linkDb; - private final Path segments; - private final Path indexes; - private final Path index; - private final Path tmpDir; + protected class OutputDirectories + { + private final Path output; + private final Path crawlDb; + private final Path linkDb; + private final Path segments; + private final Path indexes; + private final Path index; + private final Path tmpDir; - public OutputDirectories(final Path output) throws IOException { - this.output = output; - this.crawlDb = new Path(output + "/crawldb"); - this.linkDb = new Path(output + "/linkdb"); - this.segments = new Path(output + "/segments"); - this.indexes = new Path(output + "/indexes"); - this.index = new Path(output + "/index"); - this.tmpDir = getJobConf().getLocalPath("mapred.temp.dir", - Generator.generateSegmentName()); - } + public OutputDirectories(final Path output) throws IOException + { + this.output = output; + this.crawlDb = new Path(output + "/crawldb"); + this.linkDb = new Path(output + "/linkdb"); + this.segments = new Path(output + "/segments"); + this.indexes = new Path(output + "/indexes"); + this.index = new 
Path(output + "/index"); + this.tmpDir = getJobConf().getLocalPath("mapred.temp.dir", + Generator.generateSegmentName()); + } - public Path getCrawlDb() { - return crawlDb; - } + public Path getCrawlDb() + { + return crawlDb; + } - public Path getIndexes() { - return indexes; - } + public Path getIndexes() + { + return indexes; + } - public Path getLinkDb() { - return linkDb; - } + public Path getLinkDb() + { + return linkDb; + } - public Path getSegments() { - return segments; - } + public Path getSegments() + { + return segments; + } - public Path getTmpDir() { - return tmpDir; - } + public Path getTmpDir() + { + return tmpDir; + } - public Path getIndex() { - return index; - } + public Path getIndex() + { + return index; + } - public Path getOutput() { - return output; - } + public Path getOutput() + { + return output; } + } - /** - * Run passed list of mapreduce indexing jobs. Jobs are always run in - * order: import, update, etc. - * - * @throws Exception - */ - protected void doAll(final Path input, final String collectionName, - final OutputDirectories od) - throws Exception { - doImport(input, collectionName, od); - doUpdate(od); - doInvert(od); - doIndexing(od); - doDedup(od); - doMerge(od); - LOG.info("Nutchwax finished."); - } + /** + * Run passed list of mapreduce indexing jobs. Jobs are always run in + * order: import, update, etc. + * + * @throws Exception + */ + protected void doAll(final Path input, final String collectionName, + final OutputDirectories od) + throws Exception + { + doImport(input, collectionName, od); + doUpdate(od); + doInvert(od); + doIndexing(od); + doDedup(od); + doMerge(od); + + LOG.info("Nutchwax finished."); + } - protected void doImport(final Path input, String collectionName, - final OutputDirectories od) - throws IOException { - Path segment = new Path(od.getSegments(), - Generator.generateSegmentName() + - ((collectionName == null || collectionName.length() <= 0)? - "": "-" + collectionName)); - new ImportArcs(getJobConf()).importArcs(input, segment, - collectionName); - } + protected void doImport(final Path input, String collectionName, + final OutputDirectories od) + throws IOException + { + Path segment = new Path(od.getSegments(), + Generator.generateSegmentName() + + ((collectionName == null || collectionName.length() <= 0)? + "": "-" + collectionName)); + + new ImportArcs(getJobConf()).importArcs(input, segment, collectionName); + } - protected void doUpdate(final OutputDirectories od) - throws IOException { - doUpdate(od, null); + protected void doUpdate(final OutputDirectories od) + throws IOException + { + doUpdate(od, null); + } + + protected void doUpdate(final OutputDirectories od, + final String[] segments) + throws IOException + { + LOG.info("updating crawldb " + od.getCrawlDb()); + + // Need to make sure the db dir exists before progressing. + Path dbPath = new Path(od.getCrawlDb(), CrawlDb.CURRENT_NAME); + + if (!getFS().exists(dbPath)) + { + getFS().mkdirs(dbPath); } - - protected void doUpdate(final OutputDirectories od, - final String[] segments) - throws IOException { - LOG.info("updating crawldb " + od.getCrawlDb()); - // Need to make sure the db dir exists before progressing. 
- Path dbPath = new Path(od.getCrawlDb(), CrawlDb.CURRENT_NAME); - if (!getFS().exists(dbPath)) { - getFS().mkdirs(dbPath); + + CrawlDb cdb = new NutchwaxCrawlDb(getJobConf()); + + if (segments != null) + { + List<Path> paths = new ArrayList<Path>(segments.length); + + for (int i = 0; i < segments.length; i++) + { + Path p = new Path(segments[i]); + + if (!getFS().exists(p)) + { + throw new FileNotFoundException(p.toString()); } - CrawlDb cdb = new NutchwaxCrawlDb(getJobConf()); - if (segments != null) { - List<Path> paths = new ArrayList<Path>(segments.length); - for (int i = 0; i < segments.length; i++) { - Path p = new Path(segments[i]); - if (!getFS().exists(p)) { - throw new FileNotFoundException(p.toString()); - } - paths.add(p); - } - cdb.update(od.getCrawlDb(), paths.toArray(new Path[paths.size()]), - true, true); - } else { - Path[] allSegments = getSegments(od); - // This just does the last segment created. - cdb.update(od.getCrawlDb(), - new Path[] {allSegments[allSegments.length - 1]}, true, true); - } + + paths.add(p); + } + + cdb.update(od.getCrawlDb(), paths.toArray(new Path[paths.size()]), + true, true); } + else + { + Path[] allSegments = getSegments(od); + + // This just does the last segment created. + cdb.update(od.getCrawlDb(), + new Path[] {allSegments[allSegments.length - 1]}, true, true); + } + } - protected Path [] getSegments(final OutputDirectories od) - throws IOException { - Path[] allSegments = getFS().listPaths(od.getSegments()); - if (allSegments == null || allSegments.length <= 0) { - throw new FileNotFoundException(od.getSegments().toString()); - } - return allSegments; + protected Path [] getSegments(final OutputDirectories od) + throws IOException + { + Path[] allSegments = getFS().listPaths(od.getSegments()); + + if (allSegments == null || allSegments.length <= 0) + { + throw new FileNotFoundException(od.getSegments().toString()); } + + return allSegments; + } - protected void doInvert(final OutputDirectories od, final Path [] segments) - throws IOException { - createLinkdb(od); - new NutchwaxLinkDb(getJobConf()). - invert(od.getLinkDb(), segments, true, true, false); - } + protected void doInvert(final OutputDirectories od, final Path [] segments) + throws IOException + { + createLinkdb(od); + + new NutchwaxLinkDb(getJobConf()). + invert(od.getLinkDb(), segments, true, true, false); + } - protected void doInvert(final OutputDirectories od) - throws IOException { - LOG.info("inverting links in " + od.getSegments()); - new NutchwaxLinkDb(getJobConf()). - invert(od.getLinkDb(), getSegments(od), true, true, false); + protected void doInvert(final OutputDirectories od) + throws IOException + { + LOG.info("inverting links in " + od.getSegments()); + + new NutchwaxLinkDb(getJobConf()). + invert(od.getLinkDb(), getSegments(od), true, true, false); + } + + protected boolean createLinkdb(final OutputDirectories od) + throws IOException + { + boolean result = false; + + // Make sure the linkdb exists. Otherwise the install where + // the temporary location gets moved to the permanent fails. + if (getFS().mkdirs(new Path(od.getLinkDb(), + NutchwaxLinkDb.CURRENT_NAME))) + { + LOG.info("Created " + od.getLinkDb()); + + result = true; } + + return result; + } - protected boolean createLinkdb(final OutputDirectories od) - throws IOException { - boolean result = false; - // Make sure the linkdb exists. Otherwise the install where - // the temporary location gets moved to the permanent fails. 
- if (getFS().mkdirs(new Path(od.getLinkDb(), - NutchwaxLinkDb.CURRENT_NAME))) { - LOG.info("Created " + od.getLinkDb()); - result = true; - } - return result; - } + protected void doIndexing(final OutputDirectories od) + throws IOException + { + doIndexing(od, getFS().listPaths(od.getSegments())); + } + + protected void doIndexing(final OutputDirectories od, + final Path [] segments) + throws IOException + { + LOG.info(" indexing " + segments); + + new NutchwaxIndexer(getJobConf()).index(od.getIndexes(), + od.getCrawlDb(), od.getLinkDb(), segments); + } + + protected void doDedup(final OutputDirectories od) throws IOException + { + LOG.info("dedup " + od.getIndex()); + + new DeleteDuplicates(getJobConf()).dedup(new Path[] {od.getIndexes()}); + } - protected void doIndexing(final OutputDirectories od) - throws IOException { - doIndexing(od, getFS().listPaths(od.getSegments())); + protected void doMerge(final OutputDirectories od) throws IOException + { + LOG.info("index merge " + od.getOutput() + " using tmpDir=" + + od.getTmpDir()); + + new IndexMerger(getJobConf()).merge(getFS().listPaths(od.getIndexes()), + od.getIndex(), od.getTmpDir()); + } + + static String [] rewriteArgs(final String [] args, final int offset) + { + final String [] newArgs = new String[args.length - offset]; + + for (int i = 0; i < args.length; i++) + { + if (i < offset) + { + continue; + } + + newArgs[i - offset] = args[i]; } - - protected void doIndexing(final OutputDirectories od, - final Path [] segments) - throws IOException { - LOG.info(" indexing " + segments); - new NutchwaxIndexer(getJobConf()).index(od.getIndexes(), - od.getCrawlDb(), od.getLinkDb(), segments); + + return newArgs; + } + + static Object doClassMain(final String [] args) + { + // Redo args so absent our nutchwax 'class' command. 
+ final String className = args[1]; + String [] newArgs = rewriteArgs(args, 2); + + // From http://www.javaworld.com/javaworld/javaqa/1999-06/01-outside.html + Class [] argTypes = new Class[1]; + argTypes[0] = String[].class; + Object result = null; + + try + { + Method mainMethod = + Class.forName(className).getDeclaredMethod("main", argTypes); + result = mainMethod.invoke(newArgs, new Object [] {newArgs}); } + catch (Throwable t) + { + t.printStackTrace(); + } + + return result; + } - protected void doDedup(final OutputDirectories od) throws IOException { - LOG.info("dedup " + od.getIndex()); - new DeleteDuplicates(getJobConf()).dedup(new Path[] {od.getIndexes()}); + protected Object doSearch(final String [] args) + { + String [] newArgs = new String[args.length + 1]; + newArgs[0] = args[0]; + newArgs[1] = NutchwaxBean.class.getName(); + + for (int i = 1; i < args.length; i++) + { + newArgs[i + 1] = args[i]; } + + return doClassMain(newArgs); + } + + protected void doMultiple(final String [] args) throws Exception + { + (new Multiple()).run(rewriteArgs(args, 1)); + } - protected void doMerge(final OutputDirectories od) throws IOException { - LOG.info("index merge " + od.getOutput() + " using tmpDir=" + - od.getTmpDir()); - new IndexMerger(getJobConf()).merge(getFS().listPaths(od.getIndexes()), - od.getIndex(), od.getTmpDir()); + protected void doJob(final String jobName, final String [] args) + throws Exception + { + if (jobName.equals("import")) + { + // Usage: hadoop jar nutchwax.jar import input output name + if (args.length != 4) + { + ImportArcs.doImportUsage( + "ERROR: Wrong number of arguments passed.", 2); + } + + final Path input = new Path(args[1]); + final Path output = new Path(args[2]); + final String collectionName = args[3]; + + checkArcsDir(input); + OutputDirectories od = new OutputDirectories(output); + doImport(input, collectionName, od); } - - static String [] rewriteArgs(final String [] args, final int offset) { - final String [] newArgs = new String[args.length - offset]; - for (int i = 0; i < args.length; i++) { - if (i < offset) { - continue; - } - newArgs[i - offset] = args[i]; + else if (jobName.equals("update")) + { + // Usage: hadoop jar nutchwax.jar update output + if (args.length < 2) + { + doUpdateUsage("ERROR: Wrong number of arguments passed.", 2); + } + + OutputDirectories od = new OutputDirectories(new Path(args[1])); + + if (args.length == 2) + { + doUpdate(od); + } + else + { + for (int i = 2; i < args.length; i++) + { + doUpdate(od, new String [] {args[i]}); } - return newArgs; + } } + else if (jobName.equals("invert")) + { + // Usage: hadoop jar nutchwax.jar invert output + if (args.length < 2) + { + doInvertUsage("ERROR: Wrong number of arguments passed.", 2); + } - static Object doClassMain(final String [] args) { - // Redo args so absent our nutchwax 'class' command. 
- final String className = args[1]; - String [] newArgs = rewriteArgs(args, 2); - // From http://www.javaworld.com/javaworld/javaqa/1999-06/01-outside.html - Class [] argTypes = new Class[1]; - argTypes[0] = String[].class; - Object result = null; - try { - Method mainMethod = - Class.forName(className).getDeclaredMethod("main", argTypes); - result = mainMethod.invoke(newArgs, new Object [] {newArgs}); - } catch (Throwable t) { - t.printStackTrace(); + OutputDirectories od = new OutputDirectories(new Path(args[1])); + + if (args.length == 2) + { + doInvert(od); + } + else + { + final int offset = 2; + Path [] segments = new Path[args.length - offset]; + + for (int i = offset; i < args.length; i++) + { + Path f = new Path(args[i]); + + if (! getFS().exists(f)) + { + throw new FileNotFoundException(f.toString()); + } + + segments[i - offset] = f; } - return result; + + doInvert(od, segments); + } } - - protected Object doSearch(final String [] args) { - String [] newArgs = new String[args.length + 1]; - newArgs[0] = args[0]; - newArgs[1] = NutchwaxBean.class.getName(); - for (int i = 1; i < args.length; i++) { - newArgs[i + 1] = args[i]; + else if (jobName.equals("index")) + { + // Usage: hadoop jar nutchwax.jar index output + if (args.length < 2) + { + doIndexUsage("ERROR: Wrong number of arguments passed.", 2); + } + + OutputDirectories od = new OutputDirectories(new Path(args[1])); + + if (args.length == 2) + { + doIndexing(od); + } + else + { + final int offset = 2; + Path [] segments = new Path[args.length - offset]; + + for (int i = offset; i < args.length; i++) + { + Path f = new Path(args[i]); + + if (! getFS().exists(f)) + { + throw new FileNotFoundException(f.toString()); + } + + segments[i - offset] = f; } - return doClassMain(newArgs); + + doIndexing(od, segments); + } } - - protected void doMultiple(final String [] args) throws Exception { - (new Multiple()).run(rewriteArgs(args, 1)); + else if (jobName.equals("dedup")) + { + // Usage: hadoop jar nutchwax.jar dedup output + if (args.length != 2) + { + doDedupUsage("Wrong number of arguments passed.", 2); + } + + doDedup(new OutputDirectories(new Path(args[1]))); } - - protected void doJob(final String jobName, final String [] args) - throws Exception { - if (jobName.equals("import")) { - // Usage: hadoop jar nutchwax.jar import input output name - if (args.length != 4) { - ImportArcs.doImportUsage( - "ERROR: Wrong number of arguments passed.", 2); - } - final Path input = new Path(args[1]); - final Path output = new Path(args[2]); - final String collectionName = args[3]; - checkArcsDir(input); - OutputDirectories od = new OutputDirectories(output); - doImport(input, collectionName, od); - } else if (jobName.equals("update")) { - // Usage: hadoop jar nutchwax.jar update output - if (args.length < 2) { - doUpdateUsage("ERROR: Wrong number of arguments passed.", 2); - } - OutputDirectories od = new OutputDirectories(new Path(args[1])); - if (args.length == 2) { - doUpdate(od); - } else { - for (int i = 2; i < args.length; i++) { - doUpdate(od, new String [] {args[i]}); - } - } - } else if (jobName.equals("invert")) { - // Usage: hadoop jar nutchwax.jar invert output - if (args.length < 2) { - doInvertUsage("ERROR: Wrong number of arguments passed.", 2); - } - OutputDirectories od = new OutputDirectories(new Path(args[1])); - if (args.length == 2) { - doInvert(od); - } else { - final int offset = 2; - Path [] segments = new Path[args.length - offset]; - for (int i = offset; i < args.length; i++) { - Path f = new Path(args[i]); - if 
(!getFS().exists(f)) { - throw new FileNotFoundException(f.toString()); - } - segments[i - offset] = f; - } - doInvert(od, segments); - } - } else if (jobName.equals("index")) { - // Usage: hadoop jar nutchwax.jar index output - if (args.length < 2) { - doIndexUsage("ERROR: Wrong number of arguments passed.", 2); - } - OutputDirectories od = new OutputDirectories(new Path(args[1])); - if (args.length == 2) { - doIndexing(od); - } else { - final int offset = 2; - Path [] segments = new Path[args.length - offset]; - for (int i = offset; i < args.length; i++) { - Path f = new Path(args[i]); - if (!getFS().exists(f)) { - throw new FileNotFoundException(f.toString()); - } - segments[i - offset] = f; - } - doIndexing(od, segments); - } - } else if (jobName.equals("dedup")) { - // Usage: hadoop jar nutchwax.jar dedup output - if (args.length != 2) { - doDedupUsage("Wrong number of arguments passed.", 2); - } - doDedup(new OutputDirectories(new Path(args[1]))); - } else if (jobName.equals("merge")) { - // Usage: hadoop jar nutchwax.jar merge output"); - if (args.length != 2) { - doMergeUsage("ERROR: Wrong number of arguments passed.", 2); - } - doMerge(new OutputDirectories(new Path(args[1]))); - } else if (jobName.equals("all")) { - // Usage: hadoop jar nutchwax.jar import input output name - if (args.length != 4) { - doAllUsage("ERROR: Wrong number of arguments passed.", 2); - } - final Path input = new Path(args[1]); - final Path output = new Path(args[2]); - final String collectionName = args[3]; - checkArcsDir(input); - OutputDirectories od = new OutputDirectories(output); - doAll(input, collectionName, od); - } else if (jobName.equals("class")) { - if (args.length < 2) { - doClassUsage("ERROR: Wrong number of arguments passed.", 2); - } - doClassMain(args); - } else if (jobName.equals("search")) { - if (args.length < 1) { - doClassUsage("ERROR: Wrong number of arguments passed.", 2); - } - doSearch(args); - } else if (jobName.equals("multiple")) { - doMultiple(args); - } else { - usage("ERROR: No handler for job name " + jobName, 4); - System.exit(0); - } + else if (jobName.equals("merge")) + { + // Usage: hadoop jar nutchwax.jar merge output"); + if (args.length != 2) + { + doMergeUsage("ERROR: Wrong number of arguments passed.", 2); + } + + doMerge(new OutputDirectories(new Path(args[1]))); } + else if (jobName.equals("all")) + { + // Usage: hadoop jar nutchwax.jar import input output name + if (args.length != 4) + { + doAllUsage("ERROR: Wrong number of arguments passed.", 2); + } - /** - * Check the arcs dir exists and looks like it has files that list ARCs - * (rather than ARCs themselves). - * - * @param arcsDir Directory to examine. 
- * @throws IOException - */ - protected void checkArcsDir(final Path arcsDir) - throws IOException { - if (!getFS().exists(arcsDir)) { - throw new IOException(arcsDir + " does not exist."); - } - if (!fs.isDirectory(arcsDir)) { - throw new IOException(arcsDir + " is not a directory."); - } + final Path input = new Path(args[1]); + final Path output = new Path(args[2]); + final String collectionName = args[3]; - final Path [] files = getFS().listPaths(arcsDir); - for (int i = 0; i < files.length; i++) { - if (!getFS().isFile(files[i])) { - throw new IOException(files[i] + " is not a file."); - } - if (files[i].getName().toLowerCase().endsWith(".arc.gz")) { - throw new IOException(files[i] + " is an ARC file (ARCSDIR " + - "should contain text file listing ARCs rather than " + - "actual ARCs)."); - } - } + checkArcsDir(input); + + OutputDirectories od = new OutputDirectories(output); + + doAll(input, collectionName, od); } + else if (jobName.equals("class")) + { + if (args.length < 2) + { + doClassUsage("ERROR: Wrong number of arguments passed.", 2); + } + + doClassMain(args); + } + else if (jobName.equals("search")) + { + if (args.length < 1) + { + doClassUsage("ERROR: Wrong number of arguments passed.", 2); + } + + doSearch(args); + } + else if (jobName.equals("multiple")) + { + doMultiple(args); + } + else + { + usage("ERROR: No handler for job name " + jobName, 4); + System.exit(0); + } + } + + /** + * Check the arcs dir exists and looks like it has files that list ARCs + * (rather than ARCs themselves). + * + * @param arcsDir Directory to examine. + * @throws IOException + */ + protected void checkArcsDir(final Path arcsDir) + throws IOException + { + if (! getFS().exists(arcsDir)) + { + throw new IOException(arcsDir + " does not exist."); + } + + if (! fs.isDirectory(arcsDir)) + { + throw new IOException(arcsDir + " is not a directory."); + } + + final Path [] files = getFS().listPaths(arcsDir); + + for (int i = 0; i < files.length; i++) + { + if (! getFS().isFile(files[i])) + { + throw new IOException(files[i] + " is not a file."); + } + + if (files[i].getName().toLowerCase().endsWith(".arc.gz")) + { + throw new IOException(files[i] + " is an ARC file (ARCSDIR " + + "should contain text file listing ARCs rather than " + + "actual ARCs)."); + } + } + } + + public static Text generateWaxKey(WritableComparable key, + final String collection) + { + return generateWaxKey(key.toString(), collection); + } - public static Text generateWaxKey(WritableComparable key, - final String collection) { - return generateWaxKey(key.toString(), collection); + public static Text generateWaxKey(final String keyStr, + final String collection) + { + if (collection == null) + { + throw new NullPointerException("Collection is null for " + keyStr); } - public static Text generateWaxKey(final String keyStr, - final String collection) { - if (collection == null) { - throw new NullPointerException("Collection is null for " + keyStr); - } - if (keyStr == null) { - throw new NullPointerException("keyStr is null"); - } - if (keyStr.startsWith(KEY_COLLECTION_PREFIX)) { - LOG.warn("Key already has collection prefix: " + keyStr - + ". Skipping."); - return new Text(keyStr); - } + if (keyStr == null) + { + throw new NullPointerException("keyStr is null"); + } + + if (keyStr.startsWith(KEY_COLLECTION_PREFIX)) + { + LOG.warn("Key already has collection prefix: " + keyStr + + ". 
Skipping."); - return new Text(KEY_COLLECTION_PREFIX + collection.trim() + - KEY_COLLECTION_SUFFIX + keyStr.trim()); + return new Text(keyStr); } + + return new Text(KEY_COLLECTION_PREFIX + collection.trim() + + KEY_COLLECTION_SUFFIX + keyStr.trim()); + } + + public static String getCollectionFromWaxKey(final WritableComparable key) + throws IOException + { + Matcher m = COLLECTION.matcher(key.toString()); - public static String getCollectionFromWaxKey(final WritableComparable key) - throws IOException { - Matcher m = COLLECTION.matcher(key.toString()); - if (m == null || !m.matches()) { - throw new IOException("Key doesn't have collection " + - "prefix <" + key.toString() + ">"); - } - return m.group(1); + if (m == null || !m.matches()) + { + throw new IOException("Key doesn't have collection " + + "prefix <" + key.toString() + ">"); } - public static String getUrlFromWaxKey(final WritableComparable key) - throws IOException { - Matcher m = COLLECTION.matcher(key.toString()); - if (m == null || !m.matches()) { - throw new IOException("Key doesn't have collection " + - " prefix: " + key); - } - return m.group(2); + return m.group(1); + } + + public static String getUrlFromWaxKey(final WritableComparable key) + throws IOException + { + Matcher m = COLLECTION.matcher(key.toString()); + + if (m == null || !m.matches()) + { + throw new IOException("Key doesn't have collection " + + " prefix: " + key); } - public static long getDate(String d) - throws IOException { - long date = 0; - try { - date = ArchiveUtils.getDate(d).getTime(); - } catch (final java.text.ParseException e) { - throw new IOException("Failed parse of date: " + d + ": " + - e.getMessage()); - } - // Date can be < 0 if pre-1970 (Seen in some old ARCs). - return date >= 0? date: 0; + return m.group(2); + } + + public static long getDate(String d) throws IOException + { + long date = 0; + + try + { + date = ArchiveUtils.getDate(d).getTime(); } + catch (final java.text.ParseException e) + { + throw new IOException("Failed parse of date: " + d + ": " + + e.getMessage()); + } + + // Date can be < 0 if pre-1970 (Seen in some old ARCs). + return date >= 0? 
date: 0; + } - public static void usage(final String message, final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } + public static void usage(final String message, final int exitCode) + { + if (message != null && message.length() > 0) + { + System.out.println(message); + } - System.out.println("Usage: hadoop jar nutchwax.jar <job> [args]"); - System.out.println("Launch NutchWAX job(s) on a hadoop platform."); - System.out.println("Type 'hadoop jar nutchwax.jar help <job>' for" + - " help on a specific job."); - System.out.println("Jobs (usually) must be run in the order " + - "listed below."); - System.out.println("Available jobs:"); - System.out.println(" import Import ARCs."); - System.out.println(" update Update dbs with recent imports."); - System.out.println(" invert Invert links."); - System.out.println(" index Index segments."); - System.out.println(" dedup Deduplicate by URL or content MD5."); - System.out.println(" merge Merge segment indices into one."); - System.out.println(" all Runs all above jobs in order."); - System.out.println(" class Run the passed class's main."); - System.out.println(" search Run a query against index under " + - "property 'searcher.dir'"); - System.out.println(" multiple Run multiple concurrent tasks."); - System.exit(exitCode); - } + System.out.println("Usage: hadoop jar nutchwax.jar <job> [args]"); + System.out.println("Launch NutchWAX job(s) on a hadoop platform."); + System.out.println("Type 'hadoop jar nutchwax.jar help <job>' for" + + " help on a specific job."); + System.out.println("Jobs (usually) must be run in the order " + + "listed below."); + System.out.println("Available jobs:"); + System.out.println(" import Import ARCs."); + System.out.println(" update Update dbs with recent imports."); + System.out.println(" invert Invert links."); + System.out.println(" index Index segments."); + System.out.println(" dedup Deduplicate by URL or content MD5."); + System.out.println(" merge Merge segment indices into one."); + System.out.println(" all Runs all above jobs in order."); + System.out.println(" class Run the passed class's main."); + System.out.println(" search Run a query against index under " + + "property 'searcher.dir'"); + System.out.println(" multiple Run multiple concurrent tasks."); - public static void doUpdateUsage(final String message, - final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } - System.out.println("Usage: hadoop jar nutchwax.jar update <output> " + - "[<segments>...]"); - System.out.println("Arguments:"); - System.out.println(" output Directory to write crawldb under."); - System.out.println("Options:"); - System.out.println(" segments List of segments to update crawldb " + - "with. 
If none supplied, updates"); - System.out.println(" using latest segment found."); - System.exit(exitCode); + System.exit(exitCode); + } + + public static void doUpdateUsage(final String message, + final int exitCode) + { + if (message != null && message.length() > 0) + { + System.out.println(message); } - public static void doInvertUsage(final String message, - final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } - System.out.println("Usage: hadoop jar nutchwax.jar invert <output> " + - "[<segments>...]"); - System.out.println("Arguments:"); - System.out.println(" output Directory to write linkdb under."); - System.out.println("Options:"); - System.out.println(" segments List of segments to update linkdb " + - "with. If none supplied, all under"); - System.out.println(" '<output>/segments/' " + - "are passed."); - System.exit(exitCode); + System.out.println("Usage: hadoop jar nutchwax.jar update <output> " + + "[<segments>...]"); + System.out.println("Arguments:"); + System.out.println(" output Directory to write crawldb under."); + System.out.println("Options:"); + System.out.println(" segments List of segments to update crawldb " + + "with. If none supplied, updates"); + System.out.println(" using latest segment found."); + + System.exit(exitCode); + } + + public static void doInvertUsage(final String message, + final int exitCode) + { + if (message != null && message.length() > 0) + { + System.out.println(message); } - public static void doIndexUsage(final String message, - final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } - System.out.println("Usage: hadoop jar nutchwax.jar index <output> " + - "[<segments>...]"); - System.out.println("Arguments:"); - System.out.println(" output Directory to write indexes under."); - System.out.println("Options:"); - System.out.println(" segments List of segments to index. " + - "If none supplied, all under"); - System.out.println(" '<output>/segments/' " + - "are indexed."); - System.exit(exitCode); + System.out.println("Usage: hadoop jar nutchwax.jar invert <output> " + + "[<segments>...]"); + System.out.println("Arguments:"); + System.out.println(" output Directory to write linkdb under."); + System.out.println("Options:"); + System.out.println(" segments List of segments to update linkdb " + + "with. If none supplied, all under"); + System.out.println(" '<output>/segments/' " + + "are passed."); + + System.exit(exitCode); + } + + public static void doIndexUsage(final String message, + final int exitCode) + { + if (message != null && message.length() > 0) + { + System.out.println(message); } - public static void doDedupUsage(final String message, - final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } - System.out.println("Usage: hadoop jar nutchwax.jar dedup <output>"); - System.out.println("Arguments:"); - System.out.println(" output Directory in which indices" + - " to dedup reside."); - System.exit(exitCode); + System.out.println("Usage: hadoop jar nutchwax.jar index <output> " + + "[<segments>...]"); + System.out.println("Arguments:"); + System.out.println(" output Directory to write indexes under."); + System.out.println("Options:"); + System.out.println(" segments List of segments to index. 
" + + "If none supplied, all under"); + System.out.println(" '<output>/segments/' " + + "are indexed."); + + System.exit(exitCode); + } + + public static void doDedupUsage(final String message, + final int exitCode) + { + if (message != null && message.length() > 0) + { + System.out.println(message); } - public static void doMergeUsage(final String message, - final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } - System.out.println("Usage: hadoop jar nutchwax.jar merge <output>"); - System.out.println("Arguments:"); - System.out.println(" output Directory in which indices" + - " to merge reside."); - System.exit(exitCode); + System.out.println("Usage: hadoop jar nutchwax.jar dedup <output>"); + System.out.println("Arguments:"); + System.out.println(" output Direc... [truncated message content] |