From: <jle...@us...> - 2007-08-01 21:44:29
Revision: 1896
          http://archive-access.svn.sourceforge.net/archive-access/?rev=1896&view=rev
Author:   jlee-archive
Date:     2007-08-01 14:44:31 -0700 (Wed, 01 Aug 2007)

Log Message:
-----------
Just cleaned up the code with some whitespace, consistent indenting, etc. No functional changes.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java
    trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java
    trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxBean.java
    trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxConfiguration.java
    trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxDistributedSearch.java
    trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxOpenSearchServlet.java
    trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxQuery.java
    trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/NutchwaxTest.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/index-wax/src/java/org/archive/access/nutch/indexer/WaxIndexingFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/parse-default/src/java/org/archive/access/nutch/parse/MetadataOnlyParser.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/parse-waxext/src/java/org/apache/nutch/parse/ext/WaxExtParser.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-anchor/src/java/org/apache/nutch/searcher/anchor/AnchorQueryFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-content/src/java/org/apache/nutch/searcher/content/ContentQueryFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-host/src/java/org/apache/nutch/searcher/host/HostQueryFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-title/src/java/org/apache/nutch/searcher/title/TitleQueryFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-wax/src/java/org/archive/access/nutch/searcher/WaxArcfileQueryFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-wax/src/java/org/archive/access/nutch/searcher/WaxCollectionQueryFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-wax/src/java/org/archive/access/nutch/searcher/WaxDateQueryFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-wax/src/java/org/archive/access/nutch/searcher/WaxExacturlQueryFilter.java
    trunk/archive-access/projects/nutchwax/nutchwax-plugins/src/plugin/query-wax/src/java/org/archive/access/nutch/searcher/WaxTypeQueryFilter.java

Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java
===================================================================
--- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java	2007-07-26 21:53:47 UTC (rev 1895)
+++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Multiple.java	2007-08-01 21:44:31 UTC (rev 1896)
@@ -34,252 +34,329 @@
  * {@link org.apache.nutch.indexer.IndexMerger} or
  * {@link org.apache.nutch.indexer.IndexSorter}.
* - * Takes input that has per line the name of the class to run and the arguments - * to pass. Here is an example line for IndexMerger: - * <code>org.apache.nutch.indexer.IndexMerger -workingdir /tmp index-new indexes - * </code>. Here is one for IndexSorter: + * Takes input that has per line the name of the class to run and the + * arguments to pass. Here is an example line for IndexMerger: + * <code>org.apache.nutch.indexer.IndexMerger -workingdir /tmp index-new + * indexes</code>. Here is one for IndexSorter: * <code>org.apache.nutch.indexer.IndexSorter /home/stack/tmp/crawl</code> * (Note that IndexSorter wants to refer to the local system; the indexes to * sort must be on local disk). We run as many tasks as there are input lines. * * @author stack */ -public class Multiple extends ToolBase implements Mapper { - public final Log LOG = LogFactory.getLog(this.getClass()); - private JobConf job; +public class Multiple extends ToolBase implements Mapper +{ + public final Log LOG = LogFactory.getLog(this.getClass()); + private JobConf job; - public void map(WritableComparable key, Writable value, - OutputCollector output, final Reporter reporter) - throws IOException { - final String [] words = value.toString().split("\\s"); - if (words.length <= 0) { - return; - } - final String className = words[0]; - // Set a timer running that will update reporter on a period. - Timer t = new Timer(false); - t.scheduleAtFixedRate(new TimerTask() { - @Override - public void run() { - try { - reporter.setStatus("Running " + className); - } catch (IOException e) { - e.printStackTrace(); - } - }}, 0, 10000); - try { - int result = doMain(words); - reporter.setStatus("Done running " + className + ": " + result); - if (result != 0) { - throw new IOException(className + " returned non-null: " + - result + ", check logs."); - } - } finally { - t.cancel(); - } - } - - /** - * Call {@link ToolBase#doMain(org.apache.hadoop.conf.Configuration, String[])} - * on the passed classname. - * @param args - * @return Result from call to doMain. - */ - private int doMain(final String [] args) { - final String className = args[0]; - // Redo args so absent our 'class' command. - String [] newArgs = Nutchwax.rewriteArgs(args, 1); - int result = -1; - try { - Object obj = Class.forName(className).newInstance(); - result = ((ToolBase)obj).doMain(this.job, newArgs); - } catch (Exception e) { - LOG.error(className, e); + public void map(WritableComparable key, Writable value, + OutputCollector output, final Reporter reporter) + throws IOException + { + final String [] words = value.toString().split("\\s"); + + if (words.length <= 0) + { + return; + } + + final String className = words[0]; + + // Set a timer running that will update reporter on a period. + Timer t = new Timer(false); + + t.scheduleAtFixedRate(new TimerTask() + { + @Override + public void run() + { + try + { + reporter.setStatus("Running " + className); } - return result; + catch (IOException e) + { + e.printStackTrace(); + } + } + }, 0, 10000); + + try + { + int result = doMain(words); + + reporter.setStatus("Done running " + className + ": " + result); + + if (result != 0) + { + throw new IOException(className + " returned non-null: " + + result + ", check logs."); + } } + finally + { + t.cancel(); + } + } - public void configure(final JobConf j) { - this.job = j; - } + /** + * Call {@link ToolBase#doMain(org.apache.hadoop.conf.Configuration, String[])} + * on the passed classname. + * @param args + * @return Result from call to doMain. 
+ */ + private int doMain(final String [] args) + { + final String className = args[0]; + + // Redo args so absent our 'class' command. + String [] newArgs = Nutchwax.rewriteArgs(args, 1); + int result = -1; + + try + { + Object obj = Class.forName(className).newInstance(); + result = ((ToolBase)obj).doMain(this.job, newArgs); + } + catch (Exception e) + { + LOG.error(className, e); + } + + return result; + } - public void close() throws IOException { - // TODO Auto-generated method stub - } + public void configure(final JobConf j) + { + this.job = j; + } - public static class MultipleInputFormat implements InputFormat { - - public RecordReader getRecordReader(final InputSplit split, - final JobConf job, final Reporter reporter) - throws IOException { - // Only one record/line to read. - return new RecordReader() { - private final String line = ((LineInputSplit)split).line; - private boolean read = false; - - public void close() throws IOException { - // TODO Auto-generated method stub - } + public void close() throws IOException + { + // TODO Auto-generated method stub + } - public WritableComparable createKey() { - return new Text(""); - } + public static class MultipleInputFormat implements InputFormat + { + public RecordReader getRecordReader(final InputSplit split, + final JobConf job, final Reporter reporter) + throws IOException + { + // Only one record/line to read. + return new RecordReader() + { + private final String line = ((LineInputSplit)split).line; + private boolean read = false; + + public void close() throws IOException + { + // TODO Auto-generated method stub + } - public Writable createValue() { - return new Text(""); - } + public WritableComparable createKey() + { + return new Text(""); + } - public long getPos() throws IOException { - return 0; - } + public Writable createValue() { + return new Text(""); + } - public float getProgress() throws IOException { - return getPos(); - } + public long getPos() throws IOException + { + return 0; + } - public boolean next(Writable key, Writable value) - throws IOException { - if (read) { - return false; - } - read = true; - ((Text)value).set(this.line); - return true; - } - }; - } + public float getProgress() throws IOException + { + return getPos(); + } - public InputSplit[] getSplits(JobConf job, int numSplits) - throws IOException { - Path[] inputs = job.getInputPaths(); - List<String> lines = new ArrayList<String>(); - for (int i = 0; i < inputs.length; i++) { - Path p = inputs[i]; - FileSystem fs = p.getFileSystem(job); - Path [] ps = fs.listPaths(p); - for (int j = 0; j < ps.length; j++) { - if (fs.isDirectory(ps[j])) { - continue; - } - addFileLines(lines, fs, ps[j]); - } - } - List<LineInputSplit> splits = - new ArrayList<LineInputSplit>(lines.size()); - for (String line: lines) { - splits.add(new LineInputSplit(line)); - } - job.setNumMapTasks(lines.size()); - return splits.toArray(new LineInputSplit [0]); - } - - private void addFileLines(final List<String> lines, final FileSystem fs, - final Path p) - throws IOException { - InputStream is = (InputStream)fs.open(p); - LineNumberReader lnr = null; - try { - lnr = new LineNumberReader(new InputStreamReader(is)); - for (String l = null; (l = lnr.readLine()) != null;) { - if (l.length() > 0 && !l.trim().startsWith("#")) { - lines.add(l); - } - } - } finally { - if (lnr != null) { - lnr.close(); - } - is.close(); - } - } + public boolean next(Writable key, Writable value) + throws IOException + { + if (read) + { + return false; + } + + read = true; + + 
((Text)value).set(this.line); - public void validateInput(JobConf job) throws IOException { - // Nothing to validate. - } - } - - public static class LineInputSplit implements InputSplit { - private String line; - - protected LineInputSplit() { - super(); - } - - public LineInputSplit(final String l) { - line = l; - } - - public long getLength() throws IOException { - return line.length(); - } + return true; + } + }; + } - public String[] getLocations() throws IOException { - return new String[0]; - } + public InputSplit[] getSplits(JobConf job, int numSplits) + throws IOException + { + Path[] inputs = job.getInputPaths(); - public void readFields(DataInput in) throws IOException { - this.line = in.readLine(); - } + List<String> lines = new ArrayList<String>(); - public void write(DataOutput out) throws IOException { - out.writeBytes(this.line); - } - } - - public static void usage() { - System.out.println("Usage: multiple <input> <output>"); - System.out.println("Runs concurrently all commands listed in " + - "<inputs>."); - System.out.println("Arguments:"); - System.out.println(" <input> Directory of input files with " + - "each line describing task to run"); - System.out.println(" <output> Output directory."); - System.out.println("Example input lines:"); - System.out.println(); - System.out.println(" An input line to specify a merge would look " + - "like:"); - System.out.println(); - System.out.println(" org.apache.nutch.indexer.IndexMerger " + - "-workingdir /3/hadoop-tmp index-monday indexes-monday"); - System.out.println(); - System.out.println(" Note that named class must implement " + - "org.apache.hadoop.util.ToolBase"); - System.out.println(); - System.out.println(" To copy from " + - "hdfs://HOST:PORT/user/stack/index-monday to"); - System.out.println( " file:///0/searcher.dir/index:"); - System.out.println(); - System.out.println(" org.apache.hadoop.fs.FsShell " + - "/user/stack/index-monday /0/searcher.dir/index"); - System.out.println(); - System.out.println(" org.apache.nutch.indexer.IndexSorter " + - "/home/stack/tmp/crawl"); - System.out.println(); - System.out.println(" Note that IndexSorter refers to local " + - "filesystem and not to hdfs and is RAM-bound. 
Set"); - System.out.println(" task child RAM with the mapred.child.java.opts " + - "property in your hadoop-site.xml."); + for (int i = 0; i < inputs.length; i++) + { + Path p = inputs[i]; + FileSystem fs = p.getFileSystem(job); + Path [] ps = fs.listPaths(p); + + for (int j = 0; j < ps.length; j++) + { + if (fs.isDirectory(ps[j])) + { + continue; + } + + addFileLines(lines, fs, ps[j]); + } + } + + List<LineInputSplit> splits = + new ArrayList<LineInputSplit>(lines.size()); - } - - public int run(String[] args) throws Exception { - if (args.length != 2 || - (args.length == 1 && - (args[0].equals("-h") || args[0].equals("--help")))) { - usage(); - return -1; - } - JobConf job = new JobConf(MultipleInputFormat.class); - job.setInputFormat(MultipleInputFormat.class); - job.setInputPath(new Path(args[0])); - job.setMapperClass(Multiple.class); - job.setOutputPath(new Path(args[1])); - JobClient.runJob(job); - return 0; - } - - public static void main(String[] args) throws Exception { - int res = new Multiple().doMain(NutchConfiguration.create(), args); - System.exit(res); - } + for (String line: lines) + { + splits.add(new LineInputSplit(line)); + } + + job.setNumMapTasks(lines.size()); + + return splits.toArray(new LineInputSplit [0]); + } + + private void addFileLines(final List<String> lines, final FileSystem fs, + final Path p) + throws IOException + { + InputStream is = (InputStream)fs.open(p); + LineNumberReader lnr = null; + + try + { + lnr = new LineNumberReader(new InputStreamReader(is)); + + for (String l = null; (l = lnr.readLine()) != null;) + { + if (l.length() > 0 && !l.trim().startsWith("#")) + { + lines.add(l); + } + } + } + finally + { + if (lnr != null) + { + lnr.close(); + } + + is.close(); + } + } + + public void validateInput(JobConf job) throws IOException + { + // Nothing to validate. 
+ } + } + + public static class LineInputSplit implements InputSplit + { + private String line; + + protected LineInputSplit() + { + super(); + } + + public LineInputSplit(final String l) + { + line = l; + } + + public long getLength() throws IOException + { + return line.length(); + } + + public String[] getLocations() throws IOException + { + return new String[0]; + } + + public void readFields(DataInput in) throws IOException + { + this.line = in.readLine(); + } + + public void write(DataOutput out) throws IOException + { + out.writeBytes(this.line); + } + } + + public static void usage() + { + System.out.println("Usage: multiple <input> <output>"); + System.out.println("Runs concurrently all commands listed in " + + "<inputs>."); + System.out.println("Arguments:"); + System.out.println(" <input> Directory of input files with " + + "each line describing task to run"); + System.out.println(" <output> Output directory."); + System.out.println("Example input lines:"); + System.out.println(); + System.out.println(" An input line to specify a merge would look like:"); + System.out.println(); + System.out.println(" org.apache.nutch.indexer.IndexMerger " + + "-workingdir /3/hadoop-tmp index-monday indexes-monday"); + System.out.println(); + System.out.println(" Note that named class must implement " + + "org.apache.hadoop.util.ToolBase"); + System.out.println(); + System.out.println(" To copy from " + + "hdfs://HOST:PORT/user/stack/index-monday to"); + System.out.println( " file:///0/searcher.dir/index:"); + System.out.println(); + System.out.println(" org.apache.hadoop.fs.FsShell " + + "/user/stack/index-monday /0/searcher.dir/index"); + System.out.println(); + System.out.println(" org.apache.nutch.indexer.IndexSorter " + + "/home/stack/tmp/crawl"); + System.out.println(); + System.out.println(" Note that IndexSorter refers to local " + + "filesystem and not to hdfs and is RAM-bound. Set"); + System.out.println(" task child RAM with the mapred.child.java.opts " + + "property in your hadoop-site.xml."); + } + + public int run(String[] args) throws Exception + { + if (args.length != 2 || + (args.length == 1 && + (args[0].equals("-h") || args[0].equals("--help")))) + { + usage(); + return -1; + } + + JobConf job = new JobConf(MultipleInputFormat.class); + job.setInputFormat(MultipleInputFormat.class); + job.setInputPath(new Path(args[0])); + job.setMapperClass(Multiple.class); + job.setOutputPath(new Path(args[1])); + + JobClient.runJob(job); + + return 0; + } + + public static void main(String[] args) throws Exception + { + int res = new Multiple().doMain(NutchConfiguration.create(), args); + + System.exit(res); + } } \ No newline at end of file Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java 2007-07-26 21:53:47 UTC (rev 1895) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/Nutchwax.java 2007-08-01 21:44:31 UTC (rev 1896) @@ -53,677 +53,916 @@ /** * Script to run all indexing jobs from index through merge of final index. 
*/ -public class Nutchwax { - public static final Log LOG = - LogFactory.getLog(Nutchwax.class.getName()); +public class Nutchwax +{ + public static final Log LOG = + LogFactory.getLog(Nutchwax.class.getName()); - private static final String KEY_COLLECTION_PREFIX = "c="; - private static final String KEY_COLLECTION_SUFFIX = ",u="; - private static final Pattern COLLECTION = - Pattern.compile("^\\s*c=([^,]+),u=(.*)\\s*", Pattern.DOTALL); + private static final String KEY_COLLECTION_PREFIX = "c="; + private static final String KEY_COLLECTION_SUFFIX = ",u="; + private static final Pattern COLLECTION = + Pattern.compile("^\\s*c=([^,]+),u=(.*)\\s*", Pattern.DOTALL); - private final static List JOBS = Arrays.asList(new String[] { - "import", "update", "invert", "index", "dedup", "merge", "all", - "class", "search", "multiple"}); + private final static List JOBS = Arrays.asList(new String[] { + "import", "update", "invert", "index", "dedup", "merge", "all", + "class", "search", "multiple"}); - // Lazy initialize these two variables to delay complaint about hadoop not - // being present -- if its not. Meantime I get command-line processing - // done. - private FileSystem fs = null; - private JobConf conf = null; + // Lazy initialize these two variables to delay complaint about hadoop not + // being present -- if its not. Meantime I get command-line processing + // done. + private FileSystem fs = null; + private JobConf conf = null; - /** - * Default constructor. - * @throws IOException - */ - public Nutchwax() throws IOException { - super(); - } + /** + * Default constructor. + * @throws IOException + */ + public Nutchwax() throws IOException + { + super(); + } - public synchronized JobConf getJobConf() { - if (this.conf == null) { - this.conf = new JobConf(NutchwaxConfiguration.getConfiguration()); - } - return this.conf; + public synchronized JobConf getJobConf() + { + if (this.conf == null) + { + this.conf = new JobConf(NutchwaxConfiguration.getConfiguration()); } + + return this.conf; + } - public synchronized FileSystem getFS() throws IOException { - if (this.fs == null) { - this.fs = FileSystem.get(getJobConf()); - } - return this.fs; + public synchronized FileSystem getFS() throws IOException + { + if (this.fs == null) + { + this.fs = FileSystem.get(getJobConf()); } + + return this.fs; + } - protected class OutputDirectories { - private final Path output; - private final Path crawlDb; - private final Path linkDb; - private final Path segments; - private final Path indexes; - private final Path index; - private final Path tmpDir; + protected class OutputDirectories + { + private final Path output; + private final Path crawlDb; + private final Path linkDb; + private final Path segments; + private final Path indexes; + private final Path index; + private final Path tmpDir; - public OutputDirectories(final Path output) throws IOException { - this.output = output; - this.crawlDb = new Path(output + "/crawldb"); - this.linkDb = new Path(output + "/linkdb"); - this.segments = new Path(output + "/segments"); - this.indexes = new Path(output + "/indexes"); - this.index = new Path(output + "/index"); - this.tmpDir = getJobConf().getLocalPath("mapred.temp.dir", - Generator.generateSegmentName()); - } + public OutputDirectories(final Path output) throws IOException + { + this.output = output; + this.crawlDb = new Path(output + "/crawldb"); + this.linkDb = new Path(output + "/linkdb"); + this.segments = new Path(output + "/segments"); + this.indexes = new Path(output + "/indexes"); + this.index = new 
Path(output + "/index"); + this.tmpDir = getJobConf().getLocalPath("mapred.temp.dir", + Generator.generateSegmentName()); + } - public Path getCrawlDb() { - return crawlDb; - } + public Path getCrawlDb() + { + return crawlDb; + } - public Path getIndexes() { - return indexes; - } + public Path getIndexes() + { + return indexes; + } - public Path getLinkDb() { - return linkDb; - } + public Path getLinkDb() + { + return linkDb; + } - public Path getSegments() { - return segments; - } + public Path getSegments() + { + return segments; + } - public Path getTmpDir() { - return tmpDir; - } + public Path getTmpDir() + { + return tmpDir; + } - public Path getIndex() { - return index; - } + public Path getIndex() + { + return index; + } - public Path getOutput() { - return output; - } + public Path getOutput() + { + return output; } + } - /** - * Run passed list of mapreduce indexing jobs. Jobs are always run in - * order: import, update, etc. - * - * @throws Exception - */ - protected void doAll(final Path input, final String collectionName, - final OutputDirectories od) - throws Exception { - doImport(input, collectionName, od); - doUpdate(od); - doInvert(od); - doIndexing(od); - doDedup(od); - doMerge(od); - LOG.info("Nutchwax finished."); - } + /** + * Run passed list of mapreduce indexing jobs. Jobs are always run in + * order: import, update, etc. + * + * @throws Exception + */ + protected void doAll(final Path input, final String collectionName, + final OutputDirectories od) + throws Exception + { + doImport(input, collectionName, od); + doUpdate(od); + doInvert(od); + doIndexing(od); + doDedup(od); + doMerge(od); + + LOG.info("Nutchwax finished."); + } - protected void doImport(final Path input, String collectionName, - final OutputDirectories od) - throws IOException { - Path segment = new Path(od.getSegments(), - Generator.generateSegmentName() + - ((collectionName == null || collectionName.length() <= 0)? - "": "-" + collectionName)); - new ImportArcs(getJobConf()).importArcs(input, segment, - collectionName); - } + protected void doImport(final Path input, String collectionName, + final OutputDirectories od) + throws IOException + { + Path segment = new Path(od.getSegments(), + Generator.generateSegmentName() + + ((collectionName == null || collectionName.length() <= 0)? + "": "-" + collectionName)); + + new ImportArcs(getJobConf()).importArcs(input, segment, collectionName); + } - protected void doUpdate(final OutputDirectories od) - throws IOException { - doUpdate(od, null); + protected void doUpdate(final OutputDirectories od) + throws IOException + { + doUpdate(od, null); + } + + protected void doUpdate(final OutputDirectories od, + final String[] segments) + throws IOException + { + LOG.info("updating crawldb " + od.getCrawlDb()); + + // Need to make sure the db dir exists before progressing. + Path dbPath = new Path(od.getCrawlDb(), CrawlDb.CURRENT_NAME); + + if (!getFS().exists(dbPath)) + { + getFS().mkdirs(dbPath); } - - protected void doUpdate(final OutputDirectories od, - final String[] segments) - throws IOException { - LOG.info("updating crawldb " + od.getCrawlDb()); - // Need to make sure the db dir exists before progressing. 
- Path dbPath = new Path(od.getCrawlDb(), CrawlDb.CURRENT_NAME); - if (!getFS().exists(dbPath)) { - getFS().mkdirs(dbPath); + + CrawlDb cdb = new NutchwaxCrawlDb(getJobConf()); + + if (segments != null) + { + List<Path> paths = new ArrayList<Path>(segments.length); + + for (int i = 0; i < segments.length; i++) + { + Path p = new Path(segments[i]); + + if (!getFS().exists(p)) + { + throw new FileNotFoundException(p.toString()); } - CrawlDb cdb = new NutchwaxCrawlDb(getJobConf()); - if (segments != null) { - List<Path> paths = new ArrayList<Path>(segments.length); - for (int i = 0; i < segments.length; i++) { - Path p = new Path(segments[i]); - if (!getFS().exists(p)) { - throw new FileNotFoundException(p.toString()); - } - paths.add(p); - } - cdb.update(od.getCrawlDb(), paths.toArray(new Path[paths.size()]), - true, true); - } else { - Path[] allSegments = getSegments(od); - // This just does the last segment created. - cdb.update(od.getCrawlDb(), - new Path[] {allSegments[allSegments.length - 1]}, true, true); - } + + paths.add(p); + } + + cdb.update(od.getCrawlDb(), paths.toArray(new Path[paths.size()]), + true, true); } + else + { + Path[] allSegments = getSegments(od); + + // This just does the last segment created. + cdb.update(od.getCrawlDb(), + new Path[] {allSegments[allSegments.length - 1]}, true, true); + } + } - protected Path [] getSegments(final OutputDirectories od) - throws IOException { - Path[] allSegments = getFS().listPaths(od.getSegments()); - if (allSegments == null || allSegments.length <= 0) { - throw new FileNotFoundException(od.getSegments().toString()); - } - return allSegments; + protected Path [] getSegments(final OutputDirectories od) + throws IOException + { + Path[] allSegments = getFS().listPaths(od.getSegments()); + + if (allSegments == null || allSegments.length <= 0) + { + throw new FileNotFoundException(od.getSegments().toString()); } + + return allSegments; + } - protected void doInvert(final OutputDirectories od, final Path [] segments) - throws IOException { - createLinkdb(od); - new NutchwaxLinkDb(getJobConf()). - invert(od.getLinkDb(), segments, true, true, false); - } + protected void doInvert(final OutputDirectories od, final Path [] segments) + throws IOException + { + createLinkdb(od); + + new NutchwaxLinkDb(getJobConf()). + invert(od.getLinkDb(), segments, true, true, false); + } - protected void doInvert(final OutputDirectories od) - throws IOException { - LOG.info("inverting links in " + od.getSegments()); - new NutchwaxLinkDb(getJobConf()). - invert(od.getLinkDb(), getSegments(od), true, true, false); + protected void doInvert(final OutputDirectories od) + throws IOException + { + LOG.info("inverting links in " + od.getSegments()); + + new NutchwaxLinkDb(getJobConf()). + invert(od.getLinkDb(), getSegments(od), true, true, false); + } + + protected boolean createLinkdb(final OutputDirectories od) + throws IOException + { + boolean result = false; + + // Make sure the linkdb exists. Otherwise the install where + // the temporary location gets moved to the permanent fails. + if (getFS().mkdirs(new Path(od.getLinkDb(), + NutchwaxLinkDb.CURRENT_NAME))) + { + LOG.info("Created " + od.getLinkDb()); + + result = true; } + + return result; + } - protected boolean createLinkdb(final OutputDirectories od) - throws IOException { - boolean result = false; - // Make sure the linkdb exists. Otherwise the install where - // the temporary location gets moved to the permanent fails. 
- if (getFS().mkdirs(new Path(od.getLinkDb(), - NutchwaxLinkDb.CURRENT_NAME))) { - LOG.info("Created " + od.getLinkDb()); - result = true; - } - return result; - } + protected void doIndexing(final OutputDirectories od) + throws IOException + { + doIndexing(od, getFS().listPaths(od.getSegments())); + } + + protected void doIndexing(final OutputDirectories od, + final Path [] segments) + throws IOException + { + LOG.info(" indexing " + segments); + + new NutchwaxIndexer(getJobConf()).index(od.getIndexes(), + od.getCrawlDb(), od.getLinkDb(), segments); + } + + protected void doDedup(final OutputDirectories od) throws IOException + { + LOG.info("dedup " + od.getIndex()); + + new DeleteDuplicates(getJobConf()).dedup(new Path[] {od.getIndexes()}); + } - protected void doIndexing(final OutputDirectories od) - throws IOException { - doIndexing(od, getFS().listPaths(od.getSegments())); + protected void doMerge(final OutputDirectories od) throws IOException + { + LOG.info("index merge " + od.getOutput() + " using tmpDir=" + + od.getTmpDir()); + + new IndexMerger(getJobConf()).merge(getFS().listPaths(od.getIndexes()), + od.getIndex(), od.getTmpDir()); + } + + static String [] rewriteArgs(final String [] args, final int offset) + { + final String [] newArgs = new String[args.length - offset]; + + for (int i = 0; i < args.length; i++) + { + if (i < offset) + { + continue; + } + + newArgs[i - offset] = args[i]; } - - protected void doIndexing(final OutputDirectories od, - final Path [] segments) - throws IOException { - LOG.info(" indexing " + segments); - new NutchwaxIndexer(getJobConf()).index(od.getIndexes(), - od.getCrawlDb(), od.getLinkDb(), segments); + + return newArgs; + } + + static Object doClassMain(final String [] args) + { + // Redo args so absent our nutchwax 'class' command. 
+ final String className = args[1]; + String [] newArgs = rewriteArgs(args, 2); + + // From http://www.javaworld.com/javaworld/javaqa/1999-06/01-outside.html + Class [] argTypes = new Class[1]; + argTypes[0] = String[].class; + Object result = null; + + try + { + Method mainMethod = + Class.forName(className).getDeclaredMethod("main", argTypes); + result = mainMethod.invoke(newArgs, new Object [] {newArgs}); } + catch (Throwable t) + { + t.printStackTrace(); + } + + return result; + } - protected void doDedup(final OutputDirectories od) throws IOException { - LOG.info("dedup " + od.getIndex()); - new DeleteDuplicates(getJobConf()).dedup(new Path[] {od.getIndexes()}); + protected Object doSearch(final String [] args) + { + String [] newArgs = new String[args.length + 1]; + newArgs[0] = args[0]; + newArgs[1] = NutchwaxBean.class.getName(); + + for (int i = 1; i < args.length; i++) + { + newArgs[i + 1] = args[i]; } + + return doClassMain(newArgs); + } + + protected void doMultiple(final String [] args) throws Exception + { + (new Multiple()).run(rewriteArgs(args, 1)); + } - protected void doMerge(final OutputDirectories od) throws IOException { - LOG.info("index merge " + od.getOutput() + " using tmpDir=" + - od.getTmpDir()); - new IndexMerger(getJobConf()).merge(getFS().listPaths(od.getIndexes()), - od.getIndex(), od.getTmpDir()); + protected void doJob(final String jobName, final String [] args) + throws Exception + { + if (jobName.equals("import")) + { + // Usage: hadoop jar nutchwax.jar import input output name + if (args.length != 4) + { + ImportArcs.doImportUsage( + "ERROR: Wrong number of arguments passed.", 2); + } + + final Path input = new Path(args[1]); + final Path output = new Path(args[2]); + final String collectionName = args[3]; + + checkArcsDir(input); + OutputDirectories od = new OutputDirectories(output); + doImport(input, collectionName, od); } - - static String [] rewriteArgs(final String [] args, final int offset) { - final String [] newArgs = new String[args.length - offset]; - for (int i = 0; i < args.length; i++) { - if (i < offset) { - continue; - } - newArgs[i - offset] = args[i]; + else if (jobName.equals("update")) + { + // Usage: hadoop jar nutchwax.jar update output + if (args.length < 2) + { + doUpdateUsage("ERROR: Wrong number of arguments passed.", 2); + } + + OutputDirectories od = new OutputDirectories(new Path(args[1])); + + if (args.length == 2) + { + doUpdate(od); + } + else + { + for (int i = 2; i < args.length; i++) + { + doUpdate(od, new String [] {args[i]}); } - return newArgs; + } } + else if (jobName.equals("invert")) + { + // Usage: hadoop jar nutchwax.jar invert output + if (args.length < 2) + { + doInvertUsage("ERROR: Wrong number of arguments passed.", 2); + } - static Object doClassMain(final String [] args) { - // Redo args so absent our nutchwax 'class' command. 
- final String className = args[1]; - String [] newArgs = rewriteArgs(args, 2); - // From http://www.javaworld.com/javaworld/javaqa/1999-06/01-outside.html - Class [] argTypes = new Class[1]; - argTypes[0] = String[].class; - Object result = null; - try { - Method mainMethod = - Class.forName(className).getDeclaredMethod("main", argTypes); - result = mainMethod.invoke(newArgs, new Object [] {newArgs}); - } catch (Throwable t) { - t.printStackTrace(); + OutputDirectories od = new OutputDirectories(new Path(args[1])); + + if (args.length == 2) + { + doInvert(od); + } + else + { + final int offset = 2; + Path [] segments = new Path[args.length - offset]; + + for (int i = offset; i < args.length; i++) + { + Path f = new Path(args[i]); + + if (! getFS().exists(f)) + { + throw new FileNotFoundException(f.toString()); + } + + segments[i - offset] = f; } - return result; + + doInvert(od, segments); + } } - - protected Object doSearch(final String [] args) { - String [] newArgs = new String[args.length + 1]; - newArgs[0] = args[0]; - newArgs[1] = NutchwaxBean.class.getName(); - for (int i = 1; i < args.length; i++) { - newArgs[i + 1] = args[i]; + else if (jobName.equals("index")) + { + // Usage: hadoop jar nutchwax.jar index output + if (args.length < 2) + { + doIndexUsage("ERROR: Wrong number of arguments passed.", 2); + } + + OutputDirectories od = new OutputDirectories(new Path(args[1])); + + if (args.length == 2) + { + doIndexing(od); + } + else + { + final int offset = 2; + Path [] segments = new Path[args.length - offset]; + + for (int i = offset; i < args.length; i++) + { + Path f = new Path(args[i]); + + if (! getFS().exists(f)) + { + throw new FileNotFoundException(f.toString()); + } + + segments[i - offset] = f; } - return doClassMain(newArgs); + + doIndexing(od, segments); + } } - - protected void doMultiple(final String [] args) throws Exception { - (new Multiple()).run(rewriteArgs(args, 1)); + else if (jobName.equals("dedup")) + { + // Usage: hadoop jar nutchwax.jar dedup output + if (args.length != 2) + { + doDedupUsage("Wrong number of arguments passed.", 2); + } + + doDedup(new OutputDirectories(new Path(args[1]))); } - - protected void doJob(final String jobName, final String [] args) - throws Exception { - if (jobName.equals("import")) { - // Usage: hadoop jar nutchwax.jar import input output name - if (args.length != 4) { - ImportArcs.doImportUsage( - "ERROR: Wrong number of arguments passed.", 2); - } - final Path input = new Path(args[1]); - final Path output = new Path(args[2]); - final String collectionName = args[3]; - checkArcsDir(input); - OutputDirectories od = new OutputDirectories(output); - doImport(input, collectionName, od); - } else if (jobName.equals("update")) { - // Usage: hadoop jar nutchwax.jar update output - if (args.length < 2) { - doUpdateUsage("ERROR: Wrong number of arguments passed.", 2); - } - OutputDirectories od = new OutputDirectories(new Path(args[1])); - if (args.length == 2) { - doUpdate(od); - } else { - for (int i = 2; i < args.length; i++) { - doUpdate(od, new String [] {args[i]}); - } - } - } else if (jobName.equals("invert")) { - // Usage: hadoop jar nutchwax.jar invert output - if (args.length < 2) { - doInvertUsage("ERROR: Wrong number of arguments passed.", 2); - } - OutputDirectories od = new OutputDirectories(new Path(args[1])); - if (args.length == 2) { - doInvert(od); - } else { - final int offset = 2; - Path [] segments = new Path[args.length - offset]; - for (int i = offset; i < args.length; i++) { - Path f = new Path(args[i]); - if 
(!getFS().exists(f)) { - throw new FileNotFoundException(f.toString()); - } - segments[i - offset] = f; - } - doInvert(od, segments); - } - } else if (jobName.equals("index")) { - // Usage: hadoop jar nutchwax.jar index output - if (args.length < 2) { - doIndexUsage("ERROR: Wrong number of arguments passed.", 2); - } - OutputDirectories od = new OutputDirectories(new Path(args[1])); - if (args.length == 2) { - doIndexing(od); - } else { - final int offset = 2; - Path [] segments = new Path[args.length - offset]; - for (int i = offset; i < args.length; i++) { - Path f = new Path(args[i]); - if (!getFS().exists(f)) { - throw new FileNotFoundException(f.toString()); - } - segments[i - offset] = f; - } - doIndexing(od, segments); - } - } else if (jobName.equals("dedup")) { - // Usage: hadoop jar nutchwax.jar dedup output - if (args.length != 2) { - doDedupUsage("Wrong number of arguments passed.", 2); - } - doDedup(new OutputDirectories(new Path(args[1]))); - } else if (jobName.equals("merge")) { - // Usage: hadoop jar nutchwax.jar merge output"); - if (args.length != 2) { - doMergeUsage("ERROR: Wrong number of arguments passed.", 2); - } - doMerge(new OutputDirectories(new Path(args[1]))); - } else if (jobName.equals("all")) { - // Usage: hadoop jar nutchwax.jar import input output name - if (args.length != 4) { - doAllUsage("ERROR: Wrong number of arguments passed.", 2); - } - final Path input = new Path(args[1]); - final Path output = new Path(args[2]); - final String collectionName = args[3]; - checkArcsDir(input); - OutputDirectories od = new OutputDirectories(output); - doAll(input, collectionName, od); - } else if (jobName.equals("class")) { - if (args.length < 2) { - doClassUsage("ERROR: Wrong number of arguments passed.", 2); - } - doClassMain(args); - } else if (jobName.equals("search")) { - if (args.length < 1) { - doClassUsage("ERROR: Wrong number of arguments passed.", 2); - } - doSearch(args); - } else if (jobName.equals("multiple")) { - doMultiple(args); - } else { - usage("ERROR: No handler for job name " + jobName, 4); - System.exit(0); - } + else if (jobName.equals("merge")) + { + // Usage: hadoop jar nutchwax.jar merge output"); + if (args.length != 2) + { + doMergeUsage("ERROR: Wrong number of arguments passed.", 2); + } + + doMerge(new OutputDirectories(new Path(args[1]))); } + else if (jobName.equals("all")) + { + // Usage: hadoop jar nutchwax.jar import input output name + if (args.length != 4) + { + doAllUsage("ERROR: Wrong number of arguments passed.", 2); + } - /** - * Check the arcs dir exists and looks like it has files that list ARCs - * (rather than ARCs themselves). - * - * @param arcsDir Directory to examine. 
- * @throws IOException - */ - protected void checkArcsDir(final Path arcsDir) - throws IOException { - if (!getFS().exists(arcsDir)) { - throw new IOException(arcsDir + " does not exist."); - } - if (!fs.isDirectory(arcsDir)) { - throw new IOException(arcsDir + " is not a directory."); - } + final Path input = new Path(args[1]); + final Path output = new Path(args[2]); + final String collectionName = args[3]; - final Path [] files = getFS().listPaths(arcsDir); - for (int i = 0; i < files.length; i++) { - if (!getFS().isFile(files[i])) { - throw new IOException(files[i] + " is not a file."); - } - if (files[i].getName().toLowerCase().endsWith(".arc.gz")) { - throw new IOException(files[i] + " is an ARC file (ARCSDIR " + - "should contain text file listing ARCs rather than " + - "actual ARCs)."); - } - } + checkArcsDir(input); + + OutputDirectories od = new OutputDirectories(output); + + doAll(input, collectionName, od); } + else if (jobName.equals("class")) + { + if (args.length < 2) + { + doClassUsage("ERROR: Wrong number of arguments passed.", 2); + } + + doClassMain(args); + } + else if (jobName.equals("search")) + { + if (args.length < 1) + { + doClassUsage("ERROR: Wrong number of arguments passed.", 2); + } + + doSearch(args); + } + else if (jobName.equals("multiple")) + { + doMultiple(args); + } + else + { + usage("ERROR: No handler for job name " + jobName, 4); + System.exit(0); + } + } + + /** + * Check the arcs dir exists and looks like it has files that list ARCs + * (rather than ARCs themselves). + * + * @param arcsDir Directory to examine. + * @throws IOException + */ + protected void checkArcsDir(final Path arcsDir) + throws IOException + { + if (! getFS().exists(arcsDir)) + { + throw new IOException(arcsDir + " does not exist."); + } + + if (! fs.isDirectory(arcsDir)) + { + throw new IOException(arcsDir + " is not a directory."); + } + + final Path [] files = getFS().listPaths(arcsDir); + + for (int i = 0; i < files.length; i++) + { + if (! getFS().isFile(files[i])) + { + throw new IOException(files[i] + " is not a file."); + } + + if (files[i].getName().toLowerCase().endsWith(".arc.gz")) + { + throw new IOException(files[i] + " is an ARC file (ARCSDIR " + + "should contain text file listing ARCs rather than " + + "actual ARCs)."); + } + } + } + + public static Text generateWaxKey(WritableComparable key, + final String collection) + { + return generateWaxKey(key.toString(), collection); + } - public static Text generateWaxKey(WritableComparable key, - final String collection) { - return generateWaxKey(key.toString(), collection); + public static Text generateWaxKey(final String keyStr, + final String collection) + { + if (collection == null) + { + throw new NullPointerException("Collection is null for " + keyStr); } - public static Text generateWaxKey(final String keyStr, - final String collection) { - if (collection == null) { - throw new NullPointerException("Collection is null for " + keyStr); - } - if (keyStr == null) { - throw new NullPointerException("keyStr is null"); - } - if (keyStr.startsWith(KEY_COLLECTION_PREFIX)) { - LOG.warn("Key already has collection prefix: " + keyStr - + ". Skipping."); - return new Text(keyStr); - } + if (keyStr == null) + { + throw new NullPointerException("keyStr is null"); + } + + if (keyStr.startsWith(KEY_COLLECTION_PREFIX)) + { + LOG.warn("Key already has collection prefix: " + keyStr + + ". 
Skipping."); - return new Text(KEY_COLLECTION_PREFIX + collection.trim() + - KEY_COLLECTION_SUFFIX + keyStr.trim()); + return new Text(keyStr); } + + return new Text(KEY_COLLECTION_PREFIX + collection.trim() + + KEY_COLLECTION_SUFFIX + keyStr.trim()); + } + + public static String getCollectionFromWaxKey(final WritableComparable key) + throws IOException + { + Matcher m = COLLECTION.matcher(key.toString()); - public static String getCollectionFromWaxKey(final WritableComparable key) - throws IOException { - Matcher m = COLLECTION.matcher(key.toString()); - if (m == null || !m.matches()) { - throw new IOException("Key doesn't have collection " + - "prefix <" + key.toString() + ">"); - } - return m.group(1); + if (m == null || !m.matches()) + { + throw new IOException("Key doesn't have collection " + + "prefix <" + key.toString() + ">"); } - public static String getUrlFromWaxKey(final WritableComparable key) - throws IOException { - Matcher m = COLLECTION.matcher(key.toString()); - if (m == null || !m.matches()) { - throw new IOException("Key doesn't have collection " + - " prefix: " + key); - } - return m.group(2); + return m.group(1); + } + + public static String getUrlFromWaxKey(final WritableComparable key) + throws IOException + { + Matcher m = COLLECTION.matcher(key.toString()); + + if (m == null || !m.matches()) + { + throw new IOException("Key doesn't have collection " + + " prefix: " + key); } - public static long getDate(String d) - throws IOException { - long date = 0; - try { - date = ArchiveUtils.getDate(d).getTime(); - } catch (final java.text.ParseException e) { - throw new IOException("Failed parse of date: " + d + ": " + - e.getMessage()); - } - // Date can be < 0 if pre-1970 (Seen in some old ARCs). - return date >= 0? date: 0; + return m.group(2); + } + + public static long getDate(String d) throws IOException + { + long date = 0; + + try + { + date = ArchiveUtils.getDate(d).getTime(); } + catch (final java.text.ParseException e) + { + throw new IOException("Failed parse of date: " + d + ": " + + e.getMessage()); + } + + // Date can be < 0 if pre-1970 (Seen in some old ARCs). + return date >= 0? 
date: 0; + } - public static void usage(final String message, final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } + public static void usage(final String message, final int exitCode) + { + if (message != null && message.length() > 0) + { + System.out.println(message); + } - System.out.println("Usage: hadoop jar nutchwax.jar <job> [args]"); - System.out.println("Launch NutchWAX job(s) on a hadoop platform."); - System.out.println("Type 'hadoop jar nutchwax.jar help <job>' for" + - " help on a specific job."); - System.out.println("Jobs (usually) must be run in the order " + - "listed below."); - System.out.println("Available jobs:"); - System.out.println(" import Import ARCs."); - System.out.println(" update Update dbs with recent imports."); - System.out.println(" invert Invert links."); - System.out.println(" index Index segments."); - System.out.println(" dedup Deduplicate by URL or content MD5."); - System.out.println(" merge Merge segment indices into one."); - System.out.println(" all Runs all above jobs in order."); - System.out.println(" class Run the passed class's main."); - System.out.println(" search Run a query against index under " + - "property 'searcher.dir'"); - System.out.println(" multiple Run multiple concurrent tasks."); - System.exit(exitCode); - } + System.out.println("Usage: hadoop jar nutchwax.jar <job> [args]"); + System.out.println("Launch NutchWAX job(s) on a hadoop platform."); + System.out.println("Type 'hadoop jar nutchwax.jar help <job>' for" + + " help on a specific job."); + System.out.println("Jobs (usually) must be run in the order " + + "listed below."); + System.out.println("Available jobs:"); + System.out.println(" import Import ARCs."); + System.out.println(" update Update dbs with recent imports."); + System.out.println(" invert Invert links."); + System.out.println(" index Index segments."); + System.out.println(" dedup Deduplicate by URL or content MD5."); + System.out.println(" merge Merge segment indices into one."); + System.out.println(" all Runs all above jobs in order."); + System.out.println(" class Run the passed class's main."); + System.out.println(" search Run a query against index under " + + "property 'searcher.dir'"); + System.out.println(" multiple Run multiple concurrent tasks."); - public static void doUpdateUsage(final String message, - final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } - System.out.println("Usage: hadoop jar nutchwax.jar update <output> " + - "[<segments>...]"); - System.out.println("Arguments:"); - System.out.println(" output Directory to write crawldb under."); - System.out.println("Options:"); - System.out.println(" segments List of segments to update crawldb " + - "with. 
If none supplied, updates"); - System.out.println(" using latest segment found."); - System.exit(exitCode); + System.exit(exitCode); + } + + public static void doUpdateUsage(final String message, + final int exitCode) + { + if (message != null && message.length() > 0) + { + System.out.println(message); } - public static void doInvertUsage(final String message, - final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } - System.out.println("Usage: hadoop jar nutchwax.jar invert <output> " + - "[<segments>...]"); - System.out.println("Arguments:"); - System.out.println(" output Directory to write linkdb under."); - System.out.println("Options:"); - System.out.println(" segments List of segments to update linkdb " + - "with. If none supplied, all under"); - System.out.println(" '<output>/segments/' " + - "are passed."); - System.exit(exitCode); + System.out.println("Usage: hadoop jar nutchwax.jar update <output> " + + "[<segments>...]"); + System.out.println("Arguments:"); + System.out.println(" output Directory to write crawldb under."); + System.out.println("Options:"); + System.out.println(" segments List of segments to update crawldb " + + "with. If none supplied, updates"); + System.out.println(" using latest segment found."); + + System.exit(exitCode); + } + + public static void doInvertUsage(final String message, + final int exitCode) + { + if (message != null && message.length() > 0) + { + System.out.println(message); } - public static void doIndexUsage(final String message, - final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } - System.out.println("Usage: hadoop jar nutchwax.jar index <output> " + - "[<segments>...]"); - System.out.println("Arguments:"); - System.out.println(" output Directory to write indexes under."); - System.out.println("Options:"); - System.out.println(" segments List of segments to index. " + - "If none supplied, all under"); - System.out.println(" '<output>/segments/' " + - "are indexed."); - System.exit(exitCode); + System.out.println("Usage: hadoop jar nutchwax.jar invert <output> " + + "[<segments>...]"); + System.out.println("Arguments:"); + System.out.println(" output Directory to write linkdb under."); + System.out.println("Options:"); + System.out.println(" segments List of segments to update linkdb " + + "with. If none supplied, all under"); + System.out.println(" '<output>/segments/' " + + "are passed."); + + System.exit(exitCode); + } + + public static void doIndexUsage(final String message, + final int exitCode) + { + if (message != null && message.length() > 0) + { + System.out.println(message); } - public static void doDedupUsage(final String message, - final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } - System.out.println("Usage: hadoop jar nutchwax.jar dedup <output>"); - System.out.println("Arguments:"); - System.out.println(" output Directory in which indices" + - " to dedup reside."); - System.exit(exitCode); + System.out.println("Usage: hadoop jar nutchwax.jar index <output> " + + "[<segments>...]"); + System.out.println("Arguments:"); + System.out.println(" output Directory to write indexes under."); + System.out.println("Options:"); + System.out.println(" segments List of segments to index. 
" + + "If none supplied, all under"); + System.out.println(" '<output>/segments/' " + + "are indexed."); + + System.exit(exitCode); + } + + public static void doDedupUsage(final String message, + final int exitCode) + { + if (message != null && message.length() > 0) + { + System.out.println(message); } - public static void doMergeUsage(final String message, - final int exitCode) { - if (message != null && message.length() > 0) { - System.out.println(message); - } - System.out.println("Usage: hadoop jar nutchwax.jar merge <output>"); - System.out.println("Arguments:"); - System.out.println(" output Directory in which indices" + - " to merge reside."); - System.exit(exitCode); + System.out.println("Usage: hadoop jar nutchwax.jar dedup <output>"); + System.out.println("Arguments:"); + System.out.println(" output Direc... [truncated message content] |