From: Doug C. <cu...@us...> - 2005-10-12 16:49:13
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3474
Modified Files:
  Tag: mapred
  IndexArcs.java
Log Message:
Add some command line options.
Index: IndexArcs.java
===================================================================
RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/Attic/IndexArcs.java,v
retrieving revision 1.1.2.1
retrieving revision 1.1.2.2
diff -C2 -d -r1.1.2.1 -r1.1.2.2
*** IndexArcs.java 1 Sep 2005 18:45:29 -0000 1.1.2.1
--- IndexArcs.java 12 Oct 2005 16:49:04 -0000 1.1.2.2
***************
*** 40,45 ****
    /* Import and index a set of arc files. */
    public static void main(String args[]) throws Exception {
!     if (args.length < 1) {
!       System.out.println("Usage: IndexArcs <arcsDir> [-dir d]");
        return;
      }
--- 40,45 ----
    /* Import and index a set of arc files. */
    public static void main(String args[]) throws Exception {
!     if (args.length < 2) {
!       System.out.println("Usage: IndexArcs <arcsDir> <crawlDir> [-noimport] [-noinvert] [-noindex]");
        return;
      }
***************
*** 47,85 ****
      JobConf conf = new JobConf(NutchConf.get());
!     File arcsDir = null;
!     File dir = new File("crawl-" + getDate());
!     for (int i = 0; i < args.length; i++) {
!       if ("-dir".equals(args[i])) {
!         dir = new File(args[i+1]);
!         i++;
!       } else if (args[i] != null) {
!         arcsDir = new File(args[i]);
        }
      }
      NutchFileSystem fs = NutchFileSystem.get(conf);
-     if (fs.exists(dir)) {
-       throw new RuntimeException(dir + " already exists.");
-     }
!     LOG.info("IndexArcs started in: " + dir);
      LOG.info("arcsDir = " + arcsDir);
!     File linkDb = new File(dir + "/linkdb");
!     File index = new File(dir + "/indexes");
!     File segments = new File(dir + "/segments");
!     File segment = new File(segments, getDate());
!     // import arcs
!     new ImportArcs(conf).importArcs(arcsDir, segment);
!     // invert links
!     new LinkDb(conf).invert(linkDb, segments);
!     // index everything
!     new Indexer(conf).index(index, linkDb, fs.listFiles(segments));
!     LOG.info("IndexArcs finished: " + dir);
    }
  }
--- 47,93 ----
      JobConf conf = new JobConf(NutchConf.get());
!     File arcsDir = new File(args[0]);
!     File crawlDir = new File(args[1]);
!     boolean noImport = false;
!     boolean noInvert = false;
!     boolean noIndex = false;
!
!     for (int i = 2; i < args.length; i++) {
!       if ("-noimport".equals(args[i])) {
!         noImport = true;
!       } else if ("-noinvert".equals(args[i])) {
!         noInvert = true;
!       } else if ("-noindex".equals(args[i])) {
!         noIndex = true;
        }
      }
      NutchFileSystem fs = NutchFileSystem.get(conf);
!     LOG.info("IndexArcs started in: " + crawlDir);
      LOG.info("arcsDir = " + arcsDir);
!     File linkDb = new File(crawlDir + "/linkdb");
!     File segments = new File(crawlDir + "/segments");
!     if (!noImport) { // import arcs
!       File segment = new File(segments, getDate());
!       LOG.info("importing arcs in " + arcsDir + " to " + segment);
!       new ImportArcs(conf).importArcs(arcsDir, segment);
!     }
!     if (!noInvert) { // invert links
!       LOG.info("inverting links in " + segments);
!       new LinkDb(conf).invert(linkDb, segments);
!     }
!     if (!noIndex) { // index
!       File index = new File(crawlDir + "/indexes");
!       LOG.info("indexing " + crawlDir);
!       new Indexer(conf).index(index, linkDb, fs.listFiles(segments));
!     }
!     LOG.info("IndexArcs finished: " + crawlDir);
    }
  }
|