From: Doug C. <cu...@us...> - 2005-10-20 23:30:57
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv27408/src/java/org/archive/access/nutch

Modified Files:
      Tag: mapred
    ImportArcs.java IndexArcs.java
Log Message:
Pre-au fixes.

Index: ImportArcs.java
===================================================================
RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/Attic/ImportArcs.java,v
retrieving revision 1.1.2.1
retrieving revision 1.1.2.2
diff -C2 -d -r1.1.2.1 -r1.1.2.2
*** ImportArcs.java 1 Sep 2005 18:45:29 -0000 1.1.2.1
--- ImportArcs.java 20 Oct 2005 23:30:49 -0000 1.1.2.2
***************
*** 55,60 ****
  import org.apache.nutch.parse.Parse;
  import org.apache.nutch.parse.ParseStatus;
! import org.apache.nutch.parse.Parser;
! import org.apache.nutch.parse.ParserFactory;
  import org.apache.nutch.parse.ParseImpl;
--- 55,59 ----
  import org.apache.nutch.parse.Parse;
  import org.apache.nutch.parse.ParseStatus;
! import org.apache.nutch.parse.ParseUtil;
  import org.apache.nutch.parse.ParseImpl;
***************
*** 127,135 ****
      if (arcName == null) { // first entry has arc name
!       String arcPath = new URI(rec.getMetaData().getUrl()).getPath();
!       arcName = new File(arcPath).getName();
!       if (arcName.endsWith(".arc")) {
!         arcName = arcName.substring(0, arcName.indexOf(".arc"));
!       }
        reporter.setStatus(arcName);
      }
--- 126,130 ----
      if (arcName == null) { // first entry has arc name
!       arcName = rec.getMetaData().getUrl();
        reporter.setStatus(arcName);
      }
***************
*** 211,220 ****
      Content content = new Content(url, url, contentBytes, mimetype, metaData);
-     metaData.put(Fetcher.DIGEST_KEY, MD5Hash.digest(contentBytes).toString());
-     metaData.put(Fetcher.SEGMENT_NAME_KEY, segmentName);
-
      CrawlDatum datum = new CrawlDatum();
      datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
      long date = 0;
      try {
--- 206,216 ----
      Content content = new Content(url, url, contentBytes, mimetype, metaData);
      CrawlDatum datum = new CrawlDatum();
      datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
+     metaData.put(Fetcher.DIGEST_KEY, MD5Hash.digest(contentBytes).toString());
+     metaData.put(Fetcher.SEGMENT_NAME_KEY, segmentName);
+     metaData.put(Fetcher.SCORE_KEY, Float.toString(datum.getScore()));
+
      long date = 0;
      try {
***************
*** 228,234 ****
      ParseStatus parseStatus;
      try {
!       Parser parser = ParserFactory.getParser(content.getContentType(),
!                                               content.getBaseUrl());
!       parse = parser.getParse(content);
        parseStatus = parse.getData().getStatus();
      } catch (Exception e) {
--- 224,228 ----
      ParseStatus parseStatus;
      try {
!       parse = ParseUtil.parse(content);
        parseStatus = parse.getData().getStatus();
      } catch (Exception e) {

Index: IndexArcs.java
===================================================================
RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/Attic/IndexArcs.java,v
retrieving revision 1.1.2.2
retrieving revision 1.1.2.3
diff -C2 -d -r1.1.2.2 -r1.1.2.3
*** IndexArcs.java 12 Oct 2005 16:49:04 -0000 1.1.2.2
--- IndexArcs.java 20 Oct 2005 23:30:49 -0000 1.1.2.3
***************
*** 28,31 ****
--- 28,32 ----
  import org.apache.nutch.mapred.*;
  import org.apache.nutch.crawl.*;
+ import org.apache.nutch.indexer.IndexMerger;
  public class IndexArcs {
***************
*** 51,54 ****
--- 52,56 ----
      boolean noImport = false;
+     boolean noUpdate = false;
      boolean noInvert = false;
      boolean noIndex = false;
***************
*** 57,60 ****
--- 59,64 ----
        if ("-noimport".equals(args[i])) {
          noImport = true;
+       } else if ("-noupdate".equals(args[i])) {
+         noUpdate = true;
        } else if ("-noinvert".equals(args[i])) {
          noInvert = true;
***************
*** 69,81 ****
      LOG.info("arcsDir = " + arcsDir);
      File linkDb = new File(crawlDir + "/linkdb");
      File segments = new File(crawlDir + "/segments");
      if (!noImport) { // import arcs
-       File segment = new File(segments, getDate());
        LOG.info("importing arcs in " + arcsDir + " to " + segment);
        new ImportArcs(conf).importArcs(arcsDir, segment);
      }
      if (!noInvert) { // invert links
        LOG.info("inverting links in " + segments);
--- 73,95 ----
      LOG.info("arcsDir = " + arcsDir);
+     File crawlDb = new File(crawlDir + "/crawldb");
      File linkDb = new File(crawlDir + "/linkdb");
      File segments = new File(crawlDir + "/segments");
+     File segment = new File(segments, getDate());
+     File indexes = new File(crawlDir + "/indexes");
+     File index = new File(crawlDir + "/index");
+     File tmpDir = conf.getLocalFile("crawl", getDate());
      if (!noImport) { // import arcs
        LOG.info("importing arcs in " + arcsDir + " to " + segment);
        new ImportArcs(conf).importArcs(arcsDir, segment);
      }
+     if (!noUpdate) { // update crawldb
+       LOG.info("updating crawldb in " + crawlDb);
+       new CrawlDb(conf).update(crawlDb, segment);
+     }
+
      if (!noInvert) { // invert links
        LOG.info("inverting links in " + segments);
***************
*** 84,92 ****
      if (!noIndex) { // index
-       File index = new File(crawlDir + "/indexes");
        LOG.info("indexing " + crawlDir);
!       new Indexer(conf).index(index, linkDb, fs.listFiles(segments));
      }
      LOG.info("IndexArcs finished: " + crawlDir);
    }
--- 98,108 ----
      if (!noIndex) { // index
        LOG.info("indexing " + crawlDir);
!       new Indexer(conf).index(indexes,crawlDb,linkDb,fs.listFiles(segments));
      }
+     new DeleteDuplicates(conf).dedup(new File[] { indexes });
+     new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir).merge();
+
      LOG.info("IndexArcs finished: " + crawlDir);
    }
|