From: <sta...@us...> - 2007-02-21 16:19:28
|
Revision: 1507 http://archive-access.svn.sourceforge.net/archive-access/?rev=1507&view=rev Author: stack-sf Date: 2007-02-21 08:19:09 -0800 (Wed, 21 Feb 2007) Log Message: ----------- Move to nutch revision 508238 (from 492357). Includes move to hadoop 0.10.1. * . Update svn:externals. - third-party/nutch -r 492357 http://svn.apache.org/repos/asf/lucene/nutch/trunk + third-party/nutch -r 508238 http://svn.apache.org/repos/asf/lucene/nutch/trunk * src/java/org/archive/access/nutch/Nutchwax.java (invert): Add 'force removal of locks' to signature. * src/java/org/archive/access/nutch/NutchwaxIndexer.java Call parent's indexer mapper. * src/java/org/archive/access/nutch/ImportArcs.java Put collection name from command line into job conf. * src/java/org/archive/access/nutch/NutchwaxLinkDb.java Add in lock handling for linkdb from parent. Revision Links: -------------- http://archive-access.svn.sourceforge.net/archive-access/?rev=508238&view=rev Modified Paths: -------------- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/ImportArcs.java trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/Nutchwax.java trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/NutchwaxIndexer.java trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/NutchwaxLinkDb.java Property Changed: ---------------- trunk/archive-access/projects/nutchwax/ Property changes on: trunk/archive-access/projects/nutchwax ___________________________________________________________________ Name: svn:externals - third-party/nutch -r 492357 http://svn.apache.org/repos/asf/lucene/nutch/trunk + third-party/nutch -r 508238 http://svn.apache.org/repos/asf/lucene/nutch/trunk Modified: trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/ImportArcs.java =================================================================== --- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/ImportArcs.java 
2007-02-20 22:30:08 UTC (rev 1506) +++ trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/ImportArcs.java 2007-02-21 16:19:09 UTC (rev 1507) @@ -245,6 +245,8 @@ this.filters = new URLFilters(job); this.parseUtil = new ParseUtil(job); + + this.collectionName = job.get(ImportArcs.WAX_SUFFIX + ImportArcs.ARCCOLLECTION_KEY); } public void onARCOpen() { @@ -878,4 +880,4 @@ return -1; } } -} \ No newline at end of file +} Modified: trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/Nutchwax.java =================================================================== --- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/Nutchwax.java 2007-02-20 22:30:08 UTC (rev 1506) +++ trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/Nutchwax.java 2007-02-21 16:19:09 UTC (rev 1507) @@ -216,14 +216,14 @@ throws IOException { createLinkdb(od); new NutchwaxLinkDb(getJobConf()). - invert(od.getLinkDb(), segments, true, true); + invert(od.getLinkDb(), segments, true, true, false); } protected void doInvert(final OutputDirectories od) throws IOException { LOG.info("inverting links in " + od.getSegments()); new NutchwaxLinkDb(getJobConf()). 
- invert(od.getLinkDb(), getSegments(od), true, true); + invert(od.getLinkDb(), getSegments(od), true, true, false); } protected boolean createLinkdb(final OutputDirectories od) Modified: trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/NutchwaxIndexer.java =================================================================== --- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/NutchwaxIndexer.java 2007-02-20 22:30:08 UTC (rev 1506) +++ trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/NutchwaxIndexer.java 2007-02-21 16:19:09 UTC (rev 1507) @@ -83,6 +83,7 @@ job.addInputPath(new Path(linkDb, LinkDb.CURRENT_NAME)); job.setInputFormat(SequenceFileInputFormat.class); + job.setMapperClass(Indexer.class); job.setReducerClass(NutchwaxIndexer.class); job.setOutputPath(indexDir); Modified: trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/NutchwaxLinkDb.java =================================================================== --- trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/NutchwaxLinkDb.java 2007-02-20 22:30:08 UTC (rev 1506) +++ trunk/archive-access/projects/nutchwax/src/java/org/archive/access/nutch/NutchwaxLinkDb.java 2007-02-21 16:19:09 UTC (rev 1507) @@ -26,6 +26,7 @@ import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.parse.Outlink; import org.apache.nutch.parse.ParseData; +import org.apache.nutch.util.LockUtil; import org.apache.nutch.util.NutchJob; /** @@ -150,8 +151,12 @@ } public void invert(Path linkDb, final Path[] segments, - final boolean normalize, final boolean filter) + final boolean normalize, final boolean filter, boolean force) throws IOException { + Path lock = new Path(linkDb, LOCK_NAME); + FileSystem fs = FileSystem.get(getConf()); + LockUtil.createLockFile(fs, lock, force); + Path currentLinkDb = new Path(linkDb, CURRENT_NAME); if (LOG.isInfoEnabled()) { LOG.info("NutchwaxLinkDb: starting"); 
LOG.info("NutchwaxLinkDb: linkdb: " + linkDb); @@ -159,16 +164,19 @@ LOG.info("LinkDb: URL filter: " + filter); } JobConf job = createJob(getConf(), linkDb, normalize, filter); - for (int i = 0; i < segments.length; i++) { if (LOG.isInfoEnabled()) { LOG.info("LinkDb: adding segment: " + segments[i]); } job.addInputPath(new Path(segments[i], ParseData.DIR_NAME)); } - JobClient.runJob(job); - FileSystem fs = FileSystem.get(getConf()); - if (fs.exists(linkDb)) { + try { + JobClient.runJob(job); + } catch (IOException e) { + LockUtil.removeLockFile(fs, lock); + throw e; + } + if (fs.exists(currentLinkDb)) { if (LOG.isInfoEnabled()) { LOG.info("LinkDb: merging with existing linkdb: " + linkDb); } @@ -178,9 +186,15 @@ job.setJobName("NutchwaxLinkDb merge " + linkDb + " " + Arrays.asList(segments)); job.setMapperClass(NutchwaxLinkDbFilter.class); - job.addInputPath(new Path(linkDb, CURRENT_NAME)); + job.addInputPath(currentLinkDb); job.addInputPath(newLinkDb); - JobClient.runJob(job); + try { + JobClient.runJob(job); + } catch (IOException e) { + LockUtil.removeLockFile(fs, lock); + fs.delete(newLinkDb); + throw e; + } fs.delete(newLinkDb); } LinkDb.install(job, linkDb); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |